/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 2007, The Ohio State University. All rights reserved.
 *
 * Portions of this source code are developed by the team members of
 * The Ohio State University's Network-Based Computing Laboratory (NBCL),
 * headed by Professor Dhabaleswar K. (DK) Panda.
 *
 * Acknowledgements to contributions from developers:
 *	Ranjit Noronha: noronha@cse.ohio-state.edu
 *	Lei Chai:       chail@cse.ohio-state.edu
 *	Weikuan Yu:     yuw@cse.ohio-state.edu
 *
 */

/*
 * The rpcib plugin. Implements the interface for RDMATF's
 * interaction with IBTF.
 */

#include <sys/param.h>
#include <sys/types.h>
#include <sys/user.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/proc.h>
#include <sys/socket.h>
#include <sys/file.h>
#include <sys/stream.h>
#include <sys/strsubr.h>
#include <sys/stropts.h>
#include <sys/errno.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <sys/pathname.h>
#include <sys/kstat.h>
#include <sys/t_lock.h>
#include <sys/ddi.h>
#include <sys/cmn_err.h>
#include <sys/time.h>
#include <sys/isa_defs.h>
#include <sys/callb.h>
#include <sys/sunddi.h>
#include <sys/sunndi.h>
#include <sys/sdt.h>
#include <sys/ib/ibtl/ibti.h>
#include <rpc/rpc.h>
#include <rpc/ib.h>
#include <sys/modctl.h>
#include <sys/kstr.h>
#include <sys/sockio.h>
#include <sys/vnode.h>
#include <sys/tiuser.h>
#include <net/if.h>
#include <net/if_types.h>
#include <sys/cred.h>
#include <rpc/rpc_rdma.h>
#include <nfs/nfs.h>
#include <sys/atomic.h>

#define	NFS_RDMA_PORT	2050

/*
 * Convenience structure used by rpcib_get_ib_addresses()
 */
typedef struct rpcib_ipaddrs {
	void	*ri_list;	/* pointer to list of addresses */
	uint_t	ri_count;	/* number of addresses in list */
	uint_t	ri_size;	/* size of ri_list in bytes */
} rpcib_ipaddrs_t;

/*
 * Prototype declarations for driver ops
 */
static int	rpcib_attach(dev_info_t *, ddi_attach_cmd_t);
static int	rpcib_getinfo(dev_info_t *, ddi_info_cmd_t,
			void *, void **);
static int	rpcib_detach(dev_info_t *, ddi_detach_cmd_t);
static boolean_t rpcib_rdma_capable_interface(struct lifreq *);
static int	rpcib_do_ip_ioctl(int, int, void *);
static boolean_t rpcib_get_ib_addresses(rpcib_ipaddrs_t *, rpcib_ipaddrs_t *);
static int	rpcib_cache_kstat_update(kstat_t *, int);
static void	rib_force_cleanup(void *);

struct {
	kstat_named_t	cache_limit;
	kstat_named_t	cache_allocation;
	kstat_named_t	cache_hits;
	kstat_named_t	cache_misses;
	kstat_named_t	cache_misses_above_the_limit;
} rpcib_kstat = {
	{"cache_limit",			KSTAT_DATA_UINT64 },
	{"cache_allocation",		KSTAT_DATA_UINT64 },
	{"cache_hits",			KSTAT_DATA_UINT64 },
	{"cache_misses",		KSTAT_DATA_UINT64 },
	{"cache_misses_above_the_limit", KSTAT_DATA_UINT64 },
};

/* rpcib cb_ops */
static struct cb_ops rpcib_cbops = {
	nulldev,		/* open */
	nulldev,		/* close */
	nodev,			/* strategy */
	nodev,			/* print */
	nodev,			/* dump */
	nodev,			/* read */
	nodev,			/* write */
	nodev,			/* ioctl */
	nodev,			/* devmap */
	nodev,			/* mmap */
	nodev,			/* segmap */
	nochpoll,		/* poll */
	ddi_prop_op,		/* prop_op */
	NULL,			/* stream */
	D_MP,			/* cb_flag */
	CB_REV,			/* rev */
	nodev,			/* int (*cb_aread)() */
	nodev			/* int (*cb_awrite)() */
};

/*
 * Device options
 */
static struct dev_ops rpcib_ops = {
	DEVO_REV,		/* devo_rev */
	0,			/* refcnt */
	rpcib_getinfo,		/* info */
	nulldev,		/* identify */
	nulldev,		/* probe */
	rpcib_attach,		/* attach */
	rpcib_detach,		/* detach */
	nodev,			/* reset */
	&rpcib_cbops,		/* driver ops - devctl interfaces */
	NULL,			/* bus operations */
	NULL,			/* power */
	ddi_quiesce_not_needed,	/* quiesce */
};

/*
 * Module linkage information.
 */

static struct modldrv rib_modldrv = {
	&mod_driverops,		/* Driver module */
	"RPCIB plugin driver",	/* Driver name and version */
	&rpcib_ops,		/* Driver ops */
};

static struct modlinkage rib_modlinkage = {
	MODREV_1,
	(void *)&rib_modldrv,
	NULL
};

typedef struct rib_lrc_entry {
	struct rib_lrc_entry *forw;
	struct rib_lrc_entry *back;
	char *lrc_buf;

	uint32_t lrc_len;
	void  *avl_node;
	bool_t registered;

	struct mrc lrc_mhandle;
	bool_t lrc_on_freed_list;
} rib_lrc_entry_t;

typedef struct cache_struct {
	rib_lrc_entry_t	r;
	uint32_t	len;
	uint32_t	elements;
	kmutex_t	node_lock;
	avl_node_t	avl_link;
} cache_avl_struct_t;

static uint64_t	rib_total_buffers = 0;
uint64_t	cache_limit = 100 * 1024 * 1024;
static volatile uint64_t	cache_allocation = 0;
static uint64_t	cache_watermark = 80 * 1024 * 1024;
static uint64_t	cache_hits = 0;
static uint64_t	cache_misses = 0;
static uint64_t	cache_cold_misses = 0;
static uint64_t	cache_hot_misses = 0;
static uint64_t	cache_misses_above_the_limit = 0;
static bool_t	stats_enabled = FALSE;

static uint64_t max_unsignaled_rws = 5;

/*
 * rib_stat: private data pointer used when registering
 *	with the IBTF.  It is returned to the consumer
 *	in all callbacks.
 */
static rpcib_state_t *rib_stat = NULL;

#define	RNR_RETRIES	IBT_RNR_RETRY_1
#define	MAX_PORTS	2

int preposted_rbufs = RDMA_BUFS_GRANT;
int send_threshold = 1;

/*
 * State of the plugin.
 * ACCEPT = accepting new connections and requests.
 * NO_ACCEPT = not accepting new connections and requests.
 * This should eventually move into the rpcib_state_t structure, since it
 * tells which state the plugin is in for a particular type of service,
 * such as NFS, NLM or the NFSv4 callback daemon.  The plugin might be in
 * the accept state for one and in the no_accept state for another.
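 *
 * Consumers sample the state under plugin_state_lock; for example, the
 * pattern used by rib_svc_rcq_handler() below is:
 *
 *	mutex_enter(&plugin_state_lock);
 *	if (plugin_state == ACCEPT)
 *		(queue the request via svc_queuereq())
 *	mutex_exit(&plugin_state_lock);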
234 */ 235 int plugin_state; 236 kmutex_t plugin_state_lock; 237 238 ldi_ident_t rpcib_li; 239 240 /* 241 * RPCIB RDMATF operations 242 */ 243 #if defined(MEASURE_POOL_DEPTH) 244 static void rib_posted_rbufs(uint32_t x) { return; } 245 #endif 246 static rdma_stat rib_reachable(int addr_type, struct netbuf *, void **handle); 247 static rdma_stat rib_disconnect(CONN *conn); 248 static void rib_listen(struct rdma_svc_data *rd); 249 static void rib_listen_stop(struct rdma_svc_data *rd); 250 static rdma_stat rib_registermem(CONN *conn, caddr_t adsp, caddr_t buf, 251 uint_t buflen, struct mrc *buf_handle); 252 static rdma_stat rib_deregistermem(CONN *conn, caddr_t buf, 253 struct mrc buf_handle); 254 static rdma_stat rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp, 255 caddr_t buf, uint_t buflen, struct mrc *buf_handle); 256 static rdma_stat rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf, 257 struct mrc buf_handle); 258 static rdma_stat rib_registermemsync(CONN *conn, caddr_t adsp, caddr_t buf, 259 uint_t buflen, struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle, 260 void *lrc); 261 static rdma_stat rib_deregistermemsync(CONN *conn, caddr_t buf, 262 struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle, void *); 263 static rdma_stat rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle, 264 caddr_t buf, int len, int cpu); 265 266 static rdma_stat rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf); 267 268 static void rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf); 269 static void *rib_rbuf_alloc(CONN *, rdma_buf_t *); 270 271 static void rib_rbuf_free(CONN *conn, int ptype, void *buf); 272 273 static rdma_stat rib_send(CONN *conn, struct clist *cl, uint32_t msgid); 274 static rdma_stat rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid); 275 static rdma_stat rib_post_resp(CONN *conn, struct clist *cl, uint32_t msgid); 276 static rdma_stat rib_post_resp_remove(CONN *conn, uint32_t msgid); 277 static rdma_stat rib_post_recv(CONN *conn, struct clist *cl); 278 static rdma_stat rib_recv(CONN *conn, struct clist **clp, uint32_t msgid); 279 static rdma_stat rib_read(CONN *conn, struct clist *cl, int wait); 280 static rdma_stat rib_write(CONN *conn, struct clist *cl, int wait); 281 static rdma_stat rib_ping_srv(int addr_type, struct netbuf *, rib_hca_t **); 282 static rdma_stat rib_conn_get(struct netbuf *, int addr_type, void *, CONN **); 283 static rdma_stat rib_conn_release(CONN *conn); 284 static rdma_stat rib_getinfo(rdma_info_t *info); 285 286 static rib_lrc_entry_t *rib_get_cache_buf(CONN *conn, uint32_t len); 287 static void rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *buf); 288 static void rib_destroy_cache(rib_hca_t *hca); 289 static void rib_server_side_cache_reclaim(void *argp); 290 static int avl_compare(const void *t1, const void *t2); 291 292 static void rib_stop_services(rib_hca_t *); 293 static void rib_close_channels(rib_conn_list_t *); 294 295 /* 296 * RPCIB addressing operations 297 */ 298 299 /* 300 * RDMA operations the RPCIB module exports 301 */ 302 static rdmaops_t rib_ops = { 303 rib_reachable, 304 rib_conn_get, 305 rib_conn_release, 306 rib_listen, 307 rib_listen_stop, 308 rib_registermem, 309 rib_deregistermem, 310 rib_registermemsync, 311 rib_deregistermemsync, 312 rib_syncmem, 313 rib_reg_buf_alloc, 314 rib_reg_buf_free, 315 rib_send, 316 rib_send_resp, 317 rib_post_resp, 318 rib_post_resp_remove, 319 rib_post_recv, 320 rib_recv, 321 rib_read, 322 rib_write, 323 rib_getinfo, 324 }; 325 326 /* 327 * RDMATF RPCIB plugin details 328 */ 329 static 
rdma_mod_t rib_mod = {
	"ibtf",		/* api name */
	RDMATF_VERS_1,
	0,
	&rib_ops,	/* rdma op vector for ibtf */
};

static rdma_stat open_hcas(rpcib_state_t *);
static rdma_stat rib_qp_init(rib_qp_t *, int);
static void rib_svc_scq_handler(ibt_cq_hdl_t, void *);
static void rib_clnt_scq_handler(ibt_cq_hdl_t, void *);
static void rib_clnt_rcq_handler(ibt_cq_hdl_t, void *);
static void rib_svc_rcq_handler(ibt_cq_hdl_t, void *);
static rib_bufpool_t *rib_rbufpool_create(rib_hca_t *hca, int ptype, int num);
static rdma_stat rib_reg_mem(rib_hca_t *, caddr_t adsp, caddr_t, uint_t,
	ibt_mr_flags_t, ibt_mr_hdl_t *, ibt_mr_desc_t *);
static rdma_stat rib_reg_mem_user(rib_hca_t *, caddr_t, uint_t, ibt_mr_flags_t,
	ibt_mr_hdl_t *, ibt_mr_desc_t *, caddr_t);
static rdma_stat rib_conn_to_srv(rib_hca_t *, rib_qp_t *, ibt_path_info_t *,
	ibt_ip_addr_t *, ibt_ip_addr_t *);
static rdma_stat rib_clnt_create_chan(rib_hca_t *, struct netbuf *,
	rib_qp_t **);
static rdma_stat rib_svc_create_chan(rib_hca_t *, caddr_t, uint8_t,
	rib_qp_t **);
static rdma_stat rib_sendwait(rib_qp_t *, struct send_wid *);
static struct send_wid *rib_init_sendwait(uint32_t, int, rib_qp_t *);
static int rib_free_sendwait(struct send_wid *);
static struct rdma_done_list *rdma_done_add(rib_qp_t *qp, uint32_t xid);
static void rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd);
static void rdma_done_rem_list(rib_qp_t *);
static void rdma_done_notify(rib_qp_t *qp, uint32_t xid);

static void rib_async_handler(void *,
	ibt_hca_hdl_t, ibt_async_code_t, ibt_async_event_t *);
static rdma_stat rib_rem_rep(rib_qp_t *, struct reply *);
static struct svc_recv *rib_init_svc_recv(rib_qp_t *, ibt_wr_ds_t *);
static int rib_free_svc_recv(struct svc_recv *);
static struct recv_wid *rib_create_wid(rib_qp_t *, ibt_wr_ds_t *, uint32_t);
static void rib_free_wid(struct recv_wid *);
static rdma_stat rib_disconnect_channel(CONN *, rib_conn_list_t *);
static void rib_detach_hca(rib_hca_t *);
static rdma_stat rib_chk_srv_ibaddr(struct netbuf *, int,
	ibt_path_info_t *, ibt_ip_addr_t *, ibt_ip_addr_t *);

/*
 * Registration with IBTF as a consumer
 */
static struct ibt_clnt_modinfo_s rib_modinfo = {
	IBTI_V_CURR,
	IBT_GENERIC,
	rib_async_handler,	/* async event handler */
	NULL,			/* Memory Region Handler */
	"nfs/ib"
};

/*
 * Global structure
 */

typedef struct rpcib_s {
	dev_info_t	*rpcib_dip;
	kmutex_t	rpcib_mutex;
} rpcib_t;

rpcib_t rpcib;

/*
 * /etc/system controlled variable to control
 * debugging in the rpcib kernel module.
 * Set it to values greater than 1 to increase
 * the amount of debugging output produced.
400 */ 401 int rib_debug = 0; 402 403 int 404 _init(void) 405 { 406 int error; 407 408 error = mod_install((struct modlinkage *)&rib_modlinkage); 409 if (error != 0) { 410 /* 411 * Could not load module 412 */ 413 return (error); 414 } 415 mutex_init(&plugin_state_lock, NULL, MUTEX_DRIVER, NULL); 416 return (0); 417 } 418 419 int 420 _fini() 421 { 422 int status; 423 424 if ((status = rdma_unregister_mod(&rib_mod)) != RDMA_SUCCESS) { 425 return (EBUSY); 426 } 427 428 /* 429 * Remove module 430 */ 431 if ((status = mod_remove(&rib_modlinkage)) != 0) { 432 (void) rdma_register_mod(&rib_mod); 433 return (status); 434 } 435 mutex_destroy(&plugin_state_lock); 436 return (0); 437 } 438 439 int 440 _info(struct modinfo *modinfop) 441 { 442 return (mod_info(&rib_modlinkage, modinfop)); 443 } 444 445 /* 446 * rpcib_getinfo() 447 * Given the device number, return the devinfo pointer or the 448 * instance number. 449 * Note: always succeed DDI_INFO_DEVT2INSTANCE, even before attach. 450 */ 451 452 /*ARGSUSED*/ 453 static int 454 rpcib_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result) 455 { 456 int ret = DDI_SUCCESS; 457 458 switch (cmd) { 459 case DDI_INFO_DEVT2DEVINFO: 460 if (rpcib.rpcib_dip != NULL) 461 *result = rpcib.rpcib_dip; 462 else { 463 *result = NULL; 464 ret = DDI_FAILURE; 465 } 466 break; 467 468 case DDI_INFO_DEVT2INSTANCE: 469 *result = NULL; 470 break; 471 472 default: 473 ret = DDI_FAILURE; 474 } 475 return (ret); 476 } 477 478 static int 479 rpcib_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 480 { 481 ibt_status_t ibt_status; 482 rdma_stat r_status; 483 484 switch (cmd) { 485 case DDI_ATTACH: 486 break; 487 case DDI_RESUME: 488 return (DDI_SUCCESS); 489 default: 490 return (DDI_FAILURE); 491 } 492 493 mutex_init(&rpcib.rpcib_mutex, NULL, MUTEX_DRIVER, NULL); 494 495 mutex_enter(&rpcib.rpcib_mutex); 496 if (rpcib.rpcib_dip != NULL) { 497 mutex_exit(&rpcib.rpcib_mutex); 498 return (DDI_FAILURE); 499 } 500 rpcib.rpcib_dip = dip; 501 mutex_exit(&rpcib.rpcib_mutex); 502 /* 503 * Create the "rpcib" minor-node. 
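 * Nothing performs real I/O through this node; the cb_ops entries above
 * are all nulldev/nodev, so the node mainly gives the pseudo driver an
 * attach point.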
504 */ 505 if (ddi_create_minor_node(dip, 506 "rpcib", S_IFCHR, 0, DDI_PSEUDO, 0) != DDI_SUCCESS) { 507 /* Error message, no cmn_err as they print on console */ 508 return (DDI_FAILURE); 509 } 510 511 if (rib_stat == NULL) { 512 rib_stat = kmem_zalloc(sizeof (*rib_stat), KM_SLEEP); 513 mutex_init(&rib_stat->open_hca_lock, NULL, MUTEX_DRIVER, NULL); 514 } 515 516 rib_stat->hca_count = ibt_get_hca_list(&rib_stat->hca_guids); 517 if (rib_stat->hca_count < 1) { 518 mutex_destroy(&rib_stat->open_hca_lock); 519 kmem_free(rib_stat, sizeof (*rib_stat)); 520 rib_stat = NULL; 521 return (DDI_FAILURE); 522 } 523 524 ibt_status = ibt_attach(&rib_modinfo, dip, 525 (void *)rib_stat, &rib_stat->ibt_clnt_hdl); 526 527 if (ibt_status != IBT_SUCCESS) { 528 ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count); 529 mutex_destroy(&rib_stat->open_hca_lock); 530 kmem_free(rib_stat, sizeof (*rib_stat)); 531 rib_stat = NULL; 532 return (DDI_FAILURE); 533 } 534 535 mutex_enter(&rib_stat->open_hca_lock); 536 if (open_hcas(rib_stat) != RDMA_SUCCESS) { 537 ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count); 538 (void) ibt_detach(rib_stat->ibt_clnt_hdl); 539 mutex_exit(&rib_stat->open_hca_lock); 540 mutex_destroy(&rib_stat->open_hca_lock); 541 kmem_free(rib_stat, sizeof (*rib_stat)); 542 rib_stat = NULL; 543 return (DDI_FAILURE); 544 } 545 mutex_exit(&rib_stat->open_hca_lock); 546 547 /* 548 * Register with rdmatf 549 */ 550 rib_mod.rdma_count = rib_stat->hca_count; 551 r_status = rdma_register_mod(&rib_mod); 552 if (r_status != RDMA_SUCCESS && r_status != RDMA_REG_EXIST) { 553 rib_detach_hca(rib_stat->hca); 554 ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count); 555 (void) ibt_detach(rib_stat->ibt_clnt_hdl); 556 mutex_destroy(&rib_stat->open_hca_lock); 557 kmem_free(rib_stat, sizeof (*rib_stat)); 558 rib_stat = NULL; 559 return (DDI_FAILURE); 560 } 561 562 563 return (DDI_SUCCESS); 564 } 565 566 /*ARGSUSED*/ 567 static int 568 rpcib_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 569 { 570 switch (cmd) { 571 572 case DDI_DETACH: 573 break; 574 575 case DDI_SUSPEND: 576 default: 577 return (DDI_FAILURE); 578 } 579 580 /* 581 * Detach the hca and free resources 582 */ 583 mutex_enter(&plugin_state_lock); 584 plugin_state = NO_ACCEPT; 585 mutex_exit(&plugin_state_lock); 586 rib_detach_hca(rib_stat->hca); 587 ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count); 588 (void) ibt_detach(rib_stat->ibt_clnt_hdl); 589 590 mutex_enter(&rpcib.rpcib_mutex); 591 rpcib.rpcib_dip = NULL; 592 mutex_exit(&rpcib.rpcib_mutex); 593 594 mutex_destroy(&rpcib.rpcib_mutex); 595 return (DDI_SUCCESS); 596 } 597 598 599 static void rib_rbufpool_free(rib_hca_t *, int); 600 static void rib_rbufpool_deregister(rib_hca_t *, int); 601 static void rib_rbufpool_destroy(rib_hca_t *hca, int ptype); 602 static struct reply *rib_addreplylist(rib_qp_t *, uint32_t); 603 static rdma_stat rib_rem_replylist(rib_qp_t *); 604 static int rib_remreply(rib_qp_t *, struct reply *); 605 static rdma_stat rib_add_connlist(CONN *, rib_conn_list_t *); 606 static rdma_stat rib_rm_conn(CONN *, rib_conn_list_t *); 607 608 609 /* 610 * One CQ pair per HCA 611 */ 612 static rdma_stat 613 rib_create_cq(rib_hca_t *hca, uint32_t cq_size, ibt_cq_handler_t cq_handler, 614 rib_cq_t **cqp, rpcib_state_t *ribstat) 615 { 616 rib_cq_t *cq; 617 ibt_cq_attr_t cq_attr; 618 uint32_t real_size; 619 ibt_status_t status; 620 rdma_stat error = RDMA_SUCCESS; 621 622 cq = kmem_zalloc(sizeof (rib_cq_t), KM_SLEEP); 623 cq->rib_hca = hca; 624 cq_attr.cq_size = cq_size; 
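	/*
	 * Note: ibt_alloc_cq() below may allocate a deeper CQ than
	 * requested; the actual depth is returned in real_size (which is
	 * not checked further here).
	 */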
625 cq_attr.cq_flags = IBT_CQ_NO_FLAGS; 626 status = ibt_alloc_cq(hca->hca_hdl, &cq_attr, &cq->rib_cq_hdl, 627 &real_size); 628 if (status != IBT_SUCCESS) { 629 cmn_err(CE_WARN, "rib_create_cq: ibt_alloc_cq() failed," 630 " status=%d", status); 631 error = RDMA_FAILED; 632 goto fail; 633 } 634 ibt_set_cq_handler(cq->rib_cq_hdl, cq_handler, ribstat); 635 636 /* 637 * Enable CQ callbacks. CQ Callbacks are single shot 638 * (e.g. you have to call ibt_enable_cq_notify() 639 * after each callback to get another one). 640 */ 641 status = ibt_enable_cq_notify(cq->rib_cq_hdl, IBT_NEXT_COMPLETION); 642 if (status != IBT_SUCCESS) { 643 cmn_err(CE_WARN, "rib_create_cq: " 644 "enable_cq_notify failed, status %d", status); 645 error = RDMA_FAILED; 646 goto fail; 647 } 648 *cqp = cq; 649 650 return (error); 651 fail: 652 if (cq->rib_cq_hdl) 653 (void) ibt_free_cq(cq->rib_cq_hdl); 654 if (cq) 655 kmem_free(cq, sizeof (rib_cq_t)); 656 return (error); 657 } 658 659 static rdma_stat 660 open_hcas(rpcib_state_t *ribstat) 661 { 662 rib_hca_t *hca; 663 ibt_status_t ibt_status; 664 rdma_stat status; 665 ibt_hca_portinfo_t *pinfop; 666 ibt_pd_flags_t pd_flags = IBT_PD_NO_FLAGS; 667 uint_t size, cq_size; 668 int i; 669 kstat_t *ksp; 670 cache_avl_struct_t example_avl_node; 671 char rssc_name[32]; 672 673 ASSERT(MUTEX_HELD(&ribstat->open_hca_lock)); 674 675 if (ribstat->hcas == NULL) 676 ribstat->hcas = kmem_zalloc(ribstat->hca_count * 677 sizeof (rib_hca_t), KM_SLEEP); 678 679 /* 680 * Open a hca and setup for RDMA 681 */ 682 for (i = 0; i < ribstat->hca_count; i++) { 683 ibt_status = ibt_open_hca(ribstat->ibt_clnt_hdl, 684 ribstat->hca_guids[i], 685 &ribstat->hcas[i].hca_hdl); 686 if (ibt_status != IBT_SUCCESS) { 687 continue; 688 } 689 ribstat->hcas[i].hca_guid = ribstat->hca_guids[i]; 690 hca = &(ribstat->hcas[i]); 691 hca->ibt_clnt_hdl = ribstat->ibt_clnt_hdl; 692 hca->state = HCA_INITED; 693 694 /* 695 * query HCA info 696 */ 697 ibt_status = ibt_query_hca(hca->hca_hdl, &hca->hca_attrs); 698 if (ibt_status != IBT_SUCCESS) { 699 goto fail1; 700 } 701 702 /* 703 * One PD (Protection Domain) per HCA. 704 * A qp is allowed to access a memory region 705 * only when it's in the same PD as that of 706 * the memory region. 707 */ 708 ibt_status = ibt_alloc_pd(hca->hca_hdl, pd_flags, &hca->pd_hdl); 709 if (ibt_status != IBT_SUCCESS) { 710 goto fail1; 711 } 712 713 /* 714 * query HCA ports 715 */ 716 ibt_status = ibt_query_hca_ports(hca->hca_hdl, 717 0, &pinfop, &hca->hca_nports, &size); 718 if (ibt_status != IBT_SUCCESS) { 719 goto fail2; 720 } 721 hca->hca_ports = pinfop; 722 hca->hca_pinfosz = size; 723 pinfop = NULL; 724 725 cq_size = DEF_CQ_SIZE; /* default cq size */ 726 /* 727 * Create 2 pairs of cq's (1 pair for client 728 * and the other pair for server) on this hca. 729 * If number of qp's gets too large, then several 730 * cq's will be needed. 731 */ 732 status = rib_create_cq(hca, cq_size, rib_svc_rcq_handler, 733 &hca->svc_rcq, ribstat); 734 if (status != RDMA_SUCCESS) { 735 goto fail3; 736 } 737 738 status = rib_create_cq(hca, cq_size, rib_svc_scq_handler, 739 &hca->svc_scq, ribstat); 740 if (status != RDMA_SUCCESS) { 741 goto fail3; 742 } 743 744 status = rib_create_cq(hca, cq_size, rib_clnt_rcq_handler, 745 &hca->clnt_rcq, ribstat); 746 if (status != RDMA_SUCCESS) { 747 goto fail3; 748 } 749 750 status = rib_create_cq(hca, cq_size, rib_clnt_scq_handler, 751 &hca->clnt_scq, ribstat); 752 if (status != RDMA_SUCCESS) { 753 goto fail3; 754 } 755 756 /* 757 * Create buffer pools. 
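		 * One RECV_BUFFER pool and one SEND_BUFFER pool of MAX_BUFS
		 * buffers each are created per HCA.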
758 * Note rib_rbuf_create also allocates memory windows. 759 */ 760 hca->recv_pool = rib_rbufpool_create(hca, 761 RECV_BUFFER, MAX_BUFS); 762 if (hca->recv_pool == NULL) { 763 goto fail3; 764 } 765 766 hca->send_pool = rib_rbufpool_create(hca, 767 SEND_BUFFER, MAX_BUFS); 768 if (hca->send_pool == NULL) { 769 rib_rbufpool_destroy(hca, RECV_BUFFER); 770 goto fail3; 771 } 772 773 if (hca->server_side_cache == NULL) { 774 (void) sprintf(rssc_name, 775 "rib_server_side_cache_%04d", i); 776 hca->server_side_cache = kmem_cache_create( 777 rssc_name, 778 sizeof (cache_avl_struct_t), 0, 779 NULL, 780 NULL, 781 rib_server_side_cache_reclaim, 782 hca, NULL, 0); 783 } 784 785 avl_create(&hca->avl_tree, 786 avl_compare, 787 sizeof (cache_avl_struct_t), 788 (uint_t)(uintptr_t)&example_avl_node.avl_link- 789 (uint_t)(uintptr_t)&example_avl_node); 790 791 rw_init(&hca->avl_rw_lock, 792 NULL, RW_DRIVER, hca->iblock); 793 mutex_init(&hca->cache_allocation, 794 NULL, MUTEX_DRIVER, NULL); 795 hca->avl_init = TRUE; 796 797 /* Create kstats for the cache */ 798 ASSERT(INGLOBALZONE(curproc)); 799 800 if (!stats_enabled) { 801 ksp = kstat_create_zone("unix", 0, "rpcib_cache", "rpc", 802 KSTAT_TYPE_NAMED, 803 sizeof (rpcib_kstat) / sizeof (kstat_named_t), 804 KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, 805 GLOBAL_ZONEID); 806 if (ksp) { 807 ksp->ks_data = (void *) &rpcib_kstat; 808 ksp->ks_update = rpcib_cache_kstat_update; 809 kstat_install(ksp); 810 stats_enabled = TRUE; 811 } 812 } 813 if (NULL == hca->reg_cache_clean_up) { 814 hca->reg_cache_clean_up = ddi_taskq_create(NULL, 815 "REG_CACHE_CLEANUP", 1, TASKQ_DEFAULTPRI, 0); 816 } 817 818 /* 819 * Initialize the registered service list and 820 * the lock 821 */ 822 hca->service_list = NULL; 823 rw_init(&hca->service_list_lock, NULL, RW_DRIVER, hca->iblock); 824 825 mutex_init(&hca->cb_lock, NULL, MUTEX_DRIVER, hca->iblock); 826 cv_init(&hca->cb_cv, NULL, CV_DRIVER, NULL); 827 rw_init(&hca->cl_conn_list.conn_lock, NULL, RW_DRIVER, 828 hca->iblock); 829 rw_init(&hca->srv_conn_list.conn_lock, NULL, RW_DRIVER, 830 hca->iblock); 831 rw_init(&hca->state_lock, NULL, RW_DRIVER, hca->iblock); 832 mutex_init(&hca->inuse_lock, NULL, MUTEX_DRIVER, hca->iblock); 833 hca->inuse = TRUE; 834 /* 835 * XXX One hca only. Add multi-hca functionality if needed 836 * later. 837 */ 838 ribstat->hca = hca; 839 ribstat->nhca_inited++; 840 ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz); 841 break; 842 843 fail3: 844 ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz); 845 fail2: 846 (void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl); 847 fail1: 848 (void) ibt_close_hca(hca->hca_hdl); 849 850 } 851 if (ribstat->hca != NULL) 852 return (RDMA_SUCCESS); 853 else 854 return (RDMA_FAILED); 855 } 856 857 /* 858 * Callback routines 859 */ 860 861 /* 862 * SCQ handlers 863 */ 864 /* ARGSUSED */ 865 static void 866 rib_clnt_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 867 { 868 ibt_status_t ibt_status; 869 ibt_wc_t wc; 870 int i; 871 872 /* 873 * Re-enable cq notify here to avoid missing any 874 * completion queue notification. 875 */ 876 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION); 877 878 ibt_status = IBT_SUCCESS; 879 while (ibt_status != IBT_CQ_EMPTY) { 880 bzero(&wc, sizeof (wc)); 881 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL); 882 if (ibt_status != IBT_SUCCESS) 883 return; 884 885 /* 886 * Got a send completion 887 */ 888 if (wc.wc_id != NULL) { /* XXX can it be otherwise ???? 
*/ 889 struct send_wid *wd = (struct send_wid *)(uintptr_t)wc.wc_id; 890 CONN *conn = qptoc(wd->qp); 891 892 mutex_enter(&wd->sendwait_lock); 893 switch (wc.wc_status) { 894 case IBT_WC_SUCCESS: 895 wd->status = RDMA_SUCCESS; 896 break; 897 case IBT_WC_WR_FLUSHED_ERR: 898 wd->status = RDMA_FAILED; 899 break; 900 default: 901 /* 902 * RC Send Q Error Code Local state Remote State 903 * ==================== =========== ============ 904 * IBT_WC_BAD_RESPONSE_ERR ERROR None 905 * IBT_WC_LOCAL_LEN_ERR ERROR None 906 * IBT_WC_LOCAL_CHAN_OP_ERR ERROR None 907 * IBT_WC_LOCAL_PROTECT_ERR ERROR None 908 * IBT_WC_MEM_WIN_BIND_ERR ERROR None 909 * IBT_WC_REMOTE_INVALID_REQ_ERR ERROR ERROR 910 * IBT_WC_REMOTE_ACCESS_ERR ERROR ERROR 911 * IBT_WC_REMOTE_OP_ERR ERROR ERROR 912 * IBT_WC_RNR_NAK_TIMEOUT_ERR ERROR None 913 * IBT_WC_TRANS_TIMEOUT_ERR ERROR None 914 * IBT_WC_WR_FLUSHED_ERR None None 915 */ 916 /* 917 * Channel in error state. Set connection to 918 * ERROR and cleanup will happen either from 919 * conn_release or from rib_conn_get 920 */ 921 wd->status = RDMA_FAILED; 922 mutex_enter(&conn->c_lock); 923 if (conn->c_state != C_DISCONN_PEND) 924 conn->c_state = C_ERROR_CONN; 925 mutex_exit(&conn->c_lock); 926 break; 927 } 928 929 if (wd->cv_sig == 1) { 930 /* 931 * Notify poster 932 */ 933 cv_signal(&wd->wait_cv); 934 mutex_exit(&wd->sendwait_lock); 935 } else { 936 /* 937 * Poster not waiting for notification. 938 * Free the send buffers and send_wid 939 */ 940 for (i = 0; i < wd->nsbufs; i++) { 941 rib_rbuf_free(qptoc(wd->qp), SEND_BUFFER, 942 (void *)(uintptr_t)wd->sbufaddr[i]); 943 } 944 mutex_exit(&wd->sendwait_lock); 945 (void) rib_free_sendwait(wd); 946 } 947 } 948 } 949 } 950 951 /* ARGSUSED */ 952 static void 953 rib_svc_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 954 { 955 ibt_status_t ibt_status; 956 ibt_wc_t wc; 957 int i; 958 959 /* 960 * Re-enable cq notify here to avoid missing any 961 * completion queue notification. 962 */ 963 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION); 964 965 ibt_status = IBT_SUCCESS; 966 while (ibt_status != IBT_CQ_EMPTY) { 967 bzero(&wc, sizeof (wc)); 968 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL); 969 if (ibt_status != IBT_SUCCESS) 970 return; 971 972 /* 973 * Got a send completion 974 */ 975 if (wc.wc_id != NULL) { /* XXX NULL possible ???? */ 976 struct send_wid *wd = 977 (struct send_wid *)(uintptr_t)wc.wc_id; 978 mutex_enter(&wd->sendwait_lock); 979 if (wd->cv_sig == 1) { 980 /* 981 * Update completion status and notify poster 982 */ 983 if (wc.wc_status == IBT_WC_SUCCESS) 984 wd->status = RDMA_SUCCESS; 985 else 986 wd->status = RDMA_FAILED; 987 cv_signal(&wd->wait_cv); 988 mutex_exit(&wd->sendwait_lock); 989 } else { 990 /* 991 * Poster not waiting for notification. 992 * Free the send buffers and send_wid 993 */ 994 for (i = 0; i < wd->nsbufs; i++) { 995 rib_rbuf_free(qptoc(wd->qp), 996 SEND_BUFFER, 997 (void *)(uintptr_t)wd->sbufaddr[i]); 998 } 999 mutex_exit(&wd->sendwait_lock); 1000 (void) rib_free_sendwait(wd); 1001 } 1002 } 1003 } 1004 } 1005 1006 /* 1007 * RCQ handler 1008 */ 1009 /* ARGSUSED */ 1010 static void 1011 rib_clnt_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 1012 { 1013 rib_qp_t *qp; 1014 ibt_status_t ibt_status; 1015 ibt_wc_t wc; 1016 struct recv_wid *rwid; 1017 1018 /* 1019 * Re-enable cq notify here to avoid missing any 1020 * completion queue notification. 
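	 * (CQ notifications are one-shot; see the note in rib_create_cq().)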
1021 */ 1022 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION); 1023 1024 ibt_status = IBT_SUCCESS; 1025 while (ibt_status != IBT_CQ_EMPTY) { 1026 bzero(&wc, sizeof (wc)); 1027 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL); 1028 if (ibt_status != IBT_SUCCESS) 1029 return; 1030 1031 rwid = (struct recv_wid *)(uintptr_t)wc.wc_id; 1032 qp = rwid->qp; 1033 if (wc.wc_status == IBT_WC_SUCCESS) { 1034 XDR inxdrs, *xdrs; 1035 uint_t xid, vers, op, find_xid = 0; 1036 struct reply *r; 1037 CONN *conn = qptoc(qp); 1038 uint32_t rdma_credit = 0; 1039 1040 xdrs = &inxdrs; 1041 xdrmem_create(xdrs, (caddr_t)(uintptr_t)rwid->addr, 1042 wc.wc_bytes_xfer, XDR_DECODE); 1043 /* 1044 * Treat xid as opaque (xid is the first entity 1045 * in the rpc rdma message). 1046 */ 1047 xid = *(uint32_t *)(uintptr_t)rwid->addr; 1048 1049 /* Skip xid and set the xdr position accordingly. */ 1050 XDR_SETPOS(xdrs, sizeof (uint32_t)); 1051 (void) xdr_u_int(xdrs, &vers); 1052 (void) xdr_u_int(xdrs, &rdma_credit); 1053 (void) xdr_u_int(xdrs, &op); 1054 XDR_DESTROY(xdrs); 1055 1056 if (vers != RPCRDMA_VERS) { 1057 /* 1058 * Invalid RPC/RDMA version. Cannot 1059 * interoperate. Set connection to 1060 * ERROR state and bail out. 1061 */ 1062 mutex_enter(&conn->c_lock); 1063 if (conn->c_state != C_DISCONN_PEND) 1064 conn->c_state = C_ERROR_CONN; 1065 mutex_exit(&conn->c_lock); 1066 rib_rbuf_free(conn, RECV_BUFFER, 1067 (void *)(uintptr_t)rwid->addr); 1068 rib_free_wid(rwid); 1069 continue; 1070 } 1071 1072 mutex_enter(&qp->replylist_lock); 1073 for (r = qp->replylist; r != NULL; r = r->next) { 1074 if (r->xid == xid) { 1075 find_xid = 1; 1076 switch (op) { 1077 case RDMA_MSG: 1078 case RDMA_NOMSG: 1079 case RDMA_MSGP: 1080 r->status = RDMA_SUCCESS; 1081 r->vaddr_cq = rwid->addr; 1082 r->bytes_xfer = 1083 wc.wc_bytes_xfer; 1084 cv_signal(&r->wait_cv); 1085 break; 1086 default: 1087 rib_rbuf_free(qptoc(qp), 1088 RECV_BUFFER, 1089 (void *)(uintptr_t) 1090 rwid->addr); 1091 break; 1092 } 1093 break; 1094 } 1095 } 1096 mutex_exit(&qp->replylist_lock); 1097 if (find_xid == 0) { 1098 /* RPC caller not waiting for reply */ 1099 1100 DTRACE_PROBE1(rpcib__i__nomatchxid1, 1101 int, xid); 1102 1103 rib_rbuf_free(qptoc(qp), RECV_BUFFER, 1104 (void *)(uintptr_t)rwid->addr); 1105 } 1106 } else if (wc.wc_status == IBT_WC_WR_FLUSHED_ERR) { 1107 CONN *conn = qptoc(qp); 1108 1109 /* 1110 * Connection being flushed. Just free 1111 * the posted buffer 1112 */ 1113 rib_rbuf_free(conn, RECV_BUFFER, 1114 (void *)(uintptr_t)rwid->addr); 1115 } else { 1116 CONN *conn = qptoc(qp); 1117 /* 1118 * RC Recv Q Error Code Local state Remote State 1119 * ==================== =========== ============ 1120 * IBT_WC_LOCAL_ACCESS_ERR ERROR ERROR when NAK recvd 1121 * IBT_WC_LOCAL_LEN_ERR ERROR ERROR when NAK recvd 1122 * IBT_WC_LOCAL_PROTECT_ERR ERROR ERROR when NAK recvd 1123 * IBT_WC_LOCAL_CHAN_OP_ERR ERROR ERROR when NAK recvd 1124 * IBT_WC_REMOTE_INVALID_REQ_ERR ERROR ERROR when NAK recvd 1125 * IBT_WC_WR_FLUSHED_ERR None None 1126 */ 1127 /* 1128 * Channel in error state. Set connection 1129 * in ERROR state. 
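			 * As with the send-side error path, cleanup is
			 * expected to happen later from conn_release or
			 * rib_conn_get.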
1130 */ 1131 mutex_enter(&conn->c_lock); 1132 if (conn->c_state != C_DISCONN_PEND) 1133 conn->c_state = C_ERROR_CONN; 1134 mutex_exit(&conn->c_lock); 1135 rib_rbuf_free(conn, RECV_BUFFER, 1136 (void *)(uintptr_t)rwid->addr); 1137 } 1138 rib_free_wid(rwid); 1139 } 1140 } 1141 1142 /* Server side */ 1143 /* ARGSUSED */ 1144 static void 1145 rib_svc_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 1146 { 1147 rdma_recv_data_t *rdp; 1148 rib_qp_t *qp; 1149 ibt_status_t ibt_status; 1150 ibt_wc_t wc; 1151 struct svc_recv *s_recvp; 1152 CONN *conn; 1153 mblk_t *mp; 1154 1155 /* 1156 * Re-enable cq notify here to avoid missing any 1157 * completion queue notification. 1158 */ 1159 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION); 1160 1161 ibt_status = IBT_SUCCESS; 1162 while (ibt_status != IBT_CQ_EMPTY) { 1163 bzero(&wc, sizeof (wc)); 1164 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL); 1165 if (ibt_status != IBT_SUCCESS) 1166 return; 1167 1168 s_recvp = (struct svc_recv *)(uintptr_t)wc.wc_id; 1169 qp = s_recvp->qp; 1170 conn = qptoc(qp); 1171 mutex_enter(&qp->posted_rbufs_lock); 1172 qp->n_posted_rbufs--; 1173 #if defined(MEASURE_POOL_DEPTH) 1174 rib_posted_rbufs(preposted_rbufs - qp->n_posted_rbufs); 1175 #endif 1176 if (qp->n_posted_rbufs == 0) 1177 cv_signal(&qp->posted_rbufs_cv); 1178 mutex_exit(&qp->posted_rbufs_lock); 1179 1180 if (wc.wc_status == IBT_WC_SUCCESS) { 1181 XDR inxdrs, *xdrs; 1182 uint_t xid, vers, op; 1183 uint32_t rdma_credit; 1184 1185 xdrs = &inxdrs; 1186 /* s_recvp->vaddr stores data */ 1187 xdrmem_create(xdrs, (caddr_t)(uintptr_t)s_recvp->vaddr, 1188 wc.wc_bytes_xfer, XDR_DECODE); 1189 1190 /* 1191 * Treat xid as opaque (xid is the first entity 1192 * in the rpc rdma message). 1193 */ 1194 xid = *(uint32_t *)(uintptr_t)s_recvp->vaddr; 1195 /* Skip xid and set the xdr position accordingly. */ 1196 XDR_SETPOS(xdrs, sizeof (uint32_t)); 1197 if (!xdr_u_int(xdrs, &vers) || 1198 !xdr_u_int(xdrs, &rdma_credit) || 1199 !xdr_u_int(xdrs, &op)) { 1200 rib_rbuf_free(conn, RECV_BUFFER, 1201 (void *)(uintptr_t)s_recvp->vaddr); 1202 XDR_DESTROY(xdrs); 1203 (void) rib_free_svc_recv(s_recvp); 1204 continue; 1205 } 1206 XDR_DESTROY(xdrs); 1207 1208 if (vers != RPCRDMA_VERS) { 1209 /* 1210 * Invalid RPC/RDMA version. 1211 * Drop rpc rdma message. 1212 */ 1213 rib_rbuf_free(conn, RECV_BUFFER, 1214 (void *)(uintptr_t)s_recvp->vaddr); 1215 (void) rib_free_svc_recv(s_recvp); 1216 continue; 1217 } 1218 /* 1219 * Is this for RDMA_DONE? 1220 */ 1221 if (op == RDMA_DONE) { 1222 rib_rbuf_free(conn, RECV_BUFFER, 1223 (void *)(uintptr_t)s_recvp->vaddr); 1224 /* 1225 * Wake up the thread waiting on 1226 * a RDMA_DONE for xid 1227 */ 1228 mutex_enter(&qp->rdlist_lock); 1229 rdma_done_notify(qp, xid); 1230 mutex_exit(&qp->rdlist_lock); 1231 (void) rib_free_svc_recv(s_recvp); 1232 continue; 1233 } 1234 1235 mutex_enter(&plugin_state_lock); 1236 if (plugin_state == ACCEPT) { 1237 while ((mp = allocb(sizeof (*rdp), BPRI_LO)) 1238 == NULL) 1239 (void) strwaitbuf( 1240 sizeof (*rdp), BPRI_LO); 1241 /* 1242 * Plugin is in accept state, hence the master 1243 * transport queue for this is still accepting 1244 * requests. Hence we can call svc_queuereq to 1245 * queue this recieved msg. 
1246 */ 1247 rdp = (rdma_recv_data_t *)mp->b_rptr; 1248 rdp->conn = conn; 1249 rdp->rpcmsg.addr = 1250 (caddr_t)(uintptr_t)s_recvp->vaddr; 1251 rdp->rpcmsg.type = RECV_BUFFER; 1252 rdp->rpcmsg.len = wc.wc_bytes_xfer; 1253 rdp->status = wc.wc_status; 1254 mutex_enter(&conn->c_lock); 1255 conn->c_ref++; 1256 mutex_exit(&conn->c_lock); 1257 mp->b_wptr += sizeof (*rdp); 1258 svc_queuereq((queue_t *)rib_stat->q, mp); 1259 mutex_exit(&plugin_state_lock); 1260 } else { 1261 /* 1262 * The master transport for this is going 1263 * away and the queue is not accepting anymore 1264 * requests for krpc, so don't do anything, just 1265 * free the msg. 1266 */ 1267 mutex_exit(&plugin_state_lock); 1268 rib_rbuf_free(conn, RECV_BUFFER, 1269 (void *)(uintptr_t)s_recvp->vaddr); 1270 } 1271 } else { 1272 rib_rbuf_free(conn, RECV_BUFFER, 1273 (void *)(uintptr_t)s_recvp->vaddr); 1274 } 1275 (void) rib_free_svc_recv(s_recvp); 1276 } 1277 } 1278 1279 /* 1280 * Handles DR event of IBT_HCA_DETACH_EVENT. 1281 */ 1282 /* ARGSUSED */ 1283 static void 1284 rib_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl, 1285 ibt_async_code_t code, ibt_async_event_t *event) 1286 { 1287 1288 switch (code) { 1289 case IBT_HCA_ATTACH_EVENT: 1290 /* ignore */ 1291 break; 1292 case IBT_HCA_DETACH_EVENT: 1293 { 1294 ASSERT(rib_stat->hca->hca_hdl == hca_hdl); 1295 rib_detach_hca(rib_stat->hca); 1296 #ifdef DEBUG 1297 cmn_err(CE_NOTE, "rib_async_handler(): HCA being detached!\n"); 1298 #endif 1299 break; 1300 } 1301 #ifdef DEBUG 1302 case IBT_EVENT_PATH_MIGRATED: 1303 cmn_err(CE_NOTE, "rib_async_handler(): " 1304 "IBT_EVENT_PATH_MIGRATED\n"); 1305 break; 1306 case IBT_EVENT_SQD: 1307 cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_SQD\n"); 1308 break; 1309 case IBT_EVENT_COM_EST: 1310 cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_COM_EST\n"); 1311 break; 1312 case IBT_ERROR_CATASTROPHIC_CHAN: 1313 cmn_err(CE_NOTE, "rib_async_handler(): " 1314 "IBT_ERROR_CATASTROPHIC_CHAN\n"); 1315 break; 1316 case IBT_ERROR_INVALID_REQUEST_CHAN: 1317 cmn_err(CE_NOTE, "rib_async_handler(): " 1318 "IBT_ERROR_INVALID_REQUEST_CHAN\n"); 1319 break; 1320 case IBT_ERROR_ACCESS_VIOLATION_CHAN: 1321 cmn_err(CE_NOTE, "rib_async_handler(): " 1322 "IBT_ERROR_ACCESS_VIOLATION_CHAN\n"); 1323 break; 1324 case IBT_ERROR_PATH_MIGRATE_REQ: 1325 cmn_err(CE_NOTE, "rib_async_handler(): " 1326 "IBT_ERROR_PATH_MIGRATE_REQ\n"); 1327 break; 1328 case IBT_ERROR_CQ: 1329 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_CQ\n"); 1330 break; 1331 case IBT_ERROR_PORT_DOWN: 1332 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_PORT_DOWN\n"); 1333 break; 1334 case IBT_EVENT_PORT_UP: 1335 cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_PORT_UP\n"); 1336 break; 1337 case IBT_ASYNC_OPAQUE1: 1338 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE1\n"); 1339 break; 1340 case IBT_ASYNC_OPAQUE2: 1341 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE2\n"); 1342 break; 1343 case IBT_ASYNC_OPAQUE3: 1344 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE3\n"); 1345 break; 1346 case IBT_ASYNC_OPAQUE4: 1347 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE4\n"); 1348 break; 1349 #endif 1350 default: 1351 break; 1352 } 1353 } 1354 1355 /* 1356 * Client's reachable function. 
1357 */ 1358 static rdma_stat 1359 rib_reachable(int addr_type, struct netbuf *raddr, void **handle) 1360 { 1361 rib_hca_t *hca; 1362 rdma_stat status; 1363 1364 /* 1365 * First check if a hca is still attached 1366 */ 1367 *handle = NULL; 1368 rw_enter(&rib_stat->hca->state_lock, RW_READER); 1369 if (rib_stat->hca->state != HCA_INITED) { 1370 rw_exit(&rib_stat->hca->state_lock); 1371 return (RDMA_FAILED); 1372 } 1373 status = rib_ping_srv(addr_type, raddr, &hca); 1374 rw_exit(&rib_stat->hca->state_lock); 1375 1376 if (status == RDMA_SUCCESS) { 1377 *handle = (void *)hca; 1378 return (RDMA_SUCCESS); 1379 } else { 1380 *handle = NULL; 1381 DTRACE_PROBE(rpcib__i__pingfailed); 1382 return (RDMA_FAILED); 1383 } 1384 } 1385 1386 /* Client side qp creation */ 1387 static rdma_stat 1388 rib_clnt_create_chan(rib_hca_t *hca, struct netbuf *raddr, rib_qp_t **qp) 1389 { 1390 rib_qp_t *kqp = NULL; 1391 CONN *conn; 1392 rdma_clnt_cred_ctrl_t *cc_info; 1393 1394 ASSERT(qp != NULL); 1395 *qp = NULL; 1396 1397 kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP); 1398 conn = qptoc(kqp); 1399 kqp->hca = hca; 1400 kqp->rdmaconn.c_rdmamod = &rib_mod; 1401 kqp->rdmaconn.c_private = (caddr_t)kqp; 1402 1403 kqp->mode = RIB_CLIENT; 1404 kqp->chan_flags = IBT_BLOCKING; 1405 conn->c_raddr.buf = kmem_alloc(raddr->len, KM_SLEEP); 1406 bcopy(raddr->buf, conn->c_raddr.buf, raddr->len); 1407 conn->c_raddr.len = conn->c_raddr.maxlen = raddr->len; 1408 /* 1409 * Initialize 1410 */ 1411 cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL); 1412 cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL); 1413 mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock); 1414 mutex_init(&kqp->replylist_lock, NULL, MUTEX_DRIVER, hca->iblock); 1415 mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock); 1416 mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock); 1417 cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL); 1418 mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock); 1419 /* 1420 * Initialize the client credit control 1421 * portion of the rdmaconn struct. 
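	 * Both the granted and in-flight operation counts start at zero;
	 * clnt_cc_cv is presumably what senders wait on for credits to be
	 * granted by the peer.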
1422 */ 1423 kqp->rdmaconn.c_cc_type = RDMA_CC_CLNT; 1424 cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc; 1425 cc_info->clnt_cc_granted_ops = 0; 1426 cc_info->clnt_cc_in_flight_ops = 0; 1427 cv_init(&cc_info->clnt_cc_cv, NULL, CV_DEFAULT, NULL); 1428 1429 *qp = kqp; 1430 return (RDMA_SUCCESS); 1431 } 1432 1433 /* Server side qp creation */ 1434 static rdma_stat 1435 rib_svc_create_chan(rib_hca_t *hca, caddr_t q, uint8_t port, rib_qp_t **qp) 1436 { 1437 rib_qp_t *kqp = NULL; 1438 ibt_chan_sizes_t chan_sizes; 1439 ibt_rc_chan_alloc_args_t qp_attr; 1440 ibt_status_t ibt_status; 1441 rdma_srv_cred_ctrl_t *cc_info; 1442 1443 *qp = NULL; 1444 1445 kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP); 1446 kqp->hca = hca; 1447 kqp->port_num = port; 1448 kqp->rdmaconn.c_rdmamod = &rib_mod; 1449 kqp->rdmaconn.c_private = (caddr_t)kqp; 1450 1451 /* 1452 * Create the qp handle 1453 */ 1454 bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t)); 1455 qp_attr.rc_scq = hca->svc_scq->rib_cq_hdl; 1456 qp_attr.rc_rcq = hca->svc_rcq->rib_cq_hdl; 1457 qp_attr.rc_pd = hca->pd_hdl; 1458 qp_attr.rc_hca_port_num = port; 1459 qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX; 1460 qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX; 1461 qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE; 1462 qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE; 1463 qp_attr.rc_clone_chan = NULL; 1464 qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR; 1465 qp_attr.rc_flags = IBT_WR_SIGNALED; 1466 1467 rw_enter(&hca->state_lock, RW_READER); 1468 if (hca->state != HCA_DETACHED) { 1469 ibt_status = ibt_alloc_rc_channel(hca->hca_hdl, 1470 IBT_ACHAN_NO_FLAGS, &qp_attr, &kqp->qp_hdl, 1471 &chan_sizes); 1472 } else { 1473 rw_exit(&hca->state_lock); 1474 goto fail; 1475 } 1476 rw_exit(&hca->state_lock); 1477 1478 if (ibt_status != IBT_SUCCESS) { 1479 DTRACE_PROBE1(rpcib__i_svccreatechanfail, 1480 int, ibt_status); 1481 goto fail; 1482 } 1483 1484 kqp->mode = RIB_SERVER; 1485 kqp->chan_flags = IBT_BLOCKING; 1486 kqp->q = q; /* server ONLY */ 1487 1488 cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL); 1489 cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL); 1490 mutex_init(&kqp->replylist_lock, NULL, MUTEX_DEFAULT, hca->iblock); 1491 mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock); 1492 mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock); 1493 mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock); 1494 cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL); 1495 mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock); 1496 /* 1497 * Set the private data area to qp to be used in callbacks 1498 */ 1499 ibt_set_chan_private(kqp->qp_hdl, (void *)kqp); 1500 kqp->rdmaconn.c_state = C_CONNECTED; 1501 1502 /* 1503 * Initialize the server credit control 1504 * portion of the rdmaconn struct. 
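	 * The server starts out having granted preposted_rbufs
	 * (RDMA_BUFS_GRANT) receive buffers to the peer;
	 * srv_cc_cur_buffers_used tracks how many are currently consumed.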
 */
	kqp->rdmaconn.c_cc_type = RDMA_CC_SRV;
	cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_srv_cc;
	cc_info->srv_cc_buffers_granted = preposted_rbufs;
	cc_info->srv_cc_cur_buffers_used = 0;
	cc_info->srv_cc_posted = preposted_rbufs;

	*qp = kqp;

	return (RDMA_SUCCESS);
fail:
	if (kqp)
		kmem_free(kqp, sizeof (rib_qp_t));

	return (RDMA_FAILED);
}

/* ARGSUSED */
ibt_cm_status_t
rib_clnt_cm_handler(void *clnt_hdl, ibt_cm_event_t *event,
    ibt_cm_return_args_t *ret_args, void *priv_data,
    ibt_priv_data_len_t len)
{
	rpcib_state_t *ribstat;
	rib_hca_t *hca;

	ribstat = (rpcib_state_t *)clnt_hdl;
	hca = (rib_hca_t *)ribstat->hca;

	switch (event->cm_type) {

	/* got a connection close event */
	case IBT_CM_EVENT_CONN_CLOSED:
	{
		CONN	*conn;
		rib_qp_t *qp;

		/* check reason why connection was closed */
		switch (event->cm_event.closed) {
		case IBT_CM_CLOSED_DREP_RCVD:
		case IBT_CM_CLOSED_DREQ_TIMEOUT:
		case IBT_CM_CLOSED_DUP:
		case IBT_CM_CLOSED_ABORT:
		case IBT_CM_CLOSED_ALREADY:
			/*
			 * These cases indicate that the local end initiated
			 * the closing of the channel. Nothing to do here.
			 */
			break;
		default:
			/*
			 * The reason for the CONN_CLOSED event must be one of
			 * IBT_CM_CLOSED_DREQ_RCVD, IBT_CM_CLOSED_REJ_RCVD
			 * or IBT_CM_CLOSED_STALE. These indicate cases where
			 * the remote end is closing the channel. In these
			 * cases free the channel and transition to the
			 * error state.
			 */
			qp = ibt_get_chan_private(event->cm_channel);
			conn = qptoc(qp);
			mutex_enter(&conn->c_lock);
			if (conn->c_state == C_DISCONN_PEND) {
				mutex_exit(&conn->c_lock);
				break;
			}

			conn->c_state = C_ERROR_CONN;

			/*
			 * Free the rc_channel. The channel has already
			 * transitioned to ERROR state and WRs have been
			 * FLUSHED_ERR already.
1577 */ 1578 (void) ibt_free_channel(qp->qp_hdl); 1579 qp->qp_hdl = NULL; 1580 1581 /* 1582 * Free the conn if c_ref is down to 0 already 1583 */ 1584 if (conn->c_ref == 0) { 1585 /* 1586 * Remove from list and free conn 1587 */ 1588 conn->c_state = C_DISCONN_PEND; 1589 mutex_exit(&conn->c_lock); 1590 (void) rib_disconnect_channel(conn, 1591 &hca->cl_conn_list); 1592 } else { 1593 mutex_exit(&conn->c_lock); 1594 } 1595 #ifdef DEBUG 1596 if (rib_debug) 1597 cmn_err(CE_NOTE, "rib_clnt_cm_handler: " 1598 "(CONN_CLOSED) channel disconnected"); 1599 #endif 1600 break; 1601 } 1602 break; 1603 } 1604 default: 1605 break; 1606 } 1607 return (IBT_CM_ACCEPT); 1608 } 1609 1610 /* Check server ib address */ 1611 rdma_stat 1612 rib_chk_srv_ibaddr(struct netbuf *raddr, 1613 int addr_type, ibt_path_info_t *path, ibt_ip_addr_t *s_ip, 1614 ibt_ip_addr_t *d_ip) 1615 { 1616 struct sockaddr_in *sin4; 1617 struct sockaddr_in6 *sin6; 1618 ibt_status_t ibt_status; 1619 ibt_ip_path_attr_t ipattr; 1620 uint8_t npaths = 0; 1621 ibt_path_ip_src_t srcip; 1622 1623 ASSERT(raddr->buf != NULL); 1624 1625 (void) bzero(path, sizeof (ibt_path_info_t)); 1626 1627 switch (addr_type) { 1628 case AF_INET: 1629 sin4 = (struct sockaddr_in *)raddr->buf; 1630 d_ip->family = AF_INET; 1631 d_ip->un.ip4addr = sin4->sin_addr.s_addr; 1632 break; 1633 1634 case AF_INET6: 1635 sin6 = (struct sockaddr_in6 *)raddr->buf; 1636 d_ip->family = AF_INET6; 1637 d_ip->un.ip6addr = sin6->sin6_addr; 1638 break; 1639 1640 default: 1641 return (RDMA_INVAL); 1642 } 1643 1644 bzero(&ipattr, sizeof (ibt_ip_path_attr_t)); 1645 bzero(&srcip, sizeof (ibt_path_ip_src_t)); 1646 1647 ipattr.ipa_dst_ip = d_ip; 1648 ipattr.ipa_hca_guid = rib_stat->hca->hca_guid; 1649 ipattr.ipa_ndst = 1; 1650 ipattr.ipa_max_paths = 1; 1651 npaths = 0; 1652 1653 ibt_status = ibt_get_ip_paths(rib_stat->ibt_clnt_hdl, 1654 IBT_PATH_NO_FLAGS, 1655 &ipattr, 1656 path, 1657 &npaths, 1658 &srcip); 1659 1660 if (ibt_status != IBT_SUCCESS || 1661 npaths < 1 || 1662 path->pi_hca_guid != rib_stat->hca->hca_guid) { 1663 1664 bzero(s_ip, sizeof (ibt_path_ip_src_t)); 1665 return (RDMA_FAILED); 1666 } 1667 1668 if (srcip.ip_primary.family == AF_INET) { 1669 s_ip->family = AF_INET; 1670 s_ip->un.ip4addr = srcip.ip_primary.un.ip4addr; 1671 } else { 1672 s_ip->family = AF_INET6; 1673 s_ip->un.ip6addr = srcip.ip_primary.un.ip6addr; 1674 } 1675 1676 return (RDMA_SUCCESS); 1677 } 1678 1679 1680 /* 1681 * Connect to the server. 
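 * An RC channel is allocated and opened over the path found earlier by
 * rib_chk_srv_ibaddr(), carrying RDMA/IP CM private data (formatted by
 * ibt_format_ip_private_data()) so that the passive side can presumably
 * recover the source and destination addresses.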
1682 */ 1683 rdma_stat 1684 rib_conn_to_srv(rib_hca_t *hca, rib_qp_t *qp, ibt_path_info_t *path, 1685 ibt_ip_addr_t *s_ip, ibt_ip_addr_t *d_ip) 1686 { 1687 ibt_chan_open_args_t chan_args; /* channel args */ 1688 ibt_chan_sizes_t chan_sizes; 1689 ibt_rc_chan_alloc_args_t qp_attr; 1690 ibt_status_t ibt_status; 1691 ibt_rc_returns_t ret_args; /* conn reject info */ 1692 int refresh = REFRESH_ATTEMPTS; /* refresh if IBT_CM_CONN_STALE */ 1693 ibt_ip_cm_info_t ipcm_info; 1694 uint8_t cmp_ip_pvt[IBT_IP_HDR_PRIV_DATA_SZ]; 1695 1696 1697 (void) bzero(&chan_args, sizeof (chan_args)); 1698 (void) bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t)); 1699 (void) bzero(&ipcm_info, sizeof (ibt_ip_cm_info_t)); 1700 1701 switch (ipcm_info.src_addr.family = s_ip->family) { 1702 case AF_INET: 1703 ipcm_info.src_addr.un.ip4addr = s_ip->un.ip4addr; 1704 break; 1705 case AF_INET6: 1706 ipcm_info.src_addr.un.ip6addr = s_ip->un.ip6addr; 1707 break; 1708 } 1709 1710 switch (ipcm_info.dst_addr.family = d_ip->family) { 1711 case AF_INET: 1712 ipcm_info.dst_addr.un.ip4addr = d_ip->un.ip4addr; 1713 break; 1714 case AF_INET6: 1715 ipcm_info.dst_addr.un.ip6addr = d_ip->un.ip6addr; 1716 break; 1717 } 1718 1719 ipcm_info.src_port = NFS_RDMA_PORT; 1720 1721 ibt_status = ibt_format_ip_private_data(&ipcm_info, 1722 IBT_IP_HDR_PRIV_DATA_SZ, cmp_ip_pvt); 1723 1724 if (ibt_status != IBT_SUCCESS) { 1725 cmn_err(CE_WARN, "ibt_format_ip_private_data failed\n"); 1726 return (-1); 1727 } 1728 1729 qp_attr.rc_hca_port_num = path->pi_prim_cep_path.cep_hca_port_num; 1730 /* Alloc a RC channel */ 1731 qp_attr.rc_scq = hca->clnt_scq->rib_cq_hdl; 1732 qp_attr.rc_rcq = hca->clnt_rcq->rib_cq_hdl; 1733 qp_attr.rc_pd = hca->pd_hdl; 1734 qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX; 1735 qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX; 1736 qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE; 1737 qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE; 1738 qp_attr.rc_clone_chan = NULL; 1739 qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR; 1740 qp_attr.rc_flags = IBT_WR_SIGNALED; 1741 1742 path->pi_sid = ibt_get_ip_sid(IPPROTO_TCP, NFS_RDMA_PORT); 1743 chan_args.oc_path = path; 1744 chan_args.oc_cm_handler = rib_clnt_cm_handler; 1745 chan_args.oc_cm_clnt_private = (void *)rib_stat; 1746 chan_args.oc_rdma_ra_out = 4; 1747 chan_args.oc_rdma_ra_in = 4; 1748 chan_args.oc_path_retry_cnt = 2; 1749 chan_args.oc_path_rnr_retry_cnt = RNR_RETRIES; 1750 chan_args.oc_priv_data = cmp_ip_pvt; 1751 chan_args.oc_priv_data_len = IBT_IP_HDR_PRIV_DATA_SZ; 1752 1753 refresh: 1754 rw_enter(&hca->state_lock, RW_READER); 1755 if (hca->state != HCA_DETACHED) { 1756 ibt_status = ibt_alloc_rc_channel(hca->hca_hdl, 1757 IBT_ACHAN_NO_FLAGS, 1758 &qp_attr, &qp->qp_hdl, 1759 &chan_sizes); 1760 } else { 1761 rw_exit(&hca->state_lock); 1762 return (RDMA_FAILED); 1763 } 1764 rw_exit(&hca->state_lock); 1765 1766 if (ibt_status != IBT_SUCCESS) { 1767 DTRACE_PROBE1(rpcib__i_conntosrv, 1768 int, ibt_status); 1769 return (RDMA_FAILED); 1770 } 1771 1772 /* Connect to the Server */ 1773 (void) bzero(&ret_args, sizeof (ret_args)); 1774 mutex_enter(&qp->cb_lock); 1775 ibt_status = ibt_open_rc_channel(qp->qp_hdl, IBT_OCHAN_NO_FLAGS, 1776 IBT_BLOCKING, &chan_args, &ret_args); 1777 if (ibt_status != IBT_SUCCESS) { 1778 DTRACE_PROBE2(rpcib__i_openrctosrv, 1779 int, ibt_status, int, ret_args.rc_status); 1780 1781 (void) ibt_free_channel(qp->qp_hdl); 1782 qp->qp_hdl = NULL; 1783 mutex_exit(&qp->cb_lock); 1784 if (refresh-- && ibt_status == IBT_CM_FAILURE && 1785 ret_args.rc_status == IBT_CM_CONN_STALE) { 1786 /* 1787 * Got 
IBT_CM_CONN_STALE probably because of stale 1788 * data on the passive end of a channel that existed 1789 * prior to reboot. Retry establishing a channel 1790 * REFRESH_ATTEMPTS times, during which time the 1791 * stale conditions on the server might clear up. 1792 */ 1793 goto refresh; 1794 } 1795 return (RDMA_FAILED); 1796 } 1797 mutex_exit(&qp->cb_lock); 1798 /* 1799 * Set the private data area to qp to be used in callbacks 1800 */ 1801 ibt_set_chan_private(qp->qp_hdl, (void *)qp); 1802 return (RDMA_SUCCESS); 1803 } 1804 1805 rdma_stat 1806 rib_ping_srv(int addr_type, struct netbuf *raddr, rib_hca_t **hca) 1807 { 1808 uint_t i; 1809 ibt_path_info_t path; 1810 ibt_status_t ibt_status; 1811 uint8_t num_paths_p; 1812 ibt_ip_path_attr_t ipattr; 1813 ibt_ip_addr_t dstip; 1814 ibt_path_ip_src_t srcip; 1815 rpcib_ipaddrs_t addrs4; 1816 rpcib_ipaddrs_t addrs6; 1817 struct sockaddr_in *sinp; 1818 struct sockaddr_in6 *sin6p; 1819 rdma_stat retval = RDMA_SUCCESS; 1820 1821 *hca = NULL; 1822 ASSERT(raddr->buf != NULL); 1823 1824 bzero(&path, sizeof (ibt_path_info_t)); 1825 bzero(&ipattr, sizeof (ibt_ip_path_attr_t)); 1826 bzero(&srcip, sizeof (ibt_path_ip_src_t)); 1827 1828 if (!rpcib_get_ib_addresses(&addrs4, &addrs6) || 1829 (addrs4.ri_count == 0 && addrs6.ri_count == 0)) { 1830 retval = RDMA_FAILED; 1831 goto done; 1832 } 1833 1834 /* Prep the destination address */ 1835 switch (addr_type) { 1836 case AF_INET: 1837 sinp = (struct sockaddr_in *)raddr->buf; 1838 dstip.family = AF_INET; 1839 dstip.un.ip4addr = sinp->sin_addr.s_addr; 1840 sinp = addrs4.ri_list; 1841 1842 for (i = 0; i < addrs4.ri_count; i++) { 1843 num_paths_p = 0; 1844 ipattr.ipa_dst_ip = &dstip; 1845 ipattr.ipa_hca_guid = rib_stat->hca->hca_guid; 1846 ipattr.ipa_ndst = 1; 1847 ipattr.ipa_max_paths = 1; 1848 ipattr.ipa_src_ip.family = dstip.family; 1849 ipattr.ipa_src_ip.un.ip4addr = sinp[i].sin_addr.s_addr; 1850 1851 ibt_status = ibt_get_ip_paths(rib_stat->ibt_clnt_hdl, 1852 IBT_PATH_NO_FLAGS, &ipattr, &path, &num_paths_p, 1853 &srcip); 1854 if (ibt_status == IBT_SUCCESS && 1855 num_paths_p != 0 && 1856 path.pi_hca_guid == rib_stat->hca->hca_guid) { 1857 *hca = rib_stat->hca; 1858 goto done; 1859 } 1860 } 1861 retval = RDMA_FAILED; 1862 break; 1863 1864 case AF_INET6: 1865 sin6p = (struct sockaddr_in6 *)raddr->buf; 1866 dstip.family = AF_INET6; 1867 dstip.un.ip6addr = sin6p->sin6_addr; 1868 sin6p = addrs6.ri_list; 1869 1870 for (i = 0; i < addrs6.ri_count; i++) { 1871 num_paths_p = 0; 1872 ipattr.ipa_dst_ip = &dstip; 1873 ipattr.ipa_hca_guid = rib_stat->hca->hca_guid; 1874 ipattr.ipa_ndst = 1; 1875 ipattr.ipa_max_paths = 1; 1876 ipattr.ipa_src_ip.family = dstip.family; 1877 ipattr.ipa_src_ip.un.ip6addr = sin6p[i].sin6_addr; 1878 1879 ibt_status = ibt_get_ip_paths(rib_stat->ibt_clnt_hdl, 1880 IBT_PATH_NO_FLAGS, &ipattr, &path, &num_paths_p, 1881 &srcip); 1882 if (ibt_status == IBT_SUCCESS && 1883 num_paths_p != 0 && 1884 path.pi_hca_guid == rib_stat->hca->hca_guid) { 1885 *hca = rib_stat->hca; 1886 goto done; 1887 } 1888 } 1889 retval = RDMA_FAILED; 1890 break; 1891 1892 default: 1893 retval = RDMA_INVAL; 1894 break; 1895 } 1896 done: 1897 if (addrs4.ri_size > 0) 1898 kmem_free(addrs4.ri_list, addrs4.ri_size); 1899 if (addrs6.ri_size > 0) 1900 kmem_free(addrs6.ri_list, addrs6.ri_size); 1901 return (retval); 1902 } 1903 1904 /* 1905 * Close channel, remove from connection list and 1906 * free up resources allocated for that channel. 
 */
rdma_stat
rib_disconnect_channel(CONN *conn, rib_conn_list_t *conn_list)
{
	rib_qp_t	*qp = ctoqp(conn);
	rib_hca_t	*hca;

	/*
	 * c_ref == 0 and connection is in C_DISCONN_PEND
	 */
	hca = qp->hca;
	if (conn_list != NULL)
		(void) rib_rm_conn(conn, conn_list);

	if (qp->qp_hdl != NULL) {
		/*
		 * If the channel has not been established,
		 * ibt_flush_channel is called to flush outstanding WRs
		 * on the Qs.  Otherwise, ibt_close_rc_channel() is
		 * called.  The channel is then freed.
		 */
		if (conn_list != NULL)
			(void) ibt_close_rc_channel(qp->qp_hdl,
			    IBT_BLOCKING, NULL, 0, NULL, NULL, 0);
		else
			(void) ibt_flush_channel(qp->qp_hdl);

		mutex_enter(&qp->posted_rbufs_lock);
		while (qp->n_posted_rbufs)
			cv_wait(&qp->posted_rbufs_cv, &qp->posted_rbufs_lock);
		mutex_exit(&qp->posted_rbufs_lock);
		(void) ibt_free_channel(qp->qp_hdl);
		qp->qp_hdl = NULL;
	}

	ASSERT(qp->rdlist == NULL);

	if (qp->replylist != NULL) {
		(void) rib_rem_replylist(qp);
	}

	cv_destroy(&qp->cb_conn_cv);
	cv_destroy(&qp->posted_rbufs_cv);
	mutex_destroy(&qp->cb_lock);

	mutex_destroy(&qp->replylist_lock);
	mutex_destroy(&qp->posted_rbufs_lock);
	mutex_destroy(&qp->rdlist_lock);

	cv_destroy(&conn->c_cv);
	mutex_destroy(&conn->c_lock);

	if (conn->c_raddr.buf != NULL) {
		kmem_free(conn->c_raddr.buf, conn->c_raddr.len);
	}
	if (conn->c_laddr.buf != NULL) {
		kmem_free(conn->c_laddr.buf, conn->c_laddr.len);
	}

	/*
	 * Credit control cleanup.
	 */
	if (qp->rdmaconn.c_cc_type == RDMA_CC_CLNT) {
		rdma_clnt_cred_ctrl_t *cc_info;
		cc_info = &qp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc;
		cv_destroy(&cc_info->clnt_cc_cv);
	}

	kmem_free(qp, sizeof (rib_qp_t));

	/*
	 * If the HCA has been DETACHED and the srv/clnt_conn_list is NULL,
	 * then the hca is no longer being used.
	 */
	if (conn_list != NULL) {
		rw_enter(&hca->state_lock, RW_READER);
		if (hca->state == HCA_DETACHED) {
			rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
			if (hca->srv_conn_list.conn_hd == NULL) {
				rw_enter(&hca->cl_conn_list.conn_lock,
				    RW_READER);

				if (hca->cl_conn_list.conn_hd == NULL) {
					mutex_enter(&hca->inuse_lock);
					hca->inuse = FALSE;
					cv_signal(&hca->cb_cv);
					mutex_exit(&hca->inuse_lock);
				}
				rw_exit(&hca->cl_conn_list.conn_lock);
			}
			rw_exit(&hca->srv_conn_list.conn_lock);
		}
		rw_exit(&hca->state_lock);
	}

	return (RDMA_SUCCESS);
}

/*
 * Wait for send completion notification. Only on receiving a
 * notification, be it a successful or an error completion, is the
 * send_wid freed.
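 * On timeout (SEND_WAIT_TIME seconds) RDMA_TIMEDOUT is returned with
 * wd->cv_sig cleared, so the send buffers and the wid are left for the
 * send-completion handler to free when the completion finally arrives.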
2009 */ 2010 static rdma_stat 2011 rib_sendwait(rib_qp_t *qp, struct send_wid *wd) 2012 { 2013 clock_t timout, cv_wait_ret; 2014 rdma_stat error = RDMA_SUCCESS; 2015 int i; 2016 2017 /* 2018 * Wait for send to complete 2019 */ 2020 ASSERT(wd != NULL); 2021 mutex_enter(&wd->sendwait_lock); 2022 if (wd->status == (uint_t)SEND_WAIT) { 2023 timout = drv_usectohz(SEND_WAIT_TIME * 1000000) + 2024 ddi_get_lbolt(); 2025 2026 if (qp->mode == RIB_SERVER) { 2027 while ((cv_wait_ret = cv_timedwait(&wd->wait_cv, 2028 &wd->sendwait_lock, timout)) > 0 && 2029 wd->status == (uint_t)SEND_WAIT) 2030 ; 2031 switch (cv_wait_ret) { 2032 case -1: /* timeout */ 2033 DTRACE_PROBE(rpcib__i__srvsendwait__timeout); 2034 2035 wd->cv_sig = 0; /* no signal needed */ 2036 error = RDMA_TIMEDOUT; 2037 break; 2038 default: /* got send completion */ 2039 break; 2040 } 2041 } else { 2042 while ((cv_wait_ret = cv_timedwait_sig(&wd->wait_cv, 2043 &wd->sendwait_lock, timout)) > 0 && 2044 wd->status == (uint_t)SEND_WAIT) 2045 ; 2046 switch (cv_wait_ret) { 2047 case -1: /* timeout */ 2048 DTRACE_PROBE(rpcib__i__clntsendwait__timeout); 2049 2050 wd->cv_sig = 0; /* no signal needed */ 2051 error = RDMA_TIMEDOUT; 2052 break; 2053 case 0: /* interrupted */ 2054 DTRACE_PROBE(rpcib__i__clntsendwait__intr); 2055 2056 wd->cv_sig = 0; /* no signal needed */ 2057 error = RDMA_INTR; 2058 break; 2059 default: /* got send completion */ 2060 break; 2061 } 2062 } 2063 } 2064 2065 if (wd->status != (uint_t)SEND_WAIT) { 2066 /* got send completion */ 2067 if (wd->status != RDMA_SUCCESS) { 2068 error = wd->status; 2069 if (wd->status != RDMA_CONNLOST) 2070 error = RDMA_FAILED; 2071 } 2072 for (i = 0; i < wd->nsbufs; i++) { 2073 rib_rbuf_free(qptoc(qp), SEND_BUFFER, 2074 (void *)(uintptr_t)wd->sbufaddr[i]); 2075 } 2076 mutex_exit(&wd->sendwait_lock); 2077 (void) rib_free_sendwait(wd); 2078 } else { 2079 mutex_exit(&wd->sendwait_lock); 2080 } 2081 return (error); 2082 } 2083 2084 static struct send_wid * 2085 rib_init_sendwait(uint32_t xid, int cv_sig, rib_qp_t *qp) 2086 { 2087 struct send_wid *wd; 2088 2089 wd = kmem_zalloc(sizeof (struct send_wid), KM_SLEEP); 2090 wd->xid = xid; 2091 wd->cv_sig = cv_sig; 2092 wd->qp = qp; 2093 cv_init(&wd->wait_cv, NULL, CV_DEFAULT, NULL); 2094 mutex_init(&wd->sendwait_lock, NULL, MUTEX_DRIVER, NULL); 2095 wd->status = (uint_t)SEND_WAIT; 2096 2097 return (wd); 2098 } 2099 2100 static int 2101 rib_free_sendwait(struct send_wid *wdesc) 2102 { 2103 cv_destroy(&wdesc->wait_cv); 2104 mutex_destroy(&wdesc->sendwait_lock); 2105 kmem_free(wdesc, sizeof (*wdesc)); 2106 2107 return (0); 2108 } 2109 2110 static rdma_stat 2111 rib_rem_rep(rib_qp_t *qp, struct reply *rep) 2112 { 2113 mutex_enter(&qp->replylist_lock); 2114 if (rep != NULL) { 2115 (void) rib_remreply(qp, rep); 2116 mutex_exit(&qp->replylist_lock); 2117 return (RDMA_SUCCESS); 2118 } 2119 mutex_exit(&qp->replylist_lock); 2120 return (RDMA_FAILED); 2121 } 2122 2123 /* 2124 * Send buffers are freed here only in case of error in posting 2125 * on QP. If the post succeeded, the send buffers are freed upon 2126 * send completion in rib_sendwait() or in the scq_handler. 
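 * send_sig selects a signaled send WR (IBT_WR_SEND_SIGNAL); cv_sig
 * additionally makes this routine block in rib_sendwait() for the
 * completion.  With send_sig set and cv_sig clear the completion is
 * consumed asynchronously and the send buffers are freed there.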
2127 */ 2128 rdma_stat 2129 rib_send_and_wait(CONN *conn, struct clist *cl, uint32_t msgid, 2130 int send_sig, int cv_sig, caddr_t *swid) 2131 { 2132 struct send_wid *wdesc; 2133 struct clist *clp; 2134 ibt_status_t ibt_status = IBT_SUCCESS; 2135 rdma_stat ret = RDMA_SUCCESS; 2136 ibt_send_wr_t tx_wr; 2137 int i, nds; 2138 ibt_wr_ds_t sgl[DSEG_MAX]; 2139 uint_t total_msg_size; 2140 rib_qp_t *qp; 2141 2142 qp = ctoqp(conn); 2143 2144 ASSERT(cl != NULL); 2145 2146 bzero(&tx_wr, sizeof (ibt_send_wr_t)); 2147 2148 nds = 0; 2149 total_msg_size = 0; 2150 clp = cl; 2151 while (clp != NULL) { 2152 if (nds >= DSEG_MAX) { 2153 DTRACE_PROBE(rpcib__i__sendandwait_dsegmax_exceeded); 2154 return (RDMA_FAILED); 2155 } 2156 sgl[nds].ds_va = clp->w.c_saddr; 2157 sgl[nds].ds_key = clp->c_smemhandle.mrc_lmr; /* lkey */ 2158 sgl[nds].ds_len = clp->c_len; 2159 total_msg_size += clp->c_len; 2160 clp = clp->c_next; 2161 nds++; 2162 } 2163 2164 if (send_sig) { 2165 /* Set SEND_SIGNAL flag. */ 2166 tx_wr.wr_flags = IBT_WR_SEND_SIGNAL; 2167 wdesc = rib_init_sendwait(msgid, cv_sig, qp); 2168 *swid = (caddr_t)wdesc; 2169 } else { 2170 tx_wr.wr_flags = IBT_WR_NO_FLAGS; 2171 wdesc = rib_init_sendwait(msgid, 0, qp); 2172 *swid = (caddr_t)wdesc; 2173 } 2174 wdesc->nsbufs = nds; 2175 for (i = 0; i < nds; i++) { 2176 wdesc->sbufaddr[i] = sgl[i].ds_va; 2177 } 2178 2179 tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc; 2180 tx_wr.wr_opcode = IBT_WRC_SEND; 2181 tx_wr.wr_trans = IBT_RC_SRV; 2182 tx_wr.wr_nds = nds; 2183 tx_wr.wr_sgl = sgl; 2184 2185 mutex_enter(&conn->c_lock); 2186 if (conn->c_state == C_CONNECTED) { 2187 ibt_status = ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL); 2188 } 2189 if (conn->c_state != C_CONNECTED || 2190 ibt_status != IBT_SUCCESS) { 2191 if (conn->c_state != C_DISCONN_PEND) 2192 conn->c_state = C_ERROR_CONN; 2193 mutex_exit(&conn->c_lock); 2194 for (i = 0; i < nds; i++) { 2195 rib_rbuf_free(conn, SEND_BUFFER, 2196 (void *)(uintptr_t)wdesc->sbufaddr[i]); 2197 } 2198 2199 (void) rib_free_sendwait(wdesc); 2200 2201 return (RDMA_CONNLOST); 2202 } 2203 mutex_exit(&conn->c_lock); 2204 2205 if (send_sig) { 2206 if (cv_sig) { 2207 /* 2208 * cv_wait for send to complete. 2209 * We can fail due to a timeout or signal or 2210 * unsuccessful send. 2211 */ 2212 ret = rib_sendwait(qp, wdesc); 2213 2214 return (ret); 2215 } 2216 } 2217 2218 return (RDMA_SUCCESS); 2219 } 2220 2221 2222 rdma_stat 2223 rib_send(CONN *conn, struct clist *cl, uint32_t msgid) 2224 { 2225 rdma_stat ret; 2226 caddr_t wd; 2227 2228 /* send-wait & cv_signal */ 2229 ret = rib_send_and_wait(conn, cl, msgid, 1, 1, &wd); 2230 return (ret); 2231 } 2232 2233 /* 2234 * Server interface (svc_rdma_ksend). 2235 * Send RPC reply and wait for RDMA_DONE. 
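 * The reply is posted with a signaled send but without waiting on the
 * send itself (cv_sig is 0); the routine then waits up to REPLY_WAIT_TIME
 * seconds on the rdma_done entry for this xid, which rdma_done_notify()
 * signals when the client's RDMA_DONE arrives.  Roughly (illustrative):
 *	rd = rdma_done_add(qp, msgid);
 *	(void) rib_send_and_wait(conn, cl, msgid, 1, 0, wid);
 *	cv_timedwait(&rd->rdma_done_cv, &qp->rdlist_lock, timout);
 *	rdma_done_rm(qp, rd);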
2236 */ 2237 rdma_stat 2238 rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid) 2239 { 2240 rdma_stat ret = RDMA_SUCCESS; 2241 struct rdma_done_list *rd; 2242 clock_t timout, cv_wait_ret; 2243 caddr_t *wid = NULL; 2244 rib_qp_t *qp = ctoqp(conn); 2245 2246 mutex_enter(&qp->rdlist_lock); 2247 rd = rdma_done_add(qp, msgid); 2248 2249 /* No cv_signal (whether send-wait or no-send-wait) */ 2250 ret = rib_send_and_wait(conn, cl, msgid, 1, 0, wid); 2251 2252 if (ret != RDMA_SUCCESS) { 2253 rdma_done_rm(qp, rd); 2254 } else { 2255 /* 2256 * Wait for RDMA_DONE from remote end 2257 */ 2258 timout = 2259 drv_usectohz(REPLY_WAIT_TIME * 1000000) + ddi_get_lbolt(); 2260 cv_wait_ret = cv_timedwait(&rd->rdma_done_cv, 2261 &qp->rdlist_lock, 2262 timout); 2263 2264 rdma_done_rm(qp, rd); 2265 2266 if (cv_wait_ret < 0) { 2267 ret = RDMA_TIMEDOUT; 2268 } 2269 } 2270 2271 mutex_exit(&qp->rdlist_lock); 2272 return (ret); 2273 } 2274 2275 static struct recv_wid * 2276 rib_create_wid(rib_qp_t *qp, ibt_wr_ds_t *sgl, uint32_t msgid) 2277 { 2278 struct recv_wid *rwid; 2279 2280 rwid = kmem_zalloc(sizeof (struct recv_wid), KM_SLEEP); 2281 rwid->xid = msgid; 2282 rwid->addr = sgl->ds_va; 2283 rwid->qp = qp; 2284 2285 return (rwid); 2286 } 2287 2288 static void 2289 rib_free_wid(struct recv_wid *rwid) 2290 { 2291 kmem_free(rwid, sizeof (struct recv_wid)); 2292 } 2293 2294 rdma_stat 2295 rib_clnt_post(CONN* conn, struct clist *cl, uint32_t msgid) 2296 { 2297 rib_qp_t *qp = ctoqp(conn); 2298 struct clist *clp = cl; 2299 struct reply *rep; 2300 struct recv_wid *rwid; 2301 int nds; 2302 ibt_wr_ds_t sgl[DSEG_MAX]; 2303 ibt_recv_wr_t recv_wr; 2304 rdma_stat ret; 2305 ibt_status_t ibt_status; 2306 2307 /* 2308 * rdma_clnt_postrecv uses RECV_BUFFER. 2309 */ 2310 2311 nds = 0; 2312 while (cl != NULL) { 2313 if (nds >= DSEG_MAX) { 2314 ret = RDMA_FAILED; 2315 goto done; 2316 } 2317 sgl[nds].ds_va = cl->w.c_saddr; 2318 sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */ 2319 sgl[nds].ds_len = cl->c_len; 2320 cl = cl->c_next; 2321 nds++; 2322 } 2323 2324 if (nds != 1) { 2325 ret = RDMA_FAILED; 2326 goto done; 2327 } 2328 2329 bzero(&recv_wr, sizeof (ibt_recv_wr_t)); 2330 recv_wr.wr_nds = nds; 2331 recv_wr.wr_sgl = sgl; 2332 2333 rwid = rib_create_wid(qp, &sgl[0], msgid); 2334 if (rwid) { 2335 recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)rwid; 2336 } else { 2337 ret = RDMA_NORESOURCE; 2338 goto done; 2339 } 2340 rep = rib_addreplylist(qp, msgid); 2341 if (!rep) { 2342 rib_free_wid(rwid); 2343 ret = RDMA_NORESOURCE; 2344 goto done; 2345 } 2346 2347 mutex_enter(&conn->c_lock); 2348 2349 if (conn->c_state == C_CONNECTED) { 2350 ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL); 2351 } 2352 2353 if (conn->c_state != C_CONNECTED || 2354 ibt_status != IBT_SUCCESS) { 2355 if (conn->c_state != C_DISCONN_PEND) 2356 conn->c_state = C_ERROR_CONN; 2357 mutex_exit(&conn->c_lock); 2358 rib_free_wid(rwid); 2359 (void) rib_rem_rep(qp, rep); 2360 ret = RDMA_CONNLOST; 2361 goto done; 2362 } 2363 mutex_exit(&conn->c_lock); 2364 return (RDMA_SUCCESS); 2365 2366 done: 2367 while (clp != NULL) { 2368 rib_rbuf_free(conn, RECV_BUFFER, 2369 (void *)(uintptr_t)clp->w.c_saddr3); 2370 clp = clp->c_next; 2371 } 2372 return (ret); 2373 } 2374 2375 rdma_stat 2376 rib_svc_post(CONN* conn, struct clist *cl) 2377 { 2378 rib_qp_t *qp = ctoqp(conn); 2379 struct svc_recv *s_recvp; 2380 int nds; 2381 ibt_wr_ds_t sgl[DSEG_MAX]; 2382 ibt_recv_wr_t recv_wr; 2383 ibt_status_t ibt_status; 2384 2385 nds = 0; 2386 while (cl != NULL) { 2387 if (nds >= DSEG_MAX) { 
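/* More chunks than a single recv WR can carry; fail the post. */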
2388 return (RDMA_FAILED); 2389 } 2390 sgl[nds].ds_va = cl->w.c_saddr; 2391 sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */ 2392 sgl[nds].ds_len = cl->c_len; 2393 cl = cl->c_next; 2394 nds++; 2395 } 2396 2397 if (nds != 1) { 2398 rib_rbuf_free(conn, RECV_BUFFER, 2399 (caddr_t)(uintptr_t)sgl[0].ds_va); 2400 2401 return (RDMA_FAILED); 2402 } 2403 2404 bzero(&recv_wr, sizeof (ibt_recv_wr_t)); 2405 recv_wr.wr_nds = nds; 2406 recv_wr.wr_sgl = sgl; 2407 2408 s_recvp = rib_init_svc_recv(qp, &sgl[0]); 2409 /* Use s_recvp's addr as wr id */ 2410 recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)s_recvp; 2411 mutex_enter(&conn->c_lock); 2412 if (conn->c_state == C_CONNECTED) { 2413 ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL); 2414 } 2415 if (conn->c_state != C_CONNECTED || 2416 ibt_status != IBT_SUCCESS) { 2417 if (conn->c_state != C_DISCONN_PEND) 2418 conn->c_state = C_ERROR_CONN; 2419 mutex_exit(&conn->c_lock); 2420 rib_rbuf_free(conn, RECV_BUFFER, 2421 (caddr_t)(uintptr_t)sgl[0].ds_va); 2422 (void) rib_free_svc_recv(s_recvp); 2423 2424 return (RDMA_CONNLOST); 2425 } 2426 mutex_exit(&conn->c_lock); 2427 2428 return (RDMA_SUCCESS); 2429 } 2430 2431 /* Client */ 2432 rdma_stat 2433 rib_post_resp(CONN* conn, struct clist *cl, uint32_t msgid) 2434 { 2435 2436 return (rib_clnt_post(conn, cl, msgid)); 2437 } 2438 2439 /* Client */ 2440 rdma_stat 2441 rib_post_resp_remove(CONN* conn, uint32_t msgid) 2442 { 2443 rib_qp_t *qp = ctoqp(conn); 2444 struct reply *rep; 2445 2446 mutex_enter(&qp->replylist_lock); 2447 for (rep = qp->replylist; rep != NULL; rep = rep->next) { 2448 if (rep->xid == msgid) { 2449 if (rep->vaddr_cq) { 2450 rib_rbuf_free(conn, RECV_BUFFER, 2451 (caddr_t)(uintptr_t)rep->vaddr_cq); 2452 } 2453 (void) rib_remreply(qp, rep); 2454 break; 2455 } 2456 } 2457 mutex_exit(&qp->replylist_lock); 2458 2459 return (RDMA_SUCCESS); 2460 } 2461 2462 /* Server */ 2463 rdma_stat 2464 rib_post_recv(CONN *conn, struct clist *cl) 2465 { 2466 rib_qp_t *qp = ctoqp(conn); 2467 2468 if (rib_svc_post(conn, cl) == RDMA_SUCCESS) { 2469 mutex_enter(&qp->posted_rbufs_lock); 2470 qp->n_posted_rbufs++; 2471 mutex_exit(&qp->posted_rbufs_lock); 2472 return (RDMA_SUCCESS); 2473 } 2474 return (RDMA_FAILED); 2475 } 2476 2477 /* 2478 * Client side only interface to "recv" the rpc reply buf 2479 * posted earlier by rib_post_resp(conn, cl, msgid). 2480 */ 2481 rdma_stat 2482 rib_recv(CONN *conn, struct clist **clp, uint32_t msgid) 2483 { 2484 struct reply *rep = NULL; 2485 clock_t timout, cv_wait_ret; 2486 rdma_stat ret = RDMA_SUCCESS; 2487 rib_qp_t *qp = ctoqp(conn); 2488 2489 /* 2490 * Find the reply structure for this msgid 2491 */ 2492 mutex_enter(&qp->replylist_lock); 2493 2494 for (rep = qp->replylist; rep != NULL; rep = rep->next) { 2495 if (rep->xid == msgid) 2496 break; 2497 } 2498 2499 if (rep != NULL) { 2500 /* 2501 * If message not yet received, wait. 
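 * The wait is interruptible and bounded by REPLY_WAIT_TIME; a timeout
 * returns RDMA_TIMEDOUT and a signal returns RDMA_INTR, and the reply
 * entry is removed below in either case.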
2502 */ 2503 if (rep->status == (uint_t)REPLY_WAIT) { 2504 timout = ddi_get_lbolt() + 2505 drv_usectohz(REPLY_WAIT_TIME * 1000000); 2506 2507 while ((cv_wait_ret = cv_timedwait_sig(&rep->wait_cv, 2508 &qp->replylist_lock, timout)) > 0 && 2509 rep->status == (uint_t)REPLY_WAIT) 2510 ; 2511 2512 switch (cv_wait_ret) { 2513 case -1: /* timeout */ 2514 ret = RDMA_TIMEDOUT; 2515 break; 2516 case 0: 2517 ret = RDMA_INTR; 2518 break; 2519 default: 2520 break; 2521 } 2522 } 2523 2524 if (rep->status == RDMA_SUCCESS) { 2525 struct clist *cl = NULL; 2526 2527 /* 2528 * Got message successfully 2529 */ 2530 clist_add(&cl, 0, rep->bytes_xfer, NULL, 2531 (caddr_t)(uintptr_t)rep->vaddr_cq, NULL, NULL); 2532 *clp = cl; 2533 } else { 2534 if (rep->status != (uint_t)REPLY_WAIT) { 2535 /* 2536 * Got error in reply message. Free 2537 * recv buffer here. 2538 */ 2539 ret = rep->status; 2540 rib_rbuf_free(conn, RECV_BUFFER, 2541 (caddr_t)(uintptr_t)rep->vaddr_cq); 2542 } 2543 } 2544 (void) rib_remreply(qp, rep); 2545 } else { 2546 /* 2547 * No matching reply structure found for given msgid on the 2548 * reply wait list. 2549 */ 2550 ret = RDMA_INVAL; 2551 DTRACE_PROBE(rpcib__i__nomatchxid2); 2552 } 2553 2554 /* 2555 * Done. 2556 */ 2557 mutex_exit(&qp->replylist_lock); 2558 return (ret); 2559 } 2560 2561 /* 2562 * RDMA write a buffer to the remote address. 2563 */ 2564 rdma_stat 2565 rib_write(CONN *conn, struct clist *cl, int wait) 2566 { 2567 ibt_send_wr_t tx_wr; 2568 int cv_sig; 2569 int i; 2570 ibt_wr_ds_t sgl[DSEG_MAX]; 2571 struct send_wid *wdesc; 2572 ibt_status_t ibt_status; 2573 rdma_stat ret = RDMA_SUCCESS; 2574 rib_qp_t *qp = ctoqp(conn); 2575 uint64_t n_writes = 0; 2576 bool_t force_wait = FALSE; 2577 2578 if (cl == NULL) { 2579 return (RDMA_FAILED); 2580 } 2581 2582 2583 while ((cl != NULL)) { 2584 if (cl->c_len > 0) { 2585 bzero(&tx_wr, sizeof (ibt_send_wr_t)); 2586 tx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->u.c_daddr; 2587 tx_wr.wr.rc.rcwr.rdma.rdma_rkey = 2588 cl->c_dmemhandle.mrc_rmr; /* rkey */ 2589 sgl[0].ds_va = cl->w.c_saddr; 2590 sgl[0].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */ 2591 sgl[0].ds_len = cl->c_len; 2592 2593 if (wait) { 2594 tx_wr.wr_flags = IBT_WR_SEND_SIGNAL; 2595 cv_sig = 1; 2596 } else { 2597 if (n_writes > max_unsignaled_rws) { 2598 n_writes = 0; 2599 force_wait = TRUE; 2600 tx_wr.wr_flags = IBT_WR_SEND_SIGNAL; 2601 cv_sig = 1; 2602 } else { 2603 tx_wr.wr_flags = IBT_WR_NO_FLAGS; 2604 cv_sig = 0; 2605 } 2606 } 2607 2608 wdesc = rib_init_sendwait(0, cv_sig, qp); 2609 tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc; 2610 tx_wr.wr_opcode = IBT_WRC_RDMAW; 2611 tx_wr.wr_trans = IBT_RC_SRV; 2612 tx_wr.wr_nds = 1; 2613 tx_wr.wr_sgl = sgl; 2614 2615 mutex_enter(&conn->c_lock); 2616 if (conn->c_state == C_CONNECTED) { 2617 ibt_status = 2618 ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL); 2619 } 2620 if (conn->c_state != C_CONNECTED || 2621 ibt_status != IBT_SUCCESS) { 2622 if (conn->c_state != C_DISCONN_PEND) 2623 conn->c_state = C_ERROR_CONN; 2624 mutex_exit(&conn->c_lock); 2625 (void) rib_free_sendwait(wdesc); 2626 return (RDMA_CONNLOST); 2627 } 2628 mutex_exit(&conn->c_lock); 2629 2630 /* 2631 * Wait for send to complete 2632 */ 2633 if (wait || force_wait) { 2634 force_wait = FALSE; 2635 ret = rib_sendwait(qp, wdesc); 2636 if (ret != 0) { 2637 return (ret); 2638 } 2639 } else { 2640 mutex_enter(&wdesc->sendwait_lock); 2641 for (i = 0; i < wdesc->nsbufs; i++) { 2642 rib_rbuf_free(qptoc(qp), SEND_BUFFER, 2643 (void *)(uintptr_t) 2644 wdesc->sbufaddr[i]); 2645 } 2646 
mutex_exit(&wdesc->sendwait_lock); 2647 (void) rib_free_sendwait(wdesc); 2648 } 2649 n_writes ++; 2650 } 2651 cl = cl->c_next; 2652 } 2653 return (RDMA_SUCCESS); 2654 } 2655 2656 /* 2657 * RDMA Read a buffer from the remote address. 2658 */ 2659 rdma_stat 2660 rib_read(CONN *conn, struct clist *cl, int wait) 2661 { 2662 ibt_send_wr_t rx_wr; 2663 int cv_sig; 2664 int i; 2665 ibt_wr_ds_t sgl; 2666 struct send_wid *wdesc; 2667 ibt_status_t ibt_status = IBT_SUCCESS; 2668 rdma_stat ret = RDMA_SUCCESS; 2669 rib_qp_t *qp = ctoqp(conn); 2670 2671 if (cl == NULL) { 2672 return (RDMA_FAILED); 2673 } 2674 2675 while (cl != NULL) { 2676 bzero(&rx_wr, sizeof (ibt_send_wr_t)); 2677 /* 2678 * Remote address is at the head chunk item in list. 2679 */ 2680 rx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->w.c_saddr; 2681 rx_wr.wr.rc.rcwr.rdma.rdma_rkey = cl->c_smemhandle.mrc_rmr; 2682 2683 sgl.ds_va = cl->u.c_daddr; 2684 sgl.ds_key = cl->c_dmemhandle.mrc_lmr; /* lkey */ 2685 sgl.ds_len = cl->c_len; 2686 2687 if (wait) { 2688 rx_wr.wr_flags = IBT_WR_SEND_SIGNAL; 2689 cv_sig = 1; 2690 } else { 2691 rx_wr.wr_flags = IBT_WR_NO_FLAGS; 2692 cv_sig = 0; 2693 } 2694 2695 wdesc = rib_init_sendwait(0, cv_sig, qp); 2696 rx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc; 2697 rx_wr.wr_opcode = IBT_WRC_RDMAR; 2698 rx_wr.wr_trans = IBT_RC_SRV; 2699 rx_wr.wr_nds = 1; 2700 rx_wr.wr_sgl = &sgl; 2701 2702 mutex_enter(&conn->c_lock); 2703 if (conn->c_state == C_CONNECTED) { 2704 ibt_status = ibt_post_send(qp->qp_hdl, &rx_wr, 1, NULL); 2705 } 2706 if (conn->c_state != C_CONNECTED || 2707 ibt_status != IBT_SUCCESS) { 2708 if (conn->c_state != C_DISCONN_PEND) 2709 conn->c_state = C_ERROR_CONN; 2710 mutex_exit(&conn->c_lock); 2711 (void) rib_free_sendwait(wdesc); 2712 return (RDMA_CONNLOST); 2713 } 2714 mutex_exit(&conn->c_lock); 2715 2716 /* 2717 * Wait for send to complete if this is the 2718 * last item in the list. 2719 */ 2720 if (wait && cl->c_next == NULL) { 2721 ret = rib_sendwait(qp, wdesc); 2722 if (ret != 0) { 2723 return (ret); 2724 } 2725 } else { 2726 mutex_enter(&wdesc->sendwait_lock); 2727 for (i = 0; i < wdesc->nsbufs; i++) { 2728 rib_rbuf_free(qptoc(qp), SEND_BUFFER, 2729 (void *)(uintptr_t)wdesc->sbufaddr[i]); 2730 } 2731 mutex_exit(&wdesc->sendwait_lock); 2732 (void) rib_free_sendwait(wdesc); 2733 } 2734 cl = cl->c_next; 2735 } 2736 return (RDMA_SUCCESS); 2737 } 2738 2739 /* 2740 * rib_srv_cm_handler() 2741 * Connection Manager callback to handle RC connection requests. 2742 */ 2743 /* ARGSUSED */ 2744 static ibt_cm_status_t 2745 rib_srv_cm_handler(void *any, ibt_cm_event_t *event, 2746 ibt_cm_return_args_t *ret_args, void *priv_data, 2747 ibt_priv_data_len_t len) 2748 { 2749 queue_t *q; 2750 rib_qp_t *qp; 2751 rpcib_state_t *ribstat; 2752 rib_hca_t *hca; 2753 rdma_stat status = RDMA_SUCCESS; 2754 int i; 2755 struct clist cl; 2756 rdma_buf_t rdbuf = {0}; 2757 void *buf = NULL; 2758 CONN *conn; 2759 ibt_ip_cm_info_t ipinfo; 2760 struct sockaddr_in *s; 2761 struct sockaddr_in6 *s6; 2762 int sin_size = sizeof (struct sockaddr_in); 2763 int in_size = sizeof (struct in_addr); 2764 int sin6_size = sizeof (struct sockaddr_in6); 2765 2766 ASSERT(any != NULL); 2767 ASSERT(event != NULL); 2768 2769 ribstat = (rpcib_state_t *)any; 2770 hca = (rib_hca_t *)ribstat->hca; 2771 ASSERT(hca != NULL); 2772 2773 /* got a connection request */ 2774 switch (event->cm_type) { 2775 case IBT_CM_EVENT_REQ_RCV: 2776 /* 2777 * If the plugin is in the NO_ACCEPT state, bail out. 
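 * plugin_state is flipped to NO_ACCEPT by rib_listen_stop(), so
 * rejecting here keeps new REQs from creating channels while the
 * listener is being torn down.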
2778 */ 2779 mutex_enter(&plugin_state_lock); 2780 if (plugin_state == NO_ACCEPT) { 2781 mutex_exit(&plugin_state_lock); 2782 return (IBT_CM_REJECT); 2783 } 2784 mutex_exit(&plugin_state_lock); 2785 2786 /* 2787 * Need to send a MRA MAD to CM so that it does not 2788 * timeout on us. 2789 */ 2790 (void) ibt_cm_delay(IBT_CM_DELAY_REQ, event->cm_session_id, 2791 event->cm_event.req.req_timeout * 8, NULL, 0); 2792 2793 mutex_enter(&rib_stat->open_hca_lock); 2794 q = rib_stat->q; 2795 mutex_exit(&rib_stat->open_hca_lock); 2796 2797 status = rib_svc_create_chan(hca, (caddr_t)q, 2798 event->cm_event.req.req_prim_hca_port, &qp); 2799 2800 if (status) { 2801 return (IBT_CM_REJECT); 2802 } 2803 2804 ret_args->cm_ret.rep.cm_channel = qp->qp_hdl; 2805 ret_args->cm_ret.rep.cm_rdma_ra_out = 4; 2806 ret_args->cm_ret.rep.cm_rdma_ra_in = 4; 2807 ret_args->cm_ret.rep.cm_rnr_retry_cnt = RNR_RETRIES; 2808 2809 /* 2810 * Pre-posts RECV buffers 2811 */ 2812 conn = qptoc(qp); 2813 for (i = 0; i < preposted_rbufs; i++) { 2814 bzero(&rdbuf, sizeof (rdbuf)); 2815 rdbuf.type = RECV_BUFFER; 2816 buf = rib_rbuf_alloc(conn, &rdbuf); 2817 if (buf == NULL) { 2818 (void) rib_disconnect_channel(conn, NULL); 2819 return (IBT_CM_REJECT); 2820 } 2821 2822 bzero(&cl, sizeof (cl)); 2823 cl.w.c_saddr3 = (caddr_t)rdbuf.addr; 2824 cl.c_len = rdbuf.len; 2825 cl.c_smemhandle.mrc_lmr = 2826 rdbuf.handle.mrc_lmr; /* lkey */ 2827 cl.c_next = NULL; 2828 status = rib_post_recv(conn, &cl); 2829 if (status != RDMA_SUCCESS) { 2830 (void) rib_disconnect_channel(conn, NULL); 2831 return (IBT_CM_REJECT); 2832 } 2833 } 2834 (void) rib_add_connlist(conn, &hca->srv_conn_list); 2835 2836 /* 2837 * Get the address translation 2838 */ 2839 rw_enter(&hca->state_lock, RW_READER); 2840 if (hca->state == HCA_DETACHED) { 2841 rw_exit(&hca->state_lock); 2842 return (IBT_CM_REJECT); 2843 } 2844 rw_exit(&hca->state_lock); 2845 2846 bzero(&ipinfo, sizeof (ibt_ip_cm_info_t)); 2847 2848 if (ibt_get_ip_data(event->cm_priv_data_len, 2849 event->cm_priv_data, 2850 &ipinfo) != IBT_SUCCESS) { 2851 2852 return (IBT_CM_REJECT); 2853 } 2854 2855 switch (ipinfo.src_addr.family) { 2856 case AF_INET: 2857 2858 conn->c_raddr.maxlen = 2859 conn->c_raddr.len = sin_size; 2860 conn->c_raddr.buf = kmem_zalloc(sin_size, KM_SLEEP); 2861 2862 s = (struct sockaddr_in *)conn->c_raddr.buf; 2863 s->sin_family = AF_INET; 2864 2865 bcopy((void *)&ipinfo.src_addr.un.ip4addr, 2866 &s->sin_addr, in_size); 2867 2868 break; 2869 2870 case AF_INET6: 2871 2872 conn->c_raddr.maxlen = 2873 conn->c_raddr.len = sin6_size; 2874 conn->c_raddr.buf = kmem_zalloc(sin6_size, KM_SLEEP); 2875 2876 s6 = (struct sockaddr_in6 *)conn->c_raddr.buf; 2877 s6->sin6_family = AF_INET6; 2878 bcopy((void *)&ipinfo.src_addr.un.ip6addr, 2879 &s6->sin6_addr, 2880 sizeof (struct in6_addr)); 2881 2882 break; 2883 2884 default: 2885 return (IBT_CM_REJECT); 2886 } 2887 2888 break; 2889 2890 case IBT_CM_EVENT_CONN_CLOSED: 2891 { 2892 CONN *conn; 2893 rib_qp_t *qp; 2894 2895 switch (event->cm_event.closed) { 2896 case IBT_CM_CLOSED_DREP_RCVD: 2897 case IBT_CM_CLOSED_DREQ_TIMEOUT: 2898 case IBT_CM_CLOSED_DUP: 2899 case IBT_CM_CLOSED_ABORT: 2900 case IBT_CM_CLOSED_ALREADY: 2901 /* 2902 * These cases indicate the local end initiated 2903 * the closing of the channel. Nothing to do here. 2904 */ 2905 break; 2906 default: 2907 /* 2908 * Reason for CONN_CLOSED event must be one of 2909 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD 2910 * or IBT_CM_CLOSED_STALE. 
These indicate cases where 2911 * the remote end is closing the channel. In these 2912 * cases free the channel and transition to error 2913 * state. 2914 */ 2915 qp = ibt_get_chan_private(event->cm_channel); 2916 conn = qptoc(qp); 2917 mutex_enter(&conn->c_lock); 2918 if (conn->c_state == C_DISCONN_PEND) { 2919 mutex_exit(&conn->c_lock); 2920 break; 2921 } 2922 conn->c_state = C_ERROR_CONN; 2923 2924 /* 2925 * Free the rc_channel. Channel has already 2926 * transitioned to ERROR state and WRs have been 2927 * FLUSHED_ERR already. 2928 */ 2929 (void) ibt_free_channel(qp->qp_hdl); 2930 qp->qp_hdl = NULL; 2931 2932 /* 2933 * Free the conn if c_ref goes down to 0 2934 */ 2935 if (conn->c_ref == 0) { 2936 /* 2937 * Remove from list and free conn 2938 */ 2939 conn->c_state = C_DISCONN_PEND; 2940 mutex_exit(&conn->c_lock); 2941 (void) rib_disconnect_channel(conn, 2942 &hca->srv_conn_list); 2943 } else { 2944 mutex_exit(&conn->c_lock); 2945 } 2946 DTRACE_PROBE(rpcib__i__srvcm_chandisconnect); 2947 break; 2948 } 2949 break; 2950 } 2951 case IBT_CM_EVENT_CONN_EST: 2952 /* 2953 * RTU received, hence connection established. 2954 */ 2955 if (rib_debug > 1) 2956 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 2957 "(CONN_EST) channel established"); 2958 break; 2959 2960 default: 2961 if (rib_debug > 2) { 2962 /* Let CM handle the following events. */ 2963 if (event->cm_type == IBT_CM_EVENT_REP_RCV) { 2964 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 2965 "server recv'ed IBT_CM_EVENT_REP_RCV\n"); 2966 } else if (event->cm_type == IBT_CM_EVENT_LAP_RCV) { 2967 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 2968 "server recv'ed IBT_CM_EVENT_LAP_RCV\n"); 2969 } else if (event->cm_type == IBT_CM_EVENT_MRA_RCV) { 2970 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 2971 "server recv'ed IBT_CM_EVENT_MRA_RCV\n"); 2972 } else if (event->cm_type == IBT_CM_EVENT_APR_RCV) { 2973 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 2974 "server recv'ed IBT_CM_EVENT_APR_RCV\n"); 2975 } else if (event->cm_type == IBT_CM_EVENT_FAILURE) { 2976 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 2977 "server recv'ed IBT_CM_EVENT_FAILURE\n"); 2978 } 2979 } 2980 return (IBT_CM_DEFAULT); 2981 } 2982 2983 /* accept all other CM messages (i.e.
let the CM handle them) */ 2984 return (IBT_CM_ACCEPT); 2985 } 2986 2987 static rdma_stat 2988 rib_register_service(rib_hca_t *hca, int service_type) 2989 { 2990 ibt_srv_desc_t sdesc; 2991 ibt_hca_portinfo_t *port_infop; 2992 ib_svc_id_t srv_id; 2993 ibt_srv_hdl_t srv_hdl; 2994 uint_t port_size; 2995 uint_t pki, i, num_ports, nbinds; 2996 ibt_status_t ibt_status; 2997 rib_service_t *new_service; 2998 ib_pkey_t pkey; 2999 3000 /* 3001 * Query all ports for the given HCA 3002 */ 3003 rw_enter(&hca->state_lock, RW_READER); 3004 if (hca->state != HCA_DETACHED) { 3005 ibt_status = ibt_query_hca_ports(hca->hca_hdl, 0, &port_infop, 3006 &num_ports, &port_size); 3007 rw_exit(&hca->state_lock); 3008 } else { 3009 rw_exit(&hca->state_lock); 3010 return (RDMA_FAILED); 3011 } 3012 if (ibt_status != IBT_SUCCESS) { 3013 return (RDMA_FAILED); 3014 } 3015 3016 DTRACE_PROBE1(rpcib__i__regservice_numports, 3017 int, num_ports); 3018 3019 for (i = 0; i < num_ports; i++) { 3020 if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) { 3021 DTRACE_PROBE1(rpcib__i__regservice__portinactive, 3022 int, i+1); 3023 } else if (port_infop[i].p_linkstate == IBT_PORT_ACTIVE) { 3024 DTRACE_PROBE1(rpcib__i__regservice__portactive, 3025 int, i+1); 3026 } 3027 } 3028 3029 /* 3030 * Get all the IP addresses on this system to register the 3031 * given "service type" on all DNS recognized IP addrs. 3032 * Each service type such as NFS will have all the system's 3033 * IP addresses as its different names. For now the only 3034 * type of service we support in RPCIB is NFS. 3035 */ 3036 rw_enter(&hca->service_list_lock, RW_WRITER); 3037 /* 3038 * Start registering and binding the service on 3039 * the active ports of this HCA. 3040 */ 3041 nbinds = 0; 3042 new_service = NULL; 3043 3044 /* 3045 * We use IP addresses as the service names for 3046 * service registration. Register each of them 3047 * with CM to obtain a svc_id and svc_hdl. We do not 3048 * register the service with the machine's loopback address.
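 * In this implementation a single service is registered with the
 * IP-based service ID for NFS_RDMA_PORT (via ibt_get_ip_sid()), and the
 * resulting handle is then bound to each active port that carries a
 * full-membership pkey.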
3049 */ 3050 (void) bzero(&srv_id, sizeof (ib_svc_id_t)); 3051 (void) bzero(&srv_hdl, sizeof (ibt_srv_hdl_t)); 3052 (void) bzero(&sdesc, sizeof (ibt_srv_desc_t)); 3053 3054 sdesc.sd_handler = rib_srv_cm_handler; 3055 sdesc.sd_flags = 0; 3056 ibt_status = ibt_register_service(hca->ibt_clnt_hdl, 3057 &sdesc, ibt_get_ip_sid(IPPROTO_TCP, NFS_RDMA_PORT), 3058 1, &srv_hdl, &srv_id); 3059 3060 for (i = 0; i < num_ports; i++) { 3061 if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) 3062 continue; 3063 3064 for (pki = 0; pki < port_infop[i].p_pkey_tbl_sz; pki++) { 3065 pkey = port_infop[i].p_pkey_tbl[pki]; 3066 if ((pkey & IBSRM_HB) && 3067 (pkey != IB_PKEY_INVALID_FULL)) { 3068 3069 /* 3070 * Allocate and prepare a service entry 3071 */ 3072 new_service = 3073 kmem_zalloc(1 * sizeof (rib_service_t), 3074 KM_SLEEP); 3075 3076 new_service->srv_type = service_type; 3077 new_service->srv_hdl = srv_hdl; 3078 new_service->srv_next = NULL; 3079 3080 ibt_status = ibt_bind_service(srv_hdl, 3081 port_infop[i].p_sgid_tbl[0], 3082 NULL, rib_stat, NULL); 3083 3084 DTRACE_PROBE1(rpcib__i__regservice__bindres, 3085 int, ibt_status); 3086 3087 if (ibt_status != IBT_SUCCESS) { 3088 kmem_free(new_service, 3089 sizeof (rib_service_t)); 3090 new_service = NULL; 3091 continue; 3092 } 3093 3094 /* 3095 * Add to the service list for this HCA 3096 */ 3097 new_service->srv_next = hca->service_list; 3098 hca->service_list = new_service; 3099 new_service = NULL; 3100 nbinds++; 3101 } 3102 } 3103 } 3104 rw_exit(&hca->service_list_lock); 3105 3106 ibt_free_portinfo(port_infop, port_size); 3107 3108 if (nbinds == 0) { 3109 return (RDMA_FAILED); 3110 } else { 3111 /* 3112 * Put this plugin into accept state, since at least 3113 * one registration was successful. 3114 */ 3115 mutex_enter(&plugin_state_lock); 3116 plugin_state = ACCEPT; 3117 mutex_exit(&plugin_state_lock); 3118 return (RDMA_SUCCESS); 3119 } 3120 } 3121 3122 void 3123 rib_listen(struct rdma_svc_data *rd) 3124 { 3125 rdma_stat status = RDMA_SUCCESS; 3126 3127 rd->active = 0; 3128 rd->err_code = RDMA_FAILED; 3129 3130 /* 3131 * First check if a hca is still attached 3132 */ 3133 rw_enter(&rib_stat->hca->state_lock, RW_READER); 3134 if (rib_stat->hca->state != HCA_INITED) { 3135 rw_exit(&rib_stat->hca->state_lock); 3136 return; 3137 } 3138 rw_exit(&rib_stat->hca->state_lock); 3139 3140 rib_stat->q = &rd->q; 3141 /* 3142 * Right now the only service type is NFS. Hence force feed this 3143 * value. Ideally the service type should be 3144 * passed down in rdma_svc_data. 3145 */ 3146 rib_stat->service_type = NFS; 3147 status = rib_register_service(rib_stat->hca, NFS); 3148 if (status != RDMA_SUCCESS) { 3149 rd->err_code = status; 3150 return; 3151 } 3152 /* 3153 * Service active on an HCA, check rd->err_code for more 3154 * explainable errors. 3155 */ 3156 rd->active = 1; 3157 rd->err_code = status; 3158 } 3159 3160 /* XXXX */ 3161 /* ARGSUSED */ 3162 static void 3163 rib_listen_stop(struct rdma_svc_data *svcdata) 3164 { 3165 rib_hca_t *hca; 3166 3167 /* 3168 * KRPC called the RDMATF to stop the listeners; this means we 3169 * stop sending incoming or received requests to the KRPC master 3170 * transport handle for RDMA-IB. This also means that the 3171 * master transport handle, responsible for us, is going away.
3172 */ 3173 mutex_enter(&plugin_state_lock); 3174 plugin_state = NO_ACCEPT; 3175 if (svcdata != NULL) 3176 svcdata->active = 0; 3177 mutex_exit(&plugin_state_lock); 3178 3179 /* 3180 * First check if a hca is still attached 3181 */ 3182 hca = rib_stat->hca; 3183 rw_enter(&hca->state_lock, RW_READER); 3184 if (hca->state != HCA_INITED) { 3185 rw_exit(&hca->state_lock); 3186 return; 3187 } 3188 rib_close_channels(&hca->srv_conn_list); 3189 rib_stop_services(hca); 3190 rw_exit(&hca->state_lock); 3191 } 3192 3193 /* 3194 * Traverse the HCA's service list to unbind and deregister services. 3195 * Instead of unbinding the service for a service handle by 3196 * calling ibt_unbind_service() for each port/pkey, we unbind 3197 * all the services for the service handle by making only one 3198 * call to ibt_unbind_all_services(). Then, we deregister the 3199 * service for the service handle. 3200 * 3201 * When traversing the entries in service_list, we compare the 3202 * srv_hdl of the current entry with that of the next. If they 3203 * are different or if the next entry is NULL, the current entry 3204 * marks the last binding of the service handle. In this case, 3205 * call ibt_unbind_all_services() and deregister the service for 3206 * the service handle. If they are the same, the current and the 3207 * next entries are bound to the same service handle. In this 3208 * case, move on to the next entry. 3209 */ 3210 static void 3211 rib_stop_services(rib_hca_t *hca) 3212 { 3213 rib_service_t *srv_list, *to_remove; 3214 3215 /* 3216 * unbind and deregister the services for this service type. 3217 * Right now there is only one service type. In future it will 3218 * be passed down to this function. 3219 */ 3220 rw_enter(&hca->service_list_lock, RW_WRITER); 3221 srv_list = hca->service_list; 3222 while (srv_list != NULL) { 3223 to_remove = srv_list; 3224 srv_list = to_remove->srv_next; 3225 if (srv_list == NULL || bcmp(to_remove->srv_hdl, 3226 srv_list->srv_hdl, sizeof (ibt_srv_hdl_t))) { 3227 3228 (void) ibt_unbind_all_services(to_remove->srv_hdl); 3229 (void) ibt_deregister_service(hca->ibt_clnt_hdl, 3230 to_remove->srv_hdl); 3231 } 3232 3233 kmem_free(to_remove, sizeof (rib_service_t)); 3234 } 3235 hca->service_list = NULL; 3236 rw_exit(&hca->service_list_lock); 3237 } 3238 3239 static struct svc_recv * 3240 rib_init_svc_recv(rib_qp_t *qp, ibt_wr_ds_t *sgl) 3241 { 3242 struct svc_recv *recvp; 3243 3244 recvp = kmem_zalloc(sizeof (struct svc_recv), KM_SLEEP); 3245 recvp->vaddr = sgl->ds_va; 3246 recvp->qp = qp; 3247 recvp->bytes_xfer = 0; 3248 return (recvp); 3249 } 3250 3251 static int 3252 rib_free_svc_recv(struct svc_recv *recvp) 3253 { 3254 kmem_free(recvp, sizeof (*recvp)); 3255 3256 return (0); 3257 } 3258 3259 static struct reply * 3260 rib_addreplylist(rib_qp_t *qp, uint32_t msgid) 3261 { 3262 struct reply *rep; 3263 3264 3265 rep = kmem_zalloc(sizeof (struct reply), KM_NOSLEEP); 3266 if (rep == NULL) { 3267 DTRACE_PROBE(rpcib__i__addrreply__nomem); 3268 return (NULL); 3269 } 3270 rep->xid = msgid; 3271 rep->vaddr_cq = NULL; 3272 rep->bytes_xfer = 0; 3273 rep->status = (uint_t)REPLY_WAIT; 3274 rep->prev = NULL; 3275 cv_init(&rep->wait_cv, NULL, CV_DEFAULT, NULL); 3276 3277 mutex_enter(&qp->replylist_lock); 3278 if (qp->replylist) { 3279 rep->next = qp->replylist; 3280 qp->replylist->prev = rep; 3281 } 3282 qp->rep_list_size++; 3283 3284 DTRACE_PROBE1(rpcib__i__addrreply__listsize, 3285 int, qp->rep_list_size); 3286 3287 qp->replylist = rep; 3288 mutex_exit(&qp->replylist_lock); 3289 3290 return 
(rep); 3291 } 3292 3293 static rdma_stat 3294 rib_rem_replylist(rib_qp_t *qp) 3295 { 3296 struct reply *r, *n; 3297 3298 mutex_enter(&qp->replylist_lock); 3299 for (r = qp->replylist; r != NULL; r = n) { 3300 n = r->next; 3301 (void) rib_remreply(qp, r); 3302 } 3303 mutex_exit(&qp->replylist_lock); 3304 3305 return (RDMA_SUCCESS); 3306 } 3307 3308 static int 3309 rib_remreply(rib_qp_t *qp, struct reply *rep) 3310 { 3311 3312 ASSERT(MUTEX_HELD(&qp->replylist_lock)); 3313 if (rep->prev) { 3314 rep->prev->next = rep->next; 3315 } 3316 if (rep->next) { 3317 rep->next->prev = rep->prev; 3318 } 3319 if (qp->replylist == rep) 3320 qp->replylist = rep->next; 3321 3322 cv_destroy(&rep->wait_cv); 3323 qp->rep_list_size--; 3324 3325 DTRACE_PROBE1(rpcib__i__remreply__listsize, 3326 int, qp->rep_list_size); 3327 3328 kmem_free(rep, sizeof (*rep)); 3329 3330 return (0); 3331 } 3332 3333 rdma_stat 3334 rib_registermem(CONN *conn, caddr_t adsp, caddr_t buf, uint_t buflen, 3335 struct mrc *buf_handle) 3336 { 3337 ibt_mr_hdl_t mr_hdl = NULL; /* memory region handle */ 3338 ibt_mr_desc_t mr_desc; /* vaddr, lkey, rkey */ 3339 rdma_stat status; 3340 rib_hca_t *hca = (ctoqp(conn))->hca; 3341 3342 /* 3343 * Note: ALL buffer pools use the same memory type RDMARW. 3344 */ 3345 status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc); 3346 if (status == RDMA_SUCCESS) { 3347 buf_handle->mrc_linfo = (uintptr_t)mr_hdl; 3348 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey; 3349 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey; 3350 } else { 3351 buf_handle->mrc_linfo = NULL; 3352 buf_handle->mrc_lmr = 0; 3353 buf_handle->mrc_rmr = 0; 3354 } 3355 return (status); 3356 } 3357 3358 static rdma_stat 3359 rib_reg_mem(rib_hca_t *hca, caddr_t adsp, caddr_t buf, uint_t size, 3360 ibt_mr_flags_t spec, 3361 ibt_mr_hdl_t *mr_hdlp, ibt_mr_desc_t *mr_descp) 3362 { 3363 ibt_mr_attr_t mem_attr; 3364 ibt_status_t ibt_status; 3365 mem_attr.mr_vaddr = (uintptr_t)buf; 3366 mem_attr.mr_len = (ib_msglen_t)size; 3367 mem_attr.mr_as = (struct as *)(caddr_t)adsp; 3368 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE | 3369 IBT_MR_ENABLE_REMOTE_READ | IBT_MR_ENABLE_REMOTE_WRITE | 3370 IBT_MR_ENABLE_WINDOW_BIND | spec; 3371 3372 rw_enter(&hca->state_lock, RW_READER); 3373 if (hca->state == HCA_INITED) { 3374 ibt_status = ibt_register_mr(hca->hca_hdl, hca->pd_hdl, 3375 &mem_attr, mr_hdlp, mr_descp); 3376 rw_exit(&hca->state_lock); 3377 } else { 3378 rw_exit(&hca->state_lock); 3379 return (RDMA_FAILED); 3380 } 3381 3382 if (ibt_status != IBT_SUCCESS) { 3383 return (RDMA_FAILED); 3384 } 3385 return (RDMA_SUCCESS); 3386 } 3387 3388 rdma_stat 3389 rib_registermemsync(CONN *conn, caddr_t adsp, caddr_t buf, uint_t buflen, 3390 struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle, void *lrc) 3391 { 3392 ibt_mr_hdl_t mr_hdl = NULL; /* memory region handle */ 3393 rib_lrc_entry_t *l; 3394 ibt_mr_desc_t mr_desc; /* vaddr, lkey, rkey */ 3395 rdma_stat status; 3396 rib_hca_t *hca = (ctoqp(conn))->hca; 3397 3398 /* 3399 * Non-coherent memory registration. 
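 * When a long reply cache entry (lrc) is supplied and already
 * registered, the cached lkey/rkey and mr handle are reused and no new
 * ibt_register_mr() call is made; otherwise the whole cache buffer is
 * registered and the handles are stashed in the entry for reuse.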
3400 */ 3401 l = (rib_lrc_entry_t *)lrc; 3402 if (l) { 3403 if (l->registered) { 3404 buf_handle->mrc_linfo = 3405 (uintptr_t)l->lrc_mhandle.mrc_linfo; 3406 buf_handle->mrc_lmr = 3407 (uint32_t)l->lrc_mhandle.mrc_lmr; 3408 buf_handle->mrc_rmr = 3409 (uint32_t)l->lrc_mhandle.mrc_rmr; 3410 *sync_handle = (RIB_SYNCMEM_HANDLE) 3411 (uintptr_t)l->lrc_mhandle.mrc_linfo; 3412 return (RDMA_SUCCESS); 3413 } else { 3414 /* Always register the whole buffer */ 3415 buf = (caddr_t)l->lrc_buf; 3416 buflen = l->lrc_len; 3417 } 3418 } 3419 status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc); 3420 3421 if (status == RDMA_SUCCESS) { 3422 if (l) { 3423 l->lrc_mhandle.mrc_linfo = (uintptr_t)mr_hdl; 3424 l->lrc_mhandle.mrc_lmr = (uint32_t)mr_desc.md_lkey; 3425 l->lrc_mhandle.mrc_rmr = (uint32_t)mr_desc.md_rkey; 3426 l->registered = TRUE; 3427 } 3428 buf_handle->mrc_linfo = (uintptr_t)mr_hdl; 3429 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey; 3430 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey; 3431 *sync_handle = (RIB_SYNCMEM_HANDLE)mr_hdl; 3432 } else { 3433 buf_handle->mrc_linfo = NULL; 3434 buf_handle->mrc_lmr = 0; 3435 buf_handle->mrc_rmr = 0; 3436 } 3437 return (status); 3438 } 3439 3440 /* ARGSUSED */ 3441 rdma_stat 3442 rib_deregistermem(CONN *conn, caddr_t buf, struct mrc buf_handle) 3443 { 3444 rib_hca_t *hca = (ctoqp(conn))->hca; 3445 /* 3446 * Allow memory deregistration even if HCA is 3447 * getting detached. Need all outstanding 3448 * memory registrations to be deregistered 3449 * before HCA_DETACH_EVENT can be accepted. 3450 */ 3451 (void) ibt_deregister_mr(hca->hca_hdl, 3452 (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo); 3453 return (RDMA_SUCCESS); 3454 } 3455 3456 /* ARGSUSED */ 3457 rdma_stat 3458 rib_deregistermemsync(CONN *conn, caddr_t buf, struct mrc buf_handle, 3459 RIB_SYNCMEM_HANDLE sync_handle, void *lrc) 3460 { 3461 rib_lrc_entry_t *l; 3462 l = (rib_lrc_entry_t *)lrc; 3463 if (l) 3464 if (l->registered) 3465 return (RDMA_SUCCESS); 3466 3467 (void) rib_deregistermem(conn, buf, buf_handle); 3468 3469 return (RDMA_SUCCESS); 3470 } 3471 3472 /* ARGSUSED */ 3473 rdma_stat 3474 rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle, caddr_t buf, 3475 int len, int cpu) 3476 { 3477 ibt_status_t status; 3478 rib_hca_t *hca = (ctoqp(conn))->hca; 3479 ibt_mr_sync_t mr_segment; 3480 3481 mr_segment.ms_handle = (ibt_mr_hdl_t)shandle; 3482 mr_segment.ms_vaddr = (ib_vaddr_t)(uintptr_t)buf; 3483 mr_segment.ms_len = (ib_memlen_t)len; 3484 if (cpu) { 3485 /* make incoming data visible to memory */ 3486 mr_segment.ms_flags = IBT_SYNC_WRITE; 3487 } else { 3488 /* make memory changes visible to IO */ 3489 mr_segment.ms_flags = IBT_SYNC_READ; 3490 } 3491 rw_enter(&hca->state_lock, RW_READER); 3492 if (hca->state == HCA_INITED) { 3493 status = ibt_sync_mr(hca->hca_hdl, &mr_segment, 1); 3494 rw_exit(&hca->state_lock); 3495 } else { 3496 rw_exit(&hca->state_lock); 3497 return (RDMA_FAILED); 3498 } 3499 3500 if (status == IBT_SUCCESS) 3501 return (RDMA_SUCCESS); 3502 else { 3503 return (RDMA_FAILED); 3504 } 3505 } 3506 3507 /* 3508 * XXXX ???? 3509 */ 3510 static rdma_stat 3511 rib_getinfo(rdma_info_t *info) 3512 { 3513 /* 3514 * XXXX Hack! 
3515 */ 3516 info->addrlen = 16; 3517 info->mts = 1000000; 3518 info->mtu = 1000000; 3519 3520 return (RDMA_SUCCESS); 3521 } 3522 3523 rib_bufpool_t * 3524 rib_rbufpool_create(rib_hca_t *hca, int ptype, int num) 3525 { 3526 rib_bufpool_t *rbp = NULL; 3527 bufpool_t *bp = NULL; 3528 caddr_t buf; 3529 ibt_mr_attr_t mem_attr; 3530 ibt_status_t ibt_status; 3531 int i, j; 3532 3533 rbp = (rib_bufpool_t *)kmem_zalloc(sizeof (rib_bufpool_t), KM_SLEEP); 3534 3535 bp = (bufpool_t *)kmem_zalloc(sizeof (bufpool_t) + 3536 num * sizeof (void *), KM_SLEEP); 3537 3538 mutex_init(&bp->buflock, NULL, MUTEX_DRIVER, hca->iblock); 3539 bp->numelems = num; 3540 3541 3542 switch (ptype) { 3543 case SEND_BUFFER: 3544 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE; 3545 bp->rsize = RPC_MSG_SZ; 3546 break; 3547 case RECV_BUFFER: 3548 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE; 3549 bp->rsize = RPC_BUF_SIZE; 3550 break; 3551 default: 3552 goto fail; 3553 } 3554 3555 /* 3556 * Register the pool. 3557 */ 3558 bp->bufsize = num * bp->rsize; 3559 bp->buf = kmem_zalloc(bp->bufsize, KM_SLEEP); 3560 rbp->mr_hdl = (ibt_mr_hdl_t *)kmem_zalloc(num * 3561 sizeof (ibt_mr_hdl_t), KM_SLEEP); 3562 rbp->mr_desc = (ibt_mr_desc_t *)kmem_zalloc(num * 3563 sizeof (ibt_mr_desc_t), KM_SLEEP); 3564 rw_enter(&hca->state_lock, RW_READER); 3565 3566 if (hca->state != HCA_INITED) { 3567 rw_exit(&hca->state_lock); 3568 goto fail; 3569 } 3570 3571 for (i = 0, buf = bp->buf; i < num; i++, buf += bp->rsize) { 3572 bzero(&rbp->mr_desc[i], sizeof (ibt_mr_desc_t)); 3573 mem_attr.mr_vaddr = (uintptr_t)buf; 3574 mem_attr.mr_len = (ib_msglen_t)bp->rsize; 3575 mem_attr.mr_as = NULL; 3576 ibt_status = ibt_register_mr(hca->hca_hdl, 3577 hca->pd_hdl, &mem_attr, 3578 &rbp->mr_hdl[i], 3579 &rbp->mr_desc[i]); 3580 if (ibt_status != IBT_SUCCESS) { 3581 for (j = 0; j < i; j++) { 3582 (void) ibt_deregister_mr(hca->hca_hdl, 3583 rbp->mr_hdl[j]); 3584 } 3585 rw_exit(&hca->state_lock); 3586 goto fail; 3587 } 3588 } 3589 rw_exit(&hca->state_lock); 3590 buf = (caddr_t)bp->buf; 3591 for (i = 0; i < num; i++, buf += bp->rsize) { 3592 bp->buflist[i] = (void *)buf; 3593 } 3594 bp->buffree = num - 1; /* no. of free buffers */ 3595 rbp->bpool = bp; 3596 3597 return (rbp); 3598 fail: 3599 if (bp) { 3600 if (bp->buf) 3601 kmem_free(bp->buf, bp->bufsize); 3602 kmem_free(bp, sizeof (bufpool_t) + num*sizeof (void *)); 3603 } 3604 if (rbp) { 3605 if (rbp->mr_hdl) 3606 kmem_free(rbp->mr_hdl, num*sizeof (ibt_mr_hdl_t)); 3607 if (rbp->mr_desc) 3608 kmem_free(rbp->mr_desc, num*sizeof (ibt_mr_desc_t)); 3609 kmem_free(rbp, sizeof (rib_bufpool_t)); 3610 } 3611 return (NULL); 3612 } 3613 3614 static void 3615 rib_rbufpool_deregister(rib_hca_t *hca, int ptype) 3616 { 3617 int i; 3618 rib_bufpool_t *rbp = NULL; 3619 bufpool_t *bp; 3620 3621 /* 3622 * Obtain pool address based on type of pool 3623 */ 3624 switch (ptype) { 3625 case SEND_BUFFER: 3626 rbp = hca->send_pool; 3627 break; 3628 case RECV_BUFFER: 3629 rbp = hca->recv_pool; 3630 break; 3631 default: 3632 return; 3633 } 3634 if (rbp == NULL) 3635 return; 3636 3637 bp = rbp->bpool; 3638 3639 /* 3640 * Deregister the pool memory and free it. 
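 * (The buffer memory itself is released separately by
 * rib_rbufpool_free().)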
3641 */ 3642 for (i = 0; i < bp->numelems; i++) { 3643 (void) ibt_deregister_mr(hca->hca_hdl, rbp->mr_hdl[i]); 3644 } 3645 } 3646 3647 static void 3648 rib_rbufpool_free(rib_hca_t *hca, int ptype) 3649 { 3650 3651 rib_bufpool_t *rbp = NULL; 3652 bufpool_t *bp; 3653 3654 /* 3655 * Obtain pool address based on type of pool 3656 */ 3657 switch (ptype) { 3658 case SEND_BUFFER: 3659 rbp = hca->send_pool; 3660 break; 3661 case RECV_BUFFER: 3662 rbp = hca->recv_pool; 3663 break; 3664 default: 3665 return; 3666 } 3667 if (rbp == NULL) 3668 return; 3669 3670 bp = rbp->bpool; 3671 3672 /* 3673 * Free the pool memory. 3674 */ 3675 if (rbp->mr_hdl) 3676 kmem_free(rbp->mr_hdl, bp->numelems*sizeof (ibt_mr_hdl_t)); 3677 3678 if (rbp->mr_desc) 3679 kmem_free(rbp->mr_desc, bp->numelems*sizeof (ibt_mr_desc_t)); 3680 if (bp->buf) 3681 kmem_free(bp->buf, bp->bufsize); 3682 mutex_destroy(&bp->buflock); 3683 kmem_free(bp, sizeof (bufpool_t) + bp->numelems*sizeof (void *)); 3684 kmem_free(rbp, sizeof (rib_bufpool_t)); 3685 } 3686 3687 void 3688 rib_rbufpool_destroy(rib_hca_t *hca, int ptype) 3689 { 3690 /* 3691 * Deregister the pool memory and free it. 3692 */ 3693 rib_rbufpool_deregister(hca, ptype); 3694 rib_rbufpool_free(hca, ptype); 3695 } 3696 3697 /* 3698 * Fetch a buffer from the pool of type specified in rdbuf->type. 3699 */ 3700 static rdma_stat 3701 rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf) 3702 { 3703 rib_lrc_entry_t *rlep; 3704 3705 if (rdbuf->type == RDMA_LONG_BUFFER) { 3706 rlep = rib_get_cache_buf(conn, rdbuf->len); 3707 rdbuf->rb_private = (caddr_t)rlep; 3708 rdbuf->addr = rlep->lrc_buf; 3709 rdbuf->handle = rlep->lrc_mhandle; 3710 return (RDMA_SUCCESS); 3711 } 3712 3713 rdbuf->addr = rib_rbuf_alloc(conn, rdbuf); 3714 if (rdbuf->addr) { 3715 switch (rdbuf->type) { 3716 case SEND_BUFFER: 3717 rdbuf->len = RPC_MSG_SZ; /* 1K */ 3718 break; 3719 case RECV_BUFFER: 3720 rdbuf->len = RPC_BUF_SIZE; /* 2K */ 3721 break; 3722 default: 3723 rdbuf->len = 0; 3724 } 3725 return (RDMA_SUCCESS); 3726 } else 3727 return (RDMA_FAILED); 3728 } 3729 3730 #if defined(MEASURE_POOL_DEPTH) 3731 static void rib_recv_bufs(uint32_t x) { 3732 3733 } 3734 3735 static void rib_send_bufs(uint32_t x) { 3736 3737 } 3738 #endif 3739 3740 /* 3741 * Fetch a buffer of specified type. 3742 * Note that rdbuf->handle is mw's rkey. 3743 */ 3744 static void * 3745 rib_rbuf_alloc(CONN *conn, rdma_buf_t *rdbuf) 3746 { 3747 rib_qp_t *qp = ctoqp(conn); 3748 rib_hca_t *hca = qp->hca; 3749 rdma_btype ptype = rdbuf->type; 3750 void *buf; 3751 rib_bufpool_t *rbp = NULL; 3752 bufpool_t *bp; 3753 int i; 3754 3755 /* 3756 * Obtain pool address based on type of pool 3757 */ 3758 switch (ptype) { 3759 case SEND_BUFFER: 3760 rbp = hca->send_pool; 3761 break; 3762 case RECV_BUFFER: 3763 rbp = hca->recv_pool; 3764 break; 3765 default: 3766 return (NULL); 3767 } 3768 if (rbp == NULL) 3769 return (NULL); 3770 3771 bp = rbp->bpool; 3772 3773 mutex_enter(&bp->buflock); 3774 if (bp->buffree < 0) { 3775 mutex_exit(&bp->buflock); 3776 return (NULL); 3777 } 3778 3779 /* XXXX put buf, rdbuf->handle.mrc_rmr, ... in one place. 
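 * For now the registration covering the chosen buffer is found by a
 * linear scan of mr_desc[], matching the buffer address against
 * md_vaddr to pick up the rkey, lkey and mr handle.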
*/ 3780 buf = bp->buflist[bp->buffree]; 3781 rdbuf->addr = buf; 3782 rdbuf->len = bp->rsize; 3783 for (i = bp->numelems - 1; i >= 0; i--) { 3784 if ((ib_vaddr_t)(uintptr_t)buf == rbp->mr_desc[i].md_vaddr) { 3785 rdbuf->handle.mrc_rmr = 3786 (uint32_t)rbp->mr_desc[i].md_rkey; 3787 rdbuf->handle.mrc_linfo = 3788 (uintptr_t)rbp->mr_hdl[i]; 3789 rdbuf->handle.mrc_lmr = 3790 (uint32_t)rbp->mr_desc[i].md_lkey; 3791 #if defined(MEASURE_POOL_DEPTH) 3792 if (ptype == SEND_BUFFER) 3793 rib_send_bufs(MAX_BUFS - (bp->buffree+1)); 3794 if (ptype == RECV_BUFFER) 3795 rib_recv_bufs(MAX_BUFS - (bp->buffree+1)); 3796 #endif 3797 bp->buffree--; 3798 3799 mutex_exit(&bp->buflock); 3800 3801 return (buf); 3802 } 3803 } 3804 3805 mutex_exit(&bp->buflock); 3806 3807 return (NULL); 3808 } 3809 3810 static void 3811 rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf) 3812 { 3813 3814 if (rdbuf->type == RDMA_LONG_BUFFER) { 3815 rib_free_cache_buf(conn, (rib_lrc_entry_t *)rdbuf->rb_private); 3816 rdbuf->rb_private = NULL; 3817 return; 3818 } 3819 rib_rbuf_free(conn, rdbuf->type, rdbuf->addr); 3820 } 3821 3822 static void 3823 rib_rbuf_free(CONN *conn, int ptype, void *buf) 3824 { 3825 rib_qp_t *qp = ctoqp(conn); 3826 rib_hca_t *hca = qp->hca; 3827 rib_bufpool_t *rbp = NULL; 3828 bufpool_t *bp; 3829 3830 /* 3831 * Obtain pool address based on type of pool 3832 */ 3833 switch (ptype) { 3834 case SEND_BUFFER: 3835 rbp = hca->send_pool; 3836 break; 3837 case RECV_BUFFER: 3838 rbp = hca->recv_pool; 3839 break; 3840 default: 3841 return; 3842 } 3843 if (rbp == NULL) 3844 return; 3845 3846 bp = rbp->bpool; 3847 3848 mutex_enter(&bp->buflock); 3849 if (++bp->buffree >= bp->numelems) { 3850 /* 3851 * Should never happen 3852 */ 3853 bp->buffree--; 3854 } else { 3855 bp->buflist[bp->buffree] = buf; 3856 } 3857 mutex_exit(&bp->buflock); 3858 } 3859 3860 static rdma_stat 3861 rib_add_connlist(CONN *cn, rib_conn_list_t *connlist) 3862 { 3863 rw_enter(&connlist->conn_lock, RW_WRITER); 3864 if (connlist->conn_hd) { 3865 cn->c_next = connlist->conn_hd; 3866 connlist->conn_hd->c_prev = cn; 3867 } 3868 connlist->conn_hd = cn; 3869 rw_exit(&connlist->conn_lock); 3870 3871 return (RDMA_SUCCESS); 3872 } 3873 3874 static rdma_stat 3875 rib_rm_conn(CONN *cn, rib_conn_list_t *connlist) 3876 { 3877 rw_enter(&connlist->conn_lock, RW_WRITER); 3878 if (cn->c_prev) { 3879 cn->c_prev->c_next = cn->c_next; 3880 } 3881 if (cn->c_next) { 3882 cn->c_next->c_prev = cn->c_prev; 3883 } 3884 if (connlist->conn_hd == cn) 3885 connlist->conn_hd = cn->c_next; 3886 rw_exit(&connlist->conn_lock); 3887 3888 return (RDMA_SUCCESS); 3889 } 3890 3891 /* 3892 * Connection management. 3893 * IBTF does not support recycling of channels. So connections are only 3894 * in four states - C_CONN_PEND, or C_CONNECTED, or C_ERROR_CONN or 3895 * C_DISCONN_PEND state. No C_IDLE state. 3896 * C_CONN_PEND state: Connection establishment in progress to the server. 3897 * C_CONNECTED state: A connection when created is in C_CONNECTED state. 3898 * It has an RC channel associated with it. ibt_post_send/recv are allowed 3899 * only in this state. 3900 * C_ERROR_CONN state: A connection transitions to this state when WRs on the 3901 * channel are completed in error or an IBT_CM_EVENT_CONN_CLOSED event 3902 * happens on the channel or a IBT_HCA_DETACH_EVENT occurs on the HCA. 
C_DISCONN_PEND state: When a connection is in C_ERROR_CONN state and when 3904 * c_ref drops to 0 (this indicates that RPC has no more references to this 3905 * connection), the connection should be destroyed. A connection transitions 3906 * into this state when it is being destroyed. 3907 */ 3908 static rdma_stat 3909 rib_conn_get(struct netbuf *svcaddr, int addr_type, void *handle, CONN **conn) 3910 { 3911 CONN *cn; 3912 int status = RDMA_SUCCESS; 3913 rib_hca_t *hca = (rib_hca_t *)handle; 3914 rib_qp_t *qp; 3915 clock_t cv_stat, timout; 3916 ibt_path_info_t path; 3917 ibt_ip_addr_t s_ip, d_ip; 3918 3919 again: 3920 rw_enter(&hca->cl_conn_list.conn_lock, RW_READER); 3921 cn = hca->cl_conn_list.conn_hd; 3922 while (cn != NULL) { 3923 /* 3924 * First, clear up any connection in the ERROR state 3925 */ 3926 mutex_enter(&cn->c_lock); 3927 if (cn->c_state == C_ERROR_CONN) { 3928 if (cn->c_ref == 0) { 3929 /* 3930 * Remove connection from list and destroy it. 3931 */ 3932 cn->c_state = C_DISCONN_PEND; 3933 mutex_exit(&cn->c_lock); 3934 rw_exit(&hca->cl_conn_list.conn_lock); 3935 (void) rib_disconnect_channel(cn, 3936 &hca->cl_conn_list); 3937 goto again; 3938 } 3939 mutex_exit(&cn->c_lock); 3940 cn = cn->c_next; 3941 continue; 3942 } 3943 if (cn->c_state == C_DISCONN_PEND) { 3944 mutex_exit(&cn->c_lock); 3945 cn = cn->c_next; 3946 continue; 3947 } 3948 if ((cn->c_raddr.len == svcaddr->len) && 3949 bcmp(svcaddr->buf, cn->c_raddr.buf, svcaddr->len) == 0) { 3950 /* 3951 * Our connection. Give up conn list lock 3952 * as we are done traversing the list. 3953 */ 3954 rw_exit(&hca->cl_conn_list.conn_lock); 3955 if (cn->c_state == C_CONNECTED) { 3956 cn->c_ref++; /* sharing a conn */ 3957 mutex_exit(&cn->c_lock); 3958 *conn = cn; 3959 return (status); 3960 } 3961 if (cn->c_state == C_CONN_PEND) { 3962 /* 3963 * Hold a reference to this conn before 3964 * we give up the lock. 3965 */ 3966 cn->c_ref++; 3967 timout = ddi_get_lbolt() + 3968 drv_usectohz(CONN_WAIT_TIME * 1000000); 3969 while ((cv_stat = cv_timedwait_sig(&cn->c_cv, 3970 &cn->c_lock, timout)) > 0 && 3971 cn->c_state == C_CONN_PEND) 3972 ; 3973 if (cv_stat == 0) { 3974 cn->c_ref--; 3975 mutex_exit(&cn->c_lock); 3976 return (RDMA_INTR); 3977 } 3978 if (cv_stat < 0) { 3979 cn->c_ref--; 3980 mutex_exit(&cn->c_lock); 3981 return (RDMA_TIMEDOUT); 3982 } 3983 if (cn->c_state == C_CONNECTED) { 3984 *conn = cn; 3985 mutex_exit(&cn->c_lock); 3986 return (status); 3987 } else { 3988 cn->c_ref--; 3989 mutex_exit(&cn->c_lock); 3990 return (RDMA_TIMEDOUT); 3991 } 3992 } 3993 } 3994 mutex_exit(&cn->c_lock); 3995 cn = cn->c_next; 3996 } 3997 rw_exit(&hca->cl_conn_list.conn_lock); 3998 3999 bzero(&path, sizeof (ibt_path_info_t)); 4000 bzero(&s_ip, sizeof (ibt_ip_addr_t)); 4001 bzero(&d_ip, sizeof (ibt_ip_addr_t)); 4002 4003 status = rib_chk_srv_ibaddr(svcaddr, addr_type, &path, &s_ip, &d_ip); 4004 if (status != RDMA_SUCCESS) { 4005 return (RDMA_FAILED); 4006 } 4007 4008 /* 4009 * Channel to server doesn't exist yet, create one. 4010 */ 4011 if (rib_clnt_create_chan(hca, svcaddr, &qp) != RDMA_SUCCESS) { 4012 return (RDMA_FAILED); 4013 } 4014 cn = qptoc(qp); 4015 cn->c_state = C_CONN_PEND; 4016 cn->c_ref = 1; 4017 4018 /* 4019 * Add to conn list. 4020 * We had given up the READER lock. In the time since then, 4021 * another thread might have created the connection we are 4022 * trying here. But for now, that is quite all right - there 4023 * might be two connections between a pair of hosts instead 4024 * of one.
If we really want to close that window, 4025 * then need to check the list after acquiring the 4026 * WRITER lock. 4027 */ 4028 (void) rib_add_connlist(cn, &hca->cl_conn_list); 4029 status = rib_conn_to_srv(hca, qp, &path, &s_ip, &d_ip); 4030 mutex_enter(&cn->c_lock); 4031 if (status == RDMA_SUCCESS) { 4032 cn->c_state = C_CONNECTED; 4033 *conn = cn; 4034 } else { 4035 cn->c_state = C_ERROR_CONN; 4036 cn->c_ref--; 4037 } 4038 cv_broadcast(&cn->c_cv); 4039 mutex_exit(&cn->c_lock); 4040 return (status); 4041 } 4042 4043 static rdma_stat 4044 rib_conn_release(CONN *conn) 4045 { 4046 rib_qp_t *qp = ctoqp(conn); 4047 4048 mutex_enter(&conn->c_lock); 4049 conn->c_ref--; 4050 4051 /* 4052 * If a conn is C_ERROR_CONN, close the channel. 4053 * If it's CONNECTED, keep it that way. 4054 */ 4055 if (conn->c_ref == 0 && conn->c_state == C_ERROR_CONN) { 4056 conn->c_state = C_DISCONN_PEND; 4057 mutex_exit(&conn->c_lock); 4058 if (qp->mode == RIB_SERVER) 4059 (void) rib_disconnect_channel(conn, 4060 &qp->hca->srv_conn_list); 4061 else 4062 (void) rib_disconnect_channel(conn, 4063 &qp->hca->cl_conn_list); 4064 return (RDMA_SUCCESS); 4065 } 4066 mutex_exit(&conn->c_lock); 4067 return (RDMA_SUCCESS); 4068 } 4069 4070 /* 4071 * Add at front of list 4072 */ 4073 static struct rdma_done_list * 4074 rdma_done_add(rib_qp_t *qp, uint32_t xid) 4075 { 4076 struct rdma_done_list *rd; 4077 4078 ASSERT(MUTEX_HELD(&qp->rdlist_lock)); 4079 4080 rd = kmem_alloc(sizeof (*rd), KM_SLEEP); 4081 rd->xid = xid; 4082 cv_init(&rd->rdma_done_cv, NULL, CV_DEFAULT, NULL); 4083 4084 rd->prev = NULL; 4085 rd->next = qp->rdlist; 4086 if (qp->rdlist != NULL) 4087 qp->rdlist->prev = rd; 4088 qp->rdlist = rd; 4089 4090 return (rd); 4091 } 4092 4093 static void 4094 rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd) 4095 { 4096 struct rdma_done_list *r; 4097 4098 ASSERT(MUTEX_HELD(&qp->rdlist_lock)); 4099 4100 r = rd->next; 4101 if (r != NULL) { 4102 r->prev = rd->prev; 4103 } 4104 4105 r = rd->prev; 4106 if (r != NULL) { 4107 r->next = rd->next; 4108 } else { 4109 qp->rdlist = rd->next; 4110 } 4111 4112 cv_destroy(&rd->rdma_done_cv); 4113 kmem_free(rd, sizeof (*rd)); 4114 } 4115 4116 static void 4117 rdma_done_rem_list(rib_qp_t *qp) 4118 { 4119 struct rdma_done_list *r, *n; 4120 4121 mutex_enter(&qp->rdlist_lock); 4122 for (r = qp->rdlist; r != NULL; r = n) { 4123 n = r->next; 4124 rdma_done_rm(qp, r); 4125 } 4126 mutex_exit(&qp->rdlist_lock); 4127 } 4128 4129 static void 4130 rdma_done_notify(rib_qp_t *qp, uint32_t xid) 4131 { 4132 struct rdma_done_list *r = qp->rdlist; 4133 4134 ASSERT(MUTEX_HELD(&qp->rdlist_lock)); 4135 4136 while (r) { 4137 if (r->xid == xid) { 4138 cv_signal(&r->rdma_done_cv); 4139 return; 4140 } else { 4141 r = r->next; 4142 } 4143 } 4144 DTRACE_PROBE1(rpcib__i__donenotify__nomatchxid, 4145 int, xid); 4146 } 4147 4148 4149 /* 4150 * Goes through all connections and closes the channel 4151 * This will cause all the WRs on those channels to be 4152 * flushed. 4153 */ 4154 static void 4155 rib_close_channels(rib_conn_list_t *connlist) 4156 { 4157 CONN *conn; 4158 rib_qp_t *qp; 4159 4160 rw_enter(&connlist->conn_lock, RW_READER); 4161 conn = connlist->conn_hd; 4162 while (conn != NULL) { 4163 mutex_enter(&conn->c_lock); 4164 qp = ctoqp(conn); 4165 if (conn->c_state == C_CONNECTED) { 4166 /* 4167 * Live connection in CONNECTED state. 4168 * Call ibt_close_rc_channel in nonblocking mode 4169 * with no callbacks. 
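 * Closing with IBT_NOCALLBACKS and then freeing the channel leaves the
 * outstanding WRs to complete flushed in error, which should let the
 * completion handlers reclaim any associated buffers.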
4170 */ 4171 conn->c_state = C_ERROR_CONN; 4172 (void) ibt_close_rc_channel(qp->qp_hdl, 4173 IBT_NOCALLBACKS, NULL, 0, NULL, NULL, 0); 4174 (void) ibt_free_channel(qp->qp_hdl); 4175 qp->qp_hdl = NULL; 4176 } else { 4177 if (conn->c_state == C_ERROR_CONN && 4178 qp->qp_hdl != NULL) { 4179 /* 4180 * Connection in ERROR state but 4181 * channel is not yet freed. 4182 */ 4183 (void) ibt_close_rc_channel(qp->qp_hdl, 4184 IBT_NOCALLBACKS, NULL, 0, NULL, 4185 NULL, 0); 4186 (void) ibt_free_channel(qp->qp_hdl); 4187 qp->qp_hdl = NULL; 4188 } 4189 } 4190 mutex_exit(&conn->c_lock); 4191 conn = conn->c_next; 4192 } 4193 rw_exit(&connlist->conn_lock); 4194 } 4195 4196 /* 4197 * Frees up all connections that are no longer being referenced 4198 */ 4199 static void 4200 rib_purge_connlist(rib_conn_list_t *connlist) 4201 { 4202 CONN *conn; 4203 4204 top: 4205 rw_enter(&connlist->conn_lock, RW_READER); 4206 conn = connlist->conn_hd; 4207 while (conn != NULL) { 4208 mutex_enter(&conn->c_lock); 4209 4210 /* 4211 * At this point connection is either in ERROR 4212 * or DISCONN_PEND state. If in DISCONN_PEND state 4213 * then some other thread is culling that connection. 4214 * If not and if c_ref is 0, then destroy the connection. 4215 */ 4216 if (conn->c_ref == 0 && 4217 conn->c_state != C_DISCONN_PEND) { 4218 /* 4219 * Cull the connection 4220 */ 4221 conn->c_state = C_DISCONN_PEND; 4222 mutex_exit(&conn->c_lock); 4223 rw_exit(&connlist->conn_lock); 4224 (void) rib_disconnect_channel(conn, connlist); 4225 goto top; 4226 } else { 4227 /* 4228 * conn disconnect already scheduled or will 4229 * happen from conn_release when c_ref drops to 0. 4230 */ 4231 mutex_exit(&conn->c_lock); 4232 } 4233 conn = conn->c_next; 4234 } 4235 rw_exit(&connlist->conn_lock); 4236 4237 /* 4238 * At this point, only connections with c_ref != 0 are on the list 4239 */ 4240 } 4241 4242 /* 4243 * Cleans and closes up all uses of the HCA 4244 */ 4245 static void 4246 rib_detach_hca(rib_hca_t *hca) 4247 { 4248 4249 /* 4250 * Stop all services on the HCA 4251 * Go through cl_conn_list and close all rc_channels 4252 * Go through svr_conn_list and close all rc_channels 4253 * Free connections whose c_ref has dropped to 0 4254 * Destroy all CQs 4255 * Deregister and released all buffer pool memory after all 4256 * connections are destroyed 4257 * Free the protection domain 4258 * ibt_close_hca() 4259 */ 4260 rw_enter(&hca->state_lock, RW_WRITER); 4261 if (hca->state == HCA_DETACHED) { 4262 rw_exit(&hca->state_lock); 4263 return; 4264 } 4265 4266 hca->state = HCA_DETACHED; 4267 rib_stat->nhca_inited--; 4268 4269 rib_stop_services(hca); 4270 rib_close_channels(&hca->cl_conn_list); 4271 rib_close_channels(&hca->srv_conn_list); 4272 rw_exit(&hca->state_lock); 4273 4274 rib_purge_connlist(&hca->cl_conn_list); 4275 rib_purge_connlist(&hca->srv_conn_list); 4276 4277 (void) ibt_free_cq(hca->clnt_rcq->rib_cq_hdl); 4278 (void) ibt_free_cq(hca->clnt_scq->rib_cq_hdl); 4279 (void) ibt_free_cq(hca->svc_rcq->rib_cq_hdl); 4280 (void) ibt_free_cq(hca->svc_scq->rib_cq_hdl); 4281 kmem_free(hca->clnt_rcq, sizeof (rib_cq_t)); 4282 kmem_free(hca->clnt_scq, sizeof (rib_cq_t)); 4283 kmem_free(hca->svc_rcq, sizeof (rib_cq_t)); 4284 kmem_free(hca->svc_scq, sizeof (rib_cq_t)); 4285 4286 rw_enter(&hca->srv_conn_list.conn_lock, RW_READER); 4287 rw_enter(&hca->cl_conn_list.conn_lock, RW_READER); 4288 if (hca->srv_conn_list.conn_hd == NULL && 4289 hca->cl_conn_list.conn_hd == NULL) { 4290 /* 4291 * conn_lists are NULL, so destroy 4292 * buffers, close hca and be done. 
		 */
		rib_rbufpool_destroy(hca, RECV_BUFFER);
		rib_rbufpool_destroy(hca, SEND_BUFFER);
		rib_destroy_cache(hca);
		(void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
		(void) ibt_close_hca(hca->hca_hdl);
		hca->hca_hdl = NULL;
	}
	rw_exit(&hca->cl_conn_list.conn_lock);
	rw_exit(&hca->srv_conn_list.conn_lock);

	if (hca->hca_hdl != NULL) {
		mutex_enter(&hca->inuse_lock);
		while (hca->inuse)
			cv_wait(&hca->cb_cv, &hca->inuse_lock);
		mutex_exit(&hca->inuse_lock);
		/*
		 * No more active users of the HCA, so destroy
		 * buffers, close hca and be done.
		 */
		rib_rbufpool_destroy(hca, RECV_BUFFER);
		rib_rbufpool_destroy(hca, SEND_BUFFER);
		(void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
		(void) ibt_close_hca(hca->hca_hdl);
		hca->hca_hdl = NULL;
	}
}

static void
rib_server_side_cache_reclaim(void *argp)
{
	cache_avl_struct_t	*rcas;
	rib_lrc_entry_t		*rb;
	rib_hca_t		*hca = (rib_hca_t *)argp;

	rw_enter(&hca->avl_rw_lock, RW_WRITER);
	rcas = avl_first(&hca->avl_tree);
	if (rcas != NULL)
		avl_remove(&hca->avl_tree, rcas);

	while (rcas != NULL) {
		while (rcas->r.forw != &rcas->r) {
			rcas->elements--;
			rib_total_buffers--;
			rb = rcas->r.forw;
			remque(rb);
			if (rb->registered)
				(void) rib_deregistermem_via_hca(hca,
				    rb->lrc_buf, rb->lrc_mhandle);
			cache_allocation -= rb->lrc_len;
			kmem_free(rb->lrc_buf, rb->lrc_len);
			kmem_free(rb, sizeof (rib_lrc_entry_t));
		}
		mutex_destroy(&rcas->node_lock);
		kmem_cache_free(hca->server_side_cache, rcas);
		rcas = avl_first(&hca->avl_tree);
		if (rcas != NULL)
			avl_remove(&hca->avl_tree, rcas);
	}
	rw_exit(&hca->avl_rw_lock);
}

static void
rib_server_side_cache_cleanup(void *argp)
{
	cache_avl_struct_t	*rcas;
	rib_lrc_entry_t		*rb;
	rib_hca_t		*hca = (rib_hca_t *)argp;

	rw_enter(&hca->avl_rw_lock, RW_READER);
	if (cache_allocation < cache_limit) {
		rw_exit(&hca->avl_rw_lock);
		return;
	}
	rw_exit(&hca->avl_rw_lock);

	rw_enter(&hca->avl_rw_lock, RW_WRITER);
	rcas = avl_last(&hca->avl_tree);
	if (rcas != NULL)
		avl_remove(&hca->avl_tree, rcas);

	while (rcas != NULL) {
		while (rcas->r.forw != &rcas->r) {
			rcas->elements--;
			rib_total_buffers--;
			rb = rcas->r.forw;
			remque(rb);
			if (rb->registered)
				(void) rib_deregistermem_via_hca(hca,
				    rb->lrc_buf, rb->lrc_mhandle);
			cache_allocation -= rb->lrc_len;
			kmem_free(rb->lrc_buf, rb->lrc_len);
			kmem_free(rb, sizeof (rib_lrc_entry_t));
		}
		mutex_destroy(&rcas->node_lock);
		kmem_cache_free(hca->server_side_cache, rcas);
		if (cache_allocation < cache_limit) {
			rw_exit(&hca->avl_rw_lock);
			return;
		}

		rcas = avl_last(&hca->avl_tree);
		if (rcas != NULL)
			avl_remove(&hca->avl_tree, rcas);
	}
	rw_exit(&hca->avl_rw_lock);
}

static int
avl_compare(const void *t1, const void *t2)
{
	if (((cache_avl_struct_t *)t1)->len == ((cache_avl_struct_t *)t2)->len)
		return (0);

	if (((cache_avl_struct_t *)t1)->len < ((cache_avl_struct_t *)t2)->len)
		return (-1);

	return (1);
}
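
/*
 * Illustrative sketch (compiled out): how the per-HCA buffer cache that
 * avl_compare() serves would be set up.  The AVL tree is keyed on buffer
 * length and linked through cache_avl_struct_t's avl_link field, and
 * rib_server_side_cache_reclaim() doubles as the kmem reclaim callback so
 * the system can drain the cache under memory pressure.  The actual
 * initialization lives elsewhere in this driver; the helper and kmem cache
 * names below are only placeholders.
 */
#if 0
static void
example_init_server_side_cache(rib_hca_t *hca)
{
	hca->server_side_cache = kmem_cache_create("example_rib_cache",
	    sizeof (cache_avl_struct_t), 0, NULL, NULL,
	    rib_server_side_cache_reclaim, hca, NULL, 0);

	avl_create(&hca->avl_tree, avl_compare,
	    sizeof (cache_avl_struct_t),
	    offsetof(cache_avl_struct_t, avl_link));

	mutex_init(&hca->cache_allocation, NULL, MUTEX_DEFAULT, NULL);
	rw_init(&hca->avl_rw_lock, NULL, RW_DRIVER, NULL);
	hca->avl_init = TRUE;
}
#endif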

static void
rib_destroy_cache(rib_hca_t *hca)
{
	if (hca->reg_cache_clean_up != NULL) {
		ddi_taskq_destroy(hca->reg_cache_clean_up);
		hca->reg_cache_clean_up = NULL;
	}
	if (hca->avl_init) {
		/* Free any cached buffers before tearing the cache down. */
		rib_server_side_cache_reclaim((void *)hca);
		kmem_cache_destroy(hca->server_side_cache);
		avl_destroy(&hca->avl_tree);
		mutex_destroy(&hca->cache_allocation);
		rw_destroy(&hca->avl_rw_lock);
	}
	hca->avl_init = FALSE;
}

static void
rib_force_cleanup(void *hca)
{
	if (((rib_hca_t *)hca)->reg_cache_clean_up != NULL)
		(void) ddi_taskq_dispatch(
		    ((rib_hca_t *)hca)->reg_cache_clean_up,
		    rib_server_side_cache_cleanup,
		    (void *)hca, DDI_NOSLEEP);
}

static rib_lrc_entry_t *
rib_get_cache_buf(CONN *conn, uint32_t len)
{
	cache_avl_struct_t	cas, *rcas;
	rib_hca_t		*hca = (ctoqp(conn))->hca;
	rib_lrc_entry_t		*reply_buf;
	avl_index_t		where = NULL;
	uint64_t		c_alloc = 0;

	if (!hca->avl_init)
		goto error_alloc;

	cas.len = len;

	rw_enter(&hca->avl_rw_lock, RW_READER);

	mutex_enter(&hca->cache_allocation);
	c_alloc = cache_allocation;
	mutex_exit(&hca->cache_allocation);

	if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree, &cas,
	    &where)) == NULL) {
		/* Am I above the cache limit? */
		if ((c_alloc + len) >= cache_limit) {
			rib_force_cleanup((void *)hca);
			rw_exit(&hca->avl_rw_lock);
			cache_misses_above_the_limit++;

			/* Allocate and register the buffer directly */
			goto error_alloc;
		}

		rw_exit(&hca->avl_rw_lock);
		rw_enter(&hca->avl_rw_lock, RW_WRITER);

		/* Recheck to make sure no other thread added the entry in */
		if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree,
		    &cas, &where)) == NULL) {
			/* Allocate an avl tree entry */
			rcas = (cache_avl_struct_t *)
			    kmem_cache_alloc(hca->server_side_cache, KM_SLEEP);

			bzero(rcas, sizeof (cache_avl_struct_t));
			rcas->elements = 0;
			rcas->r.forw = &rcas->r;
			rcas->r.back = &rcas->r;
			rcas->len = len;
			mutex_init(&rcas->node_lock, NULL, MUTEX_DEFAULT, NULL);
			avl_insert(&hca->avl_tree, rcas, where);
		}
	}

	mutex_enter(&rcas->node_lock);

	if (rcas->r.forw != &rcas->r && rcas->elements > 0) {
		rib_total_buffers--;
		cache_hits++;
		reply_buf = rcas->r.forw;
		remque(reply_buf);
		rcas->elements--;
		mutex_exit(&rcas->node_lock);
		rw_exit(&hca->avl_rw_lock);
		mutex_enter(&hca->cache_allocation);
		cache_allocation -= len;
		mutex_exit(&hca->cache_allocation);
	} else {
		/* Am I above the cache limit? */
		mutex_exit(&rcas->node_lock);
		if ((c_alloc + len) >= cache_limit) {
			rib_force_cleanup((void *)hca);
			rw_exit(&hca->avl_rw_lock);
			cache_misses_above_the_limit++;
			/* Allocate and register the buffer directly */
			goto error_alloc;
		}
		rw_exit(&hca->avl_rw_lock);
		cache_misses++;
		/* Allocate a reply_buf entry */
		reply_buf = (rib_lrc_entry_t *)
		    kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP);
		reply_buf->lrc_buf = kmem_alloc(len, KM_SLEEP);
		reply_buf->lrc_len = len;
		reply_buf->registered = FALSE;
		reply_buf->avl_node = (void *)rcas;
	}

	return (reply_buf);

error_alloc:
	reply_buf = (rib_lrc_entry_t *)
	    kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP);
	reply_buf->lrc_buf = kmem_alloc(len, KM_SLEEP);
	reply_buf->lrc_len = len;
	reply_buf->registered = FALSE;
	reply_buf->avl_node = NULL;

	return (reply_buf);
}
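
/*
 * Illustrative sketch (compiled out): the expected round trip through the
 * buffer cache.  A caller asks for a buffer of the length it needs,
 * registers it with the HCA on first use, and hands it back with
 * rib_free_cache_buf() below so the still-registered buffer can be recycled
 * for the next request of the same length.  The helper name is
 * hypothetical, and whether registration happens here or in the caller's
 * transfer path is an assumption for illustration only.
 */
#if 0
static void
example_use_cached_buf(CONN *conn, uint32_t len)
{
	rib_lrc_entry_t	*rb;
	rib_hca_t	*hca = (ctoqp(conn))->hca;

	rb = rib_get_cache_buf(conn, len);

	/* register on first use; the handle stays with the buffer */
	if (!rb->registered &&
	    rib_registermem_via_hca(hca, NULL, rb->lrc_buf, rb->lrc_len,
	    &rb->lrc_mhandle) == RDMA_SUCCESS)
		rb->registered = TRUE;

	/* ... build and transmit the long reply out of rb->lrc_buf ... */

	/* hand the (still registered) buffer back for reuse */
	rib_free_cache_buf(conn, rb);
}
#endif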

/*
 * Return a pre-registered buffer back to the cache (without
 * unregistering it).
 */
static void
rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *reg_buf)
{
	cache_avl_struct_t	cas, *rcas;
	avl_index_t		where = NULL;
	rib_hca_t		*hca = (ctoqp(conn))->hca;

	if (!hca->avl_init)
		goto error_free;

	cas.len = reg_buf->lrc_len;
	rw_enter(&hca->avl_rw_lock, RW_READER);
	if ((rcas = (cache_avl_struct_t *)
	    avl_find(&hca->avl_tree, &cas, &where)) == NULL) {
		rw_exit(&hca->avl_rw_lock);
		goto error_free;
	} else {
		rib_total_buffers++;
		mutex_enter(&rcas->node_lock);
		insque(reg_buf, &rcas->r);
		rcas->elements++;
		mutex_exit(&rcas->node_lock);
		rw_exit(&hca->avl_rw_lock);
		mutex_enter(&hca->cache_allocation);
		cache_allocation += cas.len;
		mutex_exit(&hca->cache_allocation);
	}

	return;

error_free:

	if (reg_buf->registered)
		(void) rib_deregistermem_via_hca(hca,
		    reg_buf->lrc_buf, reg_buf->lrc_mhandle);
	kmem_free(reg_buf->lrc_buf, reg_buf->lrc_len);
	kmem_free(reg_buf, sizeof (rib_lrc_entry_t));
}

static rdma_stat
rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp, caddr_t buf,
    uint_t buflen, struct mrc *buf_handle)
{
	ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
	ibt_mr_desc_t	mr_desc;	/* vaddr, lkey, rkey */
	rdma_stat	status;

	/*
	 * Note: ALL buffer pools use the same memory type RDMARW.
	 */
	status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
	if (status == RDMA_SUCCESS) {
		buf_handle->mrc_linfo = (uint64_t)(uintptr_t)mr_hdl;
		buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
		buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
	} else {
		buf_handle->mrc_linfo = NULL;
		buf_handle->mrc_lmr = 0;
		buf_handle->mrc_rmr = 0;
	}
	return (status);
}

/* ARGSUSED */
static rdma_stat
rib_deregistermemsync_via_hca(rib_hca_t *hca, caddr_t buf,
    struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle)
{

	(void) rib_deregistermem_via_hca(hca, buf, buf_handle);
	return (RDMA_SUCCESS);
}

/* ARGSUSED */
static rdma_stat
rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf, struct mrc buf_handle)
{

	(void) ibt_deregister_mr(hca->hca_hdl,
	    (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
	return (RDMA_SUCCESS);
}

/*
 * Check if the IP interface named by `lifrp' is RDMA-capable.
 */
static boolean_t
rpcib_rdma_capable_interface(struct lifreq *lifrp)
{
	char ifname[LIFNAMSIZ];
	char *cp;

	if (lifrp->lifr_type == IFT_IB)
		return (B_TRUE);

	/*
	 * Strip off the logical interface portion before getting
	 * intimate with the name.
	 */
	(void) strlcpy(ifname, lifrp->lifr_name, LIFNAMSIZ);
	if ((cp = strchr(ifname, ':')) != NULL)
		*cp = '\0';

	return (strcmp("lo0", ifname) == 0);
}

static int
rpcib_do_ip_ioctl(int cmd, int len, void *arg)
{
	vnode_t		*kvp, *vp;
	TIUSER		*tiptr;
	struct strioctl	iocb;
	k_sigset_t	smask;
	int		err = 0;

	if (lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW, NULLVPP, &kvp) == 0) {
		if (t_kopen(NULL, kvp->v_rdev, FREAD|FWRITE,
		    &tiptr, CRED()) == 0) {
			vp = tiptr->fp->f_vnode;
		} else {
			VN_RELE(kvp);
			return (EPROTO);
		}
	} else {
		return (EPROTO);
	}

	iocb.ic_cmd = cmd;
	iocb.ic_timout = 0;
	iocb.ic_len = len;
	iocb.ic_dp = (caddr_t)arg;
	sigintr(&smask, 0);
	err = kstr_ioctl(vp, I_STR, (intptr_t)&iocb);
	sigunintr(&smask);
	(void) t_kclose(tiptr, 0);
	VN_RELE(kvp);
	return (err);
}

/*
 * Issue an SIOCGLIFCONF down to IP and return the result in `lifcp'.
 * lifcp->lifc_buf is dynamically allocated to be *bufsizep bytes.
 */
static int
rpcib_do_lifconf(struct lifconf *lifcp, uint_t *bufsizep)
{
	int err;
	struct lifnum lifn;

	bzero(&lifn, sizeof (struct lifnum));
	lifn.lifn_family = AF_UNSPEC;

	err = rpcib_do_ip_ioctl(SIOCGLIFNUM, sizeof (struct lifnum), &lifn);
	if (err != 0)
		return (err);

	/*
	 * Pad the interface count to account for additional interfaces that
	 * may have been configured between the SIOCGLIFNUM and SIOCGLIFCONF.
	 */
	lifn.lifn_count += 4;

	bzero(lifcp, sizeof (struct lifconf));
	lifcp->lifc_family = AF_UNSPEC;
	lifcp->lifc_len = *bufsizep = lifn.lifn_count * sizeof (struct lifreq);
	lifcp->lifc_buf = kmem_zalloc(*bufsizep, KM_SLEEP);

	err = rpcib_do_ip_ioctl(SIOCGLIFCONF, sizeof (struct lifconf), lifcp);
	if (err != 0) {
		kmem_free(lifcp->lifc_buf, *bufsizep);
		return (err);
	}
	return (0);
}

static boolean_t
rpcib_get_ib_addresses(rpcib_ipaddrs_t *addrs4, rpcib_ipaddrs_t *addrs6)
{
	uint_t i, nifs;
	uint_t bufsize;
	struct lifconf lifc;
	struct lifreq *lifrp;
	struct sockaddr_in *sinp;
	struct sockaddr_in6 *sin6p;

	bzero(addrs4, sizeof (rpcib_ipaddrs_t));
	bzero(addrs6, sizeof (rpcib_ipaddrs_t));

	if (rpcib_do_lifconf(&lifc, &bufsize) != 0)
		return (B_FALSE);

	if ((nifs = lifc.lifc_len / sizeof (struct lifreq)) == 0) {
		kmem_free(lifc.lifc_buf, bufsize);
		return (B_FALSE);
	}

	/*
	 * Worst case is that all of the addresses are IB-capable and have
	 * the same address family, so size our buffers accordingly.
	 */
	addrs4->ri_size = nifs * sizeof (struct sockaddr_in);
	addrs4->ri_list = kmem_zalloc(addrs4->ri_size, KM_SLEEP);
	addrs6->ri_size = nifs * sizeof (struct sockaddr_in6);
	addrs6->ri_list = kmem_zalloc(addrs6->ri_size, KM_SLEEP);

	for (lifrp = lifc.lifc_req, i = 0; i < nifs; i++, lifrp++) {
		if (!rpcib_rdma_capable_interface(lifrp))
			continue;

		if (lifrp->lifr_addr.ss_family == AF_INET) {
			sinp = addrs4->ri_list;
			bcopy(&lifrp->lifr_addr, &sinp[addrs4->ri_count++],
			    sizeof (struct sockaddr_in));
		} else if (lifrp->lifr_addr.ss_family == AF_INET6) {
			sin6p = addrs6->ri_list;
			bcopy(&lifrp->lifr_addr, &sin6p[addrs6->ri_count++],
			    sizeof (struct sockaddr_in6));
		}
	}

	kmem_free(lifc.lifc_buf, bufsize);
	return (B_TRUE);
}

/* ARGSUSED */
static int
rpcib_cache_kstat_update(kstat_t *ksp, int rw)
{
	if (KSTAT_WRITE == rw) {
		return (EACCES);
	}
	rpcib_kstat.cache_limit.value.ui64 =
	    (uint64_t)cache_limit;
	rpcib_kstat.cache_allocation.value.ui64 =
	    (uint64_t)cache_allocation;
	rpcib_kstat.cache_hits.value.ui64 =
	    (uint64_t)cache_hits;
	rpcib_kstat.cache_misses.value.ui64 =
	    (uint64_t)cache_misses;
	rpcib_kstat.cache_misses_above_the_limit.value.ui64 =
	    (uint64_t)cache_misses_above_the_limit;
	return (0);
}
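
/*
 * Illustrative sketch (compiled out): how the update routine above would be
 * wired into a named kstat.  The driver performs the real registration
 * elsewhere; the module and name strings below are placeholders only.
 * KSTAT_FLAG_VIRTUAL lets the kstat framework report the statically
 * allocated rpcib_kstat structure directly, and rpcib_cache_kstat_update()
 * refreshes its values on every read.
 */
#if 0
static void
example_install_cache_kstat(void)
{
	kstat_t	*ksp;

	ksp = kstat_create("rpcib", 0, "example_cache", "misc",
	    KSTAT_TYPE_NAMED,
	    sizeof (rpcib_kstat) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL);
	if (ksp == NULL)
		return;

	ksp->ks_data = (void *)&rpcib_kstat;
	ksp->ks_update = rpcib_cache_kstat_update;
	kstat_install(ksp);
}
#endif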