1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * Copyright (c) 2007, The Ohio State University. All rights reserved. 28 * 29 * Portions of this source code are developed by the team members of 30 * The Ohio State University's Network-Based Computing Laboratory (NBCL), 31 * headed by Professor Dhabaleswar K. (DK) Panda. 32 * 33 * Acknowledgements to contributions from developers: 34 * Ranjit Noronha: noronha@cse.ohio-state.edu 35 * Lei Chai : chail@cse.ohio-state.edu 36 * Weikuan Yu : yuw@cse.ohio-state.edu 37 * 38 */ 39 40 /* 41 * The rpcib plugin. Implements the interface for RDMATF's 42 * interaction with IBTF. 43 */ 44 45 #include <sys/param.h> 46 #include <sys/types.h> 47 #include <sys/user.h> 48 #include <sys/systm.h> 49 #include <sys/sysmacros.h> 50 #include <sys/proc.h> 51 #include <sys/socket.h> 52 #include <sys/file.h> 53 #include <sys/stream.h> 54 #include <sys/strsubr.h> 55 #include <sys/stropts.h> 56 #include <sys/errno.h> 57 #include <sys/kmem.h> 58 #include <sys/debug.h> 59 #include <sys/pathname.h> 60 #include <sys/kstat.h> 61 #include <sys/t_lock.h> 62 #include <sys/ddi.h> 63 #include <sys/cmn_err.h> 64 #include <sys/time.h> 65 #include <sys/isa_defs.h> 66 #include <sys/callb.h> 67 #include <sys/sunddi.h> 68 #include <sys/sunndi.h> 69 #include <sys/sdt.h> 70 #include <sys/ib/ibtl/ibti.h> 71 #include <rpc/rpc.h> 72 #include <rpc/ib.h> 73 #include <sys/modctl.h> 74 #include <sys/kstr.h> 75 #include <sys/sockio.h> 76 #include <sys/vnode.h> 77 #include <sys/tiuser.h> 78 #include <net/if.h> 79 #include <net/if_types.h> 80 #include <sys/cred.h> 81 #include <rpc/rpc_rdma.h> 82 #include <nfs/nfs.h> 83 #include <sys/atomic.h> 84 85 #define NFS_RDMA_PORT 2050 86 87 /* 88 * Convenience structures for connection management 89 */ 90 typedef struct rpcib_ipaddrs { 91 void *ri_list; /* pointer to list of addresses */ 92 uint_t ri_count; /* number of addresses in list */ 93 uint_t ri_size; /* size of ri_list in bytes */ 94 } rpcib_ipaddrs_t; 95 96 97 typedef struct rpcib_ping { 98 rib_hca_t *hca; 99 ibt_path_info_t path; 100 ibt_ip_addr_t srcip; 101 ibt_ip_addr_t dstip; 102 } rpcib_ping_t; 103 104 /* 105 * Prototype declarations for driver ops 106 */ 107 static int rpcib_attach(dev_info_t *, ddi_attach_cmd_t); 108 static int rpcib_getinfo(dev_info_t *, ddi_info_cmd_t, 109 void *, void **); 110 static int rpcib_detach(dev_info_t *, ddi_detach_cmd_t); 111 static boolean_t rpcib_rdma_capable_interface(struct lifreq *); 112 static int rpcib_do_ip_ioctl(int, int, void *); 113 static boolean_t rpcib_get_ib_addresses(rpcib_ipaddrs_t *, rpcib_ipaddrs_t *); 114 static int
rpcib_cache_kstat_update(kstat_t *, int); 115 static void rib_force_cleanup(void *); 116 117 struct { 118 kstat_named_t cache_limit; 119 kstat_named_t cache_allocation; 120 kstat_named_t cache_hits; 121 kstat_named_t cache_misses; 122 kstat_named_t cache_misses_above_the_limit; 123 } rpcib_kstat = { 124 {"cache_limit", KSTAT_DATA_UINT64 }, 125 {"cache_allocation", KSTAT_DATA_UINT64 }, 126 {"cache_hits", KSTAT_DATA_UINT64 }, 127 {"cache_misses", KSTAT_DATA_UINT64 }, 128 {"cache_misses_above_the_limit", KSTAT_DATA_UINT64 }, 129 }; 130 131 /* rpcib cb_ops */ 132 static struct cb_ops rpcib_cbops = { 133 nulldev, /* open */ 134 nulldev, /* close */ 135 nodev, /* strategy */ 136 nodev, /* print */ 137 nodev, /* dump */ 138 nodev, /* read */ 139 nodev, /* write */ 140 nodev, /* ioctl */ 141 nodev, /* devmap */ 142 nodev, /* mmap */ 143 nodev, /* segmap */ 144 nochpoll, /* poll */ 145 ddi_prop_op, /* prop_op */ 146 NULL, /* stream */ 147 D_MP, /* cb_flag */ 148 CB_REV, /* rev */ 149 nodev, /* int (*cb_aread)() */ 150 nodev /* int (*cb_awrite)() */ 151 }; 152 153 /* 154 * Device options 155 */ 156 static struct dev_ops rpcib_ops = { 157 DEVO_REV, /* devo_rev, */ 158 0, /* refcnt */ 159 rpcib_getinfo, /* info */ 160 nulldev, /* identify */ 161 nulldev, /* probe */ 162 rpcib_attach, /* attach */ 163 rpcib_detach, /* detach */ 164 nodev, /* reset */ 165 &rpcib_cbops, /* driver ops - devctl interfaces */ 166 NULL, /* bus operations */ 167 NULL, /* power */ 168 ddi_quiesce_not_needed, /* quiesce */ 169 }; 170 171 /* 172 * Module linkage information. 173 */ 174 175 static struct modldrv rib_modldrv = { 176 &mod_driverops, /* Driver module */ 177 "RPCIB plugin driver", /* Driver name and version */ 178 &rpcib_ops, /* Driver ops */ 179 }; 180 181 static struct modlinkage rib_modlinkage = { 182 MODREV_1, 183 (void *)&rib_modldrv, 184 NULL 185 }; 186 187 typedef struct rib_lrc_entry { 188 struct rib_lrc_entry *forw; 189 struct rib_lrc_entry *back; 190 char *lrc_buf; 191 192 uint32_t lrc_len; 193 void *avl_node; 194 bool_t registered; 195 196 struct mrc lrc_mhandle; 197 bool_t lrc_on_freed_list; 198 } rib_lrc_entry_t; 199 200 typedef struct cache_struct { 201 rib_lrc_entry_t r; 202 uint32_t len; 203 uint32_t elements; 204 kmutex_t node_lock; 205 avl_node_t avl_link; 206 } cache_avl_struct_t; 207 208 static uint64_t rib_total_buffers = 0; 209 uint64_t cache_limit = 100 * 1024 * 1024; 210 static volatile uint64_t cache_allocation = 0; 211 static uint64_t cache_watermark = 80 * 1024 * 1024; 212 static uint64_t cache_hits = 0; 213 static uint64_t cache_misses = 0; 214 static uint64_t cache_cold_misses = 0; 215 static uint64_t cache_hot_misses = 0; 216 static uint64_t cache_misses_above_the_limit = 0; 217 static bool_t stats_enabled = FALSE; 218 219 static uint64_t max_unsignaled_rws = 5; 220 221 /* 222 * rib_stat: private data pointer used when registering 223 * with the IBTF. It is returned to the consumer 224 * in all callbacks. 225 */ 226 static rpcib_state_t *rib_stat = NULL; 227 228 #define RNR_RETRIES IBT_RNR_RETRY_1 229 #define MAX_PORTS 2 230 231 int preposted_rbufs = RDMA_BUFS_GRANT; 232 int send_threshold = 1; 233 234 /* 235 * State of the plugin. 236 * ACCEPT = accepting new connections and requests. 237 * NO_ACCEPT = not accepting new connections and requests. 238 * This should eventually move to the rpcib_state_t structure, since this 239 * will tell in which state the plugin is for a particular type of service 240 * like NFS, NLM or v4 Callback daemon.
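 * The flag is protected by plugin_state_lock; for example,
 * rib_svc_rcq_handler() below takes the lock and only hands an
 * incoming message to svc_queuereq() while plugin_state == ACCEPT,
 * and rpcib_detach() moves the flag to NO_ACCEPT under the same lock.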
The plugin might be in accept 241 * state for one and in no_accept state for the other. 242 */ 243 int plugin_state; 244 kmutex_t plugin_state_lock; 245 246 ldi_ident_t rpcib_li; 247 248 /* 249 * RPCIB RDMATF operations 250 */ 251 #if defined(MEASURE_POOL_DEPTH) 252 static void rib_posted_rbufs(uint32_t x) { return; } 253 #endif 254 static rdma_stat rib_reachable(int addr_type, struct netbuf *, void **handle); 255 static rdma_stat rib_disconnect(CONN *conn); 256 static void rib_listen(struct rdma_svc_data *rd); 257 static void rib_listen_stop(struct rdma_svc_data *rd); 258 static rdma_stat rib_registermem(CONN *conn, caddr_t adsp, caddr_t buf, 259 uint_t buflen, struct mrc *buf_handle); 260 static rdma_stat rib_deregistermem(CONN *conn, caddr_t buf, 261 struct mrc buf_handle); 262 static rdma_stat rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp, 263 caddr_t buf, uint_t buflen, struct mrc *buf_handle); 264 static rdma_stat rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf, 265 struct mrc buf_handle); 266 static rdma_stat rib_registermemsync(CONN *conn, caddr_t adsp, caddr_t buf, 267 uint_t buflen, struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle, 268 void *lrc); 269 static rdma_stat rib_deregistermemsync(CONN *conn, caddr_t buf, 270 struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle, void *); 271 static rdma_stat rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle, 272 caddr_t buf, int len, int cpu); 273 274 static rdma_stat rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf); 275 276 static void rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf); 277 static void *rib_rbuf_alloc(CONN *, rdma_buf_t *); 278 279 static void rib_rbuf_free(CONN *conn, int ptype, void *buf); 280 281 static rdma_stat rib_send(CONN *conn, struct clist *cl, uint32_t msgid); 282 static rdma_stat rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid); 283 static rdma_stat rib_post_resp(CONN *conn, struct clist *cl, uint32_t msgid); 284 static rdma_stat rib_post_resp_remove(CONN *conn, uint32_t msgid); 285 static rdma_stat rib_post_recv(CONN *conn, struct clist *cl); 286 static rdma_stat rib_recv(CONN *conn, struct clist **clp, uint32_t msgid); 287 static rdma_stat rib_read(CONN *conn, struct clist *cl, int wait); 288 static rdma_stat rib_write(CONN *conn, struct clist *cl, int wait); 289 static rdma_stat rib_ping_srv(int addr_type, struct netbuf *, rpcib_ping_t *); 290 static rdma_stat rib_conn_get(struct netbuf *, int addr_type, void *, CONN **); 291 static rdma_stat rib_conn_release(CONN *conn); 292 static rdma_stat rib_getinfo(rdma_info_t *info); 293 294 static rib_lrc_entry_t *rib_get_cache_buf(CONN *conn, uint32_t len); 295 static void rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *buf); 296 static void rib_destroy_cache(rib_hca_t *hca); 297 static void rib_server_side_cache_reclaim(void *argp); 298 static int avl_compare(const void *t1, const void *t2); 299 300 static void rib_stop_services(rib_hca_t *); 301 static void rib_close_channels(rib_conn_list_t *); 302 303 /* 304 * RPCIB addressing operations 305 */ 306 307 /* 308 * RDMA operations the RPCIB module exports 309 */ 310 static rdmaops_t rib_ops = { 311 rib_reachable, 312 rib_conn_get, 313 rib_conn_release, 314 rib_listen, 315 rib_listen_stop, 316 rib_registermem, 317 rib_deregistermem, 318 rib_registermemsync, 319 rib_deregistermemsync, 320 rib_syncmem, 321 rib_reg_buf_alloc, 322 rib_reg_buf_free, 323 rib_send, 324 rib_send_resp, 325 rib_post_resp, 326 rib_post_resp_remove, 327 rib_post_recv, 328 rib_recv, 329 rib_read, 330 rib_write, 
331 rib_getinfo, 332 }; 333 334 /* 335 * RDMATF RPCIB plugin details 336 */ 337 static rdma_mod_t rib_mod = { 338 "ibtf", /* api name */ 339 RDMATF_VERS_1, 340 0, 341 &rib_ops, /* rdma op vector for ibtf */ 342 }; 343 344 static rdma_stat open_hcas(rpcib_state_t *); 345 static rdma_stat rib_qp_init(rib_qp_t *, int); 346 static void rib_svc_scq_handler(ibt_cq_hdl_t, void *); 347 static void rib_clnt_scq_handler(ibt_cq_hdl_t, void *); 348 static void rib_clnt_rcq_handler(ibt_cq_hdl_t, void *); 349 static void rib_svc_rcq_handler(ibt_cq_hdl_t, void *); 350 static rib_bufpool_t *rib_rbufpool_create(rib_hca_t *hca, int ptype, int num); 351 static rdma_stat rib_reg_mem(rib_hca_t *, caddr_t adsp, caddr_t, uint_t, 352 ibt_mr_flags_t, ibt_mr_hdl_t *, ibt_mr_desc_t *); 353 static rdma_stat rib_reg_mem_user(rib_hca_t *, caddr_t, uint_t, ibt_mr_flags_t, 354 ibt_mr_hdl_t *, ibt_mr_desc_t *, caddr_t); 355 static rdma_stat rib_conn_to_srv(rib_hca_t *, rib_qp_t *, rpcib_ping_t *); 356 static rdma_stat rib_clnt_create_chan(rib_hca_t *, struct netbuf *, 357 rib_qp_t **); 358 static rdma_stat rib_svc_create_chan(rib_hca_t *, caddr_t, uint8_t, 359 rib_qp_t **); 360 static rdma_stat rib_sendwait(rib_qp_t *, struct send_wid *); 361 static struct send_wid *rib_init_sendwait(uint32_t, int, rib_qp_t *); 362 static int rib_free_sendwait(struct send_wid *); 363 static struct rdma_done_list *rdma_done_add(rib_qp_t *qp, uint32_t xid); 364 static void rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd); 365 static void rdma_done_rem_list(rib_qp_t *); 366 static void rdma_done_notify(rib_qp_t *qp, uint32_t xid); 367 368 static void rib_async_handler(void *, 369 ibt_hca_hdl_t, ibt_async_code_t, ibt_async_event_t *); 370 static rdma_stat rib_rem_rep(rib_qp_t *, struct reply *); 371 static struct svc_recv *rib_init_svc_recv(rib_qp_t *, ibt_wr_ds_t *); 372 static int rib_free_svc_recv(struct svc_recv *); 373 static struct recv_wid *rib_create_wid(rib_qp_t *, ibt_wr_ds_t *, uint32_t); 374 static void rib_free_wid(struct recv_wid *); 375 static rdma_stat rib_disconnect_channel(CONN *, rib_conn_list_t *); 376 static void rib_detach_hca(rib_hca_t *); 377 378 /* 379 * Registration with IBTF as a consumer 380 */ 381 static struct ibt_clnt_modinfo_s rib_modinfo = { 382 IBTI_V_CURR, 383 IBT_GENERIC, 384 rib_async_handler, /* async event handler */ 385 NULL, /* Memory Region Handler */ 386 "nfs/ib" 387 }; 388 389 /* 390 * Global structure 391 */ 392 393 typedef struct rpcib_s { 394 dev_info_t *rpcib_dip; 395 kmutex_t rpcib_mutex; 396 } rpcib_t; 397 398 rpcib_t rpcib; 399 400 /* 401 * /etc/system controlled variable to control 402 * debugging in rpcib kernel module. 403 * Set it to values greater than 1 to control 404 * the amount of debugging messages produced. 405 */ 406 int rib_debug = 0; 407 408 int 409 _init(void) 410 { 411 int error; 412 413 error = mod_install((struct modlinkage *)&rib_modlinkage); 414 if (error != 0) { 415 /* 416 * Could not load module 417 */ 418 return (error); 419 } 420 mutex_init(&plugin_state_lock, NULL, MUTEX_DRIVER, NULL); 421 return (0); 422 } 423 424 int 425 _fini() 426 { 427 int status; 428 429 /* 430 * Remove module 431 */ 432 if ((status = mod_remove(&rib_modlinkage)) != 0) { 433 return (status); 434 } 435 mutex_destroy(&plugin_state_lock); 436 return (0); 437 } 438 439 int 440 _info(struct modinfo *modinfop) 441 { 442 return (mod_info(&rib_modlinkage, modinfop)); 443 } 444 445 /* 446 * rpcib_getinfo() 447 * Given the device number, return the devinfo pointer or the 448 * instance number.
449 * Note: always succeed DDI_INFO_DEVT2INSTANCE, even before attach. 450 */ 451 452 /*ARGSUSED*/ 453 static int 454 rpcib_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result) 455 { 456 int ret = DDI_SUCCESS; 457 458 switch (cmd) { 459 case DDI_INFO_DEVT2DEVINFO: 460 if (rpcib.rpcib_dip != NULL) 461 *result = rpcib.rpcib_dip; 462 else { 463 *result = NULL; 464 ret = DDI_FAILURE; 465 } 466 break; 467 468 case DDI_INFO_DEVT2INSTANCE: 469 *result = NULL; 470 break; 471 472 default: 473 ret = DDI_FAILURE; 474 } 475 return (ret); 476 } 477 478 static int 479 rpcib_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 480 { 481 ibt_status_t ibt_status; 482 rdma_stat r_status; 483 484 switch (cmd) { 485 case DDI_ATTACH: 486 break; 487 case DDI_RESUME: 488 return (DDI_SUCCESS); 489 default: 490 return (DDI_FAILURE); 491 } 492 493 mutex_init(&rpcib.rpcib_mutex, NULL, MUTEX_DRIVER, NULL); 494 495 mutex_enter(&rpcib.rpcib_mutex); 496 if (rpcib.rpcib_dip != NULL) { 497 mutex_exit(&rpcib.rpcib_mutex); 498 return (DDI_FAILURE); 499 } 500 rpcib.rpcib_dip = dip; 501 mutex_exit(&rpcib.rpcib_mutex); 502 /* 503 * Create the "rpcib" minor-node. 504 */ 505 if (ddi_create_minor_node(dip, 506 "rpcib", S_IFCHR, 0, DDI_PSEUDO, 0) != DDI_SUCCESS) { 507 /* Error message, no cmn_err as they print on console */ 508 return (DDI_FAILURE); 509 } 510 511 if (rib_stat == NULL) { 512 rib_stat = kmem_zalloc(sizeof (*rib_stat), KM_SLEEP); 513 mutex_init(&rib_stat->open_hca_lock, NULL, MUTEX_DRIVER, NULL); 514 } 515 516 rib_stat->hca_count = ibt_get_hca_list(&rib_stat->hca_guids); 517 if (rib_stat->hca_count < 1) { 518 mutex_destroy(&rib_stat->open_hca_lock); 519 kmem_free(rib_stat, sizeof (*rib_stat)); 520 rib_stat = NULL; 521 return (DDI_FAILURE); 522 } 523 524 ibt_status = ibt_attach(&rib_modinfo, dip, 525 (void *)rib_stat, &rib_stat->ibt_clnt_hdl); 526 527 if (ibt_status != IBT_SUCCESS) { 528 ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count); 529 mutex_destroy(&rib_stat->open_hca_lock); 530 kmem_free(rib_stat, sizeof (*rib_stat)); 531 rib_stat = NULL; 532 return (DDI_FAILURE); 533 } 534 535 mutex_enter(&rib_stat->open_hca_lock); 536 if (open_hcas(rib_stat) != RDMA_SUCCESS) { 537 mutex_exit(&rib_stat->open_hca_lock); 538 goto open_fail; 539 } 540 mutex_exit(&rib_stat->open_hca_lock); 541 542 if (ddi_prop_update_int(DDI_DEV_T_NONE, dip, DDI_NO_AUTODETACH, 1) != 543 DDI_PROP_SUCCESS) { 544 cmn_err(CE_WARN, "rpcib_attach: ddi-no-autodetach prop update " 545 "failed."); 546 goto register_fail; 547 } 548 549 /* 550 * Register with rdmatf 551 */ 552 rib_mod.rdma_count = rib_stat->nhca_inited; 553 r_status = rdma_register_mod(&rib_mod); 554 if (r_status != RDMA_SUCCESS && r_status != RDMA_REG_EXIST) { 555 cmn_err(CE_WARN, "rpcib_attach:rdma_register_mod failed, " 556 "status = %d", r_status); 557 goto register_fail; 558 } 559 560 return (DDI_SUCCESS); 561 562 register_fail: 563 rib_detach_hca(rib_stat->hca); 564 open_fail: 565 ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count); 566 (void) ibt_detach(rib_stat->ibt_clnt_hdl); 567 mutex_destroy(&rib_stat->open_hca_lock); 568 kmem_free(rib_stat, sizeof (*rib_stat)); 569 rib_stat = NULL; 570 return (DDI_FAILURE); 571 } 572 573 /*ARGSUSED*/ 574 static int 575 rpcib_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 576 { 577 switch (cmd) { 578 579 case DDI_DETACH: 580 break; 581 582 case DDI_SUSPEND: 583 default: 584 return (DDI_FAILURE); 585 } 586 587 /* 588 * Detach the hca and free resources 589 */ 590 mutex_enter(&plugin_state_lock); 591 plugin_state = 
NO_ACCEPT; 592 mutex_exit(&plugin_state_lock); 593 rib_detach_hca(rib_stat->hca); 594 ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count); 595 (void) ibt_detach(rib_stat->ibt_clnt_hdl); 596 mutex_destroy(&rib_stat->open_hca_lock); 597 if (rib_stat->hcas) { 598 kmem_free(rib_stat->hcas, rib_stat->hca_count * 599 sizeof (rib_hca_t)); 600 rib_stat->hcas = NULL; 601 } 602 kmem_free(rib_stat, sizeof (*rib_stat)); 603 rib_stat = NULL; 604 605 mutex_enter(&rpcib.rpcib_mutex); 606 rpcib.rpcib_dip = NULL; 607 mutex_exit(&rpcib.rpcib_mutex); 608 mutex_destroy(&rpcib.rpcib_mutex); 609 return (DDI_SUCCESS); 610 } 611 612 613 static void rib_rbufpool_free(rib_hca_t *, int); 614 static void rib_rbufpool_deregister(rib_hca_t *, int); 615 static void rib_rbufpool_destroy(rib_hca_t *hca, int ptype); 616 static struct reply *rib_addreplylist(rib_qp_t *, uint32_t); 617 static rdma_stat rib_rem_replylist(rib_qp_t *); 618 static int rib_remreply(rib_qp_t *, struct reply *); 619 static rdma_stat rib_add_connlist(CONN *, rib_conn_list_t *); 620 static rdma_stat rib_rm_conn(CONN *, rib_conn_list_t *); 621 622 623 /* 624 * One CQ pair per HCA 625 */ 626 static rdma_stat 627 rib_create_cq(rib_hca_t *hca, uint32_t cq_size, ibt_cq_handler_t cq_handler, 628 rib_cq_t **cqp, rpcib_state_t *ribstat) 629 { 630 rib_cq_t *cq; 631 ibt_cq_attr_t cq_attr; 632 uint32_t real_size; 633 ibt_status_t status; 634 rdma_stat error = RDMA_SUCCESS; 635 636 cq = kmem_zalloc(sizeof (rib_cq_t), KM_SLEEP); 637 cq->rib_hca = hca; 638 cq_attr.cq_size = cq_size; 639 cq_attr.cq_flags = IBT_CQ_NO_FLAGS; 640 status = ibt_alloc_cq(hca->hca_hdl, &cq_attr, &cq->rib_cq_hdl, 641 &real_size); 642 if (status != IBT_SUCCESS) { 643 cmn_err(CE_WARN, "rib_create_cq: ibt_alloc_cq() failed," 644 " status=%d", status); 645 error = RDMA_FAILED; 646 goto fail; 647 } 648 ibt_set_cq_handler(cq->rib_cq_hdl, cq_handler, ribstat); 649 650 /* 651 * Enable CQ callbacks. CQ Callbacks are single shot 652 * (e.g. you have to call ibt_enable_cq_notify() 653 * after each callback to get another one). 
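 * A minimal sketch of the pattern used by the completion handlers
 * below (variable names illustrative only):
 *
 *	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
 *	while (ibt_poll_cq(cq_hdl, &wc, 1, NULL) == IBT_SUCCESS)
 *		<process wc.wc_id and wc.wc_status>;
 *
 * Re-arming the CQ before draining it ensures that a completion
 * arriving between the last poll and the re-arm still generates a
 * notification.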
654 */ 655 status = ibt_enable_cq_notify(cq->rib_cq_hdl, IBT_NEXT_COMPLETION); 656 if (status != IBT_SUCCESS) { 657 cmn_err(CE_WARN, "rib_create_cq: " 658 "enable_cq_notify failed, status %d", status); 659 error = RDMA_FAILED; 660 goto fail; 661 } 662 *cqp = cq; 663 664 return (error); 665 fail: 666 if (cq->rib_cq_hdl) 667 (void) ibt_free_cq(cq->rib_cq_hdl); 668 if (cq) 669 kmem_free(cq, sizeof (rib_cq_t)); 670 return (error); 671 } 672 673 static rdma_stat 674 open_hcas(rpcib_state_t *ribstat) 675 { 676 rib_hca_t *hca; 677 ibt_status_t ibt_status; 678 rdma_stat status; 679 ibt_hca_portinfo_t *pinfop; 680 ibt_pd_flags_t pd_flags = IBT_PD_NO_FLAGS; 681 uint_t size, cq_size; 682 int i; 683 kstat_t *ksp; 684 cache_avl_struct_t example_avl_node; 685 char rssc_name[32]; 686 687 ASSERT(MUTEX_HELD(&ribstat->open_hca_lock)); 688 689 if (ribstat->hcas == NULL) 690 ribstat->hcas = kmem_zalloc(ribstat->hca_count * 691 sizeof (rib_hca_t), KM_SLEEP); 692 693 /* 694 * Open a hca and setup for RDMA 695 */ 696 for (i = 0; i < ribstat->hca_count; i++) { 697 ibt_status = ibt_open_hca(ribstat->ibt_clnt_hdl, 698 ribstat->hca_guids[i], 699 &ribstat->hcas[i].hca_hdl); 700 if (ibt_status != IBT_SUCCESS) { 701 continue; 702 } 703 ribstat->hcas[i].hca_guid = ribstat->hca_guids[i]; 704 hca = &(ribstat->hcas[i]); 705 hca->ibt_clnt_hdl = ribstat->ibt_clnt_hdl; 706 hca->state = HCA_INITED; 707 708 /* 709 * query HCA info 710 */ 711 ibt_status = ibt_query_hca(hca->hca_hdl, &hca->hca_attrs); 712 if (ibt_status != IBT_SUCCESS) { 713 goto fail1; 714 } 715 716 /* 717 * One PD (Protection Domain) per HCA. 718 * A qp is allowed to access a memory region 719 * only when it's in the same PD as that of 720 * the memory region. 721 */ 722 ibt_status = ibt_alloc_pd(hca->hca_hdl, pd_flags, &hca->pd_hdl); 723 if (ibt_status != IBT_SUCCESS) { 724 goto fail1; 725 } 726 727 /* 728 * query HCA ports 729 */ 730 ibt_status = ibt_query_hca_ports(hca->hca_hdl, 731 0, &pinfop, &hca->hca_nports, &size); 732 if (ibt_status != IBT_SUCCESS) { 733 goto fail2; 734 } 735 hca->hca_ports = pinfop; 736 hca->hca_pinfosz = size; 737 pinfop = NULL; 738 739 cq_size = DEF_CQ_SIZE; /* default cq size */ 740 /* 741 * Create 2 pairs of cq's (1 pair for client 742 * and the other pair for server) on this hca. 743 * If number of qp's gets too large, then several 744 * cq's will be needed. 745 */ 746 status = rib_create_cq(hca, cq_size, rib_svc_rcq_handler, 747 &hca->svc_rcq, ribstat); 748 if (status != RDMA_SUCCESS) { 749 goto fail3; 750 } 751 752 status = rib_create_cq(hca, cq_size, rib_svc_scq_handler, 753 &hca->svc_scq, ribstat); 754 if (status != RDMA_SUCCESS) { 755 goto fail3; 756 } 757 758 status = rib_create_cq(hca, cq_size, rib_clnt_rcq_handler, 759 &hca->clnt_rcq, ribstat); 760 if (status != RDMA_SUCCESS) { 761 goto fail3; 762 } 763 764 status = rib_create_cq(hca, cq_size, rib_clnt_scq_handler, 765 &hca->clnt_scq, ribstat); 766 if (status != RDMA_SUCCESS) { 767 goto fail3; 768 } 769 770 /* 771 * Create buffer pools. 772 * Note rib_rbuf_create also allocates memory windows. 
773 */ 774 hca->recv_pool = rib_rbufpool_create(hca, 775 RECV_BUFFER, MAX_BUFS); 776 if (hca->recv_pool == NULL) { 777 goto fail3; 778 } 779 780 hca->send_pool = rib_rbufpool_create(hca, 781 SEND_BUFFER, MAX_BUFS); 782 if (hca->send_pool == NULL) { 783 rib_rbufpool_destroy(hca, RECV_BUFFER); 784 goto fail3; 785 } 786 787 if (hca->server_side_cache == NULL) { 788 (void) sprintf(rssc_name, 789 "rib_server_side_cache_%04d", i); 790 hca->server_side_cache = kmem_cache_create( 791 rssc_name, 792 sizeof (cache_avl_struct_t), 0, 793 NULL, 794 NULL, 795 rib_server_side_cache_reclaim, 796 hca, NULL, 0); 797 } 798 799 avl_create(&hca->avl_tree, 800 avl_compare, 801 sizeof (cache_avl_struct_t), 802 (uint_t)(uintptr_t)&example_avl_node.avl_link- 803 (uint_t)(uintptr_t)&example_avl_node); 804 805 rw_init(&hca->avl_rw_lock, 806 NULL, RW_DRIVER, hca->iblock); 807 mutex_init(&hca->cache_allocation, 808 NULL, MUTEX_DRIVER, NULL); 809 hca->avl_init = TRUE; 810 811 /* Create kstats for the cache */ 812 ASSERT(INGLOBALZONE(curproc)); 813 814 if (!stats_enabled) { 815 ksp = kstat_create_zone("unix", 0, "rpcib_cache", "rpc", 816 KSTAT_TYPE_NAMED, 817 sizeof (rpcib_kstat) / sizeof (kstat_named_t), 818 KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, 819 GLOBAL_ZONEID); 820 if (ksp) { 821 ksp->ks_data = (void *) &rpcib_kstat; 822 ksp->ks_update = rpcib_cache_kstat_update; 823 kstat_install(ksp); 824 stats_enabled = TRUE; 825 } 826 } 827 if (NULL == hca->reg_cache_clean_up) { 828 hca->reg_cache_clean_up = ddi_taskq_create(NULL, 829 "REG_CACHE_CLEANUP", 1, TASKQ_DEFAULTPRI, 0); 830 } 831 832 /* 833 * Initialize the registered service list and 834 * the lock 835 */ 836 hca->service_list = NULL; 837 rw_init(&hca->service_list_lock, NULL, RW_DRIVER, hca->iblock); 838 839 mutex_init(&hca->cb_lock, NULL, MUTEX_DRIVER, hca->iblock); 840 cv_init(&hca->cb_cv, NULL, CV_DRIVER, NULL); 841 rw_init(&hca->cl_conn_list.conn_lock, NULL, RW_DRIVER, 842 hca->iblock); 843 rw_init(&hca->srv_conn_list.conn_lock, NULL, RW_DRIVER, 844 hca->iblock); 845 rw_init(&hca->state_lock, NULL, RW_DRIVER, hca->iblock); 846 mutex_init(&hca->inuse_lock, NULL, MUTEX_DRIVER, hca->iblock); 847 hca->inuse = TRUE; 848 /* 849 * XXX One hca only. Add multi-hca functionality if needed 850 * later. 851 */ 852 ribstat->hca = hca; 853 ribstat->nhca_inited++; 854 ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz); 855 break; 856 857 fail3: 858 ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz); 859 fail2: 860 (void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl); 861 fail1: 862 (void) ibt_close_hca(hca->hca_hdl); 863 864 } 865 if (ribstat->hca != NULL) 866 return (RDMA_SUCCESS); 867 else 868 return (RDMA_FAILED); 869 } 870 871 /* 872 * Callback routines 873 */ 874 875 /* 876 * SCQ handlers 877 */ 878 /* ARGSUSED */ 879 static void 880 rib_clnt_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 881 { 882 ibt_status_t ibt_status; 883 ibt_wc_t wc; 884 int i; 885 886 /* 887 * Re-enable cq notify here to avoid missing any 888 * completion queue notification. 889 */ 890 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION); 891 892 ibt_status = IBT_SUCCESS; 893 while (ibt_status != IBT_CQ_EMPTY) { 894 bzero(&wc, sizeof (wc)); 895 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL); 896 if (ibt_status != IBT_SUCCESS) 897 return; 898 899 /* 900 * Got a send completion 901 */ 902 if (wc.wc_id != NULL) { /* XXX can it be otherwise ???? 
*/ 903 struct send_wid *wd = (struct send_wid *)(uintptr_t)wc.wc_id; 904 CONN *conn = qptoc(wd->qp); 905 906 mutex_enter(&wd->sendwait_lock); 907 switch (wc.wc_status) { 908 case IBT_WC_SUCCESS: 909 wd->status = RDMA_SUCCESS; 910 break; 911 case IBT_WC_WR_FLUSHED_ERR: 912 wd->status = RDMA_FAILED; 913 break; 914 default: 915 /* 916 * RC Send Q Error Code Local state Remote State 917 * ==================== =========== ============ 918 * IBT_WC_BAD_RESPONSE_ERR ERROR None 919 * IBT_WC_LOCAL_LEN_ERR ERROR None 920 * IBT_WC_LOCAL_CHAN_OP_ERR ERROR None 921 * IBT_WC_LOCAL_PROTECT_ERR ERROR None 922 * IBT_WC_MEM_WIN_BIND_ERR ERROR None 923 * IBT_WC_REMOTE_INVALID_REQ_ERR ERROR ERROR 924 * IBT_WC_REMOTE_ACCESS_ERR ERROR ERROR 925 * IBT_WC_REMOTE_OP_ERR ERROR ERROR 926 * IBT_WC_RNR_NAK_TIMEOUT_ERR ERROR None 927 * IBT_WC_TRANS_TIMEOUT_ERR ERROR None 928 * IBT_WC_WR_FLUSHED_ERR None None 929 */ 930 /* 931 * Channel in error state. Set connection to 932 * ERROR and cleanup will happen either from 933 * conn_release or from rib_conn_get 934 */ 935 wd->status = RDMA_FAILED; 936 mutex_enter(&conn->c_lock); 937 if (conn->c_state != C_DISCONN_PEND) 938 conn->c_state = C_ERROR_CONN; 939 mutex_exit(&conn->c_lock); 940 break; 941 } 942 943 if (wd->cv_sig == 1) { 944 /* 945 * Notify poster 946 */ 947 cv_signal(&wd->wait_cv); 948 mutex_exit(&wd->sendwait_lock); 949 } else { 950 /* 951 * Poster not waiting for notification. 952 * Free the send buffers and send_wid 953 */ 954 for (i = 0; i < wd->nsbufs; i++) { 955 rib_rbuf_free(qptoc(wd->qp), SEND_BUFFER, 956 (void *)(uintptr_t)wd->sbufaddr[i]); 957 } 958 mutex_exit(&wd->sendwait_lock); 959 (void) rib_free_sendwait(wd); 960 } 961 } 962 } 963 } 964 965 /* ARGSUSED */ 966 static void 967 rib_svc_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 968 { 969 ibt_status_t ibt_status; 970 ibt_wc_t wc; 971 int i; 972 973 /* 974 * Re-enable cq notify here to avoid missing any 975 * completion queue notification. 976 */ 977 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION); 978 979 ibt_status = IBT_SUCCESS; 980 while (ibt_status != IBT_CQ_EMPTY) { 981 bzero(&wc, sizeof (wc)); 982 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL); 983 if (ibt_status != IBT_SUCCESS) 984 return; 985 986 /* 987 * Got a send completion 988 */ 989 if (wc.wc_id != NULL) { /* XXX NULL possible ???? */ 990 struct send_wid *wd = 991 (struct send_wid *)(uintptr_t)wc.wc_id; 992 mutex_enter(&wd->sendwait_lock); 993 if (wd->cv_sig == 1) { 994 /* 995 * Update completion status and notify poster 996 */ 997 if (wc.wc_status == IBT_WC_SUCCESS) 998 wd->status = RDMA_SUCCESS; 999 else 1000 wd->status = RDMA_FAILED; 1001 cv_signal(&wd->wait_cv); 1002 mutex_exit(&wd->sendwait_lock); 1003 } else { 1004 /* 1005 * Poster not waiting for notification. 1006 * Free the send buffers and send_wid 1007 */ 1008 for (i = 0; i < wd->nsbufs; i++) { 1009 rib_rbuf_free(qptoc(wd->qp), 1010 SEND_BUFFER, 1011 (void *)(uintptr_t)wd->sbufaddr[i]); 1012 } 1013 mutex_exit(&wd->sendwait_lock); 1014 (void) rib_free_sendwait(wd); 1015 } 1016 } 1017 } 1018 } 1019 1020 /* 1021 * RCQ handler 1022 */ 1023 /* ARGSUSED */ 1024 static void 1025 rib_clnt_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 1026 { 1027 rib_qp_t *qp; 1028 ibt_status_t ibt_status; 1029 ibt_wc_t wc; 1030 struct recv_wid *rwid; 1031 1032 /* 1033 * Re-enable cq notify here to avoid missing any 1034 * completion queue notification. 
1035 */ 1036 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION); 1037 1038 ibt_status = IBT_SUCCESS; 1039 while (ibt_status != IBT_CQ_EMPTY) { 1040 bzero(&wc, sizeof (wc)); 1041 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL); 1042 if (ibt_status != IBT_SUCCESS) 1043 return; 1044 1045 rwid = (struct recv_wid *)(uintptr_t)wc.wc_id; 1046 qp = rwid->qp; 1047 if (wc.wc_status == IBT_WC_SUCCESS) { 1048 XDR inxdrs, *xdrs; 1049 uint_t xid, vers, op, find_xid = 0; 1050 struct reply *r; 1051 CONN *conn = qptoc(qp); 1052 uint32_t rdma_credit = 0; 1053 1054 xdrs = &inxdrs; 1055 xdrmem_create(xdrs, (caddr_t)(uintptr_t)rwid->addr, 1056 wc.wc_bytes_xfer, XDR_DECODE); 1057 /* 1058 * Treat xid as opaque (xid is the first entity 1059 * in the rpc rdma message). 1060 */ 1061 xid = *(uint32_t *)(uintptr_t)rwid->addr; 1062 1063 /* Skip xid and set the xdr position accordingly. */ 1064 XDR_SETPOS(xdrs, sizeof (uint32_t)); 1065 (void) xdr_u_int(xdrs, &vers); 1066 (void) xdr_u_int(xdrs, &rdma_credit); 1067 (void) xdr_u_int(xdrs, &op); 1068 XDR_DESTROY(xdrs); 1069 1070 if (vers != RPCRDMA_VERS) { 1071 /* 1072 * Invalid RPC/RDMA version. Cannot 1073 * interoperate. Set connection to 1074 * ERROR state and bail out. 1075 */ 1076 mutex_enter(&conn->c_lock); 1077 if (conn->c_state != C_DISCONN_PEND) 1078 conn->c_state = C_ERROR_CONN; 1079 mutex_exit(&conn->c_lock); 1080 rib_rbuf_free(conn, RECV_BUFFER, 1081 (void *)(uintptr_t)rwid->addr); 1082 rib_free_wid(rwid); 1083 continue; 1084 } 1085 1086 mutex_enter(&qp->replylist_lock); 1087 for (r = qp->replylist; r != NULL; r = r->next) { 1088 if (r->xid == xid) { 1089 find_xid = 1; 1090 switch (op) { 1091 case RDMA_MSG: 1092 case RDMA_NOMSG: 1093 case RDMA_MSGP: 1094 r->status = RDMA_SUCCESS; 1095 r->vaddr_cq = rwid->addr; 1096 r->bytes_xfer = 1097 wc.wc_bytes_xfer; 1098 cv_signal(&r->wait_cv); 1099 break; 1100 default: 1101 rib_rbuf_free(qptoc(qp), 1102 RECV_BUFFER, 1103 (void *)(uintptr_t) 1104 rwid->addr); 1105 break; 1106 } 1107 break; 1108 } 1109 } 1110 mutex_exit(&qp->replylist_lock); 1111 if (find_xid == 0) { 1112 /* RPC caller not waiting for reply */ 1113 1114 DTRACE_PROBE1(rpcib__i__nomatchxid1, 1115 int, xid); 1116 1117 rib_rbuf_free(qptoc(qp), RECV_BUFFER, 1118 (void *)(uintptr_t)rwid->addr); 1119 } 1120 } else if (wc.wc_status == IBT_WC_WR_FLUSHED_ERR) { 1121 CONN *conn = qptoc(qp); 1122 1123 /* 1124 * Connection being flushed. Just free 1125 * the posted buffer 1126 */ 1127 rib_rbuf_free(conn, RECV_BUFFER, 1128 (void *)(uintptr_t)rwid->addr); 1129 } else { 1130 CONN *conn = qptoc(qp); 1131 /* 1132 * RC Recv Q Error Code Local state Remote State 1133 * ==================== =========== ============ 1134 * IBT_WC_LOCAL_ACCESS_ERR ERROR ERROR when NAK recvd 1135 * IBT_WC_LOCAL_LEN_ERR ERROR ERROR when NAK recvd 1136 * IBT_WC_LOCAL_PROTECT_ERR ERROR ERROR when NAK recvd 1137 * IBT_WC_LOCAL_CHAN_OP_ERR ERROR ERROR when NAK recvd 1138 * IBT_WC_REMOTE_INVALID_REQ_ERR ERROR ERROR when NAK recvd 1139 * IBT_WC_WR_FLUSHED_ERR None None 1140 */ 1141 /* 1142 * Channel in error state. Set connection 1143 * in ERROR state. 
1144 */ 1145 mutex_enter(&conn->c_lock); 1146 if (conn->c_state != C_DISCONN_PEND) 1147 conn->c_state = C_ERROR_CONN; 1148 mutex_exit(&conn->c_lock); 1149 rib_rbuf_free(conn, RECV_BUFFER, 1150 (void *)(uintptr_t)rwid->addr); 1151 } 1152 rib_free_wid(rwid); 1153 } 1154 } 1155 1156 /* Server side */ 1157 /* ARGSUSED */ 1158 static void 1159 rib_svc_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 1160 { 1161 rdma_recv_data_t *rdp; 1162 rib_qp_t *qp; 1163 ibt_status_t ibt_status; 1164 ibt_wc_t wc; 1165 struct svc_recv *s_recvp; 1166 CONN *conn; 1167 mblk_t *mp; 1168 1169 /* 1170 * Re-enable cq notify here to avoid missing any 1171 * completion queue notification. 1172 */ 1173 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION); 1174 1175 ibt_status = IBT_SUCCESS; 1176 while (ibt_status != IBT_CQ_EMPTY) { 1177 bzero(&wc, sizeof (wc)); 1178 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL); 1179 if (ibt_status != IBT_SUCCESS) 1180 return; 1181 1182 s_recvp = (struct svc_recv *)(uintptr_t)wc.wc_id; 1183 qp = s_recvp->qp; 1184 conn = qptoc(qp); 1185 mutex_enter(&qp->posted_rbufs_lock); 1186 qp->n_posted_rbufs--; 1187 #if defined(MEASURE_POOL_DEPTH) 1188 rib_posted_rbufs(preposted_rbufs - qp->n_posted_rbufs); 1189 #endif 1190 if (qp->n_posted_rbufs == 0) 1191 cv_signal(&qp->posted_rbufs_cv); 1192 mutex_exit(&qp->posted_rbufs_lock); 1193 1194 if (wc.wc_status == IBT_WC_SUCCESS) { 1195 XDR inxdrs, *xdrs; 1196 uint_t xid, vers, op; 1197 uint32_t rdma_credit; 1198 1199 xdrs = &inxdrs; 1200 /* s_recvp->vaddr stores data */ 1201 xdrmem_create(xdrs, (caddr_t)(uintptr_t)s_recvp->vaddr, 1202 wc.wc_bytes_xfer, XDR_DECODE); 1203 1204 /* 1205 * Treat xid as opaque (xid is the first entity 1206 * in the rpc rdma message). 1207 */ 1208 xid = *(uint32_t *)(uintptr_t)s_recvp->vaddr; 1209 /* Skip xid and set the xdr position accordingly. */ 1210 XDR_SETPOS(xdrs, sizeof (uint32_t)); 1211 if (!xdr_u_int(xdrs, &vers) || 1212 !xdr_u_int(xdrs, &rdma_credit) || 1213 !xdr_u_int(xdrs, &op)) { 1214 rib_rbuf_free(conn, RECV_BUFFER, 1215 (void *)(uintptr_t)s_recvp->vaddr); 1216 XDR_DESTROY(xdrs); 1217 (void) rib_free_svc_recv(s_recvp); 1218 continue; 1219 } 1220 XDR_DESTROY(xdrs); 1221 1222 if (vers != RPCRDMA_VERS) { 1223 /* 1224 * Invalid RPC/RDMA version. 1225 * Drop rpc rdma message. 1226 */ 1227 rib_rbuf_free(conn, RECV_BUFFER, 1228 (void *)(uintptr_t)s_recvp->vaddr); 1229 (void) rib_free_svc_recv(s_recvp); 1230 continue; 1231 } 1232 /* 1233 * Is this for RDMA_DONE? 1234 */ 1235 if (op == RDMA_DONE) { 1236 rib_rbuf_free(conn, RECV_BUFFER, 1237 (void *)(uintptr_t)s_recvp->vaddr); 1238 /* 1239 * Wake up the thread waiting on 1240 * a RDMA_DONE for xid 1241 */ 1242 mutex_enter(&qp->rdlist_lock); 1243 rdma_done_notify(qp, xid); 1244 mutex_exit(&qp->rdlist_lock); 1245 (void) rib_free_svc_recv(s_recvp); 1246 continue; 1247 } 1248 1249 mutex_enter(&plugin_state_lock); 1250 if (plugin_state == ACCEPT) { 1251 while ((mp = allocb(sizeof (*rdp), BPRI_LO)) 1252 == NULL) 1253 (void) strwaitbuf( 1254 sizeof (*rdp), BPRI_LO); 1255 /* 1256 * Plugin is in accept state, hence the master 1257 * transport queue for this is still accepting 1258 * requests. We can therefore call svc_queuereq to 1259 * queue this received msg.
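 * The mblk is filled in with an rdma_recv_data_t that points at the
 * receive buffer, and conn->c_ref is bumped so the connection stays
 * around while the request sits on the service queue.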
1260 */ 1261 rdp = (rdma_recv_data_t *)mp->b_rptr; 1262 rdp->conn = conn; 1263 rdp->rpcmsg.addr = 1264 (caddr_t)(uintptr_t)s_recvp->vaddr; 1265 rdp->rpcmsg.type = RECV_BUFFER; 1266 rdp->rpcmsg.len = wc.wc_bytes_xfer; 1267 rdp->status = wc.wc_status; 1268 mutex_enter(&conn->c_lock); 1269 conn->c_ref++; 1270 mutex_exit(&conn->c_lock); 1271 mp->b_wptr += sizeof (*rdp); 1272 svc_queuereq((queue_t *)rib_stat->q, mp); 1273 mutex_exit(&plugin_state_lock); 1274 } else { 1275 /* 1276 * The master transport for this is going 1277 * away and the queue is not accepting anymore 1278 * requests for krpc, so don't do anything, just 1279 * free the msg. 1280 */ 1281 mutex_exit(&plugin_state_lock); 1282 rib_rbuf_free(conn, RECV_BUFFER, 1283 (void *)(uintptr_t)s_recvp->vaddr); 1284 } 1285 } else { 1286 rib_rbuf_free(conn, RECV_BUFFER, 1287 (void *)(uintptr_t)s_recvp->vaddr); 1288 } 1289 (void) rib_free_svc_recv(s_recvp); 1290 } 1291 } 1292 1293 /* 1294 * Handles DR event of IBT_HCA_DETACH_EVENT. 1295 */ 1296 /* ARGSUSED */ 1297 static void 1298 rib_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl, 1299 ibt_async_code_t code, ibt_async_event_t *event) 1300 { 1301 1302 switch (code) { 1303 case IBT_HCA_ATTACH_EVENT: 1304 /* ignore */ 1305 break; 1306 case IBT_HCA_DETACH_EVENT: 1307 { 1308 ASSERT(rib_stat->hca->hca_hdl == hca_hdl); 1309 rib_detach_hca(rib_stat->hca); 1310 #ifdef DEBUG 1311 cmn_err(CE_NOTE, "rib_async_handler(): HCA being detached!\n"); 1312 #endif 1313 break; 1314 } 1315 #ifdef DEBUG 1316 case IBT_EVENT_PATH_MIGRATED: 1317 cmn_err(CE_NOTE, "rib_async_handler(): " 1318 "IBT_EVENT_PATH_MIGRATED\n"); 1319 break; 1320 case IBT_EVENT_SQD: 1321 cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_SQD\n"); 1322 break; 1323 case IBT_EVENT_COM_EST: 1324 cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_COM_EST\n"); 1325 break; 1326 case IBT_ERROR_CATASTROPHIC_CHAN: 1327 cmn_err(CE_NOTE, "rib_async_handler(): " 1328 "IBT_ERROR_CATASTROPHIC_CHAN\n"); 1329 break; 1330 case IBT_ERROR_INVALID_REQUEST_CHAN: 1331 cmn_err(CE_NOTE, "rib_async_handler(): " 1332 "IBT_ERROR_INVALID_REQUEST_CHAN\n"); 1333 break; 1334 case IBT_ERROR_ACCESS_VIOLATION_CHAN: 1335 cmn_err(CE_NOTE, "rib_async_handler(): " 1336 "IBT_ERROR_ACCESS_VIOLATION_CHAN\n"); 1337 break; 1338 case IBT_ERROR_PATH_MIGRATE_REQ: 1339 cmn_err(CE_NOTE, "rib_async_handler(): " 1340 "IBT_ERROR_PATH_MIGRATE_REQ\n"); 1341 break; 1342 case IBT_ERROR_CQ: 1343 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_CQ\n"); 1344 break; 1345 case IBT_ERROR_PORT_DOWN: 1346 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_PORT_DOWN\n"); 1347 break; 1348 case IBT_EVENT_PORT_UP: 1349 cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_PORT_UP\n"); 1350 break; 1351 case IBT_ASYNC_OPAQUE1: 1352 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE1\n"); 1353 break; 1354 case IBT_ASYNC_OPAQUE2: 1355 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE2\n"); 1356 break; 1357 case IBT_ASYNC_OPAQUE3: 1358 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE3\n"); 1359 break; 1360 case IBT_ASYNC_OPAQUE4: 1361 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE4\n"); 1362 break; 1363 #endif 1364 default: 1365 break; 1366 } 1367 } 1368 1369 /* 1370 * Client's reachable function. 
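 * It verifies that the HCA is still in HCA_INITED state and then uses
 * rib_ping_srv() to resolve an IB path to the given address.  On
 * success *handle is set to the rib_hca_t so the caller can later
 * establish a connection over it; on a failed ping *handle is cleared
 * and RDMA_FAILED is returned.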
1371 */ 1372 static rdma_stat 1373 rib_reachable(int addr_type, struct netbuf *raddr, void **handle) 1374 { 1375 rdma_stat status; 1376 rpcib_ping_t rpt; 1377 1378 /* 1379 * First check if a hca is still attached 1380 */ 1381 rw_enter(&rib_stat->hca->state_lock, RW_READER); 1382 if (rib_stat->hca->state != HCA_INITED) { 1383 rw_exit(&rib_stat->hca->state_lock); 1384 return (RDMA_FAILED); 1385 } 1386 1387 bzero(&rpt, sizeof (rpcib_ping_t)); 1388 status = rib_ping_srv(addr_type, raddr, &rpt); 1389 rw_exit(&rib_stat->hca->state_lock); 1390 1391 if (status == RDMA_SUCCESS) { 1392 *handle = (void *)rpt.hca; 1393 return (RDMA_SUCCESS); 1394 } else { 1395 *handle = NULL; 1396 DTRACE_PROBE(rpcib__i__pingfailed); 1397 return (RDMA_FAILED); 1398 } 1399 } 1400 1401 /* Client side qp creation */ 1402 static rdma_stat 1403 rib_clnt_create_chan(rib_hca_t *hca, struct netbuf *raddr, rib_qp_t **qp) 1404 { 1405 rib_qp_t *kqp = NULL; 1406 CONN *conn; 1407 rdma_clnt_cred_ctrl_t *cc_info; 1408 1409 ASSERT(qp != NULL); 1410 *qp = NULL; 1411 1412 kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP); 1413 conn = qptoc(kqp); 1414 kqp->hca = hca; 1415 kqp->rdmaconn.c_rdmamod = &rib_mod; 1416 kqp->rdmaconn.c_private = (caddr_t)kqp; 1417 1418 kqp->mode = RIB_CLIENT; 1419 kqp->chan_flags = IBT_BLOCKING; 1420 conn->c_raddr.buf = kmem_alloc(raddr->len, KM_SLEEP); 1421 bcopy(raddr->buf, conn->c_raddr.buf, raddr->len); 1422 conn->c_raddr.len = conn->c_raddr.maxlen = raddr->len; 1423 /* 1424 * Initialize 1425 */ 1426 cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL); 1427 cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL); 1428 mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock); 1429 mutex_init(&kqp->replylist_lock, NULL, MUTEX_DRIVER, hca->iblock); 1430 mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock); 1431 mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock); 1432 cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL); 1433 mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock); 1434 /* 1435 * Initialize the client credit control 1436 * portion of the rdmaconn struct. 
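 * The client starts with no credits: clnt_cc_granted_ops and
 * clnt_cc_in_flight_ops are both zeroed, and senders presumably block
 * on clnt_cc_cv until the server grants buffers (see the server side
 * in rib_svc_create_chan(), which grants preposted_rbufs credits).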
1437 */ 1438 kqp->rdmaconn.c_cc_type = RDMA_CC_CLNT; 1439 cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc; 1440 cc_info->clnt_cc_granted_ops = 0; 1441 cc_info->clnt_cc_in_flight_ops = 0; 1442 cv_init(&cc_info->clnt_cc_cv, NULL, CV_DEFAULT, NULL); 1443 1444 *qp = kqp; 1445 return (RDMA_SUCCESS); 1446 } 1447 1448 /* Server side qp creation */ 1449 static rdma_stat 1450 rib_svc_create_chan(rib_hca_t *hca, caddr_t q, uint8_t port, rib_qp_t **qp) 1451 { 1452 rib_qp_t *kqp = NULL; 1453 ibt_chan_sizes_t chan_sizes; 1454 ibt_rc_chan_alloc_args_t qp_attr; 1455 ibt_status_t ibt_status; 1456 rdma_srv_cred_ctrl_t *cc_info; 1457 1458 *qp = NULL; 1459 1460 kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP); 1461 kqp->hca = hca; 1462 kqp->port_num = port; 1463 kqp->rdmaconn.c_rdmamod = &rib_mod; 1464 kqp->rdmaconn.c_private = (caddr_t)kqp; 1465 1466 /* 1467 * Create the qp handle 1468 */ 1469 bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t)); 1470 qp_attr.rc_scq = hca->svc_scq->rib_cq_hdl; 1471 qp_attr.rc_rcq = hca->svc_rcq->rib_cq_hdl; 1472 qp_attr.rc_pd = hca->pd_hdl; 1473 qp_attr.rc_hca_port_num = port; 1474 qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX; 1475 qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX; 1476 qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE; 1477 qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE; 1478 qp_attr.rc_clone_chan = NULL; 1479 qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR; 1480 qp_attr.rc_flags = IBT_WR_SIGNALED; 1481 1482 rw_enter(&hca->state_lock, RW_READER); 1483 if (hca->state != HCA_DETACHED) { 1484 ibt_status = ibt_alloc_rc_channel(hca->hca_hdl, 1485 IBT_ACHAN_NO_FLAGS, &qp_attr, &kqp->qp_hdl, 1486 &chan_sizes); 1487 } else { 1488 rw_exit(&hca->state_lock); 1489 goto fail; 1490 } 1491 rw_exit(&hca->state_lock); 1492 1493 if (ibt_status != IBT_SUCCESS) { 1494 DTRACE_PROBE1(rpcib__i_svccreatechanfail, 1495 int, ibt_status); 1496 goto fail; 1497 } 1498 1499 kqp->mode = RIB_SERVER; 1500 kqp->chan_flags = IBT_BLOCKING; 1501 kqp->q = q; /* server ONLY */ 1502 1503 cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL); 1504 cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL); 1505 mutex_init(&kqp->replylist_lock, NULL, MUTEX_DEFAULT, hca->iblock); 1506 mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock); 1507 mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock); 1508 mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock); 1509 cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL); 1510 mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock); 1511 /* 1512 * Set the private data area to qp to be used in callbacks 1513 */ 1514 ibt_set_chan_private(kqp->qp_hdl, (void *)kqp); 1515 kqp->rdmaconn.c_state = C_CONNECTED; 1516 1517 /* 1518 * Initialize the server credit control 1519 * portion of the rdmaconn struct. 
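 * The server grants the client one credit per preposted receive
 * buffer, so srv_cc_buffers_granted and srv_cc_posted both start at
 * preposted_rbufs (RDMA_BUFS_GRANT) with no buffers in use yet.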
1520 */ 1521 kqp->rdmaconn.c_cc_type = RDMA_CC_SRV; 1522 cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_srv_cc; 1523 cc_info->srv_cc_buffers_granted = preposted_rbufs; 1524 cc_info->srv_cc_cur_buffers_used = 0; 1525 cc_info->srv_cc_posted = preposted_rbufs; 1526 1527 *qp = kqp; 1528 1529 return (RDMA_SUCCESS); 1530 fail: 1531 if (kqp) 1532 kmem_free(kqp, sizeof (rib_qp_t)); 1533 1534 return (RDMA_FAILED); 1535 } 1536 1537 /* ARGSUSED */ 1538 ibt_cm_status_t 1539 rib_clnt_cm_handler(void *clnt_hdl, ibt_cm_event_t *event, 1540 ibt_cm_return_args_t *ret_args, void *priv_data, 1541 ibt_priv_data_len_t len) 1542 { 1543 rpcib_state_t *ribstat; 1544 rib_hca_t *hca; 1545 1546 ribstat = (rpcib_state_t *)clnt_hdl; 1547 hca = (rib_hca_t *)ribstat->hca; 1548 1549 switch (event->cm_type) { 1550 1551 /* got a connection close event */ 1552 case IBT_CM_EVENT_CONN_CLOSED: 1553 { 1554 CONN *conn; 1555 rib_qp_t *qp; 1556 1557 /* check reason why connection was closed */ 1558 switch (event->cm_event.closed) { 1559 case IBT_CM_CLOSED_DREP_RCVD: 1560 case IBT_CM_CLOSED_DREQ_TIMEOUT: 1561 case IBT_CM_CLOSED_DUP: 1562 case IBT_CM_CLOSED_ABORT: 1563 case IBT_CM_CLOSED_ALREADY: 1564 /* 1565 * These cases indicate the local end initiated 1566 * the closing of the channel. Nothing to do here. 1567 */ 1568 break; 1569 default: 1570 /* 1571 * Reason for CONN_CLOSED event must be one of 1572 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD 1573 * or IBT_CM_CLOSED_STALE. These indicate cases where 1574 * the remote end is closing the channel. In these 1575 * cases free the channel and transition to error 1576 * state. 1577 */ 1578 qp = ibt_get_chan_private(event->cm_channel); 1579 conn = qptoc(qp); 1580 mutex_enter(&conn->c_lock); 1581 if (conn->c_state == C_DISCONN_PEND) { 1582 mutex_exit(&conn->c_lock); 1583 break; 1584 } 1585 1586 conn->c_state = C_ERROR_CONN; 1587 1588 /* 1589 * Free the rc_channel. Channel has already 1590 * transitioned to ERROR state and WRs have been 1591 * FLUSHED_ERR already. 1592 */ 1593 (void) ibt_free_channel(qp->qp_hdl); 1594 qp->qp_hdl = NULL; 1595 1596 /* 1597 * Free the conn if c_ref is down to 0 already 1598 */ 1599 if (conn->c_ref == 0) { 1600 /* 1601 * Remove from list and free conn 1602 */ 1603 conn->c_state = C_DISCONN_PEND; 1604 mutex_exit(&conn->c_lock); 1605 (void) rib_disconnect_channel(conn, 1606 &hca->cl_conn_list); 1607 } else { 1608 mutex_exit(&conn->c_lock); 1609 } 1610 #ifdef DEBUG 1611 if (rib_debug) 1612 cmn_err(CE_NOTE, "rib_clnt_cm_handler: " 1613 "(CONN_CLOSED) channel disconnected"); 1614 #endif 1615 break; 1616 } 1617 break; 1618 } 1619 default: 1620 break; 1621 } 1622 return (IBT_CM_ACCEPT); 1623 } 1624 1625 /* 1626 * Connect to the server.
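 * rib_conn_to_srv() builds the RDMA/IP CM private data with
 * ibt_format_ip_private_data(), allocates an RC channel on the
 * client send/recv CQs, and then opens it with ibt_open_rc_channel()
 * against the service id derived from NFS_RDMA_PORT.  If the CM
 * reports IBT_CM_CONN_STALE (typically stale state left on the peer
 * from before a reboot), the open is retried up to REFRESH_ATTEMPTS
 * times.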
1627 */ 1628 rdma_stat 1629 rib_conn_to_srv(rib_hca_t *hca, rib_qp_t *qp, rpcib_ping_t *rptp) 1630 { 1631 ibt_chan_open_args_t chan_args; /* channel args */ 1632 ibt_chan_sizes_t chan_sizes; 1633 ibt_rc_chan_alloc_args_t qp_attr; 1634 ibt_status_t ibt_status; 1635 ibt_rc_returns_t ret_args; /* conn reject info */ 1636 int refresh = REFRESH_ATTEMPTS; /* refresh if IBT_CM_CONN_STALE */ 1637 ibt_ip_cm_info_t ipcm_info; 1638 uint8_t cmp_ip_pvt[IBT_IP_HDR_PRIV_DATA_SZ]; 1639 1640 1641 (void) bzero(&chan_args, sizeof (chan_args)); 1642 (void) bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t)); 1643 (void) bzero(&ipcm_info, sizeof (ibt_ip_cm_info_t)); 1644 1645 ipcm_info.src_addr.family = rptp->srcip.family; 1646 switch (ipcm_info.src_addr.family) { 1647 case AF_INET: 1648 ipcm_info.src_addr.un.ip4addr = rptp->srcip.un.ip4addr; 1649 break; 1650 case AF_INET6: 1651 ipcm_info.src_addr.un.ip6addr = rptp->srcip.un.ip6addr; 1652 break; 1653 } 1654 1655 ipcm_info.dst_addr.family = rptp->srcip.family; 1656 switch (ipcm_info.dst_addr.family) { 1657 case AF_INET: 1658 ipcm_info.dst_addr.un.ip4addr = rptp->dstip.un.ip4addr; 1659 break; 1660 case AF_INET6: 1661 ipcm_info.dst_addr.un.ip6addr = rptp->dstip.un.ip6addr; 1662 break; 1663 } 1664 1665 ipcm_info.src_port = NFS_RDMA_PORT; 1666 1667 ibt_status = ibt_format_ip_private_data(&ipcm_info, 1668 IBT_IP_HDR_PRIV_DATA_SZ, cmp_ip_pvt); 1669 1670 if (ibt_status != IBT_SUCCESS) { 1671 cmn_err(CE_WARN, "ibt_format_ip_private_data failed\n"); 1672 return (-1); 1673 } 1674 1675 qp_attr.rc_hca_port_num = rptp->path.pi_prim_cep_path.cep_hca_port_num; 1676 /* Alloc a RC channel */ 1677 qp_attr.rc_scq = hca->clnt_scq->rib_cq_hdl; 1678 qp_attr.rc_rcq = hca->clnt_rcq->rib_cq_hdl; 1679 qp_attr.rc_pd = hca->pd_hdl; 1680 qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX; 1681 qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX; 1682 qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE; 1683 qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE; 1684 qp_attr.rc_clone_chan = NULL; 1685 qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR; 1686 qp_attr.rc_flags = IBT_WR_SIGNALED; 1687 1688 rptp->path.pi_sid = ibt_get_ip_sid(IPPROTO_TCP, NFS_RDMA_PORT); 1689 chan_args.oc_path = &rptp->path; 1690 chan_args.oc_cm_handler = rib_clnt_cm_handler; 1691 chan_args.oc_cm_clnt_private = (void *)rib_stat; 1692 chan_args.oc_rdma_ra_out = 4; 1693 chan_args.oc_rdma_ra_in = 4; 1694 chan_args.oc_path_retry_cnt = 2; 1695 chan_args.oc_path_rnr_retry_cnt = RNR_RETRIES; 1696 chan_args.oc_priv_data = cmp_ip_pvt; 1697 chan_args.oc_priv_data_len = IBT_IP_HDR_PRIV_DATA_SZ; 1698 1699 refresh: 1700 rw_enter(&hca->state_lock, RW_READER); 1701 if (hca->state != HCA_DETACHED) { 1702 ibt_status = ibt_alloc_rc_channel(hca->hca_hdl, 1703 IBT_ACHAN_NO_FLAGS, 1704 &qp_attr, &qp->qp_hdl, 1705 &chan_sizes); 1706 } else { 1707 rw_exit(&hca->state_lock); 1708 return (RDMA_FAILED); 1709 } 1710 rw_exit(&hca->state_lock); 1711 1712 if (ibt_status != IBT_SUCCESS) { 1713 DTRACE_PROBE1(rpcib__i_conntosrv, 1714 int, ibt_status); 1715 return (RDMA_FAILED); 1716 } 1717 1718 /* Connect to the Server */ 1719 (void) bzero(&ret_args, sizeof (ret_args)); 1720 mutex_enter(&qp->cb_lock); 1721 ibt_status = ibt_open_rc_channel(qp->qp_hdl, IBT_OCHAN_NO_FLAGS, 1722 IBT_BLOCKING, &chan_args, &ret_args); 1723 if (ibt_status != IBT_SUCCESS) { 1724 DTRACE_PROBE2(rpcib__i_openrctosrv, 1725 int, ibt_status, int, ret_args.rc_status); 1726 1727 (void) ibt_free_channel(qp->qp_hdl); 1728 qp->qp_hdl = NULL; 1729 mutex_exit(&qp->cb_lock); 1730 if (refresh-- && ibt_status == IBT_CM_FAILURE && 1731 
ret_args.rc_status == IBT_CM_CONN_STALE) { 1732 /* 1733 * Got IBT_CM_CONN_STALE probably because of stale 1734 * data on the passive end of a channel that existed 1735 * prior to reboot. Retry establishing a channel 1736 * REFRESH_ATTEMPTS times, during which time the 1737 * stale conditions on the server might clear up. 1738 */ 1739 goto refresh; 1740 } 1741 return (RDMA_FAILED); 1742 } 1743 mutex_exit(&qp->cb_lock); 1744 /* 1745 * Set the private data area to qp to be used in callbacks 1746 */ 1747 ibt_set_chan_private(qp->qp_hdl, (void *)qp); 1748 return (RDMA_SUCCESS); 1749 } 1750 1751 rdma_stat 1752 rib_ping_srv(int addr_type, struct netbuf *raddr, rpcib_ping_t *rptp) 1753 { 1754 uint_t i; 1755 ibt_status_t ibt_status; 1756 uint8_t num_paths_p; 1757 ibt_ip_path_attr_t ipattr; 1758 ibt_path_ip_src_t srcip; 1759 rpcib_ipaddrs_t addrs4; 1760 rpcib_ipaddrs_t addrs6; 1761 struct sockaddr_in *sinp; 1762 struct sockaddr_in6 *sin6p; 1763 rdma_stat retval = RDMA_SUCCESS; 1764 1765 ASSERT(raddr->buf != NULL); 1766 1767 bzero(&ipattr, sizeof (ibt_ip_path_attr_t)); 1768 1769 if (!rpcib_get_ib_addresses(&addrs4, &addrs6) || 1770 (addrs4.ri_count == 0 && addrs6.ri_count == 0)) { 1771 retval = RDMA_FAILED; 1772 goto done; 1773 } 1774 1775 /* Prep the destination address */ 1776 switch (addr_type) { 1777 case AF_INET: 1778 sinp = (struct sockaddr_in *)raddr->buf; 1779 rptp->dstip.family = AF_INET; 1780 rptp->dstip.un.ip4addr = sinp->sin_addr.s_addr; 1781 sinp = addrs4.ri_list; 1782 1783 ipattr.ipa_dst_ip = &rptp->dstip; 1784 ipattr.ipa_hca_guid = rib_stat->hca->hca_guid; 1785 ipattr.ipa_ndst = 1; 1786 ipattr.ipa_max_paths = 1; 1787 ipattr.ipa_src_ip.family = rptp->dstip.family; 1788 for (i = 0; i < addrs4.ri_count; i++) { 1789 num_paths_p = 0; 1790 ipattr.ipa_src_ip.un.ip4addr = sinp[i].sin_addr.s_addr; 1791 bzero(&srcip, sizeof (ibt_path_ip_src_t)); 1792 1793 ibt_status = ibt_get_ip_paths(rib_stat->ibt_clnt_hdl, 1794 IBT_PATH_NO_FLAGS, &ipattr, &rptp->path, 1795 &num_paths_p, &srcip); 1796 if (ibt_status == IBT_SUCCESS && 1797 num_paths_p != 0 && 1798 rptp->path.pi_hca_guid == rib_stat->hca->hca_guid) { 1799 rptp->hca = rib_stat->hca; 1800 rptp->srcip.family = AF_INET; 1801 rptp->srcip.un.ip4addr = 1802 srcip.ip_primary.un.ip4addr; 1803 goto done; 1804 } 1805 } 1806 retval = RDMA_FAILED; 1807 break; 1808 1809 case AF_INET6: 1810 sin6p = (struct sockaddr_in6 *)raddr->buf; 1811 rptp->dstip.family = AF_INET6; 1812 rptp->dstip.un.ip6addr = sin6p->sin6_addr; 1813 sin6p = addrs6.ri_list; 1814 1815 ipattr.ipa_dst_ip = &rptp->dstip; 1816 ipattr.ipa_hca_guid = rib_stat->hca->hca_guid; 1817 ipattr.ipa_ndst = 1; 1818 ipattr.ipa_max_paths = 1; 1819 ipattr.ipa_src_ip.family = rptp->dstip.family; 1820 for (i = 0; i < addrs6.ri_count; i++) { 1821 num_paths_p = 0; 1822 ipattr.ipa_src_ip.un.ip6addr = sin6p[i].sin6_addr; 1823 bzero(&srcip, sizeof (ibt_path_ip_src_t)); 1824 1825 ibt_status = ibt_get_ip_paths(rib_stat->ibt_clnt_hdl, 1826 IBT_PATH_NO_FLAGS, &ipattr, &rptp->path, 1827 &num_paths_p, &srcip); 1828 if (ibt_status == IBT_SUCCESS && 1829 num_paths_p != 0 && 1830 rptp->path.pi_hca_guid == rib_stat->hca->hca_guid) { 1831 rptp->hca = rib_stat->hca; 1832 rptp->srcip.family = AF_INET6; 1833 rptp->srcip.un.ip6addr = 1834 srcip.ip_primary.un.ip6addr; 1835 goto done; 1836 } 1837 } 1838 retval = RDMA_FAILED; 1839 break; 1840 1841 default: 1842 retval = RDMA_INVAL; 1843 break; 1844 } 1845 done: 1846 1847 if (addrs4.ri_size > 0) 1848 kmem_free(addrs4.ri_list, addrs4.ri_size); 1849 if (addrs6.ri_size > 0) 1850 
kmem_free(addrs6.ri_list, addrs6.ri_size); 1851 return (retval); 1852 } 1853 1854 /* 1855 * Close channel, remove from connection list and 1856 * free up resources allocated for that channel. 1857 */ 1858 rdma_stat 1859 rib_disconnect_channel(CONN *conn, rib_conn_list_t *conn_list) 1860 { 1861 rib_qp_t *qp = ctoqp(conn); 1862 rib_hca_t *hca; 1863 1864 /* 1865 * c_ref == 0 and connection is in C_DISCONN_PEND 1866 */ 1867 hca = qp->hca; 1868 if (conn_list != NULL) 1869 (void) rib_rm_conn(conn, conn_list); 1870 1871 if (qp->qp_hdl != NULL) { 1872 /* 1873 * If the channel has not been established, 1874 * ibt_flush_channel is called to flush outstanding WRs 1875 * on the Qs. Otherwise, ibt_close_rc_channel() is 1876 * called. The channel is then freed. 1877 */ 1878 if (conn_list != NULL) 1879 (void) ibt_close_rc_channel(qp->qp_hdl, 1880 IBT_BLOCKING, NULL, 0, NULL, NULL, 0); 1881 else 1882 (void) ibt_flush_channel(qp->qp_hdl); 1883 1884 mutex_enter(&qp->posted_rbufs_lock); 1885 while (qp->n_posted_rbufs) 1886 cv_wait(&qp->posted_rbufs_cv, &qp->posted_rbufs_lock); 1887 mutex_exit(&qp->posted_rbufs_lock); 1888 (void) ibt_free_channel(qp->qp_hdl); 1889 qp->qp_hdl = NULL; 1890 } 1891 1892 ASSERT(qp->rdlist == NULL); 1893 1894 if (qp->replylist != NULL) { 1895 (void) rib_rem_replylist(qp); 1896 } 1897 1898 cv_destroy(&qp->cb_conn_cv); 1899 cv_destroy(&qp->posted_rbufs_cv); 1900 mutex_destroy(&qp->cb_lock); 1901 1902 mutex_destroy(&qp->replylist_lock); 1903 mutex_destroy(&qp->posted_rbufs_lock); 1904 mutex_destroy(&qp->rdlist_lock); 1905 1906 cv_destroy(&conn->c_cv); 1907 mutex_destroy(&conn->c_lock); 1908 1909 if (conn->c_raddr.buf != NULL) { 1910 kmem_free(conn->c_raddr.buf, conn->c_raddr.len); 1911 } 1912 if (conn->c_laddr.buf != NULL) { 1913 kmem_free(conn->c_laddr.buf, conn->c_laddr.len); 1914 } 1915 1916 /* 1917 * Credit control cleanup. 1918 */ 1919 if (qp->rdmaconn.c_cc_type == RDMA_CC_CLNT) { 1920 rdma_clnt_cred_ctrl_t *cc_info; 1921 cc_info = &qp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc; 1922 cv_destroy(&cc_info->clnt_cc_cv); 1923 } 1924 1925 kmem_free(qp, sizeof (rib_qp_t)); 1926 1927 /* 1928 * If HCA has been DETACHED and the srv/clnt_conn_list is NULL, 1929 * then the hca is no longer being used. 1930 */ 1931 if (conn_list != NULL) { 1932 rw_enter(&hca->state_lock, RW_READER); 1933 if (hca->state == HCA_DETACHED) { 1934 rw_enter(&hca->srv_conn_list.conn_lock, RW_READER); 1935 if (hca->srv_conn_list.conn_hd == NULL) { 1936 rw_enter(&hca->cl_conn_list.conn_lock, 1937 RW_READER); 1938 1939 if (hca->cl_conn_list.conn_hd == NULL) { 1940 mutex_enter(&hca->inuse_lock); 1941 hca->inuse = FALSE; 1942 cv_signal(&hca->cb_cv); 1943 mutex_exit(&hca->inuse_lock); 1944 } 1945 rw_exit(&hca->cl_conn_list.conn_lock); 1946 } 1947 rw_exit(&hca->srv_conn_list.conn_lock); 1948 } 1949 rw_exit(&hca->state_lock); 1950 } 1951 1952 return (RDMA_SUCCESS); 1953 } 1954 1955 /* 1956 * Wait for send completion notification. Only on receiving a 1957 * notification, be it a successful or an error completion, is the 1958 * send_wid freed.
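 * The wait is bounded by SEND_WAIT_TIME seconds and returns
 * RDMA_TIMEDOUT on expiry.  The server side waits with cv_timedwait()
 * and cannot be interrupted; the client side uses cv_timedwait_sig()
 * and returns RDMA_INTR if a signal is taken.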
1959 */ 1960 static rdma_stat 1961 rib_sendwait(rib_qp_t *qp, struct send_wid *wd) 1962 { 1963 clock_t timout, cv_wait_ret; 1964 rdma_stat error = RDMA_SUCCESS; 1965 int i; 1966 1967 /* 1968 * Wait for send to complete 1969 */ 1970 ASSERT(wd != NULL); 1971 mutex_enter(&wd->sendwait_lock); 1972 if (wd->status == (uint_t)SEND_WAIT) { 1973 timout = drv_usectohz(SEND_WAIT_TIME * 1000000) + 1974 ddi_get_lbolt(); 1975 1976 if (qp->mode == RIB_SERVER) { 1977 while ((cv_wait_ret = cv_timedwait(&wd->wait_cv, 1978 &wd->sendwait_lock, timout)) > 0 && 1979 wd->status == (uint_t)SEND_WAIT) 1980 ; 1981 switch (cv_wait_ret) { 1982 case -1: /* timeout */ 1983 DTRACE_PROBE(rpcib__i__srvsendwait__timeout); 1984 1985 wd->cv_sig = 0; /* no signal needed */ 1986 error = RDMA_TIMEDOUT; 1987 break; 1988 default: /* got send completion */ 1989 break; 1990 } 1991 } else { 1992 while ((cv_wait_ret = cv_timedwait_sig(&wd->wait_cv, 1993 &wd->sendwait_lock, timout)) > 0 && 1994 wd->status == (uint_t)SEND_WAIT) 1995 ; 1996 switch (cv_wait_ret) { 1997 case -1: /* timeout */ 1998 DTRACE_PROBE(rpcib__i__clntsendwait__timeout); 1999 2000 wd->cv_sig = 0; /* no signal needed */ 2001 error = RDMA_TIMEDOUT; 2002 break; 2003 case 0: /* interrupted */ 2004 DTRACE_PROBE(rpcib__i__clntsendwait__intr); 2005 2006 wd->cv_sig = 0; /* no signal needed */ 2007 error = RDMA_INTR; 2008 break; 2009 default: /* got send completion */ 2010 break; 2011 } 2012 } 2013 } 2014 2015 if (wd->status != (uint_t)SEND_WAIT) { 2016 /* got send completion */ 2017 if (wd->status != RDMA_SUCCESS) { 2018 error = wd->status; 2019 if (wd->status != RDMA_CONNLOST) 2020 error = RDMA_FAILED; 2021 } 2022 for (i = 0; i < wd->nsbufs; i++) { 2023 rib_rbuf_free(qptoc(qp), SEND_BUFFER, 2024 (void *)(uintptr_t)wd->sbufaddr[i]); 2025 } 2026 mutex_exit(&wd->sendwait_lock); 2027 (void) rib_free_sendwait(wd); 2028 } else { 2029 mutex_exit(&wd->sendwait_lock); 2030 } 2031 return (error); 2032 } 2033 2034 static struct send_wid * 2035 rib_init_sendwait(uint32_t xid, int cv_sig, rib_qp_t *qp) 2036 { 2037 struct send_wid *wd; 2038 2039 wd = kmem_zalloc(sizeof (struct send_wid), KM_SLEEP); 2040 wd->xid = xid; 2041 wd->cv_sig = cv_sig; 2042 wd->qp = qp; 2043 cv_init(&wd->wait_cv, NULL, CV_DEFAULT, NULL); 2044 mutex_init(&wd->sendwait_lock, NULL, MUTEX_DRIVER, NULL); 2045 wd->status = (uint_t)SEND_WAIT; 2046 2047 return (wd); 2048 } 2049 2050 static int 2051 rib_free_sendwait(struct send_wid *wdesc) 2052 { 2053 cv_destroy(&wdesc->wait_cv); 2054 mutex_destroy(&wdesc->sendwait_lock); 2055 kmem_free(wdesc, sizeof (*wdesc)); 2056 2057 return (0); 2058 } 2059 2060 static rdma_stat 2061 rib_rem_rep(rib_qp_t *qp, struct reply *rep) 2062 { 2063 mutex_enter(&qp->replylist_lock); 2064 if (rep != NULL) { 2065 (void) rib_remreply(qp, rep); 2066 mutex_exit(&qp->replylist_lock); 2067 return (RDMA_SUCCESS); 2068 } 2069 mutex_exit(&qp->replylist_lock); 2070 return (RDMA_FAILED); 2071 } 2072 2073 /* 2074 * Send buffers are freed here only in case of error in posting 2075 * on QP. If the post succeeded, the send buffers are freed upon 2076 * send completion in rib_sendwait() or in the scq_handler. 
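 *
 * A minimal caller sketch (illustrative only, not a verbatim excerpt
 * from an actual consumer of this interface):
 *
 *	caddr_t wd;
 *	rdma_stat ret;
 *
 *	ret = rib_send_and_wait(conn, cl, xid, 1, 1, &wd);
 *	if (ret != RDMA_SUCCESS) {
 *		... the post failed or the send did not complete;
 *		... the send buffers are reclaimed by this layer, so
 *		... the caller must not free them again.
 *	}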
2077 */ 2078 rdma_stat 2079 rib_send_and_wait(CONN *conn, struct clist *cl, uint32_t msgid, 2080 int send_sig, int cv_sig, caddr_t *swid) 2081 { 2082 struct send_wid *wdesc; 2083 struct clist *clp; 2084 ibt_status_t ibt_status = IBT_SUCCESS; 2085 rdma_stat ret = RDMA_SUCCESS; 2086 ibt_send_wr_t tx_wr; 2087 int i, nds; 2088 ibt_wr_ds_t sgl[DSEG_MAX]; 2089 uint_t total_msg_size; 2090 rib_qp_t *qp; 2091 2092 qp = ctoqp(conn); 2093 2094 ASSERT(cl != NULL); 2095 2096 bzero(&tx_wr, sizeof (ibt_send_wr_t)); 2097 2098 nds = 0; 2099 total_msg_size = 0; 2100 clp = cl; 2101 while (clp != NULL) { 2102 if (nds >= DSEG_MAX) { 2103 DTRACE_PROBE(rpcib__i__sendandwait_dsegmax_exceeded); 2104 return (RDMA_FAILED); 2105 } 2106 sgl[nds].ds_va = clp->w.c_saddr; 2107 sgl[nds].ds_key = clp->c_smemhandle.mrc_lmr; /* lkey */ 2108 sgl[nds].ds_len = clp->c_len; 2109 total_msg_size += clp->c_len; 2110 clp = clp->c_next; 2111 nds++; 2112 } 2113 2114 if (send_sig) { 2115 /* Set SEND_SIGNAL flag. */ 2116 tx_wr.wr_flags = IBT_WR_SEND_SIGNAL; 2117 wdesc = rib_init_sendwait(msgid, cv_sig, qp); 2118 *swid = (caddr_t)wdesc; 2119 } else { 2120 tx_wr.wr_flags = IBT_WR_NO_FLAGS; 2121 wdesc = rib_init_sendwait(msgid, 0, qp); 2122 *swid = (caddr_t)wdesc; 2123 } 2124 wdesc->nsbufs = nds; 2125 for (i = 0; i < nds; i++) { 2126 wdesc->sbufaddr[i] = sgl[i].ds_va; 2127 } 2128 2129 tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc; 2130 tx_wr.wr_opcode = IBT_WRC_SEND; 2131 tx_wr.wr_trans = IBT_RC_SRV; 2132 tx_wr.wr_nds = nds; 2133 tx_wr.wr_sgl = sgl; 2134 2135 mutex_enter(&conn->c_lock); 2136 if (conn->c_state == C_CONNECTED) { 2137 ibt_status = ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL); 2138 } 2139 if (conn->c_state != C_CONNECTED || 2140 ibt_status != IBT_SUCCESS) { 2141 if (conn->c_state != C_DISCONN_PEND) 2142 conn->c_state = C_ERROR_CONN; 2143 mutex_exit(&conn->c_lock); 2144 for (i = 0; i < nds; i++) { 2145 rib_rbuf_free(conn, SEND_BUFFER, 2146 (void *)(uintptr_t)wdesc->sbufaddr[i]); 2147 } 2148 2149 (void) rib_free_sendwait(wdesc); 2150 2151 return (RDMA_CONNLOST); 2152 } 2153 mutex_exit(&conn->c_lock); 2154 2155 if (send_sig) { 2156 if (cv_sig) { 2157 /* 2158 * cv_wait for send to complete. 2159 * We can fail due to a timeout or signal or 2160 * unsuccessful send. 2161 */ 2162 ret = rib_sendwait(qp, wdesc); 2163 2164 return (ret); 2165 } 2166 } 2167 2168 return (RDMA_SUCCESS); 2169 } 2170 2171 2172 rdma_stat 2173 rib_send(CONN *conn, struct clist *cl, uint32_t msgid) 2174 { 2175 rdma_stat ret; 2176 caddr_t wd; 2177 2178 /* send-wait & cv_signal */ 2179 ret = rib_send_and_wait(conn, cl, msgid, 1, 1, &wd); 2180 return (ret); 2181 } 2182 2183 /* 2184 * Server interface (svc_rdma_ksend). 2185 * Send RPC reply and wait for RDMA_DONE. 
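 *
 * An rdma_done entry keyed by msgid is queued before the reply is
 * sent (send_sig set, no cv_sig), and this routine then waits up to
 * REPLY_WAIT_TIME seconds for rdma_done_notify() to report the
 * client's RDMA_DONE; otherwise RDMA_TIMEDOUT is returned.
 *
 * Illustrative use (a sketch, not a verbatim caller):
 *
 *	if (rib_send_resp(conn, cl, xid) == RDMA_TIMEDOUT)
 *		... the client never signalled RDMA_DONE ...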
2186 */ 2187 rdma_stat 2188 rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid) 2189 { 2190 rdma_stat ret = RDMA_SUCCESS; 2191 struct rdma_done_list *rd; 2192 clock_t timout, cv_wait_ret; 2193 caddr_t *wid = NULL; 2194 rib_qp_t *qp = ctoqp(conn); 2195 2196 mutex_enter(&qp->rdlist_lock); 2197 rd = rdma_done_add(qp, msgid); 2198 2199 /* No cv_signal (whether send-wait or no-send-wait) */ 2200 ret = rib_send_and_wait(conn, cl, msgid, 1, 0, wid); 2201 2202 if (ret != RDMA_SUCCESS) { 2203 rdma_done_rm(qp, rd); 2204 } else { 2205 /* 2206 * Wait for RDMA_DONE from remote end 2207 */ 2208 timout = 2209 drv_usectohz(REPLY_WAIT_TIME * 1000000) + ddi_get_lbolt(); 2210 cv_wait_ret = cv_timedwait(&rd->rdma_done_cv, 2211 &qp->rdlist_lock, 2212 timout); 2213 2214 rdma_done_rm(qp, rd); 2215 2216 if (cv_wait_ret < 0) { 2217 ret = RDMA_TIMEDOUT; 2218 } 2219 } 2220 2221 mutex_exit(&qp->rdlist_lock); 2222 return (ret); 2223 } 2224 2225 static struct recv_wid * 2226 rib_create_wid(rib_qp_t *qp, ibt_wr_ds_t *sgl, uint32_t msgid) 2227 { 2228 struct recv_wid *rwid; 2229 2230 rwid = kmem_zalloc(sizeof (struct recv_wid), KM_SLEEP); 2231 rwid->xid = msgid; 2232 rwid->addr = sgl->ds_va; 2233 rwid->qp = qp; 2234 2235 return (rwid); 2236 } 2237 2238 static void 2239 rib_free_wid(struct recv_wid *rwid) 2240 { 2241 kmem_free(rwid, sizeof (struct recv_wid)); 2242 } 2243 2244 rdma_stat 2245 rib_clnt_post(CONN* conn, struct clist *cl, uint32_t msgid) 2246 { 2247 rib_qp_t *qp = ctoqp(conn); 2248 struct clist *clp = cl; 2249 struct reply *rep; 2250 struct recv_wid *rwid; 2251 int nds; 2252 ibt_wr_ds_t sgl[DSEG_MAX]; 2253 ibt_recv_wr_t recv_wr; 2254 rdma_stat ret; 2255 ibt_status_t ibt_status; 2256 2257 /* 2258 * rdma_clnt_postrecv uses RECV_BUFFER. 2259 */ 2260 2261 nds = 0; 2262 while (cl != NULL) { 2263 if (nds >= DSEG_MAX) { 2264 ret = RDMA_FAILED; 2265 goto done; 2266 } 2267 sgl[nds].ds_va = cl->w.c_saddr; 2268 sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */ 2269 sgl[nds].ds_len = cl->c_len; 2270 cl = cl->c_next; 2271 nds++; 2272 } 2273 2274 if (nds != 1) { 2275 ret = RDMA_FAILED; 2276 goto done; 2277 } 2278 2279 bzero(&recv_wr, sizeof (ibt_recv_wr_t)); 2280 recv_wr.wr_nds = nds; 2281 recv_wr.wr_sgl = sgl; 2282 2283 rwid = rib_create_wid(qp, &sgl[0], msgid); 2284 if (rwid) { 2285 recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)rwid; 2286 } else { 2287 ret = RDMA_NORESOURCE; 2288 goto done; 2289 } 2290 rep = rib_addreplylist(qp, msgid); 2291 if (!rep) { 2292 rib_free_wid(rwid); 2293 ret = RDMA_NORESOURCE; 2294 goto done; 2295 } 2296 2297 mutex_enter(&conn->c_lock); 2298 2299 if (conn->c_state == C_CONNECTED) { 2300 ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL); 2301 } 2302 2303 if (conn->c_state != C_CONNECTED || 2304 ibt_status != IBT_SUCCESS) { 2305 if (conn->c_state != C_DISCONN_PEND) 2306 conn->c_state = C_ERROR_CONN; 2307 mutex_exit(&conn->c_lock); 2308 rib_free_wid(rwid); 2309 (void) rib_rem_rep(qp, rep); 2310 ret = RDMA_CONNLOST; 2311 goto done; 2312 } 2313 mutex_exit(&conn->c_lock); 2314 return (RDMA_SUCCESS); 2315 2316 done: 2317 while (clp != NULL) { 2318 rib_rbuf_free(conn, RECV_BUFFER, 2319 (void *)(uintptr_t)clp->w.c_saddr3); 2320 clp = clp->c_next; 2321 } 2322 return (ret); 2323 } 2324 2325 rdma_stat 2326 rib_svc_post(CONN* conn, struct clist *cl) 2327 { 2328 rib_qp_t *qp = ctoqp(conn); 2329 struct svc_recv *s_recvp; 2330 int nds; 2331 ibt_wr_ds_t sgl[DSEG_MAX]; 2332 ibt_recv_wr_t recv_wr; 2333 ibt_status_t ibt_status; 2334 2335 nds = 0; 2336 while (cl != NULL) { 2337 if (nds >= DSEG_MAX) { 
2338 return (RDMA_FAILED); 2339 } 2340 sgl[nds].ds_va = cl->w.c_saddr; 2341 sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */ 2342 sgl[nds].ds_len = cl->c_len; 2343 cl = cl->c_next; 2344 nds++; 2345 } 2346 2347 if (nds != 1) { 2348 rib_rbuf_free(conn, RECV_BUFFER, 2349 (caddr_t)(uintptr_t)sgl[0].ds_va); 2350 2351 return (RDMA_FAILED); 2352 } 2353 2354 bzero(&recv_wr, sizeof (ibt_recv_wr_t)); 2355 recv_wr.wr_nds = nds; 2356 recv_wr.wr_sgl = sgl; 2357 2358 s_recvp = rib_init_svc_recv(qp, &sgl[0]); 2359 /* Use s_recvp's addr as wr id */ 2360 recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)s_recvp; 2361 mutex_enter(&conn->c_lock); 2362 if (conn->c_state == C_CONNECTED) { 2363 ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL); 2364 } 2365 if (conn->c_state != C_CONNECTED || 2366 ibt_status != IBT_SUCCESS) { 2367 if (conn->c_state != C_DISCONN_PEND) 2368 conn->c_state = C_ERROR_CONN; 2369 mutex_exit(&conn->c_lock); 2370 rib_rbuf_free(conn, RECV_BUFFER, 2371 (caddr_t)(uintptr_t)sgl[0].ds_va); 2372 (void) rib_free_svc_recv(s_recvp); 2373 2374 return (RDMA_CONNLOST); 2375 } 2376 mutex_exit(&conn->c_lock); 2377 2378 return (RDMA_SUCCESS); 2379 } 2380 2381 /* Client */ 2382 rdma_stat 2383 rib_post_resp(CONN* conn, struct clist *cl, uint32_t msgid) 2384 { 2385 2386 return (rib_clnt_post(conn, cl, msgid)); 2387 } 2388 2389 /* Client */ 2390 rdma_stat 2391 rib_post_resp_remove(CONN* conn, uint32_t msgid) 2392 { 2393 rib_qp_t *qp = ctoqp(conn); 2394 struct reply *rep; 2395 2396 mutex_enter(&qp->replylist_lock); 2397 for (rep = qp->replylist; rep != NULL; rep = rep->next) { 2398 if (rep->xid == msgid) { 2399 if (rep->vaddr_cq) { 2400 rib_rbuf_free(conn, RECV_BUFFER, 2401 (caddr_t)(uintptr_t)rep->vaddr_cq); 2402 } 2403 (void) rib_remreply(qp, rep); 2404 break; 2405 } 2406 } 2407 mutex_exit(&qp->replylist_lock); 2408 2409 return (RDMA_SUCCESS); 2410 } 2411 2412 /* Server */ 2413 rdma_stat 2414 rib_post_recv(CONN *conn, struct clist *cl) 2415 { 2416 rib_qp_t *qp = ctoqp(conn); 2417 2418 if (rib_svc_post(conn, cl) == RDMA_SUCCESS) { 2419 mutex_enter(&qp->posted_rbufs_lock); 2420 qp->n_posted_rbufs++; 2421 mutex_exit(&qp->posted_rbufs_lock); 2422 return (RDMA_SUCCESS); 2423 } 2424 return (RDMA_FAILED); 2425 } 2426 2427 /* 2428 * Client side only interface to "recv" the rpc reply buf 2429 * posted earlier by rib_post_resp(conn, cl, msgid). 2430 */ 2431 rdma_stat 2432 rib_recv(CONN *conn, struct clist **clp, uint32_t msgid) 2433 { 2434 struct reply *rep = NULL; 2435 clock_t timout, cv_wait_ret; 2436 rdma_stat ret = RDMA_SUCCESS; 2437 rib_qp_t *qp = ctoqp(conn); 2438 2439 /* 2440 * Find the reply structure for this msgid 2441 */ 2442 mutex_enter(&qp->replylist_lock); 2443 2444 for (rep = qp->replylist; rep != NULL; rep = rep->next) { 2445 if (rep->xid == msgid) 2446 break; 2447 } 2448 2449 if (rep != NULL) { 2450 /* 2451 * If message not yet received, wait. 
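 * The wait is bounded by REPLY_WAIT_TIME seconds and may be
 * interrupted by a signal, in which case RDMA_INTR is returned;
 * on timeout the caller gets RDMA_TIMEDOUT.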
2452 */ 2453 if (rep->status == (uint_t)REPLY_WAIT) { 2454 timout = ddi_get_lbolt() + 2455 drv_usectohz(REPLY_WAIT_TIME * 1000000); 2456 2457 while ((cv_wait_ret = cv_timedwait_sig(&rep->wait_cv, 2458 &qp->replylist_lock, timout)) > 0 && 2459 rep->status == (uint_t)REPLY_WAIT) 2460 ; 2461 2462 switch (cv_wait_ret) { 2463 case -1: /* timeout */ 2464 ret = RDMA_TIMEDOUT; 2465 break; 2466 case 0: 2467 ret = RDMA_INTR; 2468 break; 2469 default: 2470 break; 2471 } 2472 } 2473 2474 if (rep->status == RDMA_SUCCESS) { 2475 struct clist *cl = NULL; 2476 2477 /* 2478 * Got message successfully 2479 */ 2480 clist_add(&cl, 0, rep->bytes_xfer, NULL, 2481 (caddr_t)(uintptr_t)rep->vaddr_cq, NULL, NULL); 2482 *clp = cl; 2483 } else { 2484 if (rep->status != (uint_t)REPLY_WAIT) { 2485 /* 2486 * Got error in reply message. Free 2487 * recv buffer here. 2488 */ 2489 ret = rep->status; 2490 rib_rbuf_free(conn, RECV_BUFFER, 2491 (caddr_t)(uintptr_t)rep->vaddr_cq); 2492 } 2493 } 2494 (void) rib_remreply(qp, rep); 2495 } else { 2496 /* 2497 * No matching reply structure found for given msgid on the 2498 * reply wait list. 2499 */ 2500 ret = RDMA_INVAL; 2501 DTRACE_PROBE(rpcib__i__nomatchxid2); 2502 } 2503 2504 /* 2505 * Done. 2506 */ 2507 mutex_exit(&qp->replylist_lock); 2508 return (ret); 2509 } 2510 2511 /* 2512 * RDMA write a buffer to the remote address. 2513 */ 2514 rdma_stat 2515 rib_write(CONN *conn, struct clist *cl, int wait) 2516 { 2517 ibt_send_wr_t tx_wr; 2518 int cv_sig; 2519 int i; 2520 ibt_wr_ds_t sgl[DSEG_MAX]; 2521 struct send_wid *wdesc; 2522 ibt_status_t ibt_status; 2523 rdma_stat ret = RDMA_SUCCESS; 2524 rib_qp_t *qp = ctoqp(conn); 2525 uint64_t n_writes = 0; 2526 bool_t force_wait = FALSE; 2527 2528 if (cl == NULL) { 2529 return (RDMA_FAILED); 2530 } 2531 2532 2533 while ((cl != NULL)) { 2534 if (cl->c_len > 0) { 2535 bzero(&tx_wr, sizeof (ibt_send_wr_t)); 2536 tx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->u.c_daddr; 2537 tx_wr.wr.rc.rcwr.rdma.rdma_rkey = 2538 cl->c_dmemhandle.mrc_rmr; /* rkey */ 2539 sgl[0].ds_va = cl->w.c_saddr; 2540 sgl[0].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */ 2541 sgl[0].ds_len = cl->c_len; 2542 2543 if (wait) { 2544 tx_wr.wr_flags = IBT_WR_SEND_SIGNAL; 2545 cv_sig = 1; 2546 } else { 2547 if (n_writes > max_unsignaled_rws) { 2548 n_writes = 0; 2549 force_wait = TRUE; 2550 tx_wr.wr_flags = IBT_WR_SEND_SIGNAL; 2551 cv_sig = 1; 2552 } else { 2553 tx_wr.wr_flags = IBT_WR_NO_FLAGS; 2554 cv_sig = 0; 2555 } 2556 } 2557 2558 wdesc = rib_init_sendwait(0, cv_sig, qp); 2559 tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc; 2560 tx_wr.wr_opcode = IBT_WRC_RDMAW; 2561 tx_wr.wr_trans = IBT_RC_SRV; 2562 tx_wr.wr_nds = 1; 2563 tx_wr.wr_sgl = sgl; 2564 2565 mutex_enter(&conn->c_lock); 2566 if (conn->c_state == C_CONNECTED) { 2567 ibt_status = 2568 ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL); 2569 } 2570 if (conn->c_state != C_CONNECTED || 2571 ibt_status != IBT_SUCCESS) { 2572 if (conn->c_state != C_DISCONN_PEND) 2573 conn->c_state = C_ERROR_CONN; 2574 mutex_exit(&conn->c_lock); 2575 (void) rib_free_sendwait(wdesc); 2576 return (RDMA_CONNLOST); 2577 } 2578 mutex_exit(&conn->c_lock); 2579 2580 /* 2581 * Wait for send to complete 2582 */ 2583 if (wait || force_wait) { 2584 force_wait = FALSE; 2585 ret = rib_sendwait(qp, wdesc); 2586 if (ret != 0) { 2587 return (ret); 2588 } 2589 } else { 2590 mutex_enter(&wdesc->sendwait_lock); 2591 for (i = 0; i < wdesc->nsbufs; i++) { 2592 rib_rbuf_free(qptoc(qp), SEND_BUFFER, 2593 (void *)(uintptr_t) 2594 wdesc->sbufaddr[i]); 2595 } 2596 
mutex_exit(&wdesc->sendwait_lock); 2597 (void) rib_free_sendwait(wdesc); 2598 } 2599 n_writes ++; 2600 } 2601 cl = cl->c_next; 2602 } 2603 return (RDMA_SUCCESS); 2604 } 2605 2606 /* 2607 * RDMA Read a buffer from the remote address. 2608 */ 2609 rdma_stat 2610 rib_read(CONN *conn, struct clist *cl, int wait) 2611 { 2612 ibt_send_wr_t rx_wr; 2613 int cv_sig; 2614 int i; 2615 ibt_wr_ds_t sgl; 2616 struct send_wid *wdesc; 2617 ibt_status_t ibt_status = IBT_SUCCESS; 2618 rdma_stat ret = RDMA_SUCCESS; 2619 rib_qp_t *qp = ctoqp(conn); 2620 2621 if (cl == NULL) { 2622 return (RDMA_FAILED); 2623 } 2624 2625 while (cl != NULL) { 2626 bzero(&rx_wr, sizeof (ibt_send_wr_t)); 2627 /* 2628 * Remote address is at the head chunk item in list. 2629 */ 2630 rx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->w.c_saddr; 2631 rx_wr.wr.rc.rcwr.rdma.rdma_rkey = cl->c_smemhandle.mrc_rmr; 2632 2633 sgl.ds_va = cl->u.c_daddr; 2634 sgl.ds_key = cl->c_dmemhandle.mrc_lmr; /* lkey */ 2635 sgl.ds_len = cl->c_len; 2636 2637 if (wait) { 2638 rx_wr.wr_flags = IBT_WR_SEND_SIGNAL; 2639 cv_sig = 1; 2640 } else { 2641 rx_wr.wr_flags = IBT_WR_NO_FLAGS; 2642 cv_sig = 0; 2643 } 2644 2645 wdesc = rib_init_sendwait(0, cv_sig, qp); 2646 rx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc; 2647 rx_wr.wr_opcode = IBT_WRC_RDMAR; 2648 rx_wr.wr_trans = IBT_RC_SRV; 2649 rx_wr.wr_nds = 1; 2650 rx_wr.wr_sgl = &sgl; 2651 2652 mutex_enter(&conn->c_lock); 2653 if (conn->c_state == C_CONNECTED) { 2654 ibt_status = ibt_post_send(qp->qp_hdl, &rx_wr, 1, NULL); 2655 } 2656 if (conn->c_state != C_CONNECTED || 2657 ibt_status != IBT_SUCCESS) { 2658 if (conn->c_state != C_DISCONN_PEND) 2659 conn->c_state = C_ERROR_CONN; 2660 mutex_exit(&conn->c_lock); 2661 (void) rib_free_sendwait(wdesc); 2662 return (RDMA_CONNLOST); 2663 } 2664 mutex_exit(&conn->c_lock); 2665 2666 /* 2667 * Wait for send to complete if this is the 2668 * last item in the list. 2669 */ 2670 if (wait && cl->c_next == NULL) { 2671 ret = rib_sendwait(qp, wdesc); 2672 if (ret != 0) { 2673 return (ret); 2674 } 2675 } else { 2676 mutex_enter(&wdesc->sendwait_lock); 2677 for (i = 0; i < wdesc->nsbufs; i++) { 2678 rib_rbuf_free(qptoc(qp), SEND_BUFFER, 2679 (void *)(uintptr_t)wdesc->sbufaddr[i]); 2680 } 2681 mutex_exit(&wdesc->sendwait_lock); 2682 (void) rib_free_sendwait(wdesc); 2683 } 2684 cl = cl->c_next; 2685 } 2686 return (RDMA_SUCCESS); 2687 } 2688 2689 /* 2690 * rib_srv_cm_handler() 2691 * Connection Manager callback to handle RC connection requests. 2692 */ 2693 /* ARGSUSED */ 2694 static ibt_cm_status_t 2695 rib_srv_cm_handler(void *any, ibt_cm_event_t *event, 2696 ibt_cm_return_args_t *ret_args, void *priv_data, 2697 ibt_priv_data_len_t len) 2698 { 2699 queue_t *q; 2700 rib_qp_t *qp; 2701 rpcib_state_t *ribstat; 2702 rib_hca_t *hca; 2703 rdma_stat status = RDMA_SUCCESS; 2704 int i; 2705 struct clist cl; 2706 rdma_buf_t rdbuf = {0}; 2707 void *buf = NULL; 2708 CONN *conn; 2709 ibt_ip_cm_info_t ipinfo; 2710 struct sockaddr_in *s; 2711 struct sockaddr_in6 *s6; 2712 int sin_size = sizeof (struct sockaddr_in); 2713 int in_size = sizeof (struct in_addr); 2714 int sin6_size = sizeof (struct sockaddr_in6); 2715 2716 ASSERT(any != NULL); 2717 ASSERT(event != NULL); 2718 2719 ribstat = (rpcib_state_t *)any; 2720 hca = (rib_hca_t *)ribstat->hca; 2721 ASSERT(hca != NULL); 2722 2723 /* got a connection request */ 2724 switch (event->cm_type) { 2725 case IBT_CM_EVENT_REQ_RCV: 2726 /* 2727 * If the plugin is in the NO_ACCEPT state, bail out. 
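 * (The plugin is moved back to the ACCEPT state by
 * rib_register_service() once at least one service binding
 * succeeds.)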
2728 */ 2729 mutex_enter(&plugin_state_lock); 2730 if (plugin_state == NO_ACCEPT) { 2731 mutex_exit(&plugin_state_lock); 2732 return (IBT_CM_REJECT); 2733 } 2734 mutex_exit(&plugin_state_lock); 2735 2736 /* 2737 * Need to send a MRA MAD to CM so that it does not 2738 * timeout on us. 2739 */ 2740 (void) ibt_cm_delay(IBT_CM_DELAY_REQ, event->cm_session_id, 2741 event->cm_event.req.req_timeout * 8, NULL, 0); 2742 2743 mutex_enter(&rib_stat->open_hca_lock); 2744 q = rib_stat->q; 2745 mutex_exit(&rib_stat->open_hca_lock); 2746 2747 status = rib_svc_create_chan(hca, (caddr_t)q, 2748 event->cm_event.req.req_prim_hca_port, &qp); 2749 2750 if (status) { 2751 return (IBT_CM_REJECT); 2752 } 2753 2754 ret_args->cm_ret.rep.cm_channel = qp->qp_hdl; 2755 ret_args->cm_ret.rep.cm_rdma_ra_out = 4; 2756 ret_args->cm_ret.rep.cm_rdma_ra_in = 4; 2757 ret_args->cm_ret.rep.cm_rnr_retry_cnt = RNR_RETRIES; 2758 2759 /* 2760 * Pre-posts RECV buffers 2761 */ 2762 conn = qptoc(qp); 2763 for (i = 0; i < preposted_rbufs; i++) { 2764 bzero(&rdbuf, sizeof (rdbuf)); 2765 rdbuf.type = RECV_BUFFER; 2766 buf = rib_rbuf_alloc(conn, &rdbuf); 2767 if (buf == NULL) { 2768 (void) rib_disconnect_channel(conn, NULL); 2769 return (IBT_CM_REJECT); 2770 } 2771 2772 bzero(&cl, sizeof (cl)); 2773 cl.w.c_saddr3 = (caddr_t)rdbuf.addr; 2774 cl.c_len = rdbuf.len; 2775 cl.c_smemhandle.mrc_lmr = 2776 rdbuf.handle.mrc_lmr; /* lkey */ 2777 cl.c_next = NULL; 2778 status = rib_post_recv(conn, &cl); 2779 if (status != RDMA_SUCCESS) { 2780 (void) rib_disconnect_channel(conn, NULL); 2781 return (IBT_CM_REJECT); 2782 } 2783 } 2784 (void) rib_add_connlist(conn, &hca->srv_conn_list); 2785 2786 /* 2787 * Get the address translation 2788 */ 2789 rw_enter(&hca->state_lock, RW_READER); 2790 if (hca->state == HCA_DETACHED) { 2791 rw_exit(&hca->state_lock); 2792 return (IBT_CM_REJECT); 2793 } 2794 rw_exit(&hca->state_lock); 2795 2796 bzero(&ipinfo, sizeof (ibt_ip_cm_info_t)); 2797 2798 if (ibt_get_ip_data(event->cm_priv_data_len, 2799 event->cm_priv_data, 2800 &ipinfo) != IBT_SUCCESS) { 2801 2802 return (IBT_CM_REJECT); 2803 } 2804 2805 switch (ipinfo.src_addr.family) { 2806 case AF_INET: 2807 2808 conn->c_raddr.maxlen = 2809 conn->c_raddr.len = sin_size; 2810 conn->c_raddr.buf = kmem_zalloc(sin_size, KM_SLEEP); 2811 2812 s = (struct sockaddr_in *)conn->c_raddr.buf; 2813 s->sin_family = AF_INET; 2814 2815 bcopy((void *)&ipinfo.src_addr.un.ip4addr, 2816 &s->sin_addr, in_size); 2817 2818 break; 2819 2820 case AF_INET6: 2821 2822 conn->c_raddr.maxlen = 2823 conn->c_raddr.len = sin6_size; 2824 conn->c_raddr.buf = kmem_zalloc(sin6_size, KM_SLEEP); 2825 2826 s6 = (struct sockaddr_in6 *)conn->c_raddr.buf; 2827 s6->sin6_family = AF_INET6; 2828 bcopy((void *)&ipinfo.src_addr.un.ip6addr, 2829 &s6->sin6_addr, 2830 sizeof (struct in6_addr)); 2831 2832 break; 2833 2834 default: 2835 return (IBT_CM_REJECT); 2836 } 2837 2838 break; 2839 2840 case IBT_CM_EVENT_CONN_CLOSED: 2841 { 2842 CONN *conn; 2843 rib_qp_t *qp; 2844 2845 switch (event->cm_event.closed) { 2846 case IBT_CM_CLOSED_DREP_RCVD: 2847 case IBT_CM_CLOSED_DREQ_TIMEOUT: 2848 case IBT_CM_CLOSED_DUP: 2849 case IBT_CM_CLOSED_ABORT: 2850 case IBT_CM_CLOSED_ALREADY: 2851 /* 2852 * These cases indicate the local end initiated 2853 * the closing of the channel. Nothing to do here. 2854 */ 2855 break; 2856 default: 2857 /* 2858 * Reason for CONN_CLOSED event must be one of 2859 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD 2860 * or IBT_CM_CLOSED_STALE. 
These indicate cases where
             * the remote end is closing the channel. In these
             * cases free the channel and transition to error
             * state.
             */
            qp = ibt_get_chan_private(event->cm_channel);
            conn = qptoc(qp);
            mutex_enter(&conn->c_lock);
            if (conn->c_state == C_DISCONN_PEND) {
                mutex_exit(&conn->c_lock);
                break;
            }
            conn->c_state = C_ERROR_CONN;

            /*
             * Free the rc_channel. Channel has already
             * transitioned to ERROR state and WRs have been
             * FLUSHED_ERR already.
             */
            (void) ibt_free_channel(qp->qp_hdl);
            qp->qp_hdl = NULL;

            /*
             * Free the conn if c_ref goes down to 0
             */
            if (conn->c_ref == 0) {
                /*
                 * Remove from list and free conn
                 */
                conn->c_state = C_DISCONN_PEND;
                mutex_exit(&conn->c_lock);
                (void) rib_disconnect_channel(conn,
                    &hca->srv_conn_list);
            } else {
                mutex_exit(&conn->c_lock);
            }
            DTRACE_PROBE(rpcib__i__srvcm_chandisconnect);
            break;
        }
        break;
    }
    case IBT_CM_EVENT_CONN_EST:
        /*
         * RTU received, hence connection established.
         */
        if (rib_debug > 1)
            cmn_err(CE_NOTE, "rib_srv_cm_handler: "
                "(CONN_EST) channel established");
        break;

    default:
        if (rib_debug > 2) {
            /* Let CM handle the following events. */
            if (event->cm_type == IBT_CM_EVENT_REP_RCV) {
                cmn_err(CE_NOTE, "rib_srv_cm_handler: "
                    "server recv'ed IBT_CM_EVENT_REP_RCV\n");
            } else if (event->cm_type == IBT_CM_EVENT_LAP_RCV) {
                cmn_err(CE_NOTE, "rib_srv_cm_handler: "
                    "server recv'ed IBT_CM_EVENT_LAP_RCV\n");
            } else if (event->cm_type == IBT_CM_EVENT_MRA_RCV) {
                cmn_err(CE_NOTE, "rib_srv_cm_handler: "
                    "server recv'ed IBT_CM_EVENT_MRA_RCV\n");
            } else if (event->cm_type == IBT_CM_EVENT_APR_RCV) {
                cmn_err(CE_NOTE, "rib_srv_cm_handler: "
                    "server recv'ed IBT_CM_EVENT_APR_RCV\n");
            } else if (event->cm_type == IBT_CM_EVENT_FAILURE) {
                cmn_err(CE_NOTE, "rib_srv_cm_handler: "
                    "server recv'ed IBT_CM_EVENT_FAILURE\n");
            }
        }
        return (IBT_CM_DEFAULT);
    }

    /* accept all other CM messages (i.e. let the CM handle them) */
    return (IBT_CM_ACCEPT);
}

static rdma_stat
rib_register_service(rib_hca_t *hca, int service_type)
{
    ibt_srv_desc_t sdesc;
    ibt_hca_portinfo_t *port_infop;
    ib_svc_id_t srv_id;
    ibt_srv_hdl_t srv_hdl;
    uint_t port_size;
    uint_t pki, i, num_ports, nbinds;
    ibt_status_t ibt_status;
    rib_service_t *new_service;
    ib_pkey_t pkey;

    /*
     * Query all ports for the given HCA
     */
    rw_enter(&hca->state_lock, RW_READER);
    if (hca->state != HCA_DETACHED) {
        ibt_status = ibt_query_hca_ports(hca->hca_hdl, 0, &port_infop,
            &num_ports, &port_size);
        rw_exit(&hca->state_lock);
    } else {
        rw_exit(&hca->state_lock);
        return (RDMA_FAILED);
    }
    if (ibt_status != IBT_SUCCESS) {
        return (RDMA_FAILED);
    }

    DTRACE_PROBE1(rpcib__i__regservice_numports,
        int, num_ports);

    for (i = 0; i < num_ports; i++) {
        if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) {
            DTRACE_PROBE1(rpcib__i__regservice__portinactive,
                int, i+1);
        } else if (port_infop[i].p_linkstate == IBT_PORT_ACTIVE) {
            DTRACE_PROBE1(rpcib__i__regservice__portactive,
                int, i+1);
        }
    }

    /*
     * Get all the IP addresses on this system to register the
     * given "service type" on all DNS recognized IP addrs.
     * Each service type such as NFS will have all the system's
     * IP addresses as its different names. For now the only
     * type of service we support in RPCIB is NFS.
     */
    rw_enter(&hca->service_list_lock, RW_WRITER);
    /*
     * Start registering and binding the service on the active
     * ports of this HCA.
     */
    nbinds = 0;
    new_service = NULL;

    /*
     * We use IP addresses as the service names for
     * service registration. Register each of them
     * with CM to obtain a svc_id and svc_hdl. We do not
     * register the service with the machine's loopback address.
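     *
     * The loop below walks every active port on this HCA and, for
     * each usable pkey in the port's pkey table, binds the service
     * handle to the port's primary GID with ibt_bind_service().
     * Every successful bind is recorded on the HCA's service_list
     * so that rib_stop_services() can later unbind and deregister
     * it.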
     */
    (void) bzero(&srv_id, sizeof (ib_svc_id_t));
    (void) bzero(&srv_hdl, sizeof (ibt_srv_hdl_t));
    (void) bzero(&sdesc, sizeof (ibt_srv_desc_t));

    sdesc.sd_handler = rib_srv_cm_handler;
    sdesc.sd_flags = 0;
    ibt_status = ibt_register_service(hca->ibt_clnt_hdl,
        &sdesc, ibt_get_ip_sid(IPPROTO_TCP, NFS_RDMA_PORT),
        1, &srv_hdl, &srv_id);

    for (i = 0; i < num_ports; i++) {
        if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE)
            continue;

        for (pki = 0; pki < port_infop[i].p_pkey_tbl_sz; pki++) {
            pkey = port_infop[i].p_pkey_tbl[pki];
            if ((pkey & IBSRM_HB) &&
                (pkey != IB_PKEY_INVALID_FULL)) {

                /*
                 * Allocate and prepare a service entry
                 */
                new_service =
                    kmem_zalloc(1 * sizeof (rib_service_t),
                    KM_SLEEP);

                new_service->srv_type = service_type;
                new_service->srv_hdl = srv_hdl;
                new_service->srv_next = NULL;

                ibt_status = ibt_bind_service(srv_hdl,
                    port_infop[i].p_sgid_tbl[0],
                    NULL, rib_stat, NULL);

                DTRACE_PROBE1(rpcib__i__regservice__bindres,
                    int, ibt_status);

                if (ibt_status != IBT_SUCCESS) {
                    kmem_free(new_service,
                        sizeof (rib_service_t));
                    new_service = NULL;
                    continue;
                }

                /*
                 * Add to the service list for this HCA
                 */
                new_service->srv_next = hca->service_list;
                hca->service_list = new_service;
                new_service = NULL;
                nbinds++;
            }
        }
    }
    rw_exit(&hca->service_list_lock);

    ibt_free_portinfo(port_infop, port_size);

    if (nbinds == 0) {
        return (RDMA_FAILED);
    } else {
        /*
         * Put this plugin into accept state, since at least
         * one registration was successful.
         */
        mutex_enter(&plugin_state_lock);
        plugin_state = ACCEPT;
        mutex_exit(&plugin_state_lock);
        return (RDMA_SUCCESS);
    }
}

void
rib_listen(struct rdma_svc_data *rd)
{
    rdma_stat status = RDMA_SUCCESS;

    rd->active = 0;
    rd->err_code = RDMA_FAILED;

    /*
     * First check if a hca is still attached
     */
    rw_enter(&rib_stat->hca->state_lock, RW_READER);
    if (rib_stat->hca->state != HCA_INITED) {
        rw_exit(&rib_stat->hca->state_lock);
        return;
    }
    rw_exit(&rib_stat->hca->state_lock);

    rib_stat->q = &rd->q;
    /*
     * Right now the only service type is NFS. Hence force feed this
     * value. Ideally, the service type should be passed down to this
     * plugin in rdma_svc_data.
     */
    rib_stat->service_type = NFS;
    status = rib_register_service(rib_stat->hca, NFS);
    if (status != RDMA_SUCCESS) {
        rd->err_code = status;
        return;
    }
    /*
     * Service is now active on the HCA; rd->err_code carries the
     * detailed status for the caller.
     */
    rd->active = 1;
    rd->err_code = status;
}

/* XXXX */
/* ARGSUSED */
static void
rib_listen_stop(struct rdma_svc_data *svcdata)
{
    rib_hca_t *hca;

    /*
     * KRPC called the RDMATF to stop the listeners. This means we
     * stop sending incoming or received requests to the KRPC master
     * transport handle for RDMA-IB. It also means that the
     * master transport handle, responsible for us, is going away.
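     *
     * Stopping the listener therefore amounts to flipping
     * plugin_state to NO_ACCEPT (so new connection requests are
     * rejected in rib_srv_cm_handler()), closing every channel on
     * the server connection list and then unbinding and
     * deregistering the registered services.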
3122 */ 3123 mutex_enter(&plugin_state_lock); 3124 plugin_state = NO_ACCEPT; 3125 if (svcdata != NULL) 3126 svcdata->active = 0; 3127 mutex_exit(&plugin_state_lock); 3128 3129 /* 3130 * First check if a hca is still attached 3131 */ 3132 hca = rib_stat->hca; 3133 rw_enter(&hca->state_lock, RW_READER); 3134 if (hca->state != HCA_INITED) { 3135 rw_exit(&hca->state_lock); 3136 return; 3137 } 3138 rib_close_channels(&hca->srv_conn_list); 3139 rib_stop_services(hca); 3140 rw_exit(&hca->state_lock); 3141 } 3142 3143 /* 3144 * Traverse the HCA's service list to unbind and deregister services. 3145 * Instead of unbinding the service for a service handle by 3146 * calling ibt_unbind_service() for each port/pkey, we unbind 3147 * all the services for the service handle by making only one 3148 * call to ibt_unbind_all_services(). Then, we deregister the 3149 * service for the service handle. 3150 * 3151 * When traversing the entries in service_list, we compare the 3152 * srv_hdl of the current entry with that of the next. If they 3153 * are different or if the next entry is NULL, the current entry 3154 * marks the last binding of the service handle. In this case, 3155 * call ibt_unbind_all_services() and deregister the service for 3156 * the service handle. If they are the same, the current and the 3157 * next entries are bound to the same service handle. In this 3158 * case, move on to the next entry. 3159 */ 3160 static void 3161 rib_stop_services(rib_hca_t *hca) 3162 { 3163 rib_service_t *srv_list, *to_remove; 3164 3165 /* 3166 * unbind and deregister the services for this service type. 3167 * Right now there is only one service type. In future it will 3168 * be passed down to this function. 3169 */ 3170 rw_enter(&hca->service_list_lock, RW_WRITER); 3171 srv_list = hca->service_list; 3172 while (srv_list != NULL) { 3173 to_remove = srv_list; 3174 srv_list = to_remove->srv_next; 3175 if (srv_list == NULL || bcmp(to_remove->srv_hdl, 3176 srv_list->srv_hdl, sizeof (ibt_srv_hdl_t))) { 3177 3178 (void) ibt_unbind_all_services(to_remove->srv_hdl); 3179 (void) ibt_deregister_service(hca->ibt_clnt_hdl, 3180 to_remove->srv_hdl); 3181 } 3182 3183 kmem_free(to_remove, sizeof (rib_service_t)); 3184 } 3185 hca->service_list = NULL; 3186 rw_exit(&hca->service_list_lock); 3187 } 3188 3189 static struct svc_recv * 3190 rib_init_svc_recv(rib_qp_t *qp, ibt_wr_ds_t *sgl) 3191 { 3192 struct svc_recv *recvp; 3193 3194 recvp = kmem_zalloc(sizeof (struct svc_recv), KM_SLEEP); 3195 recvp->vaddr = sgl->ds_va; 3196 recvp->qp = qp; 3197 recvp->bytes_xfer = 0; 3198 return (recvp); 3199 } 3200 3201 static int 3202 rib_free_svc_recv(struct svc_recv *recvp) 3203 { 3204 kmem_free(recvp, sizeof (*recvp)); 3205 3206 return (0); 3207 } 3208 3209 static struct reply * 3210 rib_addreplylist(rib_qp_t *qp, uint32_t msgid) 3211 { 3212 struct reply *rep; 3213 3214 3215 rep = kmem_zalloc(sizeof (struct reply), KM_NOSLEEP); 3216 if (rep == NULL) { 3217 DTRACE_PROBE(rpcib__i__addrreply__nomem); 3218 return (NULL); 3219 } 3220 rep->xid = msgid; 3221 rep->vaddr_cq = NULL; 3222 rep->bytes_xfer = 0; 3223 rep->status = (uint_t)REPLY_WAIT; 3224 rep->prev = NULL; 3225 cv_init(&rep->wait_cv, NULL, CV_DEFAULT, NULL); 3226 3227 mutex_enter(&qp->replylist_lock); 3228 if (qp->replylist) { 3229 rep->next = qp->replylist; 3230 qp->replylist->prev = rep; 3231 } 3232 qp->rep_list_size++; 3233 3234 DTRACE_PROBE1(rpcib__i__addrreply__listsize, 3235 int, qp->rep_list_size); 3236 3237 qp->replylist = rep; 3238 mutex_exit(&qp->replylist_lock); 3239 3240 return 
(rep); 3241 } 3242 3243 static rdma_stat 3244 rib_rem_replylist(rib_qp_t *qp) 3245 { 3246 struct reply *r, *n; 3247 3248 mutex_enter(&qp->replylist_lock); 3249 for (r = qp->replylist; r != NULL; r = n) { 3250 n = r->next; 3251 (void) rib_remreply(qp, r); 3252 } 3253 mutex_exit(&qp->replylist_lock); 3254 3255 return (RDMA_SUCCESS); 3256 } 3257 3258 static int 3259 rib_remreply(rib_qp_t *qp, struct reply *rep) 3260 { 3261 3262 ASSERT(MUTEX_HELD(&qp->replylist_lock)); 3263 if (rep->prev) { 3264 rep->prev->next = rep->next; 3265 } 3266 if (rep->next) { 3267 rep->next->prev = rep->prev; 3268 } 3269 if (qp->replylist == rep) 3270 qp->replylist = rep->next; 3271 3272 cv_destroy(&rep->wait_cv); 3273 qp->rep_list_size--; 3274 3275 DTRACE_PROBE1(rpcib__i__remreply__listsize, 3276 int, qp->rep_list_size); 3277 3278 kmem_free(rep, sizeof (*rep)); 3279 3280 return (0); 3281 } 3282 3283 rdma_stat 3284 rib_registermem(CONN *conn, caddr_t adsp, caddr_t buf, uint_t buflen, 3285 struct mrc *buf_handle) 3286 { 3287 ibt_mr_hdl_t mr_hdl = NULL; /* memory region handle */ 3288 ibt_mr_desc_t mr_desc; /* vaddr, lkey, rkey */ 3289 rdma_stat status; 3290 rib_hca_t *hca = (ctoqp(conn))->hca; 3291 3292 /* 3293 * Note: ALL buffer pools use the same memory type RDMARW. 3294 */ 3295 status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc); 3296 if (status == RDMA_SUCCESS) { 3297 buf_handle->mrc_linfo = (uintptr_t)mr_hdl; 3298 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey; 3299 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey; 3300 } else { 3301 buf_handle->mrc_linfo = NULL; 3302 buf_handle->mrc_lmr = 0; 3303 buf_handle->mrc_rmr = 0; 3304 } 3305 return (status); 3306 } 3307 3308 static rdma_stat 3309 rib_reg_mem(rib_hca_t *hca, caddr_t adsp, caddr_t buf, uint_t size, 3310 ibt_mr_flags_t spec, 3311 ibt_mr_hdl_t *mr_hdlp, ibt_mr_desc_t *mr_descp) 3312 { 3313 ibt_mr_attr_t mem_attr; 3314 ibt_status_t ibt_status; 3315 mem_attr.mr_vaddr = (uintptr_t)buf; 3316 mem_attr.mr_len = (ib_msglen_t)size; 3317 mem_attr.mr_as = (struct as *)(caddr_t)adsp; 3318 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE | 3319 IBT_MR_ENABLE_REMOTE_READ | IBT_MR_ENABLE_REMOTE_WRITE | 3320 IBT_MR_ENABLE_WINDOW_BIND | spec; 3321 3322 rw_enter(&hca->state_lock, RW_READER); 3323 if (hca->state == HCA_INITED) { 3324 ibt_status = ibt_register_mr(hca->hca_hdl, hca->pd_hdl, 3325 &mem_attr, mr_hdlp, mr_descp); 3326 rw_exit(&hca->state_lock); 3327 } else { 3328 rw_exit(&hca->state_lock); 3329 return (RDMA_FAILED); 3330 } 3331 3332 if (ibt_status != IBT_SUCCESS) { 3333 return (RDMA_FAILED); 3334 } 3335 return (RDMA_SUCCESS); 3336 } 3337 3338 rdma_stat 3339 rib_registermemsync(CONN *conn, caddr_t adsp, caddr_t buf, uint_t buflen, 3340 struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle, void *lrc) 3341 { 3342 ibt_mr_hdl_t mr_hdl = NULL; /* memory region handle */ 3343 rib_lrc_entry_t *l; 3344 ibt_mr_desc_t mr_desc; /* vaddr, lkey, rkey */ 3345 rdma_stat status; 3346 rib_hca_t *hca = (ctoqp(conn))->hca; 3347 3348 /* 3349 * Non-coherent memory registration. 
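 *
 * If the caller passes in a registration cache entry (lrc) that is
 * already registered, its cached mr handle and lkey/rkey are reused
 * and no new ibt_register_mr() call is made; otherwise the entire
 * cache buffer is registered and the handles are stashed back in the
 * entry for later reuse.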
3350 */ 3351 l = (rib_lrc_entry_t *)lrc; 3352 if (l) { 3353 if (l->registered) { 3354 buf_handle->mrc_linfo = 3355 (uintptr_t)l->lrc_mhandle.mrc_linfo; 3356 buf_handle->mrc_lmr = 3357 (uint32_t)l->lrc_mhandle.mrc_lmr; 3358 buf_handle->mrc_rmr = 3359 (uint32_t)l->lrc_mhandle.mrc_rmr; 3360 *sync_handle = (RIB_SYNCMEM_HANDLE) 3361 (uintptr_t)l->lrc_mhandle.mrc_linfo; 3362 return (RDMA_SUCCESS); 3363 } else { 3364 /* Always register the whole buffer */ 3365 buf = (caddr_t)l->lrc_buf; 3366 buflen = l->lrc_len; 3367 } 3368 } 3369 status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc); 3370 3371 if (status == RDMA_SUCCESS) { 3372 if (l) { 3373 l->lrc_mhandle.mrc_linfo = (uintptr_t)mr_hdl; 3374 l->lrc_mhandle.mrc_lmr = (uint32_t)mr_desc.md_lkey; 3375 l->lrc_mhandle.mrc_rmr = (uint32_t)mr_desc.md_rkey; 3376 l->registered = TRUE; 3377 } 3378 buf_handle->mrc_linfo = (uintptr_t)mr_hdl; 3379 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey; 3380 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey; 3381 *sync_handle = (RIB_SYNCMEM_HANDLE)mr_hdl; 3382 } else { 3383 buf_handle->mrc_linfo = NULL; 3384 buf_handle->mrc_lmr = 0; 3385 buf_handle->mrc_rmr = 0; 3386 } 3387 return (status); 3388 } 3389 3390 /* ARGSUSED */ 3391 rdma_stat 3392 rib_deregistermem(CONN *conn, caddr_t buf, struct mrc buf_handle) 3393 { 3394 rib_hca_t *hca = (ctoqp(conn))->hca; 3395 /* 3396 * Allow memory deregistration even if HCA is 3397 * getting detached. Need all outstanding 3398 * memory registrations to be deregistered 3399 * before HCA_DETACH_EVENT can be accepted. 3400 */ 3401 (void) ibt_deregister_mr(hca->hca_hdl, 3402 (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo); 3403 return (RDMA_SUCCESS); 3404 } 3405 3406 /* ARGSUSED */ 3407 rdma_stat 3408 rib_deregistermemsync(CONN *conn, caddr_t buf, struct mrc buf_handle, 3409 RIB_SYNCMEM_HANDLE sync_handle, void *lrc) 3410 { 3411 rib_lrc_entry_t *l; 3412 l = (rib_lrc_entry_t *)lrc; 3413 if (l) 3414 if (l->registered) 3415 return (RDMA_SUCCESS); 3416 3417 (void) rib_deregistermem(conn, buf, buf_handle); 3418 3419 return (RDMA_SUCCESS); 3420 } 3421 3422 /* ARGSUSED */ 3423 rdma_stat 3424 rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle, caddr_t buf, 3425 int len, int cpu) 3426 { 3427 ibt_status_t status; 3428 rib_hca_t *hca = (ctoqp(conn))->hca; 3429 ibt_mr_sync_t mr_segment; 3430 3431 mr_segment.ms_handle = (ibt_mr_hdl_t)shandle; 3432 mr_segment.ms_vaddr = (ib_vaddr_t)(uintptr_t)buf; 3433 mr_segment.ms_len = (ib_memlen_t)len; 3434 if (cpu) { 3435 /* make incoming data visible to memory */ 3436 mr_segment.ms_flags = IBT_SYNC_WRITE; 3437 } else { 3438 /* make memory changes visible to IO */ 3439 mr_segment.ms_flags = IBT_SYNC_READ; 3440 } 3441 rw_enter(&hca->state_lock, RW_READER); 3442 if (hca->state == HCA_INITED) { 3443 status = ibt_sync_mr(hca->hca_hdl, &mr_segment, 1); 3444 rw_exit(&hca->state_lock); 3445 } else { 3446 rw_exit(&hca->state_lock); 3447 return (RDMA_FAILED); 3448 } 3449 3450 if (status == IBT_SUCCESS) 3451 return (RDMA_SUCCESS); 3452 else { 3453 return (RDMA_FAILED); 3454 } 3455 } 3456 3457 /* 3458 * XXXX ???? 3459 */ 3460 static rdma_stat 3461 rib_getinfo(rdma_info_t *info) 3462 { 3463 /* 3464 * XXXX Hack! 
3465 */ 3466 info->addrlen = 16; 3467 info->mts = 1000000; 3468 info->mtu = 1000000; 3469 3470 return (RDMA_SUCCESS); 3471 } 3472 3473 rib_bufpool_t * 3474 rib_rbufpool_create(rib_hca_t *hca, int ptype, int num) 3475 { 3476 rib_bufpool_t *rbp = NULL; 3477 bufpool_t *bp = NULL; 3478 caddr_t buf; 3479 ibt_mr_attr_t mem_attr; 3480 ibt_status_t ibt_status; 3481 int i, j; 3482 3483 rbp = (rib_bufpool_t *)kmem_zalloc(sizeof (rib_bufpool_t), KM_SLEEP); 3484 3485 bp = (bufpool_t *)kmem_zalloc(sizeof (bufpool_t) + 3486 num * sizeof (void *), KM_SLEEP); 3487 3488 mutex_init(&bp->buflock, NULL, MUTEX_DRIVER, hca->iblock); 3489 bp->numelems = num; 3490 3491 3492 switch (ptype) { 3493 case SEND_BUFFER: 3494 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE; 3495 bp->rsize = RPC_MSG_SZ; 3496 break; 3497 case RECV_BUFFER: 3498 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE; 3499 bp->rsize = RPC_BUF_SIZE; 3500 break; 3501 default: 3502 goto fail; 3503 } 3504 3505 /* 3506 * Register the pool. 3507 */ 3508 bp->bufsize = num * bp->rsize; 3509 bp->buf = kmem_zalloc(bp->bufsize, KM_SLEEP); 3510 rbp->mr_hdl = (ibt_mr_hdl_t *)kmem_zalloc(num * 3511 sizeof (ibt_mr_hdl_t), KM_SLEEP); 3512 rbp->mr_desc = (ibt_mr_desc_t *)kmem_zalloc(num * 3513 sizeof (ibt_mr_desc_t), KM_SLEEP); 3514 rw_enter(&hca->state_lock, RW_READER); 3515 3516 if (hca->state != HCA_INITED) { 3517 rw_exit(&hca->state_lock); 3518 goto fail; 3519 } 3520 3521 for (i = 0, buf = bp->buf; i < num; i++, buf += bp->rsize) { 3522 bzero(&rbp->mr_desc[i], sizeof (ibt_mr_desc_t)); 3523 mem_attr.mr_vaddr = (uintptr_t)buf; 3524 mem_attr.mr_len = (ib_msglen_t)bp->rsize; 3525 mem_attr.mr_as = NULL; 3526 ibt_status = ibt_register_mr(hca->hca_hdl, 3527 hca->pd_hdl, &mem_attr, 3528 &rbp->mr_hdl[i], 3529 &rbp->mr_desc[i]); 3530 if (ibt_status != IBT_SUCCESS) { 3531 for (j = 0; j < i; j++) { 3532 (void) ibt_deregister_mr(hca->hca_hdl, 3533 rbp->mr_hdl[j]); 3534 } 3535 rw_exit(&hca->state_lock); 3536 goto fail; 3537 } 3538 } 3539 rw_exit(&hca->state_lock); 3540 buf = (caddr_t)bp->buf; 3541 for (i = 0; i < num; i++, buf += bp->rsize) { 3542 bp->buflist[i] = (void *)buf; 3543 } 3544 bp->buffree = num - 1; /* no. of free buffers */ 3545 rbp->bpool = bp; 3546 3547 return (rbp); 3548 fail: 3549 if (bp) { 3550 if (bp->buf) 3551 kmem_free(bp->buf, bp->bufsize); 3552 kmem_free(bp, sizeof (bufpool_t) + num*sizeof (void *)); 3553 } 3554 if (rbp) { 3555 if (rbp->mr_hdl) 3556 kmem_free(rbp->mr_hdl, num*sizeof (ibt_mr_hdl_t)); 3557 if (rbp->mr_desc) 3558 kmem_free(rbp->mr_desc, num*sizeof (ibt_mr_desc_t)); 3559 kmem_free(rbp, sizeof (rib_bufpool_t)); 3560 } 3561 return (NULL); 3562 } 3563 3564 static void 3565 rib_rbufpool_deregister(rib_hca_t *hca, int ptype) 3566 { 3567 int i; 3568 rib_bufpool_t *rbp = NULL; 3569 bufpool_t *bp; 3570 3571 /* 3572 * Obtain pool address based on type of pool 3573 */ 3574 switch (ptype) { 3575 case SEND_BUFFER: 3576 rbp = hca->send_pool; 3577 break; 3578 case RECV_BUFFER: 3579 rbp = hca->recv_pool; 3580 break; 3581 default: 3582 return; 3583 } 3584 if (rbp == NULL) 3585 return; 3586 3587 bp = rbp->bpool; 3588 3589 /* 3590 * Deregister the pool memory and free it. 
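 *
 * (Only the IBT memory registrations are torn down here; the buffer
 * memory itself is released later by rib_rbufpool_free().)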
3591 */ 3592 for (i = 0; i < bp->numelems; i++) { 3593 (void) ibt_deregister_mr(hca->hca_hdl, rbp->mr_hdl[i]); 3594 } 3595 } 3596 3597 static void 3598 rib_rbufpool_free(rib_hca_t *hca, int ptype) 3599 { 3600 3601 rib_bufpool_t *rbp = NULL; 3602 bufpool_t *bp; 3603 3604 /* 3605 * Obtain pool address based on type of pool 3606 */ 3607 switch (ptype) { 3608 case SEND_BUFFER: 3609 rbp = hca->send_pool; 3610 break; 3611 case RECV_BUFFER: 3612 rbp = hca->recv_pool; 3613 break; 3614 default: 3615 return; 3616 } 3617 if (rbp == NULL) 3618 return; 3619 3620 bp = rbp->bpool; 3621 3622 /* 3623 * Free the pool memory. 3624 */ 3625 if (rbp->mr_hdl) 3626 kmem_free(rbp->mr_hdl, bp->numelems*sizeof (ibt_mr_hdl_t)); 3627 3628 if (rbp->mr_desc) 3629 kmem_free(rbp->mr_desc, bp->numelems*sizeof (ibt_mr_desc_t)); 3630 if (bp->buf) 3631 kmem_free(bp->buf, bp->bufsize); 3632 mutex_destroy(&bp->buflock); 3633 kmem_free(bp, sizeof (bufpool_t) + bp->numelems*sizeof (void *)); 3634 kmem_free(rbp, sizeof (rib_bufpool_t)); 3635 } 3636 3637 void 3638 rib_rbufpool_destroy(rib_hca_t *hca, int ptype) 3639 { 3640 /* 3641 * Deregister the pool memory and free it. 3642 */ 3643 rib_rbufpool_deregister(hca, ptype); 3644 rib_rbufpool_free(hca, ptype); 3645 } 3646 3647 /* 3648 * Fetch a buffer from the pool of type specified in rdbuf->type. 3649 */ 3650 static rdma_stat 3651 rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf) 3652 { 3653 rib_lrc_entry_t *rlep; 3654 3655 if (rdbuf->type == RDMA_LONG_BUFFER) { 3656 rlep = rib_get_cache_buf(conn, rdbuf->len); 3657 rdbuf->rb_private = (caddr_t)rlep; 3658 rdbuf->addr = rlep->lrc_buf; 3659 rdbuf->handle = rlep->lrc_mhandle; 3660 return (RDMA_SUCCESS); 3661 } 3662 3663 rdbuf->addr = rib_rbuf_alloc(conn, rdbuf); 3664 if (rdbuf->addr) { 3665 switch (rdbuf->type) { 3666 case SEND_BUFFER: 3667 rdbuf->len = RPC_MSG_SZ; /* 1K */ 3668 break; 3669 case RECV_BUFFER: 3670 rdbuf->len = RPC_BUF_SIZE; /* 2K */ 3671 break; 3672 default: 3673 rdbuf->len = 0; 3674 } 3675 return (RDMA_SUCCESS); 3676 } else 3677 return (RDMA_FAILED); 3678 } 3679 3680 #if defined(MEASURE_POOL_DEPTH) 3681 static void rib_recv_bufs(uint32_t x) { 3682 3683 } 3684 3685 static void rib_send_bufs(uint32_t x) { 3686 3687 } 3688 #endif 3689 3690 /* 3691 * Fetch a buffer of specified type. 3692 * Note that rdbuf->handle is mw's rkey. 3693 */ 3694 static void * 3695 rib_rbuf_alloc(CONN *conn, rdma_buf_t *rdbuf) 3696 { 3697 rib_qp_t *qp = ctoqp(conn); 3698 rib_hca_t *hca = qp->hca; 3699 rdma_btype ptype = rdbuf->type; 3700 void *buf; 3701 rib_bufpool_t *rbp = NULL; 3702 bufpool_t *bp; 3703 int i; 3704 3705 /* 3706 * Obtain pool address based on type of pool 3707 */ 3708 switch (ptype) { 3709 case SEND_BUFFER: 3710 rbp = hca->send_pool; 3711 break; 3712 case RECV_BUFFER: 3713 rbp = hca->recv_pool; 3714 break; 3715 default: 3716 return (NULL); 3717 } 3718 if (rbp == NULL) 3719 return (NULL); 3720 3721 bp = rbp->bpool; 3722 3723 mutex_enter(&bp->buflock); 3724 if (bp->buffree < 0) { 3725 mutex_exit(&bp->buflock); 3726 return (NULL); 3727 } 3728 3729 /* XXXX put buf, rdbuf->handle.mrc_rmr, ... in one place. 
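 *
 * For now the buffer handed out below is matched against the pool's
 * registration table by virtual address so that its rkey, lkey and
 * mr handle can be copied into rdbuf->handle for the caller.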
*/ 3730 buf = bp->buflist[bp->buffree]; 3731 rdbuf->addr = buf; 3732 rdbuf->len = bp->rsize; 3733 for (i = bp->numelems - 1; i >= 0; i--) { 3734 if ((ib_vaddr_t)(uintptr_t)buf == rbp->mr_desc[i].md_vaddr) { 3735 rdbuf->handle.mrc_rmr = 3736 (uint32_t)rbp->mr_desc[i].md_rkey; 3737 rdbuf->handle.mrc_linfo = 3738 (uintptr_t)rbp->mr_hdl[i]; 3739 rdbuf->handle.mrc_lmr = 3740 (uint32_t)rbp->mr_desc[i].md_lkey; 3741 #if defined(MEASURE_POOL_DEPTH) 3742 if (ptype == SEND_BUFFER) 3743 rib_send_bufs(MAX_BUFS - (bp->buffree+1)); 3744 if (ptype == RECV_BUFFER) 3745 rib_recv_bufs(MAX_BUFS - (bp->buffree+1)); 3746 #endif 3747 bp->buffree--; 3748 3749 mutex_exit(&bp->buflock); 3750 3751 return (buf); 3752 } 3753 } 3754 3755 mutex_exit(&bp->buflock); 3756 3757 return (NULL); 3758 } 3759 3760 static void 3761 rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf) 3762 { 3763 3764 if (rdbuf->type == RDMA_LONG_BUFFER) { 3765 rib_free_cache_buf(conn, (rib_lrc_entry_t *)rdbuf->rb_private); 3766 rdbuf->rb_private = NULL; 3767 return; 3768 } 3769 rib_rbuf_free(conn, rdbuf->type, rdbuf->addr); 3770 } 3771 3772 static void 3773 rib_rbuf_free(CONN *conn, int ptype, void *buf) 3774 { 3775 rib_qp_t *qp = ctoqp(conn); 3776 rib_hca_t *hca = qp->hca; 3777 rib_bufpool_t *rbp = NULL; 3778 bufpool_t *bp; 3779 3780 /* 3781 * Obtain pool address based on type of pool 3782 */ 3783 switch (ptype) { 3784 case SEND_BUFFER: 3785 rbp = hca->send_pool; 3786 break; 3787 case RECV_BUFFER: 3788 rbp = hca->recv_pool; 3789 break; 3790 default: 3791 return; 3792 } 3793 if (rbp == NULL) 3794 return; 3795 3796 bp = rbp->bpool; 3797 3798 mutex_enter(&bp->buflock); 3799 if (++bp->buffree >= bp->numelems) { 3800 /* 3801 * Should never happen 3802 */ 3803 bp->buffree--; 3804 } else { 3805 bp->buflist[bp->buffree] = buf; 3806 } 3807 mutex_exit(&bp->buflock); 3808 } 3809 3810 static rdma_stat 3811 rib_add_connlist(CONN *cn, rib_conn_list_t *connlist) 3812 { 3813 rw_enter(&connlist->conn_lock, RW_WRITER); 3814 if (connlist->conn_hd) { 3815 cn->c_next = connlist->conn_hd; 3816 connlist->conn_hd->c_prev = cn; 3817 } 3818 connlist->conn_hd = cn; 3819 rw_exit(&connlist->conn_lock); 3820 3821 return (RDMA_SUCCESS); 3822 } 3823 3824 static rdma_stat 3825 rib_rm_conn(CONN *cn, rib_conn_list_t *connlist) 3826 { 3827 rw_enter(&connlist->conn_lock, RW_WRITER); 3828 if (cn->c_prev) { 3829 cn->c_prev->c_next = cn->c_next; 3830 } 3831 if (cn->c_next) { 3832 cn->c_next->c_prev = cn->c_prev; 3833 } 3834 if (connlist->conn_hd == cn) 3835 connlist->conn_hd = cn->c_next; 3836 rw_exit(&connlist->conn_lock); 3837 3838 return (RDMA_SUCCESS); 3839 } 3840 3841 /* 3842 * Connection management. 3843 * IBTF does not support recycling of channels. So connections are only 3844 * in four states - C_CONN_PEND, or C_CONNECTED, or C_ERROR_CONN or 3845 * C_DISCONN_PEND state. No C_IDLE state. 3846 * C_CONN_PEND state: Connection establishment in progress to the server. 3847 * C_CONNECTED state: A connection when created is in C_CONNECTED state. 3848 * It has an RC channel associated with it. ibt_post_send/recv are allowed 3849 * only in this state. 3850 * C_ERROR_CONN state: A connection transitions to this state when WRs on the 3851 * channel are completed in error or an IBT_CM_EVENT_CONN_CLOSED event 3852 * happens on the channel or a IBT_HCA_DETACH_EVENT occurs on the HCA. 
3853 * C_DISCONN_PEND state: When a connection is in C_ERROR_CONN state and when 3854 * c_ref drops to 0 (this indicates that RPC has no more references to this 3855 * connection), the connection should be destroyed. A connection transitions 3856 * into this state when it is being destroyed. 3857 */ 3858 /* ARGSUSED */ 3859 static rdma_stat 3860 rib_conn_get(struct netbuf *svcaddr, int addr_type, void *handle, CONN **conn) 3861 { 3862 CONN *cn; 3863 int status = RDMA_SUCCESS; 3864 rib_hca_t *hca = rib_stat->hca; 3865 rib_qp_t *qp; 3866 clock_t cv_stat, timout; 3867 rpcib_ping_t rpt; 3868 3869 if (hca == NULL) 3870 return (RDMA_FAILED); 3871 3872 rw_enter(&rib_stat->hca->state_lock, RW_READER); 3873 if (hca->state == HCA_DETACHED) { 3874 rw_exit(&rib_stat->hca->state_lock); 3875 return (RDMA_FAILED); 3876 } 3877 rw_exit(&rib_stat->hca->state_lock); 3878 3879 again: 3880 rw_enter(&hca->cl_conn_list.conn_lock, RW_READER); 3881 cn = hca->cl_conn_list.conn_hd; 3882 while (cn != NULL) { 3883 /* 3884 * First, clear up any connection in the ERROR state 3885 */ 3886 mutex_enter(&cn->c_lock); 3887 if (cn->c_state == C_ERROR_CONN) { 3888 if (cn->c_ref == 0) { 3889 /* 3890 * Remove connection from list and destroy it. 3891 */ 3892 cn->c_state = C_DISCONN_PEND; 3893 mutex_exit(&cn->c_lock); 3894 rw_exit(&hca->cl_conn_list.conn_lock); 3895 (void) rib_disconnect_channel(cn, 3896 &hca->cl_conn_list); 3897 goto again; 3898 } 3899 mutex_exit(&cn->c_lock); 3900 cn = cn->c_next; 3901 continue; 3902 } 3903 if (cn->c_state == C_DISCONN_PEND) { 3904 mutex_exit(&cn->c_lock); 3905 cn = cn->c_next; 3906 continue; 3907 } 3908 if ((cn->c_raddr.len == svcaddr->len) && 3909 bcmp(svcaddr->buf, cn->c_raddr.buf, svcaddr->len) == 0) { 3910 /* 3911 * Our connection. Give up conn list lock 3912 * as we are done traversing the list. 3913 */ 3914 rw_exit(&hca->cl_conn_list.conn_lock); 3915 if (cn->c_state == C_CONNECTED) { 3916 cn->c_ref++; /* sharing a conn */ 3917 mutex_exit(&cn->c_lock); 3918 *conn = cn; 3919 return (status); 3920 } 3921 if (cn->c_state == C_CONN_PEND) { 3922 /* 3923 * Hold a reference to this conn before 3924 * we give up the lock. 3925 */ 3926 cn->c_ref++; 3927 timout = ddi_get_lbolt() + 3928 drv_usectohz(CONN_WAIT_TIME * 1000000); 3929 while ((cv_stat = cv_timedwait_sig(&cn->c_cv, 3930 &cn->c_lock, timout)) > 0 && 3931 cn->c_state == C_CONN_PEND) 3932 ; 3933 if (cv_stat == 0) { 3934 cn->c_ref--; 3935 mutex_exit(&cn->c_lock); 3936 return (RDMA_INTR); 3937 } 3938 if (cv_stat < 0) { 3939 cn->c_ref--; 3940 mutex_exit(&cn->c_lock); 3941 return (RDMA_TIMEDOUT); 3942 } 3943 if (cn->c_state == C_CONNECTED) { 3944 *conn = cn; 3945 mutex_exit(&cn->c_lock); 3946 return (status); 3947 } else { 3948 cn->c_ref--; 3949 mutex_exit(&cn->c_lock); 3950 return (RDMA_TIMEDOUT); 3951 } 3952 } 3953 } 3954 mutex_exit(&cn->c_lock); 3955 cn = cn->c_next; 3956 } 3957 rw_exit(&hca->cl_conn_list.conn_lock); 3958 3959 bzero(&rpt, sizeof (rpcib_ping_t)); 3960 3961 status = rib_ping_srv(addr_type, svcaddr, &rpt); 3962 if (status != RDMA_SUCCESS) { 3963 return (RDMA_FAILED); 3964 } 3965 3966 /* 3967 * Channel to server doesn't exist yet, create one. 3968 */ 3969 if (rib_clnt_create_chan(hca, svcaddr, &qp) != RDMA_SUCCESS) { 3970 return (RDMA_FAILED); 3971 } 3972 cn = qptoc(qp); 3973 cn->c_state = C_CONN_PEND; 3974 cn->c_ref = 1; 3975 3976 /* 3977 * Add to conn list. 3978 * We had given up the READER lock. In the time since then, 3979 * another thread might have created the connection we are 3980 * trying here. 
But for now, that is quiet alright - there 3981 * might be two connections between a pair of hosts instead 3982 * of one. If we really want to close that window, 3983 * then need to check the list after acquiring the 3984 * WRITER lock. 3985 */ 3986 (void) rib_add_connlist(cn, &hca->cl_conn_list); 3987 status = rib_conn_to_srv(hca, qp, &rpt); 3988 mutex_enter(&cn->c_lock); 3989 if (status == RDMA_SUCCESS) { 3990 cn->c_state = C_CONNECTED; 3991 *conn = cn; 3992 } else { 3993 cn->c_state = C_ERROR_CONN; 3994 cn->c_ref--; 3995 } 3996 cv_broadcast(&cn->c_cv); 3997 mutex_exit(&cn->c_lock); 3998 return (status); 3999 } 4000 4001 static rdma_stat 4002 rib_conn_release(CONN *conn) 4003 { 4004 rib_qp_t *qp = ctoqp(conn); 4005 4006 mutex_enter(&conn->c_lock); 4007 conn->c_ref--; 4008 4009 /* 4010 * If a conn is C_ERROR_CONN, close the channel. 4011 * If it's CONNECTED, keep it that way. 4012 */ 4013 if (conn->c_ref == 0 && conn->c_state == C_ERROR_CONN) { 4014 conn->c_state = C_DISCONN_PEND; 4015 mutex_exit(&conn->c_lock); 4016 if (qp->mode == RIB_SERVER) 4017 (void) rib_disconnect_channel(conn, 4018 &qp->hca->srv_conn_list); 4019 else 4020 (void) rib_disconnect_channel(conn, 4021 &qp->hca->cl_conn_list); 4022 return (RDMA_SUCCESS); 4023 } 4024 mutex_exit(&conn->c_lock); 4025 return (RDMA_SUCCESS); 4026 } 4027 4028 /* 4029 * Add at front of list 4030 */ 4031 static struct rdma_done_list * 4032 rdma_done_add(rib_qp_t *qp, uint32_t xid) 4033 { 4034 struct rdma_done_list *rd; 4035 4036 ASSERT(MUTEX_HELD(&qp->rdlist_lock)); 4037 4038 rd = kmem_alloc(sizeof (*rd), KM_SLEEP); 4039 rd->xid = xid; 4040 cv_init(&rd->rdma_done_cv, NULL, CV_DEFAULT, NULL); 4041 4042 rd->prev = NULL; 4043 rd->next = qp->rdlist; 4044 if (qp->rdlist != NULL) 4045 qp->rdlist->prev = rd; 4046 qp->rdlist = rd; 4047 4048 return (rd); 4049 } 4050 4051 static void 4052 rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd) 4053 { 4054 struct rdma_done_list *r; 4055 4056 ASSERT(MUTEX_HELD(&qp->rdlist_lock)); 4057 4058 r = rd->next; 4059 if (r != NULL) { 4060 r->prev = rd->prev; 4061 } 4062 4063 r = rd->prev; 4064 if (r != NULL) { 4065 r->next = rd->next; 4066 } else { 4067 qp->rdlist = rd->next; 4068 } 4069 4070 cv_destroy(&rd->rdma_done_cv); 4071 kmem_free(rd, sizeof (*rd)); 4072 } 4073 4074 static void 4075 rdma_done_rem_list(rib_qp_t *qp) 4076 { 4077 struct rdma_done_list *r, *n; 4078 4079 mutex_enter(&qp->rdlist_lock); 4080 for (r = qp->rdlist; r != NULL; r = n) { 4081 n = r->next; 4082 rdma_done_rm(qp, r); 4083 } 4084 mutex_exit(&qp->rdlist_lock); 4085 } 4086 4087 static void 4088 rdma_done_notify(rib_qp_t *qp, uint32_t xid) 4089 { 4090 struct rdma_done_list *r = qp->rdlist; 4091 4092 ASSERT(MUTEX_HELD(&qp->rdlist_lock)); 4093 4094 while (r) { 4095 if (r->xid == xid) { 4096 cv_signal(&r->rdma_done_cv); 4097 return; 4098 } else { 4099 r = r->next; 4100 } 4101 } 4102 DTRACE_PROBE1(rpcib__i__donenotify__nomatchxid, 4103 int, xid); 4104 } 4105 4106 4107 /* 4108 * Goes through all connections and closes the channel 4109 * This will cause all the WRs on those channels to be 4110 * flushed. 4111 */ 4112 static void 4113 rib_close_channels(rib_conn_list_t *connlist) 4114 { 4115 CONN *conn; 4116 rib_qp_t *qp; 4117 4118 rw_enter(&connlist->conn_lock, RW_READER); 4119 conn = connlist->conn_hd; 4120 while (conn != NULL) { 4121 mutex_enter(&conn->c_lock); 4122 qp = ctoqp(conn); 4123 if (conn->c_state == C_CONNECTED) { 4124 /* 4125 * Live connection in CONNECTED state. 4126 * Call ibt_close_rc_channel in nonblocking mode 4127 * with no callbacks. 
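 *
 * IBT_NOCALLBACKS keeps the CM from calling back into this
 * module while the connection list is being torn down; closing
 * the channel causes any WRs still queued on it to be flushed.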
4128 */ 4129 conn->c_state = C_ERROR_CONN; 4130 (void) ibt_close_rc_channel(qp->qp_hdl, 4131 IBT_NOCALLBACKS, NULL, 0, NULL, NULL, 0); 4132 (void) ibt_free_channel(qp->qp_hdl); 4133 qp->qp_hdl = NULL; 4134 } else { 4135 if (conn->c_state == C_ERROR_CONN && 4136 qp->qp_hdl != NULL) { 4137 /* 4138 * Connection in ERROR state but 4139 * channel is not yet freed. 4140 */ 4141 (void) ibt_close_rc_channel(qp->qp_hdl, 4142 IBT_NOCALLBACKS, NULL, 0, NULL, 4143 NULL, 0); 4144 (void) ibt_free_channel(qp->qp_hdl); 4145 qp->qp_hdl = NULL; 4146 } 4147 } 4148 mutex_exit(&conn->c_lock); 4149 conn = conn->c_next; 4150 } 4151 rw_exit(&connlist->conn_lock); 4152 } 4153 4154 /* 4155 * Frees up all connections that are no longer being referenced 4156 */ 4157 static void 4158 rib_purge_connlist(rib_conn_list_t *connlist) 4159 { 4160 CONN *conn; 4161 4162 top: 4163 rw_enter(&connlist->conn_lock, RW_READER); 4164 conn = connlist->conn_hd; 4165 while (conn != NULL) { 4166 mutex_enter(&conn->c_lock); 4167 4168 /* 4169 * At this point connection is either in ERROR 4170 * or DISCONN_PEND state. If in DISCONN_PEND state 4171 * then some other thread is culling that connection. 4172 * If not and if c_ref is 0, then destroy the connection. 4173 */ 4174 if (conn->c_ref == 0 && 4175 conn->c_state != C_DISCONN_PEND) { 4176 /* 4177 * Cull the connection 4178 */ 4179 conn->c_state = C_DISCONN_PEND; 4180 mutex_exit(&conn->c_lock); 4181 rw_exit(&connlist->conn_lock); 4182 (void) rib_disconnect_channel(conn, connlist); 4183 goto top; 4184 } else { 4185 /* 4186 * conn disconnect already scheduled or will 4187 * happen from conn_release when c_ref drops to 0. 4188 */ 4189 mutex_exit(&conn->c_lock); 4190 } 4191 conn = conn->c_next; 4192 } 4193 rw_exit(&connlist->conn_lock); 4194 4195 /* 4196 * At this point, only connections with c_ref != 0 are on the list 4197 */ 4198 } 4199 4200 /* 4201 * Cleans and closes up all uses of the HCA 4202 */ 4203 static void 4204 rib_detach_hca(rib_hca_t *hca) 4205 { 4206 4207 /* 4208 * Stop all services on the HCA 4209 * Go through cl_conn_list and close all rc_channels 4210 * Go through svr_conn_list and close all rc_channels 4211 * Free connections whose c_ref has dropped to 0 4212 * Destroy all CQs 4213 * Deregister and released all buffer pool memory after all 4214 * connections are destroyed 4215 * Free the protection domain 4216 * ibt_close_hca() 4217 */ 4218 rw_enter(&hca->state_lock, RW_WRITER); 4219 if (hca->state == HCA_DETACHED) { 4220 rw_exit(&hca->state_lock); 4221 return; 4222 } 4223 4224 hca->state = HCA_DETACHED; 4225 rib_stat->nhca_inited--; 4226 4227 rib_stop_services(hca); 4228 rib_close_channels(&hca->cl_conn_list); 4229 rib_close_channels(&hca->srv_conn_list); 4230 4231 rib_mod.rdma_count--; 4232 4233 rw_exit(&hca->state_lock); 4234 4235 /* 4236 * purge will free all datastructures used by CQ handlers. We don't 4237 * want to receive completions after purge, so we'll free the CQs now. 
	 */
	(void) ibt_free_cq(hca->clnt_rcq->rib_cq_hdl);
	(void) ibt_free_cq(hca->clnt_scq->rib_cq_hdl);
	(void) ibt_free_cq(hca->svc_rcq->rib_cq_hdl);
	(void) ibt_free_cq(hca->svc_scq->rib_cq_hdl);

	rib_purge_connlist(&hca->cl_conn_list);
	rib_purge_connlist(&hca->srv_conn_list);

	kmem_free(hca->clnt_rcq, sizeof (rib_cq_t));
	kmem_free(hca->clnt_scq, sizeof (rib_cq_t));
	kmem_free(hca->svc_rcq, sizeof (rib_cq_t));
	kmem_free(hca->svc_scq, sizeof (rib_cq_t));
	if (stats_enabled) {
		kstat_delete_byname_zone("unix", 0, "rpcib_cache",
		    GLOBAL_ZONEID);
	}

	rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
	rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
	if (hca->srv_conn_list.conn_hd == NULL &&
	    hca->cl_conn_list.conn_hd == NULL) {
		/*
		 * conn_lists are NULL, so destroy
		 * buffers, close hca and be done.
		 */
		rib_rbufpool_destroy(hca, RECV_BUFFER);
		rib_rbufpool_destroy(hca, SEND_BUFFER);
		rib_destroy_cache(hca);
		rdma_unregister_mod(&rib_mod);
		(void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
		(void) ibt_close_hca(hca->hca_hdl);
		hca->hca_hdl = NULL;
	}
	rw_exit(&hca->cl_conn_list.conn_lock);
	rw_exit(&hca->srv_conn_list.conn_lock);

	if (hca->hca_hdl != NULL) {
		mutex_enter(&hca->inuse_lock);
		while (hca->inuse)
			cv_wait(&hca->cb_cv, &hca->inuse_lock);
		mutex_exit(&hca->inuse_lock);

		rdma_unregister_mod(&rib_mod);

		/*
		 * conn_lists are now NULL, so destroy
		 * buffers, close hca and be done.
		 */
		rib_rbufpool_destroy(hca, RECV_BUFFER);
		rib_rbufpool_destroy(hca, SEND_BUFFER);
		rib_destroy_cache(hca);
		(void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
		(void) ibt_close_hca(hca->hca_hdl);
		hca->hca_hdl = NULL;
	}
}

static void
rib_server_side_cache_reclaim(void *argp)
{
	cache_avl_struct_t	*rcas;
	rib_lrc_entry_t		*rb;
	rib_hca_t *hca = (rib_hca_t *)argp;

	rw_enter(&hca->avl_rw_lock, RW_WRITER);
	rcas = avl_first(&hca->avl_tree);
	if (rcas != NULL)
		avl_remove(&hca->avl_tree, rcas);

	while (rcas != NULL) {
		while (rcas->r.forw != &rcas->r) {
			rcas->elements--;
			rib_total_buffers--;
			rb = rcas->r.forw;
			remque(rb);
			if (rb->registered)
				(void) rib_deregistermem_via_hca(hca,
				    rb->lrc_buf, rb->lrc_mhandle);
			cache_allocation -= rb->lrc_len;
			kmem_free(rb->lrc_buf, rb->lrc_len);
			kmem_free(rb, sizeof (rib_lrc_entry_t));
		}
		mutex_destroy(&rcas->node_lock);
		kmem_cache_free(hca->server_side_cache, rcas);
		rcas = avl_first(&hca->avl_tree);
		if (rcas != NULL)
			avl_remove(&hca->avl_tree, rcas);
	}
	rw_exit(&hca->avl_rw_lock);
}
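
/*
 * Note on the layout of the server-side registered buffer cache (for
 * reference): hca->avl_tree is an AVL tree of cache_avl_struct_t nodes
 * keyed by buffer length (see avl_compare() below).  Each node anchors a
 * circular doubly-linked list of free rib_lrc_entry_t buffers of that
 * length, threaded through r.forw/r.back with insque()/remque().  A rough,
 * illustrative sketch of a lookup for a given length (not compiled; the
 * real logic, with locking and the miss path, is in rib_get_cache_buf()):
 *
 *	cache_avl_struct_t cas, *rcas;
 *	rib_lrc_entry_t *rb = NULL;
 *
 *	cas.len = len;
 *	rcas = avl_find(&hca->avl_tree, &cas, NULL);
 *	if (rcas != NULL && rcas->r.forw != &rcas->r) {
 *		rb = rcas->r.forw;	(most recently freed buffer)
 *		remque(rb);
 *		rcas->elements--;
 *	}
 */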

static void
rib_server_side_cache_cleanup(void *argp)
{
	cache_avl_struct_t	*rcas;
	rib_lrc_entry_t		*rb;
	rib_hca_t *hca = (rib_hca_t *)argp;

	rw_enter(&hca->avl_rw_lock, RW_READER);
	if (cache_allocation < cache_limit) {
		rw_exit(&hca->avl_rw_lock);
		return;
	}
	rw_exit(&hca->avl_rw_lock);

	rw_enter(&hca->avl_rw_lock, RW_WRITER);
	rcas = avl_last(&hca->avl_tree);
	if (rcas != NULL)
		avl_remove(&hca->avl_tree, rcas);

	while (rcas != NULL) {
		while (rcas->r.forw != &rcas->r) {
			rcas->elements--;
			rib_total_buffers--;
			rb = rcas->r.forw;
			remque(rb);
			if (rb->registered)
				(void) rib_deregistermem_via_hca(hca,
				    rb->lrc_buf, rb->lrc_mhandle);
			cache_allocation -= rb->lrc_len;
			kmem_free(rb->lrc_buf, rb->lrc_len);
			kmem_free(rb, sizeof (rib_lrc_entry_t));
		}
		mutex_destroy(&rcas->node_lock);
		if (hca->server_side_cache) {
			kmem_cache_free(hca->server_side_cache, rcas);
		}
		if ((cache_allocation) < cache_limit) {
			rw_exit(&hca->avl_rw_lock);
			return;
		}

		rcas = avl_last(&hca->avl_tree);
		if (rcas != NULL)
			avl_remove(&hca->avl_tree, rcas);
	}
	rw_exit(&hca->avl_rw_lock);
}

static int
avl_compare(const void *t1, const void *t2)
{
	if (((cache_avl_struct_t *)t1)->len == ((cache_avl_struct_t *)t2)->len)
		return (0);

	if (((cache_avl_struct_t *)t1)->len < ((cache_avl_struct_t *)t2)->len)
		return (-1);

	return (1);
}

static void
rib_destroy_cache(rib_hca_t *hca)
{
	if (hca->reg_cache_clean_up != NULL) {
		ddi_taskq_destroy(hca->reg_cache_clean_up);
		hca->reg_cache_clean_up = NULL;
	}
	if (hca->avl_init) {
		rib_server_side_cache_reclaim((void *)hca);
		if (hca->server_side_cache) {
			kmem_cache_destroy(hca->server_side_cache);
			hca->server_side_cache = NULL;
		}
		avl_destroy(&hca->avl_tree);
		mutex_destroy(&hca->cache_allocation);
		rw_destroy(&hca->avl_rw_lock);
	}
	hca->avl_init = FALSE;
}

static void
rib_force_cleanup(void *hca)
{
	if (((rib_hca_t *)hca)->reg_cache_clean_up != NULL)
		(void) ddi_taskq_dispatch(
		    ((rib_hca_t *)hca)->reg_cache_clean_up,
		    rib_server_side_cache_cleanup,
		    (void *)hca, DDI_NOSLEEP);
}
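
/*
 * Typical use of the cache (illustrative sketch only; the callers live in
 * the RDMA transport code, and the steps below are just placeholders): a
 * long reply buffer of a given length is checked out and later returned,
 * still registered, so repeated transfers of the same size avoid the cost
 * of registering and deregistering memory on every RPC:
 *
 *	rib_lrc_entry_t *lrc;
 *
 *	lrc = rib_get_cache_buf(conn, len);
 *	if (!lrc->registered) {
 *		(register lrc->lrc_buf, record the handle in
 *		lrc->lrc_mhandle, and set lrc->registered = TRUE)
 *	}
 *	(use lrc->lrc_buf for the transfer)
 *	rib_free_cache_buf(conn, lrc);
 */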

static rib_lrc_entry_t *
rib_get_cache_buf(CONN *conn, uint32_t len)
{
	cache_avl_struct_t	cas, *rcas;
	rib_hca_t	*hca = (ctoqp(conn))->hca;
	rib_lrc_entry_t *reply_buf;
	avl_index_t where = NULL;
	uint64_t c_alloc = 0;

	if (!hca->avl_init)
		goto error_alloc;

	cas.len = len;

	rw_enter(&hca->avl_rw_lock, RW_READER);

	mutex_enter(&hca->cache_allocation);
	c_alloc = cache_allocation;
	mutex_exit(&hca->cache_allocation);

	if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree, &cas,
	    &where)) == NULL) {
		/* Am I above the cache limit */
		if ((c_alloc + len) >= cache_limit) {
			rib_force_cleanup((void *)hca);
			rw_exit(&hca->avl_rw_lock);
			cache_misses_above_the_limit++;

			/* Allocate and register the buffer directly */
			goto error_alloc;
		}

		rw_exit(&hca->avl_rw_lock);
		rw_enter(&hca->avl_rw_lock, RW_WRITER);

		/* Recheck to make sure no other thread added the entry in */
		if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree,
		    &cas, &where)) == NULL) {
			/* Allocate an avl tree entry */
			rcas = (cache_avl_struct_t *)
			    kmem_cache_alloc(hca->server_side_cache, KM_SLEEP);

			bzero(rcas, sizeof (cache_avl_struct_t));
			rcas->elements = 0;
			rcas->r.forw = &rcas->r;
			rcas->r.back = &rcas->r;
			rcas->len = len;
			mutex_init(&rcas->node_lock, NULL, MUTEX_DEFAULT, NULL);
			avl_insert(&hca->avl_tree, rcas, where);
		}
	}

	mutex_enter(&rcas->node_lock);

	if (rcas->r.forw != &rcas->r && rcas->elements > 0) {
		rib_total_buffers--;
		cache_hits++;
		reply_buf = rcas->r.forw;
		remque(reply_buf);
		rcas->elements--;
		mutex_exit(&rcas->node_lock);
		rw_exit(&hca->avl_rw_lock);
		mutex_enter(&hca->cache_allocation);
		cache_allocation -= len;
		mutex_exit(&hca->cache_allocation);
	} else {
		/* Am I above the cache limit */
		mutex_exit(&rcas->node_lock);
		if ((c_alloc + len) >= cache_limit) {
			rib_force_cleanup((void *)hca);
			rw_exit(&hca->avl_rw_lock);
			cache_misses_above_the_limit++;
			/* Allocate and register the buffer directly */
			goto error_alloc;
		}
		rw_exit(&hca->avl_rw_lock);
		cache_misses++;
		/* Allocate a reply_buf entry */
		reply_buf = (rib_lrc_entry_t *)
		    kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP);
		bzero(reply_buf, sizeof (rib_lrc_entry_t));
		reply_buf->lrc_buf = kmem_alloc(len, KM_SLEEP);
		reply_buf->lrc_len = len;
		reply_buf->registered = FALSE;
		reply_buf->avl_node = (void *)rcas;
	}

	return (reply_buf);

error_alloc:
	reply_buf = (rib_lrc_entry_t *)
	    kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP);
	bzero(reply_buf, sizeof (rib_lrc_entry_t));
	reply_buf->lrc_buf = kmem_alloc(len, KM_SLEEP);
	reply_buf->lrc_len = len;
	reply_buf->registered = FALSE;
	reply_buf->avl_node = NULL;

	return (reply_buf);
}

/*
 * Return a pre-registered buffer back to the cache (without
 * unregistering it).
 */
static void
rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *reg_buf)
{
	cache_avl_struct_t cas, *rcas;
	avl_index_t	where = NULL;
	rib_hca_t	*hca = (ctoqp(conn))->hca;

	if (!hca->avl_init)
		goto error_free;

	cas.len = reg_buf->lrc_len;
	rw_enter(&hca->avl_rw_lock, RW_READER);
	if ((rcas = (cache_avl_struct_t *)
	    avl_find(&hca->avl_tree, &cas, &where)) == NULL) {
		rw_exit(&hca->avl_rw_lock);
		goto error_free;
	} else {
		rib_total_buffers++;
		cas.len = reg_buf->lrc_len;
		mutex_enter(&rcas->node_lock);
		insque(reg_buf, &rcas->r);
		rcas->elements++;
		mutex_exit(&rcas->node_lock);
		rw_exit(&hca->avl_rw_lock);
		mutex_enter(&hca->cache_allocation);
		cache_allocation += cas.len;
		mutex_exit(&hca->cache_allocation);
	}

	return;

error_free:

	if (reg_buf->registered)
		(void) rib_deregistermem_via_hca(hca,
		    reg_buf->lrc_buf, reg_buf->lrc_mhandle);
	kmem_free(reg_buf->lrc_buf, reg_buf->lrc_len);
	kmem_free(reg_buf, sizeof (rib_lrc_entry_t));
}

static rdma_stat
rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp, caddr_t buf,
	uint_t buflen, struct mrc *buf_handle)
{
	ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
	ibt_mr_desc_t	mr_desc;	/* vaddr, lkey, rkey */
	rdma_stat	status;


	/*
	 * Note: ALL buffer pools use the same memory type RDMARW.
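	 * The handle returned to the caller packs the ibt_mr_hdl_t into
	 * mrc_linfo (as a uint64_t) and the local/remote keys into
	 * mrc_lmr/mrc_rmr; rib_deregistermem_via_hca() below unpacks
	 * mrc_linfo to tear the registration down.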
	 */
	status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
	if (status == RDMA_SUCCESS) {
		buf_handle->mrc_linfo = (uint64_t)(uintptr_t)mr_hdl;
		buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
		buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
	} else {
		buf_handle->mrc_linfo = NULL;
		buf_handle->mrc_lmr = 0;
		buf_handle->mrc_rmr = 0;
	}
	return (status);
}

/* ARGSUSED */
static rdma_stat
rib_deregistermemsync_via_hca(rib_hca_t *hca, caddr_t buf,
    struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle)
{

	(void) rib_deregistermem_via_hca(hca, buf, buf_handle);
	return (RDMA_SUCCESS);
}

/* ARGSUSED */
static rdma_stat
rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf, struct mrc buf_handle)
{

	(void) ibt_deregister_mr(hca->hca_hdl,
	    (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
	return (RDMA_SUCCESS);
}

/*
 * Check if the IP interface named by `lifrp' is RDMA-capable.
 */
static boolean_t
rpcib_rdma_capable_interface(struct lifreq *lifrp)
{
	char ifname[LIFNAMSIZ];
	char *cp;

	if (lifrp->lifr_type == IFT_IB)
		return (B_TRUE);

	/*
	 * Strip off the logical interface portion before getting
	 * intimate with the name.
	 */
	(void) strlcpy(ifname, lifrp->lifr_name, LIFNAMSIZ);
	if ((cp = strchr(ifname, ':')) != NULL)
		*cp = '\0';

	return (strcmp("lo0", ifname) == 0);
}

static int
rpcib_do_ip_ioctl(int cmd, int len, void *arg)
{
	vnode_t *kvp, *vp;
	TIUSER  *tiptr;
	struct strioctl iocb;
	k_sigset_t smask;
	int	err = 0;

	if (lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW, NULLVPP, &kvp) == 0) {
		if (t_kopen(NULL, kvp->v_rdev, FREAD|FWRITE,
		    &tiptr, CRED()) == 0) {
			vp = tiptr->fp->f_vnode;
		} else {
			VN_RELE(kvp);
			return (EPROTO);
		}
	} else {
		return (EPROTO);
	}

	iocb.ic_cmd = cmd;
	iocb.ic_timout = 0;
	iocb.ic_len = len;
	iocb.ic_dp = (caddr_t)arg;
	sigintr(&smask, 0);
	err = kstr_ioctl(vp, I_STR, (intptr_t)&iocb);
	sigunintr(&smask);
	(void) t_kclose(tiptr, 0);
	VN_RELE(kvp);
	return (err);
}

/*
 * Issue an SIOCGLIFCONF down to IP and return the result in `lifcp'.
 * lifcp->lifc_buf is dynamically allocated to be *bufsizep bytes.
 */
static int
rpcib_do_lifconf(struct lifconf *lifcp, uint_t *bufsizep)
{
	int err;
	struct lifnum lifn;

	bzero(&lifn, sizeof (struct lifnum));
	lifn.lifn_family = AF_UNSPEC;

	err = rpcib_do_ip_ioctl(SIOCGLIFNUM, sizeof (struct lifnum), &lifn);
	if (err != 0)
		return (err);

	/*
	 * Pad the interface count to account for additional interfaces that
	 * may have been configured between the SIOCGLIFNUM and SIOCGLIFCONF.
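	 * For example, if SIOCGLIFNUM reports 6 interfaces, the buffer below
	 * is sized for 10 lifreq entries (10 * sizeof (struct lifreq)).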
	 */
	lifn.lifn_count += 4;

	bzero(lifcp, sizeof (struct lifconf));
	lifcp->lifc_family = AF_UNSPEC;
	lifcp->lifc_len = *bufsizep = lifn.lifn_count * sizeof (struct lifreq);
	lifcp->lifc_buf = kmem_zalloc(*bufsizep, KM_SLEEP);

	err = rpcib_do_ip_ioctl(SIOCGLIFCONF, sizeof (struct lifconf), lifcp);
	if (err != 0) {
		kmem_free(lifcp->lifc_buf, *bufsizep);
		return (err);
	}
	return (0);
}

static boolean_t
rpcib_get_ib_addresses(rpcib_ipaddrs_t *addrs4, rpcib_ipaddrs_t *addrs6)
{
	uint_t i, nifs;
	uint_t bufsize;
	struct lifconf lifc;
	struct lifreq *lifrp;
	struct sockaddr_in *sinp;
	struct sockaddr_in6 *sin6p;

	bzero(addrs4, sizeof (rpcib_ipaddrs_t));
	bzero(addrs6, sizeof (rpcib_ipaddrs_t));

	if (rpcib_do_lifconf(&lifc, &bufsize) != 0)
		return (B_FALSE);

	if ((nifs = lifc.lifc_len / sizeof (struct lifreq)) == 0) {
		kmem_free(lifc.lifc_buf, bufsize);
		return (B_FALSE);
	}

	/*
	 * Worst case is that all of the addresses are IB-capable and have
	 * the same address family, so size our buffers accordingly.
	 */
	addrs4->ri_size = nifs * sizeof (struct sockaddr_in);
	addrs4->ri_list = kmem_zalloc(addrs4->ri_size, KM_SLEEP);
	addrs6->ri_size = nifs * sizeof (struct sockaddr_in6);
	addrs6->ri_list = kmem_zalloc(addrs6->ri_size, KM_SLEEP);

	for (lifrp = lifc.lifc_req, i = 0; i < nifs; i++, lifrp++) {
		if (!rpcib_rdma_capable_interface(lifrp))
			continue;

		if (lifrp->lifr_addr.ss_family == AF_INET) {
			sinp = addrs4->ri_list;
			bcopy(&lifrp->lifr_addr, &sinp[addrs4->ri_count++],
			    sizeof (struct sockaddr_in));
		} else if (lifrp->lifr_addr.ss_family == AF_INET6) {
			sin6p = addrs6->ri_list;
			bcopy(&lifrp->lifr_addr, &sin6p[addrs6->ri_count++],
			    sizeof (struct sockaddr_in6));
		}
	}

	kmem_free(lifc.lifc_buf, bufsize);
	return (B_TRUE);
}

/* ARGSUSED */
static int
rpcib_cache_kstat_update(kstat_t *ksp, int rw)
{
	if (KSTAT_WRITE == rw) {
		return (EACCES);
	}
	rpcib_kstat.cache_limit.value.ui64 =
	    (uint64_t)cache_limit;
	rpcib_kstat.cache_allocation.value.ui64 =
	    (uint64_t)cache_allocation;
	rpcib_kstat.cache_hits.value.ui64 =
	    (uint64_t)cache_hits;
	rpcib_kstat.cache_misses.value.ui64 =
	    (uint64_t)cache_misses;
	rpcib_kstat.cache_misses_above_the_limit.value.ui64 =
	    (uint64_t)cache_misses_above_the_limit;
	return (0);
}
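
/*
 * For reference, a minimal sketch of how rpcib_cache_kstat_update() would
 * be wired to a named kstat.  This is illustrative only; the kstat is
 * actually created elsewhere in this driver when the buffer cache is set
 * up, and the exact creation arguments shown here are assumptions:
 *
 *	kstat_t *ksp;
 *
 *	ksp = kstat_create_zone("unix", 0, "rpcib_cache", "rpc",
 *	    KSTAT_TYPE_NAMED,
 *	    sizeof (rpcib_kstat) / sizeof (kstat_named_t),
 *	    KSTAT_FLAG_VIRTUAL, GLOBAL_ZONEID);
 *	if (ksp != NULL) {
 *		ksp->ks_data = (void *)&rpcib_kstat;
 *		ksp->ks_update = rpcib_cache_kstat_update;
 *		kstat_install(ksp);
 *		stats_enabled = TRUE;
 *	}
 *
 * The module/instance/name/zone tuple matches the
 * kstat_delete_byname_zone("unix", 0, "rpcib_cache", GLOBAL_ZONEID) call
 * in rib_detach_hca() above.
 */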