1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * Copyright (c) 2007, The Ohio State University. All rights reserved. 28 * 29 * Portions of this source code is developed by the team members of 30 * The Ohio State University's Network-Based Computing Laboratory (NBCL), 31 * headed by Professor Dhabaleswar K. (DK) Panda. 32 * 33 * Acknowledgements to contributions from developors: 34 * Ranjit Noronha: noronha@cse.ohio-state.edu 35 * Lei Chai : chail@cse.ohio-state.edu 36 * Weikuan Yu : yuw@cse.ohio-state.edu 37 * 38 */ 39 40 /* 41 * The rpcib plugin. Implements the interface for RDMATF's 42 * interaction with IBTF. 43 */ 44 45 #include <sys/param.h> 46 #include <sys/types.h> 47 #include <sys/user.h> 48 #include <sys/systm.h> 49 #include <sys/sysmacros.h> 50 #include <sys/proc.h> 51 #include <sys/socket.h> 52 #include <sys/file.h> 53 #include <sys/stream.h> 54 #include <sys/strsubr.h> 55 #include <sys/stropts.h> 56 #include <sys/errno.h> 57 #include <sys/kmem.h> 58 #include <sys/debug.h> 59 #include <sys/pathname.h> 60 #include <sys/kstat.h> 61 #include <sys/t_lock.h> 62 #include <sys/ddi.h> 63 #include <sys/cmn_err.h> 64 #include <sys/time.h> 65 #include <sys/isa_defs.h> 66 #include <sys/callb.h> 67 #include <sys/sunddi.h> 68 #include <sys/sunndi.h> 69 #include <sys/sdt.h> 70 #include <sys/ib/ibtl/ibti.h> 71 #include <rpc/rpc.h> 72 #include <rpc/ib.h> 73 #include <sys/modctl.h> 74 #include <sys/kstr.h> 75 #include <sys/sockio.h> 76 #include <sys/vnode.h> 77 #include <sys/tiuser.h> 78 #include <net/if.h> 79 #include <net/if_types.h> 80 #include <sys/cred.h> 81 #include <rpc/rpc_rdma.h> 82 #include <nfs/nfs.h> 83 #include <sys/atomic.h> 84 85 #define NFS_RDMA_PORT 20049 86 87 88 /* 89 * Convenience structures for connection management 90 */ 91 typedef struct rpcib_ipaddrs { 92 void *ri_list; /* pointer to list of addresses */ 93 uint_t ri_count; /* number of addresses in list */ 94 uint_t ri_size; /* size of ri_list in bytes */ 95 } rpcib_ipaddrs_t; 96 97 98 typedef struct rpcib_ping { 99 rib_hca_t *hca; 100 ibt_path_info_t path; 101 ibt_ip_addr_t srcip; 102 ibt_ip_addr_t dstip; 103 } rpcib_ping_t; 104 105 /* 106 * Prototype declarations for driver ops 107 */ 108 static int rpcib_attach(dev_info_t *, ddi_attach_cmd_t); 109 static int rpcib_getinfo(dev_info_t *, ddi_info_cmd_t, 110 void *, void **); 111 static int rpcib_detach(dev_info_t *, ddi_detach_cmd_t); 112 static boolean_t rpcib_rdma_capable_interface(struct lifreq *); 113 static int rpcib_do_ip_ioctl(int, int, void *); 114 static boolean_t rpcib_get_ib_addresses(rpcib_ipaddrs_t *, rpcib_ipaddrs_t *); 115 static int 
rpcib_cache_kstat_update(kstat_t *, int); 116 static void rib_force_cleanup(void *); 117 118 struct { 119 kstat_named_t cache_limit; 120 kstat_named_t cache_allocation; 121 kstat_named_t cache_hits; 122 kstat_named_t cache_misses; 123 kstat_named_t cache_misses_above_the_limit; 124 } rpcib_kstat = { 125 {"cache_limit", KSTAT_DATA_UINT64 }, 126 {"cache_allocation", KSTAT_DATA_UINT64 }, 127 {"cache_hits", KSTAT_DATA_UINT64 }, 128 {"cache_misses", KSTAT_DATA_UINT64 }, 129 {"cache_misses_above_the_limit", KSTAT_DATA_UINT64 }, 130 }; 131 132 /* rpcib cb_ops */ 133 static struct cb_ops rpcib_cbops = { 134 nulldev, /* open */ 135 nulldev, /* close */ 136 nodev, /* strategy */ 137 nodev, /* print */ 138 nodev, /* dump */ 139 nodev, /* read */ 140 nodev, /* write */ 141 nodev, /* ioctl */ 142 nodev, /* devmap */ 143 nodev, /* mmap */ 144 nodev, /* segmap */ 145 nochpoll, /* poll */ 146 ddi_prop_op, /* prop_op */ 147 NULL, /* stream */ 148 D_MP, /* cb_flag */ 149 CB_REV, /* rev */ 150 nodev, /* int (*cb_aread)() */ 151 nodev /* int (*cb_awrite)() */ 152 }; 153 154 /* 155 * Device options 156 */ 157 static struct dev_ops rpcib_ops = { 158 DEVO_REV, /* devo_rev, */ 159 0, /* refcnt */ 160 rpcib_getinfo, /* info */ 161 nulldev, /* identify */ 162 nulldev, /* probe */ 163 rpcib_attach, /* attach */ 164 rpcib_detach, /* detach */ 165 nodev, /* reset */ 166 &rpcib_cbops, /* driver ops - devctl interfaces */ 167 NULL, /* bus operations */ 168 NULL, /* power */ 169 ddi_quiesce_not_needed, /* quiesce */ 170 }; 171 172 /* 173 * Module linkage information. 174 */ 175 176 static struct modldrv rib_modldrv = { 177 &mod_driverops, /* Driver module */ 178 "RPCIB plugin driver", /* Driver name and version */ 179 &rpcib_ops, /* Driver ops */ 180 }; 181 182 static struct modlinkage rib_modlinkage = { 183 MODREV_1, 184 (void *)&rib_modldrv, 185 NULL 186 }; 187 188 typedef struct rib_lrc_entry { 189 struct rib_lrc_entry *forw; 190 struct rib_lrc_entry *back; 191 char *lrc_buf; 192 193 uint32_t lrc_len; 194 void *avl_node; 195 bool_t registered; 196 197 struct mrc lrc_mhandle; 198 bool_t lrc_on_freed_list; 199 } rib_lrc_entry_t; 200 201 typedef struct cache_struct { 202 rib_lrc_entry_t r; 203 uint32_t len; 204 uint32_t elements; 205 kmutex_t node_lock; 206 avl_node_t avl_link; 207 } cache_avl_struct_t; 208 209 static uint64_t rib_total_buffers = 0; 210 uint64_t cache_limit = 100 * 1024 * 1024; 211 static volatile uint64_t cache_allocation = 0; 212 static uint64_t cache_watermark = 80 * 1024 * 1024; 213 static uint64_t cache_hits = 0; 214 static uint64_t cache_misses = 0; 215 static uint64_t cache_cold_misses = 0; 216 static uint64_t cache_hot_misses = 0; 217 static uint64_t cache_misses_above_the_limit = 0; 218 static bool_t stats_enabled = FALSE; 219 220 static uint64_t max_unsignaled_rws = 5; 221 int nfs_rdma_port = NFS_RDMA_PORT; 222 223 /* 224 * rib_stat: private data pointer used when registering 225 * with the IBTF. It is returned to the consumer 226 * in all callbacks. 227 */ 228 static rpcib_state_t *rib_stat = NULL; 229 230 #define RNR_RETRIES IBT_RNR_RETRY_1 231 #define MAX_PORTS 2 232 #define RDMA_DUMMY_WRID 0x4D3A1D4D3A1D 233 #define RDMA_CONN_REAP_RETRY 10 /* 10 secs */ 234 235 int preposted_rbufs = RDMA_BUFS_GRANT; 236 int send_threshold = 1; 237 238 /* 239 * Old cards with Tavor driver have limited memory footprint 240 * when booted in 32bit. The rib_max_rbufs tunable can be 241 * tuned for more buffers if needed. 
242 */ 243 244 #if !defined(_ELF64) && !defined(__sparc) 245 int rib_max_rbufs = MAX_BUFS; 246 #else 247 int rib_max_rbufs = 10 * MAX_BUFS; 248 #endif /* !(_ELF64) && !(__sparc) */ 249 250 int rib_conn_timeout = 60 * 12; /* 12 minutes */ 251 252 /* 253 * State of the plugin. 254 * ACCEPT = accepting new connections and requests. 255 * NO_ACCEPT = not accepting new connection and requests. 256 * This should eventually move to rpcib_state_t structure, since this 257 * will tell in which state the plugin is for a particular type of service 258 * like NFS, NLM or v4 Callback deamon. The plugin might be in accept 259 * state for one and in no_accept state for the other. 260 */ 261 int plugin_state; 262 kmutex_t plugin_state_lock; 263 264 ldi_ident_t rpcib_li; 265 266 /* 267 * RPCIB RDMATF operations 268 */ 269 static rdma_stat rib_reachable(int addr_type, struct netbuf *, void **handle); 270 static rdma_stat rib_disconnect(CONN *conn); 271 static void rib_listen(struct rdma_svc_data *rd); 272 static void rib_listen_stop(struct rdma_svc_data *rd); 273 static rdma_stat rib_registermem(CONN *conn, caddr_t adsp, caddr_t buf, 274 uint_t buflen, struct mrc *buf_handle); 275 static rdma_stat rib_deregistermem(CONN *conn, caddr_t buf, 276 struct mrc buf_handle); 277 static rdma_stat rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp, 278 caddr_t buf, uint_t buflen, struct mrc *buf_handle); 279 static rdma_stat rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf, 280 struct mrc buf_handle); 281 static rdma_stat rib_registermemsync(CONN *conn, caddr_t adsp, caddr_t buf, 282 uint_t buflen, struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle, 283 void *lrc); 284 static rdma_stat rib_deregistermemsync(CONN *conn, caddr_t buf, 285 struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle, void *); 286 static rdma_stat rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle, 287 caddr_t buf, int len, int cpu); 288 289 static rdma_stat rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf); 290 291 static void rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf); 292 static void *rib_rbuf_alloc(CONN *, rdma_buf_t *); 293 294 static void rib_rbuf_free(CONN *conn, int ptype, void *buf); 295 296 static rdma_stat rib_send(CONN *conn, struct clist *cl, uint32_t msgid); 297 static rdma_stat rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid); 298 static rdma_stat rib_post_resp(CONN *conn, struct clist *cl, uint32_t msgid); 299 static rdma_stat rib_post_resp_remove(CONN *conn, uint32_t msgid); 300 static rdma_stat rib_post_recv(CONN *conn, struct clist *cl); 301 static rdma_stat rib_recv(CONN *conn, struct clist **clp, uint32_t msgid); 302 static rdma_stat rib_read(CONN *conn, struct clist *cl, int wait); 303 static rdma_stat rib_write(CONN *conn, struct clist *cl, int wait); 304 static rdma_stat rib_ping_srv(int addr_type, struct netbuf *, rpcib_ping_t *); 305 static rdma_stat rib_conn_get(struct netbuf *, int addr_type, void *, CONN **); 306 static rdma_stat rib_conn_release(CONN *conn); 307 static rdma_stat rib_getinfo(rdma_info_t *info); 308 309 static rib_lrc_entry_t *rib_get_cache_buf(CONN *conn, uint32_t len); 310 static void rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *buf); 311 static void rib_destroy_cache(rib_hca_t *hca); 312 static void rib_server_side_cache_reclaim(void *argp); 313 static int avl_compare(const void *t1, const void *t2); 314 315 static void rib_stop_services(rib_hca_t *); 316 static void rib_close_channels(rib_conn_list_t *); 317 static void rib_conn_close(void *); 318 319 /* 320 * RPCIB 
addressing operations 321 */ 322 323 /* 324 * RDMA operations the RPCIB module exports 325 */ 326 static rdmaops_t rib_ops = { 327 rib_reachable, 328 rib_conn_get, 329 rib_conn_release, 330 rib_listen, 331 rib_listen_stop, 332 rib_registermem, 333 rib_deregistermem, 334 rib_registermemsync, 335 rib_deregistermemsync, 336 rib_syncmem, 337 rib_reg_buf_alloc, 338 rib_reg_buf_free, 339 rib_send, 340 rib_send_resp, 341 rib_post_resp, 342 rib_post_resp_remove, 343 rib_post_recv, 344 rib_recv, 345 rib_read, 346 rib_write, 347 rib_getinfo, 348 }; 349 350 /* 351 * RDMATF RPCIB plugin details 352 */ 353 static rdma_mod_t rib_mod = { 354 "ibtf", /* api name */ 355 RDMATF_VERS_1, 356 0, 357 &rib_ops, /* rdma op vector for ibtf */ 358 }; 359 360 static rdma_stat open_hcas(rpcib_state_t *); 361 static rdma_stat rib_qp_init(rib_qp_t *, int); 362 static void rib_svc_scq_handler(ibt_cq_hdl_t, void *); 363 static void rib_clnt_scq_handler(ibt_cq_hdl_t, void *); 364 static void rib_clnt_rcq_handler(ibt_cq_hdl_t, void *); 365 static void rib_svc_rcq_handler(ibt_cq_hdl_t, void *); 366 static rib_bufpool_t *rib_rbufpool_create(rib_hca_t *hca, int ptype, int num); 367 static rdma_stat rib_reg_mem(rib_hca_t *, caddr_t adsp, caddr_t, uint_t, 368 ibt_mr_flags_t, ibt_mr_hdl_t *, ibt_mr_desc_t *); 369 static rdma_stat rib_reg_mem_user(rib_hca_t *, caddr_t, uint_t, ibt_mr_flags_t, 370 ibt_mr_hdl_t *, ibt_mr_desc_t *, caddr_t); 371 static rdma_stat rib_conn_to_srv(rib_hca_t *, rib_qp_t *, rpcib_ping_t *); 372 static rdma_stat rib_clnt_create_chan(rib_hca_t *, struct netbuf *, 373 rib_qp_t **); 374 static rdma_stat rib_svc_create_chan(rib_hca_t *, caddr_t, uint8_t, 375 rib_qp_t **); 376 static rdma_stat rib_sendwait(rib_qp_t *, struct send_wid *); 377 static struct send_wid *rib_init_sendwait(uint32_t, int, rib_qp_t *); 378 static int rib_free_sendwait(struct send_wid *); 379 static struct rdma_done_list *rdma_done_add(rib_qp_t *qp, uint32_t xid); 380 static void rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd); 381 static void rdma_done_rem_list(rib_qp_t *); 382 static void rdma_done_notify(rib_qp_t *qp, uint32_t xid); 383 384 static void rib_async_handler(void *, 385 ibt_hca_hdl_t, ibt_async_code_t, ibt_async_event_t *); 386 static rdma_stat rib_rem_rep(rib_qp_t *, struct reply *); 387 static struct svc_recv *rib_init_svc_recv(rib_qp_t *, ibt_wr_ds_t *); 388 static int rib_free_svc_recv(struct svc_recv *); 389 static struct recv_wid *rib_create_wid(rib_qp_t *, ibt_wr_ds_t *, uint32_t); 390 static void rib_free_wid(struct recv_wid *); 391 static rdma_stat rib_disconnect_channel(CONN *, rib_conn_list_t *); 392 static void rib_detach_hca(rib_hca_t *); 393 static void rib_close_a_channel(CONN *); 394 static void rib_send_hold(rib_qp_t *); 395 static void rib_send_rele(rib_qp_t *); 396 397 /* 398 * Registration with IBTF as a consumer 399 */ 400 static struct ibt_clnt_modinfo_s rib_modinfo = { 401 IBTI_V_CURR, 402 IBT_GENERIC, 403 rib_async_handler, /* async event handler */ 404 NULL, /* Memory Region Handler */ 405 "nfs/ib" 406 }; 407 408 /* 409 * Global strucuture 410 */ 411 412 typedef struct rpcib_s { 413 dev_info_t *rpcib_dip; 414 kmutex_t rpcib_mutex; 415 } rpcib_t; 416 417 rpcib_t rpcib; 418 419 /* 420 * /etc/system controlled variable to control 421 * debugging in rpcib kernel module. 422 * Set it to values greater that 1 to control 423 * the amount of debugging messages required. 
424 */ 425 int rib_debug = 0; 426 427 int 428 _init(void) 429 { 430 int error; 431 432 error = mod_install((struct modlinkage *)&rib_modlinkage); 433 if (error != 0) { 434 /* 435 * Could not load module 436 */ 437 return (error); 438 } 439 mutex_init(&plugin_state_lock, NULL, MUTEX_DRIVER, NULL); 440 return (0); 441 } 442 443 int 444 _fini() 445 { 446 int status; 447 448 /* 449 * Remove module 450 */ 451 if ((status = mod_remove(&rib_modlinkage)) != 0) { 452 return (status); 453 } 454 mutex_destroy(&plugin_state_lock); 455 return (0); 456 } 457 458 int 459 _info(struct modinfo *modinfop) 460 { 461 return (mod_info(&rib_modlinkage, modinfop)); 462 } 463 464 /* 465 * rpcib_getinfo() 466 * Given the device number, return the devinfo pointer or the 467 * instance number. 468 * Note: always succeed DDI_INFO_DEVT2INSTANCE, even before attach. 469 */ 470 471 /*ARGSUSED*/ 472 static int 473 rpcib_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result) 474 { 475 int ret = DDI_SUCCESS; 476 477 switch (cmd) { 478 case DDI_INFO_DEVT2DEVINFO: 479 if (rpcib.rpcib_dip != NULL) 480 *result = rpcib.rpcib_dip; 481 else { 482 *result = NULL; 483 ret = DDI_FAILURE; 484 } 485 break; 486 487 case DDI_INFO_DEVT2INSTANCE: 488 *result = NULL; 489 break; 490 491 default: 492 ret = DDI_FAILURE; 493 } 494 return (ret); 495 } 496 497 static int 498 rpcib_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 499 { 500 ibt_status_t ibt_status; 501 rdma_stat r_status; 502 503 switch (cmd) { 504 case DDI_ATTACH: 505 break; 506 case DDI_RESUME: 507 return (DDI_SUCCESS); 508 default: 509 return (DDI_FAILURE); 510 } 511 512 mutex_init(&rpcib.rpcib_mutex, NULL, MUTEX_DRIVER, NULL); 513 514 mutex_enter(&rpcib.rpcib_mutex); 515 if (rpcib.rpcib_dip != NULL) { 516 mutex_exit(&rpcib.rpcib_mutex); 517 return (DDI_FAILURE); 518 } 519 rpcib.rpcib_dip = dip; 520 mutex_exit(&rpcib.rpcib_mutex); 521 /* 522 * Create the "rpcib" minor-node. 
523 */ 524 if (ddi_create_minor_node(dip, 525 "rpcib", S_IFCHR, 0, DDI_PSEUDO, 0) != DDI_SUCCESS) { 526 /* Error message, no cmn_err as they print on console */ 527 return (DDI_FAILURE); 528 } 529 530 if (rib_stat == NULL) { 531 rib_stat = kmem_zalloc(sizeof (*rib_stat), KM_SLEEP); 532 mutex_init(&rib_stat->open_hca_lock, NULL, MUTEX_DRIVER, NULL); 533 } 534 535 rib_stat->hca_count = ibt_get_hca_list(&rib_stat->hca_guids); 536 if (rib_stat->hca_count < 1) { 537 mutex_destroy(&rib_stat->open_hca_lock); 538 kmem_free(rib_stat, sizeof (*rib_stat)); 539 rib_stat = NULL; 540 return (DDI_FAILURE); 541 } 542 543 ibt_status = ibt_attach(&rib_modinfo, dip, 544 (void *)rib_stat, &rib_stat->ibt_clnt_hdl); 545 546 if (ibt_status != IBT_SUCCESS) { 547 ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count); 548 mutex_destroy(&rib_stat->open_hca_lock); 549 kmem_free(rib_stat, sizeof (*rib_stat)); 550 rib_stat = NULL; 551 return (DDI_FAILURE); 552 } 553 554 mutex_enter(&rib_stat->open_hca_lock); 555 if (open_hcas(rib_stat) != RDMA_SUCCESS) { 556 mutex_exit(&rib_stat->open_hca_lock); 557 goto open_fail; 558 } 559 mutex_exit(&rib_stat->open_hca_lock); 560 561 if (ddi_prop_update_int(DDI_DEV_T_NONE, dip, DDI_NO_AUTODETACH, 1) != 562 DDI_PROP_SUCCESS) { 563 cmn_err(CE_WARN, "rpcib_attach: ddi-no-autodetach prop update " 564 "failed."); 565 goto register_fail; 566 } 567 568 /* 569 * Register with rdmatf 570 */ 571 rib_mod.rdma_count = rib_stat->nhca_inited; 572 r_status = rdma_register_mod(&rib_mod); 573 if (r_status != RDMA_SUCCESS && r_status != RDMA_REG_EXIST) { 574 cmn_err(CE_WARN, "rpcib_attach:rdma_register_mod failed, " 575 "status = %d", r_status); 576 goto register_fail; 577 } 578 579 return (DDI_SUCCESS); 580 581 register_fail: 582 rib_detach_hca(rib_stat->hca); 583 open_fail: 584 ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count); 585 (void) ibt_detach(rib_stat->ibt_clnt_hdl); 586 mutex_destroy(&rib_stat->open_hca_lock); 587 kmem_free(rib_stat, sizeof (*rib_stat)); 588 rib_stat = NULL; 589 return (DDI_FAILURE); 590 } 591 592 /*ARGSUSED*/ 593 static int 594 rpcib_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 595 { 596 switch (cmd) { 597 598 case DDI_DETACH: 599 break; 600 601 case DDI_SUSPEND: 602 default: 603 return (DDI_FAILURE); 604 } 605 606 /* 607 * Detach the hca and free resources 608 */ 609 mutex_enter(&plugin_state_lock); 610 plugin_state = NO_ACCEPT; 611 mutex_exit(&plugin_state_lock); 612 rib_detach_hca(rib_stat->hca); 613 ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count); 614 (void) ibt_detach(rib_stat->ibt_clnt_hdl); 615 mutex_destroy(&rib_stat->open_hca_lock); 616 if (rib_stat->hcas) { 617 kmem_free(rib_stat->hcas, rib_stat->hca_count * 618 sizeof (rib_hca_t)); 619 rib_stat->hcas = NULL; 620 } 621 kmem_free(rib_stat, sizeof (*rib_stat)); 622 rib_stat = NULL; 623 624 mutex_enter(&rpcib.rpcib_mutex); 625 rpcib.rpcib_dip = NULL; 626 mutex_exit(&rpcib.rpcib_mutex); 627 mutex_destroy(&rpcib.rpcib_mutex); 628 return (DDI_SUCCESS); 629 } 630 631 632 static void rib_rbufpool_free(rib_hca_t *, int); 633 static void rib_rbufpool_deregister(rib_hca_t *, int); 634 static void rib_rbufpool_destroy(rib_hca_t *hca, int ptype); 635 static struct reply *rib_addreplylist(rib_qp_t *, uint32_t); 636 static rdma_stat rib_rem_replylist(rib_qp_t *); 637 static int rib_remreply(rib_qp_t *, struct reply *); 638 static rdma_stat rib_add_connlist(CONN *, rib_conn_list_t *); 639 static rdma_stat rib_rm_conn(CONN *, rib_conn_list_t *); 640 641 642 /* 643 * One CQ pair per HCA 644 */ 645 
static rdma_stat 646 rib_create_cq(rib_hca_t *hca, uint32_t cq_size, ibt_cq_handler_t cq_handler, 647 rib_cq_t **cqp, rpcib_state_t *ribstat) 648 { 649 rib_cq_t *cq; 650 ibt_cq_attr_t cq_attr; 651 uint32_t real_size; 652 ibt_status_t status; 653 rdma_stat error = RDMA_SUCCESS; 654 655 cq = kmem_zalloc(sizeof (rib_cq_t), KM_SLEEP); 656 cq->rib_hca = hca; 657 cq_attr.cq_size = cq_size; 658 cq_attr.cq_flags = IBT_CQ_NO_FLAGS; 659 status = ibt_alloc_cq(hca->hca_hdl, &cq_attr, &cq->rib_cq_hdl, 660 &real_size); 661 if (status != IBT_SUCCESS) { 662 cmn_err(CE_WARN, "rib_create_cq: ibt_alloc_cq() failed," 663 " status=%d", status); 664 error = RDMA_FAILED; 665 goto fail; 666 } 667 ibt_set_cq_handler(cq->rib_cq_hdl, cq_handler, ribstat); 668 669 /* 670 * Enable CQ callbacks. CQ Callbacks are single shot 671 * (e.g. you have to call ibt_enable_cq_notify() 672 * after each callback to get another one). 673 */ 674 status = ibt_enable_cq_notify(cq->rib_cq_hdl, IBT_NEXT_COMPLETION); 675 if (status != IBT_SUCCESS) { 676 cmn_err(CE_WARN, "rib_create_cq: " 677 "enable_cq_notify failed, status %d", status); 678 error = RDMA_FAILED; 679 goto fail; 680 } 681 *cqp = cq; 682 683 return (error); 684 fail: 685 if (cq->rib_cq_hdl) 686 (void) ibt_free_cq(cq->rib_cq_hdl); 687 if (cq) 688 kmem_free(cq, sizeof (rib_cq_t)); 689 return (error); 690 } 691 692 static rdma_stat 693 open_hcas(rpcib_state_t *ribstat) 694 { 695 rib_hca_t *hca; 696 ibt_status_t ibt_status; 697 rdma_stat status; 698 ibt_hca_portinfo_t *pinfop; 699 ibt_pd_flags_t pd_flags = IBT_PD_NO_FLAGS; 700 uint_t size, cq_size; 701 int i; 702 kstat_t *ksp; 703 cache_avl_struct_t example_avl_node; 704 char rssc_name[32]; 705 706 ASSERT(MUTEX_HELD(&ribstat->open_hca_lock)); 707 708 if (ribstat->hcas == NULL) 709 ribstat->hcas = kmem_zalloc(ribstat->hca_count * 710 sizeof (rib_hca_t), KM_SLEEP); 711 712 /* 713 * Open a hca and setup for RDMA 714 */ 715 for (i = 0; i < ribstat->hca_count; i++) { 716 ibt_status = ibt_open_hca(ribstat->ibt_clnt_hdl, 717 ribstat->hca_guids[i], 718 &ribstat->hcas[i].hca_hdl); 719 if (ibt_status != IBT_SUCCESS) { 720 continue; 721 } 722 ribstat->hcas[i].hca_guid = ribstat->hca_guids[i]; 723 hca = &(ribstat->hcas[i]); 724 hca->ibt_clnt_hdl = ribstat->ibt_clnt_hdl; 725 hca->state = HCA_INITED; 726 727 /* 728 * query HCA info 729 */ 730 ibt_status = ibt_query_hca(hca->hca_hdl, &hca->hca_attrs); 731 if (ibt_status != IBT_SUCCESS) { 732 goto fail1; 733 } 734 735 /* 736 * One PD (Protection Domain) per HCA. 737 * A qp is allowed to access a memory region 738 * only when it's in the same PD as that of 739 * the memory region. 740 */ 741 ibt_status = ibt_alloc_pd(hca->hca_hdl, pd_flags, &hca->pd_hdl); 742 if (ibt_status != IBT_SUCCESS) { 743 goto fail1; 744 } 745 746 /* 747 * query HCA ports 748 */ 749 ibt_status = ibt_query_hca_ports(hca->hca_hdl, 750 0, &pinfop, &hca->hca_nports, &size); 751 if (ibt_status != IBT_SUCCESS) { 752 goto fail2; 753 } 754 hca->hca_ports = pinfop; 755 hca->hca_pinfosz = size; 756 pinfop = NULL; 757 758 cq_size = DEF_CQ_SIZE; /* default cq size */ 759 /* 760 * Create 2 pairs of cq's (1 pair for client 761 * and the other pair for server) on this hca. 762 * If number of qp's gets too large, then several 763 * cq's will be needed. 
764 */ 765 status = rib_create_cq(hca, cq_size, rib_svc_rcq_handler, 766 &hca->svc_rcq, ribstat); 767 if (status != RDMA_SUCCESS) { 768 goto fail3; 769 } 770 771 status = rib_create_cq(hca, cq_size, rib_svc_scq_handler, 772 &hca->svc_scq, ribstat); 773 if (status != RDMA_SUCCESS) { 774 goto fail3; 775 } 776 777 status = rib_create_cq(hca, cq_size, rib_clnt_rcq_handler, 778 &hca->clnt_rcq, ribstat); 779 if (status != RDMA_SUCCESS) { 780 goto fail3; 781 } 782 783 status = rib_create_cq(hca, cq_size, rib_clnt_scq_handler, 784 &hca->clnt_scq, ribstat); 785 if (status != RDMA_SUCCESS) { 786 goto fail3; 787 } 788 789 /* 790 * Create buffer pools. 791 * Note rib_rbuf_create also allocates memory windows. 792 */ 793 hca->recv_pool = rib_rbufpool_create(hca, 794 RECV_BUFFER, rib_max_rbufs); 795 if (hca->recv_pool == NULL) { 796 goto fail3; 797 } 798 799 hca->send_pool = rib_rbufpool_create(hca, 800 SEND_BUFFER, rib_max_rbufs); 801 if (hca->send_pool == NULL) { 802 rib_rbufpool_destroy(hca, RECV_BUFFER); 803 goto fail3; 804 } 805 806 if (hca->server_side_cache == NULL) { 807 (void) sprintf(rssc_name, 808 "rib_server_side_cache_%04d", i); 809 hca->server_side_cache = kmem_cache_create( 810 rssc_name, 811 sizeof (cache_avl_struct_t), 0, 812 NULL, 813 NULL, 814 rib_server_side_cache_reclaim, 815 hca, NULL, 0); 816 } 817 818 avl_create(&hca->avl_tree, 819 avl_compare, 820 sizeof (cache_avl_struct_t), 821 (uint_t)(uintptr_t)&example_avl_node.avl_link- 822 (uint_t)(uintptr_t)&example_avl_node); 823 824 rw_init(&hca->avl_rw_lock, 825 NULL, RW_DRIVER, hca->iblock); 826 mutex_init(&hca->cache_allocation, 827 NULL, MUTEX_DRIVER, NULL); 828 hca->avl_init = TRUE; 829 830 /* Create kstats for the cache */ 831 ASSERT(INGLOBALZONE(curproc)); 832 833 if (!stats_enabled) { 834 ksp = kstat_create_zone("unix", 0, "rpcib_cache", "rpc", 835 KSTAT_TYPE_NAMED, 836 sizeof (rpcib_kstat) / sizeof (kstat_named_t), 837 KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, 838 GLOBAL_ZONEID); 839 if (ksp) { 840 ksp->ks_data = (void *) &rpcib_kstat; 841 ksp->ks_update = rpcib_cache_kstat_update; 842 kstat_install(ksp); 843 stats_enabled = TRUE; 844 } 845 } 846 if (hca->cleanup_helper == NULL) { 847 hca->cleanup_helper = ddi_taskq_create(NULL, 848 "CLEANUP_HELPER", 1, TASKQ_DEFAULTPRI, 0); 849 } 850 851 /* 852 * Initialize the registered service list and 853 * the lock 854 */ 855 hca->service_list = NULL; 856 rw_init(&hca->service_list_lock, NULL, RW_DRIVER, hca->iblock); 857 858 mutex_init(&hca->cb_lock, NULL, MUTEX_DRIVER, hca->iblock); 859 cv_init(&hca->cb_cv, NULL, CV_DRIVER, NULL); 860 rw_init(&hca->cl_conn_list.conn_lock, NULL, RW_DRIVER, 861 hca->iblock); 862 rw_init(&hca->srv_conn_list.conn_lock, NULL, RW_DRIVER, 863 hca->iblock); 864 rw_init(&hca->state_lock, NULL, RW_DRIVER, hca->iblock); 865 mutex_init(&hca->inuse_lock, NULL, MUTEX_DRIVER, hca->iblock); 866 hca->inuse = TRUE; 867 /* 868 * XXX One hca only. Add multi-hca functionality if needed 869 * later. 
870 */ 871 ribstat->hca = hca; 872 ribstat->nhca_inited++; 873 ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz); 874 break; 875 876 fail3: 877 ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz); 878 fail2: 879 (void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl); 880 fail1: 881 (void) ibt_close_hca(hca->hca_hdl); 882 883 } 884 if (ribstat->hca != NULL) 885 return (RDMA_SUCCESS); 886 else 887 return (RDMA_FAILED); 888 } 889 890 /* 891 * Callback routines 892 */ 893 894 /* 895 * SCQ handlers 896 */ 897 /* ARGSUSED */ 898 static void 899 rib_clnt_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 900 { 901 ibt_status_t ibt_status; 902 ibt_wc_t wc; 903 struct send_wid *wd; 904 CONN *conn; 905 rib_qp_t *qp; 906 int i; 907 908 /* 909 * Re-enable cq notify here to avoid missing any 910 * completion queue notification. 911 */ 912 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION); 913 914 ibt_status = IBT_SUCCESS; 915 while (ibt_status != IBT_CQ_EMPTY) { 916 bzero(&wc, sizeof (wc)); 917 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL); 918 if (ibt_status != IBT_SUCCESS) 919 return; 920 921 /* 922 * Got a send completion 923 */ 924 if (wc.wc_id != RDMA_DUMMY_WRID) { 925 wd = (struct send_wid *)(uintptr_t)wc.wc_id; 926 qp = wd->qp; 927 conn = qptoc(qp); 928 929 mutex_enter(&wd->sendwait_lock); 930 switch (wc.wc_status) { 931 case IBT_WC_SUCCESS: 932 wd->status = RDMA_SUCCESS; 933 break; 934 default: 935 /* 936 * RC Send Q Error Code Local state Remote State 937 * ==================== =========== ============ 938 * IBT_WC_BAD_RESPONSE_ERR ERROR None 939 * IBT_WC_LOCAL_LEN_ERR ERROR None 940 * IBT_WC_LOCAL_CHAN_OP_ERR ERROR None 941 * IBT_WC_LOCAL_PROTECT_ERR ERROR None 942 * IBT_WC_MEM_WIN_BIND_ERR ERROR None 943 * IBT_WC_REMOTE_INVALID_REQ_ERR ERROR ERROR 944 * IBT_WC_REMOTE_ACCESS_ERR ERROR ERROR 945 * IBT_WC_REMOTE_OP_ERR ERROR ERROR 946 * IBT_WC_RNR_NAK_TIMEOUT_ERR ERROR None 947 * IBT_WC_TRANS_TIMEOUT_ERR ERROR None 948 * IBT_WC_WR_FLUSHED_ERR ERROR None 949 */ 950 /* 951 * Channel in error state. Set connection to 952 * ERROR and cleanup will happen either from 953 * conn_release or from rib_conn_get 954 */ 955 wd->status = RDMA_FAILED; 956 mutex_enter(&conn->c_lock); 957 if (conn->c_state != C_DISCONN_PEND) 958 conn->c_state = C_ERROR_CONN; 959 mutex_exit(&conn->c_lock); 960 break; 961 } 962 963 if (wd->cv_sig == 1) { 964 /* 965 * Notify poster 966 */ 967 cv_signal(&wd->wait_cv); 968 mutex_exit(&wd->sendwait_lock); 969 } else { 970 /* 971 * Poster not waiting for notification. 972 * Free the send buffers and send_wid 973 */ 974 for (i = 0; i < wd->nsbufs; i++) { 975 rib_rbuf_free(qptoc(wd->qp), 976 SEND_BUFFER, 977 (void *)(uintptr_t)wd->sbufaddr[i]); 978 } 979 980 /* decrement the send ref count */ 981 rib_send_rele(qp); 982 983 mutex_exit(&wd->sendwait_lock); 984 (void) rib_free_sendwait(wd); 985 } 986 } 987 } 988 } 989 990 /* ARGSUSED */ 991 static void 992 rib_svc_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 993 { 994 ibt_status_t ibt_status; 995 ibt_wc_t wc; 996 struct send_wid *wd; 997 rib_qp_t *qp; 998 CONN *conn; 999 int i; 1000 1001 /* 1002 * Re-enable cq notify here to avoid missing any 1003 * completion queue notification. 
1004 */ 1005 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION); 1006 1007 ibt_status = IBT_SUCCESS; 1008 while (ibt_status != IBT_CQ_EMPTY) { 1009 bzero(&wc, sizeof (wc)); 1010 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL); 1011 if (ibt_status != IBT_SUCCESS) 1012 return; 1013 1014 /* 1015 * Got a send completion 1016 */ 1017 if (wc.wc_id != RDMA_DUMMY_WRID) { 1018 wd = (struct send_wid *)(uintptr_t)wc.wc_id; 1019 qp = wd->qp; 1020 conn = qptoc(qp); 1021 mutex_enter(&wd->sendwait_lock); 1022 1023 switch (wc.wc_status) { 1024 case IBT_WC_SUCCESS: 1025 wd->status = RDMA_SUCCESS; 1026 break; 1027 default: 1028 /* 1029 * Channel in error state. Set connection to 1030 * ERROR and cleanup will happen either from 1031 * conn_release or conn timeout. 1032 */ 1033 wd->status = RDMA_FAILED; 1034 mutex_enter(&conn->c_lock); 1035 if (conn->c_state != C_DISCONN_PEND) 1036 conn->c_state = C_ERROR_CONN; 1037 mutex_exit(&conn->c_lock); 1038 break; 1039 } 1040 1041 if (wd->cv_sig == 1) { 1042 /* 1043 * Update completion status and notify poster 1044 */ 1045 cv_signal(&wd->wait_cv); 1046 mutex_exit(&wd->sendwait_lock); 1047 } else { 1048 /* 1049 * Poster not waiting for notification. 1050 * Free the send buffers and send_wid 1051 */ 1052 for (i = 0; i < wd->nsbufs; i++) { 1053 rib_rbuf_free(qptoc(wd->qp), 1054 SEND_BUFFER, 1055 (void *)(uintptr_t)wd->sbufaddr[i]); 1056 } 1057 1058 /* decrement the send ref count */ 1059 rib_send_rele(qp); 1060 1061 mutex_exit(&wd->sendwait_lock); 1062 (void) rib_free_sendwait(wd); 1063 } 1064 } 1065 } 1066 } 1067 1068 /* 1069 * RCQ handler 1070 */ 1071 /* ARGSUSED */ 1072 static void 1073 rib_clnt_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 1074 { 1075 rib_qp_t *qp; 1076 ibt_status_t ibt_status; 1077 ibt_wc_t wc; 1078 struct recv_wid *rwid; 1079 1080 /* 1081 * Re-enable cq notify here to avoid missing any 1082 * completion queue notification. 1083 */ 1084 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION); 1085 1086 ibt_status = IBT_SUCCESS; 1087 while (ibt_status != IBT_CQ_EMPTY) { 1088 bzero(&wc, sizeof (wc)); 1089 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL); 1090 if (ibt_status != IBT_SUCCESS) 1091 return; 1092 1093 rwid = (struct recv_wid *)(uintptr_t)wc.wc_id; 1094 qp = rwid->qp; 1095 if (wc.wc_status == IBT_WC_SUCCESS) { 1096 XDR inxdrs, *xdrs; 1097 uint_t xid, vers, op, find_xid = 0; 1098 struct reply *r; 1099 CONN *conn = qptoc(qp); 1100 uint32_t rdma_credit = 0; 1101 1102 xdrs = &inxdrs; 1103 xdrmem_create(xdrs, (caddr_t)(uintptr_t)rwid->addr, 1104 wc.wc_bytes_xfer, XDR_DECODE); 1105 /* 1106 * Treat xid as opaque (xid is the first entity 1107 * in the rpc rdma message). 1108 */ 1109 xid = *(uint32_t *)(uintptr_t)rwid->addr; 1110 1111 /* Skip xid and set the xdr position accordingly. */ 1112 XDR_SETPOS(xdrs, sizeof (uint32_t)); 1113 (void) xdr_u_int(xdrs, &vers); 1114 (void) xdr_u_int(xdrs, &rdma_credit); 1115 (void) xdr_u_int(xdrs, &op); 1116 XDR_DESTROY(xdrs); 1117 1118 if (vers != RPCRDMA_VERS) { 1119 /* 1120 * Invalid RPC/RDMA version. Cannot 1121 * interoperate. Set connection to 1122 * ERROR state and bail out. 
1123 */ 1124 mutex_enter(&conn->c_lock); 1125 if (conn->c_state != C_DISCONN_PEND) 1126 conn->c_state = C_ERROR_CONN; 1127 mutex_exit(&conn->c_lock); 1128 rib_rbuf_free(conn, RECV_BUFFER, 1129 (void *)(uintptr_t)rwid->addr); 1130 rib_free_wid(rwid); 1131 continue; 1132 } 1133 1134 mutex_enter(&qp->replylist_lock); 1135 for (r = qp->replylist; r != NULL; r = r->next) { 1136 if (r->xid == xid) { 1137 find_xid = 1; 1138 switch (op) { 1139 case RDMA_MSG: 1140 case RDMA_NOMSG: 1141 case RDMA_MSGP: 1142 r->status = RDMA_SUCCESS; 1143 r->vaddr_cq = rwid->addr; 1144 r->bytes_xfer = 1145 wc.wc_bytes_xfer; 1146 cv_signal(&r->wait_cv); 1147 break; 1148 default: 1149 rib_rbuf_free(qptoc(qp), 1150 RECV_BUFFER, 1151 (void *)(uintptr_t) 1152 rwid->addr); 1153 break; 1154 } 1155 break; 1156 } 1157 } 1158 mutex_exit(&qp->replylist_lock); 1159 if (find_xid == 0) { 1160 /* RPC caller not waiting for reply */ 1161 1162 DTRACE_PROBE1(rpcib__i__nomatchxid1, 1163 int, xid); 1164 1165 rib_rbuf_free(qptoc(qp), RECV_BUFFER, 1166 (void *)(uintptr_t)rwid->addr); 1167 } 1168 } else if (wc.wc_status == IBT_WC_WR_FLUSHED_ERR) { 1169 CONN *conn = qptoc(qp); 1170 1171 /* 1172 * Connection being flushed. Just free 1173 * the posted buffer 1174 */ 1175 rib_rbuf_free(conn, RECV_BUFFER, 1176 (void *)(uintptr_t)rwid->addr); 1177 } else { 1178 CONN *conn = qptoc(qp); 1179 /* 1180 * RC Recv Q Error Code Local state Remote State 1181 * ==================== =========== ============ 1182 * IBT_WC_LOCAL_ACCESS_ERR ERROR ERROR when NAK recvd 1183 * IBT_WC_LOCAL_LEN_ERR ERROR ERROR when NAK recvd 1184 * IBT_WC_LOCAL_PROTECT_ERR ERROR ERROR when NAK recvd 1185 * IBT_WC_LOCAL_CHAN_OP_ERR ERROR ERROR when NAK recvd 1186 * IBT_WC_REMOTE_INVALID_REQ_ERR ERROR ERROR when NAK recvd 1187 * IBT_WC_WR_FLUSHED_ERR None None 1188 */ 1189 /* 1190 * Channel in error state. Set connection 1191 * in ERROR state. 1192 */ 1193 mutex_enter(&conn->c_lock); 1194 if (conn->c_state != C_DISCONN_PEND) 1195 conn->c_state = C_ERROR_CONN; 1196 mutex_exit(&conn->c_lock); 1197 rib_rbuf_free(conn, RECV_BUFFER, 1198 (void *)(uintptr_t)rwid->addr); 1199 } 1200 rib_free_wid(rwid); 1201 } 1202 } 1203 1204 /* Server side */ 1205 /* ARGSUSED */ 1206 static void 1207 rib_svc_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 1208 { 1209 rdma_recv_data_t *rdp; 1210 rib_qp_t *qp; 1211 ibt_status_t ibt_status; 1212 ibt_wc_t wc; 1213 struct svc_recv *s_recvp; 1214 CONN *conn; 1215 mblk_t *mp; 1216 1217 /* 1218 * Re-enable cq notify here to avoid missing any 1219 * completion queue notification. 1220 */ 1221 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION); 1222 1223 ibt_status = IBT_SUCCESS; 1224 while (ibt_status != IBT_CQ_EMPTY) { 1225 bzero(&wc, sizeof (wc)); 1226 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL); 1227 if (ibt_status != IBT_SUCCESS) 1228 return; 1229 1230 s_recvp = (struct svc_recv *)(uintptr_t)wc.wc_id; 1231 qp = s_recvp->qp; 1232 conn = qptoc(qp); 1233 mutex_enter(&qp->posted_rbufs_lock); 1234 qp->n_posted_rbufs--; 1235 if (qp->n_posted_rbufs == 0) 1236 cv_signal(&qp->posted_rbufs_cv); 1237 mutex_exit(&qp->posted_rbufs_lock); 1238 1239 if (wc.wc_status == IBT_WC_SUCCESS) { 1240 XDR inxdrs, *xdrs; 1241 uint_t xid, vers, op; 1242 uint32_t rdma_credit; 1243 1244 xdrs = &inxdrs; 1245 /* s_recvp->vaddr stores data */ 1246 xdrmem_create(xdrs, (caddr_t)(uintptr_t)s_recvp->vaddr, 1247 wc.wc_bytes_xfer, XDR_DECODE); 1248 1249 /* 1250 * Treat xid as opaque (xid is the first entity 1251 * in the rpc rdma message). 
1252 */ 1253 xid = *(uint32_t *)(uintptr_t)s_recvp->vaddr; 1254 /* Skip xid and set the xdr position accordingly. */ 1255 XDR_SETPOS(xdrs, sizeof (uint32_t)); 1256 if (!xdr_u_int(xdrs, &vers) || 1257 !xdr_u_int(xdrs, &rdma_credit) || 1258 !xdr_u_int(xdrs, &op)) { 1259 rib_rbuf_free(conn, RECV_BUFFER, 1260 (void *)(uintptr_t)s_recvp->vaddr); 1261 XDR_DESTROY(xdrs); 1262 (void) rib_free_svc_recv(s_recvp); 1263 continue; 1264 } 1265 XDR_DESTROY(xdrs); 1266 1267 if (vers != RPCRDMA_VERS) { 1268 /* 1269 * Invalid RPC/RDMA version. 1270 * Drop rpc rdma message. 1271 */ 1272 rib_rbuf_free(conn, RECV_BUFFER, 1273 (void *)(uintptr_t)s_recvp->vaddr); 1274 (void) rib_free_svc_recv(s_recvp); 1275 continue; 1276 } 1277 /* 1278 * Is this for RDMA_DONE? 1279 */ 1280 if (op == RDMA_DONE) { 1281 rib_rbuf_free(conn, RECV_BUFFER, 1282 (void *)(uintptr_t)s_recvp->vaddr); 1283 /* 1284 * Wake up the thread waiting on 1285 * a RDMA_DONE for xid 1286 */ 1287 mutex_enter(&qp->rdlist_lock); 1288 rdma_done_notify(qp, xid); 1289 mutex_exit(&qp->rdlist_lock); 1290 (void) rib_free_svc_recv(s_recvp); 1291 continue; 1292 } 1293 1294 mutex_enter(&plugin_state_lock); 1295 if (plugin_state == ACCEPT) { 1296 while ((mp = allocb(sizeof (*rdp), BPRI_LO)) 1297 == NULL) 1298 (void) strwaitbuf( 1299 sizeof (*rdp), BPRI_LO); 1300 /* 1301 * Plugin is in accept state, hence the master 1302 * transport queue for this is still accepting 1303 * requests. Hence we can call svc_queuereq to 1304 * queue this recieved msg. 1305 */ 1306 rdp = (rdma_recv_data_t *)mp->b_rptr; 1307 rdp->conn = conn; 1308 rdp->rpcmsg.addr = 1309 (caddr_t)(uintptr_t)s_recvp->vaddr; 1310 rdp->rpcmsg.type = RECV_BUFFER; 1311 rdp->rpcmsg.len = wc.wc_bytes_xfer; 1312 rdp->status = wc.wc_status; 1313 mutex_enter(&conn->c_lock); 1314 conn->c_ref++; 1315 mutex_exit(&conn->c_lock); 1316 mp->b_wptr += sizeof (*rdp); 1317 svc_queuereq((queue_t *)rib_stat->q, mp); 1318 mutex_exit(&plugin_state_lock); 1319 } else { 1320 /* 1321 * The master transport for this is going 1322 * away and the queue is not accepting anymore 1323 * requests for krpc, so don't do anything, just 1324 * free the msg. 1325 */ 1326 mutex_exit(&plugin_state_lock); 1327 rib_rbuf_free(conn, RECV_BUFFER, 1328 (void *)(uintptr_t)s_recvp->vaddr); 1329 } 1330 } else { 1331 rib_rbuf_free(conn, RECV_BUFFER, 1332 (void *)(uintptr_t)s_recvp->vaddr); 1333 } 1334 (void) rib_free_svc_recv(s_recvp); 1335 } 1336 } 1337 1338 /* 1339 * Handles DR event of IBT_HCA_DETACH_EVENT. 
1340 */ 1341 /* ARGSUSED */ 1342 static void 1343 rib_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl, 1344 ibt_async_code_t code, ibt_async_event_t *event) 1345 { 1346 1347 switch (code) { 1348 case IBT_HCA_ATTACH_EVENT: 1349 /* ignore */ 1350 break; 1351 case IBT_HCA_DETACH_EVENT: 1352 { 1353 ASSERT(rib_stat->hca->hca_hdl == hca_hdl); 1354 rib_detach_hca(rib_stat->hca); 1355 #ifdef DEBUG 1356 cmn_err(CE_NOTE, "rib_async_handler(): HCA being detached!\n"); 1357 #endif 1358 break; 1359 } 1360 #ifdef DEBUG 1361 case IBT_EVENT_PATH_MIGRATED: 1362 cmn_err(CE_NOTE, "rib_async_handler(): " 1363 "IBT_EVENT_PATH_MIGRATED\n"); 1364 break; 1365 case IBT_EVENT_SQD: 1366 cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_SQD\n"); 1367 break; 1368 case IBT_EVENT_COM_EST: 1369 cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_COM_EST\n"); 1370 break; 1371 case IBT_ERROR_CATASTROPHIC_CHAN: 1372 cmn_err(CE_NOTE, "rib_async_handler(): " 1373 "IBT_ERROR_CATASTROPHIC_CHAN\n"); 1374 break; 1375 case IBT_ERROR_INVALID_REQUEST_CHAN: 1376 cmn_err(CE_NOTE, "rib_async_handler(): " 1377 "IBT_ERROR_INVALID_REQUEST_CHAN\n"); 1378 break; 1379 case IBT_ERROR_ACCESS_VIOLATION_CHAN: 1380 cmn_err(CE_NOTE, "rib_async_handler(): " 1381 "IBT_ERROR_ACCESS_VIOLATION_CHAN\n"); 1382 break; 1383 case IBT_ERROR_PATH_MIGRATE_REQ: 1384 cmn_err(CE_NOTE, "rib_async_handler(): " 1385 "IBT_ERROR_PATH_MIGRATE_REQ\n"); 1386 break; 1387 case IBT_ERROR_CQ: 1388 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_CQ\n"); 1389 break; 1390 case IBT_ERROR_PORT_DOWN: 1391 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_PORT_DOWN\n"); 1392 break; 1393 case IBT_EVENT_PORT_UP: 1394 cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_PORT_UP\n"); 1395 break; 1396 case IBT_ASYNC_OPAQUE1: 1397 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE1\n"); 1398 break; 1399 case IBT_ASYNC_OPAQUE2: 1400 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE2\n"); 1401 break; 1402 case IBT_ASYNC_OPAQUE3: 1403 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE3\n"); 1404 break; 1405 case IBT_ASYNC_OPAQUE4: 1406 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE4\n"); 1407 break; 1408 #endif 1409 default: 1410 break; 1411 } 1412 } 1413 1414 /* 1415 * Client's reachable function. 
1416 */ 1417 static rdma_stat 1418 rib_reachable(int addr_type, struct netbuf *raddr, void **handle) 1419 { 1420 rdma_stat status; 1421 rpcib_ping_t rpt; 1422 1423 /* 1424 * First check if a hca is still attached 1425 */ 1426 rw_enter(&rib_stat->hca->state_lock, RW_READER); 1427 if (rib_stat->hca->state != HCA_INITED) { 1428 rw_exit(&rib_stat->hca->state_lock); 1429 return (RDMA_FAILED); 1430 } 1431 1432 bzero(&rpt, sizeof (rpcib_ping_t)); 1433 status = rib_ping_srv(addr_type, raddr, &rpt); 1434 rw_exit(&rib_stat->hca->state_lock); 1435 1436 if (status == RDMA_SUCCESS) { 1437 *handle = (void *)rpt.hca; 1438 return (RDMA_SUCCESS); 1439 } else { 1440 *handle = NULL; 1441 DTRACE_PROBE(rpcib__i__pingfailed); 1442 return (RDMA_FAILED); 1443 } 1444 } 1445 1446 /* Client side qp creation */ 1447 static rdma_stat 1448 rib_clnt_create_chan(rib_hca_t *hca, struct netbuf *raddr, rib_qp_t **qp) 1449 { 1450 rib_qp_t *kqp = NULL; 1451 CONN *conn; 1452 rdma_clnt_cred_ctrl_t *cc_info; 1453 1454 ASSERT(qp != NULL); 1455 *qp = NULL; 1456 1457 kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP); 1458 conn = qptoc(kqp); 1459 kqp->hca = hca; 1460 kqp->rdmaconn.c_rdmamod = &rib_mod; 1461 kqp->rdmaconn.c_private = (caddr_t)kqp; 1462 1463 kqp->mode = RIB_CLIENT; 1464 kqp->chan_flags = IBT_BLOCKING; 1465 conn->c_raddr.buf = kmem_alloc(raddr->len, KM_SLEEP); 1466 bcopy(raddr->buf, conn->c_raddr.buf, raddr->len); 1467 conn->c_raddr.len = conn->c_raddr.maxlen = raddr->len; 1468 /* 1469 * Initialize 1470 */ 1471 cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL); 1472 cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL); 1473 mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock); 1474 cv_init(&kqp->send_rbufs_cv, NULL, CV_DEFAULT, NULL); 1475 mutex_init(&kqp->send_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock); 1476 mutex_init(&kqp->replylist_lock, NULL, MUTEX_DRIVER, hca->iblock); 1477 mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock); 1478 mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock); 1479 cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL); 1480 mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock); 1481 /* 1482 * Initialize the client credit control 1483 * portion of the rdmaconn struct. 
1484 */ 1485 kqp->rdmaconn.c_cc_type = RDMA_CC_CLNT; 1486 cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc; 1487 cc_info->clnt_cc_granted_ops = 0; 1488 cc_info->clnt_cc_in_flight_ops = 0; 1489 cv_init(&cc_info->clnt_cc_cv, NULL, CV_DEFAULT, NULL); 1490 1491 *qp = kqp; 1492 return (RDMA_SUCCESS); 1493 } 1494 1495 /* Server side qp creation */ 1496 static rdma_stat 1497 rib_svc_create_chan(rib_hca_t *hca, caddr_t q, uint8_t port, rib_qp_t **qp) 1498 { 1499 rib_qp_t *kqp = NULL; 1500 ibt_chan_sizes_t chan_sizes; 1501 ibt_rc_chan_alloc_args_t qp_attr; 1502 ibt_status_t ibt_status; 1503 rdma_srv_cred_ctrl_t *cc_info; 1504 1505 *qp = NULL; 1506 1507 kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP); 1508 kqp->hca = hca; 1509 kqp->port_num = port; 1510 kqp->rdmaconn.c_rdmamod = &rib_mod; 1511 kqp->rdmaconn.c_private = (caddr_t)kqp; 1512 1513 /* 1514 * Create the qp handle 1515 */ 1516 bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t)); 1517 qp_attr.rc_scq = hca->svc_scq->rib_cq_hdl; 1518 qp_attr.rc_rcq = hca->svc_rcq->rib_cq_hdl; 1519 qp_attr.rc_pd = hca->pd_hdl; 1520 qp_attr.rc_hca_port_num = port; 1521 qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX; 1522 qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX; 1523 qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE; 1524 qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE; 1525 qp_attr.rc_clone_chan = NULL; 1526 qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR; 1527 qp_attr.rc_flags = IBT_WR_SIGNALED; 1528 1529 rw_enter(&hca->state_lock, RW_READER); 1530 if (hca->state != HCA_DETACHED) { 1531 ibt_status = ibt_alloc_rc_channel(hca->hca_hdl, 1532 IBT_ACHAN_NO_FLAGS, &qp_attr, &kqp->qp_hdl, 1533 &chan_sizes); 1534 } else { 1535 rw_exit(&hca->state_lock); 1536 goto fail; 1537 } 1538 rw_exit(&hca->state_lock); 1539 1540 if (ibt_status != IBT_SUCCESS) { 1541 DTRACE_PROBE1(rpcib__i_svccreatechanfail, 1542 int, ibt_status); 1543 goto fail; 1544 } 1545 1546 kqp->mode = RIB_SERVER; 1547 kqp->chan_flags = IBT_BLOCKING; 1548 kqp->q = q; /* server ONLY */ 1549 1550 cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL); 1551 cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL); 1552 mutex_init(&kqp->replylist_lock, NULL, MUTEX_DEFAULT, hca->iblock); 1553 mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock); 1554 cv_init(&kqp->send_rbufs_cv, NULL, CV_DEFAULT, NULL); 1555 mutex_init(&kqp->send_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock); 1556 mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock); 1557 mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock); 1558 cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL); 1559 mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock); 1560 /* 1561 * Set the private data area to qp to be used in callbacks 1562 */ 1563 ibt_set_chan_private(kqp->qp_hdl, (void *)kqp); 1564 kqp->rdmaconn.c_state = C_CONNECTED; 1565 1566 /* 1567 * Initialize the server credit control 1568 * portion of the rdmaconn struct. 
1569 */ 1570 kqp->rdmaconn.c_cc_type = RDMA_CC_SRV; 1571 cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_srv_cc; 1572 cc_info->srv_cc_buffers_granted = preposted_rbufs; 1573 cc_info->srv_cc_cur_buffers_used = 0; 1574 cc_info->srv_cc_posted = preposted_rbufs; 1575 1576 *qp = kqp; 1577 1578 return (RDMA_SUCCESS); 1579 fail: 1580 if (kqp) 1581 kmem_free(kqp, sizeof (rib_qp_t)); 1582 1583 return (RDMA_FAILED); 1584 } 1585 1586 /* ARGSUSED */ 1587 ibt_cm_status_t 1588 rib_clnt_cm_handler(void *clnt_hdl, ibt_cm_event_t *event, 1589 ibt_cm_return_args_t *ret_args, void *priv_data, 1590 ibt_priv_data_len_t len) 1591 { 1592 rpcib_state_t *ribstat; 1593 rib_hca_t *hca; 1594 1595 ribstat = (rpcib_state_t *)clnt_hdl; 1596 hca = (rib_hca_t *)ribstat->hca; 1597 1598 switch (event->cm_type) { 1599 1600 /* got a connection close event */ 1601 case IBT_CM_EVENT_CONN_CLOSED: 1602 { 1603 CONN *conn; 1604 rib_qp_t *qp; 1605 1606 /* check reason why connection was closed */ 1607 switch (event->cm_event.closed) { 1608 case IBT_CM_CLOSED_DREP_RCVD: 1609 case IBT_CM_CLOSED_DREQ_TIMEOUT: 1610 case IBT_CM_CLOSED_DUP: 1611 case IBT_CM_CLOSED_ABORT: 1612 case IBT_CM_CLOSED_ALREADY: 1613 /* 1614 * These cases indicate the local end initiated 1615 * the closing of the channel. Nothing to do here. 1616 */ 1617 break; 1618 default: 1619 /* 1620 * Reason for CONN_CLOSED event must be one of 1621 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD 1622 * or IBT_CM_CLOSED_STALE. These indicate cases were 1623 * the remote end is closing the channel. In these 1624 * cases free the channel and transition to error 1625 * state 1626 */ 1627 qp = ibt_get_chan_private(event->cm_channel); 1628 conn = qptoc(qp); 1629 mutex_enter(&conn->c_lock); 1630 if (conn->c_state == C_DISCONN_PEND) { 1631 mutex_exit(&conn->c_lock); 1632 break; 1633 } 1634 1635 conn->c_state = C_ERROR_CONN; 1636 1637 /* 1638 * Free the conn if c_ref is down to 0 already 1639 */ 1640 if (conn->c_ref == 0) { 1641 /* 1642 * Remove from list and free conn 1643 */ 1644 conn->c_state = C_DISCONN_PEND; 1645 mutex_exit(&conn->c_lock); 1646 (void) rib_disconnect_channel(conn, 1647 &hca->cl_conn_list); 1648 } else { 1649 /* 1650 * conn will be freed when c_ref goes to 0. 1651 * Indicate to cleaning thread not to close 1652 * the connection, but just free the channel. 1653 */ 1654 conn->c_flags |= C_CLOSE_NOTNEEDED; 1655 mutex_exit(&conn->c_lock); 1656 } 1657 #ifdef DEBUG 1658 if (rib_debug) 1659 cmn_err(CE_NOTE, "rib_clnt_cm_handler: " 1660 "(CONN_CLOSED) channel disconnected"); 1661 #endif 1662 break; 1663 } 1664 break; 1665 } 1666 default: 1667 break; 1668 } 1669 return (IBT_CM_ACCEPT); 1670 } 1671 1672 /* 1673 * Connect to the server. 
1674 */ 1675 rdma_stat 1676 rib_conn_to_srv(rib_hca_t *hca, rib_qp_t *qp, rpcib_ping_t *rptp) 1677 { 1678 ibt_chan_open_args_t chan_args; /* channel args */ 1679 ibt_chan_sizes_t chan_sizes; 1680 ibt_rc_chan_alloc_args_t qp_attr; 1681 ibt_status_t ibt_status; 1682 ibt_rc_returns_t ret_args; /* conn reject info */ 1683 int refresh = REFRESH_ATTEMPTS; /* refresh if IBT_CM_CONN_STALE */ 1684 ibt_ip_cm_info_t ipcm_info; 1685 uint8_t cmp_ip_pvt[IBT_IP_HDR_PRIV_DATA_SZ]; 1686 1687 1688 (void) bzero(&chan_args, sizeof (chan_args)); 1689 (void) bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t)); 1690 (void) bzero(&ipcm_info, sizeof (ibt_ip_cm_info_t)); 1691 1692 ipcm_info.src_addr.family = rptp->srcip.family; 1693 switch (ipcm_info.src_addr.family) { 1694 case AF_INET: 1695 ipcm_info.src_addr.un.ip4addr = rptp->srcip.un.ip4addr; 1696 break; 1697 case AF_INET6: 1698 ipcm_info.src_addr.un.ip6addr = rptp->srcip.un.ip6addr; 1699 break; 1700 } 1701 1702 ipcm_info.dst_addr.family = rptp->srcip.family; 1703 switch (ipcm_info.dst_addr.family) { 1704 case AF_INET: 1705 ipcm_info.dst_addr.un.ip4addr = rptp->dstip.un.ip4addr; 1706 break; 1707 case AF_INET6: 1708 ipcm_info.dst_addr.un.ip6addr = rptp->dstip.un.ip6addr; 1709 break; 1710 } 1711 1712 ipcm_info.src_port = (in_port_t)nfs_rdma_port; 1713 1714 ibt_status = ibt_format_ip_private_data(&ipcm_info, 1715 IBT_IP_HDR_PRIV_DATA_SZ, cmp_ip_pvt); 1716 1717 if (ibt_status != IBT_SUCCESS) { 1718 cmn_err(CE_WARN, "ibt_format_ip_private_data failed\n"); 1719 return (-1); 1720 } 1721 1722 qp_attr.rc_hca_port_num = rptp->path.pi_prim_cep_path.cep_hca_port_num; 1723 /* Alloc a RC channel */ 1724 qp_attr.rc_scq = hca->clnt_scq->rib_cq_hdl; 1725 qp_attr.rc_rcq = hca->clnt_rcq->rib_cq_hdl; 1726 qp_attr.rc_pd = hca->pd_hdl; 1727 qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX; 1728 qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX; 1729 qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE; 1730 qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE; 1731 qp_attr.rc_clone_chan = NULL; 1732 qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR; 1733 qp_attr.rc_flags = IBT_WR_SIGNALED; 1734 1735 rptp->path.pi_sid = ibt_get_ip_sid(IPPROTO_TCP, nfs_rdma_port); 1736 chan_args.oc_path = &rptp->path; 1737 1738 chan_args.oc_cm_handler = rib_clnt_cm_handler; 1739 chan_args.oc_cm_clnt_private = (void *)rib_stat; 1740 chan_args.oc_rdma_ra_out = 4; 1741 chan_args.oc_rdma_ra_in = 4; 1742 chan_args.oc_path_retry_cnt = 2; 1743 chan_args.oc_path_rnr_retry_cnt = RNR_RETRIES; 1744 chan_args.oc_priv_data = cmp_ip_pvt; 1745 chan_args.oc_priv_data_len = IBT_IP_HDR_PRIV_DATA_SZ; 1746 1747 refresh: 1748 rw_enter(&hca->state_lock, RW_READER); 1749 if (hca->state != HCA_DETACHED) { 1750 ibt_status = ibt_alloc_rc_channel(hca->hca_hdl, 1751 IBT_ACHAN_NO_FLAGS, 1752 &qp_attr, &qp->qp_hdl, 1753 &chan_sizes); 1754 } else { 1755 rw_exit(&hca->state_lock); 1756 return (RDMA_FAILED); 1757 } 1758 rw_exit(&hca->state_lock); 1759 1760 if (ibt_status != IBT_SUCCESS) { 1761 DTRACE_PROBE1(rpcib__i_conntosrv, 1762 int, ibt_status); 1763 return (RDMA_FAILED); 1764 } 1765 1766 /* Connect to the Server */ 1767 (void) bzero(&ret_args, sizeof (ret_args)); 1768 mutex_enter(&qp->cb_lock); 1769 ibt_status = ibt_open_rc_channel(qp->qp_hdl, IBT_OCHAN_NO_FLAGS, 1770 IBT_BLOCKING, &chan_args, &ret_args); 1771 if (ibt_status != IBT_SUCCESS) { 1772 DTRACE_PROBE2(rpcib__i_openrctosrv, 1773 int, ibt_status, int, ret_args.rc_status); 1774 1775 (void) ibt_free_channel(qp->qp_hdl); 1776 qp->qp_hdl = NULL; 1777 mutex_exit(&qp->cb_lock); 1778 if (refresh-- && ibt_status == 
IBT_CM_FAILURE && 1779 ret_args.rc_status == IBT_CM_CONN_STALE) { 1780 /* 1781 * Got IBT_CM_CONN_STALE probably because of stale 1782 * data on the passive end of a channel that existed 1783 * prior to reboot. Retry establishing a channel 1784 * REFRESH_ATTEMPTS times, during which time the 1785 * stale conditions on the server might clear up. 1786 */ 1787 goto refresh; 1788 } 1789 return (RDMA_FAILED); 1790 } 1791 mutex_exit(&qp->cb_lock); 1792 /* 1793 * Set the private data area to qp to be used in callbacks 1794 */ 1795 ibt_set_chan_private(qp->qp_hdl, (void *)qp); 1796 return (RDMA_SUCCESS); 1797 } 1798 1799 rdma_stat 1800 rib_ping_srv(int addr_type, struct netbuf *raddr, rpcib_ping_t *rptp) 1801 { 1802 uint_t i; 1803 ibt_status_t ibt_status; 1804 uint8_t num_paths_p; 1805 ibt_ip_path_attr_t ipattr; 1806 ibt_path_ip_src_t srcip; 1807 rpcib_ipaddrs_t addrs4; 1808 rpcib_ipaddrs_t addrs6; 1809 struct sockaddr_in *sinp; 1810 struct sockaddr_in6 *sin6p; 1811 rdma_stat retval = RDMA_SUCCESS; 1812 1813 ASSERT(raddr->buf != NULL); 1814 1815 bzero(&ipattr, sizeof (ibt_ip_path_attr_t)); 1816 1817 if (!rpcib_get_ib_addresses(&addrs4, &addrs6) || 1818 (addrs4.ri_count == 0 && addrs6.ri_count == 0)) { 1819 retval = RDMA_FAILED; 1820 goto done; 1821 } 1822 1823 /* Prep the destination address */ 1824 switch (addr_type) { 1825 case AF_INET: 1826 sinp = (struct sockaddr_in *)raddr->buf; 1827 rptp->dstip.family = AF_INET; 1828 rptp->dstip.un.ip4addr = sinp->sin_addr.s_addr; 1829 sinp = addrs4.ri_list; 1830 1831 ipattr.ipa_dst_ip = &rptp->dstip; 1832 ipattr.ipa_hca_guid = rib_stat->hca->hca_guid; 1833 ipattr.ipa_ndst = 1; 1834 ipattr.ipa_max_paths = 1; 1835 ipattr.ipa_src_ip.family = rptp->dstip.family; 1836 for (i = 0; i < addrs4.ri_count; i++) { 1837 num_paths_p = 0; 1838 ipattr.ipa_src_ip.un.ip4addr = sinp[i].sin_addr.s_addr; 1839 bzero(&srcip, sizeof (ibt_path_ip_src_t)); 1840 1841 ibt_status = ibt_get_ip_paths(rib_stat->ibt_clnt_hdl, 1842 IBT_PATH_NO_FLAGS, &ipattr, &rptp->path, 1843 &num_paths_p, &srcip); 1844 if (ibt_status == IBT_SUCCESS && 1845 num_paths_p != 0 && 1846 rptp->path.pi_hca_guid == rib_stat->hca->hca_guid) { 1847 rptp->hca = rib_stat->hca; 1848 rptp->srcip.family = AF_INET; 1849 rptp->srcip.un.ip4addr = 1850 srcip.ip_primary.un.ip4addr; 1851 goto done; 1852 } 1853 } 1854 retval = RDMA_FAILED; 1855 break; 1856 1857 case AF_INET6: 1858 sin6p = (struct sockaddr_in6 *)raddr->buf; 1859 rptp->dstip.family = AF_INET6; 1860 rptp->dstip.un.ip6addr = sin6p->sin6_addr; 1861 sin6p = addrs6.ri_list; 1862 1863 ipattr.ipa_dst_ip = &rptp->dstip; 1864 ipattr.ipa_hca_guid = rib_stat->hca->hca_guid; 1865 ipattr.ipa_ndst = 1; 1866 ipattr.ipa_max_paths = 1; 1867 ipattr.ipa_src_ip.family = rptp->dstip.family; 1868 for (i = 0; i < addrs6.ri_count; i++) { 1869 num_paths_p = 0; 1870 ipattr.ipa_src_ip.un.ip6addr = sin6p[i].sin6_addr; 1871 bzero(&srcip, sizeof (ibt_path_ip_src_t)); 1872 1873 ibt_status = ibt_get_ip_paths(rib_stat->ibt_clnt_hdl, 1874 IBT_PATH_NO_FLAGS, &ipattr, &rptp->path, 1875 &num_paths_p, &srcip); 1876 if (ibt_status == IBT_SUCCESS && 1877 num_paths_p != 0 && 1878 rptp->path.pi_hca_guid == rib_stat->hca->hca_guid) { 1879 rptp->hca = rib_stat->hca; 1880 rptp->srcip.family = AF_INET6; 1881 rptp->srcip.un.ip6addr = 1882 srcip.ip_primary.un.ip6addr; 1883 goto done; 1884 } 1885 } 1886 retval = RDMA_FAILED; 1887 break; 1888 1889 default: 1890 retval = RDMA_INVAL; 1891 break; 1892 } 1893 done: 1894 1895 if (addrs4.ri_size > 0) 1896 kmem_free(addrs4.ri_list, addrs4.ri_size); 1897 if 
(addrs6.ri_size > 0) 1898 kmem_free(addrs6.ri_list, addrs6.ri_size); 1899 return (retval); 1900 } 1901 1902 /* 1903 * Close channel, remove from connection list and 1904 * free up resources allocated for that channel. 1905 */ 1906 rdma_stat 1907 rib_disconnect_channel(CONN *conn, rib_conn_list_t *conn_list) 1908 { 1909 rib_qp_t *qp = ctoqp(conn); 1910 rib_hca_t *hca; 1911 1912 mutex_enter(&conn->c_lock); 1913 if (conn->c_timeout != NULL) { 1914 mutex_exit(&conn->c_lock); 1915 (void) untimeout(conn->c_timeout); 1916 mutex_enter(&conn->c_lock); 1917 } 1918 1919 while (conn->c_flags & C_CLOSE_PENDING) { 1920 cv_wait(&conn->c_cv, &conn->c_lock); 1921 } 1922 mutex_exit(&conn->c_lock); 1923 1924 /* 1925 * c_ref == 0 and connection is in C_DISCONN_PEND 1926 */ 1927 hca = qp->hca; 1928 if (conn_list != NULL) 1929 (void) rib_rm_conn(conn, conn_list); 1930 1931 /* 1932 * There is only one case where we get here with 1933 * qp_hdl = NULL, which is during connection setup on 1934 * the client. In such a case there are no posted 1935 * send/recv buffers. 1936 */ 1937 if (qp->qp_hdl != NULL) { 1938 mutex_enter(&qp->posted_rbufs_lock); 1939 while (qp->n_posted_rbufs) 1940 cv_wait(&qp->posted_rbufs_cv, &qp->posted_rbufs_lock); 1941 mutex_exit(&qp->posted_rbufs_lock); 1942 1943 mutex_enter(&qp->send_rbufs_lock); 1944 while (qp->n_send_rbufs) 1945 cv_wait(&qp->send_rbufs_cv, &qp->send_rbufs_lock); 1946 mutex_exit(&qp->send_rbufs_lock); 1947 1948 (void) ibt_free_channel(qp->qp_hdl); 1949 qp->qp_hdl = NULL; 1950 } 1951 1952 ASSERT(qp->rdlist == NULL); 1953 1954 if (qp->replylist != NULL) { 1955 (void) rib_rem_replylist(qp); 1956 } 1957 1958 cv_destroy(&qp->cb_conn_cv); 1959 cv_destroy(&qp->posted_rbufs_cv); 1960 cv_destroy(&qp->send_rbufs_cv); 1961 mutex_destroy(&qp->cb_lock); 1962 mutex_destroy(&qp->replylist_lock); 1963 mutex_destroy(&qp->posted_rbufs_lock); 1964 mutex_destroy(&qp->send_rbufs_lock); 1965 mutex_destroy(&qp->rdlist_lock); 1966 1967 cv_destroy(&conn->c_cv); 1968 mutex_destroy(&conn->c_lock); 1969 1970 if (conn->c_raddr.buf != NULL) { 1971 kmem_free(conn->c_raddr.buf, conn->c_raddr.len); 1972 } 1973 if (conn->c_laddr.buf != NULL) { 1974 kmem_free(conn->c_laddr.buf, conn->c_laddr.len); 1975 } 1976 1977 /* 1978 * Credit control cleanup. 1979 */ 1980 if (qp->rdmaconn.c_cc_type == RDMA_CC_CLNT) { 1981 rdma_clnt_cred_ctrl_t *cc_info; 1982 cc_info = &qp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc; 1983 cv_destroy(&cc_info->clnt_cc_cv); 1984 } 1985 1986 kmem_free(qp, sizeof (rib_qp_t)); 1987 1988 /* 1989 * If HCA has been DETACHED and the srv/clnt_conn_list is NULL, 1990 * then the hca is no longer being used. 1991 */ 1992 if (conn_list != NULL) { 1993 rw_enter(&hca->state_lock, RW_READER); 1994 if (hca->state == HCA_DETACHED) { 1995 rw_enter(&hca->srv_conn_list.conn_lock, RW_READER); 1996 if (hca->srv_conn_list.conn_hd == NULL) { 1997 rw_enter(&hca->cl_conn_list.conn_lock, 1998 RW_READER); 1999 2000 if (hca->cl_conn_list.conn_hd == NULL) { 2001 mutex_enter(&hca->inuse_lock); 2002 hca->inuse = FALSE; 2003 cv_signal(&hca->cb_cv); 2004 mutex_exit(&hca->inuse_lock); 2005 } 2006 rw_exit(&hca->cl_conn_list.conn_lock); 2007 } 2008 rw_exit(&hca->srv_conn_list.conn_lock); 2009 } 2010 rw_exit(&hca->state_lock); 2011 } 2012 2013 return (RDMA_SUCCESS); 2014 } 2015 2016 /* 2017 * All sends are done under the protection of 2018 * the wdesc->sendwait_lock. n_send_rbufs count 2019 * is protected using the send_rbufs_lock. 
2020 * lock ordering is: 2021 * sendwait_lock -> send_rbufs_lock 2022 */ 2023 2024 void 2025 rib_send_hold(rib_qp_t *qp) 2026 { 2027 mutex_enter(&qp->send_rbufs_lock); 2028 qp->n_send_rbufs++; 2029 mutex_exit(&qp->send_rbufs_lock); 2030 } 2031 2032 void 2033 rib_send_rele(rib_qp_t *qp) 2034 { 2035 mutex_enter(&qp->send_rbufs_lock); 2036 qp->n_send_rbufs--; 2037 if (qp->n_send_rbufs == 0) 2038 cv_signal(&qp->send_rbufs_cv); 2039 mutex_exit(&qp->send_rbufs_lock); 2040 } 2041 2042 /* 2043 * Wait for send completion notification. Only on receiving a 2044 * notification be it a successful or error completion, free the 2045 * send_wid. 2046 */ 2047 static rdma_stat 2048 rib_sendwait(rib_qp_t *qp, struct send_wid *wd) 2049 { 2050 clock_t timout, cv_wait_ret; 2051 rdma_stat error = RDMA_SUCCESS; 2052 int i; 2053 2054 /* 2055 * Wait for send to complete 2056 */ 2057 ASSERT(wd != NULL); 2058 mutex_enter(&wd->sendwait_lock); 2059 if (wd->status == (uint_t)SEND_WAIT) { 2060 timout = drv_usectohz(SEND_WAIT_TIME * 1000000) + 2061 ddi_get_lbolt(); 2062 2063 if (qp->mode == RIB_SERVER) { 2064 while ((cv_wait_ret = cv_timedwait(&wd->wait_cv, 2065 &wd->sendwait_lock, timout)) > 0 && 2066 wd->status == (uint_t)SEND_WAIT) 2067 ; 2068 switch (cv_wait_ret) { 2069 case -1: /* timeout */ 2070 DTRACE_PROBE(rpcib__i__srvsendwait__timeout); 2071 2072 wd->cv_sig = 0; /* no signal needed */ 2073 error = RDMA_TIMEDOUT; 2074 break; 2075 default: /* got send completion */ 2076 break; 2077 } 2078 } else { 2079 while ((cv_wait_ret = cv_timedwait_sig(&wd->wait_cv, 2080 &wd->sendwait_lock, timout)) > 0 && 2081 wd->status == (uint_t)SEND_WAIT) 2082 ; 2083 switch (cv_wait_ret) { 2084 case -1: /* timeout */ 2085 DTRACE_PROBE(rpcib__i__clntsendwait__timeout); 2086 2087 wd->cv_sig = 0; /* no signal needed */ 2088 error = RDMA_TIMEDOUT; 2089 break; 2090 case 0: /* interrupted */ 2091 DTRACE_PROBE(rpcib__i__clntsendwait__intr); 2092 2093 wd->cv_sig = 0; /* no signal needed */ 2094 error = RDMA_INTR; 2095 break; 2096 default: /* got send completion */ 2097 break; 2098 } 2099 } 2100 } 2101 2102 if (wd->status != (uint_t)SEND_WAIT) { 2103 /* got send completion */ 2104 if (wd->status != RDMA_SUCCESS) { 2105 switch (wd->status) { 2106 case RDMA_CONNLOST: 2107 error = RDMA_CONNLOST; 2108 break; 2109 default: 2110 error = RDMA_FAILED; 2111 break; 2112 } 2113 } 2114 for (i = 0; i < wd->nsbufs; i++) { 2115 rib_rbuf_free(qptoc(qp), SEND_BUFFER, 2116 (void *)(uintptr_t)wd->sbufaddr[i]); 2117 } 2118 2119 rib_send_rele(qp); 2120 2121 mutex_exit(&wd->sendwait_lock); 2122 (void) rib_free_sendwait(wd); 2123 2124 } else { 2125 mutex_exit(&wd->sendwait_lock); 2126 } 2127 return (error); 2128 } 2129 2130 static struct send_wid * 2131 rib_init_sendwait(uint32_t xid, int cv_sig, rib_qp_t *qp) 2132 { 2133 struct send_wid *wd; 2134 2135 wd = kmem_zalloc(sizeof (struct send_wid), KM_SLEEP); 2136 wd->xid = xid; 2137 wd->cv_sig = cv_sig; 2138 wd->qp = qp; 2139 cv_init(&wd->wait_cv, NULL, CV_DEFAULT, NULL); 2140 mutex_init(&wd->sendwait_lock, NULL, MUTEX_DRIVER, NULL); 2141 wd->status = (uint_t)SEND_WAIT; 2142 2143 return (wd); 2144 } 2145 2146 static int 2147 rib_free_sendwait(struct send_wid *wdesc) 2148 { 2149 cv_destroy(&wdesc->wait_cv); 2150 mutex_destroy(&wdesc->sendwait_lock); 2151 kmem_free(wdesc, sizeof (*wdesc)); 2152 2153 return (0); 2154 } 2155 2156 static rdma_stat 2157 rib_rem_rep(rib_qp_t *qp, struct reply *rep) 2158 { 2159 mutex_enter(&qp->replylist_lock); 2160 if (rep != NULL) { 2161 (void) rib_remreply(qp, rep); 2162 
mutex_exit(&qp->replylist_lock); 2163 return (RDMA_SUCCESS); 2164 } 2165 mutex_exit(&qp->replylist_lock); 2166 return (RDMA_FAILED); 2167 } 2168 2169 /* 2170 * Send buffers are freed here only in case of error in posting 2171 * on QP. If the post succeeded, the send buffers are freed upon 2172 * send completion in rib_sendwait() or in the scq_handler. 2173 */ 2174 rdma_stat 2175 rib_send_and_wait(CONN *conn, struct clist *cl, uint32_t msgid, 2176 int send_sig, int cv_sig, caddr_t *swid) 2177 { 2178 struct send_wid *wdesc; 2179 struct clist *clp; 2180 ibt_status_t ibt_status = IBT_SUCCESS; 2181 rdma_stat ret = RDMA_SUCCESS; 2182 ibt_send_wr_t tx_wr; 2183 int i, nds; 2184 ibt_wr_ds_t sgl[DSEG_MAX]; 2185 uint_t total_msg_size; 2186 rib_qp_t *qp; 2187 2188 qp = ctoqp(conn); 2189 2190 ASSERT(cl != NULL); 2191 2192 bzero(&tx_wr, sizeof (ibt_send_wr_t)); 2193 2194 nds = 0; 2195 total_msg_size = 0; 2196 clp = cl; 2197 while (clp != NULL) { 2198 if (nds >= DSEG_MAX) { 2199 DTRACE_PROBE(rpcib__i__sendandwait_dsegmax_exceeded); 2200 return (RDMA_FAILED); 2201 } 2202 sgl[nds].ds_va = clp->w.c_saddr; 2203 sgl[nds].ds_key = clp->c_smemhandle.mrc_lmr; /* lkey */ 2204 sgl[nds].ds_len = clp->c_len; 2205 total_msg_size += clp->c_len; 2206 clp = clp->c_next; 2207 nds++; 2208 } 2209 2210 if (send_sig) { 2211 /* Set SEND_SIGNAL flag. */ 2212 tx_wr.wr_flags = IBT_WR_SEND_SIGNAL; 2213 wdesc = rib_init_sendwait(msgid, cv_sig, qp); 2214 *swid = (caddr_t)wdesc; 2215 tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc; 2216 mutex_enter(&wdesc->sendwait_lock); 2217 wdesc->nsbufs = nds; 2218 for (i = 0; i < nds; i++) { 2219 wdesc->sbufaddr[i] = sgl[i].ds_va; 2220 } 2221 } else { 2222 tx_wr.wr_flags = IBT_WR_NO_FLAGS; 2223 *swid = NULL; 2224 tx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID; 2225 } 2226 2227 tx_wr.wr_opcode = IBT_WRC_SEND; 2228 tx_wr.wr_trans = IBT_RC_SRV; 2229 tx_wr.wr_nds = nds; 2230 tx_wr.wr_sgl = sgl; 2231 2232 mutex_enter(&conn->c_lock); 2233 if (conn->c_state == C_CONNECTED) { 2234 ibt_status = ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL); 2235 } 2236 if (conn->c_state != C_CONNECTED || 2237 ibt_status != IBT_SUCCESS) { 2238 if (conn->c_state != C_DISCONN_PEND) 2239 conn->c_state = C_ERROR_CONN; 2240 mutex_exit(&conn->c_lock); 2241 if (send_sig) { 2242 for (i = 0; i < nds; i++) { 2243 rib_rbuf_free(conn, SEND_BUFFER, 2244 (void *)(uintptr_t)wdesc->sbufaddr[i]); 2245 } 2246 mutex_exit(&wdesc->sendwait_lock); 2247 (void) rib_free_sendwait(wdesc); 2248 } 2249 return (RDMA_CONNLOST); 2250 } 2251 2252 mutex_exit(&conn->c_lock); 2253 2254 if (send_sig) { 2255 rib_send_hold(qp); 2256 mutex_exit(&wdesc->sendwait_lock); 2257 if (cv_sig) { 2258 /* 2259 * cv_wait for send to complete. 2260 * We can fail due to a timeout or signal or 2261 * unsuccessful send. 2262 */ 2263 ret = rib_sendwait(qp, wdesc); 2264 2265 return (ret); 2266 } 2267 } 2268 2269 return (RDMA_SUCCESS); 2270 } 2271 2272 2273 rdma_stat 2274 rib_send(CONN *conn, struct clist *cl, uint32_t msgid) 2275 { 2276 rdma_stat ret; 2277 caddr_t wd; 2278 2279 /* send-wait & cv_signal */ 2280 ret = rib_send_and_wait(conn, cl, msgid, 1, 1, &wd); 2281 return (ret); 2282 } 2283 2284 /* 2285 * Deprecated/obsolete interface not used currently 2286 * but earlier used for READ-READ protocol. 2287 * Send RPC reply and wait for RDMA_DONE. 
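 * An rdma_done entry keyed by the msgid is queued first, the reply is
 * then posted signaled but without waiting for its completion, and the
 * caller blocks on the entry's rdma_done_cv (under rdlist_lock) until
 * rdma_done_notify() wakes it for this xid or REPLY_WAIT_TIME expires,
 * in which case RDMA_TIMEDOUT is returned.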
2288 */ 2289 rdma_stat 2290 rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid) 2291 { 2292 rdma_stat ret = RDMA_SUCCESS; 2293 struct rdma_done_list *rd; 2294 clock_t timout, cv_wait_ret; 2295 caddr_t *wid = NULL; 2296 rib_qp_t *qp = ctoqp(conn); 2297 2298 mutex_enter(&qp->rdlist_lock); 2299 rd = rdma_done_add(qp, msgid); 2300 2301 /* No cv_signal (whether send-wait or no-send-wait) */ 2302 ret = rib_send_and_wait(conn, cl, msgid, 1, 0, wid); 2303 2304 if (ret != RDMA_SUCCESS) { 2305 rdma_done_rm(qp, rd); 2306 } else { 2307 /* 2308 * Wait for RDMA_DONE from remote end 2309 */ 2310 timout = 2311 drv_usectohz(REPLY_WAIT_TIME * 1000000) + ddi_get_lbolt(); 2312 cv_wait_ret = cv_timedwait(&rd->rdma_done_cv, 2313 &qp->rdlist_lock, 2314 timout); 2315 2316 rdma_done_rm(qp, rd); 2317 2318 if (cv_wait_ret < 0) { 2319 ret = RDMA_TIMEDOUT; 2320 } 2321 } 2322 2323 mutex_exit(&qp->rdlist_lock); 2324 return (ret); 2325 } 2326 2327 static struct recv_wid * 2328 rib_create_wid(rib_qp_t *qp, ibt_wr_ds_t *sgl, uint32_t msgid) 2329 { 2330 struct recv_wid *rwid; 2331 2332 rwid = kmem_zalloc(sizeof (struct recv_wid), KM_SLEEP); 2333 rwid->xid = msgid; 2334 rwid->addr = sgl->ds_va; 2335 rwid->qp = qp; 2336 2337 return (rwid); 2338 } 2339 2340 static void 2341 rib_free_wid(struct recv_wid *rwid) 2342 { 2343 kmem_free(rwid, sizeof (struct recv_wid)); 2344 } 2345 2346 rdma_stat 2347 rib_clnt_post(CONN* conn, struct clist *cl, uint32_t msgid) 2348 { 2349 rib_qp_t *qp = ctoqp(conn); 2350 struct clist *clp = cl; 2351 struct reply *rep; 2352 struct recv_wid *rwid; 2353 int nds; 2354 ibt_wr_ds_t sgl[DSEG_MAX]; 2355 ibt_recv_wr_t recv_wr; 2356 rdma_stat ret; 2357 ibt_status_t ibt_status; 2358 2359 /* 2360 * rdma_clnt_postrecv uses RECV_BUFFER. 2361 */ 2362 2363 nds = 0; 2364 while (cl != NULL) { 2365 if (nds >= DSEG_MAX) { 2366 ret = RDMA_FAILED; 2367 goto done; 2368 } 2369 sgl[nds].ds_va = cl->w.c_saddr; 2370 sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */ 2371 sgl[nds].ds_len = cl->c_len; 2372 cl = cl->c_next; 2373 nds++; 2374 } 2375 2376 if (nds != 1) { 2377 ret = RDMA_FAILED; 2378 goto done; 2379 } 2380 2381 bzero(&recv_wr, sizeof (ibt_recv_wr_t)); 2382 recv_wr.wr_nds = nds; 2383 recv_wr.wr_sgl = sgl; 2384 2385 rwid = rib_create_wid(qp, &sgl[0], msgid); 2386 if (rwid) { 2387 recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)rwid; 2388 } else { 2389 ret = RDMA_NORESOURCE; 2390 goto done; 2391 } 2392 rep = rib_addreplylist(qp, msgid); 2393 if (!rep) { 2394 rib_free_wid(rwid); 2395 ret = RDMA_NORESOURCE; 2396 goto done; 2397 } 2398 2399 mutex_enter(&conn->c_lock); 2400 2401 if (conn->c_state == C_CONNECTED) { 2402 ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL); 2403 } 2404 2405 if (conn->c_state != C_CONNECTED || 2406 ibt_status != IBT_SUCCESS) { 2407 if (conn->c_state != C_DISCONN_PEND) 2408 conn->c_state = C_ERROR_CONN; 2409 mutex_exit(&conn->c_lock); 2410 rib_free_wid(rwid); 2411 (void) rib_rem_rep(qp, rep); 2412 ret = RDMA_CONNLOST; 2413 goto done; 2414 } 2415 mutex_exit(&conn->c_lock); 2416 return (RDMA_SUCCESS); 2417 2418 done: 2419 while (clp != NULL) { 2420 rib_rbuf_free(conn, RECV_BUFFER, 2421 (void *)(uintptr_t)clp->w.c_saddr3); 2422 clp = clp->c_next; 2423 } 2424 return (ret); 2425 } 2426 2427 rdma_stat 2428 rib_svc_post(CONN* conn, struct clist *cl) 2429 { 2430 rib_qp_t *qp = ctoqp(conn); 2431 struct svc_recv *s_recvp; 2432 int nds; 2433 ibt_wr_ds_t sgl[DSEG_MAX]; 2434 ibt_recv_wr_t recv_wr; 2435 ibt_status_t ibt_status; 2436 2437 nds = 0; 2438 while (cl != NULL) { 2439 if (nds >= DSEG_MAX) { 
2440 return (RDMA_FAILED); 2441 } 2442 sgl[nds].ds_va = cl->w.c_saddr; 2443 sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */ 2444 sgl[nds].ds_len = cl->c_len; 2445 cl = cl->c_next; 2446 nds++; 2447 } 2448 2449 if (nds != 1) { 2450 rib_rbuf_free(conn, RECV_BUFFER, 2451 (caddr_t)(uintptr_t)sgl[0].ds_va); 2452 2453 return (RDMA_FAILED); 2454 } 2455 2456 bzero(&recv_wr, sizeof (ibt_recv_wr_t)); 2457 recv_wr.wr_nds = nds; 2458 recv_wr.wr_sgl = sgl; 2459 2460 s_recvp = rib_init_svc_recv(qp, &sgl[0]); 2461 /* Use s_recvp's addr as wr id */ 2462 recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)s_recvp; 2463 mutex_enter(&conn->c_lock); 2464 if (conn->c_state == C_CONNECTED) { 2465 ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL); 2466 } 2467 if (conn->c_state != C_CONNECTED || 2468 ibt_status != IBT_SUCCESS) { 2469 if (conn->c_state != C_DISCONN_PEND) 2470 conn->c_state = C_ERROR_CONN; 2471 mutex_exit(&conn->c_lock); 2472 rib_rbuf_free(conn, RECV_BUFFER, 2473 (caddr_t)(uintptr_t)sgl[0].ds_va); 2474 (void) rib_free_svc_recv(s_recvp); 2475 2476 return (RDMA_CONNLOST); 2477 } 2478 mutex_exit(&conn->c_lock); 2479 2480 return (RDMA_SUCCESS); 2481 } 2482 2483 /* Client */ 2484 rdma_stat 2485 rib_post_resp(CONN* conn, struct clist *cl, uint32_t msgid) 2486 { 2487 2488 return (rib_clnt_post(conn, cl, msgid)); 2489 } 2490 2491 /* Client */ 2492 rdma_stat 2493 rib_post_resp_remove(CONN* conn, uint32_t msgid) 2494 { 2495 rib_qp_t *qp = ctoqp(conn); 2496 struct reply *rep; 2497 2498 mutex_enter(&qp->replylist_lock); 2499 for (rep = qp->replylist; rep != NULL; rep = rep->next) { 2500 if (rep->xid == msgid) { 2501 if (rep->vaddr_cq) { 2502 rib_rbuf_free(conn, RECV_BUFFER, 2503 (caddr_t)(uintptr_t)rep->vaddr_cq); 2504 } 2505 (void) rib_remreply(qp, rep); 2506 break; 2507 } 2508 } 2509 mutex_exit(&qp->replylist_lock); 2510 2511 return (RDMA_SUCCESS); 2512 } 2513 2514 /* Server */ 2515 rdma_stat 2516 rib_post_recv(CONN *conn, struct clist *cl) 2517 { 2518 rib_qp_t *qp = ctoqp(conn); 2519 2520 if (rib_svc_post(conn, cl) == RDMA_SUCCESS) { 2521 mutex_enter(&qp->posted_rbufs_lock); 2522 qp->n_posted_rbufs++; 2523 mutex_exit(&qp->posted_rbufs_lock); 2524 return (RDMA_SUCCESS); 2525 } 2526 return (RDMA_FAILED); 2527 } 2528 2529 /* 2530 * Client side only interface to "recv" the rpc reply buf 2531 * posted earlier by rib_post_resp(conn, cl, msgid). 2532 */ 2533 rdma_stat 2534 rib_recv(CONN *conn, struct clist **clp, uint32_t msgid) 2535 { 2536 struct reply *rep = NULL; 2537 clock_t timout, cv_wait_ret; 2538 rdma_stat ret = RDMA_SUCCESS; 2539 rib_qp_t *qp = ctoqp(conn); 2540 2541 /* 2542 * Find the reply structure for this msgid 2543 */ 2544 mutex_enter(&qp->replylist_lock); 2545 2546 for (rep = qp->replylist; rep != NULL; rep = rep->next) { 2547 if (rep->xid == msgid) 2548 break; 2549 } 2550 2551 if (rep != NULL) { 2552 /* 2553 * If message not yet received, wait. 
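 * The wait is a cv_timedwait_sig() on rep->wait_cv bounded by
 * REPLY_WAIT_TIME; it resolves to RDMA_TIMEDOUT on expiry or to
 * RDMA_INTR if interrupted by a signal.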
2554 */ 2555 if (rep->status == (uint_t)REPLY_WAIT) { 2556 timout = ddi_get_lbolt() + 2557 drv_usectohz(REPLY_WAIT_TIME * 1000000); 2558 2559 while ((cv_wait_ret = cv_timedwait_sig(&rep->wait_cv, 2560 &qp->replylist_lock, timout)) > 0 && 2561 rep->status == (uint_t)REPLY_WAIT) 2562 ; 2563 2564 switch (cv_wait_ret) { 2565 case -1: /* timeout */ 2566 ret = RDMA_TIMEDOUT; 2567 break; 2568 case 0: 2569 ret = RDMA_INTR; 2570 break; 2571 default: 2572 break; 2573 } 2574 } 2575 2576 if (rep->status == RDMA_SUCCESS) { 2577 struct clist *cl = NULL; 2578 2579 /* 2580 * Got message successfully 2581 */ 2582 clist_add(&cl, 0, rep->bytes_xfer, NULL, 2583 (caddr_t)(uintptr_t)rep->vaddr_cq, NULL, NULL); 2584 *clp = cl; 2585 } else { 2586 if (rep->status != (uint_t)REPLY_WAIT) { 2587 /* 2588 * Got error in reply message. Free 2589 * recv buffer here. 2590 */ 2591 ret = rep->status; 2592 rib_rbuf_free(conn, RECV_BUFFER, 2593 (caddr_t)(uintptr_t)rep->vaddr_cq); 2594 } 2595 } 2596 (void) rib_remreply(qp, rep); 2597 } else { 2598 /* 2599 * No matching reply structure found for given msgid on the 2600 * reply wait list. 2601 */ 2602 ret = RDMA_INVAL; 2603 DTRACE_PROBE(rpcib__i__nomatchxid2); 2604 } 2605 2606 /* 2607 * Done. 2608 */ 2609 mutex_exit(&qp->replylist_lock); 2610 return (ret); 2611 } 2612 2613 /* 2614 * RDMA write a buffer to the remote address. 2615 */ 2616 rdma_stat 2617 rib_write(CONN *conn, struct clist *cl, int wait) 2618 { 2619 ibt_send_wr_t tx_wr; 2620 int cv_sig; 2621 ibt_wr_ds_t sgl[DSEG_MAX]; 2622 struct send_wid *wdesc; 2623 ibt_status_t ibt_status; 2624 rdma_stat ret = RDMA_SUCCESS; 2625 rib_qp_t *qp = ctoqp(conn); 2626 uint64_t n_writes = 0; 2627 2628 if (cl == NULL) { 2629 return (RDMA_FAILED); 2630 } 2631 2632 while ((cl != NULL)) { 2633 if (cl->c_len > 0) { 2634 bzero(&tx_wr, sizeof (ibt_send_wr_t)); 2635 tx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->u.c_daddr; 2636 tx_wr.wr.rc.rcwr.rdma.rdma_rkey = 2637 cl->c_dmemhandle.mrc_rmr; /* rkey */ 2638 sgl[0].ds_va = cl->w.c_saddr; 2639 sgl[0].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */ 2640 sgl[0].ds_len = cl->c_len; 2641 2642 if (wait) { 2643 cv_sig = 1; 2644 } else { 2645 if (n_writes > max_unsignaled_rws) { 2646 n_writes = 0; 2647 cv_sig = 1; 2648 } else { 2649 cv_sig = 0; 2650 } 2651 } 2652 2653 if (cv_sig) { 2654 tx_wr.wr_flags = IBT_WR_SEND_SIGNAL; 2655 wdesc = rib_init_sendwait(0, cv_sig, qp); 2656 tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc; 2657 mutex_enter(&wdesc->sendwait_lock); 2658 } else { 2659 tx_wr.wr_flags = IBT_WR_NO_FLAGS; 2660 tx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID; 2661 } 2662 tx_wr.wr_opcode = IBT_WRC_RDMAW; 2663 tx_wr.wr_trans = IBT_RC_SRV; 2664 tx_wr.wr_nds = 1; 2665 tx_wr.wr_sgl = sgl; 2666 2667 mutex_enter(&conn->c_lock); 2668 if (conn->c_state == C_CONNECTED) { 2669 ibt_status = 2670 ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL); 2671 } 2672 if (conn->c_state != C_CONNECTED || 2673 ibt_status != IBT_SUCCESS) { 2674 if (conn->c_state != C_DISCONN_PEND) 2675 conn->c_state = C_ERROR_CONN; 2676 mutex_exit(&conn->c_lock); 2677 if (cv_sig) { 2678 mutex_exit(&wdesc->sendwait_lock); 2679 (void) rib_free_sendwait(wdesc); 2680 } 2681 return (RDMA_CONNLOST); 2682 } 2683 2684 mutex_exit(&conn->c_lock); 2685 2686 /* 2687 * Wait for send to complete 2688 */ 2689 if (cv_sig) { 2690 2691 rib_send_hold(qp); 2692 mutex_exit(&wdesc->sendwait_lock); 2693 2694 ret = rib_sendwait(qp, wdesc); 2695 if (ret != 0) 2696 return (ret); 2697 } 2698 n_writes ++; 2699 } 2700 cl = cl->c_next; 2701 } 2702 return (RDMA_SUCCESS); 2703 } 2704 2705 /* 2706 
* RDMA Read a buffer from the remote address. 2707 */ 2708 rdma_stat 2709 rib_read(CONN *conn, struct clist *cl, int wait) 2710 { 2711 ibt_send_wr_t rx_wr; 2712 int cv_sig = 0; 2713 ibt_wr_ds_t sgl; 2714 struct send_wid *wdesc; 2715 ibt_status_t ibt_status = IBT_SUCCESS; 2716 rdma_stat ret = RDMA_SUCCESS; 2717 rib_qp_t *qp = ctoqp(conn); 2718 2719 if (cl == NULL) { 2720 return (RDMA_FAILED); 2721 } 2722 2723 while (cl != NULL) { 2724 bzero(&rx_wr, sizeof (ibt_send_wr_t)); 2725 /* 2726 * Remote address is at the head chunk item in list. 2727 */ 2728 rx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->w.c_saddr; 2729 rx_wr.wr.rc.rcwr.rdma.rdma_rkey = cl->c_smemhandle.mrc_rmr; 2730 2731 sgl.ds_va = cl->u.c_daddr; 2732 sgl.ds_key = cl->c_dmemhandle.mrc_lmr; /* lkey */ 2733 sgl.ds_len = cl->c_len; 2734 2735 /* 2736 * If there are multiple chunks to be read, and 2737 * wait is set, ask for signal only for the last chunk 2738 * and wait only on the last chunk. The completion of 2739 * RDMA_READ on last chunk ensures that reads on all 2740 * previous chunks are also completed. 2741 */ 2742 if (wait && (cl->c_next == NULL)) { 2743 cv_sig = 1; 2744 wdesc = rib_init_sendwait(0, cv_sig, qp); 2745 rx_wr.wr_flags = IBT_WR_SEND_SIGNAL; 2746 rx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc; 2747 mutex_enter(&wdesc->sendwait_lock); 2748 } else { 2749 rx_wr.wr_flags = IBT_WR_NO_FLAGS; 2750 rx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID; 2751 } 2752 rx_wr.wr_opcode = IBT_WRC_RDMAR; 2753 rx_wr.wr_trans = IBT_RC_SRV; 2754 rx_wr.wr_nds = 1; 2755 rx_wr.wr_sgl = &sgl; 2756 2757 mutex_enter(&conn->c_lock); 2758 if (conn->c_state == C_CONNECTED) { 2759 ibt_status = ibt_post_send(qp->qp_hdl, &rx_wr, 1, NULL); 2760 } 2761 if (conn->c_state != C_CONNECTED || 2762 ibt_status != IBT_SUCCESS) { 2763 if (conn->c_state != C_DISCONN_PEND) 2764 conn->c_state = C_ERROR_CONN; 2765 mutex_exit(&conn->c_lock); 2766 if (wait && (cl->c_next == NULL)) { 2767 mutex_exit(&wdesc->sendwait_lock); 2768 (void) rib_free_sendwait(wdesc); 2769 } 2770 return (RDMA_CONNLOST); 2771 } 2772 2773 mutex_exit(&conn->c_lock); 2774 2775 /* 2776 * Wait for send to complete if this is the 2777 * last item in the list. 2778 */ 2779 if (wait && cl->c_next == NULL) { 2780 rib_send_hold(qp); 2781 mutex_exit(&wdesc->sendwait_lock); 2782 2783 ret = rib_sendwait(qp, wdesc); 2784 2785 if (ret != 0) 2786 return (ret); 2787 } 2788 cl = cl->c_next; 2789 } 2790 return (RDMA_SUCCESS); 2791 } 2792 2793 /* 2794 * rib_srv_cm_handler() 2795 * Connection Manager callback to handle RC connection requests. 2796 */ 2797 /* ARGSUSED */ 2798 static ibt_cm_status_t 2799 rib_srv_cm_handler(void *any, ibt_cm_event_t *event, 2800 ibt_cm_return_args_t *ret_args, void *priv_data, 2801 ibt_priv_data_len_t len) 2802 { 2803 queue_t *q; 2804 rib_qp_t *qp; 2805 rpcib_state_t *ribstat; 2806 rib_hca_t *hca; 2807 rdma_stat status = RDMA_SUCCESS; 2808 int i; 2809 struct clist cl; 2810 rdma_buf_t rdbuf = {0}; 2811 void *buf = NULL; 2812 CONN *conn; 2813 ibt_ip_cm_info_t ipinfo; 2814 struct sockaddr_in *s; 2815 struct sockaddr_in6 *s6; 2816 int sin_size = sizeof (struct sockaddr_in); 2817 int in_size = sizeof (struct in_addr); 2818 int sin6_size = sizeof (struct sockaddr_in6); 2819 2820 ASSERT(any != NULL); 2821 ASSERT(event != NULL); 2822 2823 ribstat = (rpcib_state_t *)any; 2824 hca = (rib_hca_t *)ribstat->hca; 2825 ASSERT(hca != NULL); 2826 2827 /* got a connection request */ 2828 switch (event->cm_type) { 2829 case IBT_CM_EVENT_REQ_RCV: 2830 /* 2831 * If the plugin is in the NO_ACCEPT state, bail out. 
2832 */ 2833 mutex_enter(&plugin_state_lock); 2834 if (plugin_state == NO_ACCEPT) { 2835 mutex_exit(&plugin_state_lock); 2836 return (IBT_CM_REJECT); 2837 } 2838 mutex_exit(&plugin_state_lock); 2839 2840 /* 2841 * Need to send a MRA MAD to CM so that it does not 2842 * timeout on us. 2843 */ 2844 (void) ibt_cm_delay(IBT_CM_DELAY_REQ, event->cm_session_id, 2845 event->cm_event.req.req_timeout * 8, NULL, 0); 2846 2847 mutex_enter(&rib_stat->open_hca_lock); 2848 q = rib_stat->q; 2849 mutex_exit(&rib_stat->open_hca_lock); 2850 2851 status = rib_svc_create_chan(hca, (caddr_t)q, 2852 event->cm_event.req.req_prim_hca_port, &qp); 2853 2854 if (status) { 2855 return (IBT_CM_REJECT); 2856 } 2857 2858 ret_args->cm_ret.rep.cm_channel = qp->qp_hdl; 2859 ret_args->cm_ret.rep.cm_rdma_ra_out = 4; 2860 ret_args->cm_ret.rep.cm_rdma_ra_in = 4; 2861 ret_args->cm_ret.rep.cm_rnr_retry_cnt = RNR_RETRIES; 2862 2863 /* 2864 * Pre-posts RECV buffers 2865 */ 2866 conn = qptoc(qp); 2867 for (i = 0; i < preposted_rbufs; i++) { 2868 bzero(&rdbuf, sizeof (rdbuf)); 2869 rdbuf.type = RECV_BUFFER; 2870 buf = rib_rbuf_alloc(conn, &rdbuf); 2871 if (buf == NULL) { 2872 /* 2873 * A connection is not established yet. 2874 * Just flush the channel. Buffers 2875 * posted till now will error out with 2876 * IBT_WC_WR_FLUSHED_ERR. 2877 */ 2878 (void) ibt_flush_channel(qp->qp_hdl); 2879 (void) rib_disconnect_channel(conn, NULL); 2880 return (IBT_CM_REJECT); 2881 } 2882 2883 bzero(&cl, sizeof (cl)); 2884 cl.w.c_saddr3 = (caddr_t)rdbuf.addr; 2885 cl.c_len = rdbuf.len; 2886 cl.c_smemhandle.mrc_lmr = 2887 rdbuf.handle.mrc_lmr; /* lkey */ 2888 cl.c_next = NULL; 2889 status = rib_post_recv(conn, &cl); 2890 if (status != RDMA_SUCCESS) { 2891 /* 2892 * A connection is not established yet. 2893 * Just flush the channel. Buffers 2894 * posted till now will error out with 2895 * IBT_WC_WR_FLUSHED_ERR. 
2896 */ 2897 (void) ibt_flush_channel(qp->qp_hdl); 2898 (void) rib_disconnect_channel(conn, NULL); 2899 return (IBT_CM_REJECT); 2900 } 2901 } 2902 (void) rib_add_connlist(conn, &hca->srv_conn_list); 2903 2904 /* 2905 * Get the address translation 2906 */ 2907 rw_enter(&hca->state_lock, RW_READER); 2908 if (hca->state == HCA_DETACHED) { 2909 rw_exit(&hca->state_lock); 2910 return (IBT_CM_REJECT); 2911 } 2912 rw_exit(&hca->state_lock); 2913 2914 bzero(&ipinfo, sizeof (ibt_ip_cm_info_t)); 2915 2916 if (ibt_get_ip_data(event->cm_priv_data_len, 2917 event->cm_priv_data, 2918 &ipinfo) != IBT_SUCCESS) { 2919 2920 return (IBT_CM_REJECT); 2921 } 2922 2923 switch (ipinfo.src_addr.family) { 2924 case AF_INET: 2925 2926 conn->c_raddr.maxlen = 2927 conn->c_raddr.len = sin_size; 2928 conn->c_raddr.buf = kmem_zalloc(sin_size, KM_SLEEP); 2929 2930 s = (struct sockaddr_in *)conn->c_raddr.buf; 2931 s->sin_family = AF_INET; 2932 2933 bcopy((void *)&ipinfo.src_addr.un.ip4addr, 2934 &s->sin_addr, in_size); 2935 2936 break; 2937 2938 case AF_INET6: 2939 2940 conn->c_raddr.maxlen = 2941 conn->c_raddr.len = sin6_size; 2942 conn->c_raddr.buf = kmem_zalloc(sin6_size, KM_SLEEP); 2943 2944 s6 = (struct sockaddr_in6 *)conn->c_raddr.buf; 2945 s6->sin6_family = AF_INET6; 2946 bcopy((void *)&ipinfo.src_addr.un.ip6addr, 2947 &s6->sin6_addr, 2948 sizeof (struct in6_addr)); 2949 2950 break; 2951 2952 default: 2953 return (IBT_CM_REJECT); 2954 } 2955 2956 break; 2957 2958 case IBT_CM_EVENT_CONN_CLOSED: 2959 { 2960 CONN *conn; 2961 rib_qp_t *qp; 2962 2963 switch (event->cm_event.closed) { 2964 case IBT_CM_CLOSED_DREP_RCVD: 2965 case IBT_CM_CLOSED_DREQ_TIMEOUT: 2966 case IBT_CM_CLOSED_DUP: 2967 case IBT_CM_CLOSED_ABORT: 2968 case IBT_CM_CLOSED_ALREADY: 2969 /* 2970 * These cases indicate the local end initiated 2971 * the closing of the channel. Nothing to do here. 2972 */ 2973 break; 2974 default: 2975 /* 2976 * Reason for CONN_CLOSED event must be one of 2977 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD 2978 * or IBT_CM_CLOSED_STALE. These indicate cases were 2979 * the remote end is closing the channel. In these 2980 * cases free the channel and transition to error 2981 * state 2982 */ 2983 qp = ibt_get_chan_private(event->cm_channel); 2984 conn = qptoc(qp); 2985 mutex_enter(&conn->c_lock); 2986 if (conn->c_state == C_DISCONN_PEND) { 2987 mutex_exit(&conn->c_lock); 2988 break; 2989 } 2990 conn->c_state = C_ERROR_CONN; 2991 2992 /* 2993 * Free the conn if c_ref goes down to 0 2994 */ 2995 if (conn->c_ref == 0) { 2996 /* 2997 * Remove from list and free conn 2998 */ 2999 conn->c_state = C_DISCONN_PEND; 3000 mutex_exit(&conn->c_lock); 3001 (void) rib_disconnect_channel(conn, 3002 &hca->srv_conn_list); 3003 } else { 3004 /* 3005 * conn will be freed when c_ref goes to 0. 3006 * Indicate to cleaning thread not to close 3007 * the connection, but just free the channel. 3008 */ 3009 conn->c_flags |= C_CLOSE_NOTNEEDED; 3010 mutex_exit(&conn->c_lock); 3011 } 3012 DTRACE_PROBE(rpcib__i__srvcm_chandisconnect); 3013 break; 3014 } 3015 break; 3016 } 3017 case IBT_CM_EVENT_CONN_EST: 3018 /* 3019 * RTU received, hence connection established. 3020 */ 3021 if (rib_debug > 1) 3022 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 3023 "(CONN_EST) channel established"); 3024 break; 3025 3026 default: 3027 if (rib_debug > 2) { 3028 /* Let CM handle the following events. 
*/ 3029 if (event->cm_type == IBT_CM_EVENT_REP_RCV) { 3030 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 3031 "server recv'ed IBT_CM_EVENT_REP_RCV\n"); 3032 } else if (event->cm_type == IBT_CM_EVENT_LAP_RCV) { 3033 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 3034 "server recv'ed IBT_CM_EVENT_LAP_RCV\n"); 3035 } else if (event->cm_type == IBT_CM_EVENT_MRA_RCV) { 3036 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 3037 "server recv'ed IBT_CM_EVENT_MRA_RCV\n"); 3038 } else if (event->cm_type == IBT_CM_EVENT_APR_RCV) { 3039 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 3040 "server recv'ed IBT_CM_EVENT_APR_RCV\n"); 3041 } else if (event->cm_type == IBT_CM_EVENT_FAILURE) { 3042 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 3043 "server recv'ed IBT_CM_EVENT_FAILURE\n"); 3044 } 3045 } 3046 return (IBT_CM_DEFAULT); 3047 } 3048 3049 /* accept all other CM messages (i.e. let the CM handle them) */ 3050 return (IBT_CM_ACCEPT); 3051 } 3052 3053 static rdma_stat 3054 rib_register_service(rib_hca_t *hca, int service_type) 3055 { 3056 ibt_srv_desc_t sdesc; 3057 ibt_hca_portinfo_t *port_infop; 3058 ib_svc_id_t srv_id; 3059 ibt_srv_hdl_t srv_hdl; 3060 uint_t port_size; 3061 uint_t pki, i, num_ports, nbinds; 3062 ibt_status_t ibt_status; 3063 rib_service_t *new_service; 3064 ib_pkey_t pkey; 3065 3066 /* 3067 * Query all ports for the given HCA 3068 */ 3069 rw_enter(&hca->state_lock, RW_READER); 3070 if (hca->state != HCA_DETACHED) { 3071 ibt_status = ibt_query_hca_ports(hca->hca_hdl, 0, &port_infop, 3072 &num_ports, &port_size); 3073 rw_exit(&hca->state_lock); 3074 } else { 3075 rw_exit(&hca->state_lock); 3076 return (RDMA_FAILED); 3077 } 3078 if (ibt_status != IBT_SUCCESS) { 3079 return (RDMA_FAILED); 3080 } 3081 3082 DTRACE_PROBE1(rpcib__i__regservice_numports, 3083 int, num_ports); 3084 3085 for (i = 0; i < num_ports; i++) { 3086 if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) { 3087 DTRACE_PROBE1(rpcib__i__regservice__portinactive, 3088 int, i+1); 3089 } else if (port_infop[i].p_linkstate == IBT_PORT_ACTIVE) { 3090 DTRACE_PROBE1(rpcib__i__regservice__portactive, 3091 int, i+1); 3092 } 3093 } 3094 3095 /* 3096 * Get all the IP addresses on this system to register the 3097 * given "service type" on all DNS recognized IP addrs. 3098 * Each service type such as NFS will have all the systems 3099 * IP addresses as its different names. For now the only 3100 * type of service we support in RPCIB is NFS. 3101 */ 3102 rw_enter(&hca->service_list_lock, RW_WRITER); 3103 /* 3104 * Start registering and binding service to active 3105 * on active ports on this HCA. 3106 */ 3107 nbinds = 0; 3108 new_service = NULL; 3109 3110 /* 3111 * We use IP addresses as the service names for 3112 * service registration. Register each of them 3113 * with CM to obtain a svc_id and svc_hdl. We do not 3114 * register the service with machine's loopback address. 
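 * A single service id is derived from the well-known NFS/RDMA port
 * with ibt_get_ip_sid(IPPROTO_TCP, nfs_rdma_port) and registered once;
 * the resulting handle is then bound on every active port that carries
 * a full-membership pkey, and each successful bind is recorded in
 * hca->service_list.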
3115 */ 3116 (void) bzero(&srv_id, sizeof (ib_svc_id_t)); 3117 (void) bzero(&srv_hdl, sizeof (ibt_srv_hdl_t)); 3118 (void) bzero(&sdesc, sizeof (ibt_srv_desc_t)); 3119 3120 sdesc.sd_handler = rib_srv_cm_handler; 3121 sdesc.sd_flags = 0; 3122 ibt_status = ibt_register_service(hca->ibt_clnt_hdl, 3123 &sdesc, ibt_get_ip_sid(IPPROTO_TCP, nfs_rdma_port), 3124 1, &srv_hdl, &srv_id); 3125 3126 for (i = 0; i < num_ports; i++) { 3127 if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) 3128 continue; 3129 3130 for (pki = 0; pki < port_infop[i].p_pkey_tbl_sz; pki++) { 3131 pkey = port_infop[i].p_pkey_tbl[pki]; 3132 if ((pkey & IBSRM_HB) && 3133 (pkey != IB_PKEY_INVALID_FULL)) { 3134 3135 /* 3136 * Allocate and prepare a service entry 3137 */ 3138 new_service = 3139 kmem_zalloc(1 * sizeof (rib_service_t), 3140 KM_SLEEP); 3141 3142 new_service->srv_type = service_type; 3143 new_service->srv_hdl = srv_hdl; 3144 new_service->srv_next = NULL; 3145 3146 ibt_status = ibt_bind_service(srv_hdl, 3147 port_infop[i].p_sgid_tbl[0], 3148 NULL, rib_stat, NULL); 3149 3150 DTRACE_PROBE1(rpcib__i__regservice__bindres, 3151 int, ibt_status); 3152 3153 if (ibt_status != IBT_SUCCESS) { 3154 kmem_free(new_service, 3155 sizeof (rib_service_t)); 3156 new_service = NULL; 3157 continue; 3158 } 3159 3160 /* 3161 * Add to the service list for this HCA 3162 */ 3163 new_service->srv_next = hca->service_list; 3164 hca->service_list = new_service; 3165 new_service = NULL; 3166 nbinds++; 3167 } 3168 } 3169 } 3170 rw_exit(&hca->service_list_lock); 3171 3172 ibt_free_portinfo(port_infop, port_size); 3173 3174 if (nbinds == 0) { 3175 return (RDMA_FAILED); 3176 } else { 3177 /* 3178 * Put this plugin into accept state, since atleast 3179 * one registration was successful. 3180 */ 3181 mutex_enter(&plugin_state_lock); 3182 plugin_state = ACCEPT; 3183 mutex_exit(&plugin_state_lock); 3184 return (RDMA_SUCCESS); 3185 } 3186 } 3187 3188 void 3189 rib_listen(struct rdma_svc_data *rd) 3190 { 3191 rdma_stat status = RDMA_SUCCESS; 3192 3193 rd->active = 0; 3194 rd->err_code = RDMA_FAILED; 3195 3196 /* 3197 * First check if a hca is still attached 3198 */ 3199 rw_enter(&rib_stat->hca->state_lock, RW_READER); 3200 if (rib_stat->hca->state != HCA_INITED) { 3201 rw_exit(&rib_stat->hca->state_lock); 3202 return; 3203 } 3204 rw_exit(&rib_stat->hca->state_lock); 3205 3206 rib_stat->q = &rd->q; 3207 /* 3208 * Right now the only service type is NFS. Hence force feed this 3209 * value. Ideally to communicate the service type it should be 3210 * passed down in rdma_svc_data. 3211 */ 3212 rib_stat->service_type = NFS; 3213 status = rib_register_service(rib_stat->hca, NFS); 3214 if (status != RDMA_SUCCESS) { 3215 rd->err_code = status; 3216 return; 3217 } 3218 /* 3219 * Service active on an HCA, check rd->err_code for more 3220 * explainable errors. 3221 */ 3222 rd->active = 1; 3223 rd->err_code = status; 3224 } 3225 3226 /* XXXX */ 3227 /* ARGSUSED */ 3228 static void 3229 rib_listen_stop(struct rdma_svc_data *svcdata) 3230 { 3231 rib_hca_t *hca; 3232 3233 /* 3234 * KRPC called the RDMATF to stop the listeners, this means 3235 * stop sending incomming or recieved requests to KRPC master 3236 * transport handle for RDMA-IB. This is also means that the 3237 * master transport handle, responsible for us, is going away. 
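 * The plugin is therefore moved to the NO_ACCEPT state (so that
 * rib_srv_cm_handler() rejects further connection requests) and, if
 * the HCA is still initialized, the server-side channels are closed
 * and the bound services are torn down in rib_stop_services().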
3238 */ 3239 mutex_enter(&plugin_state_lock); 3240 plugin_state = NO_ACCEPT; 3241 if (svcdata != NULL) 3242 svcdata->active = 0; 3243 mutex_exit(&plugin_state_lock); 3244 3245 /* 3246 * First check if a hca is still attached 3247 */ 3248 hca = rib_stat->hca; 3249 rw_enter(&hca->state_lock, RW_READER); 3250 if (hca->state != HCA_INITED) { 3251 rw_exit(&hca->state_lock); 3252 return; 3253 } 3254 rib_close_channels(&hca->srv_conn_list); 3255 rib_stop_services(hca); 3256 rw_exit(&hca->state_lock); 3257 } 3258 3259 /* 3260 * Traverse the HCA's service list to unbind and deregister services. 3261 * Instead of unbinding the service for a service handle by 3262 * calling ibt_unbind_service() for each port/pkey, we unbind 3263 * all the services for the service handle by making only one 3264 * call to ibt_unbind_all_services(). Then, we deregister the 3265 * service for the service handle. 3266 * 3267 * When traversing the entries in service_list, we compare the 3268 * srv_hdl of the current entry with that of the next. If they 3269 * are different or if the next entry is NULL, the current entry 3270 * marks the last binding of the service handle. In this case, 3271 * call ibt_unbind_all_services() and deregister the service for 3272 * the service handle. If they are the same, the current and the 3273 * next entries are bound to the same service handle. In this 3274 * case, move on to the next entry. 3275 */ 3276 static void 3277 rib_stop_services(rib_hca_t *hca) 3278 { 3279 rib_service_t *srv_list, *to_remove; 3280 3281 /* 3282 * unbind and deregister the services for this service type. 3283 * Right now there is only one service type. In future it will 3284 * be passed down to this function. 3285 */ 3286 rw_enter(&hca->service_list_lock, RW_WRITER); 3287 srv_list = hca->service_list; 3288 while (srv_list != NULL) { 3289 to_remove = srv_list; 3290 srv_list = to_remove->srv_next; 3291 if (srv_list == NULL || bcmp(to_remove->srv_hdl, 3292 srv_list->srv_hdl, sizeof (ibt_srv_hdl_t))) { 3293 3294 (void) ibt_unbind_all_services(to_remove->srv_hdl); 3295 (void) ibt_deregister_service(hca->ibt_clnt_hdl, 3296 to_remove->srv_hdl); 3297 } 3298 3299 kmem_free(to_remove, sizeof (rib_service_t)); 3300 } 3301 hca->service_list = NULL; 3302 rw_exit(&hca->service_list_lock); 3303 } 3304 3305 static struct svc_recv * 3306 rib_init_svc_recv(rib_qp_t *qp, ibt_wr_ds_t *sgl) 3307 { 3308 struct svc_recv *recvp; 3309 3310 recvp = kmem_zalloc(sizeof (struct svc_recv), KM_SLEEP); 3311 recvp->vaddr = sgl->ds_va; 3312 recvp->qp = qp; 3313 recvp->bytes_xfer = 0; 3314 return (recvp); 3315 } 3316 3317 static int 3318 rib_free_svc_recv(struct svc_recv *recvp) 3319 { 3320 kmem_free(recvp, sizeof (*recvp)); 3321 3322 return (0); 3323 } 3324 3325 static struct reply * 3326 rib_addreplylist(rib_qp_t *qp, uint32_t msgid) 3327 { 3328 struct reply *rep; 3329 3330 3331 rep = kmem_zalloc(sizeof (struct reply), KM_NOSLEEP); 3332 if (rep == NULL) { 3333 DTRACE_PROBE(rpcib__i__addrreply__nomem); 3334 return (NULL); 3335 } 3336 rep->xid = msgid; 3337 rep->vaddr_cq = NULL; 3338 rep->bytes_xfer = 0; 3339 rep->status = (uint_t)REPLY_WAIT; 3340 rep->prev = NULL; 3341 cv_init(&rep->wait_cv, NULL, CV_DEFAULT, NULL); 3342 3343 mutex_enter(&qp->replylist_lock); 3344 if (qp->replylist) { 3345 rep->next = qp->replylist; 3346 qp->replylist->prev = rep; 3347 } 3348 qp->rep_list_size++; 3349 3350 DTRACE_PROBE1(rpcib__i__addrreply__listsize, 3351 int, qp->rep_list_size); 3352 3353 qp->replylist = rep; 3354 mutex_exit(&qp->replylist_lock); 3355 3356 return 
(rep); 3357 } 3358 3359 static rdma_stat 3360 rib_rem_replylist(rib_qp_t *qp) 3361 { 3362 struct reply *r, *n; 3363 3364 mutex_enter(&qp->replylist_lock); 3365 for (r = qp->replylist; r != NULL; r = n) { 3366 n = r->next; 3367 (void) rib_remreply(qp, r); 3368 } 3369 mutex_exit(&qp->replylist_lock); 3370 3371 return (RDMA_SUCCESS); 3372 } 3373 3374 static int 3375 rib_remreply(rib_qp_t *qp, struct reply *rep) 3376 { 3377 3378 ASSERT(MUTEX_HELD(&qp->replylist_lock)); 3379 if (rep->prev) { 3380 rep->prev->next = rep->next; 3381 } 3382 if (rep->next) { 3383 rep->next->prev = rep->prev; 3384 } 3385 if (qp->replylist == rep) 3386 qp->replylist = rep->next; 3387 3388 cv_destroy(&rep->wait_cv); 3389 qp->rep_list_size--; 3390 3391 DTRACE_PROBE1(rpcib__i__remreply__listsize, 3392 int, qp->rep_list_size); 3393 3394 kmem_free(rep, sizeof (*rep)); 3395 3396 return (0); 3397 } 3398 3399 rdma_stat 3400 rib_registermem(CONN *conn, caddr_t adsp, caddr_t buf, uint_t buflen, 3401 struct mrc *buf_handle) 3402 { 3403 ibt_mr_hdl_t mr_hdl = NULL; /* memory region handle */ 3404 ibt_mr_desc_t mr_desc; /* vaddr, lkey, rkey */ 3405 rdma_stat status; 3406 rib_hca_t *hca = (ctoqp(conn))->hca; 3407 3408 /* 3409 * Note: ALL buffer pools use the same memory type RDMARW. 3410 */ 3411 status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc); 3412 if (status == RDMA_SUCCESS) { 3413 buf_handle->mrc_linfo = (uintptr_t)mr_hdl; 3414 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey; 3415 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey; 3416 } else { 3417 buf_handle->mrc_linfo = NULL; 3418 buf_handle->mrc_lmr = 0; 3419 buf_handle->mrc_rmr = 0; 3420 } 3421 return (status); 3422 } 3423 3424 static rdma_stat 3425 rib_reg_mem(rib_hca_t *hca, caddr_t adsp, caddr_t buf, uint_t size, 3426 ibt_mr_flags_t spec, 3427 ibt_mr_hdl_t *mr_hdlp, ibt_mr_desc_t *mr_descp) 3428 { 3429 ibt_mr_attr_t mem_attr; 3430 ibt_status_t ibt_status; 3431 mem_attr.mr_vaddr = (uintptr_t)buf; 3432 mem_attr.mr_len = (ib_msglen_t)size; 3433 mem_attr.mr_as = (struct as *)(caddr_t)adsp; 3434 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE | 3435 IBT_MR_ENABLE_REMOTE_READ | IBT_MR_ENABLE_REMOTE_WRITE | 3436 IBT_MR_ENABLE_WINDOW_BIND | spec; 3437 3438 rw_enter(&hca->state_lock, RW_READER); 3439 if (hca->state == HCA_INITED) { 3440 ibt_status = ibt_register_mr(hca->hca_hdl, hca->pd_hdl, 3441 &mem_attr, mr_hdlp, mr_descp); 3442 rw_exit(&hca->state_lock); 3443 } else { 3444 rw_exit(&hca->state_lock); 3445 return (RDMA_FAILED); 3446 } 3447 3448 if (ibt_status != IBT_SUCCESS) { 3449 return (RDMA_FAILED); 3450 } 3451 return (RDMA_SUCCESS); 3452 } 3453 3454 rdma_stat 3455 rib_registermemsync(CONN *conn, caddr_t adsp, caddr_t buf, uint_t buflen, 3456 struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle, void *lrc) 3457 { 3458 ibt_mr_hdl_t mr_hdl = NULL; /* memory region handle */ 3459 rib_lrc_entry_t *l; 3460 ibt_mr_desc_t mr_desc; /* vaddr, lkey, rkey */ 3461 rdma_stat status; 3462 rib_hca_t *hca = (ctoqp(conn))->hca; 3463 3464 /* 3465 * Non-coherent memory registration. 
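 * If a long reply cache entry (lrc) is supplied and already holds a
 * registration, its cached handles are returned without another
 * ibt_register_mr() call; otherwise the whole cache buffer is
 * registered and the resulting lkey/rkey are saved in the entry for
 * later reuse.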
3466 */ 3467 l = (rib_lrc_entry_t *)lrc; 3468 if (l) { 3469 if (l->registered) { 3470 buf_handle->mrc_linfo = 3471 (uintptr_t)l->lrc_mhandle.mrc_linfo; 3472 buf_handle->mrc_lmr = 3473 (uint32_t)l->lrc_mhandle.mrc_lmr; 3474 buf_handle->mrc_rmr = 3475 (uint32_t)l->lrc_mhandle.mrc_rmr; 3476 *sync_handle = (RIB_SYNCMEM_HANDLE) 3477 (uintptr_t)l->lrc_mhandle.mrc_linfo; 3478 return (RDMA_SUCCESS); 3479 } else { 3480 /* Always register the whole buffer */ 3481 buf = (caddr_t)l->lrc_buf; 3482 buflen = l->lrc_len; 3483 } 3484 } 3485 status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc); 3486 3487 if (status == RDMA_SUCCESS) { 3488 if (l) { 3489 l->lrc_mhandle.mrc_linfo = (uintptr_t)mr_hdl; 3490 l->lrc_mhandle.mrc_lmr = (uint32_t)mr_desc.md_lkey; 3491 l->lrc_mhandle.mrc_rmr = (uint32_t)mr_desc.md_rkey; 3492 l->registered = TRUE; 3493 } 3494 buf_handle->mrc_linfo = (uintptr_t)mr_hdl; 3495 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey; 3496 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey; 3497 *sync_handle = (RIB_SYNCMEM_HANDLE)mr_hdl; 3498 } else { 3499 buf_handle->mrc_linfo = NULL; 3500 buf_handle->mrc_lmr = 0; 3501 buf_handle->mrc_rmr = 0; 3502 } 3503 return (status); 3504 } 3505 3506 /* ARGSUSED */ 3507 rdma_stat 3508 rib_deregistermem(CONN *conn, caddr_t buf, struct mrc buf_handle) 3509 { 3510 rib_hca_t *hca = (ctoqp(conn))->hca; 3511 /* 3512 * Allow memory deregistration even if HCA is 3513 * getting detached. Need all outstanding 3514 * memory registrations to be deregistered 3515 * before HCA_DETACH_EVENT can be accepted. 3516 */ 3517 (void) ibt_deregister_mr(hca->hca_hdl, 3518 (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo); 3519 return (RDMA_SUCCESS); 3520 } 3521 3522 /* ARGSUSED */ 3523 rdma_stat 3524 rib_deregistermemsync(CONN *conn, caddr_t buf, struct mrc buf_handle, 3525 RIB_SYNCMEM_HANDLE sync_handle, void *lrc) 3526 { 3527 rib_lrc_entry_t *l; 3528 l = (rib_lrc_entry_t *)lrc; 3529 if (l) 3530 if (l->registered) 3531 return (RDMA_SUCCESS); 3532 3533 (void) rib_deregistermem(conn, buf, buf_handle); 3534 3535 return (RDMA_SUCCESS); 3536 } 3537 3538 /* ARGSUSED */ 3539 rdma_stat 3540 rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle, caddr_t buf, 3541 int len, int cpu) 3542 { 3543 ibt_status_t status; 3544 rib_hca_t *hca = (ctoqp(conn))->hca; 3545 ibt_mr_sync_t mr_segment; 3546 3547 mr_segment.ms_handle = (ibt_mr_hdl_t)shandle; 3548 mr_segment.ms_vaddr = (ib_vaddr_t)(uintptr_t)buf; 3549 mr_segment.ms_len = (ib_memlen_t)len; 3550 if (cpu) { 3551 /* make incoming data visible to memory */ 3552 mr_segment.ms_flags = IBT_SYNC_WRITE; 3553 } else { 3554 /* make memory changes visible to IO */ 3555 mr_segment.ms_flags = IBT_SYNC_READ; 3556 } 3557 rw_enter(&hca->state_lock, RW_READER); 3558 if (hca->state == HCA_INITED) { 3559 status = ibt_sync_mr(hca->hca_hdl, &mr_segment, 1); 3560 rw_exit(&hca->state_lock); 3561 } else { 3562 rw_exit(&hca->state_lock); 3563 return (RDMA_FAILED); 3564 } 3565 3566 if (status == IBT_SUCCESS) 3567 return (RDMA_SUCCESS); 3568 else { 3569 return (RDMA_FAILED); 3570 } 3571 } 3572 3573 /* 3574 * XXXX ???? 3575 */ 3576 static rdma_stat 3577 rib_getinfo(rdma_info_t *info) 3578 { 3579 /* 3580 * XXXX Hack! 
3581 */ 3582 info->addrlen = 16; 3583 info->mts = 1000000; 3584 info->mtu = 1000000; 3585 3586 return (RDMA_SUCCESS); 3587 } 3588 3589 rib_bufpool_t * 3590 rib_rbufpool_create(rib_hca_t *hca, int ptype, int num) 3591 { 3592 rib_bufpool_t *rbp = NULL; 3593 bufpool_t *bp = NULL; 3594 caddr_t buf; 3595 ibt_mr_attr_t mem_attr; 3596 ibt_status_t ibt_status; 3597 int i, j; 3598 3599 rbp = (rib_bufpool_t *)kmem_zalloc(sizeof (rib_bufpool_t), KM_SLEEP); 3600 3601 bp = (bufpool_t *)kmem_zalloc(sizeof (bufpool_t) + 3602 num * sizeof (void *), KM_SLEEP); 3603 3604 mutex_init(&bp->buflock, NULL, MUTEX_DRIVER, hca->iblock); 3605 bp->numelems = num; 3606 3607 3608 switch (ptype) { 3609 case SEND_BUFFER: 3610 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE; 3611 bp->rsize = RPC_MSG_SZ; 3612 break; 3613 case RECV_BUFFER: 3614 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE; 3615 bp->rsize = RPC_BUF_SIZE; 3616 break; 3617 default: 3618 goto fail; 3619 } 3620 3621 /* 3622 * Register the pool. 3623 */ 3624 bp->bufsize = num * bp->rsize; 3625 bp->buf = kmem_zalloc(bp->bufsize, KM_SLEEP); 3626 rbp->mr_hdl = (ibt_mr_hdl_t *)kmem_zalloc(num * 3627 sizeof (ibt_mr_hdl_t), KM_SLEEP); 3628 rbp->mr_desc = (ibt_mr_desc_t *)kmem_zalloc(num * 3629 sizeof (ibt_mr_desc_t), KM_SLEEP); 3630 rw_enter(&hca->state_lock, RW_READER); 3631 3632 if (hca->state != HCA_INITED) { 3633 rw_exit(&hca->state_lock); 3634 goto fail; 3635 } 3636 3637 for (i = 0, buf = bp->buf; i < num; i++, buf += bp->rsize) { 3638 bzero(&rbp->mr_desc[i], sizeof (ibt_mr_desc_t)); 3639 mem_attr.mr_vaddr = (uintptr_t)buf; 3640 mem_attr.mr_len = (ib_msglen_t)bp->rsize; 3641 mem_attr.mr_as = NULL; 3642 ibt_status = ibt_register_mr(hca->hca_hdl, 3643 hca->pd_hdl, &mem_attr, 3644 &rbp->mr_hdl[i], 3645 &rbp->mr_desc[i]); 3646 if (ibt_status != IBT_SUCCESS) { 3647 for (j = 0; j < i; j++) { 3648 (void) ibt_deregister_mr(hca->hca_hdl, 3649 rbp->mr_hdl[j]); 3650 } 3651 rw_exit(&hca->state_lock); 3652 goto fail; 3653 } 3654 } 3655 rw_exit(&hca->state_lock); 3656 buf = (caddr_t)bp->buf; 3657 for (i = 0; i < num; i++, buf += bp->rsize) { 3658 bp->buflist[i] = (void *)buf; 3659 } 3660 bp->buffree = num - 1; /* no. of free buffers */ 3661 rbp->bpool = bp; 3662 3663 return (rbp); 3664 fail: 3665 if (bp) { 3666 if (bp->buf) 3667 kmem_free(bp->buf, bp->bufsize); 3668 kmem_free(bp, sizeof (bufpool_t) + num*sizeof (void *)); 3669 } 3670 if (rbp) { 3671 if (rbp->mr_hdl) 3672 kmem_free(rbp->mr_hdl, num*sizeof (ibt_mr_hdl_t)); 3673 if (rbp->mr_desc) 3674 kmem_free(rbp->mr_desc, num*sizeof (ibt_mr_desc_t)); 3675 kmem_free(rbp, sizeof (rib_bufpool_t)); 3676 } 3677 return (NULL); 3678 } 3679 3680 static void 3681 rib_rbufpool_deregister(rib_hca_t *hca, int ptype) 3682 { 3683 int i; 3684 rib_bufpool_t *rbp = NULL; 3685 bufpool_t *bp; 3686 3687 /* 3688 * Obtain pool address based on type of pool 3689 */ 3690 switch (ptype) { 3691 case SEND_BUFFER: 3692 rbp = hca->send_pool; 3693 break; 3694 case RECV_BUFFER: 3695 rbp = hca->recv_pool; 3696 break; 3697 default: 3698 return; 3699 } 3700 if (rbp == NULL) 3701 return; 3702 3703 bp = rbp->bpool; 3704 3705 /* 3706 * Deregister the pool memory and free it. 
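 * Only the ibt_deregister_mr() calls are issued here; the buffer
 * memory and the tracking arrays are released separately in
 * rib_rbufpool_free().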
3707 */ 3708 for (i = 0; i < bp->numelems; i++) { 3709 (void) ibt_deregister_mr(hca->hca_hdl, rbp->mr_hdl[i]); 3710 } 3711 } 3712 3713 static void 3714 rib_rbufpool_free(rib_hca_t *hca, int ptype) 3715 { 3716 3717 rib_bufpool_t *rbp = NULL; 3718 bufpool_t *bp; 3719 3720 /* 3721 * Obtain pool address based on type of pool 3722 */ 3723 switch (ptype) { 3724 case SEND_BUFFER: 3725 rbp = hca->send_pool; 3726 break; 3727 case RECV_BUFFER: 3728 rbp = hca->recv_pool; 3729 break; 3730 default: 3731 return; 3732 } 3733 if (rbp == NULL) 3734 return; 3735 3736 bp = rbp->bpool; 3737 3738 /* 3739 * Free the pool memory. 3740 */ 3741 if (rbp->mr_hdl) 3742 kmem_free(rbp->mr_hdl, bp->numelems*sizeof (ibt_mr_hdl_t)); 3743 3744 if (rbp->mr_desc) 3745 kmem_free(rbp->mr_desc, bp->numelems*sizeof (ibt_mr_desc_t)); 3746 if (bp->buf) 3747 kmem_free(bp->buf, bp->bufsize); 3748 mutex_destroy(&bp->buflock); 3749 kmem_free(bp, sizeof (bufpool_t) + bp->numelems*sizeof (void *)); 3750 kmem_free(rbp, sizeof (rib_bufpool_t)); 3751 } 3752 3753 void 3754 rib_rbufpool_destroy(rib_hca_t *hca, int ptype) 3755 { 3756 /* 3757 * Deregister the pool memory and free it. 3758 */ 3759 rib_rbufpool_deregister(hca, ptype); 3760 rib_rbufpool_free(hca, ptype); 3761 } 3762 3763 /* 3764 * Fetch a buffer from the pool of type specified in rdbuf->type. 3765 */ 3766 static rdma_stat 3767 rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf) 3768 { 3769 rib_lrc_entry_t *rlep; 3770 3771 if (rdbuf->type == RDMA_LONG_BUFFER) { 3772 rlep = rib_get_cache_buf(conn, rdbuf->len); 3773 rdbuf->rb_private = (caddr_t)rlep; 3774 rdbuf->addr = rlep->lrc_buf; 3775 rdbuf->handle = rlep->lrc_mhandle; 3776 return (RDMA_SUCCESS); 3777 } 3778 3779 rdbuf->addr = rib_rbuf_alloc(conn, rdbuf); 3780 if (rdbuf->addr) { 3781 switch (rdbuf->type) { 3782 case SEND_BUFFER: 3783 rdbuf->len = RPC_MSG_SZ; /* 1K */ 3784 break; 3785 case RECV_BUFFER: 3786 rdbuf->len = RPC_BUF_SIZE; /* 2K */ 3787 break; 3788 default: 3789 rdbuf->len = 0; 3790 } 3791 return (RDMA_SUCCESS); 3792 } else 3793 return (RDMA_FAILED); 3794 } 3795 3796 /* 3797 * Fetch a buffer of specified type. 3798 * Note that rdbuf->handle is mw's rkey. 3799 */ 3800 static void * 3801 rib_rbuf_alloc(CONN *conn, rdma_buf_t *rdbuf) 3802 { 3803 rib_qp_t *qp = ctoqp(conn); 3804 rib_hca_t *hca = qp->hca; 3805 rdma_btype ptype = rdbuf->type; 3806 void *buf; 3807 rib_bufpool_t *rbp = NULL; 3808 bufpool_t *bp; 3809 int i; 3810 3811 /* 3812 * Obtain pool address based on type of pool 3813 */ 3814 switch (ptype) { 3815 case SEND_BUFFER: 3816 rbp = hca->send_pool; 3817 break; 3818 case RECV_BUFFER: 3819 rbp = hca->recv_pool; 3820 break; 3821 default: 3822 return (NULL); 3823 } 3824 if (rbp == NULL) 3825 return (NULL); 3826 3827 bp = rbp->bpool; 3828 3829 mutex_enter(&bp->buflock); 3830 if (bp->buffree < 0) { 3831 mutex_exit(&bp->buflock); 3832 return (NULL); 3833 } 3834 3835 /* XXXX put buf, rdbuf->handle.mrc_rmr, ... in one place. 
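 * Until then, the registration info is recovered by scanning the
 * mr_desc[]/mr_hdl[] arrays for the entry whose md_vaddr matches the
 * buffer just taken off the free list.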
*/ 3836 buf = bp->buflist[bp->buffree]; 3837 rdbuf->addr = buf; 3838 rdbuf->len = bp->rsize; 3839 for (i = bp->numelems - 1; i >= 0; i--) { 3840 if ((ib_vaddr_t)(uintptr_t)buf == rbp->mr_desc[i].md_vaddr) { 3841 rdbuf->handle.mrc_rmr = 3842 (uint32_t)rbp->mr_desc[i].md_rkey; 3843 rdbuf->handle.mrc_linfo = 3844 (uintptr_t)rbp->mr_hdl[i]; 3845 rdbuf->handle.mrc_lmr = 3846 (uint32_t)rbp->mr_desc[i].md_lkey; 3847 bp->buffree--; 3848 3849 mutex_exit(&bp->buflock); 3850 3851 return (buf); 3852 } 3853 } 3854 3855 mutex_exit(&bp->buflock); 3856 3857 return (NULL); 3858 } 3859 3860 static void 3861 rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf) 3862 { 3863 3864 if (rdbuf->type == RDMA_LONG_BUFFER) { 3865 rib_free_cache_buf(conn, (rib_lrc_entry_t *)rdbuf->rb_private); 3866 rdbuf->rb_private = NULL; 3867 return; 3868 } 3869 rib_rbuf_free(conn, rdbuf->type, rdbuf->addr); 3870 } 3871 3872 static void 3873 rib_rbuf_free(CONN *conn, int ptype, void *buf) 3874 { 3875 rib_qp_t *qp = ctoqp(conn); 3876 rib_hca_t *hca = qp->hca; 3877 rib_bufpool_t *rbp = NULL; 3878 bufpool_t *bp; 3879 3880 /* 3881 * Obtain pool address based on type of pool 3882 */ 3883 switch (ptype) { 3884 case SEND_BUFFER: 3885 rbp = hca->send_pool; 3886 break; 3887 case RECV_BUFFER: 3888 rbp = hca->recv_pool; 3889 break; 3890 default: 3891 return; 3892 } 3893 if (rbp == NULL) 3894 return; 3895 3896 bp = rbp->bpool; 3897 3898 mutex_enter(&bp->buflock); 3899 if (++bp->buffree >= bp->numelems) { 3900 /* 3901 * Should never happen 3902 */ 3903 bp->buffree--; 3904 } else { 3905 bp->buflist[bp->buffree] = buf; 3906 } 3907 mutex_exit(&bp->buflock); 3908 } 3909 3910 static rdma_stat 3911 rib_add_connlist(CONN *cn, rib_conn_list_t *connlist) 3912 { 3913 rw_enter(&connlist->conn_lock, RW_WRITER); 3914 if (connlist->conn_hd) { 3915 cn->c_next = connlist->conn_hd; 3916 connlist->conn_hd->c_prev = cn; 3917 } 3918 connlist->conn_hd = cn; 3919 rw_exit(&connlist->conn_lock); 3920 3921 return (RDMA_SUCCESS); 3922 } 3923 3924 static rdma_stat 3925 rib_rm_conn(CONN *cn, rib_conn_list_t *connlist) 3926 { 3927 rw_enter(&connlist->conn_lock, RW_WRITER); 3928 if (cn->c_prev) { 3929 cn->c_prev->c_next = cn->c_next; 3930 } 3931 if (cn->c_next) { 3932 cn->c_next->c_prev = cn->c_prev; 3933 } 3934 if (connlist->conn_hd == cn) 3935 connlist->conn_hd = cn->c_next; 3936 rw_exit(&connlist->conn_lock); 3937 3938 return (RDMA_SUCCESS); 3939 } 3940 3941 /* 3942 * Connection management. 3943 * IBTF does not support recycling of channels. So connections are only 3944 * in four states - C_CONN_PEND, or C_CONNECTED, or C_ERROR_CONN or 3945 * C_DISCONN_PEND state. No C_IDLE state. 3946 * C_CONN_PEND state: Connection establishment in progress to the server. 3947 * C_CONNECTED state: A connection when created is in C_CONNECTED state. 3948 * It has an RC channel associated with it. ibt_post_send/recv are allowed 3949 * only in this state. 3950 * C_ERROR_CONN state: A connection transitions to this state when WRs on the 3951 * channel are completed in error or an IBT_CM_EVENT_CONN_CLOSED event 3952 * happens on the channel or a IBT_HCA_DETACH_EVENT occurs on the HCA. 3953 * C_DISCONN_PEND state: When a connection is in C_ERROR_CONN state and when 3954 * c_ref drops to 0 (this indicates that RPC has no more references to this 3955 * connection), the connection should be destroyed. A connection transitions 3956 * into this state when it is being destroyed. 
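 * A rough sketch of the transitions described above:
 *
 *   C_CONN_PEND  --connect succeeds-->  C_CONNECTED
 *   C_CONNECTED  --WR error, CONN_CLOSED or HCA detach-->  C_ERROR_CONN
 *   C_ERROR_CONN --c_ref drops to 0-->  C_DISCONN_PEND  -->  destroyed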
3957 */ 3958 /* ARGSUSED */ 3959 static rdma_stat 3960 rib_conn_get(struct netbuf *svcaddr, int addr_type, void *handle, CONN **conn) 3961 { 3962 CONN *cn; 3963 int status = RDMA_SUCCESS; 3964 rib_hca_t *hca = rib_stat->hca; 3965 rib_qp_t *qp; 3966 clock_t cv_stat, timout; 3967 rpcib_ping_t rpt; 3968 3969 if (hca == NULL) 3970 return (RDMA_FAILED); 3971 3972 rw_enter(&rib_stat->hca->state_lock, RW_READER); 3973 if (hca->state == HCA_DETACHED) { 3974 rw_exit(&rib_stat->hca->state_lock); 3975 return (RDMA_FAILED); 3976 } 3977 rw_exit(&rib_stat->hca->state_lock); 3978 3979 again: 3980 rw_enter(&hca->cl_conn_list.conn_lock, RW_READER); 3981 cn = hca->cl_conn_list.conn_hd; 3982 while (cn != NULL) { 3983 /* 3984 * First, clear up any connection in the ERROR state 3985 */ 3986 mutex_enter(&cn->c_lock); 3987 if (cn->c_state == C_ERROR_CONN) { 3988 if (cn->c_ref == 0) { 3989 /* 3990 * Remove connection from list and destroy it. 3991 */ 3992 cn->c_state = C_DISCONN_PEND; 3993 mutex_exit(&cn->c_lock); 3994 rw_exit(&hca->cl_conn_list.conn_lock); 3995 rib_conn_close((void *)cn); 3996 goto again; 3997 } 3998 mutex_exit(&cn->c_lock); 3999 cn = cn->c_next; 4000 continue; 4001 } 4002 if (cn->c_state == C_DISCONN_PEND) { 4003 mutex_exit(&cn->c_lock); 4004 cn = cn->c_next; 4005 continue; 4006 } 4007 if ((cn->c_raddr.len == svcaddr->len) && 4008 bcmp(svcaddr->buf, cn->c_raddr.buf, svcaddr->len) == 0) { 4009 /* 4010 * Our connection. Give up conn list lock 4011 * as we are done traversing the list. 4012 */ 4013 rw_exit(&hca->cl_conn_list.conn_lock); 4014 if (cn->c_state == C_CONNECTED) { 4015 cn->c_ref++; /* sharing a conn */ 4016 mutex_exit(&cn->c_lock); 4017 *conn = cn; 4018 return (status); 4019 } 4020 if (cn->c_state == C_CONN_PEND) { 4021 /* 4022 * Hold a reference to this conn before 4023 * we give up the lock. 4024 */ 4025 cn->c_ref++; 4026 timout = ddi_get_lbolt() + 4027 drv_usectohz(CONN_WAIT_TIME * 1000000); 4028 while ((cv_stat = cv_timedwait_sig(&cn->c_cv, 4029 &cn->c_lock, timout)) > 0 && 4030 cn->c_state == C_CONN_PEND) 4031 ; 4032 if (cv_stat == 0) { 4033 cn->c_ref--; 4034 mutex_exit(&cn->c_lock); 4035 return (RDMA_INTR); 4036 } 4037 if (cv_stat < 0) { 4038 cn->c_ref--; 4039 mutex_exit(&cn->c_lock); 4040 return (RDMA_TIMEDOUT); 4041 } 4042 if (cn->c_state == C_CONNECTED) { 4043 *conn = cn; 4044 mutex_exit(&cn->c_lock); 4045 return (status); 4046 } else { 4047 cn->c_ref--; 4048 mutex_exit(&cn->c_lock); 4049 return (RDMA_TIMEDOUT); 4050 } 4051 } 4052 } 4053 mutex_exit(&cn->c_lock); 4054 cn = cn->c_next; 4055 } 4056 rw_exit(&hca->cl_conn_list.conn_lock); 4057 4058 bzero(&rpt, sizeof (rpcib_ping_t)); 4059 4060 status = rib_ping_srv(addr_type, svcaddr, &rpt); 4061 if (status != RDMA_SUCCESS) { 4062 return (RDMA_FAILED); 4063 } 4064 4065 /* 4066 * Channel to server doesn't exist yet, create one. 4067 */ 4068 if (rib_clnt_create_chan(hca, svcaddr, &qp) != RDMA_SUCCESS) { 4069 return (RDMA_FAILED); 4070 } 4071 cn = qptoc(qp); 4072 cn->c_state = C_CONN_PEND; 4073 cn->c_ref = 1; 4074 4075 /* 4076 * Add to conn list. 4077 * We had given up the READER lock. In the time since then, 4078 * another thread might have created the connection we are 4079 * trying here. But for now, that is quiet alright - there 4080 * might be two connections between a pair of hosts instead 4081 * of one. If we really want to close that window, 4082 * then need to check the list after acquiring the 4083 * WRITER lock. 
4084 */ 4085 (void) rib_add_connlist(cn, &hca->cl_conn_list); 4086 status = rib_conn_to_srv(hca, qp, &rpt); 4087 mutex_enter(&cn->c_lock); 4088 if (status == RDMA_SUCCESS) { 4089 cn->c_state = C_CONNECTED; 4090 *conn = cn; 4091 } else { 4092 cn->c_state = C_ERROR_CONN; 4093 cn->c_ref--; 4094 } 4095 cv_broadcast(&cn->c_cv); 4096 mutex_exit(&cn->c_lock); 4097 return (status); 4098 } 4099 4100 static void 4101 rib_conn_close(void *rarg) 4102 { 4103 CONN *conn = (CONN *)rarg; 4104 rib_qp_t *qp = ctoqp(conn); 4105 4106 mutex_enter(&conn->c_lock); 4107 if (!(conn->c_flags & C_CLOSE_NOTNEEDED)) { 4108 4109 conn->c_flags |= (C_CLOSE_NOTNEEDED | C_CLOSE_PENDING); 4110 /* 4111 * Live connection in CONNECTED state. 4112 */ 4113 if (conn->c_state == C_CONNECTED) { 4114 conn->c_state = C_ERROR_CONN; 4115 } 4116 mutex_exit(&conn->c_lock); 4117 4118 rib_close_a_channel(conn); 4119 4120 mutex_enter(&conn->c_lock); 4121 conn->c_flags &= ~C_CLOSE_PENDING; 4122 cv_signal(&conn->c_cv); 4123 } 4124 4125 mutex_exit(&conn->c_lock); 4126 4127 if (qp->mode == RIB_SERVER) 4128 (void) rib_disconnect_channel(conn, 4129 &qp->hca->srv_conn_list); 4130 else 4131 (void) rib_disconnect_channel(conn, 4132 &qp->hca->cl_conn_list); 4133 } 4134 4135 static void 4136 rib_conn_timeout_call(void *carg) 4137 { 4138 time_t idle_time; 4139 CONN *conn = (CONN *)carg; 4140 rib_hca_t *hca = ctoqp(conn)->hca; 4141 int error; 4142 4143 mutex_enter(&conn->c_lock); 4144 if ((conn->c_ref > 0) || 4145 (conn->c_state == C_DISCONN_PEND)) { 4146 conn->c_timeout = NULL; 4147 mutex_exit(&conn->c_lock); 4148 return; 4149 } 4150 4151 idle_time = (gethrestime_sec() - conn->c_last_used); 4152 4153 if ((idle_time <= rib_conn_timeout) && 4154 (conn->c_state != C_ERROR_CONN)) { 4155 /* 4156 * There was activity after the last timeout. 4157 * Extend the conn life. Unless the conn is 4158 * already in error state. 4159 */ 4160 conn->c_timeout = timeout(rib_conn_timeout_call, conn, 4161 SEC_TO_TICK(rib_conn_timeout - idle_time)); 4162 mutex_exit(&conn->c_lock); 4163 return; 4164 } 4165 4166 error = ddi_taskq_dispatch(hca->cleanup_helper, rib_conn_close, 4167 (void *)conn, DDI_NOSLEEP); 4168 4169 /* 4170 * If taskq dispatch fails above, then reset the timeout 4171 * to try again after 10 secs. 4172 */ 4173 4174 if (error != DDI_SUCCESS) { 4175 conn->c_timeout = timeout(rib_conn_timeout_call, conn, 4176 SEC_TO_TICK(RDMA_CONN_REAP_RETRY)); 4177 mutex_exit(&conn->c_lock); 4178 return; 4179 } 4180 4181 conn->c_state = C_DISCONN_PEND; 4182 mutex_exit(&conn->c_lock); 4183 } 4184 4185 static rdma_stat 4186 rib_conn_release(CONN *conn) 4187 { 4188 4189 mutex_enter(&conn->c_lock); 4190 conn->c_ref--; 4191 4192 conn->c_last_used = gethrestime_sec(); 4193 if (conn->c_ref > 0) { 4194 mutex_exit(&conn->c_lock); 4195 return (RDMA_SUCCESS); 4196 } 4197 4198 /* 4199 * If a conn is C_ERROR_CONN, close the channel. 
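 * Otherwise the idle connection stays on the list and a timeout is
 * armed (if one is not already pending) so that rib_conn_timeout_call()
 * can reap it after rib_conn_timeout seconds of inactivity.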
4200 */ 4201 if (conn->c_ref == 0 && conn->c_state == C_ERROR_CONN) { 4202 conn->c_state = C_DISCONN_PEND; 4203 mutex_exit(&conn->c_lock); 4204 rib_conn_close((void *)conn); 4205 return (RDMA_SUCCESS); 4206 } 4207 4208 /* 4209 * c_ref == 0, set a timeout for conn release 4210 */ 4211 4212 if (conn->c_timeout == NULL) { 4213 conn->c_timeout = timeout(rib_conn_timeout_call, conn, 4214 SEC_TO_TICK(rib_conn_timeout)); 4215 } 4216 4217 mutex_exit(&conn->c_lock); 4218 return (RDMA_SUCCESS); 4219 } 4220 4221 /* 4222 * Add at front of list 4223 */ 4224 static struct rdma_done_list * 4225 rdma_done_add(rib_qp_t *qp, uint32_t xid) 4226 { 4227 struct rdma_done_list *rd; 4228 4229 ASSERT(MUTEX_HELD(&qp->rdlist_lock)); 4230 4231 rd = kmem_alloc(sizeof (*rd), KM_SLEEP); 4232 rd->xid = xid; 4233 cv_init(&rd->rdma_done_cv, NULL, CV_DEFAULT, NULL); 4234 4235 rd->prev = NULL; 4236 rd->next = qp->rdlist; 4237 if (qp->rdlist != NULL) 4238 qp->rdlist->prev = rd; 4239 qp->rdlist = rd; 4240 4241 return (rd); 4242 } 4243 4244 static void 4245 rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd) 4246 { 4247 struct rdma_done_list *r; 4248 4249 ASSERT(MUTEX_HELD(&qp->rdlist_lock)); 4250 4251 r = rd->next; 4252 if (r != NULL) { 4253 r->prev = rd->prev; 4254 } 4255 4256 r = rd->prev; 4257 if (r != NULL) { 4258 r->next = rd->next; 4259 } else { 4260 qp->rdlist = rd->next; 4261 } 4262 4263 cv_destroy(&rd->rdma_done_cv); 4264 kmem_free(rd, sizeof (*rd)); 4265 } 4266 4267 static void 4268 rdma_done_rem_list(rib_qp_t *qp) 4269 { 4270 struct rdma_done_list *r, *n; 4271 4272 mutex_enter(&qp->rdlist_lock); 4273 for (r = qp->rdlist; r != NULL; r = n) { 4274 n = r->next; 4275 rdma_done_rm(qp, r); 4276 } 4277 mutex_exit(&qp->rdlist_lock); 4278 } 4279 4280 static void 4281 rdma_done_notify(rib_qp_t *qp, uint32_t xid) 4282 { 4283 struct rdma_done_list *r = qp->rdlist; 4284 4285 ASSERT(MUTEX_HELD(&qp->rdlist_lock)); 4286 4287 while (r) { 4288 if (r->xid == xid) { 4289 cv_signal(&r->rdma_done_cv); 4290 return; 4291 } else { 4292 r = r->next; 4293 } 4294 } 4295 DTRACE_PROBE1(rpcib__i__donenotify__nomatchxid, 4296 int, xid); 4297 } 4298 4299 /* 4300 * Expects conn->c_lock to be held by the caller. 4301 */ 4302 4303 static void 4304 rib_close_a_channel(CONN *conn) 4305 { 4306 rib_qp_t *qp; 4307 qp = ctoqp(conn); 4308 4309 if (qp->qp_hdl == NULL) { 4310 /* channel already freed */ 4311 return; 4312 } 4313 4314 /* 4315 * Call ibt_close_rc_channel in blocking mode 4316 * with no callbacks. 4317 */ 4318 (void) ibt_close_rc_channel(qp->qp_hdl, IBT_NOCALLBACKS, 4319 NULL, 0, NULL, NULL, 0); 4320 } 4321 4322 /* 4323 * Goes through all connections and closes the channel 4324 * This will cause all the WRs on those channels to be 4325 * flushed. 4326 */ 4327 static void 4328 rib_close_channels(rib_conn_list_t *connlist) 4329 { 4330 CONN *conn, *tmp; 4331 4332 rw_enter(&connlist->conn_lock, RW_READER); 4333 conn = connlist->conn_hd; 4334 while (conn != NULL) { 4335 mutex_enter(&conn->c_lock); 4336 tmp = conn->c_next; 4337 if (!(conn->c_flags & C_CLOSE_NOTNEEDED)) { 4338 4339 conn->c_flags |= (C_CLOSE_NOTNEEDED | C_CLOSE_PENDING); 4340 4341 /* 4342 * Live connection in CONNECTED state. 
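 * Mark it C_ERROR_CONN so the connection is no longer treated as usable while the RC channel is being closed.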
4343 */ 4344 if (conn->c_state == C_CONNECTED) 4345 conn->c_state = C_ERROR_CONN; 4346 mutex_exit(&conn->c_lock); 4347 4348 rib_close_a_channel(conn); 4349 4350 mutex_enter(&conn->c_lock); 4351 conn->c_flags &= ~C_CLOSE_PENDING; 4352 /* Signal a pending rib_disconnect_channel() */ 4353 cv_signal(&conn->c_cv); 4354 } 4355 mutex_exit(&conn->c_lock); 4356 conn = tmp; 4357 } 4358 rw_exit(&connlist->conn_lock); 4359 } 4360 4361 /* 4362 * Frees up all connections that are no longer being referenced 4363 */ 4364 static void 4365 rib_purge_connlist(rib_conn_list_t *connlist) 4366 { 4367 CONN *conn; 4368 4369 top: 4370 rw_enter(&connlist->conn_lock, RW_READER); 4371 conn = connlist->conn_hd; 4372 while (conn != NULL) { 4373 mutex_enter(&conn->c_lock); 4374 4375 /* 4376 * At this point the connection is either in ERROR 4377 * or DISCONN_PEND state. If in DISCONN_PEND state 4378 * then some other thread is culling that connection. 4379 * If not and if c_ref is 0, then destroy the connection. 4380 */ 4381 if (conn->c_ref == 0 && 4382 conn->c_state != C_DISCONN_PEND) { 4383 /* 4384 * Cull the connection 4385 */ 4386 conn->c_state = C_DISCONN_PEND; 4387 mutex_exit(&conn->c_lock); 4388 rw_exit(&connlist->conn_lock); 4389 (void) rib_disconnect_channel(conn, connlist); 4390 goto top; 4391 } else { 4392 /* 4393 * conn disconnect already scheduled or will 4394 * happen from conn_release when c_ref drops to 0. 4395 */ 4396 mutex_exit(&conn->c_lock); 4397 } 4398 conn = conn->c_next; 4399 } 4400 rw_exit(&connlist->conn_lock); 4401 4402 /* 4403 * At this point, only connections with c_ref != 0 are on the list 4404 */ 4405 } 4406 4407 /* 4408 * Free all the HCA resources and close 4409 * the HCA. 4410 */ 4411 4412 static void 4413 rib_free_hca(rib_hca_t *hca) 4414 { 4415 (void) ibt_free_cq(hca->clnt_rcq->rib_cq_hdl); 4416 (void) ibt_free_cq(hca->clnt_scq->rib_cq_hdl); 4417 (void) ibt_free_cq(hca->svc_rcq->rib_cq_hdl); 4418 (void) ibt_free_cq(hca->svc_scq->rib_cq_hdl); 4419 4420 kmem_free(hca->clnt_rcq, sizeof (rib_cq_t)); 4421 kmem_free(hca->clnt_scq, sizeof (rib_cq_t)); 4422 kmem_free(hca->svc_rcq, sizeof (rib_cq_t)); 4423 kmem_free(hca->svc_scq, sizeof (rib_cq_t)); 4424 4425 rib_rbufpool_destroy(hca, RECV_BUFFER); 4426 rib_rbufpool_destroy(hca, SEND_BUFFER); 4427 rib_destroy_cache(hca); 4428 if (rib_mod.rdma_count == 0) 4429 rdma_unregister_mod(&rib_mod); 4430 (void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl); 4431 (void) ibt_close_hca(hca->hca_hdl); 4432 hca->hca_hdl = NULL; 4433 } 4434 4435 /* 4436 * Cleans up and closes all uses of the HCA 4437 */ 4438 static void 4439 rib_detach_hca(rib_hca_t *hca) 4440 { 4441 4442 /* 4443 * Stop all services on the HCA 4444 * Go through cl_conn_list and close all rc_channels 4445 * Go through svr_conn_list and close all rc_channels 4446 * Free connections whose c_ref has dropped to 0 4447 * Destroy all CQs 4448 * Deregister and release all buffer pool memory after all 4449 * connections are destroyed 4450 * Free the protection domain 4451 * ibt_close_hca() 4452 */ 4453 rw_enter(&hca->state_lock, RW_WRITER); 4454 if (hca->state == HCA_DETACHED) { 4455 rw_exit(&hca->state_lock); 4456 return; 4457 } 4458 4459 hca->state = HCA_DETACHED; 4460 rib_stat->nhca_inited--; 4461 4462 rib_stop_services(hca); 4463 rib_close_channels(&hca->cl_conn_list); 4464 rib_close_channels(&hca->srv_conn_list); 4465 4466 rib_mod.rdma_count--; 4467 4468 rw_exit(&hca->state_lock); 4469 4470 rib_purge_connlist(&hca->cl_conn_list); 4471 rib_purge_connlist(&hca->srv_conn_list); 4472 4473 if (stats_enabled) {
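/* The HCA is going away, so remove the global-zone "rpcib_cache" kstat that exports the cache counters. */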
4474 kstat_delete_byname_zone("unix", 0, "rpcib_cache", 4475 GLOBAL_ZONEID); 4476 } 4477 4478 rw_enter(&hca->srv_conn_list.conn_lock, RW_READER); 4479 rw_enter(&hca->cl_conn_list.conn_lock, RW_READER); 4480 if (hca->srv_conn_list.conn_hd == NULL && 4481 hca->cl_conn_list.conn_hd == NULL) { 4482 /* 4483 * conn_lists are NULL, so destroy 4484 * buffers, close hca and be done. 4485 */ 4486 rib_free_hca(hca); 4487 } 4488 rw_exit(&hca->cl_conn_list.conn_lock); 4489 rw_exit(&hca->srv_conn_list.conn_lock); 4490 4491 if (hca->hca_hdl != NULL) { 4492 mutex_enter(&hca->inuse_lock); 4493 while (hca->inuse) 4494 cv_wait(&hca->cb_cv, &hca->inuse_lock); 4495 mutex_exit(&hca->inuse_lock); 4496 4497 rib_free_hca(hca); 4498 } 4499 4500 if (hca->cleanup_helper != NULL) { 4501 ddi_taskq_destroy(hca->cleanup_helper); 4502 hca->cleanup_helper = NULL; 4503 } 4504 } 4505 4506 static void 4507 rib_server_side_cache_reclaim(void *argp) 4508 { 4509 cache_avl_struct_t *rcas; 4510 rib_lrc_entry_t *rb; 4511 rib_hca_t *hca = (rib_hca_t *)argp; 4512 4513 rw_enter(&hca->avl_rw_lock, RW_WRITER); 4514 rcas = avl_first(&hca->avl_tree); 4515 if (rcas != NULL) 4516 avl_remove(&hca->avl_tree, rcas); 4517 4518 while (rcas != NULL) { 4519 while (rcas->r.forw != &rcas->r) { 4520 rcas->elements--; 4521 rib_total_buffers --; 4522 rb = rcas->r.forw; 4523 remque(rb); 4524 if (rb->registered) 4525 (void) rib_deregistermem_via_hca(hca, 4526 rb->lrc_buf, rb->lrc_mhandle); 4527 cache_allocation -= rb->lrc_len; 4528 kmem_free(rb->lrc_buf, rb->lrc_len); 4529 kmem_free(rb, sizeof (rib_lrc_entry_t)); 4530 } 4531 mutex_destroy(&rcas->node_lock); 4532 kmem_cache_free(hca->server_side_cache, rcas); 4533 rcas = avl_first(&hca->avl_tree); 4534 if (rcas != NULL) 4535 avl_remove(&hca->avl_tree, rcas); 4536 } 4537 rw_exit(&hca->avl_rw_lock); 4538 } 4539 4540 static void 4541 rib_server_side_cache_cleanup(void *argp) 4542 { 4543 cache_avl_struct_t *rcas; 4544 rib_lrc_entry_t *rb; 4545 rib_hca_t *hca = (rib_hca_t *)argp; 4546 4547 rw_enter(&hca->avl_rw_lock, RW_READER); 4548 if (cache_allocation < cache_limit) { 4549 rw_exit(&hca->avl_rw_lock); 4550 return; 4551 } 4552 rw_exit(&hca->avl_rw_lock); 4553 4554 rw_enter(&hca->avl_rw_lock, RW_WRITER); 4555 rcas = avl_last(&hca->avl_tree); 4556 if (rcas != NULL) 4557 avl_remove(&hca->avl_tree, rcas); 4558 4559 while (rcas != NULL) { 4560 while (rcas->r.forw != &rcas->r) { 4561 rcas->elements--; 4562 rib_total_buffers --; 4563 rb = rcas->r.forw; 4564 remque(rb); 4565 if (rb->registered) 4566 (void) rib_deregistermem_via_hca(hca, 4567 rb->lrc_buf, rb->lrc_mhandle); 4568 cache_allocation -= rb->lrc_len; 4569 kmem_free(rb->lrc_buf, rb->lrc_len); 4570 kmem_free(rb, sizeof (rib_lrc_entry_t)); 4571 } 4572 mutex_destroy(&rcas->node_lock); 4573 if (hca->server_side_cache) { 4574 kmem_cache_free(hca->server_side_cache, rcas); 4575 } 4576 if ((cache_allocation) < cache_limit) { 4577 rw_exit(&hca->avl_rw_lock); 4578 return; 4579 } 4580 4581 rcas = avl_last(&hca->avl_tree); 4582 if (rcas != NULL) 4583 avl_remove(&hca->avl_tree, rcas); 4584 } 4585 rw_exit(&hca->avl_rw_lock); 4586 } 4587 4588 static int 4589 avl_compare(const void *t1, const void *t2) 4590 { 4591 if (((cache_avl_struct_t *)t1)->len == ((cache_avl_struct_t *)t2)->len) 4592 return (0); 4593 4594 if (((cache_avl_struct_t *)t1)->len < ((cache_avl_struct_t *)t2)->len) 4595 return (-1); 4596 4597 return (1); 4598 } 4599 4600 static void 4601 rib_destroy_cache(rib_hca_t *hca) 4602 { 4603 if (hca->avl_init) { 4604 rib_server_side_cache_reclaim((void *)hca); 4605 
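/* Everything cached was freed by the reclaim above; now destroy the kmem cache, the AVL tree and the locks that protected them. */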
if (hca->server_side_cache) { 4606 kmem_cache_destroy(hca->server_side_cache); 4607 hca->server_side_cache = NULL; 4608 } 4609 avl_destroy(&hca->avl_tree); 4610 mutex_destroy(&hca->cache_allocation); 4611 rw_destroy(&hca->avl_rw_lock); 4612 } 4613 hca->avl_init = FALSE; 4614 } 4615 4616 static void 4617 rib_force_cleanup(void *hca) 4618 { 4619 if (((rib_hca_t *)hca)->cleanup_helper != NULL) 4620 (void) ddi_taskq_dispatch( 4621 ((rib_hca_t *)hca)->cleanup_helper, 4622 rib_server_side_cache_cleanup, 4623 (void *)hca, DDI_NOSLEEP); 4624 } 4625 4626 static rib_lrc_entry_t * 4627 rib_get_cache_buf(CONN *conn, uint32_t len) 4628 { 4629 cache_avl_struct_t cas, *rcas; 4630 rib_hca_t *hca = (ctoqp(conn))->hca; 4631 rib_lrc_entry_t *reply_buf; 4632 avl_index_t where = NULL; 4633 uint64_t c_alloc = 0; 4634 4635 if (!hca->avl_init) 4636 goto error_alloc; 4637 4638 cas.len = len; 4639 4640 rw_enter(&hca->avl_rw_lock, RW_READER); 4641 4642 mutex_enter(&hca->cache_allocation); 4643 c_alloc = cache_allocation; 4644 mutex_exit(&hca->cache_allocation); 4645 4646 if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree, &cas, 4647 &where)) == NULL) { 4648 /* Am I above the cache limit */ 4649 if ((c_alloc + len) >= cache_limit) { 4650 rib_force_cleanup((void *)hca); 4651 rw_exit(&hca->avl_rw_lock); 4652 cache_misses_above_the_limit ++; 4653 4654 /* Allocate and register the buffer directly */ 4655 goto error_alloc; 4656 } 4657 4658 rw_exit(&hca->avl_rw_lock); 4659 rw_enter(&hca->avl_rw_lock, RW_WRITER); 4660 4661 /* Recheck to make sure no other thread added the entry in */ 4662 if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree, 4663 &cas, &where)) == NULL) { 4664 /* Allocate an avl tree entry */ 4665 rcas = (cache_avl_struct_t *) 4666 kmem_cache_alloc(hca->server_side_cache, KM_SLEEP); 4667 4668 bzero(rcas, sizeof (cache_avl_struct_t)); 4669 rcas->elements = 0; 4670 rcas->r.forw = &rcas->r; 4671 rcas->r.back = &rcas->r; 4672 rcas->len = len; 4673 mutex_init(&rcas->node_lock, NULL, MUTEX_DEFAULT, NULL); 4674 avl_insert(&hca->avl_tree, rcas, where); 4675 } 4676 } 4677 4678 mutex_enter(&rcas->node_lock); 4679 4680 if (rcas->r.forw != &rcas->r && rcas->elements > 0) { 4681 rib_total_buffers--; 4682 cache_hits++; 4683 reply_buf = rcas->r.forw; 4684 remque(reply_buf); 4685 rcas->elements--; 4686 mutex_exit(&rcas->node_lock); 4687 rw_exit(&hca->avl_rw_lock); 4688 mutex_enter(&hca->cache_allocation); 4689 cache_allocation -= len; 4690 mutex_exit(&hca->cache_allocation); 4691 } else { 4692 /* Am I above the cache limit */ 4693 mutex_exit(&rcas->node_lock); 4694 if ((c_alloc + len) >= cache_limit) { 4695 rib_force_cleanup((void *)hca); 4696 rw_exit(&hca->avl_rw_lock); 4697 cache_misses_above_the_limit ++; 4698 /* Allocate and register the buffer directly */ 4699 goto error_alloc; 4700 } 4701 rw_exit(&hca->avl_rw_lock); 4702 cache_misses ++; 4703 /* Allocate a reply_buf entry */ 4704 reply_buf = (rib_lrc_entry_t *) 4705 kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP); 4706 bzero(reply_buf, sizeof (rib_lrc_entry_t)); 4707 reply_buf->lrc_buf = kmem_alloc(len, KM_SLEEP); 4708 reply_buf->lrc_len = len; 4709 reply_buf->registered = FALSE; 4710 reply_buf->avl_node = (void *)rcas; 4711 } 4712 4713 return (reply_buf); 4714 4715 error_alloc: 4716 reply_buf = (rib_lrc_entry_t *) 4717 kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP); 4718 bzero(reply_buf, sizeof (rib_lrc_entry_t)); 4719 reply_buf->lrc_buf = kmem_alloc(len, KM_SLEEP); 4720 reply_buf->lrc_len = len; 4721 reply_buf->registered = FALSE; 4722 
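/* Fallback allocation: the cache is uninitialized or over its limit, so the buffer is handed out unregistered and not linked to any cache node. */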
reply_buf->avl_node = NULL; 4723 4724 return (reply_buf); 4725 } 4726 4727 /* 4728 * Return a pre-registered buffer back to the cache (without 4729 * unregistering it). 4730 */ 4731 4732 static void 4733 rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *reg_buf) 4734 { 4735 cache_avl_struct_t cas, *rcas; 4736 avl_index_t where = NULL; 4737 rib_hca_t *hca = (ctoqp(conn))->hca; 4738 4739 if (!hca->avl_init) 4740 goto error_free; 4741 4742 cas.len = reg_buf->lrc_len; 4743 rw_enter(&hca->avl_rw_lock, RW_READER); 4744 if ((rcas = (cache_avl_struct_t *) 4745 avl_find(&hca->avl_tree, &cas, &where)) == NULL) { 4746 rw_exit(&hca->avl_rw_lock); 4747 goto error_free; 4748 } else { 4749 rib_total_buffers++; 4750 cas.len = reg_buf->lrc_len; 4751 mutex_enter(&rcas->node_lock); 4752 insque(reg_buf, &rcas->r); 4753 rcas->elements++; 4754 mutex_exit(&rcas->node_lock); 4755 rw_exit(&hca->avl_rw_lock); 4756 mutex_enter(&hca->cache_allocation); 4757 cache_allocation += cas.len; 4758 mutex_exit(&hca->cache_allocation); 4759 } 4760 4761 return; 4762 4763 error_free: 4764 4765 if (reg_buf->registered) 4766 (void) rib_deregistermem_via_hca(hca, 4767 reg_buf->lrc_buf, reg_buf->lrc_mhandle); 4768 kmem_free(reg_buf->lrc_buf, reg_buf->lrc_len); 4769 kmem_free(reg_buf, sizeof (rib_lrc_entry_t)); 4770 } 4771 4772 static rdma_stat 4773 rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp, caddr_t buf, 4774 uint_t buflen, struct mrc *buf_handle) 4775 { 4776 ibt_mr_hdl_t mr_hdl = NULL; /* memory region handle */ 4777 ibt_mr_desc_t mr_desc; /* vaddr, lkey, rkey */ 4778 rdma_stat status; 4779 4780 4781 /* 4782 * Note: ALL buffer pools use the same memory type RDMARW. 4783 */ 4784 status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc); 4785 if (status == RDMA_SUCCESS) { 4786 buf_handle->mrc_linfo = (uint64_t)(uintptr_t)mr_hdl; 4787 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey; 4788 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey; 4789 } else { 4790 buf_handle->mrc_linfo = NULL; 4791 buf_handle->mrc_lmr = 0; 4792 buf_handle->mrc_rmr = 0; 4793 } 4794 return (status); 4795 } 4796 4797 /* ARGSUSED */ 4798 static rdma_stat 4799 rib_deregistermemsync_via_hca(rib_hca_t *hca, caddr_t buf, 4800 struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle) 4801 { 4802 4803 (void) rib_deregistermem_via_hca(hca, buf, buf_handle); 4804 return (RDMA_SUCCESS); 4805 } 4806 4807 /* ARGSUSED */ 4808 static rdma_stat 4809 rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf, struct mrc buf_handle) 4810 { 4811 4812 (void) ibt_deregister_mr(hca->hca_hdl, 4813 (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo); 4814 return (RDMA_SUCCESS); 4815 } 4816 4817 /* 4818 * Check if the IP interface named by `lifrp' is RDMA-capable. 4819 */ 4820 static boolean_t 4821 rpcib_rdma_capable_interface(struct lifreq *lifrp) 4822 { 4823 char ifname[LIFNAMSIZ]; 4824 char *cp; 4825 4826 if (lifrp->lifr_type == IFT_IB) 4827 return (B_TRUE); 4828 4829 /* 4830 * Strip off the logical interface portion before getting 4831 * intimate with the name.
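 * (e.g. a logical instance such as "lo0:1" is compared as "lo0").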
4832 */ 4833 (void) strlcpy(ifname, lifrp->lifr_name, LIFNAMSIZ); 4834 if ((cp = strchr(ifname, ':')) != NULL) 4835 *cp = '\0'; 4836 4837 return (strcmp("lo0", ifname) == 0); 4838 } 4839 4840 static int 4841 rpcib_do_ip_ioctl(int cmd, int len, void *arg) 4842 { 4843 vnode_t *kvp, *vp; 4844 TIUSER *tiptr; 4845 struct strioctl iocb; 4846 k_sigset_t smask; 4847 int err = 0; 4848 4849 if (lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW, NULLVPP, &kvp) == 0) { 4850 if (t_kopen(NULL, kvp->v_rdev, FREAD|FWRITE, 4851 &tiptr, CRED()) == 0) { 4852 vp = tiptr->fp->f_vnode; 4853 } else { 4854 VN_RELE(kvp); 4855 return (EPROTO); 4856 } 4857 } else { 4858 return (EPROTO); 4859 } 4860 4861 iocb.ic_cmd = cmd; 4862 iocb.ic_timout = 0; 4863 iocb.ic_len = len; 4864 iocb.ic_dp = (caddr_t)arg; 4865 sigintr(&smask, 0); 4866 err = kstr_ioctl(vp, I_STR, (intptr_t)&iocb); 4867 sigunintr(&smask); 4868 (void) t_kclose(tiptr, 0); 4869 VN_RELE(kvp); 4870 return (err); 4871 } 4872 4873 /* 4874 * Issue an SIOCGLIFCONF down to IP and return the result in `lifcp'. 4875 * lifcp->lifc_buf is dynamically allocated to be *bufsizep bytes. 4876 */ 4877 static int 4878 rpcib_do_lifconf(struct lifconf *lifcp, uint_t *bufsizep) 4879 { 4880 int err; 4881 struct lifnum lifn; 4882 4883 bzero(&lifn, sizeof (struct lifnum)); 4884 lifn.lifn_family = AF_UNSPEC; 4885 4886 err = rpcib_do_ip_ioctl(SIOCGLIFNUM, sizeof (struct lifnum), &lifn); 4887 if (err != 0) 4888 return (err); 4889 4890 /* 4891 * Pad the interface count to account for additional interfaces that 4892 * may have been configured between the SIOCGLIFNUM and SIOCGLIFCONF. 4893 */ 4894 lifn.lifn_count += 4; 4895 4896 bzero(lifcp, sizeof (struct lifconf)); 4897 lifcp->lifc_family = AF_UNSPEC; 4898 lifcp->lifc_len = *bufsizep = lifn.lifn_count * sizeof (struct lifreq); 4899 lifcp->lifc_buf = kmem_zalloc(*bufsizep, KM_SLEEP); 4900 4901 err = rpcib_do_ip_ioctl(SIOCGLIFCONF, sizeof (struct lifconf), lifcp); 4902 if (err != 0) { 4903 kmem_free(lifcp->lifc_buf, *bufsizep); 4904 return (err); 4905 } 4906 return (0); 4907 } 4908 4909 static boolean_t 4910 rpcib_get_ib_addresses(rpcib_ipaddrs_t *addrs4, rpcib_ipaddrs_t *addrs6) 4911 { 4912 uint_t i, nifs; 4913 uint_t bufsize; 4914 struct lifconf lifc; 4915 struct lifreq *lifrp; 4916 struct sockaddr_in *sinp; 4917 struct sockaddr_in6 *sin6p; 4918 4919 bzero(addrs4, sizeof (rpcib_ipaddrs_t)); 4920 bzero(addrs6, sizeof (rpcib_ipaddrs_t)); 4921 4922 if (rpcib_do_lifconf(&lifc, &bufsize) != 0) 4923 return (B_FALSE); 4924 4925 if ((nifs = lifc.lifc_len / sizeof (struct lifreq)) == 0) { 4926 kmem_free(lifc.lifc_buf, bufsize); 4927 return (B_FALSE); 4928 } 4929 4930 /* 4931 * Worst case is that all of the addresses are IB-capable and have 4932 * the same address family, so size our buffers accordingly. 
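 * Both lists are sized for nifs entries; unused slots stay zeroed and ri_count records how many were actually filled in.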
4933 */ 4934 addrs4->ri_size = nifs * sizeof (struct sockaddr_in); 4935 addrs4->ri_list = kmem_zalloc(addrs4->ri_size, KM_SLEEP); 4936 addrs6->ri_size = nifs * sizeof (struct sockaddr_in6); 4937 addrs6->ri_list = kmem_zalloc(addrs6->ri_size, KM_SLEEP); 4938 4939 for (lifrp = lifc.lifc_req, i = 0; i < nifs; i++, lifrp++) { 4940 if (!rpcib_rdma_capable_interface(lifrp)) 4941 continue; 4942 4943 if (lifrp->lifr_addr.ss_family == AF_INET) { 4944 sinp = addrs4->ri_list; 4945 bcopy(&lifrp->lifr_addr, &sinp[addrs4->ri_count++], 4946 sizeof (struct sockaddr_in)); 4947 } else if (lifrp->lifr_addr.ss_family == AF_INET6) { 4948 sin6p = addrs6->ri_list; 4949 bcopy(&lifrp->lifr_addr, &sin6p[addrs6->ri_count++], 4950 sizeof (struct sockaddr_in6)); 4951 } 4952 } 4953 4954 kmem_free(lifc.lifc_buf, bufsize); 4955 return (B_TRUE); 4956 } 4957 4958 /* ARGSUSED */ 4959 static int rpcib_cache_kstat_update(kstat_t *ksp, int rw) { 4960 4961 if (KSTAT_WRITE == rw) { 4962 return (EACCES); 4963 } 4964 rpcib_kstat.cache_limit.value.ui64 = 4965 (uint64_t)cache_limit; 4966 rpcib_kstat.cache_allocation.value.ui64 = 4967 (uint64_t)cache_allocation; 4968 rpcib_kstat.cache_hits.value.ui64 = 4969 (uint64_t)cache_hits; 4970 rpcib_kstat.cache_misses.value.ui64 = 4971 (uint64_t)cache_misses; 4972 rpcib_kstat.cache_misses_above_the_limit.value.ui64 = 4973 (uint64_t)cache_misses_above_the_limit; 4974 return (0); 4975 } 4976
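/*
 * The cache counters above are exported through the "rpcib_cache" kstat (module "unix", instance 0); rpcib_cache_kstat_update() refreshes the named values on every read and rejects writes with EACCES. As an illustrative example (assuming the server-side cache and its kstat have been created), the counters can be read from userland with:
 *
 *	# kstat -m unix -i 0 -n rpcib_cache
 */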