/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 2007, The Ohio State University. All rights reserved.
 *
 * Portions of this source code are developed by the team members of
 * The Ohio State University's Network-Based Computing Laboratory (NBCL),
 * headed by Professor Dhabaleswar K. (DK) Panda.
 *
 * Acknowledgements to contributions from developers:
 *	Ranjit Noronha: noronha@cse.ohio-state.edu
 *	Lei Chai:	chail@cse.ohio-state.edu
 *	Weikuan Yu:	yuw@cse.ohio-state.edu
 */

/*
 * The rpcib plugin. Implements the interface for RDMATF's
 * interaction with IBTF.
 */

#include <sys/param.h>
#include <sys/types.h>
#include <sys/user.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/proc.h>
#include <sys/socket.h>
#include <sys/file.h>
#include <sys/stream.h>
#include <sys/strsubr.h>
#include <sys/stropts.h>
#include <sys/errno.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <sys/systm.h>
#include <sys/pathname.h>
#include <sys/kstat.h>
#include <sys/t_lock.h>
#include <sys/ddi.h>
#include <sys/cmn_err.h>
#include <sys/time.h>
#include <sys/isa_defs.h>
#include <sys/callb.h>
#include <sys/sunddi.h>
#include <sys/sunndi.h>
#include <sys/sunldi.h>
#include <sys/sdt.h>
#include <sys/dlpi.h>
#include <sys/ib/ibtl/ibti.h>
#include <rpc/rpc.h>
#include <rpc/ib.h>

#include <sys/modctl.h>

#include <sys/pathname.h>
#include <sys/kstr.h>
#include <sys/sockio.h>
#include <sys/vnode.h>
#include <sys/tiuser.h>
#include <net/if.h>
#include <sys/cred.h>
#include <rpc/rpc_rdma.h>

#include <nfs/nfs.h>
#include <sys/kstat.h>
#include <sys/atomic.h>

#define	NFS_RDMA_PORT	2050

extern char *inet_ntop(int, const void *, char *, int);


/*
 * Prototype declarations for driver ops
 */
static int	rpcib_attach(dev_info_t *, ddi_attach_cmd_t);
static int	rpcib_getinfo(dev_info_t *, ddi_info_cmd_t,
		    void *, void **);
static int	rpcib_detach(dev_info_t *, ddi_detach_cmd_t);
static int	rpcib_is_ib_interface(char *);
static int	rpcib_dl_info(ldi_handle_t, dl_info_ack_t *);
static int	rpcib_do_ip_ioctl(int, int, caddr_t);
static boolean_t	rpcib_get_ib_addresses(struct sockaddr_in *,
		    struct sockaddr_in6 *, uint_t *, uint_t *);
static uint_t	rpcib_get_number_interfaces(void);
static int	rpcib_cache_kstat_update(kstat_t *, int);
static void	rib_force_cleanup(void *);

struct {
	kstat_named_t	cache_limit;
	kstat_named_t	cache_allocation;
	kstat_named_t	cache_hits;
	kstat_named_t	cache_misses;
	kstat_named_t	cache_misses_above_the_limit;
} rpcib_kstat = {
	{"cache_limit",			KSTAT_DATA_UINT64 },
	{"cache_allocation",		KSTAT_DATA_UINT64 },
	{"cache_hits",			KSTAT_DATA_UINT64 },
	{"cache_misses",		KSTAT_DATA_UINT64 },
	{"cache_misses_above_the_limit", KSTAT_DATA_UINT64 },
};

/* rpcib cb_ops */
static struct cb_ops rpcib_cbops = {
	nulldev,		/* open */
	nulldev,		/* close */
	nodev,			/* strategy */
	nodev,			/* print */
	nodev,			/* dump */
	nodev,			/* read */
	nodev,			/* write */
	nodev,			/* ioctl */
	nodev,			/* devmap */
	nodev,			/* mmap */
	nodev,			/* segmap */
	nochpoll,		/* poll */
	ddi_prop_op,		/* prop_op */
	NULL,			/* stream */
	D_MP,			/* cb_flag */
	CB_REV,			/* rev */
	nodev,			/* int (*cb_aread)() */
	nodev			/* int (*cb_awrite)() */
};

/*
 * Device options
 */
static struct dev_ops rpcib_ops = {
	DEVO_REV,		/* devo_rev, */
	0,			/* refcnt */
	rpcib_getinfo,		/* info */
	nulldev,		/* identify */
	nulldev,		/* probe */
	rpcib_attach,		/* attach */
	rpcib_detach,		/* detach */
	nodev,			/* reset */
	&rpcib_cbops,		/* driver ops - devctl interfaces */
	NULL,			/* bus operations */
	NULL,			/* power */
	ddi_quiesce_not_needed,	/* quiesce */
};

/*
 * Module linkage information.
 */
static struct modldrv rib_modldrv = {
	&mod_driverops,			/* Driver module */
	"RPCIB plugin driver",		/* Driver name and version */
	&rpcib_ops,			/* Driver ops */
};

static struct modlinkage rib_modlinkage = {
	MODREV_1,
	(void *)&rib_modldrv,
	NULL
};

typedef struct rib_lrc_entry {
	struct rib_lrc_entry	*forw;
	struct rib_lrc_entry	*back;
	char			*lrc_buf;

	uint32_t		lrc_len;
	void			*avl_node;
	bool_t			registered;

	struct mrc		lrc_mhandle;
	bool_t			lrc_on_freed_list;
} rib_lrc_entry_t;

typedef struct cache_struct {
	rib_lrc_entry_t		r;
	uint32_t		len;
	uint32_t		elements;
	kmutex_t		node_lock;
	avl_node_t		avl_link;
} cache_avl_struct_t;


static uint64_t	rib_total_buffers = 0;
uint64_t	cache_limit = 100 * 1024 * 1024;
static volatile uint64_t	cache_allocation = 0;
static uint64_t	cache_watermark = 80 * 1024 * 1024;
static uint64_t	cache_hits = 0;
static uint64_t	cache_misses = 0;
static uint64_t	cache_cold_misses = 0;
static uint64_t	cache_hot_misses = 0;
static uint64_t	cache_misses_above_the_limit = 0;
static bool_t	stats_enabled = FALSE;

static uint64_t max_unsignaled_rws = 5;

/*
 * rib_stat: private data pointer used when registering
 *	with the IBTF.  It is returned to the consumer
 *	in all callbacks.
 */
static rpcib_state_t *rib_stat = NULL;

#define	RNR_RETRIES	IBT_RNR_RETRY_1
#define	MAX_PORTS	2

int preposted_rbufs = RDMA_BUFS_GRANT;
int send_threshold = 1;

/*
 * State of the plugin.
 * ACCEPT = accepting new connections and requests.
 * NO_ACCEPT = not accepting new connections and requests.
 * This should eventually move to the rpcib_state_t structure, since this
 * will tell in which state the plugin is for a particular type of service
 * like NFS, NLM or v4 Callback daemon. The plugin might be in accept
 * state for one and in no_accept state for the other.
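 * The state is checked under plugin_state_lock; for example, the
 * server-side receive completion handler checks it before queuing a
 * request to the kRPC master transport.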
 */
int plugin_state;
kmutex_t plugin_state_lock;

ldi_ident_t rpcib_li;

/*
 * RPCIB RDMATF operations
 */
#if defined(MEASURE_POOL_DEPTH)
static void rib_posted_rbufs(uint32_t x) { return; }
#endif
static rdma_stat rib_reachable(int addr_type, struct netbuf *, void **handle);
static rdma_stat rib_disconnect(CONN *conn);
static void rib_listen(struct rdma_svc_data *rd);
static void rib_listen_stop(struct rdma_svc_data *rd);
static rdma_stat rib_registermem(CONN *conn, caddr_t adsp, caddr_t buf,
	uint_t buflen, struct mrc *buf_handle);
static rdma_stat rib_deregistermem(CONN *conn, caddr_t buf,
	struct mrc buf_handle);
static rdma_stat rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp,
	caddr_t buf, uint_t buflen, struct mrc *buf_handle);
static rdma_stat rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf,
	struct mrc buf_handle);
static rdma_stat rib_registermemsync(CONN *conn, caddr_t adsp, caddr_t buf,
	uint_t buflen, struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle,
	void *lrc);
static rdma_stat rib_deregistermemsync(CONN *conn, caddr_t buf,
	struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle, void *);
static rdma_stat rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle,
	caddr_t buf, int len, int cpu);

static rdma_stat rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf);

static void rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf);
static void *rib_rbuf_alloc(CONN *, rdma_buf_t *);

static void rib_rbuf_free(CONN *conn, int ptype, void *buf);

static rdma_stat rib_send(CONN *conn, struct clist *cl, uint32_t msgid);
static rdma_stat rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid);
static rdma_stat rib_post_resp(CONN *conn, struct clist *cl, uint32_t msgid);
static rdma_stat rib_post_resp_remove(CONN *conn, uint32_t msgid);
static rdma_stat rib_post_recv(CONN *conn, struct clist *cl);
static rdma_stat rib_recv(CONN *conn, struct clist **clp, uint32_t msgid);
static rdma_stat rib_read(CONN *conn, struct clist *cl, int wait);
static rdma_stat rib_write(CONN *conn, struct clist *cl, int wait);
static rdma_stat rib_ping_srv(int addr_type, struct netbuf *, rib_hca_t **);
static rdma_stat rib_conn_get(struct netbuf *, int addr_type, void *, CONN **);
static rdma_stat rib_conn_release(CONN *conn);
static rdma_stat rib_getinfo(rdma_info_t *info);

static rib_lrc_entry_t *rib_get_cache_buf(CONN *conn, uint32_t len);
static void rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *buf);
static void rib_destroy_cache(rib_hca_t *hca);
static void rib_server_side_cache_reclaim(void *argp);
static int avl_compare(const void *t1, const void *t2);

static void rib_stop_services(rib_hca_t *);
static void rib_close_channels(rib_conn_list_t *);

/*
 * RPCIB addressing operations
 */

/*
 * RDMA operations the RPCIB module exports
 */
static rdmaops_t rib_ops = {
	rib_reachable,
	rib_conn_get,
	rib_conn_release,
	rib_listen,
	rib_listen_stop,
	rib_registermem,
	rib_deregistermem,
	rib_registermemsync,
	rib_deregistermemsync,
	rib_syncmem,
	rib_reg_buf_alloc,
	rib_reg_buf_free,
	rib_send,
	rib_send_resp,
	rib_post_resp,
	rib_post_resp_remove,
	rib_post_recv,
	rib_recv,
	rib_read,
	rib_write,
	rib_getinfo,
};

/*
 * RDMATF RPCIB plugin details
 */
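/*
 * The rdma_count slot below starts at 0; rpcib_attach() sets it to the
 * number of HCAs found before registering this module with RDMATF.
 */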
static rdma_mod_t rib_mod = {
	"ibtf",		/* api name */
	RDMATF_VERS_1,
	0,
	&rib_ops,	/* rdma op vector for ibtf */
};

static rdma_stat open_hcas(rpcib_state_t *);
static rdma_stat rib_qp_init(rib_qp_t *, int);
static void rib_svc_scq_handler(ibt_cq_hdl_t, void *);
static void rib_clnt_scq_handler(ibt_cq_hdl_t, void *);
static void rib_clnt_rcq_handler(ibt_cq_hdl_t, void *);
static void rib_svc_rcq_handler(ibt_cq_hdl_t, void *);
static rib_bufpool_t *rib_rbufpool_create(rib_hca_t *hca, int ptype, int num);
static rdma_stat rib_reg_mem(rib_hca_t *, caddr_t adsp, caddr_t, uint_t,
	ibt_mr_flags_t, ibt_mr_hdl_t *, ibt_mr_desc_t *);
static rdma_stat rib_reg_mem_user(rib_hca_t *, caddr_t, uint_t, ibt_mr_flags_t,
	ibt_mr_hdl_t *, ibt_mr_desc_t *, caddr_t);
static rdma_stat rib_conn_to_srv(rib_hca_t *, rib_qp_t *, ibt_path_info_t *,
	ibt_ip_addr_t *, ibt_ip_addr_t *);
static rdma_stat rib_clnt_create_chan(rib_hca_t *, struct netbuf *,
	rib_qp_t **);
static rdma_stat rib_svc_create_chan(rib_hca_t *, caddr_t, uint8_t,
	rib_qp_t **);
static rdma_stat rib_sendwait(rib_qp_t *, struct send_wid *);
static struct send_wid *rib_init_sendwait(uint32_t, int, rib_qp_t *);
static int rib_free_sendwait(struct send_wid *);
static struct rdma_done_list *rdma_done_add(rib_qp_t *qp, uint32_t xid);
static void rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd);
static void rdma_done_rem_list(rib_qp_t *);
static void rdma_done_notify(rib_qp_t *qp, uint32_t xid);

static void rib_async_handler(void *,
	ibt_hca_hdl_t, ibt_async_code_t, ibt_async_event_t *);
static rdma_stat rib_rem_rep(rib_qp_t *, struct reply *);
static struct svc_recv *rib_init_svc_recv(rib_qp_t *, ibt_wr_ds_t *);
static int rib_free_svc_recv(struct svc_recv *);
static struct recv_wid *rib_create_wid(rib_qp_t *, ibt_wr_ds_t *, uint32_t);
static void rib_free_wid(struct recv_wid *);
static rdma_stat rib_disconnect_channel(CONN *, rib_conn_list_t *);
static void rib_detach_hca(rib_hca_t *);
static rdma_stat rib_chk_srv_ibaddr(struct netbuf *, int,
	ibt_path_info_t *, ibt_ip_addr_t *, ibt_ip_addr_t *);

/*
 * Registration with IBTF as a consumer
 */
static struct ibt_clnt_modinfo_s rib_modinfo = {
	IBTI_V2,
	IBT_GENERIC,
	rib_async_handler,	/* async event handler */
	NULL,			/* Memory Region Handler */
	"nfs/ib"
};

/*
 * Global structure
 */
typedef struct rpcib_s {
	dev_info_t	*rpcib_dip;
	kmutex_t	rpcib_mutex;
} rpcib_t;

rpcib_t rpcib;

/*
 * /etc/system controlled variable to control
 * debugging in the rpcib kernel module.
 * Set it to values greater than 1 to control
 * the amount of debugging messages required.
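 * For example, adding "set rpcib:rib_debug = 1" to /etc/system and
 * rebooting enables the basic debug messages.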
 */
int rib_debug = 0;


int
_init(void)
{
	int error;
	int ret;

	error = mod_install((struct modlinkage *)&rib_modlinkage);
	if (error != 0) {
		/*
		 * Could not load module
		 */
		return (error);
	}
	ret = ldi_ident_from_mod(&rib_modlinkage, &rpcib_li);
	if (ret != 0)
		rpcib_li = NULL;
	mutex_init(&plugin_state_lock, NULL, MUTEX_DRIVER, NULL);

	return (0);
}

int
_fini()
{
	int status;

	if ((status = rdma_unregister_mod(&rib_mod)) != RDMA_SUCCESS) {
		return (EBUSY);
	}

	/*
	 * Remove module
	 */
	if ((status = mod_remove(&rib_modlinkage)) != 0) {
		(void) rdma_register_mod(&rib_mod);
		return (status);
	}
	mutex_destroy(&plugin_state_lock);
	ldi_ident_release(rpcib_li);
	return (0);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&rib_modlinkage, modinfop));
}


/*
 * rpcib_getinfo()
 * Given the device number, return the devinfo pointer or the
 * instance number.
 * Note: always succeed DDI_INFO_DEVT2INSTANCE, even before attach.
 */

/*ARGSUSED*/
static int
rpcib_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
{
	int ret = DDI_SUCCESS;

	switch (cmd) {
	case DDI_INFO_DEVT2DEVINFO:
		if (rpcib.rpcib_dip != NULL)
			*result = rpcib.rpcib_dip;
		else {
			*result = NULL;
			ret = DDI_FAILURE;
		}
		break;

	case DDI_INFO_DEVT2INSTANCE:
		*result = NULL;
		break;

	default:
		ret = DDI_FAILURE;
	}
	return (ret);
}

static int
rpcib_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	ibt_status_t	ibt_status;
	rdma_stat	r_status;

	switch (cmd) {
	case DDI_ATTACH:
		break;
	case DDI_RESUME:
		return (DDI_SUCCESS);
	default:
		return (DDI_FAILURE);
	}

	mutex_init(&rpcib.rpcib_mutex, NULL, MUTEX_DRIVER, NULL);

	mutex_enter(&rpcib.rpcib_mutex);
	if (rpcib.rpcib_dip != NULL) {
		mutex_exit(&rpcib.rpcib_mutex);
		return (DDI_FAILURE);
	}
	rpcib.rpcib_dip = dip;
	mutex_exit(&rpcib.rpcib_mutex);
	/*
	 * Create the "rpcib" minor-node.
	 */
	if (ddi_create_minor_node(dip,
	    "rpcib", S_IFCHR, 0, DDI_PSEUDO, 0) != DDI_SUCCESS) {
		/* Error message, no cmn_err as they print on console */
		return (DDI_FAILURE);
	}

	if (rib_stat == NULL) {
		rib_stat = kmem_zalloc(sizeof (*rib_stat), KM_SLEEP);
		mutex_init(&rib_stat->open_hca_lock, NULL, MUTEX_DRIVER, NULL);
	}

	rib_stat->hca_count = ibt_get_hca_list(&rib_stat->hca_guids);
	if (rib_stat->hca_count < 1) {
		mutex_destroy(&rib_stat->open_hca_lock);
		kmem_free(rib_stat, sizeof (*rib_stat));
		rib_stat = NULL;
		return (DDI_FAILURE);
	}

	ibt_status = ibt_attach(&rib_modinfo, dip,
	    (void *)rib_stat, &rib_stat->ibt_clnt_hdl);

	if (ibt_status != IBT_SUCCESS) {
		ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count);
		mutex_destroy(&rib_stat->open_hca_lock);
		kmem_free(rib_stat, sizeof (*rib_stat));
		rib_stat = NULL;
		return (DDI_FAILURE);
	}

	mutex_enter(&rib_stat->open_hca_lock);
	if (open_hcas(rib_stat) != RDMA_SUCCESS) {
		ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count);
		(void) ibt_detach(rib_stat->ibt_clnt_hdl);
		mutex_exit(&rib_stat->open_hca_lock);
		mutex_destroy(&rib_stat->open_hca_lock);
		kmem_free(rib_stat, sizeof (*rib_stat));
		rib_stat = NULL;
		return (DDI_FAILURE);
	}
	mutex_exit(&rib_stat->open_hca_lock);

	/*
	 * Register with rdmatf
	 */
	rib_mod.rdma_count = rib_stat->hca_count;
	r_status = rdma_register_mod(&rib_mod);
	if (r_status != RDMA_SUCCESS && r_status != RDMA_REG_EXIST) {
		rib_detach_hca(rib_stat->hca);
		ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count);
		(void) ibt_detach(rib_stat->ibt_clnt_hdl);
		mutex_destroy(&rib_stat->open_hca_lock);
		kmem_free(rib_stat, sizeof (*rib_stat));
		rib_stat = NULL;
		return (DDI_FAILURE);
	}

	return (DDI_SUCCESS);
}

/*ARGSUSED*/
static int
rpcib_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	switch (cmd) {

	case DDI_DETACH:
		break;

	case DDI_SUSPEND:
	default:
		return (DDI_FAILURE);
	}

	/*
	 * Detach the hca and free resources
	 */
	mutex_enter(&plugin_state_lock);
	plugin_state = NO_ACCEPT;
	mutex_exit(&plugin_state_lock);
	rib_detach_hca(rib_stat->hca);
	ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count);
	(void) ibt_detach(rib_stat->ibt_clnt_hdl);

	mutex_enter(&rpcib.rpcib_mutex);
	rpcib.rpcib_dip = NULL;
	mutex_exit(&rpcib.rpcib_mutex);

	mutex_destroy(&rpcib.rpcib_mutex);
	return (DDI_SUCCESS);
}


static void rib_rbufpool_free(rib_hca_t *, int);
static void rib_rbufpool_deregister(rib_hca_t *, int);
static void rib_rbufpool_destroy(rib_hca_t *hca, int ptype);
static struct reply *rib_addreplylist(rib_qp_t *, uint32_t);
static rdma_stat rib_rem_replylist(rib_qp_t *);
static int rib_remreply(rib_qp_t *, struct reply *);
static rdma_stat rib_add_connlist(CONN *, rib_conn_list_t *);
static rdma_stat rib_rm_conn(CONN *, rib_conn_list_t *);


/*
 * One CQ pair per HCA
 */
static rdma_stat
rib_create_cq(rib_hca_t *hca, uint32_t cq_size, ibt_cq_handler_t cq_handler,
	rib_cq_t **cqp, rpcib_state_t *ribstat)
{
	rib_cq_t	*cq;
	ibt_cq_attr_t	cq_attr;
	uint32_t	real_size;
	ibt_status_t	status;
	rdma_stat	error = RDMA_SUCCESS;

	cq = kmem_zalloc(sizeof (rib_cq_t), KM_SLEEP);
	cq->rib_hca = hca;
	cq_attr.cq_size = cq_size;
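	/*
	 * ibt_alloc_cq() may round the requested size up; the size
	 * actually allocated is returned in real_size.
	 */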
	cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
	status = ibt_alloc_cq(hca->hca_hdl, &cq_attr, &cq->rib_cq_hdl,
	    &real_size);
	if (status != IBT_SUCCESS) {
		cmn_err(CE_WARN, "rib_create_cq: ibt_alloc_cq() failed,"
		    " status=%d", status);
		error = RDMA_FAILED;
		goto fail;
	}
	ibt_set_cq_handler(cq->rib_cq_hdl, cq_handler, ribstat);

	/*
	 * Enable CQ callbacks. CQ Callbacks are single shot
	 * (e.g. you have to call ibt_enable_cq_notify()
	 * after each callback to get another one).
	 */
	status = ibt_enable_cq_notify(cq->rib_cq_hdl, IBT_NEXT_COMPLETION);
	if (status != IBT_SUCCESS) {
		cmn_err(CE_WARN, "rib_create_cq: "
		    "enable_cq_notify failed, status %d", status);
		error = RDMA_FAILED;
		goto fail;
	}
	*cqp = cq;

	return (error);
fail:
	if (cq->rib_cq_hdl)
		(void) ibt_free_cq(cq->rib_cq_hdl);
	if (cq)
		kmem_free(cq, sizeof (rib_cq_t));
	return (error);
}

static rdma_stat
open_hcas(rpcib_state_t *ribstat)
{
	rib_hca_t		*hca;
	ibt_status_t		ibt_status;
	rdma_stat		status;
	ibt_hca_portinfo_t	*pinfop;
	ibt_pd_flags_t		pd_flags = IBT_PD_NO_FLAGS;
	uint_t			size, cq_size;
	int			i;
	kstat_t			*ksp;
	cache_avl_struct_t	example_avl_node;
	char			rssc_name[32];

	ASSERT(MUTEX_HELD(&ribstat->open_hca_lock));

	if (ribstat->hcas == NULL)
		ribstat->hcas = kmem_zalloc(ribstat->hca_count *
		    sizeof (rib_hca_t), KM_SLEEP);

	/*
	 * Open a hca and setup for RDMA
	 */
	for (i = 0; i < ribstat->hca_count; i++) {
		ibt_status = ibt_open_hca(ribstat->ibt_clnt_hdl,
		    ribstat->hca_guids[i],
		    &ribstat->hcas[i].hca_hdl);
		if (ibt_status != IBT_SUCCESS) {
			continue;
		}
		ribstat->hcas[i].hca_guid = ribstat->hca_guids[i];
		hca = &(ribstat->hcas[i]);
		hca->ibt_clnt_hdl = ribstat->ibt_clnt_hdl;
		hca->state = HCA_INITED;

		/*
		 * query HCA info
		 */
		ibt_status = ibt_query_hca(hca->hca_hdl, &hca->hca_attrs);
		if (ibt_status != IBT_SUCCESS) {
			goto fail1;
		}

		/*
		 * One PD (Protection Domain) per HCA.
		 * A qp is allowed to access a memory region
		 * only when it's in the same PD as that of
		 * the memory region.
		 */
		ibt_status = ibt_alloc_pd(hca->hca_hdl, pd_flags, &hca->pd_hdl);
		if (ibt_status != IBT_SUCCESS) {
			goto fail1;
		}

		/*
		 * query HCA ports
		 */
		ibt_status = ibt_query_hca_ports(hca->hca_hdl,
		    0, &pinfop, &hca->hca_nports, &size);
		if (ibt_status != IBT_SUCCESS) {
			goto fail2;
		}
		hca->hca_ports = pinfop;
		hca->hca_pinfosz = size;
		pinfop = NULL;

		cq_size = DEF_CQ_SIZE;	/* default cq size */
		/*
		 * Create 2 pairs of cq's (1 pair for client
		 * and the other pair for server) on this hca.
		 * If number of qp's gets too large, then several
		 * cq's will be needed.
		 */
		status = rib_create_cq(hca, cq_size, rib_svc_rcq_handler,
		    &hca->svc_rcq, ribstat);
		if (status != RDMA_SUCCESS) {
			goto fail3;
		}

		status = rib_create_cq(hca, cq_size, rib_svc_scq_handler,
		    &hca->svc_scq, ribstat);
		if (status != RDMA_SUCCESS) {
			goto fail3;
		}

		status = rib_create_cq(hca, cq_size, rib_clnt_rcq_handler,
		    &hca->clnt_rcq, ribstat);
		if (status != RDMA_SUCCESS) {
			goto fail3;
		}

		status = rib_create_cq(hca, cq_size, rib_clnt_scq_handler,
		    &hca->clnt_scq, ribstat);
		if (status != RDMA_SUCCESS) {
			goto fail3;
		}

		/*
		 * Create buffer pools.
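		 * A RECV_BUFFER pool and a SEND_BUFFER pool of MAX_BUFS
		 * buffers each are created for this HCA.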
		 * Note rib_rbuf_create also allocates memory windows.
		 */
		hca->recv_pool = rib_rbufpool_create(hca,
		    RECV_BUFFER, MAX_BUFS);
		if (hca->recv_pool == NULL) {
			goto fail3;
		}

		hca->send_pool = rib_rbufpool_create(hca,
		    SEND_BUFFER, MAX_BUFS);
		if (hca->send_pool == NULL) {
			rib_rbufpool_destroy(hca, RECV_BUFFER);
			goto fail3;
		}

		if (hca->server_side_cache == NULL) {
			(void) sprintf(rssc_name,
			    "rib_server_side_cache_%04d", i);
			hca->server_side_cache = kmem_cache_create(
			    rssc_name,
			    sizeof (cache_avl_struct_t), 0,
			    NULL,
			    NULL,
			    rib_server_side_cache_reclaim,
			    hca, NULL, 0);
		}

		avl_create(&hca->avl_tree,
		    avl_compare,
		    sizeof (cache_avl_struct_t),
		    (uint_t)(uintptr_t)&example_avl_node.avl_link-
		    (uint_t)(uintptr_t)&example_avl_node);

		rw_init(&hca->avl_rw_lock,
		    NULL, RW_DRIVER, hca->iblock);
		mutex_init(&hca->cache_allocation,
		    NULL, MUTEX_DRIVER, NULL);
		hca->avl_init = TRUE;

		/* Create kstats for the cache */
		ASSERT(INGLOBALZONE(curproc));

		if (!stats_enabled) {
			ksp = kstat_create_zone("unix", 0, "rpcib_cache", "rpc",
			    KSTAT_TYPE_NAMED,
			    sizeof (rpcib_kstat) / sizeof (kstat_named_t),
			    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE,
			    GLOBAL_ZONEID);
			if (ksp) {
				ksp->ks_data = (void *) &rpcib_kstat;
				ksp->ks_update = rpcib_cache_kstat_update;
				kstat_install(ksp);
				stats_enabled = TRUE;
			}
		}
		if (NULL == hca->reg_cache_clean_up) {
			hca->reg_cache_clean_up = ddi_taskq_create(NULL,
			    "REG_CACHE_CLEANUP", 1, TASKQ_DEFAULTPRI, 0);
		}

		/*
		 * Initialize the registered service list and
		 * the lock
		 */
		hca->service_list = NULL;
		rw_init(&hca->service_list_lock, NULL, RW_DRIVER, hca->iblock);

		mutex_init(&hca->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
		cv_init(&hca->cb_cv, NULL, CV_DRIVER, NULL);
		rw_init(&hca->cl_conn_list.conn_lock, NULL, RW_DRIVER,
		    hca->iblock);
		rw_init(&hca->srv_conn_list.conn_lock, NULL, RW_DRIVER,
		    hca->iblock);
		rw_init(&hca->state_lock, NULL, RW_DRIVER, hca->iblock);
		mutex_init(&hca->inuse_lock, NULL, MUTEX_DRIVER, hca->iblock);
		hca->inuse = TRUE;
		/*
		 * XXX One hca only. Add multi-hca functionality if needed
		 * later.
		 */
		ribstat->hca = hca;
		ribstat->nhca_inited++;
		ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz);
		break;

fail3:
		ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz);
fail2:
		(void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
fail1:
		(void) ibt_close_hca(hca->hca_hdl);

	}
	if (ribstat->hca != NULL)
		return (RDMA_SUCCESS);
	else
		return (RDMA_FAILED);
}

/*
 * Callback routines
 */

/*
 * SCQ handlers
 */
/* ARGSUSED */
static void
rib_clnt_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
{
	ibt_status_t	ibt_status;
	ibt_wc_t	wc;
	int		i;

	/*
	 * Re-enable cq notify here to avoid missing any
	 * completion queue notification.
	 */
	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);

	ibt_status = IBT_SUCCESS;
	while (ibt_status != IBT_CQ_EMPTY) {
		bzero(&wc, sizeof (wc));
		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
		if (ibt_status != IBT_SUCCESS)
			return;

		/*
		 * Got a send completion
		 */
		if (wc.wc_id != NULL) {	/* XXX can it be otherwise ???? */
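			/*
			 * wc_id carries the send_wid pointer that was set
			 * as the work request id when the send was posted
			 * (see rib_send_and_wait()).
			 */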
			struct send_wid *wd =
			    (struct send_wid *)(uintptr_t)wc.wc_id;
			CONN	*conn = qptoc(wd->qp);

			mutex_enter(&wd->sendwait_lock);
			switch (wc.wc_status) {
			case IBT_WC_SUCCESS:
				wd->status = RDMA_SUCCESS;
				break;
			case IBT_WC_WR_FLUSHED_ERR:
				wd->status = RDMA_FAILED;
				break;
			default:
/*
 *    RC Send Q Error Code		Local state	Remote State
 *    ====================		===========	============
 *    IBT_WC_BAD_RESPONSE_ERR		ERROR		None
 *    IBT_WC_LOCAL_LEN_ERR		ERROR		None
 *    IBT_WC_LOCAL_CHAN_OP_ERR		ERROR		None
 *    IBT_WC_LOCAL_PROTECT_ERR		ERROR		None
 *    IBT_WC_MEM_WIN_BIND_ERR		ERROR		None
 *    IBT_WC_REMOTE_INVALID_REQ_ERR	ERROR		ERROR
 *    IBT_WC_REMOTE_ACCESS_ERR		ERROR		ERROR
 *    IBT_WC_REMOTE_OP_ERR		ERROR		ERROR
 *    IBT_WC_RNR_NAK_TIMEOUT_ERR	ERROR		None
 *    IBT_WC_TRANS_TIMEOUT_ERR		ERROR		None
 *    IBT_WC_WR_FLUSHED_ERR		None		None
 */
				/*
				 * Channel in error state. Set connection to
				 * ERROR and cleanup will happen either from
				 * conn_release or from rib_conn_get
				 */
				wd->status = RDMA_FAILED;
				mutex_enter(&conn->c_lock);
				if (conn->c_state != C_DISCONN_PEND)
					conn->c_state = C_ERROR_CONN;
				mutex_exit(&conn->c_lock);
				break;
			}

			if (wd->cv_sig == 1) {
				/*
				 * Notify poster
				 */
				cv_signal(&wd->wait_cv);
				mutex_exit(&wd->sendwait_lock);
			} else {
				/*
				 * Poster not waiting for notification.
				 * Free the send buffers and send_wid
				 */
				for (i = 0; i < wd->nsbufs; i++) {
					rib_rbuf_free(qptoc(wd->qp),
					    SEND_BUFFER,
					    (void *)(uintptr_t)wd->sbufaddr[i]);
				}
				mutex_exit(&wd->sendwait_lock);
				(void) rib_free_sendwait(wd);
			}
		}
	}
}

/* ARGSUSED */
static void
rib_svc_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
{
	ibt_status_t	ibt_status;
	ibt_wc_t	wc;
	int		i;

	/*
	 * Re-enable cq notify here to avoid missing any
	 * completion queue notification.
	 */
	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);

	ibt_status = IBT_SUCCESS;
	while (ibt_status != IBT_CQ_EMPTY) {
		bzero(&wc, sizeof (wc));
		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
		if (ibt_status != IBT_SUCCESS)
			return;

		/*
		 * Got a send completion
		 */
		if (wc.wc_id != NULL) { /* XXX NULL possible ???? */
			struct send_wid *wd =
			    (struct send_wid *)(uintptr_t)wc.wc_id;
			mutex_enter(&wd->sendwait_lock);
			if (wd->cv_sig == 1) {
				/*
				 * Update completion status and notify poster
				 */
				if (wc.wc_status == IBT_WC_SUCCESS)
					wd->status = RDMA_SUCCESS;
				else
					wd->status = RDMA_FAILED;
				cv_signal(&wd->wait_cv);
				mutex_exit(&wd->sendwait_lock);
			} else {
				/*
				 * Poster not waiting for notification.
				 * Free the send buffers and send_wid
				 */
				for (i = 0; i < wd->nsbufs; i++) {
					rib_rbuf_free(qptoc(wd->qp),
					    SEND_BUFFER,
					    (void *)(uintptr_t)wd->sbufaddr[i]);
				}
				mutex_exit(&wd->sendwait_lock);
				(void) rib_free_sendwait(wd);
			}
		}
	}
}

/*
 * RCQ handler
 */
/* ARGSUSED */
static void
rib_clnt_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
{
	rib_qp_t	*qp;
	ibt_status_t	ibt_status;
	ibt_wc_t	wc;
	struct recv_wid	*rwid;

	/*
	 * Re-enable cq notify here to avoid missing any
	 * completion queue notification.
	 */
	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);

	ibt_status = IBT_SUCCESS;
	while (ibt_status != IBT_CQ_EMPTY) {
		bzero(&wc, sizeof (wc));
		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
		if (ibt_status != IBT_SUCCESS)
			return;

		rwid = (struct recv_wid *)(uintptr_t)wc.wc_id;
		qp = rwid->qp;
		if (wc.wc_status == IBT_WC_SUCCESS) {
			XDR	inxdrs, *xdrs;
			uint_t	xid, vers, op, find_xid = 0;
			struct reply	*r;
			CONN *conn = qptoc(qp);
			uint32_t rdma_credit = 0;

			xdrs = &inxdrs;
			xdrmem_create(xdrs, (caddr_t)(uintptr_t)rwid->addr,
			    wc.wc_bytes_xfer, XDR_DECODE);
			/*
			 * Treat xid as opaque (xid is the first entity
			 * in the rpc rdma message).
			 */
			xid = *(uint32_t *)(uintptr_t)rwid->addr;

			/* Skip xid and set the xdr position accordingly. */
			XDR_SETPOS(xdrs, sizeof (uint32_t));
			(void) xdr_u_int(xdrs, &vers);
			(void) xdr_u_int(xdrs, &rdma_credit);
			(void) xdr_u_int(xdrs, &op);
			XDR_DESTROY(xdrs);

			if (vers != RPCRDMA_VERS) {
				/*
				 * Invalid RPC/RDMA version. Cannot
				 * interoperate.  Set connection to
				 * ERROR state and bail out.
				 */
				mutex_enter(&conn->c_lock);
				if (conn->c_state != C_DISCONN_PEND)
					conn->c_state = C_ERROR_CONN;
				mutex_exit(&conn->c_lock);
				rib_rbuf_free(conn, RECV_BUFFER,
				    (void *)(uintptr_t)rwid->addr);
				rib_free_wid(rwid);
				continue;
			}

			mutex_enter(&qp->replylist_lock);
			for (r = qp->replylist; r != NULL; r = r->next) {
				if (r->xid == xid) {
					find_xid = 1;
					switch (op) {
					case RDMA_MSG:
					case RDMA_NOMSG:
					case RDMA_MSGP:
						r->status = RDMA_SUCCESS;
						r->vaddr_cq = rwid->addr;
						r->bytes_xfer =
						    wc.wc_bytes_xfer;
						cv_signal(&r->wait_cv);
						break;
					default:
						rib_rbuf_free(qptoc(qp),
						    RECV_BUFFER,
						    (void *)(uintptr_t)
						    rwid->addr);
						break;
					}
					break;
				}
			}
			mutex_exit(&qp->replylist_lock);
			if (find_xid == 0) {
				/* RPC caller not waiting for reply */

				DTRACE_PROBE1(rpcib__i__nomatchxid1,
				    int, xid);

				rib_rbuf_free(qptoc(qp), RECV_BUFFER,
				    (void *)(uintptr_t)rwid->addr);
			}
		} else if (wc.wc_status == IBT_WC_WR_FLUSHED_ERR) {
			CONN *conn = qptoc(qp);

			/*
			 * Connection being flushed. Just free
			 * the posted buffer
			 */
			rib_rbuf_free(conn, RECV_BUFFER,
			    (void *)(uintptr_t)rwid->addr);
		} else {
			CONN *conn = qptoc(qp);
/*
 *  RC Recv Q Error Code		Local state	Remote State
 *  ====================		===========	============
 *  IBT_WC_LOCAL_ACCESS_ERR		ERROR		ERROR when NAK recvd
 *  IBT_WC_LOCAL_LEN_ERR		ERROR		ERROR when NAK recvd
 *  IBT_WC_LOCAL_PROTECT_ERR		ERROR		ERROR when NAK recvd
 *  IBT_WC_LOCAL_CHAN_OP_ERR		ERROR		ERROR when NAK recvd
 *  IBT_WC_REMOTE_INVALID_REQ_ERR	ERROR		ERROR when NAK recvd
 *  IBT_WC_WR_FLUSHED_ERR		None		None
 */
			/*
			 * Channel in error state. Set connection
			 * in ERROR state.
			 */
			mutex_enter(&conn->c_lock);
			if (conn->c_state != C_DISCONN_PEND)
				conn->c_state = C_ERROR_CONN;
			mutex_exit(&conn->c_lock);
			rib_rbuf_free(conn, RECV_BUFFER,
			    (void *)(uintptr_t)rwid->addr);
		}
		rib_free_wid(rwid);
	}
}

/* Server side */
/* ARGSUSED */
static void
rib_svc_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
{
	rdma_recv_data_t *rdp;
	rib_qp_t	*qp;
	ibt_status_t	ibt_status;
	ibt_wc_t	wc;
	struct svc_recv	*s_recvp;
	CONN		*conn;
	mblk_t		*mp;

	/*
	 * Re-enable cq notify here to avoid missing any
	 * completion queue notification.
	 */
	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);

	ibt_status = IBT_SUCCESS;
	while (ibt_status != IBT_CQ_EMPTY) {
		bzero(&wc, sizeof (wc));
		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
		if (ibt_status != IBT_SUCCESS)
			return;

		s_recvp = (struct svc_recv *)(uintptr_t)wc.wc_id;
		qp = s_recvp->qp;
		conn = qptoc(qp);
		mutex_enter(&qp->posted_rbufs_lock);
		qp->n_posted_rbufs--;
#if defined(MEASURE_POOL_DEPTH)
		rib_posted_rbufs(preposted_rbufs - qp->n_posted_rbufs);
#endif
		if (qp->n_posted_rbufs == 0)
			cv_signal(&qp->posted_rbufs_cv);
		mutex_exit(&qp->posted_rbufs_lock);

		if (wc.wc_status == IBT_WC_SUCCESS) {
			XDR	inxdrs, *xdrs;
			uint_t	xid, vers, op;
			uint32_t rdma_credit;

			xdrs = &inxdrs;
			/* s_recvp->vaddr stores data */
			xdrmem_create(xdrs, (caddr_t)(uintptr_t)s_recvp->vaddr,
			    wc.wc_bytes_xfer, XDR_DECODE);

			/*
			 * Treat xid as opaque (xid is the first entity
			 * in the rpc rdma message).
			 */
			xid = *(uint32_t *)(uintptr_t)s_recvp->vaddr;
			/* Skip xid and set the xdr position accordingly. */
			XDR_SETPOS(xdrs, sizeof (uint32_t));
			if (!xdr_u_int(xdrs, &vers) ||
			    !xdr_u_int(xdrs, &rdma_credit) ||
			    !xdr_u_int(xdrs, &op)) {
				rib_rbuf_free(conn, RECV_BUFFER,
				    (void *)(uintptr_t)s_recvp->vaddr);
				XDR_DESTROY(xdrs);
				(void) rib_free_svc_recv(s_recvp);
				continue;
			}
			XDR_DESTROY(xdrs);

			if (vers != RPCRDMA_VERS) {
				/*
				 * Invalid RPC/RDMA version.
				 * Drop rpc rdma message.
				 */
				rib_rbuf_free(conn, RECV_BUFFER,
				    (void *)(uintptr_t)s_recvp->vaddr);
				(void) rib_free_svc_recv(s_recvp);
				continue;
			}
			/*
			 * Is this for RDMA_DONE?
			 */
			if (op == RDMA_DONE) {
				rib_rbuf_free(conn, RECV_BUFFER,
				    (void *)(uintptr_t)s_recvp->vaddr);
				/*
				 * Wake up the thread waiting on
				 * a RDMA_DONE for xid
				 */
				mutex_enter(&qp->rdlist_lock);
				rdma_done_notify(qp, xid);
				mutex_exit(&qp->rdlist_lock);
				(void) rib_free_svc_recv(s_recvp);
				continue;
			}

			mutex_enter(&plugin_state_lock);
			if (plugin_state == ACCEPT) {
				while ((mp = allocb(sizeof (*rdp), BPRI_LO))
				    == NULL)
					(void) strwaitbuf(
					    sizeof (*rdp), BPRI_LO);
				/*
				 * Plugin is in accept state, hence the master
				 * transport queue for this is still accepting
				 * requests. Hence we can call svc_queuereq to
				 * queue this received msg.
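				 * The connection's reference count is bumped
				 * before the message is handed off, so the
				 * connection stays around while kRPC
				 * processes the request.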
				 */
				rdp = (rdma_recv_data_t *)mp->b_rptr;
				rdp->conn = conn;
				rdp->rpcmsg.addr =
				    (caddr_t)(uintptr_t)s_recvp->vaddr;
				rdp->rpcmsg.type = RECV_BUFFER;
				rdp->rpcmsg.len = wc.wc_bytes_xfer;
				rdp->status = wc.wc_status;
				mutex_enter(&conn->c_lock);
				conn->c_ref++;
				mutex_exit(&conn->c_lock);
				mp->b_wptr += sizeof (*rdp);
				svc_queuereq((queue_t *)rib_stat->q, mp);
				mutex_exit(&plugin_state_lock);
			} else {
				/*
				 * The master transport for this is going
				 * away and the queue is not accepting any
				 * more requests for krpc, so don't do
				 * anything, just free the msg.
				 */
				mutex_exit(&plugin_state_lock);
				rib_rbuf_free(conn, RECV_BUFFER,
				    (void *)(uintptr_t)s_recvp->vaddr);
			}
		} else {
			rib_rbuf_free(conn, RECV_BUFFER,
			    (void *)(uintptr_t)s_recvp->vaddr);
		}
		(void) rib_free_svc_recv(s_recvp);
	}
}

/*
 * Handles DR event of IBT_HCA_DETACH_EVENT.
 */
/* ARGSUSED */
static void
rib_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl,
	ibt_async_code_t code, ibt_async_event_t *event)
{

	switch (code) {
	case IBT_HCA_ATTACH_EVENT:
		/* ignore */
		break;
	case IBT_HCA_DETACH_EVENT:
	{
		ASSERT(rib_stat->hca->hca_hdl == hca_hdl);
		rib_detach_hca(rib_stat->hca);
#ifdef DEBUG
		cmn_err(CE_NOTE, "rib_async_handler(): HCA being detached!\n");
#endif
		break;
	}
#ifdef DEBUG
	case IBT_EVENT_PATH_MIGRATED:
		cmn_err(CE_NOTE, "rib_async_handler(): "
		    "IBT_EVENT_PATH_MIGRATED\n");
		break;
	case IBT_EVENT_SQD:
		cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_SQD\n");
		break;
	case IBT_EVENT_COM_EST:
		cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_COM_EST\n");
		break;
	case IBT_ERROR_CATASTROPHIC_CHAN:
		cmn_err(CE_NOTE, "rib_async_handler(): "
		    "IBT_ERROR_CATASTROPHIC_CHAN\n");
		break;
	case IBT_ERROR_INVALID_REQUEST_CHAN:
		cmn_err(CE_NOTE, "rib_async_handler(): "
		    "IBT_ERROR_INVALID_REQUEST_CHAN\n");
		break;
	case IBT_ERROR_ACCESS_VIOLATION_CHAN:
		cmn_err(CE_NOTE, "rib_async_handler(): "
		    "IBT_ERROR_ACCESS_VIOLATION_CHAN\n");
		break;
	case IBT_ERROR_PATH_MIGRATE_REQ:
		cmn_err(CE_NOTE, "rib_async_handler(): "
		    "IBT_ERROR_PATH_MIGRATE_REQ\n");
		break;
	case IBT_ERROR_CQ:
		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_CQ\n");
		break;
	case IBT_ERROR_PORT_DOWN:
		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_PORT_DOWN\n");
		break;
	case IBT_EVENT_PORT_UP:
		cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_PORT_UP\n");
		break;
	case IBT_ASYNC_OPAQUE1:
		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE1\n");
		break;
	case IBT_ASYNC_OPAQUE2:
		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE2\n");
		break;
	case IBT_ASYNC_OPAQUE3:
		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE3\n");
		break;
	case IBT_ASYNC_OPAQUE4:
		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE4\n");
		break;
#endif
	default:
		break;
	}
}

/*
 * Client's reachable function.
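 * Returns RDMA_SUCCESS and a handle to the HCA if an IB path to the
 * server address can be found.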
 */
static rdma_stat
rib_reachable(int addr_type, struct netbuf *raddr, void **handle)
{
	rib_hca_t	*hca;
	rdma_stat	status;

	/*
	 * First check if a hca is still attached
	 */
	*handle = NULL;
	rw_enter(&rib_stat->hca->state_lock, RW_READER);
	if (rib_stat->hca->state != HCA_INITED) {
		rw_exit(&rib_stat->hca->state_lock);
		return (RDMA_FAILED);
	}
	status = rib_ping_srv(addr_type, raddr, &hca);
	rw_exit(&rib_stat->hca->state_lock);

	if (status == RDMA_SUCCESS) {
		*handle = (void *)hca;
		return (RDMA_SUCCESS);
	} else {
		*handle = NULL;
		DTRACE_PROBE(rpcib__i__pingfailed);
		return (RDMA_FAILED);
	}
}

/* Client side qp creation */
static rdma_stat
rib_clnt_create_chan(rib_hca_t *hca, struct netbuf *raddr, rib_qp_t **qp)
{
	rib_qp_t	*kqp = NULL;
	CONN		*conn;
	rdma_clnt_cred_ctrl_t *cc_info;

	ASSERT(qp != NULL);
	*qp = NULL;

	kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP);
	conn = qptoc(kqp);
	kqp->hca = hca;
	kqp->rdmaconn.c_rdmamod = &rib_mod;
	kqp->rdmaconn.c_private = (caddr_t)kqp;

	kqp->mode = RIB_CLIENT;
	kqp->chan_flags = IBT_BLOCKING;
	conn->c_raddr.buf = kmem_alloc(raddr->len, KM_SLEEP);
	bcopy(raddr->buf, conn->c_raddr.buf, raddr->len);
	conn->c_raddr.len = conn->c_raddr.maxlen = raddr->len;
	/*
	 * Initialize
	 */
	cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
	mutex_init(&kqp->replylist_lock, NULL, MUTEX_DRIVER, hca->iblock);
	mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
	mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
	cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock);
	/*
	 * Initialize the client credit control
	 * portion of the rdmaconn struct.
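	 * The granted and in-flight operation counts start at zero and
	 * are updated as the server grants credits and requests are
	 * issued.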
	 */
	kqp->rdmaconn.c_cc_type = RDMA_CC_CLNT;
	cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc;
	cc_info->clnt_cc_granted_ops = 0;
	cc_info->clnt_cc_in_flight_ops = 0;
	cv_init(&cc_info->clnt_cc_cv, NULL, CV_DEFAULT, NULL);

	*qp = kqp;
	return (RDMA_SUCCESS);
}

/* Server side qp creation */
static rdma_stat
rib_svc_create_chan(rib_hca_t *hca, caddr_t q, uint8_t port, rib_qp_t **qp)
{
	rib_qp_t	*kqp = NULL;
	ibt_chan_sizes_t	chan_sizes;
	ibt_rc_chan_alloc_args_t	qp_attr;
	ibt_status_t		ibt_status;
	rdma_srv_cred_ctrl_t *cc_info;

	*qp = NULL;

	kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP);
	kqp->hca = hca;
	kqp->port_num = port;
	kqp->rdmaconn.c_rdmamod = &rib_mod;
	kqp->rdmaconn.c_private = (caddr_t)kqp;

	/*
	 * Create the qp handle
	 */
	bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t));
	qp_attr.rc_scq = hca->svc_scq->rib_cq_hdl;
	qp_attr.rc_rcq = hca->svc_rcq->rib_cq_hdl;
	qp_attr.rc_pd = hca->pd_hdl;
	qp_attr.rc_hca_port_num = port;
	qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX;
	qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX;
	qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE;
	qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE;
	qp_attr.rc_clone_chan = NULL;
	qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR;
	qp_attr.rc_flags = IBT_WR_SIGNALED;

	rw_enter(&hca->state_lock, RW_READER);
	if (hca->state != HCA_DETACHED) {
		ibt_status = ibt_alloc_rc_channel(hca->hca_hdl,
		    IBT_ACHAN_NO_FLAGS, &qp_attr, &kqp->qp_hdl,
		    &chan_sizes);
	} else {
		rw_exit(&hca->state_lock);
		goto fail;
	}
	rw_exit(&hca->state_lock);

	if (ibt_status != IBT_SUCCESS) {
		DTRACE_PROBE1(rpcib__i_svccreatechanfail,
		    int, ibt_status);
		goto fail;
	}

	kqp->mode = RIB_SERVER;
	kqp->chan_flags = IBT_BLOCKING;
	kqp->q = q;	/* server ONLY */

	cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&kqp->replylist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
	mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
	mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
	mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
	cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock);
	/*
	 * Set the private data area to qp to be used in callbacks
	 */
	ibt_set_chan_private(kqp->qp_hdl, (void *)kqp);
	kqp->rdmaconn.c_state = C_CONNECTED;

	/*
	 * Initialize the server credit control
	 * portion of the rdmaconn struct.
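	 * The server starts out having granted preposted_rbufs
	 * (RDMA_BUFS_GRANT) receive buffers to the client.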
	 */
	kqp->rdmaconn.c_cc_type = RDMA_CC_SRV;
	cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_srv_cc;
	cc_info->srv_cc_buffers_granted = preposted_rbufs;
	cc_info->srv_cc_cur_buffers_used = 0;
	cc_info->srv_cc_posted = preposted_rbufs;

	*qp = kqp;

	return (RDMA_SUCCESS);
fail:
	if (kqp)
		kmem_free(kqp, sizeof (rib_qp_t));

	return (RDMA_FAILED);
}

/* ARGSUSED */
ibt_cm_status_t
rib_clnt_cm_handler(void *clnt_hdl, ibt_cm_event_t *event,
    ibt_cm_return_args_t *ret_args, void *priv_data,
    ibt_priv_data_len_t len)
{
	rpcib_state_t *ribstat;
	rib_hca_t *hca;

	ribstat = (rpcib_state_t *)clnt_hdl;
	hca = (rib_hca_t *)ribstat->hca;

	switch (event->cm_type) {

	/* got a connection close event */
	case IBT_CM_EVENT_CONN_CLOSED:
	{
		CONN	*conn;
		rib_qp_t *qp;

		/* check reason why connection was closed */
		switch (event->cm_event.closed) {
		case IBT_CM_CLOSED_DREP_RCVD:
		case IBT_CM_CLOSED_DREQ_TIMEOUT:
		case IBT_CM_CLOSED_DUP:
		case IBT_CM_CLOSED_ABORT:
		case IBT_CM_CLOSED_ALREADY:
			/*
			 * These cases indicate the local end initiated
			 * the closing of the channel. Nothing to do here.
			 */
			break;
		default:
			/*
			 * Reason for CONN_CLOSED event must be one of
			 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD
			 * or IBT_CM_CLOSED_STALE. These indicate cases where
			 * the remote end is closing the channel. In these
			 * cases free the channel and transition to error
			 * state
			 */
			qp = ibt_get_chan_private(event->cm_channel);
			conn = qptoc(qp);
			mutex_enter(&conn->c_lock);
			if (conn->c_state == C_DISCONN_PEND) {
				mutex_exit(&conn->c_lock);
				break;
			}

			conn->c_state = C_ERROR_CONN;

			/*
			 * Free the rc_channel. Channel has already
			 * transitioned to ERROR state and WRs have been
			 * FLUSHED_ERR already.
			 */
			(void) ibt_free_channel(qp->qp_hdl);
			qp->qp_hdl = NULL;

			/*
			 * Free the conn if c_ref is down to 0 already
			 */
			if (conn->c_ref == 0) {
				/*
				 * Remove from list and free conn
				 */
				conn->c_state = C_DISCONN_PEND;
				mutex_exit(&conn->c_lock);
				(void) rib_disconnect_channel(conn,
				    &hca->cl_conn_list);
			} else {
				mutex_exit(&conn->c_lock);
			}
#ifdef DEBUG
			if (rib_debug)
				cmn_err(CE_NOTE, "rib_clnt_cm_handler: "
				    "(CONN_CLOSED) channel disconnected");
#endif
			break;
		}
		break;
	}
	default:
		break;
	}
	return (IBT_CM_ACCEPT);
}

/* Check server ib address */
rdma_stat
rib_chk_srv_ibaddr(struct netbuf *raddr,
    int addr_type, ibt_path_info_t *path, ibt_ip_addr_t *s_ip,
    ibt_ip_addr_t *d_ip)
{
	struct sockaddr_in	*sin4;
	struct sockaddr_in6	*sin6;
	ibt_status_t		ibt_status;
	ibt_ip_path_attr_t	ipattr;
	uint8_t			npaths = 0;
	ibt_path_ip_src_t	srcip;

	ASSERT(raddr->buf != NULL);

	(void) bzero(path, sizeof (ibt_path_info_t));

	switch (addr_type) {
	case AF_INET:
		sin4 = (struct sockaddr_in *)raddr->buf;
		d_ip->family = AF_INET;
		d_ip->un.ip4addr = sin4->sin_addr.s_addr;
		break;

	case AF_INET6:
		sin6 = (struct sockaddr_in6 *)raddr->buf;
		d_ip->family = AF_INET6;
		d_ip->un.ip6addr = sin6->sin6_addr;
		break;

	default:
		return (RDMA_INVAL);
	}

	bzero(&ipattr, sizeof (ibt_ip_path_attr_t));
	bzero(&srcip, sizeof (ibt_path_ip_src_t));

	ipattr.ipa_dst_ip	= d_ip;
	ipattr.ipa_hca_guid	= rib_stat->hca->hca_guid;
	ipattr.ipa_ndst		= 1;
	ipattr.ipa_max_paths	= 1;
	npaths = 0;

	ibt_status = ibt_get_ip_paths(rib_stat->ibt_clnt_hdl,
	    IBT_PATH_NO_FLAGS,
	    &ipattr,
	    path,
	    &npaths,
	    &srcip);

	if (ibt_status != IBT_SUCCESS ||
	    npaths < 1 ||
	    path->pi_hca_guid != rib_stat->hca->hca_guid) {

		bzero(s_ip, sizeof (ibt_path_ip_src_t));
		return (RDMA_FAILED);
	}

	if (srcip.ip_primary.family == AF_INET) {
		s_ip->family = AF_INET;
		s_ip->un.ip4addr = srcip.ip_primary.un.ip4addr;
	} else {
		s_ip->family = AF_INET6;
		s_ip->un.ip6addr = srcip.ip_primary.un.ip6addr;
	}

	return (RDMA_SUCCESS);
}


/*
 * Connect to the server.
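 * The channel open is retried up to REFRESH_ATTEMPTS times when the
 * CM reports a stale connection on the passive side.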
 */
rdma_stat
rib_conn_to_srv(rib_hca_t *hca, rib_qp_t *qp, ibt_path_info_t *path,
    ibt_ip_addr_t *s_ip, ibt_ip_addr_t *d_ip)
{
	ibt_chan_open_args_t	chan_args;	/* channel args */
	ibt_chan_sizes_t	chan_sizes;
	ibt_rc_chan_alloc_args_t	qp_attr;
	ibt_status_t		ibt_status;
	ibt_rc_returns_t	ret_args;	/* conn reject info */
	int refresh = REFRESH_ATTEMPTS;	/* refresh if IBT_CM_CONN_STALE */
	ibt_ip_cm_info_t	ipcm_info;
	uint8_t cmp_ip_pvt[IBT_IP_HDR_PRIV_DATA_SZ];


	(void) bzero(&chan_args, sizeof (chan_args));
	(void) bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t));
	(void) bzero(&ipcm_info, sizeof (ibt_ip_cm_info_t));

	switch (ipcm_info.src_addr.family = s_ip->family) {
	case AF_INET:
		ipcm_info.src_addr.un.ip4addr = s_ip->un.ip4addr;
		break;
	case AF_INET6:
		ipcm_info.src_addr.un.ip6addr = s_ip->un.ip6addr;
		break;
	}

	switch (ipcm_info.dst_addr.family = d_ip->family) {
	case AF_INET:
		ipcm_info.dst_addr.un.ip4addr = d_ip->un.ip4addr;
		break;
	case AF_INET6:
		ipcm_info.dst_addr.un.ip6addr = d_ip->un.ip6addr;
		break;
	}

	ipcm_info.src_port = NFS_RDMA_PORT;

	ibt_status = ibt_format_ip_private_data(&ipcm_info,
	    IBT_IP_HDR_PRIV_DATA_SZ, cmp_ip_pvt);

	if (ibt_status != IBT_SUCCESS) {
		cmn_err(CE_WARN, "ibt_format_ip_private_data failed\n");
		return (-1);
	}

	qp_attr.rc_hca_port_num = path->pi_prim_cep_path.cep_hca_port_num;
	/* Alloc a RC channel */
	qp_attr.rc_scq = hca->clnt_scq->rib_cq_hdl;
	qp_attr.rc_rcq = hca->clnt_rcq->rib_cq_hdl;
	qp_attr.rc_pd = hca->pd_hdl;
	qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX;
	qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX;
	qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE;
	qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE;
	qp_attr.rc_clone_chan = NULL;
	qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR;
	qp_attr.rc_flags = IBT_WR_SIGNALED;

	path->pi_sid = ibt_get_ip_sid(IPPROTO_TCP, NFS_RDMA_PORT);
	chan_args.oc_path = path;
	chan_args.oc_cm_handler = rib_clnt_cm_handler;
	chan_args.oc_cm_clnt_private = (void *)rib_stat;
	chan_args.oc_rdma_ra_out = 4;
	chan_args.oc_rdma_ra_in = 4;
	chan_args.oc_path_retry_cnt = 2;
	chan_args.oc_path_rnr_retry_cnt = RNR_RETRIES;
	chan_args.oc_priv_data = cmp_ip_pvt;
	chan_args.oc_priv_data_len = IBT_IP_HDR_PRIV_DATA_SZ;

refresh:
	rw_enter(&hca->state_lock, RW_READER);
	if (hca->state != HCA_DETACHED) {
		ibt_status = ibt_alloc_rc_channel(hca->hca_hdl,
		    IBT_ACHAN_NO_FLAGS,
		    &qp_attr, &qp->qp_hdl,
		    &chan_sizes);
	} else {
		rw_exit(&hca->state_lock);
		return (RDMA_FAILED);
	}
	rw_exit(&hca->state_lock);

	if (ibt_status != IBT_SUCCESS) {
		DTRACE_PROBE1(rpcib__i_conntosrv,
		    int, ibt_status);
		return (RDMA_FAILED);
	}

	/* Connect to the Server */
	(void) bzero(&ret_args, sizeof (ret_args));
	mutex_enter(&qp->cb_lock);
	ibt_status = ibt_open_rc_channel(qp->qp_hdl, IBT_OCHAN_NO_FLAGS,
	    IBT_BLOCKING, &chan_args, &ret_args);
	if (ibt_status != IBT_SUCCESS) {
		DTRACE_PROBE2(rpcib__i_openrctosrv,
		    int, ibt_status, int, ret_args.rc_status);

		(void) ibt_free_channel(qp->qp_hdl);
		qp->qp_hdl = NULL;
		mutex_exit(&qp->cb_lock);
		if (refresh-- && ibt_status == IBT_CM_FAILURE &&
		    ret_args.rc_status == IBT_CM_CONN_STALE) {
			/*
			 * Got IBT_CM_CONN_STALE probably because of stale
			 * data on the passive end of a channel that existed
			 * prior to reboot. Retry establishing a channel
			 * REFRESH_ATTEMPTS times, during which time the
			 * stale conditions on the server might clear up.
			 */
			goto refresh;
		}
		return (RDMA_FAILED);
	}
	mutex_exit(&qp->cb_lock);
	/*
	 * Set the private data area to qp to be used in callbacks
	 */
	ibt_set_chan_private(qp->qp_hdl, (void *)qp);
	return (RDMA_SUCCESS);
}

rdma_stat
rib_ping_srv(int addr_type, struct netbuf *raddr, rib_hca_t **hca)
{
	struct sockaddr_in	*sin4, *sin4arr;
	struct sockaddr_in6	*sin6, *sin6arr;
	uint_t			nif, nif4, nif6, i;
	ibt_path_info_t		path;
	ibt_status_t		ibt_status;
	uint8_t			num_paths_p;
	ibt_ip_path_attr_t	ipattr;
	ibt_ip_addr_t		dstip;
	ibt_path_ip_src_t	srcip;


	*hca = NULL;

	ASSERT(raddr->buf != NULL);

	bzero(&path, sizeof (ibt_path_info_t));
	bzero(&ipattr, sizeof (ibt_ip_path_attr_t));
	bzero(&srcip, sizeof (ibt_path_ip_src_t));

	/* Obtain the source IP addresses for the system */
	nif = rpcib_get_number_interfaces();
	sin4arr = (struct sockaddr_in *)
	    kmem_zalloc(sizeof (struct sockaddr_in) * nif, KM_SLEEP);
	sin6arr = (struct sockaddr_in6 *)
	    kmem_zalloc(sizeof (struct sockaddr_in6) * nif, KM_SLEEP);

	(void) rpcib_get_ib_addresses(sin4arr, sin6arr, &nif4, &nif6);

	/* Are there really any IB interfaces available */
	if (nif4 == 0 && nif6 == 0) {
		kmem_free(sin4arr, sizeof (struct sockaddr_in) * nif);
		kmem_free(sin6arr, sizeof (struct sockaddr_in6) * nif);
		return (RDMA_FAILED);
	}

	/* Prep the destination address */
	switch (addr_type) {
	case AF_INET:
		sin4 = (struct sockaddr_in *)raddr->buf;
		dstip.family = AF_INET;
		dstip.un.ip4addr = sin4->sin_addr.s_addr;

		for (i = 0; i < nif4; i++) {
			num_paths_p = 0;
			ipattr.ipa_dst_ip	= &dstip;
			ipattr.ipa_hca_guid	= rib_stat->hca->hca_guid;
			ipattr.ipa_ndst		= 1;
			ipattr.ipa_max_paths	= 1;
			ipattr.ipa_src_ip.family = dstip.family;
			ipattr.ipa_src_ip.un.ip4addr =
			    sin4arr[i].sin_addr.s_addr;

			ibt_status = ibt_get_ip_paths(rib_stat->ibt_clnt_hdl,
			    IBT_PATH_NO_FLAGS,
			    &ipattr,
			    &path,
			    &num_paths_p,
			    &srcip);
			if (ibt_status == IBT_SUCCESS &&
			    num_paths_p != 0 &&
			    path.pi_hca_guid == rib_stat->hca->hca_guid) {
				*hca = rib_stat->hca;

				kmem_free(sin4arr,
				    sizeof (struct sockaddr_in) * nif);
				kmem_free(sin6arr,
				    sizeof (struct sockaddr_in6) * nif);

				return (RDMA_SUCCESS);
			}
		}
		break;

	case AF_INET6:
		sin6 = (struct sockaddr_in6 *)raddr->buf;
		dstip.family = AF_INET6;
		dstip.un.ip6addr = sin6->sin6_addr;

		for (i = 0; i < nif6; i++) {
			num_paths_p = 0;
			ipattr.ipa_dst_ip	= &dstip;
			ipattr.ipa_hca_guid	= rib_stat->hca->hca_guid;
			ipattr.ipa_ndst		= 1;
			ipattr.ipa_max_paths	= 1;
			ipattr.ipa_src_ip.family = dstip.family;
			ipattr.ipa_src_ip.un.ip6addr = sin6arr[i].sin6_addr;

			ibt_status = ibt_get_ip_paths(rib_stat->ibt_clnt_hdl,
			    IBT_PATH_NO_FLAGS,
			    &ipattr,
			    &path,
			    &num_paths_p,
			    &srcip);
			if (ibt_status == IBT_SUCCESS &&
			    num_paths_p != 0 &&
			    path.pi_hca_guid == rib_stat->hca->hca_guid) {
				*hca = rib_stat->hca;

				kmem_free(sin4arr,
				    sizeof (struct sockaddr_in) * nif);
				kmem_free(sin6arr,
				    sizeof (struct sockaddr_in6) * nif);

				return (RDMA_SUCCESS);
			}
		}

		break;

	default:
		kmem_free(sin4arr, sizeof (struct sockaddr_in) * nif);
		kmem_free(sin6arr, sizeof (struct sockaddr_in6) * nif);
		return (RDMA_INVAL);
	}

	kmem_free(sin4arr, sizeof (struct sockaddr_in) * nif);
	kmem_free(sin6arr, sizeof (struct sockaddr_in6) * nif);
	return (RDMA_FAILED);
}

/*
 * Close channel, remove from connection list and
 * free up resources allocated for that channel.
 */
rdma_stat
rib_disconnect_channel(CONN *conn, rib_conn_list_t *conn_list)
{
	rib_qp_t	*qp = ctoqp(conn);
	rib_hca_t	*hca;

	/*
	 * c_ref == 0 and connection is in C_DISCONN_PEND
	 */
	hca = qp->hca;
	if (conn_list != NULL)
		(void) rib_rm_conn(conn, conn_list);

	if (qp->qp_hdl != NULL) {
		/*
		 * If the channel has not been established,
		 * ibt_flush_channel is called to flush outstanding WRs
		 * on the Qs. Otherwise, ibt_close_rc_channel() is
		 * called.  The channel is then freed.
		 */
		if (conn_list != NULL)
			(void) ibt_close_rc_channel(qp->qp_hdl,
			    IBT_BLOCKING, NULL, 0, NULL, NULL, 0);
		else
			(void) ibt_flush_channel(qp->qp_hdl);

		mutex_enter(&qp->posted_rbufs_lock);
		while (qp->n_posted_rbufs)
			cv_wait(&qp->posted_rbufs_cv, &qp->posted_rbufs_lock);
		mutex_exit(&qp->posted_rbufs_lock);
		(void) ibt_free_channel(qp->qp_hdl);
		qp->qp_hdl = NULL;
	}

	ASSERT(qp->rdlist == NULL);

	if (qp->replylist != NULL) {
		(void) rib_rem_replylist(qp);
	}

	cv_destroy(&qp->cb_conn_cv);
	cv_destroy(&qp->posted_rbufs_cv);
	mutex_destroy(&qp->cb_lock);

	mutex_destroy(&qp->replylist_lock);
	mutex_destroy(&qp->posted_rbufs_lock);
	mutex_destroy(&qp->rdlist_lock);

	cv_destroy(&conn->c_cv);
	mutex_destroy(&conn->c_lock);

	if (conn->c_raddr.buf != NULL) {
		kmem_free(conn->c_raddr.buf, conn->c_raddr.len);
	}
	if (conn->c_laddr.buf != NULL) {
		kmem_free(conn->c_laddr.buf, conn->c_laddr.len);
	}

	/*
	 * Credit control cleanup.
	 */
	if (qp->rdmaconn.c_cc_type == RDMA_CC_CLNT) {
		rdma_clnt_cred_ctrl_t *cc_info;
		cc_info = &qp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc;
		cv_destroy(&cc_info->clnt_cc_cv);
	}

	kmem_free(qp, sizeof (rib_qp_t));

	/*
	 * If HCA has been DETACHED and the srv/clnt_conn_list is NULL,
	 * then the hca is no longer being used.
	 */
	if (conn_list != NULL) {
		rw_enter(&hca->state_lock, RW_READER);
		if (hca->state == HCA_DETACHED) {
			rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
			if (hca->srv_conn_list.conn_hd == NULL) {
				rw_enter(&hca->cl_conn_list.conn_lock,
				    RW_READER);

				if (hca->cl_conn_list.conn_hd == NULL) {
					mutex_enter(&hca->inuse_lock);
					hca->inuse = FALSE;
					cv_signal(&hca->cb_cv);
					mutex_exit(&hca->inuse_lock);
				}
				rw_exit(&hca->cl_conn_list.conn_lock);
			}
			rw_exit(&hca->srv_conn_list.conn_lock);
		}
		rw_exit(&hca->state_lock);
	}

	return (RDMA_SUCCESS);
}

/*
 * Wait for send completion notification. Only on receiving a
 * notification, be it a successful or error completion, is the
 * send_wid freed.
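 * The wait is bounded by SEND_WAIT_TIME seconds; on a timeout (or, on
 * the client, an interrupt) the send_wid is left for the send
 * completion handler to free.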
2050 */ 2051 static rdma_stat 2052 rib_sendwait(rib_qp_t *qp, struct send_wid *wd) 2053 { 2054 clock_t timout, cv_wait_ret; 2055 rdma_stat error = RDMA_SUCCESS; 2056 int i; 2057 2058 /* 2059 * Wait for send to complete 2060 */ 2061 ASSERT(wd != NULL); 2062 mutex_enter(&wd->sendwait_lock); 2063 if (wd->status == (uint_t)SEND_WAIT) { 2064 timout = drv_usectohz(SEND_WAIT_TIME * 1000000) + 2065 ddi_get_lbolt(); 2066 2067 if (qp->mode == RIB_SERVER) { 2068 while ((cv_wait_ret = cv_timedwait(&wd->wait_cv, 2069 &wd->sendwait_lock, timout)) > 0 && 2070 wd->status == (uint_t)SEND_WAIT) 2071 ; 2072 switch (cv_wait_ret) { 2073 case -1: /* timeout */ 2074 DTRACE_PROBE(rpcib__i__srvsendwait__timeout); 2075 2076 wd->cv_sig = 0; /* no signal needed */ 2077 error = RDMA_TIMEDOUT; 2078 break; 2079 default: /* got send completion */ 2080 break; 2081 } 2082 } else { 2083 while ((cv_wait_ret = cv_timedwait_sig(&wd->wait_cv, 2084 &wd->sendwait_lock, timout)) > 0 && 2085 wd->status == (uint_t)SEND_WAIT) 2086 ; 2087 switch (cv_wait_ret) { 2088 case -1: /* timeout */ 2089 DTRACE_PROBE(rpcib__i__clntsendwait__timeout); 2090 2091 wd->cv_sig = 0; /* no signal needed */ 2092 error = RDMA_TIMEDOUT; 2093 break; 2094 case 0: /* interrupted */ 2095 DTRACE_PROBE(rpcib__i__clntsendwait__intr); 2096 2097 wd->cv_sig = 0; /* no signal needed */ 2098 error = RDMA_INTR; 2099 break; 2100 default: /* got send completion */ 2101 break; 2102 } 2103 } 2104 } 2105 2106 if (wd->status != (uint_t)SEND_WAIT) { 2107 /* got send completion */ 2108 if (wd->status != RDMA_SUCCESS) { 2109 error = wd->status; 2110 if (wd->status != RDMA_CONNLOST) 2111 error = RDMA_FAILED; 2112 } 2113 for (i = 0; i < wd->nsbufs; i++) { 2114 rib_rbuf_free(qptoc(qp), SEND_BUFFER, 2115 (void *)(uintptr_t)wd->sbufaddr[i]); 2116 } 2117 mutex_exit(&wd->sendwait_lock); 2118 (void) rib_free_sendwait(wd); 2119 } else { 2120 mutex_exit(&wd->sendwait_lock); 2121 } 2122 return (error); 2123 } 2124 2125 static struct send_wid * 2126 rib_init_sendwait(uint32_t xid, int cv_sig, rib_qp_t *qp) 2127 { 2128 struct send_wid *wd; 2129 2130 wd = kmem_zalloc(sizeof (struct send_wid), KM_SLEEP); 2131 wd->xid = xid; 2132 wd->cv_sig = cv_sig; 2133 wd->qp = qp; 2134 cv_init(&wd->wait_cv, NULL, CV_DEFAULT, NULL); 2135 mutex_init(&wd->sendwait_lock, NULL, MUTEX_DRIVER, NULL); 2136 wd->status = (uint_t)SEND_WAIT; 2137 2138 return (wd); 2139 } 2140 2141 static int 2142 rib_free_sendwait(struct send_wid *wdesc) 2143 { 2144 cv_destroy(&wdesc->wait_cv); 2145 mutex_destroy(&wdesc->sendwait_lock); 2146 kmem_free(wdesc, sizeof (*wdesc)); 2147 2148 return (0); 2149 } 2150 2151 static rdma_stat 2152 rib_rem_rep(rib_qp_t *qp, struct reply *rep) 2153 { 2154 mutex_enter(&qp->replylist_lock); 2155 if (rep != NULL) { 2156 (void) rib_remreply(qp, rep); 2157 mutex_exit(&qp->replylist_lock); 2158 return (RDMA_SUCCESS); 2159 } 2160 mutex_exit(&qp->replylist_lock); 2161 return (RDMA_FAILED); 2162 } 2163 2164 /* 2165 * Send buffers are freed here only in case of error in posting 2166 * on QP. If the post succeeded, the send buffers are freed upon 2167 * send completion in rib_sendwait() or in the scq_handler. 
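* The send_wid used for the post is handed back to the caller through the swid argument.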
2168 */ 2169 rdma_stat 2170 rib_send_and_wait(CONN *conn, struct clist *cl, uint32_t msgid, 2171 int send_sig, int cv_sig, caddr_t *swid) 2172 { 2173 struct send_wid *wdesc; 2174 struct clist *clp; 2175 ibt_status_t ibt_status = IBT_SUCCESS; 2176 rdma_stat ret = RDMA_SUCCESS; 2177 ibt_send_wr_t tx_wr; 2178 int i, nds; 2179 ibt_wr_ds_t sgl[DSEG_MAX]; 2180 uint_t total_msg_size; 2181 rib_qp_t *qp; 2182 2183 qp = ctoqp(conn); 2184 2185 ASSERT(cl != NULL); 2186 2187 bzero(&tx_wr, sizeof (ibt_send_wr_t)); 2188 2189 nds = 0; 2190 total_msg_size = 0; 2191 clp = cl; 2192 while (clp != NULL) { 2193 if (nds >= DSEG_MAX) { 2194 DTRACE_PROBE(rpcib__i__sendandwait_dsegmax_exceeded); 2195 return (RDMA_FAILED); 2196 } 2197 sgl[nds].ds_va = clp->w.c_saddr; 2198 sgl[nds].ds_key = clp->c_smemhandle.mrc_lmr; /* lkey */ 2199 sgl[nds].ds_len = clp->c_len; 2200 total_msg_size += clp->c_len; 2201 clp = clp->c_next; 2202 nds++; 2203 } 2204 2205 if (send_sig) { 2206 /* Set SEND_SIGNAL flag. */ 2207 tx_wr.wr_flags = IBT_WR_SEND_SIGNAL; 2208 wdesc = rib_init_sendwait(msgid, cv_sig, qp); 2209 *swid = (caddr_t)wdesc; 2210 } else { 2211 tx_wr.wr_flags = IBT_WR_NO_FLAGS; 2212 wdesc = rib_init_sendwait(msgid, 0, qp); 2213 *swid = (caddr_t)wdesc; 2214 } 2215 wdesc->nsbufs = nds; 2216 for (i = 0; i < nds; i++) { 2217 wdesc->sbufaddr[i] = sgl[i].ds_va; 2218 } 2219 2220 tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc; 2221 tx_wr.wr_opcode = IBT_WRC_SEND; 2222 tx_wr.wr_trans = IBT_RC_SRV; 2223 tx_wr.wr_nds = nds; 2224 tx_wr.wr_sgl = sgl; 2225 2226 mutex_enter(&conn->c_lock); 2227 if (conn->c_state == C_CONNECTED) { 2228 ibt_status = ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL); 2229 } 2230 if (conn->c_state != C_CONNECTED || 2231 ibt_status != IBT_SUCCESS) { 2232 if (conn->c_state != C_DISCONN_PEND) 2233 conn->c_state = C_ERROR_CONN; 2234 mutex_exit(&conn->c_lock); 2235 for (i = 0; i < nds; i++) { 2236 rib_rbuf_free(conn, SEND_BUFFER, 2237 (void *)(uintptr_t)wdesc->sbufaddr[i]); 2238 } 2239 2240 (void) rib_free_sendwait(wdesc); 2241 2242 return (RDMA_CONNLOST); 2243 } 2244 mutex_exit(&conn->c_lock); 2245 2246 if (send_sig) { 2247 if (cv_sig) { 2248 /* 2249 * cv_wait for send to complete. 2250 * We can fail due to a timeout or signal or 2251 * unsuccessful send. 2252 */ 2253 ret = rib_sendwait(qp, wdesc); 2254 2255 return (ret); 2256 } 2257 } 2258 2259 return (RDMA_SUCCESS); 2260 } 2261 2262 2263 rdma_stat 2264 rib_send(CONN *conn, struct clist *cl, uint32_t msgid) 2265 { 2266 rdma_stat ret; 2267 caddr_t wd; 2268 2269 /* send-wait & cv_signal */ 2270 ret = rib_send_and_wait(conn, cl, msgid, 1, 1, &wd); 2271 return (ret); 2272 } 2273 2274 /* 2275 * Server interface (svc_rdma_ksend). 2276 * Send RPC reply and wait for RDMA_DONE. 
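* The reply is matched with the client's RDMA_DONE by msgid on the qp's rdma_done list, and the wait is bounded by REPLY_WAIT_TIME.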
2277 */ 2278 rdma_stat 2279 rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid) 2280 { 2281 rdma_stat ret = RDMA_SUCCESS; 2282 struct rdma_done_list *rd; 2283 clock_t timout, cv_wait_ret; 2284 caddr_t *wid = NULL; 2285 rib_qp_t *qp = ctoqp(conn); 2286 2287 mutex_enter(&qp->rdlist_lock); 2288 rd = rdma_done_add(qp, msgid); 2289 2290 /* No cv_signal (whether send-wait or no-send-wait) */ 2291 ret = rib_send_and_wait(conn, cl, msgid, 1, 0, wid); 2292 2293 if (ret != RDMA_SUCCESS) { 2294 rdma_done_rm(qp, rd); 2295 } else { 2296 /* 2297 * Wait for RDMA_DONE from remote end 2298 */ 2299 timout = 2300 drv_usectohz(REPLY_WAIT_TIME * 1000000) + ddi_get_lbolt(); 2301 cv_wait_ret = cv_timedwait(&rd->rdma_done_cv, 2302 &qp->rdlist_lock, 2303 timout); 2304 2305 rdma_done_rm(qp, rd); 2306 2307 if (cv_wait_ret < 0) { 2308 ret = RDMA_TIMEDOUT; 2309 } 2310 } 2311 2312 mutex_exit(&qp->rdlist_lock); 2313 return (ret); 2314 } 2315 2316 static struct recv_wid * 2317 rib_create_wid(rib_qp_t *qp, ibt_wr_ds_t *sgl, uint32_t msgid) 2318 { 2319 struct recv_wid *rwid; 2320 2321 rwid = kmem_zalloc(sizeof (struct recv_wid), KM_SLEEP); 2322 rwid->xid = msgid; 2323 rwid->addr = sgl->ds_va; 2324 rwid->qp = qp; 2325 2326 return (rwid); 2327 } 2328 2329 static void 2330 rib_free_wid(struct recv_wid *rwid) 2331 { 2332 kmem_free(rwid, sizeof (struct recv_wid)); 2333 } 2334 2335 rdma_stat 2336 rib_clnt_post(CONN* conn, struct clist *cl, uint32_t msgid) 2337 { 2338 rib_qp_t *qp = ctoqp(conn); 2339 struct clist *clp = cl; 2340 struct reply *rep; 2341 struct recv_wid *rwid; 2342 int nds; 2343 ibt_wr_ds_t sgl[DSEG_MAX]; 2344 ibt_recv_wr_t recv_wr; 2345 rdma_stat ret; 2346 ibt_status_t ibt_status; 2347 2348 /* 2349 * rdma_clnt_postrecv uses RECV_BUFFER. 2350 */ 2351 2352 nds = 0; 2353 while (cl != NULL) { 2354 if (nds >= DSEG_MAX) { 2355 ret = RDMA_FAILED; 2356 goto done; 2357 } 2358 sgl[nds].ds_va = cl->w.c_saddr; 2359 sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */ 2360 sgl[nds].ds_len = cl->c_len; 2361 cl = cl->c_next; 2362 nds++; 2363 } 2364 2365 if (nds != 1) { 2366 ret = RDMA_FAILED; 2367 goto done; 2368 } 2369 2370 bzero(&recv_wr, sizeof (ibt_recv_wr_t)); 2371 recv_wr.wr_nds = nds; 2372 recv_wr.wr_sgl = sgl; 2373 2374 rwid = rib_create_wid(qp, &sgl[0], msgid); 2375 if (rwid) { 2376 recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)rwid; 2377 } else { 2378 ret = RDMA_NORESOURCE; 2379 goto done; 2380 } 2381 rep = rib_addreplylist(qp, msgid); 2382 if (!rep) { 2383 rib_free_wid(rwid); 2384 ret = RDMA_NORESOURCE; 2385 goto done; 2386 } 2387 2388 mutex_enter(&conn->c_lock); 2389 2390 if (conn->c_state == C_CONNECTED) { 2391 ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL); 2392 } 2393 2394 if (conn->c_state != C_CONNECTED || 2395 ibt_status != IBT_SUCCESS) { 2396 if (conn->c_state != C_DISCONN_PEND) 2397 conn->c_state = C_ERROR_CONN; 2398 mutex_exit(&conn->c_lock); 2399 rib_free_wid(rwid); 2400 (void) rib_rem_rep(qp, rep); 2401 ret = RDMA_CONNLOST; 2402 goto done; 2403 } 2404 mutex_exit(&conn->c_lock); 2405 return (RDMA_SUCCESS); 2406 2407 done: 2408 while (clp != NULL) { 2409 rib_rbuf_free(conn, RECV_BUFFER, 2410 (void *)(uintptr_t)clp->w.c_saddr3); 2411 clp = clp->c_next; 2412 } 2413 return (ret); 2414 } 2415 2416 rdma_stat 2417 rib_svc_post(CONN* conn, struct clist *cl) 2418 { 2419 rib_qp_t *qp = ctoqp(conn); 2420 struct svc_recv *s_recvp; 2421 int nds; 2422 ibt_wr_ds_t sgl[DSEG_MAX]; 2423 ibt_recv_wr_t recv_wr; 2424 ibt_status_t ibt_status; 2425 2426 nds = 0; 2427 while (cl != NULL) { 2428 if (nds >= DSEG_MAX) { 
2429 return (RDMA_FAILED); 2430 } 2431 sgl[nds].ds_va = cl->w.c_saddr; 2432 sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */ 2433 sgl[nds].ds_len = cl->c_len; 2434 cl = cl->c_next; 2435 nds++; 2436 } 2437 2438 if (nds != 1) { 2439 rib_rbuf_free(conn, RECV_BUFFER, 2440 (caddr_t)(uintptr_t)sgl[0].ds_va); 2441 2442 return (RDMA_FAILED); 2443 } 2444 2445 bzero(&recv_wr, sizeof (ibt_recv_wr_t)); 2446 recv_wr.wr_nds = nds; 2447 recv_wr.wr_sgl = sgl; 2448 2449 s_recvp = rib_init_svc_recv(qp, &sgl[0]); 2450 /* Use s_recvp's addr as wr id */ 2451 recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)s_recvp; 2452 mutex_enter(&conn->c_lock); 2453 if (conn->c_state == C_CONNECTED) { 2454 ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL); 2455 } 2456 if (conn->c_state != C_CONNECTED || 2457 ibt_status != IBT_SUCCESS) { 2458 if (conn->c_state != C_DISCONN_PEND) 2459 conn->c_state = C_ERROR_CONN; 2460 mutex_exit(&conn->c_lock); 2461 rib_rbuf_free(conn, RECV_BUFFER, 2462 (caddr_t)(uintptr_t)sgl[0].ds_va); 2463 (void) rib_free_svc_recv(s_recvp); 2464 2465 return (RDMA_CONNLOST); 2466 } 2467 mutex_exit(&conn->c_lock); 2468 2469 return (RDMA_SUCCESS); 2470 } 2471 2472 /* Client */ 2473 rdma_stat 2474 rib_post_resp(CONN* conn, struct clist *cl, uint32_t msgid) 2475 { 2476 2477 return (rib_clnt_post(conn, cl, msgid)); 2478 } 2479 2480 /* Client */ 2481 rdma_stat 2482 rib_post_resp_remove(CONN* conn, uint32_t msgid) 2483 { 2484 rib_qp_t *qp = ctoqp(conn); 2485 struct reply *rep; 2486 2487 mutex_enter(&qp->replylist_lock); 2488 for (rep = qp->replylist; rep != NULL; rep = rep->next) { 2489 if (rep->xid == msgid) { 2490 if (rep->vaddr_cq) { 2491 rib_rbuf_free(conn, RECV_BUFFER, 2492 (caddr_t)(uintptr_t)rep->vaddr_cq); 2493 } 2494 (void) rib_remreply(qp, rep); 2495 break; 2496 } 2497 } 2498 mutex_exit(&qp->replylist_lock); 2499 2500 return (RDMA_SUCCESS); 2501 } 2502 2503 /* Server */ 2504 rdma_stat 2505 rib_post_recv(CONN *conn, struct clist *cl) 2506 { 2507 rib_qp_t *qp = ctoqp(conn); 2508 2509 if (rib_svc_post(conn, cl) == RDMA_SUCCESS) { 2510 mutex_enter(&qp->posted_rbufs_lock); 2511 qp->n_posted_rbufs++; 2512 mutex_exit(&qp->posted_rbufs_lock); 2513 return (RDMA_SUCCESS); 2514 } 2515 return (RDMA_FAILED); 2516 } 2517 2518 /* 2519 * Client side only interface to "recv" the rpc reply buf 2520 * posted earlier by rib_post_resp(conn, cl, msgid). 2521 */ 2522 rdma_stat 2523 rib_recv(CONN *conn, struct clist **clp, uint32_t msgid) 2524 { 2525 struct reply *rep = NULL; 2526 clock_t timout, cv_wait_ret; 2527 rdma_stat ret = RDMA_SUCCESS; 2528 rib_qp_t *qp = ctoqp(conn); 2529 2530 /* 2531 * Find the reply structure for this msgid 2532 */ 2533 mutex_enter(&qp->replylist_lock); 2534 2535 for (rep = qp->replylist; rep != NULL; rep = rep->next) { 2536 if (rep->xid == msgid) 2537 break; 2538 } 2539 2540 if (rep != NULL) { 2541 /* 2542 * If message not yet received, wait. 
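* The wait is interruptible (cv_timedwait_sig) and bounded by REPLY_WAIT_TIME.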
2543 */ 2544 if (rep->status == (uint_t)REPLY_WAIT) { 2545 timout = ddi_get_lbolt() + 2546 drv_usectohz(REPLY_WAIT_TIME * 1000000); 2547 2548 while ((cv_wait_ret = cv_timedwait_sig(&rep->wait_cv, 2549 &qp->replylist_lock, timout)) > 0 && 2550 rep->status == (uint_t)REPLY_WAIT) 2551 ; 2552 2553 switch (cv_wait_ret) { 2554 case -1: /* timeout */ 2555 ret = RDMA_TIMEDOUT; 2556 break; 2557 case 0: 2558 ret = RDMA_INTR; 2559 break; 2560 default: 2561 break; 2562 } 2563 } 2564 2565 if (rep->status == RDMA_SUCCESS) { 2566 struct clist *cl = NULL; 2567 2568 /* 2569 * Got message successfully 2570 */ 2571 clist_add(&cl, 0, rep->bytes_xfer, NULL, 2572 (caddr_t)(uintptr_t)rep->vaddr_cq, NULL, NULL); 2573 *clp = cl; 2574 } else { 2575 if (rep->status != (uint_t)REPLY_WAIT) { 2576 /* 2577 * Got error in reply message. Free 2578 * recv buffer here. 2579 */ 2580 ret = rep->status; 2581 rib_rbuf_free(conn, RECV_BUFFER, 2582 (caddr_t)(uintptr_t)rep->vaddr_cq); 2583 } 2584 } 2585 (void) rib_remreply(qp, rep); 2586 } else { 2587 /* 2588 * No matching reply structure found for given msgid on the 2589 * reply wait list. 2590 */ 2591 ret = RDMA_INVAL; 2592 DTRACE_PROBE(rpcib__i__nomatchxid2); 2593 } 2594 2595 /* 2596 * Done. 2597 */ 2598 mutex_exit(&qp->replylist_lock); 2599 return (ret); 2600 } 2601 2602 /* 2603 * RDMA write a buffer to the remote address. 2604 */ 2605 rdma_stat 2606 rib_write(CONN *conn, struct clist *cl, int wait) 2607 { 2608 ibt_send_wr_t tx_wr; 2609 int cv_sig; 2610 int i; 2611 ibt_wr_ds_t sgl[DSEG_MAX]; 2612 struct send_wid *wdesc; 2613 ibt_status_t ibt_status; 2614 rdma_stat ret = RDMA_SUCCESS; 2615 rib_qp_t *qp = ctoqp(conn); 2616 uint64_t n_writes = 0; 2617 bool_t force_wait = FALSE; 2618 2619 if (cl == NULL) { 2620 return (RDMA_FAILED); 2621 } 2622 2623 2624 while ((cl != NULL)) { 2625 if (cl->c_len > 0) { 2626 bzero(&tx_wr, sizeof (ibt_send_wr_t)); 2627 tx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->u.c_daddr; 2628 tx_wr.wr.rc.rcwr.rdma.rdma_rkey = 2629 cl->c_dmemhandle.mrc_rmr; /* rkey */ 2630 sgl[0].ds_va = cl->w.c_saddr; 2631 sgl[0].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */ 2632 sgl[0].ds_len = cl->c_len; 2633 2634 if (wait) { 2635 tx_wr.wr_flags = IBT_WR_SEND_SIGNAL; 2636 cv_sig = 1; 2637 } else { 2638 if (n_writes > max_unsignaled_rws) { 2639 n_writes = 0; 2640 force_wait = TRUE; 2641 tx_wr.wr_flags = IBT_WR_SEND_SIGNAL; 2642 cv_sig = 1; 2643 } else { 2644 tx_wr.wr_flags = IBT_WR_NO_FLAGS; 2645 cv_sig = 0; 2646 } 2647 } 2648 2649 wdesc = rib_init_sendwait(0, cv_sig, qp); 2650 tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc; 2651 tx_wr.wr_opcode = IBT_WRC_RDMAW; 2652 tx_wr.wr_trans = IBT_RC_SRV; 2653 tx_wr.wr_nds = 1; 2654 tx_wr.wr_sgl = sgl; 2655 2656 mutex_enter(&conn->c_lock); 2657 if (conn->c_state == C_CONNECTED) { 2658 ibt_status = 2659 ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL); 2660 } 2661 if (conn->c_state != C_CONNECTED || 2662 ibt_status != IBT_SUCCESS) { 2663 if (conn->c_state != C_DISCONN_PEND) 2664 conn->c_state = C_ERROR_CONN; 2665 mutex_exit(&conn->c_lock); 2666 (void) rib_free_sendwait(wdesc); 2667 return (RDMA_CONNLOST); 2668 } 2669 mutex_exit(&conn->c_lock); 2670 2671 /* 2672 * Wait for send to complete 2673 */ 2674 if (wait || force_wait) { 2675 force_wait = FALSE; 2676 ret = rib_sendwait(qp, wdesc); 2677 if (ret != 0) { 2678 return (ret); 2679 } 2680 } else { 2681 mutex_enter(&wdesc->sendwait_lock); 2682 for (i = 0; i < wdesc->nsbufs; i++) { 2683 rib_rbuf_free(qptoc(qp), SEND_BUFFER, 2684 (void *)(uintptr_t) 2685 wdesc->sbufaddr[i]); 2686 } 2687 
mutex_exit(&wdesc->sendwait_lock); 2688 (void) rib_free_sendwait(wdesc); 2689 } 2690 n_writes ++; 2691 } 2692 cl = cl->c_next; 2693 } 2694 return (RDMA_SUCCESS); 2695 } 2696 2697 /* 2698 * RDMA Read a buffer from the remote address. 2699 */ 2700 rdma_stat 2701 rib_read(CONN *conn, struct clist *cl, int wait) 2702 { 2703 ibt_send_wr_t rx_wr; 2704 int cv_sig; 2705 int i; 2706 ibt_wr_ds_t sgl; 2707 struct send_wid *wdesc; 2708 ibt_status_t ibt_status = IBT_SUCCESS; 2709 rdma_stat ret = RDMA_SUCCESS; 2710 rib_qp_t *qp = ctoqp(conn); 2711 2712 if (cl == NULL) { 2713 return (RDMA_FAILED); 2714 } 2715 2716 while (cl != NULL) { 2717 bzero(&rx_wr, sizeof (ibt_send_wr_t)); 2718 /* 2719 * Remote address is at the head chunk item in list. 2720 */ 2721 rx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->w.c_saddr; 2722 rx_wr.wr.rc.rcwr.rdma.rdma_rkey = cl->c_smemhandle.mrc_rmr; 2723 2724 sgl.ds_va = cl->u.c_daddr; 2725 sgl.ds_key = cl->c_dmemhandle.mrc_lmr; /* lkey */ 2726 sgl.ds_len = cl->c_len; 2727 2728 if (wait) { 2729 rx_wr.wr_flags = IBT_WR_SEND_SIGNAL; 2730 cv_sig = 1; 2731 } else { 2732 rx_wr.wr_flags = IBT_WR_NO_FLAGS; 2733 cv_sig = 0; 2734 } 2735 2736 wdesc = rib_init_sendwait(0, cv_sig, qp); 2737 rx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc; 2738 rx_wr.wr_opcode = IBT_WRC_RDMAR; 2739 rx_wr.wr_trans = IBT_RC_SRV; 2740 rx_wr.wr_nds = 1; 2741 rx_wr.wr_sgl = &sgl; 2742 2743 mutex_enter(&conn->c_lock); 2744 if (conn->c_state == C_CONNECTED) { 2745 ibt_status = ibt_post_send(qp->qp_hdl, &rx_wr, 1, NULL); 2746 } 2747 if (conn->c_state != C_CONNECTED || 2748 ibt_status != IBT_SUCCESS) { 2749 if (conn->c_state != C_DISCONN_PEND) 2750 conn->c_state = C_ERROR_CONN; 2751 mutex_exit(&conn->c_lock); 2752 (void) rib_free_sendwait(wdesc); 2753 return (RDMA_CONNLOST); 2754 } 2755 mutex_exit(&conn->c_lock); 2756 2757 /* 2758 * Wait for send to complete if this is the 2759 * last item in the list. 2760 */ 2761 if (wait && cl->c_next == NULL) { 2762 ret = rib_sendwait(qp, wdesc); 2763 if (ret != 0) { 2764 return (ret); 2765 } 2766 } else { 2767 mutex_enter(&wdesc->sendwait_lock); 2768 for (i = 0; i < wdesc->nsbufs; i++) { 2769 rib_rbuf_free(qptoc(qp), SEND_BUFFER, 2770 (void *)(uintptr_t)wdesc->sbufaddr[i]); 2771 } 2772 mutex_exit(&wdesc->sendwait_lock); 2773 (void) rib_free_sendwait(wdesc); 2774 } 2775 cl = cl->c_next; 2776 } 2777 return (RDMA_SUCCESS); 2778 } 2779 2780 /* 2781 * rib_srv_cm_handler() 2782 * Connection Manager callback to handle RC connection requests. 2783 */ 2784 /* ARGSUSED */ 2785 static ibt_cm_status_t 2786 rib_srv_cm_handler(void *any, ibt_cm_event_t *event, 2787 ibt_cm_return_args_t *ret_args, void *priv_data, 2788 ibt_priv_data_len_t len) 2789 { 2790 queue_t *q; 2791 rib_qp_t *qp; 2792 rpcib_state_t *ribstat; 2793 rib_hca_t *hca; 2794 rdma_stat status = RDMA_SUCCESS; 2795 int i; 2796 struct clist cl; 2797 rdma_buf_t rdbuf = {0}; 2798 void *buf = NULL; 2799 CONN *conn; 2800 ibt_ip_cm_info_t ipinfo; 2801 struct sockaddr_in *s; 2802 struct sockaddr_in6 *s6; 2803 int sin_size = sizeof (struct sockaddr_in); 2804 int in_size = sizeof (struct in_addr); 2805 int sin6_size = sizeof (struct sockaddr_in6); 2806 2807 ASSERT(any != NULL); 2808 ASSERT(event != NULL); 2809 2810 ribstat = (rpcib_state_t *)any; 2811 hca = (rib_hca_t *)ribstat->hca; 2812 ASSERT(hca != NULL); 2813 2814 /* got a connection request */ 2815 switch (event->cm_type) { 2816 case IBT_CM_EVENT_REQ_RCV: 2817 /* 2818 * If the plugin is in the NO_ACCEPT state, bail out. 
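* The listener has been stopped (see rib_listen_stop()), so the connection request is rejected.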
2819 */ 2820 mutex_enter(&plugin_state_lock); 2821 if (plugin_state == NO_ACCEPT) { 2822 mutex_exit(&plugin_state_lock); 2823 return (IBT_CM_REJECT); 2824 } 2825 mutex_exit(&plugin_state_lock); 2826 2827 /* 2828 * Need to send a MRA MAD to CM so that it does not 2829 * timeout on us. 2830 */ 2831 (void) ibt_cm_delay(IBT_CM_DELAY_REQ, event->cm_session_id, 2832 event->cm_event.req.req_timeout * 8, NULL, 0); 2833 2834 mutex_enter(&rib_stat->open_hca_lock); 2835 q = rib_stat->q; 2836 mutex_exit(&rib_stat->open_hca_lock); 2837 2838 status = rib_svc_create_chan(hca, (caddr_t)q, 2839 event->cm_event.req.req_prim_hca_port, &qp); 2840 2841 if (status) { 2842 return (IBT_CM_REJECT); 2843 } 2844 2845 ret_args->cm_ret.rep.cm_channel = qp->qp_hdl; 2846 ret_args->cm_ret.rep.cm_rdma_ra_out = 4; 2847 ret_args->cm_ret.rep.cm_rdma_ra_in = 4; 2848 ret_args->cm_ret.rep.cm_rnr_retry_cnt = RNR_RETRIES; 2849 2850 /* 2851 * Pre-posts RECV buffers 2852 */ 2853 conn = qptoc(qp); 2854 for (i = 0; i < preposted_rbufs; i++) { 2855 bzero(&rdbuf, sizeof (rdbuf)); 2856 rdbuf.type = RECV_BUFFER; 2857 buf = rib_rbuf_alloc(conn, &rdbuf); 2858 if (buf == NULL) { 2859 (void) rib_disconnect_channel(conn, NULL); 2860 return (IBT_CM_REJECT); 2861 } 2862 2863 bzero(&cl, sizeof (cl)); 2864 cl.w.c_saddr3 = (caddr_t)rdbuf.addr; 2865 cl.c_len = rdbuf.len; 2866 cl.c_smemhandle.mrc_lmr = 2867 rdbuf.handle.mrc_lmr; /* lkey */ 2868 cl.c_next = NULL; 2869 status = rib_post_recv(conn, &cl); 2870 if (status != RDMA_SUCCESS) { 2871 (void) rib_disconnect_channel(conn, NULL); 2872 return (IBT_CM_REJECT); 2873 } 2874 } 2875 (void) rib_add_connlist(conn, &hca->srv_conn_list); 2876 2877 /* 2878 * Get the address translation 2879 */ 2880 rw_enter(&hca->state_lock, RW_READER); 2881 if (hca->state == HCA_DETACHED) { 2882 rw_exit(&hca->state_lock); 2883 return (IBT_CM_REJECT); 2884 } 2885 rw_exit(&hca->state_lock); 2886 2887 bzero(&ipinfo, sizeof (ibt_ip_cm_info_t)); 2888 2889 if (ibt_get_ip_data(event->cm_priv_data_len, 2890 event->cm_priv_data, 2891 &ipinfo) != IBT_SUCCESS) { 2892 2893 return (IBT_CM_REJECT); 2894 } 2895 2896 switch (ipinfo.src_addr.family) { 2897 case AF_INET: 2898 2899 conn->c_raddr.maxlen = 2900 conn->c_raddr.len = sin_size; 2901 conn->c_raddr.buf = kmem_zalloc(sin_size, KM_SLEEP); 2902 2903 s = (struct sockaddr_in *)conn->c_raddr.buf; 2904 s->sin_family = AF_INET; 2905 2906 bcopy((void *)&ipinfo.src_addr.un.ip4addr, 2907 &s->sin_addr, in_size); 2908 2909 break; 2910 2911 case AF_INET6: 2912 2913 conn->c_raddr.maxlen = 2914 conn->c_raddr.len = sin6_size; 2915 conn->c_raddr.buf = kmem_zalloc(sin6_size, KM_SLEEP); 2916 2917 s6 = (struct sockaddr_in6 *)conn->c_raddr.buf; 2918 s6->sin6_family = AF_INET6; 2919 bcopy((void *)&ipinfo.src_addr.un.ip6addr, 2920 &s6->sin6_addr, 2921 sizeof (struct in6_addr)); 2922 2923 break; 2924 2925 default: 2926 return (IBT_CM_REJECT); 2927 } 2928 2929 break; 2930 2931 case IBT_CM_EVENT_CONN_CLOSED: 2932 { 2933 CONN *conn; 2934 rib_qp_t *qp; 2935 2936 switch (event->cm_event.closed) { 2937 case IBT_CM_CLOSED_DREP_RCVD: 2938 case IBT_CM_CLOSED_DREQ_TIMEOUT: 2939 case IBT_CM_CLOSED_DUP: 2940 case IBT_CM_CLOSED_ABORT: 2941 case IBT_CM_CLOSED_ALREADY: 2942 /* 2943 * These cases indicate the local end initiated 2944 * the closing of the channel. Nothing to do here. 2945 */ 2946 break; 2947 default: 2948 /* 2949 * Reason for CONN_CLOSED event must be one of 2950 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD 2951 * or IBT_CM_CLOSED_STALE. 
These indicate cases where 2952 * the remote end is closing the channel. In these 2953 * cases, free the channel and transition to error 2954 * state 2955 */ 2956 qp = ibt_get_chan_private(event->cm_channel); 2957 conn = qptoc(qp); 2958 mutex_enter(&conn->c_lock); 2959 if (conn->c_state == C_DISCONN_PEND) { 2960 mutex_exit(&conn->c_lock); 2961 break; 2962 } 2963 conn->c_state = C_ERROR_CONN; 2964 2965 /* 2966 * Free the rc_channel. Channel has already 2967 * transitioned to ERROR state and WRs have been 2968 * FLUSHED_ERR already. 2969 */ 2970 (void) ibt_free_channel(qp->qp_hdl); 2971 qp->qp_hdl = NULL; 2972 2973 /* 2974 * Free the conn if c_ref goes down to 0 2975 */ 2976 if (conn->c_ref == 0) { 2977 /* 2978 * Remove from list and free conn 2979 */ 2980 conn->c_state = C_DISCONN_PEND; 2981 mutex_exit(&conn->c_lock); 2982 (void) rib_disconnect_channel(conn, 2983 &hca->srv_conn_list); 2984 } else { 2985 mutex_exit(&conn->c_lock); 2986 } 2987 DTRACE_PROBE(rpcib__i__srvcm_chandisconnect); 2988 break; 2989 } 2990 break; 2991 } 2992 case IBT_CM_EVENT_CONN_EST: 2993 /* 2994 * RTU received, hence connection established. 2995 */ 2996 if (rib_debug > 1) 2997 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 2998 "(CONN_EST) channel established"); 2999 break; 3000 3001 default: 3002 if (rib_debug > 2) { 3003 /* Let CM handle the following events. */ 3004 if (event->cm_type == IBT_CM_EVENT_REP_RCV) { 3005 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 3006 "server recv'ed IBT_CM_EVENT_REP_RCV\n"); 3007 } else if (event->cm_type == IBT_CM_EVENT_LAP_RCV) { 3008 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 3009 "server recv'ed IBT_CM_EVENT_LAP_RCV\n"); 3010 } else if (event->cm_type == IBT_CM_EVENT_MRA_RCV) { 3011 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 3012 "server recv'ed IBT_CM_EVENT_MRA_RCV\n"); 3013 } else if (event->cm_type == IBT_CM_EVENT_APR_RCV) { 3014 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 3015 "server recv'ed IBT_CM_EVENT_APR_RCV\n"); 3016 } else if (event->cm_type == IBT_CM_EVENT_FAILURE) { 3017 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 3018 "server recv'ed IBT_CM_EVENT_FAILURE\n"); 3019 } 3020 } 3021 return (IBT_CM_DEFAULT); 3022 3023 /* accept all other CM messages (i.e.
let the CM handle them) */ 3025 return (IBT_CM_ACCEPT); 3026 } 3027 3028 static rdma_stat 3029 rib_register_service(rib_hca_t *hca, int service_type) 3030 { 3031 ibt_srv_desc_t sdesc; 3032 ibt_hca_portinfo_t *port_infop; 3033 ib_svc_id_t srv_id; 3034 ibt_srv_hdl_t srv_hdl; 3035 uint_t port_size; 3036 uint_t pki, i, num_ports, nbinds; 3037 ibt_status_t ibt_status; 3038 rib_service_t *new_service; 3039 ib_pkey_t pkey; 3040 3041 /* 3042 * Query all ports for the given HCA 3043 */ 3044 rw_enter(&hca->state_lock, RW_READER); 3045 if (hca->state != HCA_DETACHED) { 3046 ibt_status = ibt_query_hca_ports(hca->hca_hdl, 0, &port_infop, 3047 &num_ports, &port_size); 3048 rw_exit(&hca->state_lock); 3049 } else { 3050 rw_exit(&hca->state_lock); 3051 return (RDMA_FAILED); 3052 } 3053 if (ibt_status != IBT_SUCCESS) { 3054 return (RDMA_FAILED); 3055 } 3056 3057 DTRACE_PROBE1(rpcib__i__regservice_numports, 3058 int, num_ports); 3059 3060 for (i = 0; i < num_ports; i++) { 3061 if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) { 3062 DTRACE_PROBE1(rpcib__i__regservice__portinactive, 3063 int, i+1); 3064 } else if (port_infop[i].p_linkstate == IBT_PORT_ACTIVE) { 3065 DTRACE_PROBE1(rpcib__i__regservice__portactive, 3066 int, i+1); 3067 } 3068 } 3069 3070 /* 3071 * Get all the IP addresses on this system to register the 3072 * given "service type" on all DNS recognized IP addrs. 3073 * Each service type such as NFS will have all the system's 3074 * IP addresses as its different names. For now the only 3075 * type of service we support in RPCIB is NFS. 3076 */ 3077 rw_enter(&hca->service_list_lock, RW_WRITER); 3078 /* 3079 * Start registering and binding the service on 3080 * the active ports of this HCA. 3081 */ 3082 nbinds = 0; 3083 new_service = NULL; 3084 3085 /* 3086 * We use IP addresses as the service names for 3087 * service registration. Register each of them 3088 * with CM to obtain a svc_id and svc_hdl. We do not 3089 * register the service with the machine's loopback address.
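* (In the code below a single ibt_register_service() call, keyed by the IP service ID for NFS_RDMA_PORT, obtains the service handle; that handle is then bound on each active port/pkey.)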
3090 */ 3091 (void) bzero(&srv_id, sizeof (ib_svc_id_t)); 3092 (void) bzero(&srv_hdl, sizeof (ibt_srv_hdl_t)); 3093 (void) bzero(&sdesc, sizeof (ibt_srv_desc_t)); 3094 3095 sdesc.sd_handler = rib_srv_cm_handler; 3096 sdesc.sd_flags = 0; 3097 ibt_status = ibt_register_service(hca->ibt_clnt_hdl, 3098 &sdesc, ibt_get_ip_sid(IPPROTO_TCP, NFS_RDMA_PORT), 3099 1, &srv_hdl, &srv_id); 3100 3101 for (i = 0; i < num_ports; i++) { 3102 if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) 3103 continue; 3104 3105 for (pki = 0; pki < port_infop[i].p_pkey_tbl_sz; pki++) { 3106 pkey = port_infop[i].p_pkey_tbl[pki]; 3107 if ((pkey & IBSRM_HB) && 3108 (pkey != IB_PKEY_INVALID_FULL)) { 3109 3110 /* 3111 * Allocate and prepare a service entry 3112 */ 3113 new_service = 3114 kmem_zalloc(1 * sizeof (rib_service_t), 3115 KM_SLEEP); 3116 3117 new_service->srv_type = service_type; 3118 new_service->srv_hdl = srv_hdl; 3119 new_service->srv_next = NULL; 3120 3121 ibt_status = ibt_bind_service(srv_hdl, 3122 port_infop[i].p_sgid_tbl[0], 3123 NULL, rib_stat, NULL); 3124 3125 DTRACE_PROBE1(rpcib__i__regservice__bindres, 3126 int, ibt_status); 3127 3128 if (ibt_status != IBT_SUCCESS) { 3129 kmem_free(new_service, 3130 sizeof (rib_service_t)); 3131 new_service = NULL; 3132 continue; 3133 } 3134 3135 /* 3136 * Add to the service list for this HCA 3137 */ 3138 new_service->srv_next = hca->service_list; 3139 hca->service_list = new_service; 3140 new_service = NULL; 3141 nbinds++; 3142 } 3143 } 3144 } 3145 rw_exit(&hca->service_list_lock); 3146 3147 ibt_free_portinfo(port_infop, port_size); 3148 3149 if (nbinds == 0) { 3150 return (RDMA_FAILED); 3151 } else { 3152 /* 3153 * Put this plugin into accept state, since at least 3154 * one registration was successful. 3155 */ 3156 mutex_enter(&plugin_state_lock); 3157 plugin_state = ACCEPT; 3158 mutex_exit(&plugin_state_lock); 3159 return (RDMA_SUCCESS); 3160 } 3161 } 3162 3163 void 3164 rib_listen(struct rdma_svc_data *rd) 3165 { 3166 rdma_stat status = RDMA_SUCCESS; 3167 3168 rd->active = 0; 3169 rd->err_code = RDMA_FAILED; 3170 3171 /* 3172 * First check if a hca is still attached 3173 */ 3174 rw_enter(&rib_stat->hca->state_lock, RW_READER); 3175 if (rib_stat->hca->state != HCA_INITED) { 3176 rw_exit(&rib_stat->hca->state_lock); 3177 return; 3178 } 3179 rw_exit(&rib_stat->hca->state_lock); 3180 3181 rib_stat->q = &rd->q; 3182 /* 3183 * Right now the only service type is NFS. Hence force feed this 3184 * value. Ideally, to communicate the service type, it should be 3185 * passed down in rdma_svc_data. 3186 */ 3187 rib_stat->service_type = NFS; 3188 status = rib_register_service(rib_stat->hca, NFS); 3189 if (status != RDMA_SUCCESS) { 3190 rd->err_code = status; 3191 return; 3192 } 3193 /* 3194 * Service is active on an HCA; check rd->err_code for more 3195 * detailed errors. 3196 */ 3197 rd->active = 1; 3198 rd->err_code = status; 3199 } 3200 3201 /* XXXX */ 3202 /* ARGSUSED */ 3203 static void 3204 rib_listen_stop(struct rdma_svc_data *svcdata) 3205 { 3206 rib_hca_t *hca; 3207 3208 /* 3209 * KRPC called the RDMATF to stop the listeners; this means we 3210 * stop sending incoming or received requests to the KRPC master 3211 * transport handle for RDMA-IB. This also means that the 3212 * master transport handle, responsible for us, is going away.
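* Move the plugin to the NO_ACCEPT state, then close the server-side channels and stop the services registered on the HCA.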
3213 */ 3214 mutex_enter(&plugin_state_lock); 3215 plugin_state = NO_ACCEPT; 3216 if (svcdata != NULL) 3217 svcdata->active = 0; 3218 mutex_exit(&plugin_state_lock); 3219 3220 /* 3221 * First check if a hca is still attached 3222 */ 3223 hca = rib_stat->hca; 3224 rw_enter(&hca->state_lock, RW_READER); 3225 if (hca->state != HCA_INITED) { 3226 rw_exit(&hca->state_lock); 3227 return; 3228 } 3229 rib_close_channels(&hca->srv_conn_list); 3230 rib_stop_services(hca); 3231 rw_exit(&hca->state_lock); 3232 } 3233 3234 /* 3235 * Traverse the HCA's service list to unbind and deregister services. 3236 * Instead of unbinding the service for a service handle by 3237 * calling ibt_unbind_service() for each port/pkey, we unbind 3238 * all the services for the service handle by making only one 3239 * call to ibt_unbind_all_services(). Then, we deregister the 3240 * service for the service handle. 3241 * 3242 * When traversing the entries in service_list, we compare the 3243 * srv_hdl of the current entry with that of the next. If they 3244 * are different or if the next entry is NULL, the current entry 3245 * marks the last binding of the service handle. In this case, 3246 * call ibt_unbind_all_services() and deregister the service for 3247 * the service handle. If they are the same, the current and the 3248 * next entries are bound to the same service handle. In this 3249 * case, move on to the next entry. 3250 */ 3251 static void 3252 rib_stop_services(rib_hca_t *hca) 3253 { 3254 rib_service_t *srv_list, *to_remove; 3255 3256 /* 3257 * unbind and deregister the services for this service type. 3258 * Right now there is only one service type. In future it will 3259 * be passed down to this function. 3260 */ 3261 rw_enter(&hca->service_list_lock, RW_WRITER); 3262 srv_list = hca->service_list; 3263 while (srv_list != NULL) { 3264 to_remove = srv_list; 3265 srv_list = to_remove->srv_next; 3266 if (srv_list == NULL || bcmp(to_remove->srv_hdl, 3267 srv_list->srv_hdl, sizeof (ibt_srv_hdl_t))) { 3268 3269 (void) ibt_unbind_all_services(to_remove->srv_hdl); 3270 (void) ibt_deregister_service(hca->ibt_clnt_hdl, 3271 to_remove->srv_hdl); 3272 } 3273 3274 kmem_free(to_remove, sizeof (rib_service_t)); 3275 } 3276 hca->service_list = NULL; 3277 rw_exit(&hca->service_list_lock); 3278 } 3279 3280 static struct svc_recv * 3281 rib_init_svc_recv(rib_qp_t *qp, ibt_wr_ds_t *sgl) 3282 { 3283 struct svc_recv *recvp; 3284 3285 recvp = kmem_zalloc(sizeof (struct svc_recv), KM_SLEEP); 3286 recvp->vaddr = sgl->ds_va; 3287 recvp->qp = qp; 3288 recvp->bytes_xfer = 0; 3289 return (recvp); 3290 } 3291 3292 static int 3293 rib_free_svc_recv(struct svc_recv *recvp) 3294 { 3295 kmem_free(recvp, sizeof (*recvp)); 3296 3297 return (0); 3298 } 3299 3300 static struct reply * 3301 rib_addreplylist(rib_qp_t *qp, uint32_t msgid) 3302 { 3303 struct reply *rep; 3304 3305 3306 rep = kmem_zalloc(sizeof (struct reply), KM_NOSLEEP); 3307 if (rep == NULL) { 3308 DTRACE_PROBE(rpcib__i__addrreply__nomem); 3309 return (NULL); 3310 } 3311 rep->xid = msgid; 3312 rep->vaddr_cq = NULL; 3313 rep->bytes_xfer = 0; 3314 rep->status = (uint_t)REPLY_WAIT; 3315 rep->prev = NULL; 3316 cv_init(&rep->wait_cv, NULL, CV_DEFAULT, NULL); 3317 3318 mutex_enter(&qp->replylist_lock); 3319 if (qp->replylist) { 3320 rep->next = qp->replylist; 3321 qp->replylist->prev = rep; 3322 } 3323 qp->rep_list_size++; 3324 3325 DTRACE_PROBE1(rpcib__i__addrreply__listsize, 3326 int, qp->rep_list_size); 3327 3328 qp->replylist = rep; 3329 mutex_exit(&qp->replylist_lock); 3330 3331 return 
(rep); 3332 } 3333 3334 static rdma_stat 3335 rib_rem_replylist(rib_qp_t *qp) 3336 { 3337 struct reply *r, *n; 3338 3339 mutex_enter(&qp->replylist_lock); 3340 for (r = qp->replylist; r != NULL; r = n) { 3341 n = r->next; 3342 (void) rib_remreply(qp, r); 3343 } 3344 mutex_exit(&qp->replylist_lock); 3345 3346 return (RDMA_SUCCESS); 3347 } 3348 3349 static int 3350 rib_remreply(rib_qp_t *qp, struct reply *rep) 3351 { 3352 3353 ASSERT(MUTEX_HELD(&qp->replylist_lock)); 3354 if (rep->prev) { 3355 rep->prev->next = rep->next; 3356 } 3357 if (rep->next) { 3358 rep->next->prev = rep->prev; 3359 } 3360 if (qp->replylist == rep) 3361 qp->replylist = rep->next; 3362 3363 cv_destroy(&rep->wait_cv); 3364 qp->rep_list_size--; 3365 3366 DTRACE_PROBE1(rpcib__i__remreply__listsize, 3367 int, qp->rep_list_size); 3368 3369 kmem_free(rep, sizeof (*rep)); 3370 3371 return (0); 3372 } 3373 3374 rdma_stat 3375 rib_registermem(CONN *conn, caddr_t adsp, caddr_t buf, uint_t buflen, 3376 struct mrc *buf_handle) 3377 { 3378 ibt_mr_hdl_t mr_hdl = NULL; /* memory region handle */ 3379 ibt_mr_desc_t mr_desc; /* vaddr, lkey, rkey */ 3380 rdma_stat status; 3381 rib_hca_t *hca = (ctoqp(conn))->hca; 3382 3383 /* 3384 * Note: ALL buffer pools use the same memory type RDMARW. 3385 */ 3386 status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc); 3387 if (status == RDMA_SUCCESS) { 3388 buf_handle->mrc_linfo = (uintptr_t)mr_hdl; 3389 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey; 3390 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey; 3391 } else { 3392 buf_handle->mrc_linfo = NULL; 3393 buf_handle->mrc_lmr = 0; 3394 buf_handle->mrc_rmr = 0; 3395 } 3396 return (status); 3397 } 3398 3399 static rdma_stat 3400 rib_reg_mem(rib_hca_t *hca, caddr_t adsp, caddr_t buf, uint_t size, 3401 ibt_mr_flags_t spec, 3402 ibt_mr_hdl_t *mr_hdlp, ibt_mr_desc_t *mr_descp) 3403 { 3404 ibt_mr_attr_t mem_attr; 3405 ibt_status_t ibt_status; 3406 mem_attr.mr_vaddr = (uintptr_t)buf; 3407 mem_attr.mr_len = (ib_msglen_t)size; 3408 mem_attr.mr_as = (struct as *)(caddr_t)adsp; 3409 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE | 3410 IBT_MR_ENABLE_REMOTE_READ | IBT_MR_ENABLE_REMOTE_WRITE | 3411 IBT_MR_ENABLE_WINDOW_BIND | spec; 3412 3413 rw_enter(&hca->state_lock, RW_READER); 3414 if (hca->state == HCA_INITED) { 3415 ibt_status = ibt_register_mr(hca->hca_hdl, hca->pd_hdl, 3416 &mem_attr, mr_hdlp, mr_descp); 3417 rw_exit(&hca->state_lock); 3418 } else { 3419 rw_exit(&hca->state_lock); 3420 return (RDMA_FAILED); 3421 } 3422 3423 if (ibt_status != IBT_SUCCESS) { 3424 return (RDMA_FAILED); 3425 } 3426 return (RDMA_SUCCESS); 3427 } 3428 3429 rdma_stat 3430 rib_registermemsync(CONN *conn, caddr_t adsp, caddr_t buf, uint_t buflen, 3431 struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle, void *lrc) 3432 { 3433 ibt_mr_hdl_t mr_hdl = NULL; /* memory region handle */ 3434 rib_lrc_entry_t *l; 3435 ibt_mr_desc_t mr_desc; /* vaddr, lkey, rkey */ 3436 rdma_stat status; 3437 rib_hca_t *hca = (ctoqp(conn))->hca; 3438 3439 /* 3440 * Non-coherent memory registration. 
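* If a long reply cache entry (lrc) is supplied and already registered, its existing handles are reused; otherwise the whole cache buffer is registered below.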
3441 */ 3442 l = (rib_lrc_entry_t *)lrc; 3443 if (l) { 3444 if (l->registered) { 3445 buf_handle->mrc_linfo = 3446 (uintptr_t)l->lrc_mhandle.mrc_linfo; 3447 buf_handle->mrc_lmr = 3448 (uint32_t)l->lrc_mhandle.mrc_lmr; 3449 buf_handle->mrc_rmr = 3450 (uint32_t)l->lrc_mhandle.mrc_rmr; 3451 *sync_handle = (RIB_SYNCMEM_HANDLE) 3452 (uintptr_t)l->lrc_mhandle.mrc_linfo; 3453 return (RDMA_SUCCESS); 3454 } else { 3455 /* Always register the whole buffer */ 3456 buf = (caddr_t)l->lrc_buf; 3457 buflen = l->lrc_len; 3458 } 3459 } 3460 status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc); 3461 3462 if (status == RDMA_SUCCESS) { 3463 if (l) { 3464 l->lrc_mhandle.mrc_linfo = (uintptr_t)mr_hdl; 3465 l->lrc_mhandle.mrc_lmr = (uint32_t)mr_desc.md_lkey; 3466 l->lrc_mhandle.mrc_rmr = (uint32_t)mr_desc.md_rkey; 3467 l->registered = TRUE; 3468 } 3469 buf_handle->mrc_linfo = (uintptr_t)mr_hdl; 3470 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey; 3471 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey; 3472 *sync_handle = (RIB_SYNCMEM_HANDLE)mr_hdl; 3473 } else { 3474 buf_handle->mrc_linfo = NULL; 3475 buf_handle->mrc_lmr = 0; 3476 buf_handle->mrc_rmr = 0; 3477 } 3478 return (status); 3479 } 3480 3481 /* ARGSUSED */ 3482 rdma_stat 3483 rib_deregistermem(CONN *conn, caddr_t buf, struct mrc buf_handle) 3484 { 3485 rib_hca_t *hca = (ctoqp(conn))->hca; 3486 /* 3487 * Allow memory deregistration even if HCA is 3488 * getting detached. Need all outstanding 3489 * memory registrations to be deregistered 3490 * before HCA_DETACH_EVENT can be accepted. 3491 */ 3492 (void) ibt_deregister_mr(hca->hca_hdl, 3493 (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo); 3494 return (RDMA_SUCCESS); 3495 } 3496 3497 /* ARGSUSED */ 3498 rdma_stat 3499 rib_deregistermemsync(CONN *conn, caddr_t buf, struct mrc buf_handle, 3500 RIB_SYNCMEM_HANDLE sync_handle, void *lrc) 3501 { 3502 rib_lrc_entry_t *l; 3503 l = (rib_lrc_entry_t *)lrc; 3504 if (l) 3505 if (l->registered) 3506 return (RDMA_SUCCESS); 3507 3508 (void) rib_deregistermem(conn, buf, buf_handle); 3509 3510 return (RDMA_SUCCESS); 3511 } 3512 3513 /* ARGSUSED */ 3514 rdma_stat 3515 rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle, caddr_t buf, 3516 int len, int cpu) 3517 { 3518 ibt_status_t status; 3519 rib_hca_t *hca = (ctoqp(conn))->hca; 3520 ibt_mr_sync_t mr_segment; 3521 3522 mr_segment.ms_handle = (ibt_mr_hdl_t)shandle; 3523 mr_segment.ms_vaddr = (ib_vaddr_t)(uintptr_t)buf; 3524 mr_segment.ms_len = (ib_memlen_t)len; 3525 if (cpu) { 3526 /* make incoming data visible to memory */ 3527 mr_segment.ms_flags = IBT_SYNC_WRITE; 3528 } else { 3529 /* make memory changes visible to IO */ 3530 mr_segment.ms_flags = IBT_SYNC_READ; 3531 } 3532 rw_enter(&hca->state_lock, RW_READER); 3533 if (hca->state == HCA_INITED) { 3534 status = ibt_sync_mr(hca->hca_hdl, &mr_segment, 1); 3535 rw_exit(&hca->state_lock); 3536 } else { 3537 rw_exit(&hca->state_lock); 3538 return (RDMA_FAILED); 3539 } 3540 3541 if (status == IBT_SUCCESS) 3542 return (RDMA_SUCCESS); 3543 else { 3544 return (RDMA_FAILED); 3545 } 3546 } 3547 3548 /* 3549 * XXXX ???? 3550 */ 3551 static rdma_stat 3552 rib_getinfo(rdma_info_t *info) 3553 { 3554 /* 3555 * XXXX Hack! 
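* The values below are hard-coded placeholders.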
3556 */ 3557 info->addrlen = 16; 3558 info->mts = 1000000; 3559 info->mtu = 1000000; 3560 3561 return (RDMA_SUCCESS); 3562 } 3563 3564 rib_bufpool_t * 3565 rib_rbufpool_create(rib_hca_t *hca, int ptype, int num) 3566 { 3567 rib_bufpool_t *rbp = NULL; 3568 bufpool_t *bp = NULL; 3569 caddr_t buf; 3570 ibt_mr_attr_t mem_attr; 3571 ibt_status_t ibt_status; 3572 int i, j; 3573 3574 rbp = (rib_bufpool_t *)kmem_zalloc(sizeof (rib_bufpool_t), KM_SLEEP); 3575 3576 bp = (bufpool_t *)kmem_zalloc(sizeof (bufpool_t) + 3577 num * sizeof (void *), KM_SLEEP); 3578 3579 mutex_init(&bp->buflock, NULL, MUTEX_DRIVER, hca->iblock); 3580 bp->numelems = num; 3581 3582 3583 switch (ptype) { 3584 case SEND_BUFFER: 3585 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE; 3586 bp->rsize = RPC_MSG_SZ; 3587 break; 3588 case RECV_BUFFER: 3589 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE; 3590 bp->rsize = RPC_BUF_SIZE; 3591 break; 3592 default: 3593 goto fail; 3594 } 3595 3596 /* 3597 * Register the pool. 3598 */ 3599 bp->bufsize = num * bp->rsize; 3600 bp->buf = kmem_zalloc(bp->bufsize, KM_SLEEP); 3601 rbp->mr_hdl = (ibt_mr_hdl_t *)kmem_zalloc(num * 3602 sizeof (ibt_mr_hdl_t), KM_SLEEP); 3603 rbp->mr_desc = (ibt_mr_desc_t *)kmem_zalloc(num * 3604 sizeof (ibt_mr_desc_t), KM_SLEEP); 3605 rw_enter(&hca->state_lock, RW_READER); 3606 3607 if (hca->state != HCA_INITED) { 3608 rw_exit(&hca->state_lock); 3609 goto fail; 3610 } 3611 3612 for (i = 0, buf = bp->buf; i < num; i++, buf += bp->rsize) { 3613 bzero(&rbp->mr_desc[i], sizeof (ibt_mr_desc_t)); 3614 mem_attr.mr_vaddr = (uintptr_t)buf; 3615 mem_attr.mr_len = (ib_msglen_t)bp->rsize; 3616 mem_attr.mr_as = NULL; 3617 ibt_status = ibt_register_mr(hca->hca_hdl, 3618 hca->pd_hdl, &mem_attr, 3619 &rbp->mr_hdl[i], 3620 &rbp->mr_desc[i]); 3621 if (ibt_status != IBT_SUCCESS) { 3622 for (j = 0; j < i; j++) { 3623 (void) ibt_deregister_mr(hca->hca_hdl, 3624 rbp->mr_hdl[j]); 3625 } 3626 rw_exit(&hca->state_lock); 3627 goto fail; 3628 } 3629 } 3630 rw_exit(&hca->state_lock); 3631 buf = (caddr_t)bp->buf; 3632 for (i = 0; i < num; i++, buf += bp->rsize) { 3633 bp->buflist[i] = (void *)buf; 3634 } 3635 bp->buffree = num - 1; /* no. of free buffers */ 3636 rbp->bpool = bp; 3637 3638 return (rbp); 3639 fail: 3640 if (bp) { 3641 if (bp->buf) 3642 kmem_free(bp->buf, bp->bufsize); 3643 kmem_free(bp, sizeof (bufpool_t) + num*sizeof (void *)); 3644 } 3645 if (rbp) { 3646 if (rbp->mr_hdl) 3647 kmem_free(rbp->mr_hdl, num*sizeof (ibt_mr_hdl_t)); 3648 if (rbp->mr_desc) 3649 kmem_free(rbp->mr_desc, num*sizeof (ibt_mr_desc_t)); 3650 kmem_free(rbp, sizeof (rib_bufpool_t)); 3651 } 3652 return (NULL); 3653 } 3654 3655 static void 3656 rib_rbufpool_deregister(rib_hca_t *hca, int ptype) 3657 { 3658 int i; 3659 rib_bufpool_t *rbp = NULL; 3660 bufpool_t *bp; 3661 3662 /* 3663 * Obtain pool address based on type of pool 3664 */ 3665 switch (ptype) { 3666 case SEND_BUFFER: 3667 rbp = hca->send_pool; 3668 break; 3669 case RECV_BUFFER: 3670 rbp = hca->recv_pool; 3671 break; 3672 default: 3673 return; 3674 } 3675 if (rbp == NULL) 3676 return; 3677 3678 bp = rbp->bpool; 3679 3680 /* 3681 * Deregister the pool memory and free it. 
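* (The buffer memory itself is released later, in rib_rbufpool_free().)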
3682 */ 3683 for (i = 0; i < bp->numelems; i++) { 3684 (void) ibt_deregister_mr(hca->hca_hdl, rbp->mr_hdl[i]); 3685 } 3686 } 3687 3688 static void 3689 rib_rbufpool_free(rib_hca_t *hca, int ptype) 3690 { 3691 3692 rib_bufpool_t *rbp = NULL; 3693 bufpool_t *bp; 3694 3695 /* 3696 * Obtain pool address based on type of pool 3697 */ 3698 switch (ptype) { 3699 case SEND_BUFFER: 3700 rbp = hca->send_pool; 3701 break; 3702 case RECV_BUFFER: 3703 rbp = hca->recv_pool; 3704 break; 3705 default: 3706 return; 3707 } 3708 if (rbp == NULL) 3709 return; 3710 3711 bp = rbp->bpool; 3712 3713 /* 3714 * Free the pool memory. 3715 */ 3716 if (rbp->mr_hdl) 3717 kmem_free(rbp->mr_hdl, bp->numelems*sizeof (ibt_mr_hdl_t)); 3718 3719 if (rbp->mr_desc) 3720 kmem_free(rbp->mr_desc, bp->numelems*sizeof (ibt_mr_desc_t)); 3721 if (bp->buf) 3722 kmem_free(bp->buf, bp->bufsize); 3723 mutex_destroy(&bp->buflock); 3724 kmem_free(bp, sizeof (bufpool_t) + bp->numelems*sizeof (void *)); 3725 kmem_free(rbp, sizeof (rib_bufpool_t)); 3726 } 3727 3728 void 3729 rib_rbufpool_destroy(rib_hca_t *hca, int ptype) 3730 { 3731 /* 3732 * Deregister the pool memory and free it. 3733 */ 3734 rib_rbufpool_deregister(hca, ptype); 3735 rib_rbufpool_free(hca, ptype); 3736 } 3737 3738 /* 3739 * Fetch a buffer from the pool of type specified in rdbuf->type. 3740 */ 3741 static rdma_stat 3742 rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf) 3743 { 3744 rib_lrc_entry_t *rlep; 3745 3746 if (rdbuf->type == RDMA_LONG_BUFFER) { 3747 rlep = rib_get_cache_buf(conn, rdbuf->len); 3748 rdbuf->rb_private = (caddr_t)rlep; 3749 rdbuf->addr = rlep->lrc_buf; 3750 rdbuf->handle = rlep->lrc_mhandle; 3751 return (RDMA_SUCCESS); 3752 } 3753 3754 rdbuf->addr = rib_rbuf_alloc(conn, rdbuf); 3755 if (rdbuf->addr) { 3756 switch (rdbuf->type) { 3757 case SEND_BUFFER: 3758 rdbuf->len = RPC_MSG_SZ; /* 1K */ 3759 break; 3760 case RECV_BUFFER: 3761 rdbuf->len = RPC_BUF_SIZE; /* 2K */ 3762 break; 3763 default: 3764 rdbuf->len = 0; 3765 } 3766 return (RDMA_SUCCESS); 3767 } else 3768 return (RDMA_FAILED); 3769 } 3770 3771 #if defined(MEASURE_POOL_DEPTH) 3772 static void rib_recv_bufs(uint32_t x) { 3773 3774 } 3775 3776 static void rib_send_bufs(uint32_t x) { 3777 3778 } 3779 #endif 3780 3781 /* 3782 * Fetch a buffer of specified type. 3783 * Note that rdbuf->handle is mw's rkey. 3784 */ 3785 static void * 3786 rib_rbuf_alloc(CONN *conn, rdma_buf_t *rdbuf) 3787 { 3788 rib_qp_t *qp = ctoqp(conn); 3789 rib_hca_t *hca = qp->hca; 3790 rdma_btype ptype = rdbuf->type; 3791 void *buf; 3792 rib_bufpool_t *rbp = NULL; 3793 bufpool_t *bp; 3794 int i; 3795 3796 /* 3797 * Obtain pool address based on type of pool 3798 */ 3799 switch (ptype) { 3800 case SEND_BUFFER: 3801 rbp = hca->send_pool; 3802 break; 3803 case RECV_BUFFER: 3804 rbp = hca->recv_pool; 3805 break; 3806 default: 3807 return (NULL); 3808 } 3809 if (rbp == NULL) 3810 return (NULL); 3811 3812 bp = rbp->bpool; 3813 3814 mutex_enter(&bp->buflock); 3815 if (bp->buffree < 0) { 3816 mutex_exit(&bp->buflock); 3817 return (NULL); 3818 } 3819 3820 /* XXXX put buf, rdbuf->handle.mrc_rmr, ... in one place. 
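* For now the keys for a buffer are located by a linear scan of mr_desc[] matching the buffer address.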
*/ 3821 buf = bp->buflist[bp->buffree]; 3822 rdbuf->addr = buf; 3823 rdbuf->len = bp->rsize; 3824 for (i = bp->numelems - 1; i >= 0; i--) { 3825 if ((ib_vaddr_t)(uintptr_t)buf == rbp->mr_desc[i].md_vaddr) { 3826 rdbuf->handle.mrc_rmr = 3827 (uint32_t)rbp->mr_desc[i].md_rkey; 3828 rdbuf->handle.mrc_linfo = 3829 (uintptr_t)rbp->mr_hdl[i]; 3830 rdbuf->handle.mrc_lmr = 3831 (uint32_t)rbp->mr_desc[i].md_lkey; 3832 #if defined(MEASURE_POOL_DEPTH) 3833 if (ptype == SEND_BUFFER) 3834 rib_send_bufs(MAX_BUFS - (bp->buffree+1)); 3835 if (ptype == RECV_BUFFER) 3836 rib_recv_bufs(MAX_BUFS - (bp->buffree+1)); 3837 #endif 3838 bp->buffree--; 3839 3840 mutex_exit(&bp->buflock); 3841 3842 return (buf); 3843 } 3844 } 3845 3846 mutex_exit(&bp->buflock); 3847 3848 return (NULL); 3849 } 3850 3851 static void 3852 rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf) 3853 { 3854 3855 if (rdbuf->type == RDMA_LONG_BUFFER) { 3856 rib_free_cache_buf(conn, (rib_lrc_entry_t *)rdbuf->rb_private); 3857 rdbuf->rb_private = NULL; 3858 return; 3859 } 3860 rib_rbuf_free(conn, rdbuf->type, rdbuf->addr); 3861 } 3862 3863 static void 3864 rib_rbuf_free(CONN *conn, int ptype, void *buf) 3865 { 3866 rib_qp_t *qp = ctoqp(conn); 3867 rib_hca_t *hca = qp->hca; 3868 rib_bufpool_t *rbp = NULL; 3869 bufpool_t *bp; 3870 3871 /* 3872 * Obtain pool address based on type of pool 3873 */ 3874 switch (ptype) { 3875 case SEND_BUFFER: 3876 rbp = hca->send_pool; 3877 break; 3878 case RECV_BUFFER: 3879 rbp = hca->recv_pool; 3880 break; 3881 default: 3882 return; 3883 } 3884 if (rbp == NULL) 3885 return; 3886 3887 bp = rbp->bpool; 3888 3889 mutex_enter(&bp->buflock); 3890 if (++bp->buffree >= bp->numelems) { 3891 /* 3892 * Should never happen 3893 */ 3894 bp->buffree--; 3895 } else { 3896 bp->buflist[bp->buffree] = buf; 3897 } 3898 mutex_exit(&bp->buflock); 3899 } 3900 3901 static rdma_stat 3902 rib_add_connlist(CONN *cn, rib_conn_list_t *connlist) 3903 { 3904 rw_enter(&connlist->conn_lock, RW_WRITER); 3905 if (connlist->conn_hd) { 3906 cn->c_next = connlist->conn_hd; 3907 connlist->conn_hd->c_prev = cn; 3908 } 3909 connlist->conn_hd = cn; 3910 rw_exit(&connlist->conn_lock); 3911 3912 return (RDMA_SUCCESS); 3913 } 3914 3915 static rdma_stat 3916 rib_rm_conn(CONN *cn, rib_conn_list_t *connlist) 3917 { 3918 rw_enter(&connlist->conn_lock, RW_WRITER); 3919 if (cn->c_prev) { 3920 cn->c_prev->c_next = cn->c_next; 3921 } 3922 if (cn->c_next) { 3923 cn->c_next->c_prev = cn->c_prev; 3924 } 3925 if (connlist->conn_hd == cn) 3926 connlist->conn_hd = cn->c_next; 3927 rw_exit(&connlist->conn_lock); 3928 3929 return (RDMA_SUCCESS); 3930 } 3931 3932 /* 3933 * Connection management. 3934 * IBTF does not support recycling of channels. So connections are only 3935 * in four states - C_CONN_PEND, or C_CONNECTED, or C_ERROR_CONN or 3936 * C_DISCONN_PEND state. No C_IDLE state. 3937 * C_CONN_PEND state: Connection establishment in progress to the server. 3938 * C_CONNECTED state: A connection when created is in C_CONNECTED state. 3939 * It has an RC channel associated with it. ibt_post_send/recv are allowed 3940 * only in this state. 3941 * C_ERROR_CONN state: A connection transitions to this state when WRs on the 3942 * channel are completed in error or an IBT_CM_EVENT_CONN_CLOSED event 3943 * happens on the channel or a IBT_HCA_DETACH_EVENT occurs on the HCA. 
3944 * C_DISCONN_PEND state: When a connection is in C_ERROR_CONN state and when 3945 * c_ref drops to 0 (this indicates that RPC has no more references to this 3946 * connection), the connection should be destroyed. A connection transitions 3947 * into this state when it is being destroyed. 3948 */ 3949 static rdma_stat 3950 rib_conn_get(struct netbuf *svcaddr, int addr_type, void *handle, CONN **conn) 3951 { 3952 CONN *cn; 3953 int status = RDMA_SUCCESS; 3954 rib_hca_t *hca = (rib_hca_t *)handle; 3955 rib_qp_t *qp; 3956 clock_t cv_stat, timout; 3957 ibt_path_info_t path; 3958 ibt_ip_addr_t s_ip, d_ip; 3959 3960 again: 3961 rw_enter(&hca->cl_conn_list.conn_lock, RW_READER); 3962 cn = hca->cl_conn_list.conn_hd; 3963 while (cn != NULL) { 3964 /* 3965 * First, clear up any connection in the ERROR state 3966 */ 3967 mutex_enter(&cn->c_lock); 3968 if (cn->c_state == C_ERROR_CONN) { 3969 if (cn->c_ref == 0) { 3970 /* 3971 * Remove connection from list and destroy it. 3972 */ 3973 cn->c_state = C_DISCONN_PEND; 3974 mutex_exit(&cn->c_lock); 3975 rw_exit(&hca->cl_conn_list.conn_lock); 3976 (void) rib_disconnect_channel(cn, 3977 &hca->cl_conn_list); 3978 goto again; 3979 } 3980 mutex_exit(&cn->c_lock); 3981 cn = cn->c_next; 3982 continue; 3983 } 3984 if (cn->c_state == C_DISCONN_PEND) { 3985 mutex_exit(&cn->c_lock); 3986 cn = cn->c_next; 3987 continue; 3988 } 3989 if ((cn->c_raddr.len == svcaddr->len) && 3990 bcmp(svcaddr->buf, cn->c_raddr.buf, svcaddr->len) == 0) { 3991 /* 3992 * Our connection. Give up conn list lock 3993 * as we are done traversing the list. 3994 */ 3995 rw_exit(&hca->cl_conn_list.conn_lock); 3996 if (cn->c_state == C_CONNECTED) { 3997 cn->c_ref++; /* sharing a conn */ 3998 mutex_exit(&cn->c_lock); 3999 *conn = cn; 4000 return (status); 4001 } 4002 if (cn->c_state == C_CONN_PEND) { 4003 /* 4004 * Hold a reference to this conn before 4005 * we give up the lock. 4006 */ 4007 cn->c_ref++; 4008 timout = ddi_get_lbolt() + 4009 drv_usectohz(CONN_WAIT_TIME * 1000000); 4010 while ((cv_stat = cv_timedwait_sig(&cn->c_cv, 4011 &cn->c_lock, timout)) > 0 && 4012 cn->c_state == C_CONN_PEND) 4013 ; 4014 if (cv_stat == 0) { 4015 cn->c_ref--; 4016 mutex_exit(&cn->c_lock); 4017 return (RDMA_INTR); 4018 } 4019 if (cv_stat < 0) { 4020 cn->c_ref--; 4021 mutex_exit(&cn->c_lock); 4022 return (RDMA_TIMEDOUT); 4023 } 4024 if (cn->c_state == C_CONNECTED) { 4025 *conn = cn; 4026 mutex_exit(&cn->c_lock); 4027 return (status); 4028 } else { 4029 cn->c_ref--; 4030 mutex_exit(&cn->c_lock); 4031 return (RDMA_TIMEDOUT); 4032 } 4033 } 4034 } 4035 mutex_exit(&cn->c_lock); 4036 cn = cn->c_next; 4037 } 4038 rw_exit(&hca->cl_conn_list.conn_lock); 4039 4040 bzero(&path, sizeof (ibt_path_info_t)); 4041 bzero(&s_ip, sizeof (ibt_ip_addr_t)); 4042 bzero(&d_ip, sizeof (ibt_ip_addr_t)); 4043 4044 status = rib_chk_srv_ibaddr(svcaddr, addr_type, &path, &s_ip, &d_ip); 4045 if (status != RDMA_SUCCESS) { 4046 return (RDMA_FAILED); 4047 } 4048 4049 /* 4050 * Channel to server doesn't exist yet, create one. 4051 */ 4052 if (rib_clnt_create_chan(hca, svcaddr, &qp) != RDMA_SUCCESS) { 4053 return (RDMA_FAILED); 4054 } 4055 cn = qptoc(qp); 4056 cn->c_state = C_CONN_PEND; 4057 cn->c_ref = 1; 4058 4059 /* 4060 * Add to conn list. 4061 * We had given up the READER lock. In the time since then, 4062 * another thread might have created the connection we are 4063 * trying to create here. But for now, that is quite all right - there 4064 * might be two connections between a pair of hosts instead 4065 * of one.
If we really want to close that window, 4066 * then need to check the list after acquiring the 4067 * WRITER lock. 4068 */ 4069 (void) rib_add_connlist(cn, &hca->cl_conn_list); 4070 status = rib_conn_to_srv(hca, qp, &path, &s_ip, &d_ip); 4071 mutex_enter(&cn->c_lock); 4072 if (status == RDMA_SUCCESS) { 4073 cn->c_state = C_CONNECTED; 4074 *conn = cn; 4075 } else { 4076 cn->c_state = C_ERROR_CONN; 4077 cn->c_ref--; 4078 } 4079 cv_broadcast(&cn->c_cv); 4080 mutex_exit(&cn->c_lock); 4081 return (status); 4082 } 4083 4084 static rdma_stat 4085 rib_conn_release(CONN *conn) 4086 { 4087 rib_qp_t *qp = ctoqp(conn); 4088 4089 mutex_enter(&conn->c_lock); 4090 conn->c_ref--; 4091 4092 /* 4093 * If a conn is C_ERROR_CONN, close the channel. 4094 * If it's CONNECTED, keep it that way. 4095 */ 4096 if (conn->c_ref == 0 && conn->c_state == C_ERROR_CONN) { 4097 conn->c_state = C_DISCONN_PEND; 4098 mutex_exit(&conn->c_lock); 4099 if (qp->mode == RIB_SERVER) 4100 (void) rib_disconnect_channel(conn, 4101 &qp->hca->srv_conn_list); 4102 else 4103 (void) rib_disconnect_channel(conn, 4104 &qp->hca->cl_conn_list); 4105 return (RDMA_SUCCESS); 4106 } 4107 mutex_exit(&conn->c_lock); 4108 return (RDMA_SUCCESS); 4109 } 4110 4111 /* 4112 * Add at front of list 4113 */ 4114 static struct rdma_done_list * 4115 rdma_done_add(rib_qp_t *qp, uint32_t xid) 4116 { 4117 struct rdma_done_list *rd; 4118 4119 ASSERT(MUTEX_HELD(&qp->rdlist_lock)); 4120 4121 rd = kmem_alloc(sizeof (*rd), KM_SLEEP); 4122 rd->xid = xid; 4123 cv_init(&rd->rdma_done_cv, NULL, CV_DEFAULT, NULL); 4124 4125 rd->prev = NULL; 4126 rd->next = qp->rdlist; 4127 if (qp->rdlist != NULL) 4128 qp->rdlist->prev = rd; 4129 qp->rdlist = rd; 4130 4131 return (rd); 4132 } 4133 4134 static void 4135 rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd) 4136 { 4137 struct rdma_done_list *r; 4138 4139 ASSERT(MUTEX_HELD(&qp->rdlist_lock)); 4140 4141 r = rd->next; 4142 if (r != NULL) { 4143 r->prev = rd->prev; 4144 } 4145 4146 r = rd->prev; 4147 if (r != NULL) { 4148 r->next = rd->next; 4149 } else { 4150 qp->rdlist = rd->next; 4151 } 4152 4153 cv_destroy(&rd->rdma_done_cv); 4154 kmem_free(rd, sizeof (*rd)); 4155 } 4156 4157 static void 4158 rdma_done_rem_list(rib_qp_t *qp) 4159 { 4160 struct rdma_done_list *r, *n; 4161 4162 mutex_enter(&qp->rdlist_lock); 4163 for (r = qp->rdlist; r != NULL; r = n) { 4164 n = r->next; 4165 rdma_done_rm(qp, r); 4166 } 4167 mutex_exit(&qp->rdlist_lock); 4168 } 4169 4170 static void 4171 rdma_done_notify(rib_qp_t *qp, uint32_t xid) 4172 { 4173 struct rdma_done_list *r = qp->rdlist; 4174 4175 ASSERT(MUTEX_HELD(&qp->rdlist_lock)); 4176 4177 while (r) { 4178 if (r->xid == xid) { 4179 cv_signal(&r->rdma_done_cv); 4180 return; 4181 } else { 4182 r = r->next; 4183 } 4184 } 4185 DTRACE_PROBE1(rpcib__i__donenotify__nomatchxid, 4186 int, xid); 4187 } 4188 4189 4190 /* 4191 * Goes through all connections and closes the channel 4192 * This will cause all the WRs on those channels to be 4193 * flushed. 4194 */ 4195 static void 4196 rib_close_channels(rib_conn_list_t *connlist) 4197 { 4198 CONN *conn; 4199 rib_qp_t *qp; 4200 4201 rw_enter(&connlist->conn_lock, RW_READER); 4202 conn = connlist->conn_hd; 4203 while (conn != NULL) { 4204 mutex_enter(&conn->c_lock); 4205 qp = ctoqp(conn); 4206 if (conn->c_state == C_CONNECTED) { 4207 /* 4208 * Live connection in CONNECTED state. 4209 * Call ibt_close_rc_channel in nonblocking mode 4210 * with no callbacks. 
4211 */ 4212 conn->c_state = C_ERROR_CONN; 4213 (void) ibt_close_rc_channel(qp->qp_hdl, 4214 IBT_NOCALLBACKS, NULL, 0, NULL, NULL, 0); 4215 (void) ibt_free_channel(qp->qp_hdl); 4216 qp->qp_hdl = NULL; 4217 } else { 4218 if (conn->c_state == C_ERROR_CONN && 4219 qp->qp_hdl != NULL) { 4220 /* 4221 * Connection in ERROR state but 4222 * channel is not yet freed. 4223 */ 4224 (void) ibt_close_rc_channel(qp->qp_hdl, 4225 IBT_NOCALLBACKS, NULL, 0, NULL, 4226 NULL, 0); 4227 (void) ibt_free_channel(qp->qp_hdl); 4228 qp->qp_hdl = NULL; 4229 } 4230 } 4231 mutex_exit(&conn->c_lock); 4232 conn = conn->c_next; 4233 } 4234 rw_exit(&connlist->conn_lock); 4235 } 4236 4237 /* 4238 * Frees up all connections that are no longer being referenced 4239 */ 4240 static void 4241 rib_purge_connlist(rib_conn_list_t *connlist) 4242 { 4243 CONN *conn; 4244 4245 top: 4246 rw_enter(&connlist->conn_lock, RW_READER); 4247 conn = connlist->conn_hd; 4248 while (conn != NULL) { 4249 mutex_enter(&conn->c_lock); 4250 4251 /* 4252 * At this point connection is either in ERROR 4253 * or DISCONN_PEND state. If in DISCONN_PEND state 4254 * then some other thread is culling that connection. 4255 * If not and if c_ref is 0, then destroy the connection. 4256 */ 4257 if (conn->c_ref == 0 && 4258 conn->c_state != C_DISCONN_PEND) { 4259 /* 4260 * Cull the connection 4261 */ 4262 conn->c_state = C_DISCONN_PEND; 4263 mutex_exit(&conn->c_lock); 4264 rw_exit(&connlist->conn_lock); 4265 (void) rib_disconnect_channel(conn, connlist); 4266 goto top; 4267 } else { 4268 /* 4269 * conn disconnect already scheduled or will 4270 * happen from conn_release when c_ref drops to 0. 4271 */ 4272 mutex_exit(&conn->c_lock); 4273 } 4274 conn = conn->c_next; 4275 } 4276 rw_exit(&connlist->conn_lock); 4277 4278 /* 4279 * At this point, only connections with c_ref != 0 are on the list 4280 */ 4281 } 4282 4283 /* 4284 * Cleans and closes up all uses of the HCA 4285 */ 4286 static void 4287 rib_detach_hca(rib_hca_t *hca) 4288 { 4289 4290 /* 4291 * Stop all services on the HCA 4292 * Go through cl_conn_list and close all rc_channels 4293 * Go through svr_conn_list and close all rc_channels 4294 * Free connections whose c_ref has dropped to 0 4295 * Destroy all CQs 4296 * Deregister and released all buffer pool memory after all 4297 * connections are destroyed 4298 * Free the protection domain 4299 * ibt_close_hca() 4300 */ 4301 rw_enter(&hca->state_lock, RW_WRITER); 4302 if (hca->state == HCA_DETACHED) { 4303 rw_exit(&hca->state_lock); 4304 return; 4305 } 4306 4307 hca->state = HCA_DETACHED; 4308 rib_stat->nhca_inited--; 4309 4310 rib_stop_services(hca); 4311 rib_close_channels(&hca->cl_conn_list); 4312 rib_close_channels(&hca->srv_conn_list); 4313 rw_exit(&hca->state_lock); 4314 4315 rib_purge_connlist(&hca->cl_conn_list); 4316 rib_purge_connlist(&hca->srv_conn_list); 4317 4318 (void) ibt_free_cq(hca->clnt_rcq->rib_cq_hdl); 4319 (void) ibt_free_cq(hca->clnt_scq->rib_cq_hdl); 4320 (void) ibt_free_cq(hca->svc_rcq->rib_cq_hdl); 4321 (void) ibt_free_cq(hca->svc_scq->rib_cq_hdl); 4322 kmem_free(hca->clnt_rcq, sizeof (rib_cq_t)); 4323 kmem_free(hca->clnt_scq, sizeof (rib_cq_t)); 4324 kmem_free(hca->svc_rcq, sizeof (rib_cq_t)); 4325 kmem_free(hca->svc_scq, sizeof (rib_cq_t)); 4326 4327 rw_enter(&hca->srv_conn_list.conn_lock, RW_READER); 4328 rw_enter(&hca->cl_conn_list.conn_lock, RW_READER); 4329 if (hca->srv_conn_list.conn_hd == NULL && 4330 hca->cl_conn_list.conn_hd == NULL) { 4331 /* 4332 * conn_lists are NULL, so destroy 4333 * buffers, close hca and be done. 
/*
 * Cleans up and closes all uses of the HCA
 */
static void
rib_detach_hca(rib_hca_t *hca)
{

	/*
	 * Stop all services on the HCA
	 * Go through cl_conn_list and close all rc_channels
	 * Go through svr_conn_list and close all rc_channels
	 * Free connections whose c_ref has dropped to 0
	 * Destroy all CQs
	 * Deregister and release all buffer pool memory after all
	 * connections are destroyed
	 * Free the protection domain
	 * ibt_close_hca()
	 */
	rw_enter(&hca->state_lock, RW_WRITER);
	if (hca->state == HCA_DETACHED) {
		rw_exit(&hca->state_lock);
		return;
	}

	hca->state = HCA_DETACHED;
	rib_stat->nhca_inited--;

	rib_stop_services(hca);
	rib_close_channels(&hca->cl_conn_list);
	rib_close_channels(&hca->srv_conn_list);
	rw_exit(&hca->state_lock);

	rib_purge_connlist(&hca->cl_conn_list);
	rib_purge_connlist(&hca->srv_conn_list);

	(void) ibt_free_cq(hca->clnt_rcq->rib_cq_hdl);
	(void) ibt_free_cq(hca->clnt_scq->rib_cq_hdl);
	(void) ibt_free_cq(hca->svc_rcq->rib_cq_hdl);
	(void) ibt_free_cq(hca->svc_scq->rib_cq_hdl);
	kmem_free(hca->clnt_rcq, sizeof (rib_cq_t));
	kmem_free(hca->clnt_scq, sizeof (rib_cq_t));
	kmem_free(hca->svc_rcq, sizeof (rib_cq_t));
	kmem_free(hca->svc_scq, sizeof (rib_cq_t));

	rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
	rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
	if (hca->srv_conn_list.conn_hd == NULL &&
	    hca->cl_conn_list.conn_hd == NULL) {
		/*
		 * conn_lists are NULL, so destroy
		 * buffers, close hca and be done.
		 */
		rib_rbufpool_destroy(hca, RECV_BUFFER);
		rib_rbufpool_destroy(hca, SEND_BUFFER);
		rib_destroy_cache(hca);
		(void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
		(void) ibt_close_hca(hca->hca_hdl);
		hca->hca_hdl = NULL;
	}
	rw_exit(&hca->cl_conn_list.conn_lock);
	rw_exit(&hca->srv_conn_list.conn_lock);

	if (hca->hca_hdl != NULL) {
		mutex_enter(&hca->inuse_lock);
		while (hca->inuse)
			cv_wait(&hca->cb_cv, &hca->inuse_lock);
		mutex_exit(&hca->inuse_lock);
		/*
		 * conn_lists are now NULL, so destroy
		 * buffers, close hca and be done.
		 */
		rib_rbufpool_destroy(hca, RECV_BUFFER);
		rib_rbufpool_destroy(hca, SEND_BUFFER);
		(void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
		(void) ibt_close_hca(hca->hca_hdl);
		hca->hca_hdl = NULL;
	}
}

static void
rib_server_side_cache_reclaim(void *argp)
{
	cache_avl_struct_t *rcas;
	rib_lrc_entry_t *rb;
	rib_hca_t *hca = (rib_hca_t *)argp;

	rw_enter(&hca->avl_rw_lock, RW_WRITER);
	rcas = avl_first(&hca->avl_tree);
	if (rcas != NULL)
		avl_remove(&hca->avl_tree, rcas);

	while (rcas != NULL) {
		while (rcas->r.forw != &rcas->r) {
			rcas->elements--;
			rib_total_buffers--;
			rb = rcas->r.forw;
			remque(rb);
			if (rb->registered)
				(void) rib_deregistermem_via_hca(hca,
				    rb->lrc_buf, rb->lrc_mhandle);
			cache_allocation -= rb->lrc_len;
			kmem_free(rb->lrc_buf, rb->lrc_len);
			kmem_free(rb, sizeof (rib_lrc_entry_t));
		}
		mutex_destroy(&rcas->node_lock);
		kmem_cache_free(hca->server_side_cache, rcas);
		rcas = avl_first(&hca->avl_tree);
		if (rcas != NULL)
			avl_remove(&hca->avl_tree, rcas);
	}
	rw_exit(&hca->avl_rw_lock);
}

static void
rib_server_side_cache_cleanup(void *argp)
{
	cache_avl_struct_t *rcas;
	rib_lrc_entry_t *rb;
	rib_hca_t *hca = (rib_hca_t *)argp;

	rw_enter(&hca->avl_rw_lock, RW_READER);
	if (cache_allocation < cache_limit) {
		rw_exit(&hca->avl_rw_lock);
		return;
	}
	rw_exit(&hca->avl_rw_lock);

	rw_enter(&hca->avl_rw_lock, RW_WRITER);
	rcas = avl_last(&hca->avl_tree);
	if (rcas != NULL)
		avl_remove(&hca->avl_tree, rcas);

	while (rcas != NULL) {
		while (rcas->r.forw != &rcas->r) {
			rcas->elements--;
			rib_total_buffers--;
			rb = rcas->r.forw;
			remque(rb);
			if (rb->registered)
				(void) rib_deregistermem_via_hca(hca,
				    rb->lrc_buf, rb->lrc_mhandle);
			cache_allocation -= rb->lrc_len;
			kmem_free(rb->lrc_buf, rb->lrc_len);
			kmem_free(rb, sizeof (rib_lrc_entry_t));
		}
		mutex_destroy(&rcas->node_lock);
		kmem_cache_free(hca->server_side_cache, rcas);
		if (cache_allocation < cache_limit) {
			rw_exit(&hca->avl_rw_lock);
			return;
		}

		rcas = avl_last(&hca->avl_tree);
		if (rcas != NULL)
			avl_remove(&hca->avl_tree, rcas);
	}
	rw_exit(&hca->avl_rw_lock);
}

static int
avl_compare(const void *t1, const void *t2)
{
	if (((cache_avl_struct_t *)t1)->len == ((cache_avl_struct_t *)t2)->len)
		return (0);

	if (((cache_avl_struct_t *)t1)->len < ((cache_avl_struct_t *)t2)->len)
		return (-1);

	return (1);
}
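
/*
 * The server-side registered-buffer cache is an AVL tree of
 * cache_avl_struct_t nodes, one node per distinct buffer length, each
 * holding a circular doubly linked list (r.forw/r.back) of free
 * rib_lrc_entry_t buffers of that length.  avl_compare() above orders
 * the nodes by length.  A hedged sketch of how such a tree is created
 * with this comparator (the real initialization happens elsewhere in
 * this file, at cache setup time):
 *
 *	avl_create(&hca->avl_tree, avl_compare,
 *	    sizeof (cache_avl_struct_t),
 *	    offsetof(cache_avl_struct_t, avl_link));
 *	rw_init(&hca->avl_rw_lock, NULL, RW_DRIVER, NULL);
 */
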
static void
rib_destroy_cache(rib_hca_t *hca)
{
	if (hca->reg_cache_clean_up != NULL) {
		ddi_taskq_destroy(hca->reg_cache_clean_up);
		hca->reg_cache_clean_up = NULL;
	}
	if (hca->avl_init) {
		/* Free any cached buffers before tearing the tree down */
		rib_server_side_cache_reclaim((void *)hca);
		kmem_cache_destroy(hca->server_side_cache);
		avl_destroy(&hca->avl_tree);
		mutex_destroy(&hca->cache_allocation);
		rw_destroy(&hca->avl_rw_lock);
	}
	hca->avl_init = FALSE;
}

static void
rib_force_cleanup(void *hca)
{
	if (((rib_hca_t *)hca)->reg_cache_clean_up != NULL)
		(void) ddi_taskq_dispatch(
		    ((rib_hca_t *)hca)->reg_cache_clean_up,
		    rib_server_side_cache_cleanup,
		    (void *)hca, DDI_NOSLEEP);
}

static rib_lrc_entry_t *
rib_get_cache_buf(CONN *conn, uint32_t len)
{
	cache_avl_struct_t cas, *rcas;
	rib_hca_t *hca = (ctoqp(conn))->hca;
	rib_lrc_entry_t *reply_buf;
	avl_index_t where = NULL;
	uint64_t c_alloc = 0;

	if (!hca->avl_init)
		goto error_alloc;

	cas.len = len;

	rw_enter(&hca->avl_rw_lock, RW_READER);

	mutex_enter(&hca->cache_allocation);
	c_alloc = cache_allocation;
	mutex_exit(&hca->cache_allocation);

	if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree, &cas,
	    &where)) == NULL) {
		/* Are we above the cache limit? */
		if ((c_alloc + len) >= cache_limit) {
			rib_force_cleanup((void *)hca);
			rw_exit(&hca->avl_rw_lock);
			cache_misses_above_the_limit++;

			/* Allocate and register the buffer directly */
			goto error_alloc;
		}

		rw_exit(&hca->avl_rw_lock);
		rw_enter(&hca->avl_rw_lock, RW_WRITER);

		/* Recheck to make sure no other thread added the entry in */
		if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree,
		    &cas, &where)) == NULL) {
			/* Allocate an avl tree entry */
			rcas = (cache_avl_struct_t *)
			    kmem_cache_alloc(hca->server_side_cache, KM_SLEEP);

			bzero(rcas, sizeof (cache_avl_struct_t));
			rcas->elements = 0;
			rcas->r.forw = &rcas->r;
			rcas->r.back = &rcas->r;
			rcas->len = len;
			mutex_init(&rcas->node_lock, NULL, MUTEX_DEFAULT, NULL);
			avl_insert(&hca->avl_tree, rcas, where);
		}
	}

	mutex_enter(&rcas->node_lock);

	if (rcas->r.forw != &rcas->r && rcas->elements > 0) {
		rib_total_buffers--;
		cache_hits++;
		reply_buf = rcas->r.forw;
		remque(reply_buf);
		rcas->elements--;
		mutex_exit(&rcas->node_lock);
		rw_exit(&hca->avl_rw_lock);
		mutex_enter(&hca->cache_allocation);
		cache_allocation -= len;
		mutex_exit(&hca->cache_allocation);
	} else {
		/* Are we above the cache limit? */
		mutex_exit(&rcas->node_lock);
		if ((c_alloc + len) >= cache_limit) {
			rib_force_cleanup((void *)hca);
			rw_exit(&hca->avl_rw_lock);
			cache_misses_above_the_limit++;
			/* Allocate and register the buffer directly */
			goto error_alloc;
		}
		rw_exit(&hca->avl_rw_lock);
		cache_misses++;
		/* Allocate a reply_buf entry */
		reply_buf = (rib_lrc_entry_t *)
		    kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP);
		bzero(reply_buf, sizeof (rib_lrc_entry_t));
		reply_buf->lrc_buf = kmem_alloc(len, KM_SLEEP);
		reply_buf->lrc_len = len;
		reply_buf->registered = FALSE;
		reply_buf->avl_node = (void *)rcas;
	}

	return (reply_buf);

error_alloc:
	reply_buf = (rib_lrc_entry_t *)
	    kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP);
	bzero(reply_buf, sizeof (rib_lrc_entry_t));
	reply_buf->lrc_buf = kmem_alloc(len, KM_SLEEP);
	reply_buf->lrc_len = len;
	reply_buf->registered = FALSE;
	reply_buf->avl_node = NULL;

	return (reply_buf);
}
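
/*
 * rib_get_cache_buf() is the allocation side of the long-reply buffer
 * cache: a hit dequeues a previously cached (and possibly registered)
 * buffer from the per-length node, while a miss or an over-limit
 * condition falls back to a plain kmem_alloc()ed buffer with
 * registered == FALSE and, when over the limit, kicks off an asynchronous
 * trim through rib_force_cleanup().  A hedged sketch of the expected
 * pairing from a caller's point of view (the code that registers lrc_buf
 * and sets lrc_mhandle/registered on first use is outside this excerpt):
 *
 *	rib_lrc_entry_t *lrc = rib_get_cache_buf(conn, len);
 *	... use lrc->lrc_buf as the long-reply buffer, registering it
 *	    on first RDMA use ...
 *	rib_free_cache_buf(conn, lrc);
 */
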
/*
 * Return a pre-registered buffer back to the cache (without
 * unregistering the buffer).
 */
static void
rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *reg_buf)
{
	cache_avl_struct_t cas, *rcas;
	avl_index_t where = NULL;
	rib_hca_t *hca = (ctoqp(conn))->hca;

	if (!hca->avl_init)
		goto error_free;

	cas.len = reg_buf->lrc_len;
	rw_enter(&hca->avl_rw_lock, RW_READER);
	if ((rcas = (cache_avl_struct_t *)
	    avl_find(&hca->avl_tree, &cas, &where)) == NULL) {
		rw_exit(&hca->avl_rw_lock);
		goto error_free;
	} else {
		rib_total_buffers++;
		cas.len = reg_buf->lrc_len;
		mutex_enter(&rcas->node_lock);
		insque(reg_buf, &rcas->r);
		rcas->elements++;
		mutex_exit(&rcas->node_lock);
		rw_exit(&hca->avl_rw_lock);
		mutex_enter(&hca->cache_allocation);
		cache_allocation += cas.len;
		mutex_exit(&hca->cache_allocation);
	}

	return;

error_free:

	if (reg_buf->registered)
		(void) rib_deregistermem_via_hca(hca,
		    reg_buf->lrc_buf, reg_buf->lrc_mhandle);
	kmem_free(reg_buf->lrc_buf, reg_buf->lrc_len);
	kmem_free(reg_buf, sizeof (rib_lrc_entry_t));
}

static rdma_stat
rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp, caddr_t buf,
	uint_t buflen, struct mrc *buf_handle)
{
	ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
	ibt_mr_desc_t	mr_desc;	/* vaddr, lkey, rkey */
	rdma_stat	status;


	/*
	 * Note: ALL buffer pools use the same memory type RDMARW.
	 */
	status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
	if (status == RDMA_SUCCESS) {
		buf_handle->mrc_linfo = (uint64_t)(uintptr_t)mr_hdl;
		buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
		buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
	} else {
		buf_handle->mrc_linfo = NULL;
		buf_handle->mrc_lmr = 0;
		buf_handle->mrc_rmr = 0;
	}
	return (status);
}

/* ARGSUSED */
static rdma_stat
rib_deregistermemsync_via_hca(rib_hca_t *hca, caddr_t buf,
	struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle)
{

	(void) rib_deregistermem_via_hca(hca, buf, buf_handle);
	return (RDMA_SUCCESS);
}

/* ARGSUSED */
static rdma_stat
rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf, struct mrc buf_handle)
{

	(void) ibt_deregister_mr(hca->hca_hdl,
	    (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
	return (RDMA_SUCCESS);
}
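
/*
 * The struct mrc filled in by rib_registermem_via_hca() packs the IBTF
 * memory-region handle (mrc_linfo) together with the local and remote
 * keys (mrc_lmr/mrc_rmr) that are later quoted in work requests and RDMA
 * chunk lists.  A hedged sketch of a register/deregister pairing against
 * a kernel buffer; adsp == NULL (kernel address space) is assumed here
 * and error handling is omitted:
 *
 *	struct mrc handle;
 *
 *	if (rib_registermem_via_hca(hca, NULL, buf, len, &handle) ==
 *	    RDMA_SUCCESS) {
 *		... advertise handle.mrc_rmr to the peer and use
 *		    handle.mrc_lmr in local work requests ...
 *		(void) rib_deregistermem_via_hca(hca, buf, handle);
 *	}
 */
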
/*
 * Return 0 if the interface is IB.
 * Return error (>0) if any error is encountered during processing.
 * Return -1 if the interface is not IB and no error.
 */
#define	isalpha(ch)	(((ch) >= 'a' && (ch) <= 'z') || \
			((ch) >= 'A' && (ch) <= 'Z'))
static int
rpcib_is_ib_interface(char *name)
{

	char dev_path[MAXPATHLEN];
	char devname[MAXNAMELEN];
	ldi_handle_t lh;
	dl_info_ack_t info;
	int ret = 0;
	int i;

	/*
	 * ibd devices are style-2 DLPI devices, so open the device
	 * by name with the PPA (instance number) stripped off.
	 */

	i = strlen(name) - 1;
	while ((i >= 0) && (!isalpha(name[i]))) i--;

	if (i < 0) {
		/* Invalid interface name, no alphabetic character */
		return (-1);
	}

	(void) strncpy(devname, name, i + 1);
	devname[i + 1] = '\0';

	if (strcmp("lo", devname) == 0) {
		/*
		 * The loopback interface is not RPC/RDMA capable.
		 */
		return (-1);
	}

	(void) strncpy(dev_path, "/dev/", MAXPATHLEN);
	if (strlcat(dev_path, devname, MAXPATHLEN) >= MAXPATHLEN) {
		/* string overflow */
		return (-1);
	}

	ret = ldi_open_by_name(dev_path, FREAD|FWRITE, kcred, &lh, rpcib_li);
	if (ret != 0) {
		return (ret);
	}
	ret = rpcib_dl_info(lh, &info);
	(void) ldi_close(lh, FREAD|FWRITE, kcred);
	if (ret != 0) {
		return (ret);
	}

	if (info.dl_mac_type != DL_IB) {
		return (-1);
	}

	return (0);
}

static int
rpcib_dl_info(ldi_handle_t lh, dl_info_ack_t *info)
{
	dl_info_req_t *info_req;
	union DL_primitives *dl_prim;
	mblk_t *mp;
	k_sigset_t smask;
	int error;

	if ((mp = allocb(sizeof (dl_info_req_t), BPRI_MED)) == NULL) {
		return (ENOMEM);
	}

	mp->b_datap->db_type = M_PROTO;

	info_req = (dl_info_req_t *)(uintptr_t)mp->b_wptr;
	mp->b_wptr += sizeof (dl_info_req_t);
	info_req->dl_primitive = DL_INFO_REQ;

	sigintr(&smask, 0);
	if ((error = ldi_putmsg(lh, mp)) != 0) {
		sigunintr(&smask);
		return (error);
	}
	if ((error = ldi_getmsg(lh, &mp, (timestruc_t *)NULL)) != 0) {
		sigunintr(&smask);
		return (error);
	}
	sigunintr(&smask);

	dl_prim = (union DL_primitives *)(uintptr_t)mp->b_rptr;
	switch (dl_prim->dl_primitive) {
	case DL_INFO_ACK:
		if (((uintptr_t)mp->b_wptr - (uintptr_t)mp->b_rptr) <
		    sizeof (dl_info_ack_t)) {
			error = -1;
		} else {
			*info = *(dl_info_ack_t *)(uintptr_t)mp->b_rptr;
			error = 0;
		}
		break;
	default:
		error = -1;
		break;
	}

	freemsg(mp);
	return (error);
}

static int
rpcib_do_ip_ioctl(int cmd, int len, caddr_t arg)
{
	vnode_t *kvp, *vp;
	TIUSER *tiptr;
	struct strioctl iocb;
	k_sigset_t smask;
	int err = 0;

	if (lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW, NULLVPP,
	    &kvp) == 0) {
		if (t_kopen((file_t *)NULL, kvp->v_rdev, FREAD|FWRITE,
		    &tiptr, CRED()) == 0) {
			vp = tiptr->fp->f_vnode;
		} else {
			VN_RELE(kvp);
			return (EPROTO);
		}
	} else {
		return (EPROTO);
	}

	iocb.ic_cmd = cmd;
	iocb.ic_timout = 0;
	iocb.ic_len = len;
	iocb.ic_dp = arg;
	sigintr(&smask, 0);
	err = kstr_ioctl(vp, I_STR, (intptr_t)&iocb);
	sigunintr(&smask);
	(void) t_kclose(tiptr, 0);
	VN_RELE(kvp);
	return (err);
}
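
/*
 * rpcib_do_ip_ioctl() uses the standard kernel technique for issuing an
 * IP-level socket ioctl without a user file descriptor: open /dev/udp via
 * t_kopen(), build a transparent I_STR ioctl, and push it down with
 * kstr_ioctl().  A hedged sketch of how the helpers below use it (this
 * mirrors rpcib_get_number_interfaces() and rpcib_get_ib_addresses();
 * the local variable name is illustrative):
 *
 *	int numifs;
 *
 *	if (rpcib_do_ip_ioctl(SIOCGIFNUM, sizeof (int),
 *	    (caddr_t)&numifs) == 0) {
 *		... numifs now holds the number of plumbed interfaces,
 *		    used to size a subsequent SIOCGIFCONF request ...
 *	}
 */
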
static uint_t
rpcib_get_number_interfaces(void)
{
	uint_t numifs;

	if (rpcib_do_ip_ioctl(SIOCGIFNUM, sizeof (uint_t), (caddr_t)&numifs)) {
		return (0);
	}
	return (numifs);
}

static boolean_t
rpcib_get_ib_addresses(
	struct sockaddr_in *saddr4,
	struct sockaddr_in6 *saddr6,
	uint_t *number4,
	uint_t *number6)
{
	int numifs;
	struct ifconf kifc;
	struct ifreq *ifr;
	boolean_t ret = B_FALSE;

	*number4 = 0;
	*number6 = 0;

	if (rpcib_do_ip_ioctl(SIOCGIFNUM, sizeof (int), (caddr_t)&numifs)) {
		return (ret);
	}

	kifc.ifc_len = numifs * sizeof (struct ifreq);
	kifc.ifc_buf = kmem_zalloc(kifc.ifc_len, KM_SLEEP);

	if (rpcib_do_ip_ioctl(SIOCGIFCONF, sizeof (struct ifconf),
	    (caddr_t)&kifc)) {
		goto done;
	}

	ifr = kifc.ifc_req;
	for (numifs = kifc.ifc_len / sizeof (struct ifreq);
	    numifs > 0; numifs--, ifr++) {
		struct sockaddr_in *sin4;
		struct sockaddr_in6 *sin6;

		if ((rpcib_is_ib_interface(ifr->ifr_name) == 0)) {
			sin4 = (struct sockaddr_in *)(uintptr_t)&ifr->ifr_addr;
			sin6 = (struct sockaddr_in6 *)(uintptr_t)&ifr->ifr_addr;
			if (sin4->sin_family == AF_INET) {
				saddr4[*number4] = *(struct sockaddr_in *)
				    (uintptr_t)&ifr->ifr_addr;
				*number4 = *number4 + 1;
			} else if (sin6->sin6_family == AF_INET6) {
				saddr6[*number6] = *(struct sockaddr_in6 *)
				    (uintptr_t)&ifr->ifr_addr;
				*number6 = *number6 + 1;
			}
		}
	}
	ret = B_TRUE;
done:
	kmem_free(kifc.ifc_buf, kifc.ifc_len);
	return (ret);
}

/* ARGSUSED */
static int
rpcib_cache_kstat_update(kstat_t *ksp, int rw)
{
	if (KSTAT_WRITE == rw) {
		return (EACCES);
	}
	rpcib_kstat.cache_limit.value.ui64 =
	    (uint64_t)cache_limit;
	rpcib_kstat.cache_allocation.value.ui64 =
	    (uint64_t)cache_allocation;
	rpcib_kstat.cache_hits.value.ui64 =
	    (uint64_t)cache_hits;
	rpcib_kstat.cache_misses.value.ui64 =
	    (uint64_t)cache_misses;
	rpcib_kstat.cache_misses_above_the_limit.value.ui64 =
	    (uint64_t)cache_misses_above_the_limit;
	return (0);
}
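
/*
 * rpcib_cache_kstat_update() is a named-kstat ks_update handler: it
 * snapshots the cache counters into rpcib_kstat on every read and rejects
 * writes.  A hedged sketch of how such a handler is typically wired up
 * (the actual kstat registration for rpcib happens elsewhere in this file,
 * at cache setup time; the calls below are standard kstat framework
 * routines):
 *
 *	kstat_t *ksp;
 *
 *	ksp = kstat_create("rpcib", 0, "rpcib_cache", "rpc",
 *	    KSTAT_TYPE_NAMED,
 *	    sizeof (rpcib_kstat) / sizeof (kstat_named_t),
 *	    KSTAT_FLAG_VIRTUAL);
 *	if (ksp != NULL) {
 *		ksp->ks_data = (void *)&rpcib_kstat;
 *		ksp->ks_update = rpcib_cache_kstat_update;
 *		kstat_install(ksp);
 *	}
 */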