/* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* * Copyright (c) 2007, The Ohio State University. All rights reserved. * * Portions of this source code is developed by the team members of * The Ohio State University's Network-Based Computing Laboratory (NBCL), * headed by Professor Dhabaleswar K. (DK) Panda. * * Acknowledgements to contributions from developors: * Ranjit Noronha: noronha@cse.ohio-state.edu * Lei Chai : chail@cse.ohio-state.edu * Weikuan Yu : yuw@cse.ohio-state.edu * */ /* * The rpcib plugin. Implements the interface for RDMATF's * interaction with IBTF. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define NFS_RDMA_PORT 20049 /* * Convenience structures for connection management */ typedef struct rpcib_ipaddrs { void *ri_list; /* pointer to list of addresses */ uint_t ri_count; /* number of addresses in list */ uint_t ri_size; /* size of ri_list in bytes */ } rpcib_ipaddrs_t; typedef struct rpcib_ping { rib_hca_t *hca; ibt_path_info_t path; ibt_ip_addr_t srcip; ibt_ip_addr_t dstip; } rpcib_ping_t; /* * Prototype declarations for driver ops */ static int rpcib_attach(dev_info_t *, ddi_attach_cmd_t); static int rpcib_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **); static int rpcib_detach(dev_info_t *, ddi_detach_cmd_t); static boolean_t rpcib_rdma_capable_interface(struct lifreq *); static int rpcib_do_ip_ioctl(int, int, void *); static boolean_t rpcib_get_ib_addresses(rpcib_ipaddrs_t *, rpcib_ipaddrs_t *); static int rpcib_cache_kstat_update(kstat_t *, int); static void rib_force_cleanup(void *); static void rib_stop_hca_services(rib_hca_t *); static void rib_attach_hca(void); static int rib_find_hca_connection(rib_hca_t *hca, struct netbuf *s_svcaddr, struct netbuf *d_svcaddr, CONN **conn); struct { kstat_named_t cache_limit; kstat_named_t cache_allocation; kstat_named_t cache_hits; kstat_named_t cache_misses; kstat_named_t cache_misses_above_the_limit; } rpcib_kstat = { {"cache_limit", KSTAT_DATA_UINT64 }, {"cache_allocation", KSTAT_DATA_UINT64 }, {"cache_hits", KSTAT_DATA_UINT64 }, {"cache_misses", KSTAT_DATA_UINT64 }, {"cache_misses_above_the_limit", KSTAT_DATA_UINT64 }, }; /* rpcib cb_ops */ static struct cb_ops rpcib_cbops = { nulldev, /* open */ nulldev, /* close */ nodev, /* strategy */ nodev, /* print */ nodev, /* dump */ nodev, /* read */ nodev, /* write */ nodev, /* ioctl */ nodev, /* devmap */ nodev, /* mmap */ nodev, /* segmap */ nochpoll, /* poll */ ddi_prop_op, /* prop_op */ NULL, /* stream */ D_MP, /* cb_flag */ CB_REV, /* rev */ nodev, /* int (*cb_aread)() */ nodev /* int (*cb_awrite)() */ }; /* * Device options */ static struct dev_ops rpcib_ops = { DEVO_REV, /* devo_rev, */ 0, /* refcnt */ rpcib_getinfo, /* info */ nulldev, /* identify */ nulldev, /* probe */ rpcib_attach, /* attach */ rpcib_detach, /* detach */ nodev, /* reset */ &rpcib_cbops, /* driver ops - devctl interfaces */ NULL, /* bus operations */ NULL, /* power */ ddi_quiesce_not_needed, /* quiesce */ }; /* * Module linkage information. */ static struct modldrv rib_modldrv = { &mod_driverops, /* Driver module */ "RPCIB plugin driver", /* Driver name and version */ &rpcib_ops, /* Driver ops */ }; static struct modlinkage rib_modlinkage = { MODREV_1, (void *)&rib_modldrv, NULL }; typedef struct rib_lrc_entry { struct rib_lrc_entry *forw; struct rib_lrc_entry *back; char *lrc_buf; uint32_t lrc_len; void *avl_node; bool_t registered; struct mrc lrc_mhandle; bool_t lrc_on_freed_list; } rib_lrc_entry_t; typedef struct cache_struct { rib_lrc_entry_t r; uint32_t len; uint32_t elements; kmutex_t node_lock; avl_node_t avl_link; } cache_avl_struct_t; uint64_t cache_limit = 100 * 1024 * 1024; static uint64_t cache_watermark = 80 * 1024 * 1024; static bool_t stats_enabled = FALSE; static uint64_t max_unsignaled_rws = 5; int nfs_rdma_port = NFS_RDMA_PORT; #define RIBNETID_TCP "tcp" #define RIBNETID_TCP6 "tcp6" /* * rib_stat: private data pointer used when registering * with the IBTF. It is returned to the consumer * in all callbacks. */ static rpcib_state_t *rib_stat = NULL; #define RNR_RETRIES IBT_RNR_RETRY_1 #define MAX_PORTS 2 #define RDMA_DUMMY_WRID 0x4D3A1D4D3A1D #define RDMA_CONN_REAP_RETRY 10 /* 10 secs */ int preposted_rbufs = RDMA_BUFS_GRANT; int send_threshold = 1; /* * Old cards with Tavor driver have limited memory footprint * when booted in 32bit. The rib_max_rbufs tunable can be * tuned for more buffers if needed. */ #if !defined(_ELF64) && !defined(__sparc) int rib_max_rbufs = MAX_BUFS; #else int rib_max_rbufs = 10 * MAX_BUFS; #endif /* !(_ELF64) && !(__sparc) */ int rib_conn_timeout = 60 * 12; /* 12 minutes */ /* * State of the plugin. * ACCEPT = accepting new connections and requests. * NO_ACCEPT = not accepting new connection and requests. * This should eventually move to rpcib_state_t structure, since this * will tell in which state the plugin is for a particular type of service * like NFS, NLM or v4 Callback deamon. The plugin might be in accept * state for one and in no_accept state for the other. */ int plugin_state; kmutex_t plugin_state_lock; ldi_ident_t rpcib_li; /* * RPCIB RDMATF operations */ static rdma_stat rib_reachable(int addr_type, struct netbuf *, void **handle); static rdma_stat rib_disconnect(CONN *conn); static void rib_listen(struct rdma_svc_data *rd); static void rib_listen_stop(struct rdma_svc_data *rd); static rdma_stat rib_registermem(CONN *conn, caddr_t adsp, caddr_t buf, uint_t buflen, struct mrc *buf_handle); static rdma_stat rib_deregistermem(CONN *conn, caddr_t buf, struct mrc buf_handle); static rdma_stat rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp, caddr_t buf, uint_t buflen, struct mrc *buf_handle); static rdma_stat rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf, struct mrc buf_handle); static rdma_stat rib_registermemsync(CONN *conn, caddr_t adsp, caddr_t buf, uint_t buflen, struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle, void *lrc); static rdma_stat rib_deregistermemsync(CONN *conn, caddr_t buf, struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle, void *); static rdma_stat rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle, caddr_t buf, int len, int cpu); static rdma_stat rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf); static void rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf); static void *rib_rbuf_alloc(CONN *, rdma_buf_t *); static void rib_rbuf_free(CONN *conn, int ptype, void *buf); static rdma_stat rib_send(CONN *conn, struct clist *cl, uint32_t msgid); static rdma_stat rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid); static rdma_stat rib_post_resp(CONN *conn, struct clist *cl, uint32_t msgid); static rdma_stat rib_post_resp_remove(CONN *conn, uint32_t msgid); static rdma_stat rib_post_recv(CONN *conn, struct clist *cl); static rdma_stat rib_recv(CONN *conn, struct clist **clp, uint32_t msgid); static rdma_stat rib_read(CONN *conn, struct clist *cl, int wait); static rdma_stat rib_write(CONN *conn, struct clist *cl, int wait); static rdma_stat rib_ping_srv(int addr_type, struct netbuf *, rpcib_ping_t *); static rdma_stat rib_conn_get(struct netbuf *, struct netbuf *, int addr_type, void *, CONN **); static rdma_stat rib_conn_release(CONN *conn); static rdma_stat rib_connect(struct netbuf *, struct netbuf *, int, rpcib_ping_t *, CONN **); static rdma_stat rib_getinfo(rdma_info_t *info); static rib_lrc_entry_t *rib_get_cache_buf(CONN *conn, uint32_t len); static void rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *buf); static void rib_destroy_cache(rib_hca_t *hca); static void rib_server_side_cache_reclaim(void *argp); static int avl_compare(const void *t1, const void *t2); static void rib_stop_services(rib_hca_t *); static void rib_close_channels(rib_conn_list_t *); static void rib_conn_close(void *); static void rib_recv_rele(rib_qp_t *); static rdma_stat rib_conn_release_locked(CONN *conn); /* * RPCIB addressing operations */ /* * RDMA operations the RPCIB module exports */ static rdmaops_t rib_ops = { rib_reachable, rib_conn_get, rib_conn_release, rib_listen, rib_listen_stop, rib_registermem, rib_deregistermem, rib_registermemsync, rib_deregistermemsync, rib_syncmem, rib_reg_buf_alloc, rib_reg_buf_free, rib_send, rib_send_resp, rib_post_resp, rib_post_resp_remove, rib_post_recv, rib_recv, rib_read, rib_write, rib_getinfo, }; /* * RDMATF RPCIB plugin details */ static rdma_mod_t rib_mod = { "ibtf", /* api name */ RDMATF_VERS_1, 0, &rib_ops, /* rdma op vector for ibtf */ }; static rdma_stat rpcib_open_hcas(rpcib_state_t *); static rdma_stat rib_qp_init(rib_qp_t *, int); static void rib_svc_scq_handler(ibt_cq_hdl_t, void *); static void rib_clnt_scq_handler(ibt_cq_hdl_t, void *); static void rib_clnt_rcq_handler(ibt_cq_hdl_t, void *); static void rib_svc_rcq_handler(ibt_cq_hdl_t, void *); static rib_bufpool_t *rib_rbufpool_create(rib_hca_t *hca, int ptype, int num); static rdma_stat rib_reg_mem(rib_hca_t *, caddr_t adsp, caddr_t, uint_t, ibt_mr_flags_t, ibt_mr_hdl_t *, ibt_mr_desc_t *); static rdma_stat rib_reg_mem_user(rib_hca_t *, caddr_t, uint_t, ibt_mr_flags_t, ibt_mr_hdl_t *, ibt_mr_desc_t *, caddr_t); static rdma_stat rib_conn_to_srv(rib_hca_t *, rib_qp_t *, rpcib_ping_t *); static rdma_stat rib_clnt_create_chan(rib_hca_t *, struct netbuf *, rib_qp_t **); static rdma_stat rib_svc_create_chan(rib_hca_t *, caddr_t, uint8_t, rib_qp_t **); static rdma_stat rib_sendwait(rib_qp_t *, struct send_wid *); static struct send_wid *rib_init_sendwait(uint32_t, int, rib_qp_t *); static int rib_free_sendwait(struct send_wid *); static struct rdma_done_list *rdma_done_add(rib_qp_t *qp, uint32_t xid); static void rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd); static void rdma_done_rem_list(rib_qp_t *); static void rdma_done_notify(rib_qp_t *qp, uint32_t xid); static void rib_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t, ibt_async_event_t *); static rdma_stat rib_rem_rep(rib_qp_t *, struct reply *); static struct svc_recv *rib_init_svc_recv(rib_qp_t *, ibt_wr_ds_t *); static int rib_free_svc_recv(struct svc_recv *); static struct recv_wid *rib_create_wid(rib_qp_t *, ibt_wr_ds_t *, uint32_t); static void rib_free_wid(struct recv_wid *); static rdma_stat rib_disconnect_channel(CONN *, rib_conn_list_t *); static void rib_detach_hca(ibt_hca_hdl_t); static void rib_close_a_channel(CONN *); static void rib_send_hold(rib_qp_t *); static void rib_send_rele(rib_qp_t *); /* * Registration with IBTF as a consumer */ static struct ibt_clnt_modinfo_s rib_modinfo = { IBTI_V_CURR, IBT_GENERIC, rib_async_handler, /* async event handler */ NULL, /* Memory Region Handler */ "nfs/ib" }; /* * Global strucuture */ typedef struct rpcib_s { dev_info_t *rpcib_dip; kmutex_t rpcib_mutex; } rpcib_t; rpcib_t rpcib; /* * /etc/system controlled variable to control * debugging in rpcib kernel module. * Set it to values greater that 1 to control * the amount of debugging messages required. */ int rib_debug = 0; int _init(void) { int error; error = mod_install((struct modlinkage *)&rib_modlinkage); if (error != 0) { /* * Could not load module */ return (error); } mutex_init(&plugin_state_lock, NULL, MUTEX_DRIVER, NULL); return (0); } int _fini() { int status; /* * Remove module */ if ((status = mod_remove(&rib_modlinkage)) != 0) { return (status); } mutex_destroy(&plugin_state_lock); return (0); } int _info(struct modinfo *modinfop) { return (mod_info(&rib_modlinkage, modinfop)); } /* * rpcib_getinfo() * Given the device number, return the devinfo pointer or the * instance number. * Note: always succeed DDI_INFO_DEVT2INSTANCE, even before attach. */ /*ARGSUSED*/ static int rpcib_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result) { int ret = DDI_SUCCESS; switch (cmd) { case DDI_INFO_DEVT2DEVINFO: if (rpcib.rpcib_dip != NULL) *result = rpcib.rpcib_dip; else { *result = NULL; ret = DDI_FAILURE; } break; case DDI_INFO_DEVT2INSTANCE: *result = NULL; break; default: ret = DDI_FAILURE; } return (ret); } static void rpcib_free_hca_list() { rib_hca_t *hca, *hcap; rw_enter(&rib_stat->hcas_list_lock, RW_WRITER); hca = rib_stat->hcas_list; rib_stat->hcas_list = NULL; rw_exit(&rib_stat->hcas_list_lock); while (hca != NULL) { rw_enter(&hca->state_lock, RW_WRITER); hcap = hca; hca = hca->next; rib_stat->nhca_inited--; rib_mod.rdma_count--; hcap->state = HCA_DETACHED; rw_exit(&hcap->state_lock); rib_stop_hca_services(hcap); kmem_free(hcap, sizeof (*hcap)); } } static rdma_stat rpcib_free_service_list() { rib_service_t *service; ibt_status_t ret; rw_enter(&rib_stat->service_list_lock, RW_WRITER); while (rib_stat->service_list != NULL) { service = rib_stat->service_list; ret = ibt_unbind_all_services(service->srv_hdl); if (ret != IBT_SUCCESS) { rw_exit(&rib_stat->service_list_lock); #ifdef DEBUG cmn_err(CE_NOTE, "rpcib_free_service_list: " "ibt_unbind_all_services failed (%d)\n", (int)ret); #endif return (RDMA_FAILED); } ret = ibt_deregister_service(rib_stat->ibt_clnt_hdl, service->srv_hdl); if (ret != IBT_SUCCESS) { rw_exit(&rib_stat->service_list_lock); #ifdef DEBUG cmn_err(CE_NOTE, "rpcib_free_service_list: " "ibt_deregister_service failed (%d)\n", (int)ret); #endif return (RDMA_FAILED); } rib_stat->service_list = service->next; kmem_free(service, sizeof (rib_service_t)); } rw_exit(&rib_stat->service_list_lock); return (RDMA_SUCCESS); } static int rpcib_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) { ibt_status_t ibt_status; rdma_stat r_status; switch (cmd) { case DDI_ATTACH: break; case DDI_RESUME: return (DDI_SUCCESS); default: return (DDI_FAILURE); } mutex_init(&rpcib.rpcib_mutex, NULL, MUTEX_DRIVER, NULL); mutex_enter(&rpcib.rpcib_mutex); if (rpcib.rpcib_dip != NULL) { mutex_exit(&rpcib.rpcib_mutex); return (DDI_FAILURE); } rpcib.rpcib_dip = dip; mutex_exit(&rpcib.rpcib_mutex); /* * Create the "rpcib" minor-node. */ if (ddi_create_minor_node(dip, "rpcib", S_IFCHR, 0, DDI_PSEUDO, 0) != DDI_SUCCESS) { /* Error message, no cmn_err as they print on console */ return (DDI_FAILURE); } if (rib_stat == NULL) { rib_stat = kmem_zalloc(sizeof (*rib_stat), KM_SLEEP); mutex_init(&rib_stat->open_hca_lock, NULL, MUTEX_DRIVER, NULL); rw_init(&rib_stat->hcas_list_lock, NULL, RW_DRIVER, NULL); mutex_init(&rib_stat->listen_lock, NULL, MUTEX_DRIVER, NULL); } rib_stat->hca_count = ibt_get_hca_list(NULL); if (rib_stat->hca_count < 1) { mutex_destroy(&rib_stat->listen_lock); rw_destroy(&rib_stat->hcas_list_lock); mutex_destroy(&rib_stat->open_hca_lock); kmem_free(rib_stat, sizeof (*rib_stat)); rib_stat = NULL; return (DDI_FAILURE); } ibt_status = ibt_attach(&rib_modinfo, dip, (void *)rib_stat, &rib_stat->ibt_clnt_hdl); if (ibt_status != IBT_SUCCESS) { mutex_destroy(&rib_stat->listen_lock); rw_destroy(&rib_stat->hcas_list_lock); mutex_destroy(&rib_stat->open_hca_lock); kmem_free(rib_stat, sizeof (*rib_stat)); rib_stat = NULL; return (DDI_FAILURE); } rib_stat->service_list = NULL; rw_init(&rib_stat->service_list_lock, NULL, RW_DRIVER, NULL); mutex_enter(&rib_stat->open_hca_lock); if (rpcib_open_hcas(rib_stat) != RDMA_SUCCESS) { mutex_exit(&rib_stat->open_hca_lock); goto open_fail; } mutex_exit(&rib_stat->open_hca_lock); if (ddi_prop_update_int(DDI_DEV_T_NONE, dip, DDI_NO_AUTODETACH, 1) != DDI_PROP_SUCCESS) { cmn_err(CE_WARN, "rpcib_attach: ddi-no-autodetach prop update " "failed."); goto register_fail; } /* * Register with rdmatf */ r_status = rdma_register_mod(&rib_mod); if (r_status != RDMA_SUCCESS && r_status != RDMA_REG_EXIST) { cmn_err(CE_WARN, "rpcib_attach:rdma_register_mod failed, " "status = %d", r_status); goto register_fail; } return (DDI_SUCCESS); register_fail: open_fail: (void) ibt_detach(rib_stat->ibt_clnt_hdl); rpcib_free_hca_list(); (void) rpcib_free_service_list(); mutex_destroy(&rib_stat->listen_lock); rw_destroy(&rib_stat->hcas_list_lock); mutex_destroy(&rib_stat->open_hca_lock); rw_destroy(&rib_stat->service_list_lock); kmem_free(rib_stat, sizeof (*rib_stat)); rib_stat = NULL; return (DDI_FAILURE); } /*ARGSUSED*/ static int rpcib_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) { switch (cmd) { case DDI_DETACH: break; case DDI_SUSPEND: default: return (DDI_FAILURE); } /* * Detach the hca and free resources */ mutex_enter(&plugin_state_lock); plugin_state = NO_ACCEPT; mutex_exit(&plugin_state_lock); if (rpcib_free_service_list() != RDMA_SUCCESS) return (DDI_FAILURE); rpcib_free_hca_list(); (void) ibt_detach(rib_stat->ibt_clnt_hdl); mutex_destroy(&rib_stat->listen_lock); rw_destroy(&rib_stat->hcas_list_lock); mutex_destroy(&rib_stat->open_hca_lock); rw_destroy(&rib_stat->service_list_lock); kmem_free(rib_stat, sizeof (*rib_stat)); rib_stat = NULL; mutex_enter(&rpcib.rpcib_mutex); rpcib.rpcib_dip = NULL; mutex_exit(&rpcib.rpcib_mutex); mutex_destroy(&rpcib.rpcib_mutex); return (DDI_SUCCESS); } static void rib_rbufpool_free(rib_hca_t *, int); static void rib_rbufpool_deregister(rib_hca_t *, int); static void rib_rbufpool_destroy(rib_hca_t *hca, int ptype); static struct reply *rib_addreplylist(rib_qp_t *, uint32_t); static rdma_stat rib_rem_replylist(rib_qp_t *); static int rib_remreply(rib_qp_t *, struct reply *); static rdma_stat rib_add_connlist(CONN *, rib_conn_list_t *); static rdma_stat rib_rm_conn(CONN *, rib_conn_list_t *); /* * One CQ pair per HCA */ static rdma_stat rib_create_cq(rib_hca_t *hca, uint32_t cq_size, ibt_cq_handler_t cq_handler, rib_cq_t **cqp) { rib_cq_t *cq; ibt_cq_attr_t cq_attr; uint32_t real_size; ibt_status_t status; rdma_stat error = RDMA_SUCCESS; cq = kmem_zalloc(sizeof (rib_cq_t), KM_SLEEP); cq->rib_hca = hca; cq_attr.cq_size = cq_size; cq_attr.cq_flags = IBT_CQ_NO_FLAGS; status = ibt_alloc_cq(hca->hca_hdl, &cq_attr, &cq->rib_cq_hdl, &real_size); if (status != IBT_SUCCESS) { cmn_err(CE_WARN, "rib_create_cq: ibt_alloc_cq() failed," " status=%d", status); error = RDMA_FAILED; goto fail; } ibt_set_cq_handler(cq->rib_cq_hdl, cq_handler, hca); /* * Enable CQ callbacks. CQ Callbacks are single shot * (e.g. you have to call ibt_enable_cq_notify() * after each callback to get another one). */ status = ibt_enable_cq_notify(cq->rib_cq_hdl, IBT_NEXT_COMPLETION); if (status != IBT_SUCCESS) { cmn_err(CE_WARN, "rib_create_cq: " "enable_cq_notify failed, status %d", status); error = RDMA_FAILED; goto fail; } *cqp = cq; return (error); fail: if (cq->rib_cq_hdl) (void) ibt_free_cq(cq->rib_cq_hdl); if (cq) kmem_free(cq, sizeof (rib_cq_t)); return (error); } /* * rpcib_find_hca * * Caller should have already locked the hcas_lock before calling * this function. */ static rib_hca_t * rpcib_find_hca(rpcib_state_t *ribstat, ib_guid_t guid) { rib_hca_t *hca = ribstat->hcas_list; while (hca && hca->hca_guid != guid) hca = hca->next; return (hca); } static rdma_stat rpcib_open_hcas(rpcib_state_t *ribstat) { rib_hca_t *hca; ibt_status_t ibt_status; rdma_stat status; ibt_hca_portinfo_t *pinfop; ibt_pd_flags_t pd_flags = IBT_PD_NO_FLAGS; uint_t size, cq_size; int i; kstat_t *ksp; cache_avl_struct_t example_avl_node; char rssc_name[32]; int old_nhca_inited = ribstat->nhca_inited; ib_guid_t *hca_guids; ASSERT(MUTEX_HELD(&ribstat->open_hca_lock)); ribstat->hca_count = ibt_get_hca_list(&hca_guids); if (ribstat->hca_count == 0) return (RDMA_FAILED); rw_enter(&ribstat->hcas_list_lock, RW_WRITER); /* * Open a hca and setup for RDMA */ for (i = 0; i < ribstat->hca_count; i++) { if (rpcib_find_hca(ribstat, hca_guids[i])) continue; hca = kmem_zalloc(sizeof (rib_hca_t), KM_SLEEP); ibt_status = ibt_open_hca(ribstat->ibt_clnt_hdl, hca_guids[i], &hca->hca_hdl); if (ibt_status != IBT_SUCCESS) { kmem_free(hca, sizeof (rib_hca_t)); continue; } hca->hca_guid = hca_guids[i]; hca->ibt_clnt_hdl = ribstat->ibt_clnt_hdl; hca->state = HCA_INITED; /* * query HCA info */ ibt_status = ibt_query_hca(hca->hca_hdl, &hca->hca_attrs); if (ibt_status != IBT_SUCCESS) { goto fail1; } /* * One PD (Protection Domain) per HCA. * A qp is allowed to access a memory region * only when it's in the same PD as that of * the memory region. */ ibt_status = ibt_alloc_pd(hca->hca_hdl, pd_flags, &hca->pd_hdl); if (ibt_status != IBT_SUCCESS) { goto fail1; } /* * query HCA ports */ ibt_status = ibt_query_hca_ports(hca->hca_hdl, 0, &pinfop, &hca->hca_nports, &size); if (ibt_status != IBT_SUCCESS) { goto fail2; } hca->hca_ports = pinfop; hca->hca_pinfosz = size; pinfop = NULL; cq_size = DEF_CQ_SIZE; /* default cq size */ /* * Create 2 pairs of cq's (1 pair for client * and the other pair for server) on this hca. * If number of qp's gets too large, then several * cq's will be needed. */ status = rib_create_cq(hca, cq_size, rib_svc_rcq_handler, &hca->svc_rcq); if (status != RDMA_SUCCESS) { goto fail3; } status = rib_create_cq(hca, cq_size, rib_svc_scq_handler, &hca->svc_scq); if (status != RDMA_SUCCESS) { goto fail3; } status = rib_create_cq(hca, cq_size, rib_clnt_rcq_handler, &hca->clnt_rcq); if (status != RDMA_SUCCESS) { goto fail3; } status = rib_create_cq(hca, cq_size, rib_clnt_scq_handler, &hca->clnt_scq); if (status != RDMA_SUCCESS) { goto fail3; } /* * Create buffer pools. * Note rib_rbuf_create also allocates memory windows. */ hca->recv_pool = rib_rbufpool_create(hca, RECV_BUFFER, rib_max_rbufs); if (hca->recv_pool == NULL) { goto fail3; } hca->send_pool = rib_rbufpool_create(hca, SEND_BUFFER, rib_max_rbufs); if (hca->send_pool == NULL) { rib_rbufpool_destroy(hca, RECV_BUFFER); goto fail3; } if (hca->server_side_cache == NULL) { (void) sprintf(rssc_name, "rib_srvr_cache_%llx", (long long unsigned int) hca->hca_guid); hca->server_side_cache = kmem_cache_create( rssc_name, sizeof (cache_avl_struct_t), 0, NULL, NULL, rib_server_side_cache_reclaim, hca, NULL, 0); } avl_create(&hca->avl_tree, avl_compare, sizeof (cache_avl_struct_t), (uint_t)(uintptr_t)&example_avl_node.avl_link- (uint_t)(uintptr_t)&example_avl_node); rw_init(&hca->bound_services_lock, NULL, RW_DRIVER, hca->iblock); rw_init(&hca->state_lock, NULL, RW_DRIVER, hca->iblock); rw_init(&hca->avl_rw_lock, NULL, RW_DRIVER, hca->iblock); mutex_init(&hca->cache_allocation_lock, NULL, MUTEX_DRIVER, NULL); hca->avl_init = TRUE; /* Create kstats for the cache */ ASSERT(INGLOBALZONE(curproc)); if (!stats_enabled) { ksp = kstat_create_zone("unix", 0, "rpcib_cache", "rpc", KSTAT_TYPE_NAMED, sizeof (rpcib_kstat) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, GLOBAL_ZONEID); if (ksp) { ksp->ks_data = (void *) &rpcib_kstat; ksp->ks_update = rpcib_cache_kstat_update; kstat_install(ksp); stats_enabled = TRUE; } } if (hca->cleanup_helper == NULL) { char tq_name[sizeof (hca->hca_guid) * 2 + 1]; (void) snprintf(tq_name, sizeof (tq_name), "%llX", (unsigned long long int) hca->hca_guid); hca->cleanup_helper = ddi_taskq_create(NULL, tq_name, 1, TASKQ_DEFAULTPRI, 0); } mutex_init(&hca->cb_lock, NULL, MUTEX_DRIVER, hca->iblock); cv_init(&hca->cb_cv, NULL, CV_DRIVER, NULL); rw_init(&hca->cl_conn_list.conn_lock, NULL, RW_DRIVER, hca->iblock); rw_init(&hca->srv_conn_list.conn_lock, NULL, RW_DRIVER, hca->iblock); mutex_init(&hca->inuse_lock, NULL, MUTEX_DRIVER, hca->iblock); hca->inuse = TRUE; hca->next = ribstat->hcas_list; ribstat->hcas_list = hca; ribstat->nhca_inited++; ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz); continue; fail3: ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz); fail2: (void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl); fail1: (void) ibt_close_hca(hca->hca_hdl); kmem_free(hca, sizeof (rib_hca_t)); } rw_exit(&ribstat->hcas_list_lock); ibt_free_hca_list(hca_guids, ribstat->hca_count); rib_mod.rdma_count = rib_stat->nhca_inited; /* * return success if at least one new hca has been configured. */ if (ribstat->nhca_inited != old_nhca_inited) return (RDMA_SUCCESS); else return (RDMA_FAILED); } /* * Callback routines */ /* * SCQ handlers */ /* ARGSUSED */ static void rib_clnt_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg) { ibt_status_t ibt_status; ibt_wc_t wc; struct send_wid *wd; CONN *conn; rib_qp_t *qp; int i; /* * Re-enable cq notify here to avoid missing any * completion queue notification. */ (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION); ibt_status = IBT_SUCCESS; while (ibt_status != IBT_CQ_EMPTY) { bzero(&wc, sizeof (wc)); ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL); if (ibt_status != IBT_SUCCESS) return; /* * Got a send completion */ if (wc.wc_id != RDMA_DUMMY_WRID) { wd = (struct send_wid *)(uintptr_t)wc.wc_id; qp = wd->qp; conn = qptoc(qp); mutex_enter(&wd->sendwait_lock); switch (wc.wc_status) { case IBT_WC_SUCCESS: wd->status = RDMA_SUCCESS; break; default: /* * RC Send Q Error Code Local state Remote State * ==================== =========== ============ * IBT_WC_BAD_RESPONSE_ERR ERROR None * IBT_WC_LOCAL_LEN_ERR ERROR None * IBT_WC_LOCAL_CHAN_OP_ERR ERROR None * IBT_WC_LOCAL_PROTECT_ERR ERROR None * IBT_WC_MEM_WIN_BIND_ERR ERROR None * IBT_WC_REMOTE_INVALID_REQ_ERR ERROR ERROR * IBT_WC_REMOTE_ACCESS_ERR ERROR ERROR * IBT_WC_REMOTE_OP_ERR ERROR ERROR * IBT_WC_RNR_NAK_TIMEOUT_ERR ERROR None * IBT_WC_TRANS_TIMEOUT_ERR ERROR None * IBT_WC_WR_FLUSHED_ERR ERROR None */ /* * Channel in error state. Set connection to * ERROR and cleanup will happen either from * conn_release or from rib_conn_get */ wd->status = RDMA_FAILED; mutex_enter(&conn->c_lock); if (conn->c_state != C_DISCONN_PEND) conn->c_state = C_ERROR_CONN; mutex_exit(&conn->c_lock); break; } if (wd->cv_sig == 1) { /* * Notify poster */ cv_signal(&wd->wait_cv); mutex_exit(&wd->sendwait_lock); } else { /* * Poster not waiting for notification. * Free the send buffers and send_wid */ for (i = 0; i < wd->nsbufs; i++) { rib_rbuf_free(qptoc(wd->qp), SEND_BUFFER, (void *)(uintptr_t)wd->sbufaddr[i]); } /* decrement the send ref count */ rib_send_rele(qp); mutex_exit(&wd->sendwait_lock); (void) rib_free_sendwait(wd); } } } } /* ARGSUSED */ static void rib_svc_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg) { ibt_status_t ibt_status; ibt_wc_t wc; struct send_wid *wd; rib_qp_t *qp; CONN *conn; int i; /* * Re-enable cq notify here to avoid missing any * completion queue notification. */ (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION); ibt_status = IBT_SUCCESS; while (ibt_status != IBT_CQ_EMPTY) { bzero(&wc, sizeof (wc)); ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL); if (ibt_status != IBT_SUCCESS) return; /* * Got a send completion */ if (wc.wc_id != RDMA_DUMMY_WRID) { wd = (struct send_wid *)(uintptr_t)wc.wc_id; qp = wd->qp; conn = qptoc(qp); mutex_enter(&wd->sendwait_lock); switch (wc.wc_status) { case IBT_WC_SUCCESS: wd->status = RDMA_SUCCESS; break; default: /* * Channel in error state. Set connection to * ERROR and cleanup will happen either from * conn_release or conn timeout. */ wd->status = RDMA_FAILED; mutex_enter(&conn->c_lock); if (conn->c_state != C_DISCONN_PEND) conn->c_state = C_ERROR_CONN; mutex_exit(&conn->c_lock); break; } if (wd->cv_sig == 1) { /* * Update completion status and notify poster */ cv_signal(&wd->wait_cv); mutex_exit(&wd->sendwait_lock); } else { /* * Poster not waiting for notification. * Free the send buffers and send_wid */ for (i = 0; i < wd->nsbufs; i++) { rib_rbuf_free(qptoc(wd->qp), SEND_BUFFER, (void *)(uintptr_t)wd->sbufaddr[i]); } /* decrement the send ref count */ rib_send_rele(qp); mutex_exit(&wd->sendwait_lock); (void) rib_free_sendwait(wd); } } } } /* * RCQ handler */ /* ARGSUSED */ static void rib_clnt_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg) { rib_qp_t *qp; ibt_status_t ibt_status; ibt_wc_t wc; struct recv_wid *rwid; /* * Re-enable cq notify here to avoid missing any * completion queue notification. */ (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION); ibt_status = IBT_SUCCESS; while (ibt_status != IBT_CQ_EMPTY) { bzero(&wc, sizeof (wc)); ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL); if (ibt_status != IBT_SUCCESS) return; rwid = (struct recv_wid *)(uintptr_t)wc.wc_id; qp = rwid->qp; if (wc.wc_status == IBT_WC_SUCCESS) { XDR inxdrs, *xdrs; uint_t xid, vers, op, find_xid = 0; struct reply *r; CONN *conn = qptoc(qp); uint32_t rdma_credit = 0; xdrs = &inxdrs; xdrmem_create(xdrs, (caddr_t)(uintptr_t)rwid->addr, wc.wc_bytes_xfer, XDR_DECODE); /* * Treat xid as opaque (xid is the first entity * in the rpc rdma message). */ xid = *(uint32_t *)(uintptr_t)rwid->addr; /* Skip xid and set the xdr position accordingly. */ XDR_SETPOS(xdrs, sizeof (uint32_t)); (void) xdr_u_int(xdrs, &vers); (void) xdr_u_int(xdrs, &rdma_credit); (void) xdr_u_int(xdrs, &op); XDR_DESTROY(xdrs); if (vers != RPCRDMA_VERS) { /* * Invalid RPC/RDMA version. Cannot * interoperate. Set connection to * ERROR state and bail out. */ mutex_enter(&conn->c_lock); if (conn->c_state != C_DISCONN_PEND) conn->c_state = C_ERROR_CONN; mutex_exit(&conn->c_lock); rib_rbuf_free(conn, RECV_BUFFER, (void *)(uintptr_t)rwid->addr); rib_free_wid(rwid); rib_recv_rele(qp); continue; } mutex_enter(&qp->replylist_lock); for (r = qp->replylist; r != NULL; r = r->next) { if (r->xid == xid) { find_xid = 1; switch (op) { case RDMA_MSG: case RDMA_NOMSG: case RDMA_MSGP: r->status = RDMA_SUCCESS; r->vaddr_cq = rwid->addr; r->bytes_xfer = wc.wc_bytes_xfer; cv_signal(&r->wait_cv); break; default: rib_rbuf_free(qptoc(qp), RECV_BUFFER, (void *)(uintptr_t) rwid->addr); break; } break; } } mutex_exit(&qp->replylist_lock); if (find_xid == 0) { /* RPC caller not waiting for reply */ DTRACE_PROBE1(rpcib__i__nomatchxid1, int, xid); rib_rbuf_free(qptoc(qp), RECV_BUFFER, (void *)(uintptr_t)rwid->addr); } } else if (wc.wc_status == IBT_WC_WR_FLUSHED_ERR) { CONN *conn = qptoc(qp); /* * Connection being flushed. Just free * the posted buffer */ rib_rbuf_free(conn, RECV_BUFFER, (void *)(uintptr_t)rwid->addr); } else { CONN *conn = qptoc(qp); /* * RC Recv Q Error Code Local state Remote State * ==================== =========== ============ * IBT_WC_LOCAL_ACCESS_ERR ERROR ERROR when NAK recvd * IBT_WC_LOCAL_LEN_ERR ERROR ERROR when NAK recvd * IBT_WC_LOCAL_PROTECT_ERR ERROR ERROR when NAK recvd * IBT_WC_LOCAL_CHAN_OP_ERR ERROR ERROR when NAK recvd * IBT_WC_REMOTE_INVALID_REQ_ERR ERROR ERROR when NAK recvd * IBT_WC_WR_FLUSHED_ERR None None */ /* * Channel in error state. Set connection * in ERROR state. */ mutex_enter(&conn->c_lock); if (conn->c_state != C_DISCONN_PEND) conn->c_state = C_ERROR_CONN; mutex_exit(&conn->c_lock); rib_rbuf_free(conn, RECV_BUFFER, (void *)(uintptr_t)rwid->addr); } rib_free_wid(rwid); rib_recv_rele(qp); } } /* Server side */ /* ARGSUSED */ static void rib_svc_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg) { rdma_recv_data_t *rdp; rib_qp_t *qp; ibt_status_t ibt_status; ibt_wc_t wc; struct svc_recv *s_recvp; CONN *conn; mblk_t *mp; /* * Re-enable cq notify here to avoid missing any * completion queue notification. */ (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION); ibt_status = IBT_SUCCESS; while (ibt_status != IBT_CQ_EMPTY) { bzero(&wc, sizeof (wc)); ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL); if (ibt_status != IBT_SUCCESS) return; s_recvp = (struct svc_recv *)(uintptr_t)wc.wc_id; qp = s_recvp->qp; conn = qptoc(qp); if (wc.wc_status == IBT_WC_SUCCESS) { XDR inxdrs, *xdrs; uint_t xid, vers, op; uint32_t rdma_credit; xdrs = &inxdrs; /* s_recvp->vaddr stores data */ xdrmem_create(xdrs, (caddr_t)(uintptr_t)s_recvp->vaddr, wc.wc_bytes_xfer, XDR_DECODE); /* * Treat xid as opaque (xid is the first entity * in the rpc rdma message). */ xid = *(uint32_t *)(uintptr_t)s_recvp->vaddr; /* Skip xid and set the xdr position accordingly. */ XDR_SETPOS(xdrs, sizeof (uint32_t)); if (!xdr_u_int(xdrs, &vers) || !xdr_u_int(xdrs, &rdma_credit) || !xdr_u_int(xdrs, &op)) { rib_rbuf_free(conn, RECV_BUFFER, (void *)(uintptr_t)s_recvp->vaddr); XDR_DESTROY(xdrs); rib_recv_rele(qp); (void) rib_free_svc_recv(s_recvp); continue; } XDR_DESTROY(xdrs); if (vers != RPCRDMA_VERS) { /* * Invalid RPC/RDMA version. * Drop rpc rdma message. */ rib_rbuf_free(conn, RECV_BUFFER, (void *)(uintptr_t)s_recvp->vaddr); rib_recv_rele(qp); (void) rib_free_svc_recv(s_recvp); continue; } /* * Is this for RDMA_DONE? */ if (op == RDMA_DONE) { rib_rbuf_free(conn, RECV_BUFFER, (void *)(uintptr_t)s_recvp->vaddr); /* * Wake up the thread waiting on * a RDMA_DONE for xid */ mutex_enter(&qp->rdlist_lock); rdma_done_notify(qp, xid); mutex_exit(&qp->rdlist_lock); rib_recv_rele(qp); (void) rib_free_svc_recv(s_recvp); continue; } mutex_enter(&plugin_state_lock); if (plugin_state == ACCEPT) { while ((mp = allocb(sizeof (*rdp), BPRI_LO)) == NULL) (void) strwaitbuf( sizeof (*rdp), BPRI_LO); /* * Plugin is in accept state, hence the master * transport queue for this is still accepting * requests. Hence we can call svc_queuereq to * queue this recieved msg. */ rdp = (rdma_recv_data_t *)mp->b_rptr; rdp->conn = conn; rdp->rpcmsg.addr = (caddr_t)(uintptr_t)s_recvp->vaddr; rdp->rpcmsg.type = RECV_BUFFER; rdp->rpcmsg.len = wc.wc_bytes_xfer; rdp->status = wc.wc_status; mutex_enter(&conn->c_lock); conn->c_ref++; mutex_exit(&conn->c_lock); mp->b_wptr += sizeof (*rdp); svc_queuereq((queue_t *)rib_stat->q, mp); mutex_exit(&plugin_state_lock); } else { /* * The master transport for this is going * away and the queue is not accepting anymore * requests for krpc, so don't do anything, just * free the msg. */ mutex_exit(&plugin_state_lock); rib_rbuf_free(conn, RECV_BUFFER, (void *)(uintptr_t)s_recvp->vaddr); } } else { rib_rbuf_free(conn, RECV_BUFFER, (void *)(uintptr_t)s_recvp->vaddr); } rib_recv_rele(qp); (void) rib_free_svc_recv(s_recvp); } } static void rib_attach_hca() { mutex_enter(&rib_stat->open_hca_lock); (void) rpcib_open_hcas(rib_stat); rib_listen(NULL); mutex_exit(&rib_stat->open_hca_lock); } /* * Handles DR event of IBT_HCA_DETACH_EVENT. */ /* ARGSUSED */ static void rib_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl, ibt_async_code_t code, ibt_async_event_t *event) { switch (code) { case IBT_HCA_ATTACH_EVENT: rib_attach_hca(); break; case IBT_HCA_DETACH_EVENT: rib_detach_hca(hca_hdl); #ifdef DEBUG cmn_err(CE_NOTE, "rib_async_handler(): HCA being detached!\n"); #endif break; case IBT_EVENT_PORT_UP: /* * A port is up. We should call rib_listen() since there is * a chance that rib_listen() may have failed during * rib_attach_hca() because the port had not been up yet. */ rib_listen(NULL); #ifdef DEBUG cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_PORT_UP\n"); #endif break; #ifdef DEBUG case IBT_EVENT_PATH_MIGRATED: cmn_err(CE_NOTE, "rib_async_handler(): " "IBT_EVENT_PATH_MIGRATED\n"); break; case IBT_EVENT_SQD: cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_SQD\n"); break; case IBT_EVENT_COM_EST: cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_COM_EST\n"); break; case IBT_ERROR_CATASTROPHIC_CHAN: cmn_err(CE_NOTE, "rib_async_handler(): " "IBT_ERROR_CATASTROPHIC_CHAN\n"); break; case IBT_ERROR_INVALID_REQUEST_CHAN: cmn_err(CE_NOTE, "rib_async_handler(): " "IBT_ERROR_INVALID_REQUEST_CHAN\n"); break; case IBT_ERROR_ACCESS_VIOLATION_CHAN: cmn_err(CE_NOTE, "rib_async_handler(): " "IBT_ERROR_ACCESS_VIOLATION_CHAN\n"); break; case IBT_ERROR_PATH_MIGRATE_REQ: cmn_err(CE_NOTE, "rib_async_handler(): " "IBT_ERROR_PATH_MIGRATE_REQ\n"); break; case IBT_ERROR_CQ: cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_CQ\n"); break; case IBT_ERROR_PORT_DOWN: cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_PORT_DOWN\n"); break; case IBT_ASYNC_OPAQUE1: cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE1\n"); break; case IBT_ASYNC_OPAQUE2: cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE2\n"); break; case IBT_ASYNC_OPAQUE3: cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE3\n"); break; case IBT_ASYNC_OPAQUE4: cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE4\n"); break; #endif default: break; } } /* * Client's reachable function. */ static rdma_stat rib_reachable(int addr_type, struct netbuf *raddr, void **handle) { rdma_stat status; rpcib_ping_t rpt; struct netbuf saddr; CONN *conn; bzero(&saddr, sizeof (struct netbuf)); status = rib_connect(&saddr, raddr, addr_type, &rpt, &conn); if (status == RDMA_SUCCESS) { *handle = (void *)rpt.hca; /* release the reference */ (void) rib_conn_release(conn); return (RDMA_SUCCESS); } else { *handle = NULL; DTRACE_PROBE(rpcib__i__pingfailed); return (RDMA_FAILED); } } /* Client side qp creation */ static rdma_stat rib_clnt_create_chan(rib_hca_t *hca, struct netbuf *raddr, rib_qp_t **qp) { rib_qp_t *kqp = NULL; CONN *conn; rdma_clnt_cred_ctrl_t *cc_info; ASSERT(qp != NULL); *qp = NULL; kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP); conn = qptoc(kqp); kqp->hca = hca; kqp->rdmaconn.c_rdmamod = &rib_mod; kqp->rdmaconn.c_private = (caddr_t)kqp; kqp->mode = RIB_CLIENT; kqp->chan_flags = IBT_BLOCKING; conn->c_raddr.buf = kmem_alloc(raddr->len, KM_SLEEP); bcopy(raddr->buf, conn->c_raddr.buf, raddr->len); conn->c_raddr.len = conn->c_raddr.maxlen = raddr->len; /* * Initialize */ cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL); cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL); mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock); cv_init(&kqp->send_rbufs_cv, NULL, CV_DEFAULT, NULL); mutex_init(&kqp->send_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock); mutex_init(&kqp->replylist_lock, NULL, MUTEX_DRIVER, hca->iblock); mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock); mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock); cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL); mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock); /* * Initialize the client credit control * portion of the rdmaconn struct. */ kqp->rdmaconn.c_cc_type = RDMA_CC_CLNT; cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc; cc_info->clnt_cc_granted_ops = 0; cc_info->clnt_cc_in_flight_ops = 0; cv_init(&cc_info->clnt_cc_cv, NULL, CV_DEFAULT, NULL); *qp = kqp; return (RDMA_SUCCESS); } /* Server side qp creation */ static rdma_stat rib_svc_create_chan(rib_hca_t *hca, caddr_t q, uint8_t port, rib_qp_t **qp) { rib_qp_t *kqp = NULL; ibt_chan_sizes_t chan_sizes; ibt_rc_chan_alloc_args_t qp_attr; ibt_status_t ibt_status; rdma_srv_cred_ctrl_t *cc_info; *qp = NULL; kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP); kqp->hca = hca; kqp->port_num = port; kqp->rdmaconn.c_rdmamod = &rib_mod; kqp->rdmaconn.c_private = (caddr_t)kqp; /* * Create the qp handle */ bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t)); qp_attr.rc_scq = hca->svc_scq->rib_cq_hdl; qp_attr.rc_rcq = hca->svc_rcq->rib_cq_hdl; qp_attr.rc_pd = hca->pd_hdl; qp_attr.rc_hca_port_num = port; qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX; qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX; qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE; qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE; qp_attr.rc_clone_chan = NULL; qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR; qp_attr.rc_flags = IBT_WR_SIGNALED; rw_enter(&hca->state_lock, RW_READER); if (hca->state != HCA_DETACHED) { ibt_status = ibt_alloc_rc_channel(hca->hca_hdl, IBT_ACHAN_NO_FLAGS, &qp_attr, &kqp->qp_hdl, &chan_sizes); } else { rw_exit(&hca->state_lock); goto fail; } rw_exit(&hca->state_lock); if (ibt_status != IBT_SUCCESS) { DTRACE_PROBE1(rpcib__i_svccreatechanfail, int, ibt_status); goto fail; } kqp->mode = RIB_SERVER; kqp->chan_flags = IBT_BLOCKING; kqp->q = q; /* server ONLY */ cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL); cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL); mutex_init(&kqp->replylist_lock, NULL, MUTEX_DEFAULT, hca->iblock); mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock); cv_init(&kqp->send_rbufs_cv, NULL, CV_DEFAULT, NULL); mutex_init(&kqp->send_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock); mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock); mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock); cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL); mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock); /* * Set the private data area to qp to be used in callbacks */ ibt_set_chan_private(kqp->qp_hdl, (void *)kqp); kqp->rdmaconn.c_state = C_CONNECTED; /* * Initialize the server credit control * portion of the rdmaconn struct. */ kqp->rdmaconn.c_cc_type = RDMA_CC_SRV; cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_srv_cc; cc_info->srv_cc_buffers_granted = preposted_rbufs; cc_info->srv_cc_cur_buffers_used = 0; cc_info->srv_cc_posted = preposted_rbufs; *qp = kqp; return (RDMA_SUCCESS); fail: if (kqp) kmem_free(kqp, sizeof (rib_qp_t)); return (RDMA_FAILED); } /* ARGSUSED */ ibt_cm_status_t rib_clnt_cm_handler(void *clnt_hdl, ibt_cm_event_t *event, ibt_cm_return_args_t *ret_args, void *priv_data, ibt_priv_data_len_t len) { rib_hca_t *hca; hca = (rib_hca_t *)clnt_hdl; switch (event->cm_type) { /* got a connection close event */ case IBT_CM_EVENT_CONN_CLOSED: { CONN *conn; rib_qp_t *qp; /* check reason why connection was closed */ switch (event->cm_event.closed) { case IBT_CM_CLOSED_DREP_RCVD: case IBT_CM_CLOSED_DREQ_TIMEOUT: case IBT_CM_CLOSED_DUP: case IBT_CM_CLOSED_ABORT: case IBT_CM_CLOSED_ALREADY: /* * These cases indicate the local end initiated * the closing of the channel. Nothing to do here. */ break; default: /* * Reason for CONN_CLOSED event must be one of * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD * or IBT_CM_CLOSED_STALE. These indicate cases were * the remote end is closing the channel. In these * cases free the channel and transition to error * state */ qp = ibt_get_chan_private(event->cm_channel); conn = qptoc(qp); mutex_enter(&conn->c_lock); if (conn->c_state == C_DISCONN_PEND) { mutex_exit(&conn->c_lock); break; } conn->c_state = C_ERROR_CONN; /* * Free the conn if c_ref is down to 0 already */ if (conn->c_ref == 0) { /* * Remove from list and free conn */ conn->c_state = C_DISCONN_PEND; mutex_exit(&conn->c_lock); rw_enter(&hca->state_lock, RW_READER); if (hca->state != HCA_DETACHED) (void) rib_disconnect_channel(conn, &hca->cl_conn_list); rw_exit(&hca->state_lock); } else { /* * conn will be freed when c_ref goes to 0. * Indicate to cleaning thread not to close * the connection, but just free the channel. */ conn->c_flags |= C_CLOSE_NOTNEEDED; mutex_exit(&conn->c_lock); } #ifdef DEBUG if (rib_debug) cmn_err(CE_NOTE, "rib_clnt_cm_handler: " "(CONN_CLOSED) channel disconnected"); #endif break; } break; } default: break; } return (IBT_CM_ACCEPT); } /* * Connect to the server. */ rdma_stat rib_conn_to_srv(rib_hca_t *hca, rib_qp_t *qp, rpcib_ping_t *rptp) { ibt_chan_open_args_t chan_args; /* channel args */ ibt_chan_sizes_t chan_sizes; ibt_rc_chan_alloc_args_t qp_attr; ibt_status_t ibt_status; ibt_rc_returns_t ret_args; /* conn reject info */ int refresh = REFRESH_ATTEMPTS; /* refresh if IBT_CM_CONN_STALE */ ibt_ip_cm_info_t ipcm_info; uint8_t cmp_ip_pvt[IBT_IP_HDR_PRIV_DATA_SZ]; (void) bzero(&chan_args, sizeof (chan_args)); (void) bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t)); (void) bzero(&ipcm_info, sizeof (ibt_ip_cm_info_t)); ipcm_info.src_addr.family = rptp->srcip.family; switch (ipcm_info.src_addr.family) { case AF_INET: ipcm_info.src_addr.un.ip4addr = rptp->srcip.un.ip4addr; break; case AF_INET6: ipcm_info.src_addr.un.ip6addr = rptp->srcip.un.ip6addr; break; } ipcm_info.dst_addr.family = rptp->srcip.family; switch (ipcm_info.dst_addr.family) { case AF_INET: ipcm_info.dst_addr.un.ip4addr = rptp->dstip.un.ip4addr; break; case AF_INET6: ipcm_info.dst_addr.un.ip6addr = rptp->dstip.un.ip6addr; break; } ipcm_info.src_port = (in_port_t)nfs_rdma_port; ibt_status = ibt_format_ip_private_data(&ipcm_info, IBT_IP_HDR_PRIV_DATA_SZ, cmp_ip_pvt); if (ibt_status != IBT_SUCCESS) { cmn_err(CE_WARN, "ibt_format_ip_private_data failed\n"); return (-1); } qp_attr.rc_hca_port_num = rptp->path.pi_prim_cep_path.cep_hca_port_num; /* Alloc a RC channel */ qp_attr.rc_scq = hca->clnt_scq->rib_cq_hdl; qp_attr.rc_rcq = hca->clnt_rcq->rib_cq_hdl; qp_attr.rc_pd = hca->pd_hdl; qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX; qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX; qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE; qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE; qp_attr.rc_clone_chan = NULL; qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR; qp_attr.rc_flags = IBT_WR_SIGNALED; rptp->path.pi_sid = ibt_get_ip_sid(IPPROTO_TCP, nfs_rdma_port); chan_args.oc_path = &rptp->path; chan_args.oc_cm_handler = rib_clnt_cm_handler; chan_args.oc_cm_clnt_private = (void *)hca; chan_args.oc_rdma_ra_out = 4; chan_args.oc_rdma_ra_in = 4; chan_args.oc_path_retry_cnt = 2; chan_args.oc_path_rnr_retry_cnt = RNR_RETRIES; chan_args.oc_priv_data = cmp_ip_pvt; chan_args.oc_priv_data_len = IBT_IP_HDR_PRIV_DATA_SZ; refresh: rw_enter(&hca->state_lock, RW_READER); if (hca->state != HCA_DETACHED) { ibt_status = ibt_alloc_rc_channel(hca->hca_hdl, IBT_ACHAN_NO_FLAGS, &qp_attr, &qp->qp_hdl, &chan_sizes); } else { rw_exit(&hca->state_lock); return (RDMA_FAILED); } rw_exit(&hca->state_lock); if (ibt_status != IBT_SUCCESS) { DTRACE_PROBE1(rpcib__i_conntosrv, int, ibt_status); return (RDMA_FAILED); } /* Connect to the Server */ (void) bzero(&ret_args, sizeof (ret_args)); mutex_enter(&qp->cb_lock); ibt_status = ibt_open_rc_channel(qp->qp_hdl, IBT_OCHAN_NO_FLAGS, IBT_BLOCKING, &chan_args, &ret_args); if (ibt_status != IBT_SUCCESS) { DTRACE_PROBE2(rpcib__i_openrctosrv, int, ibt_status, int, ret_args.rc_status); (void) ibt_free_channel(qp->qp_hdl); qp->qp_hdl = NULL; mutex_exit(&qp->cb_lock); if (refresh-- && ibt_status == IBT_CM_FAILURE && ret_args.rc_status == IBT_CM_CONN_STALE) { /* * Got IBT_CM_CONN_STALE probably because of stale * data on the passive end of a channel that existed * prior to reboot. Retry establishing a channel * REFRESH_ATTEMPTS times, during which time the * stale conditions on the server might clear up. */ goto refresh; } return (RDMA_FAILED); } mutex_exit(&qp->cb_lock); /* * Set the private data area to qp to be used in callbacks */ ibt_set_chan_private(qp->qp_hdl, (void *)qp); return (RDMA_SUCCESS); } rdma_stat rib_ping_srv(int addr_type, struct netbuf *raddr, rpcib_ping_t *rptp) { uint_t i, addr_count; ibt_status_t ibt_status; uint8_t num_paths_p; ibt_ip_path_attr_t ipattr; ibt_path_ip_src_t srcip; rpcib_ipaddrs_t addrs4; rpcib_ipaddrs_t addrs6; struct sockaddr_in *sinp; struct sockaddr_in6 *sin6p; rdma_stat retval = RDMA_FAILED; rib_hca_t *hca; if ((addr_type != AF_INET) && (addr_type != AF_INET6)) return (RDMA_INVAL); ASSERT(raddr->buf != NULL); bzero(&ipattr, sizeof (ibt_ip_path_attr_t)); if (!rpcib_get_ib_addresses(&addrs4, &addrs6) || (addrs4.ri_count == 0 && addrs6.ri_count == 0)) { retval = RDMA_FAILED; goto done2; } if (addr_type == AF_INET) { addr_count = addrs4.ri_count; sinp = (struct sockaddr_in *)raddr->buf; rptp->dstip.family = AF_INET; rptp->dstip.un.ip4addr = sinp->sin_addr.s_addr; sinp = addrs4.ri_list; } else { addr_count = addrs6.ri_count; sin6p = (struct sockaddr_in6 *)raddr->buf; rptp->dstip.family = AF_INET6; rptp->dstip.un.ip6addr = sin6p->sin6_addr; sin6p = addrs6.ri_list; } rw_enter(&rib_stat->hcas_list_lock, RW_READER); for (hca = rib_stat->hcas_list; hca; hca = hca->next) { rw_enter(&hca->state_lock, RW_READER); if (hca->state == HCA_DETACHED) { rw_exit(&hca->state_lock); continue; } ipattr.ipa_dst_ip = &rptp->dstip; ipattr.ipa_hca_guid = hca->hca_guid; ipattr.ipa_ndst = 1; ipattr.ipa_max_paths = 1; ipattr.ipa_src_ip.family = rptp->dstip.family; for (i = 0; i < addr_count; i++) { num_paths_p = 0; if (addr_type == AF_INET) { ipattr.ipa_src_ip.un.ip4addr = sinp[i].sin_addr.s_addr; } else { ipattr.ipa_src_ip.un.ip6addr = sin6p[i].sin6_addr; } bzero(&srcip, sizeof (ibt_path_ip_src_t)); ibt_status = ibt_get_ip_paths(rib_stat->ibt_clnt_hdl, IBT_PATH_NO_FLAGS, &ipattr, &rptp->path, &num_paths_p, &srcip); if (ibt_status == IBT_SUCCESS && num_paths_p != 0 && rptp->path.pi_hca_guid == hca->hca_guid) { rptp->hca = hca; rw_exit(&hca->state_lock); if (addr_type == AF_INET) { rptp->srcip.family = AF_INET; rptp->srcip.un.ip4addr = srcip.ip_primary.un.ip4addr; } else { rptp->srcip.family = AF_INET6; rptp->srcip.un.ip6addr = srcip.ip_primary.un.ip6addr; } retval = RDMA_SUCCESS; goto done1; } } rw_exit(&hca->state_lock); } done1: rw_exit(&rib_stat->hcas_list_lock); done2: if (addrs4.ri_size > 0) kmem_free(addrs4.ri_list, addrs4.ri_size); if (addrs6.ri_size > 0) kmem_free(addrs6.ri_list, addrs6.ri_size); return (retval); } /* * Close channel, remove from connection list and * free up resources allocated for that channel. */ rdma_stat rib_disconnect_channel(CONN *conn, rib_conn_list_t *conn_list) { rib_qp_t *qp = ctoqp(conn); rib_hca_t *hca; mutex_enter(&conn->c_lock); if (conn->c_timeout != NULL) { mutex_exit(&conn->c_lock); (void) untimeout(conn->c_timeout); mutex_enter(&conn->c_lock); } while (conn->c_flags & C_CLOSE_PENDING) { cv_wait(&conn->c_cv, &conn->c_lock); } mutex_exit(&conn->c_lock); /* * c_ref == 0 and connection is in C_DISCONN_PEND */ hca = qp->hca; if (conn_list != NULL) (void) rib_rm_conn(conn, conn_list); /* * There is only one case where we get here with * qp_hdl = NULL, which is during connection setup on * the client. In such a case there are no posted * send/recv buffers. */ if (qp->qp_hdl != NULL) { mutex_enter(&qp->posted_rbufs_lock); while (qp->n_posted_rbufs) cv_wait(&qp->posted_rbufs_cv, &qp->posted_rbufs_lock); mutex_exit(&qp->posted_rbufs_lock); mutex_enter(&qp->send_rbufs_lock); while (qp->n_send_rbufs) cv_wait(&qp->send_rbufs_cv, &qp->send_rbufs_lock); mutex_exit(&qp->send_rbufs_lock); (void) ibt_free_channel(qp->qp_hdl); qp->qp_hdl = NULL; } ASSERT(qp->rdlist == NULL); if (qp->replylist != NULL) { (void) rib_rem_replylist(qp); } cv_destroy(&qp->cb_conn_cv); cv_destroy(&qp->posted_rbufs_cv); cv_destroy(&qp->send_rbufs_cv); mutex_destroy(&qp->cb_lock); mutex_destroy(&qp->replylist_lock); mutex_destroy(&qp->posted_rbufs_lock); mutex_destroy(&qp->send_rbufs_lock); mutex_destroy(&qp->rdlist_lock); cv_destroy(&conn->c_cv); mutex_destroy(&conn->c_lock); if (conn->c_raddr.buf != NULL) { kmem_free(conn->c_raddr.buf, conn->c_raddr.len); } if (conn->c_laddr.buf != NULL) { kmem_free(conn->c_laddr.buf, conn->c_laddr.len); } if (conn->c_netid != NULL) { kmem_free(conn->c_netid, (strlen(conn->c_netid) + 1)); } /* * Credit control cleanup. */ if (qp->rdmaconn.c_cc_type == RDMA_CC_CLNT) { rdma_clnt_cred_ctrl_t *cc_info; cc_info = &qp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc; cv_destroy(&cc_info->clnt_cc_cv); } kmem_free(qp, sizeof (rib_qp_t)); /* * If HCA has been DETACHED and the srv/clnt_conn_list is NULL, * then the hca is no longer being used. */ if (conn_list != NULL) { rw_enter(&hca->state_lock, RW_READER); if (hca->state == HCA_DETACHED) { rw_enter(&hca->srv_conn_list.conn_lock, RW_READER); if (hca->srv_conn_list.conn_hd == NULL) { rw_enter(&hca->cl_conn_list.conn_lock, RW_READER); if (hca->cl_conn_list.conn_hd == NULL) { mutex_enter(&hca->inuse_lock); hca->inuse = FALSE; cv_signal(&hca->cb_cv); mutex_exit(&hca->inuse_lock); } rw_exit(&hca->cl_conn_list.conn_lock); } rw_exit(&hca->srv_conn_list.conn_lock); } rw_exit(&hca->state_lock); } return (RDMA_SUCCESS); } /* * All sends are done under the protection of * the wdesc->sendwait_lock. n_send_rbufs count * is protected using the send_rbufs_lock. * lock ordering is: * sendwait_lock -> send_rbufs_lock */ void rib_send_hold(rib_qp_t *qp) { mutex_enter(&qp->send_rbufs_lock); qp->n_send_rbufs++; mutex_exit(&qp->send_rbufs_lock); } void rib_send_rele(rib_qp_t *qp) { mutex_enter(&qp->send_rbufs_lock); qp->n_send_rbufs--; if (qp->n_send_rbufs == 0) cv_signal(&qp->send_rbufs_cv); mutex_exit(&qp->send_rbufs_lock); } void rib_recv_rele(rib_qp_t *qp) { mutex_enter(&qp->posted_rbufs_lock); qp->n_posted_rbufs--; if (qp->n_posted_rbufs == 0) cv_signal(&qp->posted_rbufs_cv); mutex_exit(&qp->posted_rbufs_lock); } /* * Wait for send completion notification. Only on receiving a * notification be it a successful or error completion, free the * send_wid. */ static rdma_stat rib_sendwait(rib_qp_t *qp, struct send_wid *wd) { clock_t timout, cv_wait_ret; rdma_stat error = RDMA_SUCCESS; int i; /* * Wait for send to complete */ ASSERT(wd != NULL); mutex_enter(&wd->sendwait_lock); if (wd->status == (uint_t)SEND_WAIT) { timout = drv_usectohz(SEND_WAIT_TIME * 1000000) + ddi_get_lbolt(); if (qp->mode == RIB_SERVER) { while ((cv_wait_ret = cv_timedwait(&wd->wait_cv, &wd->sendwait_lock, timout)) > 0 && wd->status == (uint_t)SEND_WAIT) ; switch (cv_wait_ret) { case -1: /* timeout */ DTRACE_PROBE(rpcib__i__srvsendwait__timeout); wd->cv_sig = 0; /* no signal needed */ error = RDMA_TIMEDOUT; break; default: /* got send completion */ break; } } else { while ((cv_wait_ret = cv_timedwait_sig(&wd->wait_cv, &wd->sendwait_lock, timout)) > 0 && wd->status == (uint_t)SEND_WAIT) ; switch (cv_wait_ret) { case -1: /* timeout */ DTRACE_PROBE(rpcib__i__clntsendwait__timeout); wd->cv_sig = 0; /* no signal needed */ error = RDMA_TIMEDOUT; break; case 0: /* interrupted */ DTRACE_PROBE(rpcib__i__clntsendwait__intr); wd->cv_sig = 0; /* no signal needed */ error = RDMA_INTR; break; default: /* got send completion */ break; } } } if (wd->status != (uint_t)SEND_WAIT) { /* got send completion */ if (wd->status != RDMA_SUCCESS) { switch (wd->status) { case RDMA_CONNLOST: error = RDMA_CONNLOST; break; default: error = RDMA_FAILED; break; } } for (i = 0; i < wd->nsbufs; i++) { rib_rbuf_free(qptoc(qp), SEND_BUFFER, (void *)(uintptr_t)wd->sbufaddr[i]); } rib_send_rele(qp); mutex_exit(&wd->sendwait_lock); (void) rib_free_sendwait(wd); } else { mutex_exit(&wd->sendwait_lock); } return (error); } static struct send_wid * rib_init_sendwait(uint32_t xid, int cv_sig, rib_qp_t *qp) { struct send_wid *wd; wd = kmem_zalloc(sizeof (struct send_wid), KM_SLEEP); wd->xid = xid; wd->cv_sig = cv_sig; wd->qp = qp; cv_init(&wd->wait_cv, NULL, CV_DEFAULT, NULL); mutex_init(&wd->sendwait_lock, NULL, MUTEX_DRIVER, NULL); wd->status = (uint_t)SEND_WAIT; return (wd); } static int rib_free_sendwait(struct send_wid *wdesc) { cv_destroy(&wdesc->wait_cv); mutex_destroy(&wdesc->sendwait_lock); kmem_free(wdesc, sizeof (*wdesc)); return (0); } static rdma_stat rib_rem_rep(rib_qp_t *qp, struct reply *rep) { mutex_enter(&qp->replylist_lock); if (rep != NULL) { (void) rib_remreply(qp, rep); mutex_exit(&qp->replylist_lock); return (RDMA_SUCCESS); } mutex_exit(&qp->replylist_lock); return (RDMA_FAILED); } /* * Send buffers are freed here only in case of error in posting * on QP. If the post succeeded, the send buffers are freed upon * send completion in rib_sendwait() or in the scq_handler. */ rdma_stat rib_send_and_wait(CONN *conn, struct clist *cl, uint32_t msgid, int send_sig, int cv_sig, caddr_t *swid) { struct send_wid *wdesc; struct clist *clp; ibt_status_t ibt_status = IBT_SUCCESS; rdma_stat ret = RDMA_SUCCESS; ibt_send_wr_t tx_wr; int i, nds; ibt_wr_ds_t sgl[DSEG_MAX]; uint_t total_msg_size; rib_qp_t *qp; qp = ctoqp(conn); ASSERT(cl != NULL); bzero(&tx_wr, sizeof (ibt_send_wr_t)); nds = 0; total_msg_size = 0; clp = cl; while (clp != NULL) { if (nds >= DSEG_MAX) { DTRACE_PROBE(rpcib__i__sendandwait_dsegmax_exceeded); return (RDMA_FAILED); } sgl[nds].ds_va = clp->w.c_saddr; sgl[nds].ds_key = clp->c_smemhandle.mrc_lmr; /* lkey */ sgl[nds].ds_len = clp->c_len; total_msg_size += clp->c_len; clp = clp->c_next; nds++; } if (send_sig) { /* Set SEND_SIGNAL flag. */ tx_wr.wr_flags = IBT_WR_SEND_SIGNAL; wdesc = rib_init_sendwait(msgid, cv_sig, qp); *swid = (caddr_t)wdesc; tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc; mutex_enter(&wdesc->sendwait_lock); wdesc->nsbufs = nds; for (i = 0; i < nds; i++) { wdesc->sbufaddr[i] = sgl[i].ds_va; } } else { tx_wr.wr_flags = IBT_WR_NO_FLAGS; *swid = NULL; tx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID; } tx_wr.wr_opcode = IBT_WRC_SEND; tx_wr.wr_trans = IBT_RC_SRV; tx_wr.wr_nds = nds; tx_wr.wr_sgl = sgl; mutex_enter(&conn->c_lock); if (conn->c_state == C_CONNECTED) { ibt_status = ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL); } if (conn->c_state != C_CONNECTED || ibt_status != IBT_SUCCESS) { if (conn->c_state != C_DISCONN_PEND) conn->c_state = C_ERROR_CONN; mutex_exit(&conn->c_lock); if (send_sig) { for (i = 0; i < nds; i++) { rib_rbuf_free(conn, SEND_BUFFER, (void *)(uintptr_t)wdesc->sbufaddr[i]); } mutex_exit(&wdesc->sendwait_lock); (void) rib_free_sendwait(wdesc); } return (RDMA_CONNLOST); } mutex_exit(&conn->c_lock); if (send_sig) { rib_send_hold(qp); mutex_exit(&wdesc->sendwait_lock); if (cv_sig) { /* * cv_wait for send to complete. * We can fail due to a timeout or signal or * unsuccessful send. */ ret = rib_sendwait(qp, wdesc); return (ret); } } return (RDMA_SUCCESS); } rdma_stat rib_send(CONN *conn, struct clist *cl, uint32_t msgid) { rdma_stat ret; caddr_t wd; /* send-wait & cv_signal */ ret = rib_send_and_wait(conn, cl, msgid, 1, 1, &wd); return (ret); } /* * Deprecated/obsolete interface not used currently * but earlier used for READ-READ protocol. * Send RPC reply and wait for RDMA_DONE. */ rdma_stat rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid) { rdma_stat ret = RDMA_SUCCESS; struct rdma_done_list *rd; clock_t cv_wait_ret; caddr_t *wid = NULL; rib_qp_t *qp = ctoqp(conn); mutex_enter(&qp->rdlist_lock); rd = rdma_done_add(qp, msgid); /* No cv_signal (whether send-wait or no-send-wait) */ ret = rib_send_and_wait(conn, cl, msgid, 1, 0, wid); if (ret != RDMA_SUCCESS) { rdma_done_rm(qp, rd); } else { /* * Wait for RDMA_DONE from remote end */ cv_wait_ret = cv_reltimedwait(&rd->rdma_done_cv, &qp->rdlist_lock, drv_usectohz(REPLY_WAIT_TIME * 1000000), TR_CLOCK_TICK); rdma_done_rm(qp, rd); if (cv_wait_ret < 0) { ret = RDMA_TIMEDOUT; } } mutex_exit(&qp->rdlist_lock); return (ret); } static struct recv_wid * rib_create_wid(rib_qp_t *qp, ibt_wr_ds_t *sgl, uint32_t msgid) { struct recv_wid *rwid; rwid = kmem_zalloc(sizeof (struct recv_wid), KM_SLEEP); rwid->xid = msgid; rwid->addr = sgl->ds_va; rwid->qp = qp; return (rwid); } static void rib_free_wid(struct recv_wid *rwid) { kmem_free(rwid, sizeof (struct recv_wid)); } rdma_stat rib_clnt_post(CONN* conn, struct clist *cl, uint32_t msgid) { rib_qp_t *qp = ctoqp(conn); struct clist *clp = cl; struct reply *rep; struct recv_wid *rwid; int nds; ibt_wr_ds_t sgl[DSEG_MAX]; ibt_recv_wr_t recv_wr; rdma_stat ret; ibt_status_t ibt_status; /* * rdma_clnt_postrecv uses RECV_BUFFER. */ nds = 0; while (cl != NULL) { if (nds >= DSEG_MAX) { ret = RDMA_FAILED; goto done; } sgl[nds].ds_va = cl->w.c_saddr; sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */ sgl[nds].ds_len = cl->c_len; cl = cl->c_next; nds++; } if (nds != 1) { ret = RDMA_FAILED; goto done; } bzero(&recv_wr, sizeof (ibt_recv_wr_t)); recv_wr.wr_nds = nds; recv_wr.wr_sgl = sgl; rwid = rib_create_wid(qp, &sgl[0], msgid); if (rwid) { recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)rwid; } else { ret = RDMA_NORESOURCE; goto done; } rep = rib_addreplylist(qp, msgid); if (!rep) { rib_free_wid(rwid); ret = RDMA_NORESOURCE; goto done; } mutex_enter(&conn->c_lock); if (conn->c_state == C_CONNECTED) { ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL); } if (conn->c_state != C_CONNECTED || ibt_status != IBT_SUCCESS) { if (conn->c_state != C_DISCONN_PEND) conn->c_state = C_ERROR_CONN; mutex_exit(&conn->c_lock); rib_free_wid(rwid); (void) rib_rem_rep(qp, rep); ret = RDMA_CONNLOST; goto done; } mutex_enter(&qp->posted_rbufs_lock); qp->n_posted_rbufs++; mutex_exit(&qp->posted_rbufs_lock); mutex_exit(&conn->c_lock); return (RDMA_SUCCESS); done: while (clp != NULL) { rib_rbuf_free(conn, RECV_BUFFER, (void *)(uintptr_t)clp->w.c_saddr3); clp = clp->c_next; } return (ret); } rdma_stat rib_svc_post(CONN* conn, struct clist *cl) { rib_qp_t *qp = ctoqp(conn); struct svc_recv *s_recvp; int nds; ibt_wr_ds_t sgl[DSEG_MAX]; ibt_recv_wr_t recv_wr; ibt_status_t ibt_status; nds = 0; while (cl != NULL) { if (nds >= DSEG_MAX) { return (RDMA_FAILED); } sgl[nds].ds_va = cl->w.c_saddr; sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */ sgl[nds].ds_len = cl->c_len; cl = cl->c_next; nds++; } if (nds != 1) { rib_rbuf_free(conn, RECV_BUFFER, (caddr_t)(uintptr_t)sgl[0].ds_va); return (RDMA_FAILED); } bzero(&recv_wr, sizeof (ibt_recv_wr_t)); recv_wr.wr_nds = nds; recv_wr.wr_sgl = sgl; s_recvp = rib_init_svc_recv(qp, &sgl[0]); /* Use s_recvp's addr as wr id */ recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)s_recvp; mutex_enter(&conn->c_lock); if (conn->c_state == C_CONNECTED) { ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL); } if (conn->c_state != C_CONNECTED || ibt_status != IBT_SUCCESS) { if (conn->c_state != C_DISCONN_PEND) conn->c_state = C_ERROR_CONN; mutex_exit(&conn->c_lock); rib_rbuf_free(conn, RECV_BUFFER, (caddr_t)(uintptr_t)sgl[0].ds_va); (void) rib_free_svc_recv(s_recvp); return (RDMA_CONNLOST); } mutex_exit(&conn->c_lock); return (RDMA_SUCCESS); } /* Client */ rdma_stat rib_post_resp(CONN* conn, struct clist *cl, uint32_t msgid) { return (rib_clnt_post(conn, cl, msgid)); } /* Client */ rdma_stat rib_post_resp_remove(CONN* conn, uint32_t msgid) { rib_qp_t *qp = ctoqp(conn); struct reply *rep; mutex_enter(&qp->replylist_lock); for (rep = qp->replylist; rep != NULL; rep = rep->next) { if (rep->xid == msgid) { if (rep->vaddr_cq) { rib_rbuf_free(conn, RECV_BUFFER, (caddr_t)(uintptr_t)rep->vaddr_cq); } (void) rib_remreply(qp, rep); break; } } mutex_exit(&qp->replylist_lock); return (RDMA_SUCCESS); } /* Server */ rdma_stat rib_post_recv(CONN *conn, struct clist *cl) { rib_qp_t *qp = ctoqp(conn); if (rib_svc_post(conn, cl) == RDMA_SUCCESS) { mutex_enter(&qp->posted_rbufs_lock); qp->n_posted_rbufs++; mutex_exit(&qp->posted_rbufs_lock); return (RDMA_SUCCESS); } return (RDMA_FAILED); } /* * Client side only interface to "recv" the rpc reply buf * posted earlier by rib_post_resp(conn, cl, msgid). */ rdma_stat rib_recv(CONN *conn, struct clist **clp, uint32_t msgid) { struct reply *rep = NULL; clock_t timout, cv_wait_ret; rdma_stat ret = RDMA_SUCCESS; rib_qp_t *qp = ctoqp(conn); /* * Find the reply structure for this msgid */ mutex_enter(&qp->replylist_lock); for (rep = qp->replylist; rep != NULL; rep = rep->next) { if (rep->xid == msgid) break; } if (rep != NULL) { /* * If message not yet received, wait. */ if (rep->status == (uint_t)REPLY_WAIT) { timout = ddi_get_lbolt() + drv_usectohz(REPLY_WAIT_TIME * 1000000); while ((cv_wait_ret = cv_timedwait_sig(&rep->wait_cv, &qp->replylist_lock, timout)) > 0 && rep->status == (uint_t)REPLY_WAIT) ; switch (cv_wait_ret) { case -1: /* timeout */ ret = RDMA_TIMEDOUT; break; case 0: ret = RDMA_INTR; break; default: break; } } if (rep->status == RDMA_SUCCESS) { struct clist *cl = NULL; /* * Got message successfully */ clist_add(&cl, 0, rep->bytes_xfer, NULL, (caddr_t)(uintptr_t)rep->vaddr_cq, NULL, NULL); *clp = cl; } else { if (rep->status != (uint_t)REPLY_WAIT) { /* * Got error in reply message. Free * recv buffer here. */ ret = rep->status; rib_rbuf_free(conn, RECV_BUFFER, (caddr_t)(uintptr_t)rep->vaddr_cq); } } (void) rib_remreply(qp, rep); } else { /* * No matching reply structure found for given msgid on the * reply wait list. */ ret = RDMA_INVAL; DTRACE_PROBE(rpcib__i__nomatchxid2); } /* * Done. */ mutex_exit(&qp->replylist_lock); return (ret); } /* * RDMA write a buffer to the remote address. */ rdma_stat rib_write(CONN *conn, struct clist *cl, int wait) { ibt_send_wr_t tx_wr; int cv_sig; ibt_wr_ds_t sgl[DSEG_MAX]; struct send_wid *wdesc; ibt_status_t ibt_status; rdma_stat ret = RDMA_SUCCESS; rib_qp_t *qp = ctoqp(conn); uint64_t n_writes = 0; if (cl == NULL) { return (RDMA_FAILED); } while ((cl != NULL)) { if (cl->c_len > 0) { bzero(&tx_wr, sizeof (ibt_send_wr_t)); tx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->u.c_daddr; tx_wr.wr.rc.rcwr.rdma.rdma_rkey = cl->c_dmemhandle.mrc_rmr; /* rkey */ sgl[0].ds_va = cl->w.c_saddr; sgl[0].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */ sgl[0].ds_len = cl->c_len; if (wait) { cv_sig = 1; } else { if (n_writes > max_unsignaled_rws) { n_writes = 0; cv_sig = 1; } else { cv_sig = 0; } } if (cv_sig) { tx_wr.wr_flags = IBT_WR_SEND_SIGNAL; wdesc = rib_init_sendwait(0, cv_sig, qp); tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc; mutex_enter(&wdesc->sendwait_lock); } else { tx_wr.wr_flags = IBT_WR_NO_FLAGS; tx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID; } tx_wr.wr_opcode = IBT_WRC_RDMAW; tx_wr.wr_trans = IBT_RC_SRV; tx_wr.wr_nds = 1; tx_wr.wr_sgl = sgl; mutex_enter(&conn->c_lock); if (conn->c_state == C_CONNECTED) { ibt_status = ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL); } if (conn->c_state != C_CONNECTED || ibt_status != IBT_SUCCESS) { if (conn->c_state != C_DISCONN_PEND) conn->c_state = C_ERROR_CONN; mutex_exit(&conn->c_lock); if (cv_sig) { mutex_exit(&wdesc->sendwait_lock); (void) rib_free_sendwait(wdesc); } return (RDMA_CONNLOST); } mutex_exit(&conn->c_lock); /* * Wait for send to complete */ if (cv_sig) { rib_send_hold(qp); mutex_exit(&wdesc->sendwait_lock); ret = rib_sendwait(qp, wdesc); if (ret != 0) return (ret); } n_writes ++; } cl = cl->c_next; } return (RDMA_SUCCESS); } /* * RDMA Read a buffer from the remote address. */ rdma_stat rib_read(CONN *conn, struct clist *cl, int wait) { ibt_send_wr_t rx_wr; int cv_sig = 0; ibt_wr_ds_t sgl; struct send_wid *wdesc; ibt_status_t ibt_status = IBT_SUCCESS; rdma_stat ret = RDMA_SUCCESS; rib_qp_t *qp = ctoqp(conn); if (cl == NULL) { return (RDMA_FAILED); } while (cl != NULL) { bzero(&rx_wr, sizeof (ibt_send_wr_t)); /* * Remote address is at the head chunk item in list. */ rx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->w.c_saddr; rx_wr.wr.rc.rcwr.rdma.rdma_rkey = cl->c_smemhandle.mrc_rmr; sgl.ds_va = cl->u.c_daddr; sgl.ds_key = cl->c_dmemhandle.mrc_lmr; /* lkey */ sgl.ds_len = cl->c_len; /* * If there are multiple chunks to be read, and * wait is set, ask for signal only for the last chunk * and wait only on the last chunk. The completion of * RDMA_READ on last chunk ensures that reads on all * previous chunks are also completed. */ if (wait && (cl->c_next == NULL)) { cv_sig = 1; wdesc = rib_init_sendwait(0, cv_sig, qp); rx_wr.wr_flags = IBT_WR_SEND_SIGNAL; rx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc; mutex_enter(&wdesc->sendwait_lock); } else { rx_wr.wr_flags = IBT_WR_NO_FLAGS; rx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID; } rx_wr.wr_opcode = IBT_WRC_RDMAR; rx_wr.wr_trans = IBT_RC_SRV; rx_wr.wr_nds = 1; rx_wr.wr_sgl = &sgl; mutex_enter(&conn->c_lock); if (conn->c_state == C_CONNECTED) { ibt_status = ibt_post_send(qp->qp_hdl, &rx_wr, 1, NULL); } if (conn->c_state != C_CONNECTED || ibt_status != IBT_SUCCESS) { if (conn->c_state != C_DISCONN_PEND) conn->c_state = C_ERROR_CONN; mutex_exit(&conn->c_lock); if (wait && (cl->c_next == NULL)) { mutex_exit(&wdesc->sendwait_lock); (void) rib_free_sendwait(wdesc); } return (RDMA_CONNLOST); } mutex_exit(&conn->c_lock); /* * Wait for send to complete if this is the * last item in the list. */ if (wait && cl->c_next == NULL) { rib_send_hold(qp); mutex_exit(&wdesc->sendwait_lock); ret = rib_sendwait(qp, wdesc); if (ret != 0) return (ret); } cl = cl->c_next; } return (RDMA_SUCCESS); } /* * rib_srv_cm_handler() * Connection Manager callback to handle RC connection requests. */ /* ARGSUSED */ static ibt_cm_status_t rib_srv_cm_handler(void *any, ibt_cm_event_t *event, ibt_cm_return_args_t *ret_args, void *priv_data, ibt_priv_data_len_t len) { queue_t *q; rib_qp_t *qp; rib_hca_t *hca; rdma_stat status = RDMA_SUCCESS; int i; struct clist cl; rdma_buf_t rdbuf = {0}; void *buf = NULL; CONN *conn; ibt_ip_cm_info_t ipinfo; struct sockaddr_in *s; struct sockaddr_in6 *s6; int sin_size = sizeof (struct sockaddr_in); int in_size = sizeof (struct in_addr); int sin6_size = sizeof (struct sockaddr_in6); ASSERT(any != NULL); ASSERT(event != NULL); hca = (rib_hca_t *)any; /* got a connection request */ switch (event->cm_type) { case IBT_CM_EVENT_REQ_RCV: /* * If the plugin is in the NO_ACCEPT state, bail out. */ mutex_enter(&plugin_state_lock); if (plugin_state == NO_ACCEPT) { mutex_exit(&plugin_state_lock); return (IBT_CM_REJECT); } mutex_exit(&plugin_state_lock); /* * Need to send a MRA MAD to CM so that it does not * timeout on us. */ (void) ibt_cm_delay(IBT_CM_DELAY_REQ, event->cm_session_id, event->cm_event.req.req_timeout * 8, NULL, 0); mutex_enter(&rib_stat->open_hca_lock); q = rib_stat->q; mutex_exit(&rib_stat->open_hca_lock); status = rib_svc_create_chan(hca, (caddr_t)q, event->cm_event.req.req_prim_hca_port, &qp); if (status) { return (IBT_CM_REJECT); } ret_args->cm_ret.rep.cm_channel = qp->qp_hdl; ret_args->cm_ret.rep.cm_rdma_ra_out = 4; ret_args->cm_ret.rep.cm_rdma_ra_in = 4; ret_args->cm_ret.rep.cm_rnr_retry_cnt = RNR_RETRIES; /* * Pre-posts RECV buffers */ conn = qptoc(qp); for (i = 0; i < preposted_rbufs; i++) { bzero(&rdbuf, sizeof (rdbuf)); rdbuf.type = RECV_BUFFER; buf = rib_rbuf_alloc(conn, &rdbuf); if (buf == NULL) { /* * A connection is not established yet. * Just flush the channel. Buffers * posted till now will error out with * IBT_WC_WR_FLUSHED_ERR. */ (void) ibt_flush_channel(qp->qp_hdl); (void) rib_disconnect_channel(conn, NULL); return (IBT_CM_REJECT); } bzero(&cl, sizeof (cl)); cl.w.c_saddr3 = (caddr_t)rdbuf.addr; cl.c_len = rdbuf.len; cl.c_smemhandle.mrc_lmr = rdbuf.handle.mrc_lmr; /* lkey */ cl.c_next = NULL; status = rib_post_recv(conn, &cl); if (status != RDMA_SUCCESS) { /* * A connection is not established yet. * Just flush the channel. Buffers * posted till now will error out with * IBT_WC_WR_FLUSHED_ERR. */ (void) ibt_flush_channel(qp->qp_hdl); (void) rib_disconnect_channel(conn, NULL); return (IBT_CM_REJECT); } } (void) rib_add_connlist(conn, &hca->srv_conn_list); /* * Get the address translation */ rw_enter(&hca->state_lock, RW_READER); if (hca->state == HCA_DETACHED) { rw_exit(&hca->state_lock); return (IBT_CM_REJECT); } rw_exit(&hca->state_lock); bzero(&ipinfo, sizeof (ibt_ip_cm_info_t)); if (ibt_get_ip_data(event->cm_priv_data_len, event->cm_priv_data, &ipinfo) != IBT_SUCCESS) { return (IBT_CM_REJECT); } switch (ipinfo.src_addr.family) { case AF_INET: conn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP) + 1, KM_SLEEP); (void) strcpy(conn->c_netid, RIBNETID_TCP); conn->c_raddr.maxlen = conn->c_raddr.len = sin_size; conn->c_raddr.buf = kmem_zalloc(sin_size, KM_SLEEP); s = (struct sockaddr_in *)conn->c_raddr.buf; s->sin_family = AF_INET; bcopy((void *)&ipinfo.src_addr.un.ip4addr, &s->sin_addr, in_size); conn->c_laddr.maxlen = conn->c_laddr.len = sin_size; conn->c_laddr.buf = kmem_zalloc(sin_size, KM_SLEEP); s = (struct sockaddr_in *)conn->c_laddr.buf; s->sin_family = AF_INET; bcopy((void *)&ipinfo.dst_addr.un.ip4addr, &s->sin_addr, in_size); break; case AF_INET6: conn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP6) + 1, KM_SLEEP); (void) strcpy(conn->c_netid, RIBNETID_TCP6); conn->c_raddr.maxlen = conn->c_raddr.len = sin6_size; conn->c_raddr.buf = kmem_zalloc(sin6_size, KM_SLEEP); s6 = (struct sockaddr_in6 *)conn->c_raddr.buf; s6->sin6_family = AF_INET6; bcopy((void *)&ipinfo.src_addr.un.ip6addr, &s6->sin6_addr, sizeof (struct in6_addr)); conn->c_laddr.maxlen = conn->c_laddr.len = sin6_size; conn->c_laddr.buf = kmem_zalloc(sin6_size, KM_SLEEP); s6 = (struct sockaddr_in6 *)conn->c_laddr.buf; s6->sin6_family = AF_INET6; bcopy((void *)&ipinfo.dst_addr.un.ip6addr, &s6->sin6_addr, sizeof (struct in6_addr)); break; default: return (IBT_CM_REJECT); } break; case IBT_CM_EVENT_CONN_CLOSED: { CONN *conn; rib_qp_t *qp; switch (event->cm_event.closed) { case IBT_CM_CLOSED_DREP_RCVD: case IBT_CM_CLOSED_DREQ_TIMEOUT: case IBT_CM_CLOSED_DUP: case IBT_CM_CLOSED_ABORT: case IBT_CM_CLOSED_ALREADY: /* * These cases indicate the local end initiated * the closing of the channel. Nothing to do here. */ break; default: /* * Reason for CONN_CLOSED event must be one of * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD * or IBT_CM_CLOSED_STALE. These indicate cases were * the remote end is closing the channel. In these * cases free the channel and transition to error * state */ qp = ibt_get_chan_private(event->cm_channel); conn = qptoc(qp); mutex_enter(&conn->c_lock); if (conn->c_state == C_DISCONN_PEND) { mutex_exit(&conn->c_lock); break; } conn->c_state = C_ERROR_CONN; /* * Free the conn if c_ref goes down to 0 */ if (conn->c_ref == 0) { /* * Remove from list and free conn */ conn->c_state = C_DISCONN_PEND; mutex_exit(&conn->c_lock); (void) rib_disconnect_channel(conn, &hca->srv_conn_list); } else { /* * conn will be freed when c_ref goes to 0. * Indicate to cleaning thread not to close * the connection, but just free the channel. */ conn->c_flags |= C_CLOSE_NOTNEEDED; mutex_exit(&conn->c_lock); } DTRACE_PROBE(rpcib__i__srvcm_chandisconnect); break; } break; } case IBT_CM_EVENT_CONN_EST: /* * RTU received, hence connection established. */ if (rib_debug > 1) cmn_err(CE_NOTE, "rib_srv_cm_handler: " "(CONN_EST) channel established"); break; default: if (rib_debug > 2) { /* Let CM handle the following events. */ if (event->cm_type == IBT_CM_EVENT_REP_RCV) { cmn_err(CE_NOTE, "rib_srv_cm_handler: " "server recv'ed IBT_CM_EVENT_REP_RCV\n"); } else if (event->cm_type == IBT_CM_EVENT_LAP_RCV) { cmn_err(CE_NOTE, "rib_srv_cm_handler: " "server recv'ed IBT_CM_EVENT_LAP_RCV\n"); } else if (event->cm_type == IBT_CM_EVENT_MRA_RCV) { cmn_err(CE_NOTE, "rib_srv_cm_handler: " "server recv'ed IBT_CM_EVENT_MRA_RCV\n"); } else if (event->cm_type == IBT_CM_EVENT_APR_RCV) { cmn_err(CE_NOTE, "rib_srv_cm_handler: " "server recv'ed IBT_CM_EVENT_APR_RCV\n"); } else if (event->cm_type == IBT_CM_EVENT_FAILURE) { cmn_err(CE_NOTE, "rib_srv_cm_handler: " "server recv'ed IBT_CM_EVENT_FAILURE\n"); } } return (IBT_CM_DEFAULT); } /* accept all other CM messages (i.e. let the CM handle them) */ return (IBT_CM_ACCEPT); } static rdma_stat rib_register_service(rib_hca_t *hca, int service_type, uint8_t protocol_num, in_port_t dst_port) { ibt_srv_desc_t sdesc; ibt_hca_portinfo_t *port_infop; ib_svc_id_t srv_id; ibt_srv_hdl_t srv_hdl; uint_t port_size; uint_t pki, i, num_ports, nbinds; ibt_status_t ibt_status; rib_service_t *service; ib_pkey_t pkey; /* * Query all ports for the given HCA */ rw_enter(&hca->state_lock, RW_READER); if (hca->state != HCA_DETACHED) { ibt_status = ibt_query_hca_ports(hca->hca_hdl, 0, &port_infop, &num_ports, &port_size); rw_exit(&hca->state_lock); } else { rw_exit(&hca->state_lock); return (RDMA_FAILED); } if (ibt_status != IBT_SUCCESS) { return (RDMA_FAILED); } DTRACE_PROBE1(rpcib__i__regservice_numports, int, num_ports); for (i = 0; i < num_ports; i++) { if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) { DTRACE_PROBE1(rpcib__i__regservice__portinactive, int, i+1); } else if (port_infop[i].p_linkstate == IBT_PORT_ACTIVE) { DTRACE_PROBE1(rpcib__i__regservice__portactive, int, i+1); } } /* * Get all the IP addresses on this system to register the * given "service type" on all DNS recognized IP addrs. * Each service type such as NFS will have all the systems * IP addresses as its different names. For now the only * type of service we support in RPCIB is NFS. */ rw_enter(&rib_stat->service_list_lock, RW_WRITER); /* * Start registering and binding service to active * on active ports on this HCA. */ nbinds = 0; for (service = rib_stat->service_list; service && (service->srv_type != service_type); service = service->next) ; if (service == NULL) { /* * We use IP addresses as the service names for * service registration. Register each of them * with CM to obtain a svc_id and svc_hdl. We do not * register the service with machine's loopback address. */ (void) bzero(&srv_id, sizeof (ib_svc_id_t)); (void) bzero(&srv_hdl, sizeof (ibt_srv_hdl_t)); (void) bzero(&sdesc, sizeof (ibt_srv_desc_t)); sdesc.sd_handler = rib_srv_cm_handler; sdesc.sd_flags = 0; ibt_status = ibt_register_service(hca->ibt_clnt_hdl, &sdesc, ibt_get_ip_sid(protocol_num, dst_port), 1, &srv_hdl, &srv_id); if ((ibt_status != IBT_SUCCESS) && (ibt_status != IBT_CM_SERVICE_EXISTS)) { rw_exit(&rib_stat->service_list_lock); DTRACE_PROBE1(rpcib__i__regservice__ibtres, int, ibt_status); ibt_free_portinfo(port_infop, port_size); return (RDMA_FAILED); } /* * Allocate and prepare a service entry */ service = kmem_zalloc(sizeof (rib_service_t), KM_SLEEP); service->srv_type = service_type; service->srv_hdl = srv_hdl; service->srv_id = srv_id; service->next = rib_stat->service_list; rib_stat->service_list = service; DTRACE_PROBE1(rpcib__i__regservice__new__service, int, service->srv_type); } else { srv_hdl = service->srv_hdl; srv_id = service->srv_id; DTRACE_PROBE1(rpcib__i__regservice__existing__service, int, service->srv_type); } for (i = 0; i < num_ports; i++) { ibt_sbind_hdl_t sbp; rib_hca_service_t *hca_srv; ib_gid_t gid; if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) continue; for (pki = 0; pki < port_infop[i].p_pkey_tbl_sz; pki++) { pkey = port_infop[i].p_pkey_tbl[pki]; rw_enter(&hca->bound_services_lock, RW_READER); gid = port_infop[i].p_sgid_tbl[0]; for (hca_srv = hca->bound_services; hca_srv; hca_srv = hca_srv->next) { if ((hca_srv->srv_id == service->srv_id) && (hca_srv->gid.gid_prefix == gid.gid_prefix) && (hca_srv->gid.gid_guid == gid.gid_guid)) break; } rw_exit(&hca->bound_services_lock); if (hca_srv != NULL) { /* * port is alreay bound the the service */ DTRACE_PROBE1( rpcib__i__regservice__already__bound, int, i+1); nbinds++; continue; } if ((pkey & IBSRM_HB) && (pkey != IB_PKEY_INVALID_FULL)) { sbp = NULL; ibt_status = ibt_bind_service(srv_hdl, gid, NULL, hca, &sbp); if (ibt_status == IBT_SUCCESS) { hca_srv = kmem_zalloc( sizeof (rib_hca_service_t), KM_SLEEP); hca_srv->srv_id = srv_id; hca_srv->gid = gid; hca_srv->sbind_hdl = sbp; rw_enter(&hca->bound_services_lock, RW_WRITER); hca_srv->next = hca->bound_services; hca->bound_services = hca_srv; rw_exit(&hca->bound_services_lock); nbinds++; } DTRACE_PROBE1(rpcib__i__regservice__bindres, int, ibt_status); } } } rw_exit(&rib_stat->service_list_lock); ibt_free_portinfo(port_infop, port_size); if (nbinds == 0) { return (RDMA_FAILED); } else { /* * Put this plugin into accept state, since atleast * one registration was successful. */ mutex_enter(&plugin_state_lock); plugin_state = ACCEPT; mutex_exit(&plugin_state_lock); return (RDMA_SUCCESS); } } void rib_listen(struct rdma_svc_data *rd) { rdma_stat status; int n_listening = 0; rib_hca_t *hca; mutex_enter(&rib_stat->listen_lock); /* * if rd parameter is NULL then it means that rib_stat->q is * already initialized by a call from RDMA and we just want to * add a newly attached HCA to the same listening state as other * HCAs. */ if (rd == NULL) { if (rib_stat->q == NULL) { mutex_exit(&rib_stat->listen_lock); return; } } else { rib_stat->q = &rd->q; } rw_enter(&rib_stat->hcas_list_lock, RW_READER); for (hca = rib_stat->hcas_list; hca; hca = hca->next) { /* * First check if a hca is still attached */ rw_enter(&hca->state_lock, RW_READER); if (hca->state != HCA_INITED) { rw_exit(&hca->state_lock); continue; } rw_exit(&hca->state_lock); /* * Right now the only service type is NFS. Hence * force feed this value. Ideally to communicate * the service type it should be passed down in * rdma_svc_data. */ status = rib_register_service(hca, NFS, IPPROTO_TCP, nfs_rdma_port); if (status == RDMA_SUCCESS) n_listening++; } rw_exit(&rib_stat->hcas_list_lock); /* * Service active on an HCA, check rd->err_code for more * explainable errors. */ if (rd) { if (n_listening > 0) { rd->active = 1; rd->err_code = RDMA_SUCCESS; } else { rd->active = 0; rd->err_code = RDMA_FAILED; } } mutex_exit(&rib_stat->listen_lock); } /* XXXX */ /* ARGSUSED */ static void rib_listen_stop(struct rdma_svc_data *svcdata) { rib_hca_t *hca; mutex_enter(&rib_stat->listen_lock); /* * KRPC called the RDMATF to stop the listeners, this means * stop sending incomming or recieved requests to KRPC master * transport handle for RDMA-IB. This is also means that the * master transport handle, responsible for us, is going away. */ mutex_enter(&plugin_state_lock); plugin_state = NO_ACCEPT; if (svcdata != NULL) svcdata->active = 0; mutex_exit(&plugin_state_lock); rw_enter(&rib_stat->hcas_list_lock, RW_READER); for (hca = rib_stat->hcas_list; hca; hca = hca->next) { /* * First check if a hca is still attached */ rw_enter(&hca->state_lock, RW_READER); if (hca->state == HCA_DETACHED) { rw_exit(&hca->state_lock); continue; } rib_close_channels(&hca->srv_conn_list); rib_stop_services(hca); rw_exit(&hca->state_lock); } rw_exit(&rib_stat->hcas_list_lock); /* * Avoid rib_listen() using the stale q field. * This could happen if a port goes up after all services * are already unregistered. */ rib_stat->q = NULL; mutex_exit(&rib_stat->listen_lock); } /* * Traverse the HCA's service list to unbind and deregister services. * For each bound service of HCA to be removed, first find the corresponding * service handle (srv_hdl) and then unbind the service by calling * ibt_unbind_service(). */ static void rib_stop_services(rib_hca_t *hca) { rib_hca_service_t *srv_list, *to_remove; /* * unbind and deregister the services for this service type. * Right now there is only one service type. In future it will * be passed down to this function. */ rw_enter(&hca->bound_services_lock, RW_READER); srv_list = hca->bound_services; hca->bound_services = NULL; rw_exit(&hca->bound_services_lock); while (srv_list != NULL) { rib_service_t *sc; to_remove = srv_list; srv_list = to_remove->next; rw_enter(&rib_stat->service_list_lock, RW_READER); for (sc = rib_stat->service_list; sc && (sc->srv_id != to_remove->srv_id); sc = sc->next) ; /* * if sc is NULL then the service doesn't exist anymore, * probably just removed completely through rib_stat. */ if (sc != NULL) (void) ibt_unbind_service(sc->srv_hdl, to_remove->sbind_hdl); rw_exit(&rib_stat->service_list_lock); kmem_free(to_remove, sizeof (rib_hca_service_t)); } } static struct svc_recv * rib_init_svc_recv(rib_qp_t *qp, ibt_wr_ds_t *sgl) { struct svc_recv *recvp; recvp = kmem_zalloc(sizeof (struct svc_recv), KM_SLEEP); recvp->vaddr = sgl->ds_va; recvp->qp = qp; recvp->bytes_xfer = 0; return (recvp); } static int rib_free_svc_recv(struct svc_recv *recvp) { kmem_free(recvp, sizeof (*recvp)); return (0); } static struct reply * rib_addreplylist(rib_qp_t *qp, uint32_t msgid) { struct reply *rep; rep = kmem_zalloc(sizeof (struct reply), KM_NOSLEEP); if (rep == NULL) { DTRACE_PROBE(rpcib__i__addrreply__nomem); return (NULL); } rep->xid = msgid; rep->vaddr_cq = NULL; rep->bytes_xfer = 0; rep->status = (uint_t)REPLY_WAIT; rep->prev = NULL; cv_init(&rep->wait_cv, NULL, CV_DEFAULT, NULL); mutex_enter(&qp->replylist_lock); if (qp->replylist) { rep->next = qp->replylist; qp->replylist->prev = rep; } qp->rep_list_size++; DTRACE_PROBE1(rpcib__i__addrreply__listsize, int, qp->rep_list_size); qp->replylist = rep; mutex_exit(&qp->replylist_lock); return (rep); } static rdma_stat rib_rem_replylist(rib_qp_t *qp) { struct reply *r, *n; mutex_enter(&qp->replylist_lock); for (r = qp->replylist; r != NULL; r = n) { n = r->next; (void) rib_remreply(qp, r); } mutex_exit(&qp->replylist_lock); return (RDMA_SUCCESS); } static int rib_remreply(rib_qp_t *qp, struct reply *rep) { ASSERT(MUTEX_HELD(&qp->replylist_lock)); if (rep->prev) { rep->prev->next = rep->next; } if (rep->next) { rep->next->prev = rep->prev; } if (qp->replylist == rep) qp->replylist = rep->next; cv_destroy(&rep->wait_cv); qp->rep_list_size--; DTRACE_PROBE1(rpcib__i__remreply__listsize, int, qp->rep_list_size); kmem_free(rep, sizeof (*rep)); return (0); } rdma_stat rib_registermem(CONN *conn, caddr_t adsp, caddr_t buf, uint_t buflen, struct mrc *buf_handle) { ibt_mr_hdl_t mr_hdl = NULL; /* memory region handle */ ibt_mr_desc_t mr_desc; /* vaddr, lkey, rkey */ rdma_stat status; rib_hca_t *hca = (ctoqp(conn))->hca; /* * Note: ALL buffer pools use the same memory type RDMARW. */ status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc); if (status == RDMA_SUCCESS) { buf_handle->mrc_linfo = (uintptr_t)mr_hdl; buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey; buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey; } else { buf_handle->mrc_linfo = NULL; buf_handle->mrc_lmr = 0; buf_handle->mrc_rmr = 0; } return (status); } static rdma_stat rib_reg_mem(rib_hca_t *hca, caddr_t adsp, caddr_t buf, uint_t size, ibt_mr_flags_t spec, ibt_mr_hdl_t *mr_hdlp, ibt_mr_desc_t *mr_descp) { ibt_mr_attr_t mem_attr; ibt_status_t ibt_status; mem_attr.mr_vaddr = (uintptr_t)buf; mem_attr.mr_len = (ib_msglen_t)size; mem_attr.mr_as = (struct as *)(caddr_t)adsp; mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE | IBT_MR_ENABLE_REMOTE_READ | IBT_MR_ENABLE_REMOTE_WRITE | IBT_MR_ENABLE_WINDOW_BIND | spec; rw_enter(&hca->state_lock, RW_READER); if (hca->state != HCA_DETACHED) { ibt_status = ibt_register_mr(hca->hca_hdl, hca->pd_hdl, &mem_attr, mr_hdlp, mr_descp); rw_exit(&hca->state_lock); } else { rw_exit(&hca->state_lock); return (RDMA_FAILED); } if (ibt_status != IBT_SUCCESS) { return (RDMA_FAILED); } return (RDMA_SUCCESS); } rdma_stat rib_registermemsync(CONN *conn, caddr_t adsp, caddr_t buf, uint_t buflen, struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle, void *lrc) { ibt_mr_hdl_t mr_hdl = NULL; /* memory region handle */ rib_lrc_entry_t *l; ibt_mr_desc_t mr_desc; /* vaddr, lkey, rkey */ rdma_stat status; rib_hca_t *hca = (ctoqp(conn))->hca; /* * Non-coherent memory registration. */ l = (rib_lrc_entry_t *)lrc; if (l) { if (l->registered) { buf_handle->mrc_linfo = (uintptr_t)l->lrc_mhandle.mrc_linfo; buf_handle->mrc_lmr = (uint32_t)l->lrc_mhandle.mrc_lmr; buf_handle->mrc_rmr = (uint32_t)l->lrc_mhandle.mrc_rmr; *sync_handle = (RIB_SYNCMEM_HANDLE) (uintptr_t)l->lrc_mhandle.mrc_linfo; return (RDMA_SUCCESS); } else { /* Always register the whole buffer */ buf = (caddr_t)l->lrc_buf; buflen = l->lrc_len; } } status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc); if (status == RDMA_SUCCESS) { if (l) { l->lrc_mhandle.mrc_linfo = (uintptr_t)mr_hdl; l->lrc_mhandle.mrc_lmr = (uint32_t)mr_desc.md_lkey; l->lrc_mhandle.mrc_rmr = (uint32_t)mr_desc.md_rkey; l->registered = TRUE; } buf_handle->mrc_linfo = (uintptr_t)mr_hdl; buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey; buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey; *sync_handle = (RIB_SYNCMEM_HANDLE)mr_hdl; } else { buf_handle->mrc_linfo = NULL; buf_handle->mrc_lmr = 0; buf_handle->mrc_rmr = 0; } return (status); } /* ARGSUSED */ rdma_stat rib_deregistermem(CONN *conn, caddr_t buf, struct mrc buf_handle) { rib_hca_t *hca = (ctoqp(conn))->hca; /* * Allow memory deregistration even if HCA is * getting detached. Need all outstanding * memory registrations to be deregistered * before HCA_DETACH_EVENT can be accepted. */ (void) ibt_deregister_mr(hca->hca_hdl, (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo); return (RDMA_SUCCESS); } /* ARGSUSED */ rdma_stat rib_deregistermemsync(CONN *conn, caddr_t buf, struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle, void *lrc) { rib_lrc_entry_t *l; l = (rib_lrc_entry_t *)lrc; if (l) if (l->registered) return (RDMA_SUCCESS); (void) rib_deregistermem(conn, buf, buf_handle); return (RDMA_SUCCESS); } /* ARGSUSED */ rdma_stat rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle, caddr_t buf, int len, int cpu) { ibt_status_t status; rib_hca_t *hca = (ctoqp(conn))->hca; ibt_mr_sync_t mr_segment; mr_segment.ms_handle = (ibt_mr_hdl_t)shandle; mr_segment.ms_vaddr = (ib_vaddr_t)(uintptr_t)buf; mr_segment.ms_len = (ib_memlen_t)len; if (cpu) { /* make incoming data visible to memory */ mr_segment.ms_flags = IBT_SYNC_WRITE; } else { /* make memory changes visible to IO */ mr_segment.ms_flags = IBT_SYNC_READ; } rw_enter(&hca->state_lock, RW_READER); if (hca->state != HCA_DETACHED) { status = ibt_sync_mr(hca->hca_hdl, &mr_segment, 1); rw_exit(&hca->state_lock); } else { rw_exit(&hca->state_lock); return (RDMA_FAILED); } if (status == IBT_SUCCESS) return (RDMA_SUCCESS); else { return (RDMA_FAILED); } } /* * XXXX ???? */ static rdma_stat rib_getinfo(rdma_info_t *info) { /* * XXXX Hack! */ info->addrlen = 16; info->mts = 1000000; info->mtu = 1000000; return (RDMA_SUCCESS); } rib_bufpool_t * rib_rbufpool_create(rib_hca_t *hca, int ptype, int num) { rib_bufpool_t *rbp = NULL; bufpool_t *bp = NULL; caddr_t buf; ibt_mr_attr_t mem_attr; ibt_status_t ibt_status; int i, j; rbp = (rib_bufpool_t *)kmem_zalloc(sizeof (rib_bufpool_t), KM_SLEEP); bp = (bufpool_t *)kmem_zalloc(sizeof (bufpool_t) + num * sizeof (void *), KM_SLEEP); mutex_init(&bp->buflock, NULL, MUTEX_DRIVER, hca->iblock); bp->numelems = num; switch (ptype) { case SEND_BUFFER: mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE; bp->rsize = RPC_MSG_SZ; break; case RECV_BUFFER: mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE; bp->rsize = RPC_BUF_SIZE; break; default: goto fail; } /* * Register the pool. */ bp->bufsize = num * bp->rsize; bp->buf = kmem_zalloc(bp->bufsize, KM_SLEEP); rbp->mr_hdl = (ibt_mr_hdl_t *)kmem_zalloc(num * sizeof (ibt_mr_hdl_t), KM_SLEEP); rbp->mr_desc = (ibt_mr_desc_t *)kmem_zalloc(num * sizeof (ibt_mr_desc_t), KM_SLEEP); rw_enter(&hca->state_lock, RW_READER); if (hca->state == HCA_DETACHED) { rw_exit(&hca->state_lock); goto fail; } for (i = 0, buf = bp->buf; i < num; i++, buf += bp->rsize) { bzero(&rbp->mr_desc[i], sizeof (ibt_mr_desc_t)); mem_attr.mr_vaddr = (uintptr_t)buf; mem_attr.mr_len = (ib_msglen_t)bp->rsize; mem_attr.mr_as = NULL; ibt_status = ibt_register_mr(hca->hca_hdl, hca->pd_hdl, &mem_attr, &rbp->mr_hdl[i], &rbp->mr_desc[i]); if (ibt_status != IBT_SUCCESS) { for (j = 0; j < i; j++) { (void) ibt_deregister_mr(hca->hca_hdl, rbp->mr_hdl[j]); } rw_exit(&hca->state_lock); goto fail; } } rw_exit(&hca->state_lock); buf = (caddr_t)bp->buf; for (i = 0; i < num; i++, buf += bp->rsize) { bp->buflist[i] = (void *)buf; } bp->buffree = num - 1; /* no. of free buffers */ rbp->bpool = bp; return (rbp); fail: if (bp) { if (bp->buf) kmem_free(bp->buf, bp->bufsize); kmem_free(bp, sizeof (bufpool_t) + num*sizeof (void *)); } if (rbp) { if (rbp->mr_hdl) kmem_free(rbp->mr_hdl, num*sizeof (ibt_mr_hdl_t)); if (rbp->mr_desc) kmem_free(rbp->mr_desc, num*sizeof (ibt_mr_desc_t)); kmem_free(rbp, sizeof (rib_bufpool_t)); } return (NULL); } static void rib_rbufpool_deregister(rib_hca_t *hca, int ptype) { int i; rib_bufpool_t *rbp = NULL; bufpool_t *bp; /* * Obtain pool address based on type of pool */ switch (ptype) { case SEND_BUFFER: rbp = hca->send_pool; break; case RECV_BUFFER: rbp = hca->recv_pool; break; default: return; } if (rbp == NULL) return; bp = rbp->bpool; /* * Deregister the pool memory and free it. */ for (i = 0; i < bp->numelems; i++) { (void) ibt_deregister_mr(hca->hca_hdl, rbp->mr_hdl[i]); } } static void rib_rbufpool_free(rib_hca_t *hca, int ptype) { rib_bufpool_t *rbp = NULL; bufpool_t *bp; /* * Obtain pool address based on type of pool */ switch (ptype) { case SEND_BUFFER: rbp = hca->send_pool; break; case RECV_BUFFER: rbp = hca->recv_pool; break; default: return; } if (rbp == NULL) return; bp = rbp->bpool; /* * Free the pool memory. */ if (rbp->mr_hdl) kmem_free(rbp->mr_hdl, bp->numelems*sizeof (ibt_mr_hdl_t)); if (rbp->mr_desc) kmem_free(rbp->mr_desc, bp->numelems*sizeof (ibt_mr_desc_t)); if (bp->buf) kmem_free(bp->buf, bp->bufsize); mutex_destroy(&bp->buflock); kmem_free(bp, sizeof (bufpool_t) + bp->numelems*sizeof (void *)); kmem_free(rbp, sizeof (rib_bufpool_t)); } void rib_rbufpool_destroy(rib_hca_t *hca, int ptype) { /* * Deregister the pool memory and free it. */ rib_rbufpool_deregister(hca, ptype); rib_rbufpool_free(hca, ptype); } /* * Fetch a buffer from the pool of type specified in rdbuf->type. */ static rdma_stat rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf) { rib_lrc_entry_t *rlep; if (rdbuf->type == RDMA_LONG_BUFFER) { rlep = rib_get_cache_buf(conn, rdbuf->len); rdbuf->rb_private = (caddr_t)rlep; rdbuf->addr = rlep->lrc_buf; rdbuf->handle = rlep->lrc_mhandle; return (RDMA_SUCCESS); } rdbuf->addr = rib_rbuf_alloc(conn, rdbuf); if (rdbuf->addr) { switch (rdbuf->type) { case SEND_BUFFER: rdbuf->len = RPC_MSG_SZ; /* 1K */ break; case RECV_BUFFER: rdbuf->len = RPC_BUF_SIZE; /* 2K */ break; default: rdbuf->len = 0; } return (RDMA_SUCCESS); } else return (RDMA_FAILED); } /* * Fetch a buffer of specified type. * Note that rdbuf->handle is mw's rkey. */ static void * rib_rbuf_alloc(CONN *conn, rdma_buf_t *rdbuf) { rib_qp_t *qp = ctoqp(conn); rib_hca_t *hca = qp->hca; rdma_btype ptype = rdbuf->type; void *buf; rib_bufpool_t *rbp = NULL; bufpool_t *bp; int i; /* * Obtain pool address based on type of pool */ switch (ptype) { case SEND_BUFFER: rbp = hca->send_pool; break; case RECV_BUFFER: rbp = hca->recv_pool; break; default: return (NULL); } if (rbp == NULL) return (NULL); bp = rbp->bpool; mutex_enter(&bp->buflock); if (bp->buffree < 0) { mutex_exit(&bp->buflock); return (NULL); } /* XXXX put buf, rdbuf->handle.mrc_rmr, ... in one place. */ buf = bp->buflist[bp->buffree]; rdbuf->addr = buf; rdbuf->len = bp->rsize; for (i = bp->numelems - 1; i >= 0; i--) { if ((ib_vaddr_t)(uintptr_t)buf == rbp->mr_desc[i].md_vaddr) { rdbuf->handle.mrc_rmr = (uint32_t)rbp->mr_desc[i].md_rkey; rdbuf->handle.mrc_linfo = (uintptr_t)rbp->mr_hdl[i]; rdbuf->handle.mrc_lmr = (uint32_t)rbp->mr_desc[i].md_lkey; bp->buffree--; mutex_exit(&bp->buflock); return (buf); } } mutex_exit(&bp->buflock); return (NULL); } static void rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf) { if (rdbuf->type == RDMA_LONG_BUFFER) { rib_free_cache_buf(conn, (rib_lrc_entry_t *)rdbuf->rb_private); rdbuf->rb_private = NULL; return; } rib_rbuf_free(conn, rdbuf->type, rdbuf->addr); } static void rib_rbuf_free(CONN *conn, int ptype, void *buf) { rib_qp_t *qp = ctoqp(conn); rib_hca_t *hca = qp->hca; rib_bufpool_t *rbp = NULL; bufpool_t *bp; /* * Obtain pool address based on type of pool */ switch (ptype) { case SEND_BUFFER: rbp = hca->send_pool; break; case RECV_BUFFER: rbp = hca->recv_pool; break; default: return; } if (rbp == NULL) return; bp = rbp->bpool; mutex_enter(&bp->buflock); if (++bp->buffree >= bp->numelems) { /* * Should never happen */ bp->buffree--; } else { bp->buflist[bp->buffree] = buf; } mutex_exit(&bp->buflock); } static rdma_stat rib_add_connlist(CONN *cn, rib_conn_list_t *connlist) { rw_enter(&connlist->conn_lock, RW_WRITER); if (connlist->conn_hd) { cn->c_next = connlist->conn_hd; connlist->conn_hd->c_prev = cn; } connlist->conn_hd = cn; rw_exit(&connlist->conn_lock); return (RDMA_SUCCESS); } static rdma_stat rib_rm_conn(CONN *cn, rib_conn_list_t *connlist) { rw_enter(&connlist->conn_lock, RW_WRITER); if (cn->c_prev) { cn->c_prev->c_next = cn->c_next; } if (cn->c_next) { cn->c_next->c_prev = cn->c_prev; } if (connlist->conn_hd == cn) connlist->conn_hd = cn->c_next; rw_exit(&connlist->conn_lock); return (RDMA_SUCCESS); } /* ARGSUSED */ static rdma_stat rib_conn_get(struct netbuf *s_svcaddr, struct netbuf *d_svcaddr, int addr_type, void *handle, CONN **conn) { rdma_stat status; rpcib_ping_t rpt; status = rib_connect(s_svcaddr, d_svcaddr, addr_type, &rpt, conn); return (status); } /* * rib_find_hca_connection * * if there is an existing connection to the specified address then * it will be returned in conn, otherwise conn will be set to NULL. * Also cleans up any connection that is in error state. */ static int rib_find_hca_connection(rib_hca_t *hca, struct netbuf *s_svcaddr, struct netbuf *d_svcaddr, CONN **conn) { CONN *cn; clock_t cv_stat, timout; *conn = NULL; again: rw_enter(&hca->cl_conn_list.conn_lock, RW_READER); cn = hca->cl_conn_list.conn_hd; while (cn != NULL) { /* * First, clear up any connection in the ERROR state */ mutex_enter(&cn->c_lock); if (cn->c_state == C_ERROR_CONN) { if (cn->c_ref == 0) { /* * Remove connection from list and destroy it. */ cn->c_state = C_DISCONN_PEND; mutex_exit(&cn->c_lock); rw_exit(&hca->cl_conn_list.conn_lock); rib_conn_close((void *)cn); goto again; } mutex_exit(&cn->c_lock); cn = cn->c_next; continue; } if (cn->c_state == C_DISCONN_PEND) { mutex_exit(&cn->c_lock); cn = cn->c_next; continue; } /* * source address is only checked for if there is one, * this is the case for retries. */ if ((cn->c_raddr.len == d_svcaddr->len) && (bcmp(d_svcaddr->buf, cn->c_raddr.buf, d_svcaddr->len) == 0) && ((s_svcaddr->len == 0) || ((cn->c_laddr.len == s_svcaddr->len) && (bcmp(s_svcaddr->buf, cn->c_laddr.buf, s_svcaddr->len) == 0)))) { /* * Our connection. Give up conn list lock * as we are done traversing the list. */ rw_exit(&hca->cl_conn_list.conn_lock); if (cn->c_state == C_CONNECTED) { cn->c_ref++; /* sharing a conn */ mutex_exit(&cn->c_lock); *conn = cn; return (RDMA_SUCCESS); } if (cn->c_state == C_CONN_PEND) { /* * Hold a reference to this conn before * we give up the lock. */ cn->c_ref++; timout = ddi_get_lbolt() + drv_usectohz(CONN_WAIT_TIME * 1000000); while ((cv_stat = cv_timedwait_sig(&cn->c_cv, &cn->c_lock, timout)) > 0 && cn->c_state == C_CONN_PEND) ; if (cv_stat == 0) { (void) rib_conn_release_locked(cn); return (RDMA_INTR); } if (cv_stat < 0) { (void) rib_conn_release_locked(cn); return (RDMA_TIMEDOUT); } if (cn->c_state == C_CONNECTED) { *conn = cn; mutex_exit(&cn->c_lock); return (RDMA_SUCCESS); } else { (void) rib_conn_release_locked(cn); return (RDMA_TIMEDOUT); } } } mutex_exit(&cn->c_lock); cn = cn->c_next; } rw_exit(&hca->cl_conn_list.conn_lock); *conn = NULL; return (RDMA_FAILED); } /* * Connection management. * IBTF does not support recycling of channels. So connections are only * in four states - C_CONN_PEND, or C_CONNECTED, or C_ERROR_CONN or * C_DISCONN_PEND state. No C_IDLE state. * C_CONN_PEND state: Connection establishment in progress to the server. * C_CONNECTED state: A connection when created is in C_CONNECTED state. * It has an RC channel associated with it. ibt_post_send/recv are allowed * only in this state. * C_ERROR_CONN state: A connection transitions to this state when WRs on the * channel are completed in error or an IBT_CM_EVENT_CONN_CLOSED event * happens on the channel or a IBT_HCA_DETACH_EVENT occurs on the HCA. * C_DISCONN_PEND state: When a connection is in C_ERROR_CONN state and when * c_ref drops to 0 (this indicates that RPC has no more references to this * connection), the connection should be destroyed. A connection transitions * into this state when it is being destroyed. */ /* ARGSUSED */ static rdma_stat rib_connect(struct netbuf *s_svcaddr, struct netbuf *d_svcaddr, int addr_type, rpcib_ping_t *rpt, CONN **conn) { CONN *cn; int status; rib_hca_t *hca; rib_qp_t *qp; int s_addr_len; char *s_addr_buf; rw_enter(&rib_stat->hcas_list_lock, RW_READER); for (hca = rib_stat->hcas_list; hca; hca = hca->next) { rw_enter(&hca->state_lock, RW_READER); if (hca->state != HCA_DETACHED) { status = rib_find_hca_connection(hca, s_svcaddr, d_svcaddr, conn); rw_exit(&hca->state_lock); if ((status == RDMA_INTR) || (status == RDMA_SUCCESS)) { rw_exit(&rib_stat->hcas_list_lock); return (status); } } else rw_exit(&hca->state_lock); } rw_exit(&rib_stat->hcas_list_lock); /* * No existing connection found, establish a new connection. */ bzero(rpt, sizeof (rpcib_ping_t)); status = rib_ping_srv(addr_type, d_svcaddr, rpt); if (status != RDMA_SUCCESS) { return (RDMA_FAILED); } hca = rpt->hca; if (rpt->srcip.family == AF_INET) { s_addr_len = sizeof (rpt->srcip.un.ip4addr); s_addr_buf = (char *)&rpt->srcip.un.ip4addr; } else if (rpt->srcip.family == AF_INET6) { s_addr_len = sizeof (rpt->srcip.un.ip6addr); s_addr_buf = (char *)&rpt->srcip.un.ip6addr; } else { return (RDMA_FAILED); } /* * Channel to server doesn't exist yet, create one. */ if (rib_clnt_create_chan(hca, d_svcaddr, &qp) != RDMA_SUCCESS) { return (RDMA_FAILED); } cn = qptoc(qp); cn->c_state = C_CONN_PEND; cn->c_ref = 1; cn->c_laddr.buf = kmem_alloc(s_addr_len, KM_SLEEP); bcopy(s_addr_buf, cn->c_laddr.buf, s_addr_len); cn->c_laddr.len = cn->c_laddr.maxlen = s_addr_len; if (rpt->srcip.family == AF_INET) { cn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP) + 1, KM_SLEEP); (void) strcpy(cn->c_netid, RIBNETID_TCP); } else { cn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP6) + 1, KM_SLEEP); (void) strcpy(cn->c_netid, RIBNETID_TCP6); } /* * Add to conn list. * We had given up the READER lock. In the time since then, * another thread might have created the connection we are * trying here. But for now, that is quiet alright - there * might be two connections between a pair of hosts instead * of one. If we really want to close that window, * then need to check the list after acquiring the * WRITER lock. */ (void) rib_add_connlist(cn, &hca->cl_conn_list); status = rib_conn_to_srv(hca, qp, rpt); mutex_enter(&cn->c_lock); if (cn->c_flags & C_CLOSE_PENDING) { /* * This handles a case where the module or * HCA detached in the time a connection is * established. In such a case close the * connection immediately if this is the * only reference. */ if (cn->c_ref == 1) { cn->c_ref--; cn->c_state = C_DISCONN_PEND; mutex_exit(&cn->c_lock); rib_conn_close((void *)cn); return (RDMA_FAILED); } /* * Connection to be closed later when c_ref = 0 */ status = RDMA_FAILED; } if (status == RDMA_SUCCESS) { cn->c_state = C_CONNECTED; *conn = cn; } else { cn->c_state = C_ERROR_CONN; cn->c_ref--; } cv_signal(&cn->c_cv); mutex_exit(&cn->c_lock); return (status); } static void rib_conn_close(void *rarg) { CONN *conn = (CONN *)rarg; rib_qp_t *qp = ctoqp(conn); mutex_enter(&conn->c_lock); if (!(conn->c_flags & C_CLOSE_NOTNEEDED)) { conn->c_flags |= (C_CLOSE_NOTNEEDED | C_CLOSE_PENDING); /* * Live connection in CONNECTED state. */ if (conn->c_state == C_CONNECTED) { conn->c_state = C_ERROR_CONN; } mutex_exit(&conn->c_lock); rib_close_a_channel(conn); mutex_enter(&conn->c_lock); conn->c_flags &= ~C_CLOSE_PENDING; } mutex_exit(&conn->c_lock); if (qp->mode == RIB_SERVER) (void) rib_disconnect_channel(conn, &qp->hca->srv_conn_list); else (void) rib_disconnect_channel(conn, &qp->hca->cl_conn_list); } static void rib_conn_timeout_call(void *carg) { time_t idle_time; CONN *conn = (CONN *)carg; rib_hca_t *hca = ctoqp(conn)->hca; int error; mutex_enter(&conn->c_lock); if ((conn->c_ref > 0) || (conn->c_state == C_DISCONN_PEND)) { conn->c_timeout = NULL; mutex_exit(&conn->c_lock); return; } idle_time = (gethrestime_sec() - conn->c_last_used); if ((idle_time <= rib_conn_timeout) && (conn->c_state != C_ERROR_CONN)) { /* * There was activity after the last timeout. * Extend the conn life. Unless the conn is * already in error state. */ conn->c_timeout = timeout(rib_conn_timeout_call, conn, SEC_TO_TICK(rib_conn_timeout - idle_time)); mutex_exit(&conn->c_lock); return; } error = ddi_taskq_dispatch(hca->cleanup_helper, rib_conn_close, (void *)conn, DDI_NOSLEEP); /* * If taskq dispatch fails above, then reset the timeout * to try again after 10 secs. */ if (error != DDI_SUCCESS) { conn->c_timeout = timeout(rib_conn_timeout_call, conn, SEC_TO_TICK(RDMA_CONN_REAP_RETRY)); mutex_exit(&conn->c_lock); return; } conn->c_state = C_DISCONN_PEND; mutex_exit(&conn->c_lock); } static rdma_stat rib_conn_release(CONN *conn) { mutex_enter(&conn->c_lock); return (rib_conn_release_locked(conn)); } /* * Expects conn->c_lock to be held on entry. * c_lock released on return */ static rdma_stat rib_conn_release_locked(CONN *conn) { conn->c_ref--; conn->c_last_used = gethrestime_sec(); if (conn->c_ref > 0) { mutex_exit(&conn->c_lock); return (RDMA_SUCCESS); } /* * If a conn is C_ERROR_CONN, close the channel. */ if (conn->c_ref == 0 && conn->c_state == C_ERROR_CONN) { conn->c_state = C_DISCONN_PEND; mutex_exit(&conn->c_lock); rib_conn_close((void *)conn); return (RDMA_SUCCESS); } /* * c_ref == 0, set a timeout for conn release */ if (conn->c_timeout == NULL) { conn->c_timeout = timeout(rib_conn_timeout_call, conn, SEC_TO_TICK(rib_conn_timeout)); } mutex_exit(&conn->c_lock); return (RDMA_SUCCESS); } /* * Add at front of list */ static struct rdma_done_list * rdma_done_add(rib_qp_t *qp, uint32_t xid) { struct rdma_done_list *rd; ASSERT(MUTEX_HELD(&qp->rdlist_lock)); rd = kmem_alloc(sizeof (*rd), KM_SLEEP); rd->xid = xid; cv_init(&rd->rdma_done_cv, NULL, CV_DEFAULT, NULL); rd->prev = NULL; rd->next = qp->rdlist; if (qp->rdlist != NULL) qp->rdlist->prev = rd; qp->rdlist = rd; return (rd); } static void rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd) { struct rdma_done_list *r; ASSERT(MUTEX_HELD(&qp->rdlist_lock)); r = rd->next; if (r != NULL) { r->prev = rd->prev; } r = rd->prev; if (r != NULL) { r->next = rd->next; } else { qp->rdlist = rd->next; } cv_destroy(&rd->rdma_done_cv); kmem_free(rd, sizeof (*rd)); } static void rdma_done_rem_list(rib_qp_t *qp) { struct rdma_done_list *r, *n; mutex_enter(&qp->rdlist_lock); for (r = qp->rdlist; r != NULL; r = n) { n = r->next; rdma_done_rm(qp, r); } mutex_exit(&qp->rdlist_lock); } static void rdma_done_notify(rib_qp_t *qp, uint32_t xid) { struct rdma_done_list *r = qp->rdlist; ASSERT(MUTEX_HELD(&qp->rdlist_lock)); while (r) { if (r->xid == xid) { cv_signal(&r->rdma_done_cv); return; } else { r = r->next; } } DTRACE_PROBE1(rpcib__i__donenotify__nomatchxid, int, xid); } /* * Expects conn->c_lock to be held by the caller. */ static void rib_close_a_channel(CONN *conn) { rib_qp_t *qp; qp = ctoqp(conn); if (qp->qp_hdl == NULL) { /* channel already freed */ return; } /* * Call ibt_close_rc_channel in blocking mode * with no callbacks. */ (void) ibt_close_rc_channel(qp->qp_hdl, IBT_NOCALLBACKS, NULL, 0, NULL, NULL, 0); } /* * Goes through all connections and closes the channel * This will cause all the WRs on those channels to be * flushed. */ static void rib_close_channels(rib_conn_list_t *connlist) { CONN *conn, *tmp; rw_enter(&connlist->conn_lock, RW_READER); conn = connlist->conn_hd; while (conn != NULL) { mutex_enter(&conn->c_lock); tmp = conn->c_next; if (!(conn->c_flags & C_CLOSE_NOTNEEDED)) { if (conn->c_state == C_CONN_PEND) { conn->c_flags |= C_CLOSE_PENDING; goto next; } conn->c_flags |= (C_CLOSE_NOTNEEDED | C_CLOSE_PENDING); /* * Live connection in CONNECTED state. */ if (conn->c_state == C_CONNECTED) conn->c_state = C_ERROR_CONN; mutex_exit(&conn->c_lock); rib_close_a_channel(conn); mutex_enter(&conn->c_lock); conn->c_flags &= ~C_CLOSE_PENDING; /* Signal a pending rib_disconnect_channel() */ cv_signal(&conn->c_cv); } next: mutex_exit(&conn->c_lock); conn = tmp; } rw_exit(&connlist->conn_lock); } /* * Frees up all connections that are no longer being referenced */ static void rib_purge_connlist(rib_conn_list_t *connlist) { CONN *conn; top: rw_enter(&connlist->conn_lock, RW_READER); conn = connlist->conn_hd; while (conn != NULL) { mutex_enter(&conn->c_lock); /* * At this point connection is either in ERROR * or DISCONN_PEND state. If in DISCONN_PEND state * then some other thread is culling that connection. * If not and if c_ref is 0, then destroy the connection. */ if (conn->c_ref == 0 && conn->c_state != C_DISCONN_PEND) { /* * Cull the connection */ conn->c_state = C_DISCONN_PEND; mutex_exit(&conn->c_lock); rw_exit(&connlist->conn_lock); (void) rib_disconnect_channel(conn, connlist); goto top; } else { /* * conn disconnect already scheduled or will * happen from conn_release when c_ref drops to 0. */ mutex_exit(&conn->c_lock); } conn = conn->c_next; } rw_exit(&connlist->conn_lock); /* * At this point, only connections with c_ref != 0 are on the list */ } /* * Free all the HCA resources and close * the hca. */ static void rib_free_hca(rib_hca_t *hca) { (void) ibt_free_cq(hca->clnt_rcq->rib_cq_hdl); (void) ibt_free_cq(hca->clnt_scq->rib_cq_hdl); (void) ibt_free_cq(hca->svc_rcq->rib_cq_hdl); (void) ibt_free_cq(hca->svc_scq->rib_cq_hdl); kmem_free(hca->clnt_rcq, sizeof (rib_cq_t)); kmem_free(hca->clnt_scq, sizeof (rib_cq_t)); kmem_free(hca->svc_rcq, sizeof (rib_cq_t)); kmem_free(hca->svc_scq, sizeof (rib_cq_t)); rib_rbufpool_destroy(hca, RECV_BUFFER); rib_rbufpool_destroy(hca, SEND_BUFFER); rib_destroy_cache(hca); if (rib_mod.rdma_count == 0) (void) rdma_unregister_mod(&rib_mod); (void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl); (void) ibt_close_hca(hca->hca_hdl); hca->hca_hdl = NULL; } static void rib_stop_hca_services(rib_hca_t *hca) { rib_stop_services(hca); rib_close_channels(&hca->cl_conn_list); rib_close_channels(&hca->srv_conn_list); rib_purge_connlist(&hca->cl_conn_list); rib_purge_connlist(&hca->srv_conn_list); if ((rib_stat->hcas_list == NULL) && stats_enabled) { kstat_delete_byname_zone("unix", 0, "rpcib_cache", GLOBAL_ZONEID); stats_enabled = FALSE; } rw_enter(&hca->srv_conn_list.conn_lock, RW_READER); rw_enter(&hca->cl_conn_list.conn_lock, RW_READER); if (hca->srv_conn_list.conn_hd == NULL && hca->cl_conn_list.conn_hd == NULL) { /* * conn_lists are NULL, so destroy * buffers, close hca and be done. */ rib_free_hca(hca); } rw_exit(&hca->cl_conn_list.conn_lock); rw_exit(&hca->srv_conn_list.conn_lock); if (hca->hca_hdl != NULL) { mutex_enter(&hca->inuse_lock); while (hca->inuse) cv_wait(&hca->cb_cv, &hca->inuse_lock); mutex_exit(&hca->inuse_lock); rib_free_hca(hca); } rw_destroy(&hca->bound_services_lock); if (hca->cleanup_helper != NULL) { ddi_taskq_destroy(hca->cleanup_helper); hca->cleanup_helper = NULL; } } /* * Cleans and closes up all uses of the HCA */ static void rib_detach_hca(ibt_hca_hdl_t hca_hdl) { rib_hca_t *hca = NULL; rib_hca_t **hcap; rw_enter(&rib_stat->hcas_list_lock, RW_WRITER); for (hcap = &rib_stat->hcas_list; *hcap; hcap = &(*hcap)->next) { hca = *hcap; rw_enter(&hca->state_lock, RW_WRITER); if (hca->hca_hdl == hca_hdl) { /* * Mark as detached and remove from * hca list. */ hca->state = HCA_DETACHED; *hcap = hca->next; rib_stat->nhca_inited--; rib_mod.rdma_count--; rw_exit(&hca->state_lock); break; } rw_exit(&hca->state_lock); } rw_exit(&rib_stat->hcas_list_lock); if (hca == NULL) return; ASSERT(hca->hca_hdl == hca_hdl); /* * Stop all services on the HCA * Go through cl_conn_list and close all rc_channels * Go through svr_conn_list and close all rc_channels * Free connections whose c_ref has dropped to 0 * Destroy all CQs * Deregister and released all buffer pool memory after all * connections are destroyed * Free the protection domain * ibt_close_hca() */ rib_stop_hca_services(hca); kmem_free(hca, sizeof (*hca)); } static void rib_server_side_cache_reclaim(void *argp) { cache_avl_struct_t *rcas; rib_lrc_entry_t *rb; rib_hca_t *hca = (rib_hca_t *)argp; rw_enter(&hca->avl_rw_lock, RW_WRITER); rcas = avl_first(&hca->avl_tree); if (rcas != NULL) avl_remove(&hca->avl_tree, rcas); while (rcas != NULL) { while (rcas->r.forw != &rcas->r) { rcas->elements--; rb = rcas->r.forw; remque(rb); if (rb->registered) (void) rib_deregistermem_via_hca(hca, rb->lrc_buf, rb->lrc_mhandle); hca->cache_allocation -= rb->lrc_len; kmem_free(rb->lrc_buf, rb->lrc_len); kmem_free(rb, sizeof (rib_lrc_entry_t)); } mutex_destroy(&rcas->node_lock); kmem_cache_free(hca->server_side_cache, rcas); rcas = avl_first(&hca->avl_tree); if (rcas != NULL) avl_remove(&hca->avl_tree, rcas); } rw_exit(&hca->avl_rw_lock); } static void rib_server_side_cache_cleanup(void *argp) { cache_avl_struct_t *rcas; rib_lrc_entry_t *rb; rib_hca_t *hca = (rib_hca_t *)argp; mutex_enter(&hca->cache_allocation_lock); if (hca->cache_allocation < cache_limit) { mutex_exit(&hca->cache_allocation_lock); return; } mutex_exit(&hca->cache_allocation_lock); rw_enter(&hca->avl_rw_lock, RW_WRITER); rcas = avl_last(&hca->avl_tree); if (rcas != NULL) avl_remove(&hca->avl_tree, rcas); while (rcas != NULL) { while (rcas->r.forw != &rcas->r) { rcas->elements--; rb = rcas->r.forw; remque(rb); if (rb->registered) (void) rib_deregistermem_via_hca(hca, rb->lrc_buf, rb->lrc_mhandle); hca->cache_allocation -= rb->lrc_len; kmem_free(rb->lrc_buf, rb->lrc_len); kmem_free(rb, sizeof (rib_lrc_entry_t)); } mutex_destroy(&rcas->node_lock); if (hca->server_side_cache) { kmem_cache_free(hca->server_side_cache, rcas); } if (hca->cache_allocation < cache_limit) { rw_exit(&hca->avl_rw_lock); return; } rcas = avl_last(&hca->avl_tree); if (rcas != NULL) avl_remove(&hca->avl_tree, rcas); } rw_exit(&hca->avl_rw_lock); } static int avl_compare(const void *t1, const void *t2) { if (((cache_avl_struct_t *)t1)->len == ((cache_avl_struct_t *)t2)->len) return (0); if (((cache_avl_struct_t *)t1)->len < ((cache_avl_struct_t *)t2)->len) return (-1); return (1); } static void rib_destroy_cache(rib_hca_t *hca) { if (hca->avl_init) { rib_server_side_cache_reclaim((void *)hca); if (hca->server_side_cache) { kmem_cache_destroy(hca->server_side_cache); hca->server_side_cache = NULL; } avl_destroy(&hca->avl_tree); mutex_destroy(&hca->cache_allocation_lock); rw_destroy(&hca->avl_rw_lock); } hca->avl_init = FALSE; } static void rib_force_cleanup(void *hca) { if (((rib_hca_t *)hca)->cleanup_helper != NULL) (void) ddi_taskq_dispatch( ((rib_hca_t *)hca)->cleanup_helper, rib_server_side_cache_cleanup, (void *)hca, DDI_NOSLEEP); } static rib_lrc_entry_t * rib_get_cache_buf(CONN *conn, uint32_t len) { cache_avl_struct_t cas, *rcas; rib_hca_t *hca = (ctoqp(conn))->hca; rib_lrc_entry_t *reply_buf; avl_index_t where = NULL; uint64_t c_alloc = 0; if (!hca->avl_init) goto error_alloc; cas.len = len; rw_enter(&hca->avl_rw_lock, RW_READER); mutex_enter(&hca->cache_allocation_lock); c_alloc = hca->cache_allocation; mutex_exit(&hca->cache_allocation_lock); if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree, &cas, &where)) == NULL) { /* Am I above the cache limit */ if ((c_alloc + len) >= cache_limit) { rib_force_cleanup((void *)hca); rw_exit(&hca->avl_rw_lock); mutex_enter(&hca->cache_allocation_lock); hca->cache_misses_above_the_limit ++; mutex_exit(&hca->cache_allocation_lock); /* Allocate and register the buffer directly */ goto error_alloc; } rw_exit(&hca->avl_rw_lock); rw_enter(&hca->avl_rw_lock, RW_WRITER); /* Recheck to make sure no other thread added the entry in */ if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree, &cas, &where)) == NULL) { /* Allocate an avl tree entry */ rcas = (cache_avl_struct_t *) kmem_cache_alloc(hca->server_side_cache, KM_SLEEP); bzero(rcas, sizeof (cache_avl_struct_t)); rcas->elements = 0; rcas->r.forw = &rcas->r; rcas->r.back = &rcas->r; rcas->len = len; mutex_init(&rcas->node_lock, NULL, MUTEX_DEFAULT, NULL); avl_insert(&hca->avl_tree, rcas, where); } } mutex_enter(&rcas->node_lock); if (rcas->r.forw != &rcas->r && rcas->elements > 0) { reply_buf = rcas->r.forw; remque(reply_buf); rcas->elements--; mutex_exit(&rcas->node_lock); rw_exit(&hca->avl_rw_lock); mutex_enter(&hca->cache_allocation_lock); hca->cache_hits++; hca->cache_allocation -= len; mutex_exit(&hca->cache_allocation_lock); } else { /* Am I above the cache limit */ mutex_exit(&rcas->node_lock); if ((c_alloc + len) >= cache_limit) { rib_force_cleanup((void *)hca); rw_exit(&hca->avl_rw_lock); mutex_enter(&hca->cache_allocation_lock); hca->cache_misses_above_the_limit++; mutex_exit(&hca->cache_allocation_lock); /* Allocate and register the buffer directly */ goto error_alloc; } rw_exit(&hca->avl_rw_lock); mutex_enter(&hca->cache_allocation_lock); hca->cache_misses++; mutex_exit(&hca->cache_allocation_lock); /* Allocate a reply_buf entry */ reply_buf = (rib_lrc_entry_t *) kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP); bzero(reply_buf, sizeof (rib_lrc_entry_t)); reply_buf->lrc_buf = kmem_alloc(len, KM_SLEEP); reply_buf->lrc_len = len; reply_buf->registered = FALSE; reply_buf->avl_node = (void *)rcas; } return (reply_buf); error_alloc: reply_buf = (rib_lrc_entry_t *) kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP); bzero(reply_buf, sizeof (rib_lrc_entry_t)); reply_buf->lrc_buf = kmem_alloc(len, KM_SLEEP); reply_buf->lrc_len = len; reply_buf->registered = FALSE; reply_buf->avl_node = NULL; return (reply_buf); } /* * Return a pre-registered back to the cache (without * unregistering the buffer).. */ static void rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *reg_buf) { cache_avl_struct_t cas, *rcas; avl_index_t where = NULL; rib_hca_t *hca = (ctoqp(conn))->hca; if (!hca->avl_init) goto error_free; cas.len = reg_buf->lrc_len; rw_enter(&hca->avl_rw_lock, RW_READER); if ((rcas = (cache_avl_struct_t *) avl_find(&hca->avl_tree, &cas, &where)) == NULL) { rw_exit(&hca->avl_rw_lock); goto error_free; } else { cas.len = reg_buf->lrc_len; mutex_enter(&rcas->node_lock); insque(reg_buf, &rcas->r); rcas->elements ++; mutex_exit(&rcas->node_lock); rw_exit(&hca->avl_rw_lock); mutex_enter(&hca->cache_allocation_lock); hca->cache_allocation += cas.len; mutex_exit(&hca->cache_allocation_lock); } return; error_free: if (reg_buf->registered) (void) rib_deregistermem_via_hca(hca, reg_buf->lrc_buf, reg_buf->lrc_mhandle); kmem_free(reg_buf->lrc_buf, reg_buf->lrc_len); kmem_free(reg_buf, sizeof (rib_lrc_entry_t)); } static rdma_stat rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp, caddr_t buf, uint_t buflen, struct mrc *buf_handle) { ibt_mr_hdl_t mr_hdl = NULL; /* memory region handle */ ibt_mr_desc_t mr_desc; /* vaddr, lkey, rkey */ rdma_stat status; /* * Note: ALL buffer pools use the same memory type RDMARW. */ status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc); if (status == RDMA_SUCCESS) { buf_handle->mrc_linfo = (uint64_t)(uintptr_t)mr_hdl; buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey; buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey; } else { buf_handle->mrc_linfo = NULL; buf_handle->mrc_lmr = 0; buf_handle->mrc_rmr = 0; } return (status); } /* ARGSUSED */ static rdma_stat rib_deregistermemsync_via_hca(rib_hca_t *hca, caddr_t buf, struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle) { (void) rib_deregistermem_via_hca(hca, buf, buf_handle); return (RDMA_SUCCESS); } /* ARGSUSED */ static rdma_stat rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf, struct mrc buf_handle) { (void) ibt_deregister_mr(hca->hca_hdl, (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo); return (RDMA_SUCCESS); } /* * Check if the IP interface named by `lifrp' is RDMA-capable. */ static boolean_t rpcib_rdma_capable_interface(struct lifreq *lifrp) { char ifname[LIFNAMSIZ]; char *cp; if (lifrp->lifr_type == IFT_IB) return (B_TRUE); /* * Strip off the logical interface portion before getting * intimate with the name. */ (void) strlcpy(ifname, lifrp->lifr_name, LIFNAMSIZ); if ((cp = strchr(ifname, ':')) != NULL) *cp = '\0'; return (strcmp("lo0", ifname) == 0); } static int rpcib_do_ip_ioctl(int cmd, int len, void *arg) { vnode_t *kkvp, *vp; TIUSER *tiptr; struct strioctl iocb; k_sigset_t smask; int err = 0; if (lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW, NULLVPP, &kkvp) == 0) { if (t_kopen(NULL, kkvp->v_rdev, FREAD|FWRITE, &tiptr, CRED()) == 0) { vp = tiptr->fp->f_vnode; } else { VN_RELE(kkvp); return (EPROTO); } } else { return (EPROTO); } iocb.ic_cmd = cmd; iocb.ic_timout = 0; iocb.ic_len = len; iocb.ic_dp = (caddr_t)arg; sigintr(&smask, 0); err = kstr_ioctl(vp, I_STR, (intptr_t)&iocb); sigunintr(&smask); (void) t_kclose(tiptr, 0); VN_RELE(kkvp); return (err); } /* * Issue an SIOCGLIFCONF down to IP and return the result in `lifcp'. * lifcp->lifc_buf is dynamically allocated to be *bufsizep bytes. */ static int rpcib_do_lifconf(struct lifconf *lifcp, uint_t *bufsizep) { int err; struct lifnum lifn; bzero(&lifn, sizeof (struct lifnum)); lifn.lifn_family = AF_UNSPEC; err = rpcib_do_ip_ioctl(SIOCGLIFNUM, sizeof (struct lifnum), &lifn); if (err != 0) return (err); /* * Pad the interface count to account for additional interfaces that * may have been configured between the SIOCGLIFNUM and SIOCGLIFCONF. */ lifn.lifn_count += 4; bzero(lifcp, sizeof (struct lifconf)); lifcp->lifc_family = AF_UNSPEC; lifcp->lifc_len = *bufsizep = lifn.lifn_count * sizeof (struct lifreq); lifcp->lifc_buf = kmem_zalloc(*bufsizep, KM_SLEEP); err = rpcib_do_ip_ioctl(SIOCGLIFCONF, sizeof (struct lifconf), lifcp); if (err != 0) { kmem_free(lifcp->lifc_buf, *bufsizep); return (err); } return (0); } static boolean_t rpcib_get_ib_addresses(rpcib_ipaddrs_t *addrs4, rpcib_ipaddrs_t *addrs6) { uint_t i, nifs; uint_t bufsize; struct lifconf lifc; struct lifreq *lifrp; struct sockaddr_in *sinp; struct sockaddr_in6 *sin6p; bzero(addrs4, sizeof (rpcib_ipaddrs_t)); bzero(addrs6, sizeof (rpcib_ipaddrs_t)); if (rpcib_do_lifconf(&lifc, &bufsize) != 0) return (B_FALSE); if ((nifs = lifc.lifc_len / sizeof (struct lifreq)) == 0) { kmem_free(lifc.lifc_buf, bufsize); return (B_FALSE); } /* * Worst case is that all of the addresses are IB-capable and have * the same address family, so size our buffers accordingly. */ addrs4->ri_size = nifs * sizeof (struct sockaddr_in); addrs4->ri_list = kmem_zalloc(addrs4->ri_size, KM_SLEEP); addrs6->ri_size = nifs * sizeof (struct sockaddr_in6); addrs6->ri_list = kmem_zalloc(addrs6->ri_size, KM_SLEEP); for (lifrp = lifc.lifc_req, i = 0; i < nifs; i++, lifrp++) { if (!rpcib_rdma_capable_interface(lifrp)) continue; if (lifrp->lifr_addr.ss_family == AF_INET) { sinp = addrs4->ri_list; bcopy(&lifrp->lifr_addr, &sinp[addrs4->ri_count++], sizeof (struct sockaddr_in)); } else if (lifrp->lifr_addr.ss_family == AF_INET6) { sin6p = addrs6->ri_list; bcopy(&lifrp->lifr_addr, &sin6p[addrs6->ri_count++], sizeof (struct sockaddr_in6)); } } kmem_free(lifc.lifc_buf, bufsize); return (B_TRUE); } /* ARGSUSED */ static int rpcib_cache_kstat_update(kstat_t *ksp, int rw) { rib_hca_t *hca; if (KSTAT_WRITE == rw) { return (EACCES); } rpcib_kstat.cache_limit.value.ui64 = (uint64_t)cache_limit; rw_enter(&rib_stat->hcas_list_lock, RW_READER); for (hca = rib_stat->hcas_list; hca; hca = hca->next) { rpcib_kstat.cache_allocation.value.ui64 += (uint64_t)hca->cache_allocation; rpcib_kstat.cache_hits.value.ui64 += (uint64_t)hca->cache_hits; rpcib_kstat.cache_misses.value.ui64 += (uint64_t)hca->cache_misses; rpcib_kstat.cache_misses_above_the_limit.value.ui64 += (uint64_t)hca->cache_misses_above_the_limit; } rw_exit(&rib_stat->hcas_list_lock); return (0); }