/* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* * Copyright (c) 2005 SilverStorm Technologies, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ /* * Sun elects to include this software in Sun product * under the OpenIB BSD license. * * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include #include #include #include #include /* * This File contains the buffer management code */ #define DUMP_USER_PARAMS() \ RDS_DPRINTF3(LABEL, "MaxNodes = %d", MaxNodes); \ RDS_DPRINTF3(LABEL, "UserBufferSize = %d", UserBufferSize); \ RDS_DPRINTF3(LABEL, "RdsPktSize = %d", RdsPktSize); \ RDS_DPRINTF3(LABEL, "MaxDataSendBuffers = %d", MaxDataSendBuffers); \ RDS_DPRINTF3(LABEL, "MaxDataRecvBuffers = %d", MaxDataRecvBuffers); \ RDS_DPRINTF3(LABEL, "MaxCtrlSendBuffers = %d", MaxCtrlSendBuffers); \ RDS_DPRINTF3(LABEL, "MaxCtrlRecvBuffers = %d", MaxCtrlRecvBuffers); \ RDS_DPRINTF3(LABEL, "DataRecvBufferLWM = %d", DataRecvBufferLWM); \ RDS_DPRINTF3(LABEL, "PendingRxPktsHWM = %d", PendingRxPktsHWM); \ RDS_DPRINTF3(LABEL, "MinRnrRetry = %d", MinRnrRetry) static void rds_free_mblk(char *arg) { rds_buf_t *bp = (rds_buf_t *)(uintptr_t)arg; /* Free the recv buffer */ RDS_DPRINTF4("rds_free_mblk", "Enter: BP(%p)", bp); ASSERT(bp->buf_state == RDS_RCVBUF_ONSOCKQ); rds_free_recv_buf(bp, 1); RDS_DECR_RXPKTS_PEND(1); RDS_DPRINTF4("rds_free_mblk", "Return: BP(%p)", bp); } void rds_free_recv_caches(rds_state_t *statep) { rds_hca_t *hcap; int ret; RDS_DPRINTF4("rds_free_recv_caches", "Enter"); mutex_enter(&rds_dpool.pool_lock); if (rds_dpool.pool_memp == NULL) { RDS_DPRINTF2("rds_free_recv_caches", "Caches are empty"); mutex_exit(&rds_dpool.pool_lock); return; } /* * All buffers must have been freed as all sessions are closed * and destroyed */ ASSERT(rds_dpool.pool_nbusy == 0); RDS_DPRINTF2("rds_free_recv_caches", "Data Pool has " "pending buffers: %d", rds_dpool.pool_nbusy); while (rds_dpool.pool_nbusy != 0) { mutex_exit(&rds_dpool.pool_lock); delay(drv_usectohz(1000000)); mutex_enter(&rds_dpool.pool_lock); } hcap = statep->rds_hcalistp; while (hcap != NULL) { if (hcap->hca_mrhdl != NULL) { ret = ibt_deregister_mr(hcap->hca_hdl, hcap->hca_mrhdl); if (ret == IBT_SUCCESS) { hcap->hca_mrhdl = NULL; hcap->hca_lkey = 0; hcap->hca_rkey = 0; } else { RDS_DPRINTF2(LABEL, "ibt_deregister_mr " "failed: %d, mrhdl: 0x%p", ret, hcap->hca_mrhdl); } } hcap = hcap->hca_nextp; } kmem_free(rds_dpool.pool_bufmemp, (rds_dpool.pool_nbuffers + rds_cpool.pool_nbuffers) * sizeof (rds_buf_t)); rds_dpool.pool_bufmemp = NULL; kmem_free(rds_dpool.pool_memp, rds_dpool.pool_memsize); rds_dpool.pool_memp = NULL; mutex_exit(&rds_dpool.pool_lock); RDS_DPRINTF4("rds_free_recv_caches", "Return"); } int rds_init_recv_caches(rds_state_t *statep) { uint8_t *mp; rds_buf_t *bp; rds_hca_t *hcap; uint32_t nsessions; uint_t ix; uint_t nctrlrx; uint8_t *memp; uint_t memsize, nbuf; rds_buf_t *bufmemp; ibt_mr_attr_t mem_attr; ibt_mr_desc_t mem_desc; int ret; RDS_DPRINTF4("rds_init_recv_caches", "Enter"); DUMP_USER_PARAMS(); mutex_enter(&rds_dpool.pool_lock); if (rds_dpool.pool_memp != NULL) { RDS_DPRINTF2("rds_init_recv_caches", "Pools are already " "initialized"); mutex_exit(&rds_dpool.pool_lock); return (0); } /* * High water mark for the receive buffers in the system. If the * number of buffers used crosses this mark then all sockets in * would be stalled. The port quota for the sockets is set based * on this limit. */ rds_rx_pkts_pending_hwm = (PendingRxPktsHWM * NDataRX)/100; /* nsessions can never be less than 1 */ nsessions = MaxNodes - 1; nctrlrx = (nsessions + 1) * MaxCtrlRecvBuffers; RDS_DPRINTF3(LABEL, "Number of Possible Sessions: %d", nsessions); /* Add the hdr */ RdsPktSize = UserBufferSize + RDS_DATA_HDR_SZ; memsize = (NDataRX * RdsPktSize) + (nctrlrx * RDS_CTRLPKT_SIZE); nbuf = NDataRX + nctrlrx; RDS_DPRINTF3(LABEL, "RDS Buffer Pool Memory: %lld", memsize); RDS_DPRINTF3(LABEL, "Total Buffers: %d", nbuf); memp = (uint8_t *)kmem_zalloc(memsize, KM_NOSLEEP); if (memp == NULL) { RDS_DPRINTF1(LABEL, "RDS Memory allocation failed"); mutex_exit(&rds_dpool.pool_lock); return (-1); } RDS_DPRINTF3(LABEL, "RDS Buffer Entries Memory: %lld", nbuf * sizeof (rds_buf_t)); /* allocate memory for buffer entries */ bufmemp = (rds_buf_t *)kmem_zalloc(nbuf * sizeof (rds_buf_t), KM_SLEEP); /* register the memory with all HCAs */ mem_attr.mr_vaddr = (ib_vaddr_t)(uintptr_t)memp; mem_attr.mr_len = memsize; mem_attr.mr_as = NULL; mem_attr.mr_flags = IBT_MR_ENABLE_LOCAL_WRITE; rw_enter(&statep->rds_hca_lock, RW_WRITER); hcap = statep->rds_hcalistp; while (hcap != NULL) { if (hcap->hca_state != RDS_HCA_STATE_OPEN) { hcap = hcap->hca_nextp; continue; } ret = ibt_register_mr(hcap->hca_hdl, hcap->hca_pdhdl, &mem_attr, &hcap->hca_mrhdl, &mem_desc); if (ret != IBT_SUCCESS) { RDS_DPRINTF2(LABEL, "ibt_register_mr failed: %d", ret); hcap = statep->rds_hcalistp; while ((hcap) && (hcap->hca_mrhdl != NULL)) { ret = ibt_deregister_mr(hcap->hca_hdl, hcap->hca_mrhdl); if (ret == IBT_SUCCESS) { hcap->hca_mrhdl = NULL; hcap->hca_lkey = 0; hcap->hca_rkey = 0; } else { RDS_DPRINTF2(LABEL, "ibt_deregister_mr " "failed: %d, mrhdl: 0x%p", ret, hcap->hca_mrhdl); } hcap = hcap->hca_nextp; } kmem_free(bufmemp, nbuf * sizeof (rds_buf_t)); kmem_free(memp, memsize); rw_exit(&statep->rds_hca_lock); mutex_exit(&rds_dpool.pool_lock); return (-1); } hcap->hca_state = RDS_HCA_STATE_MEM_REGISTERED; hcap->hca_lkey = mem_desc.md_lkey; hcap->hca_rkey = mem_desc.md_rkey; hcap = hcap->hca_nextp; } rw_exit(&statep->rds_hca_lock); /* Initialize data pool */ rds_dpool.pool_memp = memp; rds_dpool.pool_memsize = memsize; rds_dpool.pool_bufmemp = bufmemp; rds_dpool.pool_nbuffers = NDataRX; rds_dpool.pool_nbusy = 0; rds_dpool.pool_nfree = NDataRX; /* chain the buffers */ mp = memp; bp = bufmemp; for (ix = 0; ix < NDataRX; ix++) { bp[ix].buf_nextp = &bp[ix + 1]; bp[ix].buf_ds.ds_va = (ib_vaddr_t)(uintptr_t)mp; bp[ix].buf_state = RDS_RCVBUF_FREE; bp[ix].buf_frtn.free_func = rds_free_mblk; bp[ix].buf_frtn.free_arg = (char *)&bp[ix]; mp = mp + RdsPktSize; } bp[NDataRX - 1].buf_nextp = NULL; rds_dpool.pool_headp = &bp[0]; rds_dpool.pool_tailp = &bp[NDataRX - 1]; /* Initialize ctrl pool */ rds_cpool.pool_nbuffers = nctrlrx; rds_cpool.pool_nbusy = 0; rds_cpool.pool_nfree = nctrlrx; /* chain the buffers */ for (ix = NDataRX; ix < nbuf - 1; ix++) { bp[ix].buf_nextp = &bp[ix + 1]; bp[ix].buf_ds.ds_va = (ib_vaddr_t)(uintptr_t)mp; mp = mp + RDS_CTRLPKT_SIZE; } bp[nbuf - 1].buf_ds.ds_va = (ib_vaddr_t)(uintptr_t)mp; bp[nbuf - 1].buf_nextp = NULL; rds_cpool.pool_headp = &bp[NDataRX]; rds_cpool.pool_tailp = &bp[nbuf - 1]; mutex_exit(&rds_dpool.pool_lock); RDS_DPRINTF3(LABEL, "rdsmemp start: %p end: %p", memp, mp); RDS_DPRINTF4("rds_init_recv_caches", "Return"); return (0); } rds_hca_t *rds_lkup_hca(ib_guid_t hca_guid); void rds_free_send_pool(rds_ep_t *ep) { rds_bufpool_t *pool; rds_hca_t *hcap; int ret; pool = &ep->ep_sndpool; mutex_enter(&pool->pool_lock); if (pool->pool_memp == NULL) { mutex_exit(&pool->pool_lock); RDS_DPRINTF2("rds_free_send_pool", "EP(%p) DOUBLE Free on Send Pool", ep); return; } /* get the hcap for the HCA hosting this channel */ hcap = rds_lkup_hca(ep->ep_hca_guid); if (hcap == NULL) { RDS_DPRINTF2("rds_free_send_pool", "HCA (0x%llx) not found", ep->ep_hca_guid); } else { ret = ibt_deregister_mr(hcap->hca_hdl, ep->ep_snd_mrhdl); if (ret != IBT_SUCCESS) { RDS_DPRINTF2(LABEL, "ibt_deregister_mr failed: %d, mrhdl: 0x%p", ret, ep->ep_snd_mrhdl); } if (ep->ep_ack_addr) { ret = ibt_deregister_mr(hcap->hca_hdl, ep->ep_ackhdl); if (ret != IBT_SUCCESS) { RDS_DPRINTF2(LABEL, "ibt_deregister_mr ackhdl failed: %d, " "mrhdl: 0x%p", ret, ep->ep_ackhdl); } kmem_free((void *)ep->ep_ack_addr, sizeof (uintptr_t)); ep->ep_ack_addr = NULL; } } kmem_free(pool->pool_memp, pool->pool_memsize); kmem_free(pool->pool_bufmemp, pool->pool_nbuffers * sizeof (rds_buf_t)); pool->pool_memp = NULL; pool->pool_bufmemp = NULL; mutex_exit(&pool->pool_lock); } int rds_init_send_pool(rds_ep_t *ep, ib_guid_t hca_guid) { uint8_t *mp; rds_buf_t *bp; rds_hca_t *hcap; uint_t ix, rcv_len; ibt_mr_attr_t mem_attr; ibt_mr_desc_t mem_desc; uint8_t *memp; rds_buf_t *bufmemp; uintptr_t ack_addr = NULL; uint_t memsize; uint_t nbuf; rds_bufpool_t *spool; rds_data_hdr_t *pktp; int ret; RDS_DPRINTF2("rds_init_send_pool", "Enter"); spool = &ep->ep_sndpool; ASSERT(spool->pool_memp == NULL); ASSERT(ep->ep_hca_guid == 0); /* get the hcap for the HCA hosting this channel */ hcap = rds_get_hcap(rdsib_statep, hca_guid); if (hcap == NULL) { RDS_DPRINTF2("rds_init_send_pool", "HCA (0x%llx) not found", hca_guid); return (-1); } if (ep->ep_type == RDS_EP_TYPE_DATA) { spool->pool_nbuffers = MaxDataSendBuffers; spool->pool_nbusy = 0; spool->pool_nfree = MaxDataSendBuffers; memsize = (MaxDataSendBuffers * RdsPktSize) + sizeof (uintptr_t); rcv_len = RdsPktSize; } else { spool->pool_nbuffers = MaxCtrlSendBuffers; spool->pool_nbusy = 0; spool->pool_nfree = MaxCtrlSendBuffers; memsize = MaxCtrlSendBuffers * RDS_CTRLPKT_SIZE; rcv_len = RDS_CTRLPKT_SIZE; } nbuf = spool->pool_nbuffers; RDS_DPRINTF3(LABEL, "RDS Send Pool Memory: %lld", memsize); memp = (uint8_t *)kmem_zalloc(memsize, KM_NOSLEEP); if (memp == NULL) { RDS_DPRINTF1(LABEL, "RDS Send Memory allocation failed"); return (-1); } RDS_DPRINTF3(LABEL, "RDS Buffer Entries Memory: %lld", nbuf * sizeof (rds_buf_t)); /* allocate memory for buffer entries */ bufmemp = (rds_buf_t *)kmem_zalloc(nbuf * sizeof (rds_buf_t), KM_SLEEP); if (ep->ep_type == RDS_EP_TYPE_DATA) { ack_addr = (uintptr_t)kmem_zalloc(sizeof (uintptr_t), KM_SLEEP); /* register the memory with the HCA for this channel */ mem_attr.mr_vaddr = (ib_vaddr_t)ack_addr; mem_attr.mr_len = sizeof (uintptr_t); mem_attr.mr_as = NULL; mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE | IBT_MR_ENABLE_REMOTE_WRITE; ret = ibt_register_mr(hcap->hca_hdl, hcap->hca_pdhdl, &mem_attr, &ep->ep_ackhdl, &mem_desc); if (ret != IBT_SUCCESS) { RDS_DPRINTF2("rds_init_send_pool", "EP(%p): ibt_register_mr for ack failed: %d", ep, ret); kmem_free(memp, memsize); kmem_free(bufmemp, nbuf * sizeof (rds_buf_t)); kmem_free((void *)ack_addr, sizeof (uintptr_t)); return (-1); } ep->ep_ack_rkey = mem_desc.md_rkey; ep->ep_ack_addr = ack_addr; } /* register the memory with the HCA for this channel */ mem_attr.mr_vaddr = (ib_vaddr_t)(uintptr_t)memp; mem_attr.mr_len = memsize; mem_attr.mr_as = NULL; mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE; ret = ibt_register_mr(hcap->hca_hdl, hcap->hca_pdhdl, &mem_attr, &ep->ep_snd_mrhdl, &mem_desc); if (ret != IBT_SUCCESS) { RDS_DPRINTF2("rds_init_send_pool", "EP(%p): ibt_register_mr " "failed: %d", ep, ret); kmem_free(memp, memsize); kmem_free(bufmemp, nbuf * sizeof (rds_buf_t)); if (ack_addr != NULL) kmem_free((void *)ack_addr, sizeof (uintptr_t)); return (-1); } ep->ep_snd_lkey = mem_desc.md_lkey; /* Initialize the pool */ spool->pool_memp = memp; spool->pool_memsize = memsize; spool->pool_bufmemp = bufmemp; spool->pool_sqpoll_pending = B_FALSE; /* chain the buffers and initialize them */ mp = memp; bp = bufmemp; if (ep->ep_type == RDS_EP_TYPE_DATA) { for (ix = 0; ix < nbuf - 1; ix++) { bp[ix].buf_nextp = &bp[ix + 1]; bp[ix].buf_ep = ep; bp[ix].buf_ds.ds_va = (ib_vaddr_t)(uintptr_t)mp; bp[ix].buf_ds.ds_key = ep->ep_snd_lkey; bp[ix].buf_state = RDS_SNDBUF_FREE; pktp = (rds_data_hdr_t *)(uintptr_t)mp; pktp->dh_bufid = (uintptr_t)&bp[ix]; mp = mp + rcv_len; } bp[nbuf - 1].buf_nextp = NULL; bp[nbuf - 1].buf_ep = ep; bp[nbuf - 1].buf_ds.ds_va = (ib_vaddr_t)(uintptr_t)mp; bp[nbuf - 1].buf_ds.ds_key = ep->ep_snd_lkey; bp[nbuf - 1].buf_state = RDS_SNDBUF_FREE; pktp = (rds_data_hdr_t *)(uintptr_t)mp; pktp->dh_bufid = (uintptr_t)&bp[nbuf - 1]; spool->pool_headp = &bp[0]; spool->pool_tailp = &bp[nbuf - 1]; mp = mp + rcv_len; ep->ep_ackds.ds_va = (ib_vaddr_t)(uintptr_t)mp; ep->ep_ackds.ds_key = ep->ep_snd_lkey; ep->ep_ackds.ds_len = sizeof (uintptr_t); *(uintptr_t *)ep->ep_ack_addr = (uintptr_t)spool->pool_tailp; } else { /* control send pool */ for (ix = 0; ix < nbuf - 1; ix++) { bp[ix].buf_nextp = &bp[ix + 1]; bp[ix].buf_ep = ep; bp[ix].buf_ds.ds_va = (ib_vaddr_t)(uintptr_t)mp; bp[ix].buf_ds.ds_key = ep->ep_snd_lkey; bp[ix].buf_state = RDS_SNDBUF_FREE; mp = mp + rcv_len; } bp[nbuf - 1].buf_nextp = NULL; bp[nbuf - 1].buf_ep = ep; bp[nbuf - 1].buf_ds.ds_va = (ib_vaddr_t)(uintptr_t)mp; bp[nbuf - 1].buf_ds.ds_key = ep->ep_snd_lkey; bp[nbuf - 1].buf_state = RDS_SNDBUF_FREE; spool->pool_headp = &bp[0]; spool->pool_tailp = &bp[nbuf - 1]; } RDS_DPRINTF3(LABEL, "rdsmemp start: %p end: %p", memp, mp); RDS_DPRINTF2("rds_init_send_pool", "Return"); return (0); } int rds_reinit_send_pool(rds_ep_t *ep, ib_guid_t hca_guid) { rds_buf_t *bp; rds_hca_t *hcap; ibt_mr_attr_t mem_attr; ibt_mr_desc_t mem_desc; rds_bufpool_t *spool; int ret; RDS_DPRINTF2("rds_reinit_send_pool", "Enter: EP(%p)", ep); spool = &ep->ep_sndpool; ASSERT(spool->pool_memp != NULL); /* deregister the send pool memory from the previous HCA */ hcap = rds_get_hcap(rdsib_statep, ep->ep_hca_guid); if (hcap == NULL) { RDS_DPRINTF2("rds_reinit_send_pool", "HCA (0x%llx) not found", ep->ep_hca_guid); } else { if (ep->ep_snd_mrhdl != NULL) { (void) ibt_deregister_mr(hcap->hca_hdl, ep->ep_snd_mrhdl); ep->ep_snd_mrhdl = NULL; ep->ep_snd_lkey = 0; } if ((ep->ep_type == RDS_EP_TYPE_DATA) && (ep->ep_ackhdl != NULL)) { (void) ibt_deregister_mr(hcap->hca_hdl, ep->ep_ackhdl); ep->ep_ackhdl = NULL; ep->ep_ack_rkey = 0; } ep->ep_hca_guid = NULL; } /* get the hcap for the new HCA */ hcap = rds_get_hcap(rdsib_statep, hca_guid); if (hcap == NULL) { RDS_DPRINTF2("rds_reinit_send_pool", "HCA (0x%llx) not found", hca_guid); return (-1); } /* register the send memory */ mem_attr.mr_vaddr = (ib_vaddr_t)(uintptr_t)spool->pool_memp; mem_attr.mr_len = spool->pool_memsize; mem_attr.mr_as = NULL; mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE; ret = ibt_register_mr(hcap->hca_hdl, hcap->hca_pdhdl, &mem_attr, &ep->ep_snd_mrhdl, &mem_desc); if (ret != IBT_SUCCESS) { RDS_DPRINTF2("rds_reinit_send_pool", "EP(%p): ibt_register_mr failed: %d", ep, ret); return (-1); } ep->ep_snd_lkey = mem_desc.md_lkey; /* register the acknowledgement space */ if (ep->ep_type == RDS_EP_TYPE_DATA) { mem_attr.mr_vaddr = (ib_vaddr_t)ep->ep_ack_addr; mem_attr.mr_len = sizeof (uintptr_t); mem_attr.mr_as = NULL; mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE | IBT_MR_ENABLE_REMOTE_WRITE; ret = ibt_register_mr(hcap->hca_hdl, hcap->hca_pdhdl, &mem_attr, &ep->ep_ackhdl, &mem_desc); if (ret != IBT_SUCCESS) { RDS_DPRINTF2("rds_reinit_send_pool", "EP(%p): ibt_register_mr for ack failed: %d", ep, ret); (void) ibt_deregister_mr(hcap->hca_hdl, ep->ep_snd_mrhdl); ep->ep_snd_mrhdl = NULL; ep->ep_snd_lkey = 0; return (-1); } ep->ep_ack_rkey = mem_desc.md_rkey; /* update the LKEY in the acknowledgement WR */ ep->ep_ackds.ds_key = ep->ep_snd_lkey; } /* update the LKEY in each buffer */ bp = spool->pool_headp; while (bp) { bp->buf_ds.ds_key = ep->ep_snd_lkey; bp = bp->buf_nextp; } ep->ep_hca_guid = hca_guid; RDS_DPRINTF2("rds_reinit_send_pool", "Return: EP(%p)", ep); return (0); } void rds_free_recv_pool(rds_ep_t *ep) { rds_bufpool_t *pool; if (ep->ep_type == RDS_EP_TYPE_DATA) { pool = &rds_dpool; } else { pool = &rds_cpool; } mutex_enter(&ep->ep_rcvpool.pool_lock); if (ep->ep_rcvpool.pool_nfree != 0) { rds_free_buf(pool, ep->ep_rcvpool.pool_headp, ep->ep_rcvpool.pool_nfree); ep->ep_rcvpool.pool_nfree = 0; ep->ep_rcvpool.pool_headp = NULL; ep->ep_rcvpool.pool_tailp = NULL; } mutex_exit(&ep->ep_rcvpool.pool_lock); } int rds_init_recv_pool(rds_ep_t *ep) { rds_bufpool_t *rpool; rds_qp_t *recvqp; recvqp = &ep->ep_recvqp; rpool = &ep->ep_rcvpool; if (ep->ep_type == RDS_EP_TYPE_DATA) { recvqp->qp_depth = MaxDataRecvBuffers; recvqp->qp_level = 0; recvqp->qp_lwm = (DataRecvBufferLWM * MaxDataRecvBuffers)/100; recvqp->qp_taskqpending = B_FALSE; rpool->pool_nbuffers = MaxDataRecvBuffers; rpool->pool_nbusy = 0; rpool->pool_nfree = 0; } else { recvqp->qp_depth = MaxCtrlRecvBuffers; recvqp->qp_level = 0; recvqp->qp_lwm = (CtrlRecvBufferLWM * MaxCtrlRecvBuffers)/100; recvqp->qp_taskqpending = B_FALSE; rpool->pool_nbuffers = MaxCtrlRecvBuffers; rpool->pool_nbusy = 0; rpool->pool_nfree = 0; } return (0); } /* Free buffers to the global pool, either cpool or dpool */ void rds_free_buf(rds_bufpool_t *pool, rds_buf_t *bp, uint_t nbuf) { uint_t ix; RDS_DPRINTF4("rds_free_buf", "Enter"); ASSERT(nbuf != 0); mutex_enter(&pool->pool_lock); if (pool->pool_nfree != 0) { pool->pool_tailp->buf_nextp = bp; } else { pool->pool_headp = bp; } if (nbuf == 1) { ASSERT(bp->buf_state == RDS_RCVBUF_FREE); bp->buf_ep = NULL; bp->buf_nextp = NULL; pool->pool_tailp = bp; } else { for (ix = 1; ix < nbuf; ix++) { ASSERT(bp->buf_state == RDS_RCVBUF_FREE); bp->buf_ep = NULL; bp = bp->buf_nextp; } ASSERT(bp->buf_state == RDS_RCVBUF_FREE); bp->buf_ep = NULL; bp->buf_nextp = NULL; pool->pool_tailp = bp; } /* tail is always the last buffer */ pool->pool_tailp->buf_nextp = NULL; pool->pool_nfree += nbuf; pool->pool_nbusy -= nbuf; mutex_exit(&pool->pool_lock); RDS_DPRINTF4("rds_free_buf", "Return"); } /* Get buffers from the global pools, either cpool or dpool */ rds_buf_t * rds_get_buf(rds_bufpool_t *pool, uint_t nbuf, uint_t *nret) { rds_buf_t *bp = NULL, *bp1; uint_t ix; RDS_DPRINTF4("rds_get_buf", "Enter"); mutex_enter(&pool->pool_lock); RDS_DPRINTF3("rds_get_buf", "Available: %d Needed: %d", pool->pool_nfree, nbuf); if (nbuf < pool->pool_nfree) { *nret = nbuf; bp1 = pool->pool_headp; for (ix = 1; ix < nbuf; ix++) { bp1 = bp1->buf_nextp; } bp = pool->pool_headp; pool->pool_headp = bp1->buf_nextp; bp1->buf_nextp = NULL; pool->pool_nfree -= nbuf; pool->pool_nbusy += nbuf; } else if (nbuf >= pool->pool_nfree) { *nret = pool->pool_nfree; bp = pool->pool_headp; pool->pool_headp = NULL; pool->pool_tailp = NULL; pool->pool_nbusy += pool->pool_nfree; pool->pool_nfree = 0; } mutex_exit(&pool->pool_lock); RDS_DPRINTF4("rds_get_buf", "Return"); return (bp); } boolean_t rds_is_recvq_empty(rds_ep_t *ep, boolean_t wait) { rds_qp_t *recvqp; rds_bufpool_t *rpool; boolean_t ret = B_TRUE; recvqp = &ep->ep_recvqp; mutex_enter(&recvqp->qp_lock); RDS_DPRINTF2("rds_is_recvq_empty", "EP(%p): QP has %d WRs", ep, recvqp->qp_level); if (wait) { /* wait until the RQ is empty */ while (recvqp->qp_level != 0) { /* wait one second and try again */ mutex_exit(&recvqp->qp_lock); delay(drv_usectohz(1000000)); mutex_enter(&recvqp->qp_lock); } } else if (recvqp->qp_level != 0) { ret = B_FALSE; } mutex_exit(&recvqp->qp_lock); rpool = &ep->ep_rcvpool; mutex_enter(&rpool->pool_lock); RDS_DPRINTF2("rds_is_recvq_empty", "EP(%p): " "There are %d pending buffers on sockqs", ep, rpool->pool_nbusy); if (wait) { /* Wait for all buffers to be freed by sockfs */ while (rpool->pool_nbusy != 0) { /* wait one second and try again */ mutex_exit(&rpool->pool_lock); delay(drv_usectohz(1000000)); mutex_enter(&rpool->pool_lock); } } else if (rpool->pool_nbusy != 0) { ret = B_FALSE; } mutex_exit(&rpool->pool_lock); return (ret); } boolean_t rds_is_sendq_empty(rds_ep_t *ep, uint_t wait) { rds_bufpool_t *spool; rds_buf_t *bp; boolean_t ret1 = B_TRUE; /* check if all the sends completed */ spool = &ep->ep_sndpool; mutex_enter(&spool->pool_lock); RDS_DPRINTF2("rds_is_sendq_empty", "EP(%p): " "Send Pool contains: %d", ep, spool->pool_nbusy); if (wait) { while (spool->pool_nbusy != 0) { if (rds_no_interrupts) { /* wait one second and try again */ delay(drv_usectohz(1000000)); rds_poll_send_completions(ep->ep_sendcq, ep, B_TRUE); } else { /* wait one second and try again */ mutex_exit(&spool->pool_lock); delay(drv_usectohz(1000000)); mutex_enter(&spool->pool_lock); } } if ((wait == 2) && (ep->ep_type == RDS_EP_TYPE_DATA)) { rds_buf_t *ackbp; rds_buf_t *prev_ackbp; /* * If the last one is acknowledged then everything * is acknowledged */ bp = spool->pool_tailp; ackbp = *(rds_buf_t **)ep->ep_ack_addr; prev_ackbp = ackbp; RDS_DPRINTF2("rds_is_sendq_empty", "EP(%p): " "Checking for acknowledgements", ep); while (bp != ackbp) { RDS_DPRINTF2("rds_is_sendq_empty", "EP(%p) BP(0x%p/0x%p) last " "sent/acknowledged", ep, bp, ackbp); mutex_exit(&spool->pool_lock); delay(drv_usectohz(1000000)); mutex_enter(&spool->pool_lock); bp = spool->pool_tailp; ackbp = *(rds_buf_t **)ep->ep_ack_addr; if (ackbp == prev_ackbp) { RDS_DPRINTF2("rds_is_sendq_empty", "There has been no progress," "give up and proceed"); break; } prev_ackbp = ackbp; } } } else if (spool->pool_nbusy != 0) { ret1 = B_FALSE; } mutex_exit(&spool->pool_lock); /* check if all the rdma acks completed */ mutex_enter(&ep->ep_lock); RDS_DPRINTF2("rds_is_sendq_empty", "EP(%p): " "Outstanding RDMA Acks: %d", ep, ep->ep_rdmacnt); if (wait) { while (ep->ep_rdmacnt != 0) { if (rds_no_interrupts) { /* wait one second and try again */ delay(drv_usectohz(1000000)); rds_poll_send_completions(ep->ep_sendcq, ep, B_FALSE); } else { /* wait one second and try again */ mutex_exit(&ep->ep_lock); delay(drv_usectohz(1000000)); mutex_enter(&ep->ep_lock); } } } else if (ep->ep_rdmacnt != 0) { ret1 = B_FALSE; } mutex_exit(&ep->ep_lock); return (ret1); } /* Get buffers from the send pool */ rds_buf_t * rds_get_send_buf(rds_ep_t *ep, uint_t nbuf) { rds_buf_t *bp = NULL, *bp1; rds_bufpool_t *spool; uint_t waittime = rds_waittime_ms * 1000; uint_t ix; int ret; RDS_DPRINTF4("rds_get_send_buf", "Enter: EP(%p) Buffers requested: %d", ep, nbuf); spool = &ep->ep_sndpool; mutex_enter(&spool->pool_lock); if (rds_no_interrupts) { if ((spool->pool_sqpoll_pending == B_FALSE) && (spool->pool_nbusy > (spool->pool_nbuffers * rds_poll_percent_full)/100)) { spool->pool_sqpoll_pending = B_TRUE; mutex_exit(&spool->pool_lock); rds_poll_send_completions(ep->ep_sendcq, ep, B_FALSE); mutex_enter(&spool->pool_lock); spool->pool_sqpoll_pending = B_FALSE; } } if (spool->pool_nfree < nbuf) { /* wait for buffers to become available */ spool->pool_cv_count += nbuf; ret = cv_timedwait_sig(&spool->pool_cv, &spool->pool_lock, ddi_get_lbolt() + drv_usectohz(waittime)); /* ret = cv_wait_sig(&spool->pool_cv, &spool->pool_lock); */ if (ret == 0) { /* signal pending */ spool->pool_cv_count -= nbuf; mutex_exit(&spool->pool_lock); return (NULL); } spool->pool_cv_count -= nbuf; } /* Have the number of buffers needed */ if (spool->pool_nfree > nbuf) { bp = spool->pool_headp; if (ep->ep_type == RDS_EP_TYPE_DATA) { rds_buf_t *ackbp; ackbp = *(rds_buf_t **)ep->ep_ack_addr; /* check if all the needed buffers are acknowledged */ bp1 = bp; for (ix = 0; ix < nbuf; ix++) { if ((bp1 == ackbp) || (bp1->buf_state != RDS_SNDBUF_FREE)) { /* * The buffer is not yet signalled or * is not yet acknowledged */ RDS_DPRINTF5("rds_get_send_buf", "EP(%p) Buffer (%p) not yet " "acked/completed", ep, bp1); mutex_exit(&spool->pool_lock); return (NULL); } bp1 = bp1->buf_nextp; } } /* mark the buffers as pending */ bp1 = bp; for (ix = 1; ix < nbuf; ix++) { ASSERT(bp1->buf_state == RDS_SNDBUF_FREE); bp1->buf_state = RDS_SNDBUF_PENDING; bp1 = bp1->buf_nextp; } ASSERT(bp1->buf_state == RDS_SNDBUF_FREE); bp1->buf_state = RDS_SNDBUF_PENDING; spool->pool_headp = bp1->buf_nextp; bp1->buf_nextp = NULL; if (spool->pool_headp == NULL) spool->pool_tailp = NULL; spool->pool_nfree -= nbuf; spool->pool_nbusy += nbuf; } mutex_exit(&spool->pool_lock); RDS_DPRINTF4("rds_get_send_buf", "Return: EP(%p) Buffers requested: %d", ep, nbuf); return (bp); } #define RDS_MIN_BUF_TO_WAKE_THREADS 10 void rds_free_send_buf(rds_ep_t *ep, rds_buf_t *headp, rds_buf_t *tailp, uint_t nbuf, boolean_t lock) { rds_bufpool_t *spool; rds_buf_t *tmp; RDS_DPRINTF4("rds_free_send_buf", "Enter"); ASSERT(nbuf != 0); if (tailp == NULL) { if (nbuf > 1) { tmp = headp; while (tmp->buf_nextp) { tmp = tmp->buf_nextp; } tailp = tmp; } else { tailp = headp; } } spool = &ep->ep_sndpool; if (lock == B_FALSE) { /* lock is not held outside */ mutex_enter(&spool->pool_lock); } if (spool->pool_nfree) { spool->pool_tailp->buf_nextp = headp; } else { spool->pool_headp = headp; } spool->pool_tailp = tailp; spool->pool_nfree += nbuf; spool->pool_nbusy -= nbuf; if ((spool->pool_cv_count > 0) && (spool->pool_nfree > RDS_MIN_BUF_TO_WAKE_THREADS)) { if (spool->pool_nfree >= spool->pool_cv_count) cv_broadcast(&spool->pool_cv); else cv_signal(&spool->pool_cv); } if (lock == B_FALSE) { mutex_exit(&spool->pool_lock); } RDS_DPRINTF4("rds_free_send_buf", "Return"); } #define RDS_NBUFFERS_TO_PUTBACK 100 void rds_free_recv_buf(rds_buf_t *bp, uint_t nbuf) { rds_ep_t *ep; rds_bufpool_t *rpool; rds_buf_t *bp1; uint_t ix; RDS_DPRINTF4("rds_free_recv_buf", "Enter"); ASSERT(nbuf != 0); ep = bp->buf_ep; rpool = &ep->ep_rcvpool; mutex_enter(&rpool->pool_lock); /* Add the buffers to the local pool */ if (rpool->pool_tailp == NULL) { ASSERT(rpool->pool_headp == NULL); ASSERT(rpool->pool_nfree == 0); rpool->pool_headp = bp; bp1 = bp; for (ix = 1; ix < nbuf; ix++) { if (bp1->buf_state == RDS_RCVBUF_ONSOCKQ) { rpool->pool_nbusy--; } bp1->buf_state = RDS_RCVBUF_FREE; bp1 = bp1->buf_nextp; } bp1->buf_nextp = NULL; if (bp->buf_state == RDS_RCVBUF_ONSOCKQ) { rpool->pool_nbusy--; } bp->buf_state = RDS_RCVBUF_FREE; rpool->pool_tailp = bp1; rpool->pool_nfree += nbuf; } else { bp1 = bp; for (ix = 1; ix < nbuf; ix++) { if (bp1->buf_state == RDS_RCVBUF_ONSOCKQ) { rpool->pool_nbusy--; } bp1->buf_state = RDS_RCVBUF_FREE; bp1 = bp1->buf_nextp; } bp1->buf_nextp = NULL; if (bp->buf_state == RDS_RCVBUF_ONSOCKQ) { rpool->pool_nbusy--; } bp->buf_state = RDS_RCVBUF_FREE; rpool->pool_tailp->buf_nextp = bp; rpool->pool_tailp = bp1; rpool->pool_nfree += nbuf; } if (rpool->pool_nfree >= RDS_NBUFFERS_TO_PUTBACK) { bp = rpool->pool_headp; nbuf = rpool->pool_nfree; rpool->pool_headp = NULL; rpool->pool_tailp = NULL; rpool->pool_nfree = 0; mutex_exit(&rpool->pool_lock); /* Free the buffers to the global pool */ if (ep->ep_type == RDS_EP_TYPE_DATA) { rds_free_buf(&rds_dpool, bp, nbuf); } else { rds_free_buf(&rds_cpool, bp, nbuf); } return; } mutex_exit(&rpool->pool_lock); RDS_DPRINTF4("rds_free_recv_buf", "Return"); }