1c0dd49bdSEiji Ota /* 2c0dd49bdSEiji Ota * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 3c0dd49bdSEiji Ota */ 4c0dd49bdSEiji Ota 5c0dd49bdSEiji Ota /* 6*16e76cddSagiri * This file contains code imported from the OFED rds source file ib_send.c 7*16e76cddSagiri * Oracle elects to have and use the contents of ib_send.c under and governed 8*16e76cddSagiri * by the OpenIB.org BSD license (see below for full license text). However, 9*16e76cddSagiri * the following notice accompanied the original version of this file: 10*16e76cddSagiri */ 11*16e76cddSagiri 12*16e76cddSagiri /* 13c0dd49bdSEiji Ota * Copyright (c) 2006 Oracle. All rights reserved. 14c0dd49bdSEiji Ota * 15c0dd49bdSEiji Ota * This software is available to you under a choice of one of two 16c0dd49bdSEiji Ota * licenses. You may choose to be licensed under the terms of the GNU 17c0dd49bdSEiji Ota * General Public License (GPL) Version 2, available from the file 18c0dd49bdSEiji Ota * COPYING in the main directory of this source tree, or the 19c0dd49bdSEiji Ota * OpenIB.org BSD license below: 20c0dd49bdSEiji Ota * 21c0dd49bdSEiji Ota * Redistribution and use in source and binary forms, with or 22c0dd49bdSEiji Ota * without modification, are permitted provided that the following 23c0dd49bdSEiji Ota * conditions are met: 24c0dd49bdSEiji Ota * 25c0dd49bdSEiji Ota * - Redistributions of source code must retain the above 26c0dd49bdSEiji Ota * copyright notice, this list of conditions and the following 27c0dd49bdSEiji Ota * disclaimer. 28c0dd49bdSEiji Ota * 29c0dd49bdSEiji Ota * - Redistributions in binary form must reproduce the above 30c0dd49bdSEiji Ota * copyright notice, this list of conditions and the following 31c0dd49bdSEiji Ota * disclaimer in the documentation and/or other materials 32c0dd49bdSEiji Ota * provided with the distribution. 33c0dd49bdSEiji Ota * 34c0dd49bdSEiji Ota * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 35c0dd49bdSEiji Ota * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 36c0dd49bdSEiji Ota * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 37c0dd49bdSEiji Ota * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 38c0dd49bdSEiji Ota * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 39c0dd49bdSEiji Ota * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 40c0dd49bdSEiji Ota * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 41c0dd49bdSEiji Ota * SOFTWARE. 42c0dd49bdSEiji Ota * 43c0dd49bdSEiji Ota */ 44c0dd49bdSEiji Ota #include <sys/rds.h> 45c0dd49bdSEiji Ota 46c0dd49bdSEiji Ota #include <sys/ib/clients/rdsv3/rdsv3.h> 47c0dd49bdSEiji Ota #include <sys/ib/clients/rdsv3/rdma.h> 48c0dd49bdSEiji Ota #include <sys/ib/clients/rdsv3/ib.h> 49c0dd49bdSEiji Ota #include <sys/ib/clients/rdsv3/rdsv3_debug.h> 50c0dd49bdSEiji Ota 51c0dd49bdSEiji Ota static void 52c0dd49bdSEiji Ota rdsv3_ib_send_rdma_complete(struct rdsv3_message *rm, 53c0dd49bdSEiji Ota int wc_status) 54c0dd49bdSEiji Ota { 55c0dd49bdSEiji Ota int notify_status; 56c0dd49bdSEiji Ota 57c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_ib_send_rdma_complete", "rm: %p, wc_status: %d", 58c0dd49bdSEiji Ota rm, wc_status); 59c0dd49bdSEiji Ota 60c0dd49bdSEiji Ota switch (wc_status) { 61c0dd49bdSEiji Ota case IBT_WC_WR_FLUSHED_ERR: 62c0dd49bdSEiji Ota return; 63c0dd49bdSEiji Ota 64c0dd49bdSEiji Ota case IBT_WC_SUCCESS: 65fe817b60SEiji Ota notify_status = RDS_RDMA_SUCCESS; 66c0dd49bdSEiji Ota break; 67c0dd49bdSEiji Ota 68c0dd49bdSEiji Ota case IBT_WC_REMOTE_ACCESS_ERR: 69fe817b60SEiji Ota notify_status = RDS_RDMA_REMOTE_ERROR; 70c0dd49bdSEiji Ota break; 71c0dd49bdSEiji Ota 72c0dd49bdSEiji Ota default: 73fe817b60SEiji Ota notify_status = RDS_RDMA_OTHER_ERROR; 74c0dd49bdSEiji Ota break; 75c0dd49bdSEiji Ota } 76c0dd49bdSEiji Ota rdsv3_rdma_send_complete(rm, notify_status); 77c0dd49bdSEiji Ota 78c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_ib_send_rdma_complete", "rm: %p, wc_status: %d", 79c0dd49bdSEiji Ota rm, wc_status); 80c0dd49bdSEiji Ota } 81c0dd49bdSEiji Ota 82c0dd49bdSEiji Ota static void rdsv3_ib_dma_unmap_sg_rdma(struct ib_device *dev, 83c0dd49bdSEiji Ota uint_t num, struct rdsv3_rdma_sg scat[]); 84c0dd49bdSEiji Ota 85c0dd49bdSEiji Ota void 86c0dd49bdSEiji Ota rdsv3_ib_send_unmap_rdma(struct rdsv3_ib_connection *ic, 87c0dd49bdSEiji Ota struct rdsv3_rdma_op *op) 88c0dd49bdSEiji Ota { 89c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_ib_send_unmap_rdma", "ic: %p, op: %p", ic, op); 90c0dd49bdSEiji Ota if (op->r_mapped) { 91c0dd49bdSEiji Ota op->r_mapped = 0; 92c0dd49bdSEiji Ota if (ic->i_cm_id) { 93c0dd49bdSEiji Ota rdsv3_ib_dma_unmap_sg_rdma(ic->i_cm_id->device, 94c0dd49bdSEiji Ota op->r_nents, op->r_rdma_sg); 95c0dd49bdSEiji Ota } else { 96c0dd49bdSEiji Ota rdsv3_ib_dma_unmap_sg_rdma((struct ib_device *)NULL, 97c0dd49bdSEiji Ota op->r_nents, op->r_rdma_sg); 98c0dd49bdSEiji Ota } 99c0dd49bdSEiji Ota } 100c0dd49bdSEiji Ota } 101c0dd49bdSEiji Ota 102c0dd49bdSEiji Ota static void 103c0dd49bdSEiji Ota rdsv3_ib_send_unmap_rm(struct rdsv3_ib_connection *ic, 104c0dd49bdSEiji Ota struct rdsv3_ib_send_work *send, 105c0dd49bdSEiji Ota int wc_status) 106c0dd49bdSEiji Ota { 107c0dd49bdSEiji Ota struct rdsv3_message *rm = send->s_rm; 108c0dd49bdSEiji Ota 109c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_ib_send_unmap_rm", "ic %p send %p rm %p\n", 110c0dd49bdSEiji Ota ic, send, rm); 111c0dd49bdSEiji Ota 1125d5562f5SEiji Ota mutex_enter(&rm->m_rs_lock); 1135d5562f5SEiji Ota if (rm->m_count) { 114c0dd49bdSEiji Ota rdsv3_ib_dma_unmap_sg(ic->i_cm_id->device, 1155d5562f5SEiji Ota rm->m_sg, rm->m_count); 1165d5562f5SEiji Ota rm->m_count = 0; 1175d5562f5SEiji Ota } 1185d5562f5SEiji Ota mutex_exit(&rm->m_rs_lock); 119c0dd49bdSEiji Ota 120c0dd49bdSEiji Ota if (rm->m_rdma_op != NULL) { 121c0dd49bdSEiji Ota rdsv3_ib_send_unmap_rdma(ic, rm->m_rdma_op); 122c0dd49bdSEiji Ota 123c0dd49bdSEiji Ota /* 124c0dd49bdSEiji Ota * If the user asked for a completion notification on this 125c0dd49bdSEiji Ota * message, we can implement three different semantics: 126c0dd49bdSEiji Ota * 1. Notify when we received the ACK on the RDS message 127c0dd49bdSEiji Ota * that was queued with the RDMA. This provides reliable 128c0dd49bdSEiji Ota * notification of RDMA status at the expense of a one-way 129c0dd49bdSEiji Ota * packet delay. 130c0dd49bdSEiji Ota * 2. Notify when the IB stack gives us the completion 131c0dd49bdSEiji Ota * event for the RDMA operation. 132c0dd49bdSEiji Ota * 3. Notify when the IB stack gives us the completion 133c0dd49bdSEiji Ota * event for the accompanying RDS messages. 134c0dd49bdSEiji Ota * Here, we implement approach #3. To implement approach #2, 135c0dd49bdSEiji Ota * call rdsv3_rdma_send_complete from the cq_handler. 136c0dd49bdSEiji Ota * To implement #1, 137c0dd49bdSEiji Ota * don't call rdsv3_rdma_send_complete at all, and fall back to 138c0dd49bdSEiji Ota * the notify 139c0dd49bdSEiji Ota * handling in the ACK processing code. 140c0dd49bdSEiji Ota * 141c0dd49bdSEiji Ota * Note: There's no need to explicitly sync any RDMA buffers 142c0dd49bdSEiji Ota * using 143c0dd49bdSEiji Ota * ib_dma_sync_sg_for_cpu - the completion for the RDMA 144c0dd49bdSEiji Ota * operation itself unmapped the RDMA buffers, which takes care 145c0dd49bdSEiji Ota * of synching. 146c0dd49bdSEiji Ota */ 147c0dd49bdSEiji Ota rdsv3_ib_send_rdma_complete(rm, wc_status); 148c0dd49bdSEiji Ota 149c0dd49bdSEiji Ota if (rm->m_rdma_op->r_write) 150c0dd49bdSEiji Ota rdsv3_stats_add(s_send_rdma_bytes, 151c0dd49bdSEiji Ota rm->m_rdma_op->r_bytes); 152c0dd49bdSEiji Ota else 153c0dd49bdSEiji Ota rdsv3_stats_add(s_recv_rdma_bytes, 154c0dd49bdSEiji Ota rm->m_rdma_op->r_bytes); 155c0dd49bdSEiji Ota } 156c0dd49bdSEiji Ota 157c0dd49bdSEiji Ota /* 158c0dd49bdSEiji Ota * If anyone waited for this message to get flushed out, wake 159c0dd49bdSEiji Ota * them up now 160c0dd49bdSEiji Ota */ 161c0dd49bdSEiji Ota rdsv3_message_unmapped(rm); 162c0dd49bdSEiji Ota 163c0dd49bdSEiji Ota rdsv3_message_put(rm); 164c0dd49bdSEiji Ota send->s_rm = NULL; 165c0dd49bdSEiji Ota } 166c0dd49bdSEiji Ota 167c0dd49bdSEiji Ota void 168c0dd49bdSEiji Ota rdsv3_ib_send_init_ring(struct rdsv3_ib_connection *ic) 169c0dd49bdSEiji Ota { 170c0dd49bdSEiji Ota struct rdsv3_ib_send_work *send; 171c0dd49bdSEiji Ota uint32_t i; 172c0dd49bdSEiji Ota 173c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_ib_send_init_ring", "ic: %p", ic); 174c0dd49bdSEiji Ota 175c0dd49bdSEiji Ota for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) { 176c0dd49bdSEiji Ota send->s_rm = NULL; 177c0dd49bdSEiji Ota send->s_op = NULL; 178c0dd49bdSEiji Ota } 179c0dd49bdSEiji Ota } 180c0dd49bdSEiji Ota 181c0dd49bdSEiji Ota void 182c0dd49bdSEiji Ota rdsv3_ib_send_clear_ring(struct rdsv3_ib_connection *ic) 183c0dd49bdSEiji Ota { 184c0dd49bdSEiji Ota struct rdsv3_ib_send_work *send; 185c0dd49bdSEiji Ota uint32_t i; 186c0dd49bdSEiji Ota 187c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_ib_send_clear_ring", "ic: %p", ic); 188c0dd49bdSEiji Ota 189c0dd49bdSEiji Ota for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) { 190c0dd49bdSEiji Ota if (send->s_opcode == 0xdd) 191c0dd49bdSEiji Ota continue; 192c0dd49bdSEiji Ota if (send->s_rm) 193c0dd49bdSEiji Ota rdsv3_ib_send_unmap_rm(ic, send, IBT_WC_WR_FLUSHED_ERR); 194c0dd49bdSEiji Ota if (send->s_op) 195c0dd49bdSEiji Ota rdsv3_ib_send_unmap_rdma(ic, send->s_op); 196c0dd49bdSEiji Ota } 197c0dd49bdSEiji Ota 198c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_ib_send_clear_ring", "Return: ic: %p", ic); 199c0dd49bdSEiji Ota } 200c0dd49bdSEiji Ota 201c0dd49bdSEiji Ota /* 202c0dd49bdSEiji Ota * The _oldest/_free ring operations here race cleanly with the alloc/unalloc 203c0dd49bdSEiji Ota * operations performed in the send path. As the sender allocs and potentially 204c0dd49bdSEiji Ota * unallocs the next free entry in the ring it doesn't alter which is 205c0dd49bdSEiji Ota * the next to be freed, which is what this is concerned with. 206c0dd49bdSEiji Ota */ 207c0dd49bdSEiji Ota void 2085d5562f5SEiji Ota rdsv3_ib_send_cqe_handler(struct rdsv3_ib_connection *ic, ibt_wc_t *wc) 209c0dd49bdSEiji Ota { 2105d5562f5SEiji Ota struct rdsv3_connection *conn = ic->conn; 211c0dd49bdSEiji Ota struct rdsv3_ib_send_work *send; 212c0dd49bdSEiji Ota uint32_t completed, polled; 213c0dd49bdSEiji Ota uint32_t oldest; 214c0dd49bdSEiji Ota uint32_t i = 0; 215c0dd49bdSEiji Ota int ret; 216c0dd49bdSEiji Ota 2175d5562f5SEiji Ota RDSV3_DPRINTF4("rdsv3_ib_send_cqe_handler", 2185d5562f5SEiji Ota "wc wc_id 0x%llx status %u byte_len %u imm_data %u\n", 2195d5562f5SEiji Ota (unsigned long long)wc->wc_id, wc->wc_status, 2205d5562f5SEiji Ota wc->wc_bytes_xfer, ntohl(wc->wc_immed_data)); 221c0dd49bdSEiji Ota 222c0dd49bdSEiji Ota rdsv3_ib_stats_inc(s_ib_tx_cq_event); 223c0dd49bdSEiji Ota 2245d5562f5SEiji Ota if (wc->wc_id == RDSV3_IB_ACK_WR_ID) { 225c0dd49bdSEiji Ota if (ic->i_ack_queued + HZ/2 < jiffies) 226c0dd49bdSEiji Ota rdsv3_ib_stats_inc(s_ib_tx_stalled); 227c0dd49bdSEiji Ota rdsv3_ib_ack_send_complete(ic); 2285d5562f5SEiji Ota return; 229c0dd49bdSEiji Ota } 230c0dd49bdSEiji Ota 231c0dd49bdSEiji Ota oldest = rdsv3_ib_ring_oldest(&ic->i_send_ring); 232c0dd49bdSEiji Ota 233c0dd49bdSEiji Ota completed = rdsv3_ib_ring_completed(&ic->i_send_ring, 2345d5562f5SEiji Ota (wc->wc_id & ~RDSV3_IB_SEND_OP), oldest); 235c0dd49bdSEiji Ota 236c0dd49bdSEiji Ota for (i = 0; i < completed; i++) { 237c0dd49bdSEiji Ota send = &ic->i_sends[oldest]; 238c0dd49bdSEiji Ota 239c0dd49bdSEiji Ota /* 2405d5562f5SEiji Ota * In the error case, wc->opcode sometimes contains 241c0dd49bdSEiji Ota * garbage 242c0dd49bdSEiji Ota */ 243c0dd49bdSEiji Ota switch (send->s_opcode) { 244c0dd49bdSEiji Ota case IBT_WRC_SEND: 245c0dd49bdSEiji Ota if (send->s_rm) 246c0dd49bdSEiji Ota rdsv3_ib_send_unmap_rm(ic, send, 2475d5562f5SEiji Ota wc->wc_status); 248c0dd49bdSEiji Ota break; 249c0dd49bdSEiji Ota case IBT_WRC_RDMAW: 250c0dd49bdSEiji Ota case IBT_WRC_RDMAR: 251c0dd49bdSEiji Ota /* 252c0dd49bdSEiji Ota * Nothing to be done - the SG list will 253c0dd49bdSEiji Ota * be unmapped 254c0dd49bdSEiji Ota * when the SEND completes. 255c0dd49bdSEiji Ota */ 256c0dd49bdSEiji Ota break; 257c0dd49bdSEiji Ota default: 258c0dd49bdSEiji Ota #ifndef __lock_lint 2596e18d381Sagiri RDSV3_DPRINTF2("rdsv3_ib_send_cq_comp_handler", 260c0dd49bdSEiji Ota "RDS/IB: %s: unexpected opcode " 261c0dd49bdSEiji Ota "0x%x in WR!", 262c0dd49bdSEiji Ota __func__, send->s_opcode); 263c0dd49bdSEiji Ota #endif 264c0dd49bdSEiji Ota break; 265c0dd49bdSEiji Ota } 266c0dd49bdSEiji Ota 267c0dd49bdSEiji Ota send->s_opcode = 0xdd; 268c0dd49bdSEiji Ota if (send->s_queued + HZ/2 < jiffies) 269c0dd49bdSEiji Ota rdsv3_ib_stats_inc(s_ib_tx_stalled); 270c0dd49bdSEiji Ota 271c0dd49bdSEiji Ota /* 272c0dd49bdSEiji Ota * If a RDMA operation produced an error, signal 273c0dd49bdSEiji Ota * this right 274c0dd49bdSEiji Ota * away. If we don't, the subsequent SEND that goes 275c0dd49bdSEiji Ota * with this 276c0dd49bdSEiji Ota * RDMA will be canceled with ERR_WFLUSH, and the 277c0dd49bdSEiji Ota * application 278c0dd49bdSEiji Ota * never learn that the RDMA failed. 279c0dd49bdSEiji Ota */ 2805d5562f5SEiji Ota if (wc->wc_status == 281c0dd49bdSEiji Ota IBT_WC_REMOTE_ACCESS_ERR && send->s_op) { 282c0dd49bdSEiji Ota struct rdsv3_message *rm; 283c0dd49bdSEiji Ota 284c0dd49bdSEiji Ota rm = rdsv3_send_get_message(conn, send->s_op); 285c0dd49bdSEiji Ota if (rm) { 286c0dd49bdSEiji Ota if (rm->m_rdma_op != NULL) 287c0dd49bdSEiji Ota rdsv3_ib_send_unmap_rdma(ic, 288c0dd49bdSEiji Ota rm->m_rdma_op); 289c0dd49bdSEiji Ota rdsv3_ib_send_rdma_complete(rm, 2905d5562f5SEiji Ota wc->wc_status); 291c0dd49bdSEiji Ota rdsv3_message_put(rm); 292c0dd49bdSEiji Ota } 293c0dd49bdSEiji Ota } 294c0dd49bdSEiji Ota 295c0dd49bdSEiji Ota oldest = (oldest + 1) % ic->i_send_ring.w_nr; 296c0dd49bdSEiji Ota } 297c0dd49bdSEiji Ota 298c0dd49bdSEiji Ota rdsv3_ib_ring_free(&ic->i_send_ring, completed); 299c0dd49bdSEiji Ota 3005d5562f5SEiji Ota clear_bit(RDSV3_LL_SEND_FULL, &conn->c_flags); 301c0dd49bdSEiji Ota 302c0dd49bdSEiji Ota /* We expect errors as the qp is drained during shutdown */ 3035d5562f5SEiji Ota if (wc->wc_status != IBT_WC_SUCCESS && rdsv3_conn_up(conn)) { 3045d5562f5SEiji Ota RDSV3_DPRINTF2("rdsv3_ib_send_cqe_handler", 305c0dd49bdSEiji Ota "send completion on %u.%u.%u.%u " 306c0dd49bdSEiji Ota "had status %u, disconnecting and reconnecting\n", 3075d5562f5SEiji Ota NIPQUAD(conn->c_faddr), wc->wc_status); 308c0dd49bdSEiji Ota rdsv3_conn_drop(conn); 309c0dd49bdSEiji Ota } 310c0dd49bdSEiji Ota 3115d5562f5SEiji Ota RDSV3_DPRINTF4("rdsv3_ib_send_cqe_handler", "Return: conn: %p", ic); 312c0dd49bdSEiji Ota } 313c0dd49bdSEiji Ota 314c0dd49bdSEiji Ota /* 315c0dd49bdSEiji Ota * This is the main function for allocating credits when sending 316c0dd49bdSEiji Ota * messages. 317c0dd49bdSEiji Ota * 318c0dd49bdSEiji Ota * Conceptually, we have two counters: 319c0dd49bdSEiji Ota * - send credits: this tells us how many WRs we're allowed 320c0dd49bdSEiji Ota * to submit without overruning the reciever's queue. For 321c0dd49bdSEiji Ota * each SEND WR we post, we decrement this by one. 322c0dd49bdSEiji Ota * 323c0dd49bdSEiji Ota * - posted credits: this tells us how many WRs we recently 324c0dd49bdSEiji Ota * posted to the receive queue. This value is transferred 325c0dd49bdSEiji Ota * to the peer as a "credit update" in a RDS header field. 326c0dd49bdSEiji Ota * Every time we transmit credits to the peer, we subtract 327c0dd49bdSEiji Ota * the amount of transferred credits from this counter. 328c0dd49bdSEiji Ota * 329c0dd49bdSEiji Ota * It is essential that we avoid situations where both sides have 330c0dd49bdSEiji Ota * exhausted their send credits, and are unable to send new credits 331c0dd49bdSEiji Ota * to the peer. We achieve this by requiring that we send at least 332c0dd49bdSEiji Ota * one credit update to the peer before exhausting our credits. 333c0dd49bdSEiji Ota * When new credits arrive, we subtract one credit that is withheld 334c0dd49bdSEiji Ota * until we've posted new buffers and are ready to transmit these 335c0dd49bdSEiji Ota * credits (see rdsv3_ib_send_add_credits below). 336c0dd49bdSEiji Ota * 337c0dd49bdSEiji Ota * The RDS send code is essentially single-threaded; rdsv3_send_xmit 338c0dd49bdSEiji Ota * grabs c_send_lock to ensure exclusive access to the send ring. 339c0dd49bdSEiji Ota * However, the ACK sending code is independent and can race with 340c0dd49bdSEiji Ota * message SENDs. 341c0dd49bdSEiji Ota * 342c0dd49bdSEiji Ota * In the send path, we need to update the counters for send credits 343c0dd49bdSEiji Ota * and the counter of posted buffers atomically - when we use the 344c0dd49bdSEiji Ota * last available credit, we cannot allow another thread to race us 345c0dd49bdSEiji Ota * and grab the posted credits counter. Hence, we have to use a 346c0dd49bdSEiji Ota * spinlock to protect the credit counter, or use atomics. 347c0dd49bdSEiji Ota * 348c0dd49bdSEiji Ota * Spinlocks shared between the send and the receive path are bad, 349c0dd49bdSEiji Ota * because they create unnecessary delays. An early implementation 350c0dd49bdSEiji Ota * using a spinlock showed a 5% degradation in throughput at some 351c0dd49bdSEiji Ota * loads. 352c0dd49bdSEiji Ota * 353c0dd49bdSEiji Ota * This implementation avoids spinlocks completely, putting both 354c0dd49bdSEiji Ota * counters into a single atomic, and updating that atomic using 355c0dd49bdSEiji Ota * atomic_add (in the receive path, when receiving fresh credits), 356c0dd49bdSEiji Ota * and using atomic_cmpxchg when updating the two counters. 357c0dd49bdSEiji Ota */ 358c0dd49bdSEiji Ota int 359c0dd49bdSEiji Ota rdsv3_ib_send_grab_credits(struct rdsv3_ib_connection *ic, 360cadbfdc3SEiji Ota uint32_t wanted, uint32_t *adv_credits, int need_posted) 361c0dd49bdSEiji Ota { 362c0dd49bdSEiji Ota unsigned int avail, posted, got = 0, advertise; 363c0dd49bdSEiji Ota long oldval, newval; 364c0dd49bdSEiji Ota 365cadbfdc3SEiji Ota RDSV3_DPRINTF4("rdsv3_ib_send_grab_credits", "ic: %p, %d %d %d", 366cadbfdc3SEiji Ota ic, wanted, *adv_credits, need_posted); 367c0dd49bdSEiji Ota 368c0dd49bdSEiji Ota *adv_credits = 0; 369c0dd49bdSEiji Ota if (!ic->i_flowctl) 370c0dd49bdSEiji Ota return (wanted); 371c0dd49bdSEiji Ota 372c0dd49bdSEiji Ota try_again: 373c0dd49bdSEiji Ota advertise = 0; 374c0dd49bdSEiji Ota oldval = newval = atomic_get(&ic->i_credits); 375c0dd49bdSEiji Ota posted = IB_GET_POST_CREDITS(oldval); 376c0dd49bdSEiji Ota avail = IB_GET_SEND_CREDITS(oldval); 377c0dd49bdSEiji Ota 378c0dd49bdSEiji Ota RDSV3_DPRINTF5("rdsv3_ib_send_grab_credits", 379c0dd49bdSEiji Ota "wanted (%u): credits=%u posted=%u\n", wanted, avail, posted); 380c0dd49bdSEiji Ota 381c0dd49bdSEiji Ota /* The last credit must be used to send a credit update. */ 382c0dd49bdSEiji Ota if (avail && !posted) 383c0dd49bdSEiji Ota avail--; 384c0dd49bdSEiji Ota 385c0dd49bdSEiji Ota if (avail < wanted) { 386c0dd49bdSEiji Ota struct rdsv3_connection *conn = ic->i_cm_id->context; 387c0dd49bdSEiji Ota 388c0dd49bdSEiji Ota /* Oops, there aren't that many credits left! */ 389c0dd49bdSEiji Ota set_bit(RDSV3_LL_SEND_FULL, &conn->c_flags); 390c0dd49bdSEiji Ota got = avail; 391c0dd49bdSEiji Ota } else { 392c0dd49bdSEiji Ota /* Sometimes you get what you want, lalala. */ 393c0dd49bdSEiji Ota got = wanted; 394c0dd49bdSEiji Ota } 395c0dd49bdSEiji Ota newval -= IB_SET_SEND_CREDITS(got); 396c0dd49bdSEiji Ota 397c0dd49bdSEiji Ota /* 398c0dd49bdSEiji Ota * If need_posted is non-zero, then the caller wants 399c0dd49bdSEiji Ota * the posted regardless of whether any send credits are 400c0dd49bdSEiji Ota * available. 401c0dd49bdSEiji Ota */ 402c0dd49bdSEiji Ota if (posted && (got || need_posted)) { 403cadbfdc3SEiji Ota advertise = min(posted, RDSV3_MAX_ADV_CREDIT); 404c0dd49bdSEiji Ota newval -= IB_SET_POST_CREDITS(advertise); 405c0dd49bdSEiji Ota } 406c0dd49bdSEiji Ota 407c0dd49bdSEiji Ota /* Finally bill everything */ 408c0dd49bdSEiji Ota if (atomic_cmpxchg(&ic->i_credits, oldval, newval) != oldval) 409c0dd49bdSEiji Ota goto try_again; 410c0dd49bdSEiji Ota 411c0dd49bdSEiji Ota *adv_credits = advertise; 412c0dd49bdSEiji Ota 413cadbfdc3SEiji Ota RDSV3_DPRINTF4("rdsv3_ib_send_grab_credits", "ic: %p, %d %d %d", 414cadbfdc3SEiji Ota ic, got, *adv_credits, need_posted); 415cadbfdc3SEiji Ota 416c0dd49bdSEiji Ota return (got); 417c0dd49bdSEiji Ota } 418c0dd49bdSEiji Ota 419c0dd49bdSEiji Ota void 420c0dd49bdSEiji Ota rdsv3_ib_send_add_credits(struct rdsv3_connection *conn, unsigned int credits) 421c0dd49bdSEiji Ota { 422c0dd49bdSEiji Ota struct rdsv3_ib_connection *ic = conn->c_transport_data; 423c0dd49bdSEiji Ota 424c0dd49bdSEiji Ota if (credits == 0) 425c0dd49bdSEiji Ota return; 426c0dd49bdSEiji Ota 427c0dd49bdSEiji Ota RDSV3_DPRINTF5("rdsv3_ib_send_add_credits", 428c0dd49bdSEiji Ota "credits (%u): current=%u%s\n", 429c0dd49bdSEiji Ota credits, 430c0dd49bdSEiji Ota IB_GET_SEND_CREDITS(atomic_get(&ic->i_credits)), 431c0dd49bdSEiji Ota test_bit(RDSV3_LL_SEND_FULL, &conn->c_flags) ? 432c0dd49bdSEiji Ota ", ll_send_full" : ""); 433c0dd49bdSEiji Ota 434c0dd49bdSEiji Ota atomic_add_32(&ic->i_credits, IB_SET_SEND_CREDITS(credits)); 435c0dd49bdSEiji Ota if (test_and_clear_bit(RDSV3_LL_SEND_FULL, &conn->c_flags)) 436c0dd49bdSEiji Ota rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_send_w, 0); 437c0dd49bdSEiji Ota 438c0dd49bdSEiji Ota ASSERT(!(IB_GET_SEND_CREDITS(credits) >= 16384)); 439c0dd49bdSEiji Ota 440c0dd49bdSEiji Ota rdsv3_ib_stats_inc(s_ib_rx_credit_updates); 441c0dd49bdSEiji Ota 442c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_ib_send_add_credits", 443c0dd49bdSEiji Ota "Return: conn: %p, credits: %d", 444c0dd49bdSEiji Ota conn, credits); 445c0dd49bdSEiji Ota } 446c0dd49bdSEiji Ota 447c0dd49bdSEiji Ota void 448c0dd49bdSEiji Ota rdsv3_ib_advertise_credits(struct rdsv3_connection *conn, unsigned int posted) 449c0dd49bdSEiji Ota { 450c0dd49bdSEiji Ota struct rdsv3_ib_connection *ic = conn->c_transport_data; 451c0dd49bdSEiji Ota 452c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_ib_advertise_credits", "conn: %p, posted: %d", 453c0dd49bdSEiji Ota conn, posted); 454c0dd49bdSEiji Ota 455c0dd49bdSEiji Ota if (posted == 0) 456c0dd49bdSEiji Ota return; 457c0dd49bdSEiji Ota 458c0dd49bdSEiji Ota atomic_add_32(&ic->i_credits, IB_SET_POST_CREDITS(posted)); 459c0dd49bdSEiji Ota 460c0dd49bdSEiji Ota /* 461c0dd49bdSEiji Ota * Decide whether to send an update to the peer now. 462c0dd49bdSEiji Ota * If we would send a credit update for every single buffer we 463c0dd49bdSEiji Ota * post, we would end up with an ACK storm (ACK arrives, 464c0dd49bdSEiji Ota * consumes buffer, we refill the ring, send ACK to remote 465c0dd49bdSEiji Ota * advertising the newly posted buffer... ad inf) 466c0dd49bdSEiji Ota * 467c0dd49bdSEiji Ota * Performance pretty much depends on how often we send 468c0dd49bdSEiji Ota * credit updates - too frequent updates mean lots of ACKs. 469c0dd49bdSEiji Ota * Too infrequent updates, and the peer will run out of 470c0dd49bdSEiji Ota * credits and has to throttle. 471c0dd49bdSEiji Ota * For the time being, 16 seems to be a good compromise. 472c0dd49bdSEiji Ota */ 473c0dd49bdSEiji Ota if (IB_GET_POST_CREDITS(atomic_get(&ic->i_credits)) >= 16) 474c0dd49bdSEiji Ota set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); 475c0dd49bdSEiji Ota } 476c0dd49bdSEiji Ota 477c0dd49bdSEiji Ota static inline void 478c0dd49bdSEiji Ota rdsv3_ib_xmit_populate_wr(struct rdsv3_ib_connection *ic, 479c0dd49bdSEiji Ota ibt_send_wr_t *wr, unsigned int pos, 480c0dd49bdSEiji Ota struct rdsv3_scatterlist *scat, unsigned int off, unsigned int length, 481c0dd49bdSEiji Ota int send_flags) 482c0dd49bdSEiji Ota { 483c0dd49bdSEiji Ota ibt_wr_ds_t *sge; 484c0dd49bdSEiji Ota 485c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_ib_xmit_populate_wr", 486c0dd49bdSEiji Ota "ic: %p, wr: %p scat: %p %d %d %d %d", 487c0dd49bdSEiji Ota ic, wr, scat, pos, off, length, send_flags); 488c0dd49bdSEiji Ota 4895d5562f5SEiji Ota wr->wr_id = pos | RDSV3_IB_SEND_OP; 490c0dd49bdSEiji Ota wr->wr_trans = IBT_RC_SRV; 491c0dd49bdSEiji Ota wr->wr_flags = send_flags; 492c0dd49bdSEiji Ota wr->wr_opcode = IBT_WRC_SEND; 493c0dd49bdSEiji Ota 494c0dd49bdSEiji Ota if (length != 0) { 495c0dd49bdSEiji Ota int ix, len, assigned; 496c0dd49bdSEiji Ota ibt_wr_ds_t *sgl; 497c0dd49bdSEiji Ota 498c0dd49bdSEiji Ota ASSERT(length <= scat->length - off); 499c0dd49bdSEiji Ota 500c0dd49bdSEiji Ota sgl = scat->sgl; 501c0dd49bdSEiji Ota if (off != 0) { 502c0dd49bdSEiji Ota /* find the right sgl to begin with */ 503c0dd49bdSEiji Ota while (sgl->ds_len <= off) { 504c0dd49bdSEiji Ota off -= sgl->ds_len; 505c0dd49bdSEiji Ota sgl++; 506c0dd49bdSEiji Ota } 507c0dd49bdSEiji Ota } 508c0dd49bdSEiji Ota 509c0dd49bdSEiji Ota ix = 1; /* first data sgl is at 1 */ 510c0dd49bdSEiji Ota assigned = 0; 511c0dd49bdSEiji Ota len = length; 512c0dd49bdSEiji Ota do { 513c0dd49bdSEiji Ota sge = &wr->wr_sgl[ix++]; 514c0dd49bdSEiji Ota sge->ds_va = sgl->ds_va + off; 515c0dd49bdSEiji Ota assigned = min(len, sgl->ds_len - off); 516c0dd49bdSEiji Ota sge->ds_len = assigned; 517c0dd49bdSEiji Ota sge->ds_key = sgl->ds_key; 518c0dd49bdSEiji Ota len -= assigned; 519c0dd49bdSEiji Ota if (len != 0) { 520c0dd49bdSEiji Ota sgl++; 521c0dd49bdSEiji Ota off = 0; 522c0dd49bdSEiji Ota } 523c0dd49bdSEiji Ota } while (len > 0); 524c0dd49bdSEiji Ota 525c0dd49bdSEiji Ota wr->wr_nds = ix; 526c0dd49bdSEiji Ota } else { 527c0dd49bdSEiji Ota /* 528c0dd49bdSEiji Ota * We're sending a packet with no payload. There is only 529c0dd49bdSEiji Ota * one SGE 530c0dd49bdSEiji Ota */ 531c0dd49bdSEiji Ota wr->wr_nds = 1; 532c0dd49bdSEiji Ota } 533c0dd49bdSEiji Ota 534c0dd49bdSEiji Ota sge = &wr->wr_sgl[0]; 535c0dd49bdSEiji Ota sge->ds_va = ic->i_send_hdrs_dma + (pos * sizeof (struct rdsv3_header)); 536c0dd49bdSEiji Ota sge->ds_len = sizeof (struct rdsv3_header); 537c0dd49bdSEiji Ota sge->ds_key = ic->i_mr->lkey; 538c0dd49bdSEiji Ota 539c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_ib_xmit_populate_wr", 540c0dd49bdSEiji Ota "Return: ic: %p, wr: %p scat: %p", ic, wr, scat); 541c0dd49bdSEiji Ota } 542c0dd49bdSEiji Ota 543c0dd49bdSEiji Ota /* 544c0dd49bdSEiji Ota * This can be called multiple times for a given message. The first time 545c0dd49bdSEiji Ota * we see a message we map its scatterlist into the IB device so that 546c0dd49bdSEiji Ota * we can provide that mapped address to the IB scatter gather entries 547c0dd49bdSEiji Ota * in the IB work requests. We translate the scatterlist into a series 548c0dd49bdSEiji Ota * of work requests that fragment the message. These work requests complete 549c0dd49bdSEiji Ota * in order so we pass ownership of the message to the completion handler 550c0dd49bdSEiji Ota * once we send the final fragment. 551c0dd49bdSEiji Ota * 552c0dd49bdSEiji Ota * The RDS core uses the c_send_lock to only enter this function once 553c0dd49bdSEiji Ota * per connection. This makes sure that the tx ring alloc/unalloc pairs 554c0dd49bdSEiji Ota * don't get out of sync and confuse the ring. 555c0dd49bdSEiji Ota */ 556c0dd49bdSEiji Ota int 557c0dd49bdSEiji Ota rdsv3_ib_xmit(struct rdsv3_connection *conn, struct rdsv3_message *rm, 558c0dd49bdSEiji Ota unsigned int hdr_off, unsigned int sg, unsigned int off) 559c0dd49bdSEiji Ota { 560c0dd49bdSEiji Ota struct rdsv3_ib_connection *ic = conn->c_transport_data; 561c0dd49bdSEiji Ota struct ib_device *dev = ic->i_cm_id->device; 562c0dd49bdSEiji Ota struct rdsv3_ib_send_work *send = NULL; 563c0dd49bdSEiji Ota struct rdsv3_ib_send_work *first; 564c0dd49bdSEiji Ota struct rdsv3_ib_send_work *prev; 565c0dd49bdSEiji Ota ibt_send_wr_t *wr; 566c0dd49bdSEiji Ota struct rdsv3_scatterlist *scat; 567c0dd49bdSEiji Ota uint32_t pos; 568c0dd49bdSEiji Ota uint32_t i; 569c0dd49bdSEiji Ota uint32_t work_alloc; 570c0dd49bdSEiji Ota uint32_t credit_alloc; 571c0dd49bdSEiji Ota uint32_t posted; 572c0dd49bdSEiji Ota uint32_t adv_credits = 0; 573c0dd49bdSEiji Ota int send_flags = 0; 574c0dd49bdSEiji Ota int sent; 575c0dd49bdSEiji Ota int ret; 576c0dd49bdSEiji Ota int flow_controlled = 0; 577c0dd49bdSEiji Ota 578c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_ib_xmit", "conn: %p, rm: %p", conn, rm); 579c0dd49bdSEiji Ota 580c0dd49bdSEiji Ota ASSERT(!(off % RDSV3_FRAG_SIZE)); 581c0dd49bdSEiji Ota ASSERT(!(hdr_off != 0 && hdr_off != sizeof (struct rdsv3_header))); 582c0dd49bdSEiji Ota 583c0dd49bdSEiji Ota /* Do not send cong updates to IB loopback */ 584c0dd49bdSEiji Ota if (conn->c_loopback && 585c0dd49bdSEiji Ota rm->m_inc.i_hdr.h_flags & RDSV3_FLAG_CONG_BITMAP) { 586c0dd49bdSEiji Ota rdsv3_cong_map_updated(conn->c_fcong, ~(uint64_t)0); 587c0dd49bdSEiji Ota return (sizeof (struct rdsv3_header) + RDSV3_CONG_MAP_BYTES); 588c0dd49bdSEiji Ota } 589c0dd49bdSEiji Ota 590c0dd49bdSEiji Ota #ifndef __lock_lint 591c0dd49bdSEiji Ota /* FIXME we may overallocate here */ 592c0dd49bdSEiji Ota if (ntohl(rm->m_inc.i_hdr.h_len) == 0) 593c0dd49bdSEiji Ota i = 1; 594c0dd49bdSEiji Ota else 595c0dd49bdSEiji Ota i = ceil(ntohl(rm->m_inc.i_hdr.h_len), RDSV3_FRAG_SIZE); 596c0dd49bdSEiji Ota #endif 597c0dd49bdSEiji Ota 598c0dd49bdSEiji Ota work_alloc = rdsv3_ib_ring_alloc(&ic->i_send_ring, i, &pos); 5995d5562f5SEiji Ota if (work_alloc != i) { 6005d5562f5SEiji Ota rdsv3_ib_ring_unalloc(&ic->i_send_ring, work_alloc); 601c0dd49bdSEiji Ota set_bit(RDSV3_LL_SEND_FULL, &conn->c_flags); 602c0dd49bdSEiji Ota rdsv3_ib_stats_inc(s_ib_tx_ring_full); 603c0dd49bdSEiji Ota ret = -ENOMEM; 604c0dd49bdSEiji Ota goto out; 605c0dd49bdSEiji Ota } 606c0dd49bdSEiji Ota 607c0dd49bdSEiji Ota credit_alloc = work_alloc; 608c0dd49bdSEiji Ota if (ic->i_flowctl) { 609c0dd49bdSEiji Ota credit_alloc = rdsv3_ib_send_grab_credits(ic, work_alloc, 610cadbfdc3SEiji Ota &posted, 0); 611c0dd49bdSEiji Ota adv_credits += posted; 612c0dd49bdSEiji Ota if (credit_alloc < work_alloc) { 613c0dd49bdSEiji Ota rdsv3_ib_ring_unalloc(&ic->i_send_ring, 614c0dd49bdSEiji Ota work_alloc - credit_alloc); 615c0dd49bdSEiji Ota work_alloc = credit_alloc; 616c0dd49bdSEiji Ota flow_controlled++; 617c0dd49bdSEiji Ota } 618c0dd49bdSEiji Ota if (work_alloc == 0) { 619cadbfdc3SEiji Ota rdsv3_ib_ring_unalloc(&ic->i_send_ring, work_alloc); 620c0dd49bdSEiji Ota rdsv3_ib_stats_inc(s_ib_tx_throttle); 621c0dd49bdSEiji Ota ret = -ENOMEM; 622c0dd49bdSEiji Ota goto out; 623c0dd49bdSEiji Ota } 624c0dd49bdSEiji Ota } 625c0dd49bdSEiji Ota 626c0dd49bdSEiji Ota /* map the message the first time we see it */ 627c0dd49bdSEiji Ota if (ic->i_rm == NULL) { 628c0dd49bdSEiji Ota /* 629c0dd49bdSEiji Ota * printk(KERN_NOTICE 630c0dd49bdSEiji Ota * "rdsv3_ib_xmit prep msg dport=%u flags=0x%x len=%d\n", 631c0dd49bdSEiji Ota * be16_to_cpu(rm->m_inc.i_hdr.h_dport), 632c0dd49bdSEiji Ota * rm->m_inc.i_hdr.h_flags, 633c0dd49bdSEiji Ota * be32_to_cpu(rm->m_inc.i_hdr.h_len)); 634c0dd49bdSEiji Ota */ 635c0dd49bdSEiji Ota if (rm->m_nents) { 636c0dd49bdSEiji Ota rm->m_count = rdsv3_ib_dma_map_sg(dev, 637c0dd49bdSEiji Ota rm->m_sg, rm->m_nents); 638c0dd49bdSEiji Ota RDSV3_DPRINTF5("rdsv3_ib_xmit", 639c0dd49bdSEiji Ota "ic %p mapping rm %p: %d\n", ic, rm, rm->m_count); 640c0dd49bdSEiji Ota if (rm->m_count == 0) { 641c0dd49bdSEiji Ota rdsv3_ib_stats_inc(s_ib_tx_sg_mapping_failure); 642c0dd49bdSEiji Ota rdsv3_ib_ring_unalloc(&ic->i_send_ring, 643c0dd49bdSEiji Ota work_alloc); 644c0dd49bdSEiji Ota ret = -ENOMEM; /* XXX ? */ 645c0dd49bdSEiji Ota RDSV3_DPRINTF2("rdsv3_ib_xmit", 646c0dd49bdSEiji Ota "fail: ic %p mapping rm %p: %d\n", 647c0dd49bdSEiji Ota ic, rm, rm->m_count); 648c0dd49bdSEiji Ota goto out; 649c0dd49bdSEiji Ota } 650c0dd49bdSEiji Ota } else { 651c0dd49bdSEiji Ota rm->m_count = 0; 652c0dd49bdSEiji Ota } 653c0dd49bdSEiji Ota 654c0dd49bdSEiji Ota ic->i_unsignaled_wrs = rdsv3_ib_sysctl_max_unsig_wrs; 655c0dd49bdSEiji Ota ic->i_unsignaled_bytes = rdsv3_ib_sysctl_max_unsig_bytes; 656c0dd49bdSEiji Ota rdsv3_message_addref(rm); 657c0dd49bdSEiji Ota ic->i_rm = rm; 658c0dd49bdSEiji Ota 659c0dd49bdSEiji Ota /* Finalize the header */ 660c0dd49bdSEiji Ota if (test_bit(RDSV3_MSG_ACK_REQUIRED, &rm->m_flags)) 661c0dd49bdSEiji Ota rm->m_inc.i_hdr.h_flags |= RDSV3_FLAG_ACK_REQUIRED; 662c0dd49bdSEiji Ota if (test_bit(RDSV3_MSG_RETRANSMITTED, &rm->m_flags)) 663c0dd49bdSEiji Ota rm->m_inc.i_hdr.h_flags |= RDSV3_FLAG_RETRANSMITTED; 664c0dd49bdSEiji Ota 665c0dd49bdSEiji Ota /* 666c0dd49bdSEiji Ota * If it has a RDMA op, tell the peer we did it. This is 667c0dd49bdSEiji Ota * used by the peer to release use-once RDMA MRs. 668c0dd49bdSEiji Ota */ 669c0dd49bdSEiji Ota if (rm->m_rdma_op) { 670c0dd49bdSEiji Ota struct rdsv3_ext_header_rdma ext_hdr; 671c0dd49bdSEiji Ota 672c0dd49bdSEiji Ota ext_hdr.h_rdma_rkey = htonl(rm->m_rdma_op->r_key); 673c0dd49bdSEiji Ota (void) rdsv3_message_add_extension(&rm->m_inc.i_hdr, 674c0dd49bdSEiji Ota RDSV3_EXTHDR_RDMA, &ext_hdr, 675c0dd49bdSEiji Ota sizeof (ext_hdr)); 676c0dd49bdSEiji Ota } 677c0dd49bdSEiji Ota if (rm->m_rdma_cookie) { 678c0dd49bdSEiji Ota (void) rdsv3_message_add_rdma_dest_extension( 679c0dd49bdSEiji Ota &rm->m_inc.i_hdr, 680c0dd49bdSEiji Ota rdsv3_rdma_cookie_key(rm->m_rdma_cookie), 681c0dd49bdSEiji Ota rdsv3_rdma_cookie_offset(rm->m_rdma_cookie)); 682c0dd49bdSEiji Ota } 683c0dd49bdSEiji Ota 684c0dd49bdSEiji Ota /* 685c0dd49bdSEiji Ota * Note - rdsv3_ib_piggyb_ack clears the ACK_REQUIRED bit, so 686c0dd49bdSEiji Ota * we should not do this unless we have a chance of at least 687c0dd49bdSEiji Ota * sticking the header into the send ring. Which is why we 688c0dd49bdSEiji Ota * should call rdsv3_ib_ring_alloc first. 689c0dd49bdSEiji Ota */ 690c0dd49bdSEiji Ota rm->m_inc.i_hdr.h_ack = htonll(rdsv3_ib_piggyb_ack(ic)); 691c0dd49bdSEiji Ota rdsv3_message_make_checksum(&rm->m_inc.i_hdr); 692c0dd49bdSEiji Ota 693c0dd49bdSEiji Ota /* 694c0dd49bdSEiji Ota * Update adv_credits since we reset the ACK_REQUIRED bit. 695c0dd49bdSEiji Ota */ 696cadbfdc3SEiji Ota (void) rdsv3_ib_send_grab_credits(ic, 0, &posted, 1); 697c0dd49bdSEiji Ota adv_credits += posted; 698c0dd49bdSEiji Ota ASSERT(adv_credits <= 255); 699cadbfdc3SEiji Ota } 700c0dd49bdSEiji Ota 701c0dd49bdSEiji Ota send = &ic->i_sends[pos]; 702c0dd49bdSEiji Ota first = send; 703c0dd49bdSEiji Ota prev = NULL; 704c0dd49bdSEiji Ota scat = &rm->m_sg[sg]; 705c0dd49bdSEiji Ota sent = 0; 706c0dd49bdSEiji Ota i = 0; 707c0dd49bdSEiji Ota 708c0dd49bdSEiji Ota /* 709c0dd49bdSEiji Ota * Sometimes you want to put a fence between an RDMA 710c0dd49bdSEiji Ota * READ and the following SEND. 711c0dd49bdSEiji Ota * We could either do this all the time 712c0dd49bdSEiji Ota * or when requested by the user. Right now, we let 713c0dd49bdSEiji Ota * the application choose. 714c0dd49bdSEiji Ota */ 715c0dd49bdSEiji Ota if (rm->m_rdma_op && rm->m_rdma_op->r_fence) 716c0dd49bdSEiji Ota send_flags = IBT_WR_SEND_FENCE; 717c0dd49bdSEiji Ota 718c0dd49bdSEiji Ota /* 719c0dd49bdSEiji Ota * We could be copying the header into the unused tail of the page. 720c0dd49bdSEiji Ota * That would need to be changed in the future when those pages might 721c0dd49bdSEiji Ota * be mapped userspace pages or page cache pages. So instead we always 722c0dd49bdSEiji Ota * use a second sge and our long-lived ring of mapped headers. We send 723c0dd49bdSEiji Ota * the header after the data so that the data payload can be aligned on 724c0dd49bdSEiji Ota * the receiver. 725c0dd49bdSEiji Ota */ 726c0dd49bdSEiji Ota 727c0dd49bdSEiji Ota /* handle a 0-len message */ 728c0dd49bdSEiji Ota if (ntohl(rm->m_inc.i_hdr.h_len) == 0) { 729c0dd49bdSEiji Ota wr = &ic->i_send_wrs[0]; 730c0dd49bdSEiji Ota rdsv3_ib_xmit_populate_wr(ic, wr, pos, NULL, 0, 0, send_flags); 731c0dd49bdSEiji Ota send->s_queued = jiffies; 732c0dd49bdSEiji Ota send->s_op = NULL; 733c0dd49bdSEiji Ota send->s_opcode = wr->wr_opcode; 734c0dd49bdSEiji Ota goto add_header; 735c0dd49bdSEiji Ota } 736c0dd49bdSEiji Ota 737c0dd49bdSEiji Ota /* if there's data reference it with a chain of work reqs */ 738c0dd49bdSEiji Ota for (; i < work_alloc && scat != &rm->m_sg[rm->m_count]; i++) { 739c0dd49bdSEiji Ota unsigned int len; 740c0dd49bdSEiji Ota 741c0dd49bdSEiji Ota send = &ic->i_sends[pos]; 742c0dd49bdSEiji Ota 743c0dd49bdSEiji Ota wr = &ic->i_send_wrs[i]; 744c0dd49bdSEiji Ota len = min(RDSV3_FRAG_SIZE, 745c0dd49bdSEiji Ota rdsv3_ib_sg_dma_len(dev, scat) - off); 746c0dd49bdSEiji Ota rdsv3_ib_xmit_populate_wr(ic, wr, pos, scat, off, len, 747c0dd49bdSEiji Ota send_flags); 748c0dd49bdSEiji Ota send->s_queued = jiffies; 749c0dd49bdSEiji Ota send->s_op = NULL; 750c0dd49bdSEiji Ota send->s_opcode = wr->wr_opcode; 751c0dd49bdSEiji Ota 752c0dd49bdSEiji Ota /* 753c0dd49bdSEiji Ota * We want to delay signaling completions just enough to get 754c0dd49bdSEiji Ota * the batching benefits but not so much that we create dead 755c0dd49bdSEiji Ota * time 756c0dd49bdSEiji Ota * on the wire. 757c0dd49bdSEiji Ota */ 758c0dd49bdSEiji Ota if (ic->i_unsignaled_wrs-- == 0) { 759c0dd49bdSEiji Ota ic->i_unsignaled_wrs = rdsv3_ib_sysctl_max_unsig_wrs; 760c0dd49bdSEiji Ota wr->wr_flags |= 761c0dd49bdSEiji Ota IBT_WR_SEND_SIGNAL | IBT_WR_SEND_SOLICIT; 762c0dd49bdSEiji Ota } 763c0dd49bdSEiji Ota 764c0dd49bdSEiji Ota ic->i_unsignaled_bytes -= len; 765c0dd49bdSEiji Ota if (ic->i_unsignaled_bytes <= 0) { 766c0dd49bdSEiji Ota ic->i_unsignaled_bytes = 767c0dd49bdSEiji Ota rdsv3_ib_sysctl_max_unsig_bytes; 768c0dd49bdSEiji Ota wr->wr_flags |= 769c0dd49bdSEiji Ota IBT_WR_SEND_SIGNAL | IBT_WR_SEND_SOLICIT; 770c0dd49bdSEiji Ota } 771c0dd49bdSEiji Ota 772c0dd49bdSEiji Ota /* 773c0dd49bdSEiji Ota * Always signal the last one if we're stopping due to flow 774c0dd49bdSEiji Ota * control. 775c0dd49bdSEiji Ota */ 776c0dd49bdSEiji Ota if (flow_controlled && i == (work_alloc-1)) { 777c0dd49bdSEiji Ota wr->wr_flags |= 778c0dd49bdSEiji Ota IBT_WR_SEND_SIGNAL | IBT_WR_SEND_SOLICIT; 779c0dd49bdSEiji Ota } 780c0dd49bdSEiji Ota 781c0dd49bdSEiji Ota RDSV3_DPRINTF5("rdsv3_ib_xmit", "send %p wr %p num_sge %u \n", 782c0dd49bdSEiji Ota send, wr, wr->wr_nds); 783c0dd49bdSEiji Ota 784c0dd49bdSEiji Ota sent += len; 785c0dd49bdSEiji Ota off += len; 786c0dd49bdSEiji Ota if (off == rdsv3_ib_sg_dma_len(dev, scat)) { 787c0dd49bdSEiji Ota scat++; 788c0dd49bdSEiji Ota off = 0; 789c0dd49bdSEiji Ota } 790c0dd49bdSEiji Ota 791c0dd49bdSEiji Ota add_header: 792c0dd49bdSEiji Ota /* 793c0dd49bdSEiji Ota * Tack on the header after the data. The header SGE 794c0dd49bdSEiji Ota * should already 795c0dd49bdSEiji Ota * have been set up to point to the right header buffer. 796c0dd49bdSEiji Ota */ 797c0dd49bdSEiji Ota (void) memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, 798c0dd49bdSEiji Ota sizeof (struct rdsv3_header)); 799c0dd49bdSEiji Ota 800c0dd49bdSEiji Ota if (0) { 801c0dd49bdSEiji Ota struct rdsv3_header *hdr = &ic->i_send_hdrs[pos]; 802c0dd49bdSEiji Ota 8036e18d381Sagiri RDSV3_DPRINTF2("rdsv3_ib_xmit", 804c0dd49bdSEiji Ota "send WR dport=%u flags=0x%x len=%d", 805c0dd49bdSEiji Ota ntohs(hdr->h_dport), 806c0dd49bdSEiji Ota hdr->h_flags, 807c0dd49bdSEiji Ota ntohl(hdr->h_len)); 808c0dd49bdSEiji Ota } 809c0dd49bdSEiji Ota if (adv_credits) { 810c0dd49bdSEiji Ota struct rdsv3_header *hdr = &ic->i_send_hdrs[pos]; 811c0dd49bdSEiji Ota 812c0dd49bdSEiji Ota /* add credit and redo the header checksum */ 813c0dd49bdSEiji Ota hdr->h_credit = adv_credits; 814c0dd49bdSEiji Ota rdsv3_message_make_checksum(hdr); 815c0dd49bdSEiji Ota adv_credits = 0; 816c0dd49bdSEiji Ota rdsv3_ib_stats_inc(s_ib_tx_credit_updates); 817c0dd49bdSEiji Ota } 818c0dd49bdSEiji Ota 819c0dd49bdSEiji Ota prev = send; 820c0dd49bdSEiji Ota 821c0dd49bdSEiji Ota pos = (pos + 1) % ic->i_send_ring.w_nr; 822c0dd49bdSEiji Ota } 823c0dd49bdSEiji Ota 824c0dd49bdSEiji Ota /* 825c0dd49bdSEiji Ota * Account the RDS header in the number of bytes we sent, but just once. 826c0dd49bdSEiji Ota * The caller has no concept of fragmentation. 827c0dd49bdSEiji Ota */ 828c0dd49bdSEiji Ota if (hdr_off == 0) 829c0dd49bdSEiji Ota sent += sizeof (struct rdsv3_header); 830c0dd49bdSEiji Ota 831c0dd49bdSEiji Ota /* if we finished the message then send completion owns it */ 832c0dd49bdSEiji Ota if (scat == &rm->m_sg[rm->m_count]) { 833c0dd49bdSEiji Ota prev->s_rm = ic->i_rm; 834c0dd49bdSEiji Ota wr->wr_flags |= IBT_WR_SEND_SIGNAL | IBT_WR_SEND_SOLICIT; 835c0dd49bdSEiji Ota ic->i_rm = NULL; 836c0dd49bdSEiji Ota } 837c0dd49bdSEiji Ota 838c0dd49bdSEiji Ota if (i < work_alloc) { 839c0dd49bdSEiji Ota rdsv3_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i); 840c0dd49bdSEiji Ota work_alloc = i; 841c0dd49bdSEiji Ota } 842c0dd49bdSEiji Ota if (ic->i_flowctl && i < credit_alloc) 843c0dd49bdSEiji Ota rdsv3_ib_send_add_credits(conn, credit_alloc - i); 844c0dd49bdSEiji Ota 845c0dd49bdSEiji Ota /* XXX need to worry about failed_wr and partial sends. */ 846c0dd49bdSEiji Ota ret = ibt_post_send(ib_get_ibt_channel_hdl(ic->i_cm_id), 847c0dd49bdSEiji Ota ic->i_send_wrs, i, &posted); 848c0dd49bdSEiji Ota if (posted != i) { 8496e18d381Sagiri RDSV3_DPRINTF2("rdsv3_ib_xmit", 850c0dd49bdSEiji Ota "ic %p first %p nwr: %d ret %d:%d", 851c0dd49bdSEiji Ota ic, first, i, ret, posted); 852c0dd49bdSEiji Ota } 853c0dd49bdSEiji Ota if (ret) { 8546e18d381Sagiri RDSV3_DPRINTF2("rdsv3_ib_xmit", 855c0dd49bdSEiji Ota "RDS/IB: ib_post_send to %u.%u.%u.%u " 856c0dd49bdSEiji Ota "returned %d\n", NIPQUAD(conn->c_faddr), ret); 857c0dd49bdSEiji Ota rdsv3_ib_ring_unalloc(&ic->i_send_ring, work_alloc); 858c0dd49bdSEiji Ota if (prev->s_rm) { 859c0dd49bdSEiji Ota ic->i_rm = prev->s_rm; 860c0dd49bdSEiji Ota prev->s_rm = NULL; 861c0dd49bdSEiji Ota } 862cadbfdc3SEiji Ota RDSV3_DPRINTF2("rdsv3_ib_xmit", "ibt_post_send failed\n"); 863cadbfdc3SEiji Ota rdsv3_conn_drop(ic->conn); 8645d5562f5SEiji Ota ret = -EAGAIN; 865c0dd49bdSEiji Ota goto out; 866c0dd49bdSEiji Ota } 867c0dd49bdSEiji Ota 868c0dd49bdSEiji Ota ret = sent; 869c0dd49bdSEiji Ota 870c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_ib_xmit", "Return: conn: %p, rm: %p", conn, rm); 871c0dd49bdSEiji Ota out: 872c0dd49bdSEiji Ota ASSERT(!adv_credits); 873c0dd49bdSEiji Ota return (ret); 874c0dd49bdSEiji Ota } 875c0dd49bdSEiji Ota 876c0dd49bdSEiji Ota static void 877c0dd49bdSEiji Ota rdsv3_ib_dma_unmap_sg_rdma(struct ib_device *dev, uint_t num, 878c0dd49bdSEiji Ota struct rdsv3_rdma_sg scat[]) 879c0dd49bdSEiji Ota { 880c0dd49bdSEiji Ota ibt_hca_hdl_t hca_hdl; 881c0dd49bdSEiji Ota int i; 882c0dd49bdSEiji Ota int num_sgl; 883c0dd49bdSEiji Ota 884c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_ib_dma_unmap_sg", "rdma_sg: %p", scat); 885c0dd49bdSEiji Ota 886c0dd49bdSEiji Ota if (dev) { 887c0dd49bdSEiji Ota hca_hdl = ib_get_ibt_hca_hdl(dev); 888c0dd49bdSEiji Ota } else { 889c0dd49bdSEiji Ota hca_hdl = scat[0].hca_hdl; 890c0dd49bdSEiji Ota RDSV3_DPRINTF2("rdsv3_ib_dma_unmap_sg_rdma", 891c0dd49bdSEiji Ota "NULL dev use cached hca_hdl %p", hca_hdl); 892c0dd49bdSEiji Ota } 893c0dd49bdSEiji Ota 894c0dd49bdSEiji Ota if (hca_hdl == NULL) 895c0dd49bdSEiji Ota return; 896c0dd49bdSEiji Ota scat[0].hca_hdl = NULL; 897c0dd49bdSEiji Ota 898c0dd49bdSEiji Ota for (i = 0; i < num; i++) { 899c0dd49bdSEiji Ota if (scat[i].mihdl != NULL) { 900c0dd49bdSEiji Ota num_sgl = (scat[i].iovec.bytes / PAGESIZE) + 2; 901c0dd49bdSEiji Ota kmem_free(scat[i].swr.wr_sgl, 902c0dd49bdSEiji Ota (num_sgl * sizeof (ibt_wr_ds_t))); 903c0dd49bdSEiji Ota scat[i].swr.wr_sgl = NULL; 904c0dd49bdSEiji Ota (void) ibt_unmap_mem_iov(hca_hdl, scat[i].mihdl); 905c0dd49bdSEiji Ota scat[i].mihdl = NULL; 906c0dd49bdSEiji Ota } else 907c0dd49bdSEiji Ota break; 908c0dd49bdSEiji Ota } 909c0dd49bdSEiji Ota } 910c0dd49bdSEiji Ota 911c0dd49bdSEiji Ota /* ARGSUSED */ 912c0dd49bdSEiji Ota uint_t 913c0dd49bdSEiji Ota rdsv3_ib_dma_map_sg_rdma(struct ib_device *dev, struct rdsv3_rdma_sg scat[], 914c0dd49bdSEiji Ota uint_t num, struct rdsv3_scatterlist **scatl) 915c0dd49bdSEiji Ota { 916c0dd49bdSEiji Ota ibt_hca_hdl_t hca_hdl; 917c0dd49bdSEiji Ota ibt_iov_attr_t iov_attr; 918c0dd49bdSEiji Ota struct buf *bp; 919c0dd49bdSEiji Ota uint_t i, j, k; 920c0dd49bdSEiji Ota uint_t count; 921c0dd49bdSEiji Ota struct rdsv3_scatterlist *sg; 922c0dd49bdSEiji Ota int ret; 923c0dd49bdSEiji Ota 924c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_ib_dma_map_sg_rdma", "scat: %p, num: %d", 925c0dd49bdSEiji Ota scat, num); 926c0dd49bdSEiji Ota 927c0dd49bdSEiji Ota hca_hdl = ib_get_ibt_hca_hdl(dev); 928c0dd49bdSEiji Ota scat[0].hca_hdl = hca_hdl; 929c0dd49bdSEiji Ota bzero(&iov_attr, sizeof (ibt_iov_attr_t)); 930c0dd49bdSEiji Ota iov_attr.iov_flags = IBT_IOV_BUF; 931c0dd49bdSEiji Ota iov_attr.iov_lso_hdr_sz = 0; 932c0dd49bdSEiji Ota 933c0dd49bdSEiji Ota for (i = 0, count = 0; i < num; i++) { 934c0dd49bdSEiji Ota /* transpose umem_cookie to buf structure */ 935c0dd49bdSEiji Ota bp = ddi_umem_iosetup(scat[i].umem_cookie, 936c0dd49bdSEiji Ota scat[i].iovec.addr & PAGEOFFSET, scat[i].iovec.bytes, 937c0dd49bdSEiji Ota B_WRITE, 0, 0, NULL, DDI_UMEM_SLEEP); 938c0dd49bdSEiji Ota if (bp == NULL) { 939c0dd49bdSEiji Ota /* free resources and return error */ 940c0dd49bdSEiji Ota goto out; 941c0dd49bdSEiji Ota } 942c0dd49bdSEiji Ota /* setup ibt_map_mem_iov() attributes */ 943c0dd49bdSEiji Ota iov_attr.iov_buf = bp; 944c0dd49bdSEiji Ota iov_attr.iov_wr_nds = (scat[i].iovec.bytes / PAGESIZE) + 2; 945c0dd49bdSEiji Ota scat[i].swr.wr_sgl = 946c0dd49bdSEiji Ota kmem_zalloc(iov_attr.iov_wr_nds * sizeof (ibt_wr_ds_t), 947c0dd49bdSEiji Ota KM_SLEEP); 948c0dd49bdSEiji Ota 949c0dd49bdSEiji Ota ret = ibt_map_mem_iov(hca_hdl, &iov_attr, 950c0dd49bdSEiji Ota (ibt_all_wr_t *)&scat[i].swr, &scat[i].mihdl); 951c0dd49bdSEiji Ota freerbuf(bp); 952c0dd49bdSEiji Ota if (ret != IBT_SUCCESS) { 953c0dd49bdSEiji Ota RDSV3_DPRINTF2("rdsv3_ib_dma_map_sg_rdma", 954c0dd49bdSEiji Ota "ibt_map_mem_iov returned: %d", ret); 955c0dd49bdSEiji Ota /* free resources and return error */ 956c0dd49bdSEiji Ota kmem_free(scat[i].swr.wr_sgl, 957c0dd49bdSEiji Ota iov_attr.iov_wr_nds * sizeof (ibt_wr_ds_t)); 958c0dd49bdSEiji Ota goto out; 959c0dd49bdSEiji Ota } 960c0dd49bdSEiji Ota count += scat[i].swr.wr_nds; 961c0dd49bdSEiji Ota 962c0dd49bdSEiji Ota #ifdef DEBUG 963c0dd49bdSEiji Ota for (j = 0; j < scat[i].swr.wr_nds; j++) { 964c0dd49bdSEiji Ota RDSV3_DPRINTF5("rdsv3_ib_dma_map_sg_rdma", 965c0dd49bdSEiji Ota "sgl[%d] va %llx len %x", j, 966c0dd49bdSEiji Ota scat[i].swr.wr_sgl[j].ds_va, 967c0dd49bdSEiji Ota scat[i].swr.wr_sgl[j].ds_len); 968c0dd49bdSEiji Ota } 969c0dd49bdSEiji Ota #endif 970c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_ib_dma_map_sg_rdma", 971c0dd49bdSEiji Ota "iovec.bytes: 0x%x scat[%d]swr.wr_nds: %d", 972c0dd49bdSEiji Ota scat[i].iovec.bytes, i, scat[i].swr.wr_nds); 973c0dd49bdSEiji Ota } 974c0dd49bdSEiji Ota 975c0dd49bdSEiji Ota count = ((count - 1) / RDSV3_IB_MAX_SGE) + 1; 976c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_ib_dma_map_sg_rdma", "Ret: num: %d", count); 977c0dd49bdSEiji Ota return (count); 978c0dd49bdSEiji Ota 979c0dd49bdSEiji Ota out: 980c0dd49bdSEiji Ota rdsv3_ib_dma_unmap_sg_rdma(dev, num, scat); 981c0dd49bdSEiji Ota return (0); 982c0dd49bdSEiji Ota } 983c0dd49bdSEiji Ota 984c0dd49bdSEiji Ota int 985c0dd49bdSEiji Ota rdsv3_ib_xmit_rdma(struct rdsv3_connection *conn, struct rdsv3_rdma_op *op) 986c0dd49bdSEiji Ota { 987c0dd49bdSEiji Ota struct rdsv3_ib_connection *ic = conn->c_transport_data; 988c0dd49bdSEiji Ota struct rdsv3_ib_send_work *send = NULL; 989c0dd49bdSEiji Ota struct rdsv3_rdma_sg *scat; 990c0dd49bdSEiji Ota uint64_t remote_addr; 991c0dd49bdSEiji Ota uint32_t pos; 992c0dd49bdSEiji Ota uint32_t work_alloc; 993c0dd49bdSEiji Ota uint32_t i, j, k, idx; 994c0dd49bdSEiji Ota uint32_t left, count; 995c0dd49bdSEiji Ota uint32_t posted; 996c0dd49bdSEiji Ota int sent; 997c0dd49bdSEiji Ota ibt_status_t status; 998c0dd49bdSEiji Ota ibt_send_wr_t *wr; 999c0dd49bdSEiji Ota ibt_wr_ds_t *sge; 1000c0dd49bdSEiji Ota 1001c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_ib_xmit_rdma", "rdsv3_ib_conn: %p", ic); 1002c0dd49bdSEiji Ota 1003c0dd49bdSEiji Ota /* map the message the first time we see it */ 1004c0dd49bdSEiji Ota if (!op->r_mapped) { 1005c0dd49bdSEiji Ota op->r_count = rdsv3_ib_dma_map_sg_rdma(ic->i_cm_id->device, 1006c0dd49bdSEiji Ota op->r_rdma_sg, op->r_nents, &op->r_sg); 1007c0dd49bdSEiji Ota RDSV3_DPRINTF5("rdsv3_ib_xmit_rdma", "ic %p mapping op %p: %d", 1008c0dd49bdSEiji Ota ic, op, op->r_count); 1009c0dd49bdSEiji Ota if (op->r_count == 0) { 1010c0dd49bdSEiji Ota rdsv3_ib_stats_inc(s_ib_tx_sg_mapping_failure); 1011c0dd49bdSEiji Ota RDSV3_DPRINTF2("rdsv3_ib_xmit_rdma", 1012c0dd49bdSEiji Ota "fail: ic %p mapping op %p: %d", 1013c0dd49bdSEiji Ota ic, op, op->r_count); 1014c0dd49bdSEiji Ota return (-ENOMEM); /* XXX ? */ 1015c0dd49bdSEiji Ota } 1016c0dd49bdSEiji Ota op->r_mapped = 1; 1017c0dd49bdSEiji Ota } 1018c0dd49bdSEiji Ota 1019c0dd49bdSEiji Ota /* 1020c0dd49bdSEiji Ota * Instead of knowing how to return a partial rdma read/write 1021c0dd49bdSEiji Ota * we insist that there 1022c0dd49bdSEiji Ota * be enough work requests to send the entire message. 1023c0dd49bdSEiji Ota */ 1024c0dd49bdSEiji Ota work_alloc = rdsv3_ib_ring_alloc(&ic->i_send_ring, op->r_count, &pos); 1025c0dd49bdSEiji Ota if (work_alloc != op->r_count) { 1026c0dd49bdSEiji Ota rdsv3_ib_ring_unalloc(&ic->i_send_ring, work_alloc); 1027c0dd49bdSEiji Ota rdsv3_ib_stats_inc(s_ib_tx_ring_full); 1028c0dd49bdSEiji Ota return (-ENOMEM); 1029c0dd49bdSEiji Ota } 1030c0dd49bdSEiji Ota 10315d5562f5SEiji Ota RDSV3_DPRINTF4("rdsv3_ib_xmit_rdma", "pos %u cnt %u", pos, op->r_count); 1032c0dd49bdSEiji Ota /* 1033c0dd49bdSEiji Ota * take the scatter list and transpose into a list of 1034c0dd49bdSEiji Ota * send wr's each with a scatter list of RDSV3_IB_MAX_SGE 1035c0dd49bdSEiji Ota */ 1036c0dd49bdSEiji Ota scat = &op->r_rdma_sg[0]; 1037c0dd49bdSEiji Ota sent = 0; 1038c0dd49bdSEiji Ota remote_addr = op->r_remote_addr; 1039c0dd49bdSEiji Ota 1040c0dd49bdSEiji Ota for (i = 0, k = 0; i < op->r_nents; i++) { 1041c0dd49bdSEiji Ota left = scat[i].swr.wr_nds; 1042c0dd49bdSEiji Ota for (idx = 0; left > 0; k++) { 1043c0dd49bdSEiji Ota send = &ic->i_sends[pos]; 1044c0dd49bdSEiji Ota send->s_queued = jiffies; 1045c0dd49bdSEiji Ota send->s_opcode = op->r_write ? IBT_WRC_RDMAW : 1046c0dd49bdSEiji Ota IBT_WRC_RDMAR; 1047c0dd49bdSEiji Ota send->s_op = op; 1048c0dd49bdSEiji Ota 1049c0dd49bdSEiji Ota wr = &ic->i_send_wrs[k]; 1050c0dd49bdSEiji Ota wr->wr_flags = 0; 10515d5562f5SEiji Ota wr->wr_id = pos | RDSV3_IB_SEND_OP; 1052c0dd49bdSEiji Ota wr->wr_trans = IBT_RC_SRV; 1053c0dd49bdSEiji Ota wr->wr_opcode = op->r_write ? IBT_WRC_RDMAW : 1054c0dd49bdSEiji Ota IBT_WRC_RDMAR; 1055c0dd49bdSEiji Ota wr->wr.rc.rcwr.rdma.rdma_raddr = remote_addr; 1056c0dd49bdSEiji Ota wr->wr.rc.rcwr.rdma.rdma_rkey = op->r_key; 1057c0dd49bdSEiji Ota 1058c0dd49bdSEiji Ota if (left > RDSV3_IB_MAX_SGE) { 1059c0dd49bdSEiji Ota count = RDSV3_IB_MAX_SGE; 1060c0dd49bdSEiji Ota left -= RDSV3_IB_MAX_SGE; 1061c0dd49bdSEiji Ota } else { 1062c0dd49bdSEiji Ota count = left; 1063c0dd49bdSEiji Ota left = 0; 1064c0dd49bdSEiji Ota } 1065c0dd49bdSEiji Ota wr->wr_nds = count; 1066c0dd49bdSEiji Ota 1067c0dd49bdSEiji Ota for (j = 0; j < count; j++) { 1068c0dd49bdSEiji Ota sge = &wr->wr_sgl[j]; 1069c0dd49bdSEiji Ota *sge = scat[i].swr.wr_sgl[idx]; 1070c0dd49bdSEiji Ota remote_addr += scat[i].swr.wr_sgl[idx].ds_len; 1071c0dd49bdSEiji Ota sent += scat[i].swr.wr_sgl[idx].ds_len; 1072c0dd49bdSEiji Ota idx++; 10735d5562f5SEiji Ota RDSV3_DPRINTF5("xmit_rdma", 1074c0dd49bdSEiji Ota "send_wrs[%d]sgl[%d] va %llx len %x", 1075c0dd49bdSEiji Ota k, j, sge->ds_va, sge->ds_len); 1076c0dd49bdSEiji Ota } 10775d5562f5SEiji Ota RDSV3_DPRINTF5("rdsv3_ib_xmit_rdma", 1078c0dd49bdSEiji Ota "wr[%d] %p key: %x code: %d tlen: %d", 1079c0dd49bdSEiji Ota k, wr, wr->wr.rc.rcwr.rdma.rdma_rkey, 1080c0dd49bdSEiji Ota wr->wr_opcode, sent); 1081c0dd49bdSEiji Ota 1082c0dd49bdSEiji Ota /* 1083c0dd49bdSEiji Ota * We want to delay signaling completions just enough 1084c0dd49bdSEiji Ota * to get the batching benefits but not so much that 1085c0dd49bdSEiji Ota * we create dead time on the wire. 1086c0dd49bdSEiji Ota */ 1087c0dd49bdSEiji Ota if (ic->i_unsignaled_wrs-- == 0) { 1088c0dd49bdSEiji Ota ic->i_unsignaled_wrs = 1089c0dd49bdSEiji Ota rdsv3_ib_sysctl_max_unsig_wrs; 1090c0dd49bdSEiji Ota wr->wr_flags = IBT_WR_SEND_SIGNAL; 1091c0dd49bdSEiji Ota } 1092c0dd49bdSEiji Ota 1093c0dd49bdSEiji Ota pos = (pos + 1) % ic->i_send_ring.w_nr; 1094c0dd49bdSEiji Ota } 1095c0dd49bdSEiji Ota } 1096c0dd49bdSEiji Ota 1097c0dd49bdSEiji Ota status = ibt_post_send(ib_get_ibt_channel_hdl(ic->i_cm_id), 1098c0dd49bdSEiji Ota ic->i_send_wrs, k, &posted); 1099c0dd49bdSEiji Ota if (status != IBT_SUCCESS) { 11006e18d381Sagiri RDSV3_DPRINTF2("rdsv3_ib_xmit_rdma", 1101cadbfdc3SEiji Ota "RDS/IB: rdma ib_post_send to %u.%u.%u.%u " 1102cadbfdc3SEiji Ota "returned %d", NIPQUAD(conn->c_faddr), status); 1103c0dd49bdSEiji Ota rdsv3_ib_ring_unalloc(&ic->i_send_ring, work_alloc); 1104c0dd49bdSEiji Ota } 11055d5562f5SEiji Ota RDSV3_DPRINTF4("rdsv3_ib_xmit_rdma", "Ret: %p", ic); 1106c0dd49bdSEiji Ota return (status); 1107c0dd49bdSEiji Ota } 1108c0dd49bdSEiji Ota 1109c0dd49bdSEiji Ota void 1110c0dd49bdSEiji Ota rdsv3_ib_xmit_complete(struct rdsv3_connection *conn) 1111c0dd49bdSEiji Ota { 1112c0dd49bdSEiji Ota struct rdsv3_ib_connection *ic = conn->c_transport_data; 1113c0dd49bdSEiji Ota 1114c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_ib_xmit_complete", "conn: %p", conn); 1115c0dd49bdSEiji Ota 1116c0dd49bdSEiji Ota /* 1117c0dd49bdSEiji Ota * We may have a pending ACK or window update we were unable 1118c0dd49bdSEiji Ota * to send previously (due to flow control). Try again. 1119c0dd49bdSEiji Ota */ 1120c0dd49bdSEiji Ota rdsv3_ib_attempt_ack(ic); 1121c0dd49bdSEiji Ota } 1122