/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * Copyright (c) 2006 Oracle.  All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
55c0dd49bdSEiji Ota * 56c0dd49bdSEiji Ota */ 57c0dd49bdSEiji Ota #include <sys/rds.h> 58c0dd49bdSEiji Ota 59c0dd49bdSEiji Ota #include <sys/ib/clients/rdsv3/rdsv3.h> 60c0dd49bdSEiji Ota #include <sys/ib/clients/rdsv3/rdma.h> 61c0dd49bdSEiji Ota #include <sys/ib/clients/rdsv3/ib.h> 62c0dd49bdSEiji Ota #include <sys/ib/clients/rdsv3/rdsv3_debug.h> 63c0dd49bdSEiji Ota 64c0dd49bdSEiji Ota static void 65c0dd49bdSEiji Ota rdsv3_ib_send_rdma_complete(struct rdsv3_message *rm, 66c0dd49bdSEiji Ota int wc_status) 67c0dd49bdSEiji Ota { 68c0dd49bdSEiji Ota int notify_status; 69c0dd49bdSEiji Ota 70c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_ib_send_rdma_complete", "rm: %p, wc_status: %d", 71c0dd49bdSEiji Ota rm, wc_status); 72c0dd49bdSEiji Ota 73c0dd49bdSEiji Ota switch (wc_status) { 74c0dd49bdSEiji Ota case IBT_WC_WR_FLUSHED_ERR: 75c0dd49bdSEiji Ota return; 76c0dd49bdSEiji Ota 77c0dd49bdSEiji Ota case IBT_WC_SUCCESS: 78c0dd49bdSEiji Ota notify_status = RDSV3_RDMA_SUCCESS; 79c0dd49bdSEiji Ota break; 80c0dd49bdSEiji Ota 81c0dd49bdSEiji Ota case IBT_WC_REMOTE_ACCESS_ERR: 82c0dd49bdSEiji Ota notify_status = RDSV3_RDMA_REMOTE_ERROR; 83c0dd49bdSEiji Ota break; 84c0dd49bdSEiji Ota 85c0dd49bdSEiji Ota default: 86c0dd49bdSEiji Ota notify_status = RDSV3_RDMA_OTHER_ERROR; 87c0dd49bdSEiji Ota break; 88c0dd49bdSEiji Ota } 89c0dd49bdSEiji Ota rdsv3_rdma_send_complete(rm, notify_status); 90c0dd49bdSEiji Ota 91c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_ib_send_rdma_complete", "rm: %p, wc_status: %d", 92c0dd49bdSEiji Ota rm, wc_status); 93c0dd49bdSEiji Ota } 94c0dd49bdSEiji Ota 95c0dd49bdSEiji Ota static void rdsv3_ib_dma_unmap_sg_rdma(struct ib_device *dev, 96c0dd49bdSEiji Ota uint_t num, struct rdsv3_rdma_sg scat[]); 97c0dd49bdSEiji Ota 98c0dd49bdSEiji Ota void 99c0dd49bdSEiji Ota rdsv3_ib_send_unmap_rdma(struct rdsv3_ib_connection *ic, 100c0dd49bdSEiji Ota struct rdsv3_rdma_op *op) 101c0dd49bdSEiji Ota { 102c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_ib_send_unmap_rdma", "ic: %p, op: %p", ic, op); 
103c0dd49bdSEiji Ota if (op->r_mapped) { 104c0dd49bdSEiji Ota op->r_mapped = 0; 105c0dd49bdSEiji Ota if (ic->i_cm_id) { 106c0dd49bdSEiji Ota rdsv3_ib_dma_unmap_sg_rdma(ic->i_cm_id->device, 107c0dd49bdSEiji Ota op->r_nents, op->r_rdma_sg); 108c0dd49bdSEiji Ota } else { 109c0dd49bdSEiji Ota rdsv3_ib_dma_unmap_sg_rdma((struct ib_device *)NULL, 110c0dd49bdSEiji Ota op->r_nents, op->r_rdma_sg); 111c0dd49bdSEiji Ota } 112c0dd49bdSEiji Ota } 113c0dd49bdSEiji Ota } 114c0dd49bdSEiji Ota 115c0dd49bdSEiji Ota static void 116c0dd49bdSEiji Ota rdsv3_ib_send_unmap_rm(struct rdsv3_ib_connection *ic, 117c0dd49bdSEiji Ota struct rdsv3_ib_send_work *send, 118c0dd49bdSEiji Ota int wc_status) 119c0dd49bdSEiji Ota { 120c0dd49bdSEiji Ota struct rdsv3_message *rm = send->s_rm; 121c0dd49bdSEiji Ota 122c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_ib_send_unmap_rm", "ic %p send %p rm %p\n", 123c0dd49bdSEiji Ota ic, send, rm); 124c0dd49bdSEiji Ota 125*5d5562f5SEiji Ota mutex_enter(&rm->m_rs_lock); 126*5d5562f5SEiji Ota if (rm->m_count) { 127c0dd49bdSEiji Ota rdsv3_ib_dma_unmap_sg(ic->i_cm_id->device, 128*5d5562f5SEiji Ota rm->m_sg, rm->m_count); 129*5d5562f5SEiji Ota rm->m_count = 0; 130*5d5562f5SEiji Ota } 131*5d5562f5SEiji Ota mutex_exit(&rm->m_rs_lock); 132c0dd49bdSEiji Ota 133c0dd49bdSEiji Ota if (rm->m_rdma_op != NULL) { 134c0dd49bdSEiji Ota rdsv3_ib_send_unmap_rdma(ic, rm->m_rdma_op); 135c0dd49bdSEiji Ota 136c0dd49bdSEiji Ota /* 137c0dd49bdSEiji Ota * If the user asked for a completion notification on this 138c0dd49bdSEiji Ota * message, we can implement three different semantics: 139c0dd49bdSEiji Ota * 1. Notify when we received the ACK on the RDS message 140c0dd49bdSEiji Ota * that was queued with the RDMA. This provides reliable 141c0dd49bdSEiji Ota * notification of RDMA status at the expense of a one-way 142c0dd49bdSEiji Ota * packet delay. 143c0dd49bdSEiji Ota * 2. Notify when the IB stack gives us the completion 144c0dd49bdSEiji Ota * event for the RDMA operation. 
145c0dd49bdSEiji Ota * 3. Notify when the IB stack gives us the completion 146c0dd49bdSEiji Ota * event for the accompanying RDS messages. 147c0dd49bdSEiji Ota * Here, we implement approach #3. To implement approach #2, 148c0dd49bdSEiji Ota * call rdsv3_rdma_send_complete from the cq_handler. 149c0dd49bdSEiji Ota * To implement #1, 150c0dd49bdSEiji Ota * don't call rdsv3_rdma_send_complete at all, and fall back to 151c0dd49bdSEiji Ota * the notify 152c0dd49bdSEiji Ota * handling in the ACK processing code. 153c0dd49bdSEiji Ota * 154c0dd49bdSEiji Ota * Note: There's no need to explicitly sync any RDMA buffers 155c0dd49bdSEiji Ota * using 156c0dd49bdSEiji Ota * ib_dma_sync_sg_for_cpu - the completion for the RDMA 157c0dd49bdSEiji Ota * operation itself unmapped the RDMA buffers, which takes care 158c0dd49bdSEiji Ota * of synching. 159c0dd49bdSEiji Ota */ 160c0dd49bdSEiji Ota rdsv3_ib_send_rdma_complete(rm, wc_status); 161c0dd49bdSEiji Ota 162c0dd49bdSEiji Ota if (rm->m_rdma_op->r_write) 163c0dd49bdSEiji Ota rdsv3_stats_add(s_send_rdma_bytes, 164c0dd49bdSEiji Ota rm->m_rdma_op->r_bytes); 165c0dd49bdSEiji Ota else 166c0dd49bdSEiji Ota rdsv3_stats_add(s_recv_rdma_bytes, 167c0dd49bdSEiji Ota rm->m_rdma_op->r_bytes); 168c0dd49bdSEiji Ota } 169c0dd49bdSEiji Ota 170c0dd49bdSEiji Ota /* 171c0dd49bdSEiji Ota * If anyone waited for this message to get flushed out, wake 172c0dd49bdSEiji Ota * them up now 173c0dd49bdSEiji Ota */ 174c0dd49bdSEiji Ota rdsv3_message_unmapped(rm); 175c0dd49bdSEiji Ota 176c0dd49bdSEiji Ota rdsv3_message_put(rm); 177c0dd49bdSEiji Ota send->s_rm = NULL; 178c0dd49bdSEiji Ota } 179c0dd49bdSEiji Ota 180c0dd49bdSEiji Ota void 181c0dd49bdSEiji Ota rdsv3_ib_send_init_ring(struct rdsv3_ib_connection *ic) 182c0dd49bdSEiji Ota { 183c0dd49bdSEiji Ota struct rdsv3_ib_send_work *send; 184c0dd49bdSEiji Ota uint32_t i; 185c0dd49bdSEiji Ota 186c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_ib_send_init_ring", "ic: %p", ic); 187c0dd49bdSEiji Ota 188c0dd49bdSEiji Ota for (i = 
0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) { 189c0dd49bdSEiji Ota send->s_rm = NULL; 190c0dd49bdSEiji Ota send->s_op = NULL; 191c0dd49bdSEiji Ota } 192c0dd49bdSEiji Ota } 193c0dd49bdSEiji Ota 194c0dd49bdSEiji Ota void 195c0dd49bdSEiji Ota rdsv3_ib_send_clear_ring(struct rdsv3_ib_connection *ic) 196c0dd49bdSEiji Ota { 197c0dd49bdSEiji Ota struct rdsv3_ib_send_work *send; 198c0dd49bdSEiji Ota uint32_t i; 199c0dd49bdSEiji Ota 200c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_ib_send_clear_ring", "ic: %p", ic); 201c0dd49bdSEiji Ota 202c0dd49bdSEiji Ota for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) { 203c0dd49bdSEiji Ota if (send->s_opcode == 0xdd) 204c0dd49bdSEiji Ota continue; 205c0dd49bdSEiji Ota if (send->s_rm) 206c0dd49bdSEiji Ota rdsv3_ib_send_unmap_rm(ic, send, IBT_WC_WR_FLUSHED_ERR); 207c0dd49bdSEiji Ota if (send->s_op) 208c0dd49bdSEiji Ota rdsv3_ib_send_unmap_rdma(ic, send->s_op); 209c0dd49bdSEiji Ota } 210c0dd49bdSEiji Ota 211c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_ib_send_clear_ring", "Return: ic: %p", ic); 212c0dd49bdSEiji Ota } 213c0dd49bdSEiji Ota 214c0dd49bdSEiji Ota /* 215c0dd49bdSEiji Ota * The _oldest/_free ring operations here race cleanly with the alloc/unalloc 216c0dd49bdSEiji Ota * operations performed in the send path. As the sender allocs and potentially 217c0dd49bdSEiji Ota * unallocs the next free entry in the ring it doesn't alter which is 218c0dd49bdSEiji Ota * the next to be freed, which is what this is concerned with. 
219c0dd49bdSEiji Ota */ 220c0dd49bdSEiji Ota void 221*5d5562f5SEiji Ota rdsv3_ib_send_cqe_handler(struct rdsv3_ib_connection *ic, ibt_wc_t *wc) 222c0dd49bdSEiji Ota { 223*5d5562f5SEiji Ota struct rdsv3_connection *conn = ic->conn; 224c0dd49bdSEiji Ota struct rdsv3_ib_send_work *send; 225c0dd49bdSEiji Ota uint32_t completed, polled; 226c0dd49bdSEiji Ota uint32_t oldest; 227c0dd49bdSEiji Ota uint32_t i = 0; 228c0dd49bdSEiji Ota int ret; 229c0dd49bdSEiji Ota 230*5d5562f5SEiji Ota RDSV3_DPRINTF4("rdsv3_ib_send_cqe_handler", 231*5d5562f5SEiji Ota "wc wc_id 0x%llx status %u byte_len %u imm_data %u\n", 232*5d5562f5SEiji Ota (unsigned long long)wc->wc_id, wc->wc_status, 233*5d5562f5SEiji Ota wc->wc_bytes_xfer, ntohl(wc->wc_immed_data)); 234c0dd49bdSEiji Ota 235c0dd49bdSEiji Ota rdsv3_ib_stats_inc(s_ib_tx_cq_event); 236c0dd49bdSEiji Ota 237*5d5562f5SEiji Ota if (wc->wc_id == RDSV3_IB_ACK_WR_ID) { 238c0dd49bdSEiji Ota if (ic->i_ack_queued + HZ/2 < jiffies) 239c0dd49bdSEiji Ota rdsv3_ib_stats_inc(s_ib_tx_stalled); 240c0dd49bdSEiji Ota rdsv3_ib_ack_send_complete(ic); 241*5d5562f5SEiji Ota return; 242c0dd49bdSEiji Ota } 243c0dd49bdSEiji Ota 244c0dd49bdSEiji Ota oldest = rdsv3_ib_ring_oldest(&ic->i_send_ring); 245c0dd49bdSEiji Ota 246c0dd49bdSEiji Ota completed = rdsv3_ib_ring_completed(&ic->i_send_ring, 247*5d5562f5SEiji Ota (wc->wc_id & ~RDSV3_IB_SEND_OP), oldest); 248c0dd49bdSEiji Ota 249c0dd49bdSEiji Ota for (i = 0; i < completed; i++) { 250c0dd49bdSEiji Ota send = &ic->i_sends[oldest]; 251c0dd49bdSEiji Ota 252c0dd49bdSEiji Ota /* 253*5d5562f5SEiji Ota * In the error case, wc->opcode sometimes contains 254c0dd49bdSEiji Ota * garbage 255c0dd49bdSEiji Ota */ 256c0dd49bdSEiji Ota switch (send->s_opcode) { 257c0dd49bdSEiji Ota case IBT_WRC_SEND: 258c0dd49bdSEiji Ota if (send->s_rm) 259c0dd49bdSEiji Ota rdsv3_ib_send_unmap_rm(ic, send, 260*5d5562f5SEiji Ota wc->wc_status); 261c0dd49bdSEiji Ota break; 262c0dd49bdSEiji Ota case IBT_WRC_RDMAW: 263c0dd49bdSEiji Ota case 
IBT_WRC_RDMAR: 264c0dd49bdSEiji Ota /* 265c0dd49bdSEiji Ota * Nothing to be done - the SG list will 266c0dd49bdSEiji Ota * be unmapped 267c0dd49bdSEiji Ota * when the SEND completes. 268c0dd49bdSEiji Ota */ 269c0dd49bdSEiji Ota break; 270c0dd49bdSEiji Ota default: 271c0dd49bdSEiji Ota #ifndef __lock_lint 2726e18d381Sagiri RDSV3_DPRINTF2("rdsv3_ib_send_cq_comp_handler", 273c0dd49bdSEiji Ota "RDS/IB: %s: unexpected opcode " 274c0dd49bdSEiji Ota "0x%x in WR!", 275c0dd49bdSEiji Ota __func__, send->s_opcode); 276c0dd49bdSEiji Ota #endif 277c0dd49bdSEiji Ota break; 278c0dd49bdSEiji Ota } 279c0dd49bdSEiji Ota 280c0dd49bdSEiji Ota send->s_opcode = 0xdd; 281c0dd49bdSEiji Ota if (send->s_queued + HZ/2 < jiffies) 282c0dd49bdSEiji Ota rdsv3_ib_stats_inc(s_ib_tx_stalled); 283c0dd49bdSEiji Ota 284c0dd49bdSEiji Ota /* 285c0dd49bdSEiji Ota * If a RDMA operation produced an error, signal 286c0dd49bdSEiji Ota * this right 287c0dd49bdSEiji Ota * away. If we don't, the subsequent SEND that goes 288c0dd49bdSEiji Ota * with this 289c0dd49bdSEiji Ota * RDMA will be canceled with ERR_WFLUSH, and the 290c0dd49bdSEiji Ota * application 291c0dd49bdSEiji Ota * never learn that the RDMA failed. 
292c0dd49bdSEiji Ota */ 293*5d5562f5SEiji Ota if (wc->wc_status == 294c0dd49bdSEiji Ota IBT_WC_REMOTE_ACCESS_ERR && send->s_op) { 295c0dd49bdSEiji Ota struct rdsv3_message *rm; 296c0dd49bdSEiji Ota 297c0dd49bdSEiji Ota rm = rdsv3_send_get_message(conn, send->s_op); 298c0dd49bdSEiji Ota if (rm) { 299c0dd49bdSEiji Ota if (rm->m_rdma_op != NULL) 300c0dd49bdSEiji Ota rdsv3_ib_send_unmap_rdma(ic, 301c0dd49bdSEiji Ota rm->m_rdma_op); 302c0dd49bdSEiji Ota rdsv3_ib_send_rdma_complete(rm, 303*5d5562f5SEiji Ota wc->wc_status); 304c0dd49bdSEiji Ota rdsv3_message_put(rm); 305c0dd49bdSEiji Ota } 306c0dd49bdSEiji Ota } 307c0dd49bdSEiji Ota 308c0dd49bdSEiji Ota oldest = (oldest + 1) % ic->i_send_ring.w_nr; 309c0dd49bdSEiji Ota } 310c0dd49bdSEiji Ota 311c0dd49bdSEiji Ota rdsv3_ib_ring_free(&ic->i_send_ring, completed); 312c0dd49bdSEiji Ota 313*5d5562f5SEiji Ota clear_bit(RDSV3_LL_SEND_FULL, &conn->c_flags); 314c0dd49bdSEiji Ota 315c0dd49bdSEiji Ota /* We expect errors as the qp is drained during shutdown */ 316*5d5562f5SEiji Ota if (wc->wc_status != IBT_WC_SUCCESS && rdsv3_conn_up(conn)) { 317*5d5562f5SEiji Ota RDSV3_DPRINTF2("rdsv3_ib_send_cqe_handler", 318c0dd49bdSEiji Ota "send completion on %u.%u.%u.%u " 319c0dd49bdSEiji Ota "had status %u, disconnecting and reconnecting\n", 320*5d5562f5SEiji Ota NIPQUAD(conn->c_faddr), wc->wc_status); 321c0dd49bdSEiji Ota rdsv3_conn_drop(conn); 322c0dd49bdSEiji Ota } 323c0dd49bdSEiji Ota 324*5d5562f5SEiji Ota RDSV3_DPRINTF4("rdsv3_ib_send_cqe_handler", "Return: conn: %p", ic); 325c0dd49bdSEiji Ota } 326c0dd49bdSEiji Ota 327c0dd49bdSEiji Ota /* 328c0dd49bdSEiji Ota * This is the main function for allocating credits when sending 329c0dd49bdSEiji Ota * messages. 330c0dd49bdSEiji Ota * 331c0dd49bdSEiji Ota * Conceptually, we have two counters: 332c0dd49bdSEiji Ota * - send credits: this tells us how many WRs we're allowed 333c0dd49bdSEiji Ota * to submit without overruning the reciever's queue. 
For 334c0dd49bdSEiji Ota * each SEND WR we post, we decrement this by one. 335c0dd49bdSEiji Ota * 336c0dd49bdSEiji Ota * - posted credits: this tells us how many WRs we recently 337c0dd49bdSEiji Ota * posted to the receive queue. This value is transferred 338c0dd49bdSEiji Ota * to the peer as a "credit update" in a RDS header field. 339c0dd49bdSEiji Ota * Every time we transmit credits to the peer, we subtract 340c0dd49bdSEiji Ota * the amount of transferred credits from this counter. 341c0dd49bdSEiji Ota * 342c0dd49bdSEiji Ota * It is essential that we avoid situations where both sides have 343c0dd49bdSEiji Ota * exhausted their send credits, and are unable to send new credits 344c0dd49bdSEiji Ota * to the peer. We achieve this by requiring that we send at least 345c0dd49bdSEiji Ota * one credit update to the peer before exhausting our credits. 346c0dd49bdSEiji Ota * When new credits arrive, we subtract one credit that is withheld 347c0dd49bdSEiji Ota * until we've posted new buffers and are ready to transmit these 348c0dd49bdSEiji Ota * credits (see rdsv3_ib_send_add_credits below). 349c0dd49bdSEiji Ota * 350c0dd49bdSEiji Ota * The RDS send code is essentially single-threaded; rdsv3_send_xmit 351c0dd49bdSEiji Ota * grabs c_send_lock to ensure exclusive access to the send ring. 352c0dd49bdSEiji Ota * However, the ACK sending code is independent and can race with 353c0dd49bdSEiji Ota * message SENDs. 354c0dd49bdSEiji Ota * 355c0dd49bdSEiji Ota * In the send path, we need to update the counters for send credits 356c0dd49bdSEiji Ota * and the counter of posted buffers atomically - when we use the 357c0dd49bdSEiji Ota * last available credit, we cannot allow another thread to race us 358c0dd49bdSEiji Ota * and grab the posted credits counter. Hence, we have to use a 359c0dd49bdSEiji Ota * spinlock to protect the credit counter, or use atomics. 
360c0dd49bdSEiji Ota * 361c0dd49bdSEiji Ota * Spinlocks shared between the send and the receive path are bad, 362c0dd49bdSEiji Ota * because they create unnecessary delays. An early implementation 363c0dd49bdSEiji Ota * using a spinlock showed a 5% degradation in throughput at some 364c0dd49bdSEiji Ota * loads. 365c0dd49bdSEiji Ota * 366c0dd49bdSEiji Ota * This implementation avoids spinlocks completely, putting both 367c0dd49bdSEiji Ota * counters into a single atomic, and updating that atomic using 368c0dd49bdSEiji Ota * atomic_add (in the receive path, when receiving fresh credits), 369c0dd49bdSEiji Ota * and using atomic_cmpxchg when updating the two counters. 370c0dd49bdSEiji Ota */ 371c0dd49bdSEiji Ota int 372c0dd49bdSEiji Ota rdsv3_ib_send_grab_credits(struct rdsv3_ib_connection *ic, 373cadbfdc3SEiji Ota uint32_t wanted, uint32_t *adv_credits, int need_posted) 374c0dd49bdSEiji Ota { 375c0dd49bdSEiji Ota unsigned int avail, posted, got = 0, advertise; 376c0dd49bdSEiji Ota long oldval, newval; 377c0dd49bdSEiji Ota 378cadbfdc3SEiji Ota RDSV3_DPRINTF4("rdsv3_ib_send_grab_credits", "ic: %p, %d %d %d", 379cadbfdc3SEiji Ota ic, wanted, *adv_credits, need_posted); 380c0dd49bdSEiji Ota 381c0dd49bdSEiji Ota *adv_credits = 0; 382c0dd49bdSEiji Ota if (!ic->i_flowctl) 383c0dd49bdSEiji Ota return (wanted); 384c0dd49bdSEiji Ota 385c0dd49bdSEiji Ota try_again: 386c0dd49bdSEiji Ota advertise = 0; 387c0dd49bdSEiji Ota oldval = newval = atomic_get(&ic->i_credits); 388c0dd49bdSEiji Ota posted = IB_GET_POST_CREDITS(oldval); 389c0dd49bdSEiji Ota avail = IB_GET_SEND_CREDITS(oldval); 390c0dd49bdSEiji Ota 391c0dd49bdSEiji Ota RDSV3_DPRINTF5("rdsv3_ib_send_grab_credits", 392c0dd49bdSEiji Ota "wanted (%u): credits=%u posted=%u\n", wanted, avail, posted); 393c0dd49bdSEiji Ota 394c0dd49bdSEiji Ota /* The last credit must be used to send a credit update. 
*/ 395c0dd49bdSEiji Ota if (avail && !posted) 396c0dd49bdSEiji Ota avail--; 397c0dd49bdSEiji Ota 398c0dd49bdSEiji Ota if (avail < wanted) { 399c0dd49bdSEiji Ota struct rdsv3_connection *conn = ic->i_cm_id->context; 400c0dd49bdSEiji Ota 401c0dd49bdSEiji Ota /* Oops, there aren't that many credits left! */ 402c0dd49bdSEiji Ota set_bit(RDSV3_LL_SEND_FULL, &conn->c_flags); 403c0dd49bdSEiji Ota got = avail; 404c0dd49bdSEiji Ota } else { 405c0dd49bdSEiji Ota /* Sometimes you get what you want, lalala. */ 406c0dd49bdSEiji Ota got = wanted; 407c0dd49bdSEiji Ota } 408c0dd49bdSEiji Ota newval -= IB_SET_SEND_CREDITS(got); 409c0dd49bdSEiji Ota 410c0dd49bdSEiji Ota /* 411c0dd49bdSEiji Ota * If need_posted is non-zero, then the caller wants 412c0dd49bdSEiji Ota * the posted regardless of whether any send credits are 413c0dd49bdSEiji Ota * available. 414c0dd49bdSEiji Ota */ 415c0dd49bdSEiji Ota if (posted && (got || need_posted)) { 416cadbfdc3SEiji Ota advertise = min(posted, RDSV3_MAX_ADV_CREDIT); 417c0dd49bdSEiji Ota newval -= IB_SET_POST_CREDITS(advertise); 418c0dd49bdSEiji Ota } 419c0dd49bdSEiji Ota 420c0dd49bdSEiji Ota /* Finally bill everything */ 421c0dd49bdSEiji Ota if (atomic_cmpxchg(&ic->i_credits, oldval, newval) != oldval) 422c0dd49bdSEiji Ota goto try_again; 423c0dd49bdSEiji Ota 424c0dd49bdSEiji Ota *adv_credits = advertise; 425c0dd49bdSEiji Ota 426cadbfdc3SEiji Ota RDSV3_DPRINTF4("rdsv3_ib_send_grab_credits", "ic: %p, %d %d %d", 427cadbfdc3SEiji Ota ic, got, *adv_credits, need_posted); 428cadbfdc3SEiji Ota 429c0dd49bdSEiji Ota return (got); 430c0dd49bdSEiji Ota } 431c0dd49bdSEiji Ota 432c0dd49bdSEiji Ota void 433c0dd49bdSEiji Ota rdsv3_ib_send_add_credits(struct rdsv3_connection *conn, unsigned int credits) 434c0dd49bdSEiji Ota { 435c0dd49bdSEiji Ota struct rdsv3_ib_connection *ic = conn->c_transport_data; 436c0dd49bdSEiji Ota 437c0dd49bdSEiji Ota if (credits == 0) 438c0dd49bdSEiji Ota return; 439c0dd49bdSEiji Ota 440c0dd49bdSEiji Ota 
RDSV3_DPRINTF5("rdsv3_ib_send_add_credits", 441c0dd49bdSEiji Ota "credits (%u): current=%u%s\n", 442c0dd49bdSEiji Ota credits, 443c0dd49bdSEiji Ota IB_GET_SEND_CREDITS(atomic_get(&ic->i_credits)), 444c0dd49bdSEiji Ota test_bit(RDSV3_LL_SEND_FULL, &conn->c_flags) ? 445c0dd49bdSEiji Ota ", ll_send_full" : ""); 446c0dd49bdSEiji Ota 447c0dd49bdSEiji Ota atomic_add_32(&ic->i_credits, IB_SET_SEND_CREDITS(credits)); 448c0dd49bdSEiji Ota if (test_and_clear_bit(RDSV3_LL_SEND_FULL, &conn->c_flags)) 449c0dd49bdSEiji Ota rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_send_w, 0); 450c0dd49bdSEiji Ota 451c0dd49bdSEiji Ota ASSERT(!(IB_GET_SEND_CREDITS(credits) >= 16384)); 452c0dd49bdSEiji Ota 453c0dd49bdSEiji Ota rdsv3_ib_stats_inc(s_ib_rx_credit_updates); 454c0dd49bdSEiji Ota 455c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_ib_send_add_credits", 456c0dd49bdSEiji Ota "Return: conn: %p, credits: %d", 457c0dd49bdSEiji Ota conn, credits); 458c0dd49bdSEiji Ota } 459c0dd49bdSEiji Ota 460c0dd49bdSEiji Ota void 461c0dd49bdSEiji Ota rdsv3_ib_advertise_credits(struct rdsv3_connection *conn, unsigned int posted) 462c0dd49bdSEiji Ota { 463c0dd49bdSEiji Ota struct rdsv3_ib_connection *ic = conn->c_transport_data; 464c0dd49bdSEiji Ota 465c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_ib_advertise_credits", "conn: %p, posted: %d", 466c0dd49bdSEiji Ota conn, posted); 467c0dd49bdSEiji Ota 468c0dd49bdSEiji Ota if (posted == 0) 469c0dd49bdSEiji Ota return; 470c0dd49bdSEiji Ota 471c0dd49bdSEiji Ota atomic_add_32(&ic->i_credits, IB_SET_POST_CREDITS(posted)); 472c0dd49bdSEiji Ota 473c0dd49bdSEiji Ota /* 474c0dd49bdSEiji Ota * Decide whether to send an update to the peer now. 475c0dd49bdSEiji Ota * If we would send a credit update for every single buffer we 476c0dd49bdSEiji Ota * post, we would end up with an ACK storm (ACK arrives, 477c0dd49bdSEiji Ota * consumes buffer, we refill the ring, send ACK to remote 478c0dd49bdSEiji Ota * advertising the newly posted buffer... 
ad inf) 479c0dd49bdSEiji Ota * 480c0dd49bdSEiji Ota * Performance pretty much depends on how often we send 481c0dd49bdSEiji Ota * credit updates - too frequent updates mean lots of ACKs. 482c0dd49bdSEiji Ota * Too infrequent updates, and the peer will run out of 483c0dd49bdSEiji Ota * credits and has to throttle. 484c0dd49bdSEiji Ota * For the time being, 16 seems to be a good compromise. 485c0dd49bdSEiji Ota */ 486c0dd49bdSEiji Ota if (IB_GET_POST_CREDITS(atomic_get(&ic->i_credits)) >= 16) 487c0dd49bdSEiji Ota set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); 488c0dd49bdSEiji Ota } 489c0dd49bdSEiji Ota 490c0dd49bdSEiji Ota static inline void 491c0dd49bdSEiji Ota rdsv3_ib_xmit_populate_wr(struct rdsv3_ib_connection *ic, 492c0dd49bdSEiji Ota ibt_send_wr_t *wr, unsigned int pos, 493c0dd49bdSEiji Ota struct rdsv3_scatterlist *scat, unsigned int off, unsigned int length, 494c0dd49bdSEiji Ota int send_flags) 495c0dd49bdSEiji Ota { 496c0dd49bdSEiji Ota ibt_wr_ds_t *sge; 497c0dd49bdSEiji Ota 498c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_ib_xmit_populate_wr", 499c0dd49bdSEiji Ota "ic: %p, wr: %p scat: %p %d %d %d %d", 500c0dd49bdSEiji Ota ic, wr, scat, pos, off, length, send_flags); 501c0dd49bdSEiji Ota 502*5d5562f5SEiji Ota wr->wr_id = pos | RDSV3_IB_SEND_OP; 503c0dd49bdSEiji Ota wr->wr_trans = IBT_RC_SRV; 504c0dd49bdSEiji Ota wr->wr_flags = send_flags; 505c0dd49bdSEiji Ota wr->wr_opcode = IBT_WRC_SEND; 506c0dd49bdSEiji Ota 507c0dd49bdSEiji Ota if (length != 0) { 508c0dd49bdSEiji Ota int ix, len, assigned; 509c0dd49bdSEiji Ota ibt_wr_ds_t *sgl; 510c0dd49bdSEiji Ota 511c0dd49bdSEiji Ota ASSERT(length <= scat->length - off); 512c0dd49bdSEiji Ota 513c0dd49bdSEiji Ota sgl = scat->sgl; 514c0dd49bdSEiji Ota if (off != 0) { 515c0dd49bdSEiji Ota /* find the right sgl to begin with */ 516c0dd49bdSEiji Ota while (sgl->ds_len <= off) { 517c0dd49bdSEiji Ota off -= sgl->ds_len; 518c0dd49bdSEiji Ota sgl++; 519c0dd49bdSEiji Ota } 520c0dd49bdSEiji Ota } 521c0dd49bdSEiji Ota 522c0dd49bdSEiji Ota 
ix = 1; /* first data sgl is at 1 */ 523c0dd49bdSEiji Ota assigned = 0; 524c0dd49bdSEiji Ota len = length; 525c0dd49bdSEiji Ota do { 526c0dd49bdSEiji Ota sge = &wr->wr_sgl[ix++]; 527c0dd49bdSEiji Ota sge->ds_va = sgl->ds_va + off; 528c0dd49bdSEiji Ota assigned = min(len, sgl->ds_len - off); 529c0dd49bdSEiji Ota sge->ds_len = assigned; 530c0dd49bdSEiji Ota sge->ds_key = sgl->ds_key; 531c0dd49bdSEiji Ota len -= assigned; 532c0dd49bdSEiji Ota if (len != 0) { 533c0dd49bdSEiji Ota sgl++; 534c0dd49bdSEiji Ota off = 0; 535c0dd49bdSEiji Ota } 536c0dd49bdSEiji Ota } while (len > 0); 537c0dd49bdSEiji Ota 538c0dd49bdSEiji Ota wr->wr_nds = ix; 539c0dd49bdSEiji Ota } else { 540c0dd49bdSEiji Ota /* 541c0dd49bdSEiji Ota * We're sending a packet with no payload. There is only 542c0dd49bdSEiji Ota * one SGE 543c0dd49bdSEiji Ota */ 544c0dd49bdSEiji Ota wr->wr_nds = 1; 545c0dd49bdSEiji Ota } 546c0dd49bdSEiji Ota 547c0dd49bdSEiji Ota sge = &wr->wr_sgl[0]; 548c0dd49bdSEiji Ota sge->ds_va = ic->i_send_hdrs_dma + (pos * sizeof (struct rdsv3_header)); 549c0dd49bdSEiji Ota sge->ds_len = sizeof (struct rdsv3_header); 550c0dd49bdSEiji Ota sge->ds_key = ic->i_mr->lkey; 551c0dd49bdSEiji Ota 552c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_ib_xmit_populate_wr", 553c0dd49bdSEiji Ota "Return: ic: %p, wr: %p scat: %p", ic, wr, scat); 554c0dd49bdSEiji Ota } 555c0dd49bdSEiji Ota 556c0dd49bdSEiji Ota /* 557c0dd49bdSEiji Ota * This can be called multiple times for a given message. The first time 558c0dd49bdSEiji Ota * we see a message we map its scatterlist into the IB device so that 559c0dd49bdSEiji Ota * we can provide that mapped address to the IB scatter gather entries 560c0dd49bdSEiji Ota * in the IB work requests. We translate the scatterlist into a series 561c0dd49bdSEiji Ota * of work requests that fragment the message. These work requests complete 562c0dd49bdSEiji Ota * in order so we pass ownership of the message to the completion handler 563c0dd49bdSEiji Ota * once we send the final fragment. 
564c0dd49bdSEiji Ota * 565c0dd49bdSEiji Ota * The RDS core uses the c_send_lock to only enter this function once 566c0dd49bdSEiji Ota * per connection. This makes sure that the tx ring alloc/unalloc pairs 567c0dd49bdSEiji Ota * don't get out of sync and confuse the ring. 568c0dd49bdSEiji Ota */ 569c0dd49bdSEiji Ota int 570c0dd49bdSEiji Ota rdsv3_ib_xmit(struct rdsv3_connection *conn, struct rdsv3_message *rm, 571c0dd49bdSEiji Ota unsigned int hdr_off, unsigned int sg, unsigned int off) 572c0dd49bdSEiji Ota { 573c0dd49bdSEiji Ota struct rdsv3_ib_connection *ic = conn->c_transport_data; 574c0dd49bdSEiji Ota struct ib_device *dev = ic->i_cm_id->device; 575c0dd49bdSEiji Ota struct rdsv3_ib_send_work *send = NULL; 576c0dd49bdSEiji Ota struct rdsv3_ib_send_work *first; 577c0dd49bdSEiji Ota struct rdsv3_ib_send_work *prev; 578c0dd49bdSEiji Ota ibt_send_wr_t *wr; 579c0dd49bdSEiji Ota struct rdsv3_scatterlist *scat; 580c0dd49bdSEiji Ota uint32_t pos; 581c0dd49bdSEiji Ota uint32_t i; 582c0dd49bdSEiji Ota uint32_t work_alloc; 583c0dd49bdSEiji Ota uint32_t credit_alloc; 584c0dd49bdSEiji Ota uint32_t posted; 585c0dd49bdSEiji Ota uint32_t adv_credits = 0; 586c0dd49bdSEiji Ota int send_flags = 0; 587c0dd49bdSEiji Ota int sent; 588c0dd49bdSEiji Ota int ret; 589c0dd49bdSEiji Ota int flow_controlled = 0; 590c0dd49bdSEiji Ota 591c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_ib_xmit", "conn: %p, rm: %p", conn, rm); 592c0dd49bdSEiji Ota 593c0dd49bdSEiji Ota ASSERT(!(off % RDSV3_FRAG_SIZE)); 594c0dd49bdSEiji Ota ASSERT(!(hdr_off != 0 && hdr_off != sizeof (struct rdsv3_header))); 595c0dd49bdSEiji Ota 596c0dd49bdSEiji Ota /* Do not send cong updates to IB loopback */ 597c0dd49bdSEiji Ota if (conn->c_loopback && 598c0dd49bdSEiji Ota rm->m_inc.i_hdr.h_flags & RDSV3_FLAG_CONG_BITMAP) { 599c0dd49bdSEiji Ota rdsv3_cong_map_updated(conn->c_fcong, ~(uint64_t)0); 600c0dd49bdSEiji Ota return (sizeof (struct rdsv3_header) + RDSV3_CONG_MAP_BYTES); 601c0dd49bdSEiji Ota } 602c0dd49bdSEiji Ota 
603c0dd49bdSEiji Ota #ifndef __lock_lint 604c0dd49bdSEiji Ota /* FIXME we may overallocate here */ 605c0dd49bdSEiji Ota if (ntohl(rm->m_inc.i_hdr.h_len) == 0) 606c0dd49bdSEiji Ota i = 1; 607c0dd49bdSEiji Ota else 608c0dd49bdSEiji Ota i = ceil(ntohl(rm->m_inc.i_hdr.h_len), RDSV3_FRAG_SIZE); 609c0dd49bdSEiji Ota #endif 610c0dd49bdSEiji Ota 611c0dd49bdSEiji Ota work_alloc = rdsv3_ib_ring_alloc(&ic->i_send_ring, i, &pos); 612*5d5562f5SEiji Ota if (work_alloc != i) { 613*5d5562f5SEiji Ota rdsv3_ib_ring_unalloc(&ic->i_send_ring, work_alloc); 614c0dd49bdSEiji Ota set_bit(RDSV3_LL_SEND_FULL, &conn->c_flags); 615c0dd49bdSEiji Ota rdsv3_ib_stats_inc(s_ib_tx_ring_full); 616c0dd49bdSEiji Ota ret = -ENOMEM; 617c0dd49bdSEiji Ota goto out; 618c0dd49bdSEiji Ota } 619c0dd49bdSEiji Ota 620c0dd49bdSEiji Ota credit_alloc = work_alloc; 621c0dd49bdSEiji Ota if (ic->i_flowctl) { 622c0dd49bdSEiji Ota credit_alloc = rdsv3_ib_send_grab_credits(ic, work_alloc, 623cadbfdc3SEiji Ota &posted, 0); 624c0dd49bdSEiji Ota adv_credits += posted; 625c0dd49bdSEiji Ota if (credit_alloc < work_alloc) { 626c0dd49bdSEiji Ota rdsv3_ib_ring_unalloc(&ic->i_send_ring, 627c0dd49bdSEiji Ota work_alloc - credit_alloc); 628c0dd49bdSEiji Ota work_alloc = credit_alloc; 629c0dd49bdSEiji Ota flow_controlled++; 630c0dd49bdSEiji Ota } 631c0dd49bdSEiji Ota if (work_alloc == 0) { 632cadbfdc3SEiji Ota rdsv3_ib_ring_unalloc(&ic->i_send_ring, work_alloc); 633c0dd49bdSEiji Ota rdsv3_ib_stats_inc(s_ib_tx_throttle); 634c0dd49bdSEiji Ota ret = -ENOMEM; 635c0dd49bdSEiji Ota goto out; 636c0dd49bdSEiji Ota } 637c0dd49bdSEiji Ota } 638c0dd49bdSEiji Ota 639c0dd49bdSEiji Ota /* map the message the first time we see it */ 640c0dd49bdSEiji Ota if (ic->i_rm == NULL) { 641c0dd49bdSEiji Ota /* 642c0dd49bdSEiji Ota * printk(KERN_NOTICE 643c0dd49bdSEiji Ota * "rdsv3_ib_xmit prep msg dport=%u flags=0x%x len=%d\n", 644c0dd49bdSEiji Ota * be16_to_cpu(rm->m_inc.i_hdr.h_dport), 645c0dd49bdSEiji Ota * rm->m_inc.i_hdr.h_flags, 646c0dd49bdSEiji Ota 
* be32_to_cpu(rm->m_inc.i_hdr.h_len)); 647c0dd49bdSEiji Ota */ 648c0dd49bdSEiji Ota if (rm->m_nents) { 649c0dd49bdSEiji Ota rm->m_count = rdsv3_ib_dma_map_sg(dev, 650c0dd49bdSEiji Ota rm->m_sg, rm->m_nents); 651c0dd49bdSEiji Ota RDSV3_DPRINTF5("rdsv3_ib_xmit", 652c0dd49bdSEiji Ota "ic %p mapping rm %p: %d\n", ic, rm, rm->m_count); 653c0dd49bdSEiji Ota if (rm->m_count == 0) { 654c0dd49bdSEiji Ota rdsv3_ib_stats_inc(s_ib_tx_sg_mapping_failure); 655c0dd49bdSEiji Ota rdsv3_ib_ring_unalloc(&ic->i_send_ring, 656c0dd49bdSEiji Ota work_alloc); 657c0dd49bdSEiji Ota ret = -ENOMEM; /* XXX ? */ 658c0dd49bdSEiji Ota RDSV3_DPRINTF2("rdsv3_ib_xmit", 659c0dd49bdSEiji Ota "fail: ic %p mapping rm %p: %d\n", 660c0dd49bdSEiji Ota ic, rm, rm->m_count); 661c0dd49bdSEiji Ota goto out; 662c0dd49bdSEiji Ota } 663c0dd49bdSEiji Ota } else { 664c0dd49bdSEiji Ota rm->m_count = 0; 665c0dd49bdSEiji Ota } 666c0dd49bdSEiji Ota 667c0dd49bdSEiji Ota ic->i_unsignaled_wrs = rdsv3_ib_sysctl_max_unsig_wrs; 668c0dd49bdSEiji Ota ic->i_unsignaled_bytes = rdsv3_ib_sysctl_max_unsig_bytes; 669c0dd49bdSEiji Ota rdsv3_message_addref(rm); 670c0dd49bdSEiji Ota ic->i_rm = rm; 671c0dd49bdSEiji Ota 672c0dd49bdSEiji Ota /* Finalize the header */ 673c0dd49bdSEiji Ota if (test_bit(RDSV3_MSG_ACK_REQUIRED, &rm->m_flags)) 674c0dd49bdSEiji Ota rm->m_inc.i_hdr.h_flags |= RDSV3_FLAG_ACK_REQUIRED; 675c0dd49bdSEiji Ota if (test_bit(RDSV3_MSG_RETRANSMITTED, &rm->m_flags)) 676c0dd49bdSEiji Ota rm->m_inc.i_hdr.h_flags |= RDSV3_FLAG_RETRANSMITTED; 677c0dd49bdSEiji Ota 678c0dd49bdSEiji Ota /* 679c0dd49bdSEiji Ota * If it has a RDMA op, tell the peer we did it. This is 680c0dd49bdSEiji Ota * used by the peer to release use-once RDMA MRs. 
681c0dd49bdSEiji Ota */ 682c0dd49bdSEiji Ota if (rm->m_rdma_op) { 683c0dd49bdSEiji Ota struct rdsv3_ext_header_rdma ext_hdr; 684c0dd49bdSEiji Ota 685c0dd49bdSEiji Ota ext_hdr.h_rdma_rkey = htonl(rm->m_rdma_op->r_key); 686c0dd49bdSEiji Ota (void) rdsv3_message_add_extension(&rm->m_inc.i_hdr, 687c0dd49bdSEiji Ota RDSV3_EXTHDR_RDMA, &ext_hdr, 688c0dd49bdSEiji Ota sizeof (ext_hdr)); 689c0dd49bdSEiji Ota } 690c0dd49bdSEiji Ota if (rm->m_rdma_cookie) { 691c0dd49bdSEiji Ota (void) rdsv3_message_add_rdma_dest_extension( 692c0dd49bdSEiji Ota &rm->m_inc.i_hdr, 693c0dd49bdSEiji Ota rdsv3_rdma_cookie_key(rm->m_rdma_cookie), 694c0dd49bdSEiji Ota rdsv3_rdma_cookie_offset(rm->m_rdma_cookie)); 695c0dd49bdSEiji Ota } 696c0dd49bdSEiji Ota 697c0dd49bdSEiji Ota /* 698c0dd49bdSEiji Ota * Note - rdsv3_ib_piggyb_ack clears the ACK_REQUIRED bit, so 699c0dd49bdSEiji Ota * we should not do this unless we have a chance of at least 700c0dd49bdSEiji Ota * sticking the header into the send ring. Which is why we 701c0dd49bdSEiji Ota * should call rdsv3_ib_ring_alloc first. 702c0dd49bdSEiji Ota */ 703c0dd49bdSEiji Ota rm->m_inc.i_hdr.h_ack = htonll(rdsv3_ib_piggyb_ack(ic)); 704c0dd49bdSEiji Ota rdsv3_message_make_checksum(&rm->m_inc.i_hdr); 705c0dd49bdSEiji Ota 706c0dd49bdSEiji Ota /* 707c0dd49bdSEiji Ota * Update adv_credits since we reset the ACK_REQUIRED bit. 708c0dd49bdSEiji Ota */ 709cadbfdc3SEiji Ota (void) rdsv3_ib_send_grab_credits(ic, 0, &posted, 1); 710c0dd49bdSEiji Ota adv_credits += posted; 711c0dd49bdSEiji Ota ASSERT(adv_credits <= 255); 712cadbfdc3SEiji Ota } 713c0dd49bdSEiji Ota 714c0dd49bdSEiji Ota send = &ic->i_sends[pos]; 715c0dd49bdSEiji Ota first = send; 716c0dd49bdSEiji Ota prev = NULL; 717c0dd49bdSEiji Ota scat = &rm->m_sg[sg]; 718c0dd49bdSEiji Ota sent = 0; 719c0dd49bdSEiji Ota i = 0; 720c0dd49bdSEiji Ota 721c0dd49bdSEiji Ota /* 722c0dd49bdSEiji Ota * Sometimes you want to put a fence between an RDMA 723c0dd49bdSEiji Ota * READ and the following SEND. 
724c0dd49bdSEiji Ota * We could either do this all the time 725c0dd49bdSEiji Ota * or when requested by the user. Right now, we let 726c0dd49bdSEiji Ota * the application choose. 727c0dd49bdSEiji Ota */ 728c0dd49bdSEiji Ota if (rm->m_rdma_op && rm->m_rdma_op->r_fence) 729c0dd49bdSEiji Ota send_flags = IBT_WR_SEND_FENCE; 730c0dd49bdSEiji Ota 731c0dd49bdSEiji Ota /* 732c0dd49bdSEiji Ota * We could be copying the header into the unused tail of the page. 733c0dd49bdSEiji Ota * That would need to be changed in the future when those pages might 734c0dd49bdSEiji Ota * be mapped userspace pages or page cache pages. So instead we always 735c0dd49bdSEiji Ota * use a second sge and our long-lived ring of mapped headers. We send 736c0dd49bdSEiji Ota * the header after the data so that the data payload can be aligned on 737c0dd49bdSEiji Ota * the receiver. 738c0dd49bdSEiji Ota */ 739c0dd49bdSEiji Ota 740c0dd49bdSEiji Ota /* handle a 0-len message */ 741c0dd49bdSEiji Ota if (ntohl(rm->m_inc.i_hdr.h_len) == 0) { 742c0dd49bdSEiji Ota wr = &ic->i_send_wrs[0]; 743c0dd49bdSEiji Ota rdsv3_ib_xmit_populate_wr(ic, wr, pos, NULL, 0, 0, send_flags); 744c0dd49bdSEiji Ota send->s_queued = jiffies; 745c0dd49bdSEiji Ota send->s_op = NULL; 746c0dd49bdSEiji Ota send->s_opcode = wr->wr_opcode; 747c0dd49bdSEiji Ota goto add_header; 748c0dd49bdSEiji Ota } 749c0dd49bdSEiji Ota 750c0dd49bdSEiji Ota /* if there's data reference it with a chain of work reqs */ 751c0dd49bdSEiji Ota for (; i < work_alloc && scat != &rm->m_sg[rm->m_count]; i++) { 752c0dd49bdSEiji Ota unsigned int len; 753c0dd49bdSEiji Ota 754c0dd49bdSEiji Ota send = &ic->i_sends[pos]; 755c0dd49bdSEiji Ota 756c0dd49bdSEiji Ota wr = &ic->i_send_wrs[i]; 757c0dd49bdSEiji Ota len = min(RDSV3_FRAG_SIZE, 758c0dd49bdSEiji Ota rdsv3_ib_sg_dma_len(dev, scat) - off); 759c0dd49bdSEiji Ota rdsv3_ib_xmit_populate_wr(ic, wr, pos, scat, off, len, 760c0dd49bdSEiji Ota send_flags); 761c0dd49bdSEiji Ota send->s_queued = jiffies; 762c0dd49bdSEiji Ota 
send->s_op = NULL; 763c0dd49bdSEiji Ota send->s_opcode = wr->wr_opcode; 764c0dd49bdSEiji Ota 765c0dd49bdSEiji Ota /* 766c0dd49bdSEiji Ota * We want to delay signaling completions just enough to get 767c0dd49bdSEiji Ota * the batching benefits but not so much that we create dead 768c0dd49bdSEiji Ota * time 769c0dd49bdSEiji Ota * on the wire. 770c0dd49bdSEiji Ota */ 771c0dd49bdSEiji Ota if (ic->i_unsignaled_wrs-- == 0) { 772c0dd49bdSEiji Ota ic->i_unsignaled_wrs = rdsv3_ib_sysctl_max_unsig_wrs; 773c0dd49bdSEiji Ota wr->wr_flags |= 774c0dd49bdSEiji Ota IBT_WR_SEND_SIGNAL | IBT_WR_SEND_SOLICIT; 775c0dd49bdSEiji Ota } 776c0dd49bdSEiji Ota 777c0dd49bdSEiji Ota ic->i_unsignaled_bytes -= len; 778c0dd49bdSEiji Ota if (ic->i_unsignaled_bytes <= 0) { 779c0dd49bdSEiji Ota ic->i_unsignaled_bytes = 780c0dd49bdSEiji Ota rdsv3_ib_sysctl_max_unsig_bytes; 781c0dd49bdSEiji Ota wr->wr_flags |= 782c0dd49bdSEiji Ota IBT_WR_SEND_SIGNAL | IBT_WR_SEND_SOLICIT; 783c0dd49bdSEiji Ota } 784c0dd49bdSEiji Ota 785c0dd49bdSEiji Ota /* 786c0dd49bdSEiji Ota * Always signal the last one if we're stopping due to flow 787c0dd49bdSEiji Ota * control. 788c0dd49bdSEiji Ota */ 789c0dd49bdSEiji Ota if (flow_controlled && i == (work_alloc-1)) { 790c0dd49bdSEiji Ota wr->wr_flags |= 791c0dd49bdSEiji Ota IBT_WR_SEND_SIGNAL | IBT_WR_SEND_SOLICIT; 792c0dd49bdSEiji Ota } 793c0dd49bdSEiji Ota 794c0dd49bdSEiji Ota RDSV3_DPRINTF5("rdsv3_ib_xmit", "send %p wr %p num_sge %u \n", 795c0dd49bdSEiji Ota send, wr, wr->wr_nds); 796c0dd49bdSEiji Ota 797c0dd49bdSEiji Ota sent += len; 798c0dd49bdSEiji Ota off += len; 799c0dd49bdSEiji Ota if (off == rdsv3_ib_sg_dma_len(dev, scat)) { 800c0dd49bdSEiji Ota scat++; 801c0dd49bdSEiji Ota off = 0; 802c0dd49bdSEiji Ota } 803c0dd49bdSEiji Ota 804c0dd49bdSEiji Ota add_header: 805c0dd49bdSEiji Ota /* 806c0dd49bdSEiji Ota * Tack on the header after the data. The header SGE 807c0dd49bdSEiji Ota * should already 808c0dd49bdSEiji Ota * have been set up to point to the right header buffer. 
809c0dd49bdSEiji Ota */ 810c0dd49bdSEiji Ota (void) memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, 811c0dd49bdSEiji Ota sizeof (struct rdsv3_header)); 812c0dd49bdSEiji Ota 813c0dd49bdSEiji Ota if (0) { 814c0dd49bdSEiji Ota struct rdsv3_header *hdr = &ic->i_send_hdrs[pos]; 815c0dd49bdSEiji Ota 8166e18d381Sagiri RDSV3_DPRINTF2("rdsv3_ib_xmit", 817c0dd49bdSEiji Ota "send WR dport=%u flags=0x%x len=%d", 818c0dd49bdSEiji Ota ntohs(hdr->h_dport), 819c0dd49bdSEiji Ota hdr->h_flags, 820c0dd49bdSEiji Ota ntohl(hdr->h_len)); 821c0dd49bdSEiji Ota } 822c0dd49bdSEiji Ota if (adv_credits) { 823c0dd49bdSEiji Ota struct rdsv3_header *hdr = &ic->i_send_hdrs[pos]; 824c0dd49bdSEiji Ota 825c0dd49bdSEiji Ota /* add credit and redo the header checksum */ 826c0dd49bdSEiji Ota hdr->h_credit = adv_credits; 827c0dd49bdSEiji Ota rdsv3_message_make_checksum(hdr); 828c0dd49bdSEiji Ota adv_credits = 0; 829c0dd49bdSEiji Ota rdsv3_ib_stats_inc(s_ib_tx_credit_updates); 830c0dd49bdSEiji Ota } 831c0dd49bdSEiji Ota 832c0dd49bdSEiji Ota prev = send; 833c0dd49bdSEiji Ota 834c0dd49bdSEiji Ota pos = (pos + 1) % ic->i_send_ring.w_nr; 835c0dd49bdSEiji Ota } 836c0dd49bdSEiji Ota 837c0dd49bdSEiji Ota /* 838c0dd49bdSEiji Ota * Account the RDS header in the number of bytes we sent, but just once. 839c0dd49bdSEiji Ota * The caller has no concept of fragmentation. 
840c0dd49bdSEiji Ota */ 841c0dd49bdSEiji Ota if (hdr_off == 0) 842c0dd49bdSEiji Ota sent += sizeof (struct rdsv3_header); 843c0dd49bdSEiji Ota 844c0dd49bdSEiji Ota /* if we finished the message then send completion owns it */ 845c0dd49bdSEiji Ota if (scat == &rm->m_sg[rm->m_count]) { 846c0dd49bdSEiji Ota prev->s_rm = ic->i_rm; 847c0dd49bdSEiji Ota wr->wr_flags |= IBT_WR_SEND_SIGNAL | IBT_WR_SEND_SOLICIT; 848c0dd49bdSEiji Ota ic->i_rm = NULL; 849c0dd49bdSEiji Ota } 850c0dd49bdSEiji Ota 851c0dd49bdSEiji Ota if (i < work_alloc) { 852c0dd49bdSEiji Ota rdsv3_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i); 853c0dd49bdSEiji Ota work_alloc = i; 854c0dd49bdSEiji Ota } 855c0dd49bdSEiji Ota if (ic->i_flowctl && i < credit_alloc) 856c0dd49bdSEiji Ota rdsv3_ib_send_add_credits(conn, credit_alloc - i); 857c0dd49bdSEiji Ota 858c0dd49bdSEiji Ota /* XXX need to worry about failed_wr and partial sends. */ 859c0dd49bdSEiji Ota ret = ibt_post_send(ib_get_ibt_channel_hdl(ic->i_cm_id), 860c0dd49bdSEiji Ota ic->i_send_wrs, i, &posted); 861c0dd49bdSEiji Ota if (posted != i) { 8626e18d381Sagiri RDSV3_DPRINTF2("rdsv3_ib_xmit", 863c0dd49bdSEiji Ota "ic %p first %p nwr: %d ret %d:%d", 864c0dd49bdSEiji Ota ic, first, i, ret, posted); 865c0dd49bdSEiji Ota } 866c0dd49bdSEiji Ota if (ret) { 8676e18d381Sagiri RDSV3_DPRINTF2("rdsv3_ib_xmit", 868c0dd49bdSEiji Ota "RDS/IB: ib_post_send to %u.%u.%u.%u " 869c0dd49bdSEiji Ota "returned %d\n", NIPQUAD(conn->c_faddr), ret); 870c0dd49bdSEiji Ota rdsv3_ib_ring_unalloc(&ic->i_send_ring, work_alloc); 871c0dd49bdSEiji Ota if (prev->s_rm) { 872c0dd49bdSEiji Ota ic->i_rm = prev->s_rm; 873c0dd49bdSEiji Ota prev->s_rm = NULL; 874c0dd49bdSEiji Ota } 875cadbfdc3SEiji Ota RDSV3_DPRINTF2("rdsv3_ib_xmit", "ibt_post_send failed\n"); 876cadbfdc3SEiji Ota rdsv3_conn_drop(ic->conn); 877*5d5562f5SEiji Ota ret = -EAGAIN; 878c0dd49bdSEiji Ota goto out; 879c0dd49bdSEiji Ota } 880c0dd49bdSEiji Ota 881c0dd49bdSEiji Ota ret = sent; 882c0dd49bdSEiji Ota 883c0dd49bdSEiji Ota 
RDSV3_DPRINTF4("rdsv3_ib_xmit", "Return: conn: %p, rm: %p", conn, rm); 884c0dd49bdSEiji Ota out: 885c0dd49bdSEiji Ota ASSERT(!adv_credits); 886c0dd49bdSEiji Ota return (ret); 887c0dd49bdSEiji Ota } 888c0dd49bdSEiji Ota 889c0dd49bdSEiji Ota static void 890c0dd49bdSEiji Ota rdsv3_ib_dma_unmap_sg_rdma(struct ib_device *dev, uint_t num, 891c0dd49bdSEiji Ota struct rdsv3_rdma_sg scat[]) 892c0dd49bdSEiji Ota { 893c0dd49bdSEiji Ota ibt_hca_hdl_t hca_hdl; 894c0dd49bdSEiji Ota int i; 895c0dd49bdSEiji Ota int num_sgl; 896c0dd49bdSEiji Ota 897c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_ib_dma_unmap_sg", "rdma_sg: %p", scat); 898c0dd49bdSEiji Ota 899c0dd49bdSEiji Ota if (dev) { 900c0dd49bdSEiji Ota hca_hdl = ib_get_ibt_hca_hdl(dev); 901c0dd49bdSEiji Ota } else { 902c0dd49bdSEiji Ota hca_hdl = scat[0].hca_hdl; 903c0dd49bdSEiji Ota RDSV3_DPRINTF2("rdsv3_ib_dma_unmap_sg_rdma", 904c0dd49bdSEiji Ota "NULL dev use cached hca_hdl %p", hca_hdl); 905c0dd49bdSEiji Ota } 906c0dd49bdSEiji Ota 907c0dd49bdSEiji Ota if (hca_hdl == NULL) 908c0dd49bdSEiji Ota return; 909c0dd49bdSEiji Ota scat[0].hca_hdl = NULL; 910c0dd49bdSEiji Ota 911c0dd49bdSEiji Ota for (i = 0; i < num; i++) { 912c0dd49bdSEiji Ota if (scat[i].mihdl != NULL) { 913c0dd49bdSEiji Ota num_sgl = (scat[i].iovec.bytes / PAGESIZE) + 2; 914c0dd49bdSEiji Ota kmem_free(scat[i].swr.wr_sgl, 915c0dd49bdSEiji Ota (num_sgl * sizeof (ibt_wr_ds_t))); 916c0dd49bdSEiji Ota scat[i].swr.wr_sgl = NULL; 917c0dd49bdSEiji Ota (void) ibt_unmap_mem_iov(hca_hdl, scat[i].mihdl); 918c0dd49bdSEiji Ota scat[i].mihdl = NULL; 919c0dd49bdSEiji Ota } else 920c0dd49bdSEiji Ota break; 921c0dd49bdSEiji Ota } 922c0dd49bdSEiji Ota } 923c0dd49bdSEiji Ota 924c0dd49bdSEiji Ota /* ARGSUSED */ 925c0dd49bdSEiji Ota uint_t 926c0dd49bdSEiji Ota rdsv3_ib_dma_map_sg_rdma(struct ib_device *dev, struct rdsv3_rdma_sg scat[], 927c0dd49bdSEiji Ota uint_t num, struct rdsv3_scatterlist **scatl) 928c0dd49bdSEiji Ota { 929c0dd49bdSEiji Ota ibt_hca_hdl_t hca_hdl; 930c0dd49bdSEiji Ota 
ibt_iov_attr_t iov_attr; 931c0dd49bdSEiji Ota struct buf *bp; 932c0dd49bdSEiji Ota uint_t i, j, k; 933c0dd49bdSEiji Ota uint_t count; 934c0dd49bdSEiji Ota struct rdsv3_scatterlist *sg; 935c0dd49bdSEiji Ota int ret; 936c0dd49bdSEiji Ota 937c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_ib_dma_map_sg_rdma", "scat: %p, num: %d", 938c0dd49bdSEiji Ota scat, num); 939c0dd49bdSEiji Ota 940c0dd49bdSEiji Ota hca_hdl = ib_get_ibt_hca_hdl(dev); 941c0dd49bdSEiji Ota scat[0].hca_hdl = hca_hdl; 942c0dd49bdSEiji Ota bzero(&iov_attr, sizeof (ibt_iov_attr_t)); 943c0dd49bdSEiji Ota iov_attr.iov_flags = IBT_IOV_BUF; 944c0dd49bdSEiji Ota iov_attr.iov_lso_hdr_sz = 0; 945c0dd49bdSEiji Ota 946c0dd49bdSEiji Ota for (i = 0, count = 0; i < num; i++) { 947c0dd49bdSEiji Ota /* transpose umem_cookie to buf structure */ 948c0dd49bdSEiji Ota bp = ddi_umem_iosetup(scat[i].umem_cookie, 949c0dd49bdSEiji Ota scat[i].iovec.addr & PAGEOFFSET, scat[i].iovec.bytes, 950c0dd49bdSEiji Ota B_WRITE, 0, 0, NULL, DDI_UMEM_SLEEP); 951c0dd49bdSEiji Ota if (bp == NULL) { 952c0dd49bdSEiji Ota /* free resources and return error */ 953c0dd49bdSEiji Ota goto out; 954c0dd49bdSEiji Ota } 955c0dd49bdSEiji Ota /* setup ibt_map_mem_iov() attributes */ 956c0dd49bdSEiji Ota iov_attr.iov_buf = bp; 957c0dd49bdSEiji Ota iov_attr.iov_wr_nds = (scat[i].iovec.bytes / PAGESIZE) + 2; 958c0dd49bdSEiji Ota scat[i].swr.wr_sgl = 959c0dd49bdSEiji Ota kmem_zalloc(iov_attr.iov_wr_nds * sizeof (ibt_wr_ds_t), 960c0dd49bdSEiji Ota KM_SLEEP); 961c0dd49bdSEiji Ota 962c0dd49bdSEiji Ota ret = ibt_map_mem_iov(hca_hdl, &iov_attr, 963c0dd49bdSEiji Ota (ibt_all_wr_t *)&scat[i].swr, &scat[i].mihdl); 964c0dd49bdSEiji Ota freerbuf(bp); 965c0dd49bdSEiji Ota if (ret != IBT_SUCCESS) { 966c0dd49bdSEiji Ota RDSV3_DPRINTF2("rdsv3_ib_dma_map_sg_rdma", 967c0dd49bdSEiji Ota "ibt_map_mem_iov returned: %d", ret); 968c0dd49bdSEiji Ota /* free resources and return error */ 969c0dd49bdSEiji Ota kmem_free(scat[i].swr.wr_sgl, 970c0dd49bdSEiji Ota iov_attr.iov_wr_nds * sizeof 
(ibt_wr_ds_t)); 971c0dd49bdSEiji Ota goto out; 972c0dd49bdSEiji Ota } 973c0dd49bdSEiji Ota count += scat[i].swr.wr_nds; 974c0dd49bdSEiji Ota 975c0dd49bdSEiji Ota #ifdef DEBUG 976c0dd49bdSEiji Ota for (j = 0; j < scat[i].swr.wr_nds; j++) { 977c0dd49bdSEiji Ota RDSV3_DPRINTF5("rdsv3_ib_dma_map_sg_rdma", 978c0dd49bdSEiji Ota "sgl[%d] va %llx len %x", j, 979c0dd49bdSEiji Ota scat[i].swr.wr_sgl[j].ds_va, 980c0dd49bdSEiji Ota scat[i].swr.wr_sgl[j].ds_len); 981c0dd49bdSEiji Ota } 982c0dd49bdSEiji Ota #endif 983c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_ib_dma_map_sg_rdma", 984c0dd49bdSEiji Ota "iovec.bytes: 0x%x scat[%d]swr.wr_nds: %d", 985c0dd49bdSEiji Ota scat[i].iovec.bytes, i, scat[i].swr.wr_nds); 986c0dd49bdSEiji Ota } 987c0dd49bdSEiji Ota 988c0dd49bdSEiji Ota count = ((count - 1) / RDSV3_IB_MAX_SGE) + 1; 989c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_ib_dma_map_sg_rdma", "Ret: num: %d", count); 990c0dd49bdSEiji Ota return (count); 991c0dd49bdSEiji Ota 992c0dd49bdSEiji Ota out: 993c0dd49bdSEiji Ota rdsv3_ib_dma_unmap_sg_rdma(dev, num, scat); 994c0dd49bdSEiji Ota return (0); 995c0dd49bdSEiji Ota } 996c0dd49bdSEiji Ota 997c0dd49bdSEiji Ota int 998c0dd49bdSEiji Ota rdsv3_ib_xmit_rdma(struct rdsv3_connection *conn, struct rdsv3_rdma_op *op) 999c0dd49bdSEiji Ota { 1000c0dd49bdSEiji Ota struct rdsv3_ib_connection *ic = conn->c_transport_data; 1001c0dd49bdSEiji Ota struct rdsv3_ib_send_work *send = NULL; 1002c0dd49bdSEiji Ota struct rdsv3_rdma_sg *scat; 1003c0dd49bdSEiji Ota uint64_t remote_addr; 1004c0dd49bdSEiji Ota uint32_t pos; 1005c0dd49bdSEiji Ota uint32_t work_alloc; 1006c0dd49bdSEiji Ota uint32_t i, j, k, idx; 1007c0dd49bdSEiji Ota uint32_t left, count; 1008c0dd49bdSEiji Ota uint32_t posted; 1009c0dd49bdSEiji Ota int sent; 1010c0dd49bdSEiji Ota ibt_status_t status; 1011c0dd49bdSEiji Ota ibt_send_wr_t *wr; 1012c0dd49bdSEiji Ota ibt_wr_ds_t *sge; 1013c0dd49bdSEiji Ota 1014c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_ib_xmit_rdma", "rdsv3_ib_conn: %p", ic); 1015c0dd49bdSEiji Ota 
1016c0dd49bdSEiji Ota /* map the message the first time we see it */ 1017c0dd49bdSEiji Ota if (!op->r_mapped) { 1018c0dd49bdSEiji Ota op->r_count = rdsv3_ib_dma_map_sg_rdma(ic->i_cm_id->device, 1019c0dd49bdSEiji Ota op->r_rdma_sg, op->r_nents, &op->r_sg); 1020c0dd49bdSEiji Ota RDSV3_DPRINTF5("rdsv3_ib_xmit_rdma", "ic %p mapping op %p: %d", 1021c0dd49bdSEiji Ota ic, op, op->r_count); 1022c0dd49bdSEiji Ota if (op->r_count == 0) { 1023c0dd49bdSEiji Ota rdsv3_ib_stats_inc(s_ib_tx_sg_mapping_failure); 1024c0dd49bdSEiji Ota RDSV3_DPRINTF2("rdsv3_ib_xmit_rdma", 1025c0dd49bdSEiji Ota "fail: ic %p mapping op %p: %d", 1026c0dd49bdSEiji Ota ic, op, op->r_count); 1027c0dd49bdSEiji Ota return (-ENOMEM); /* XXX ? */ 1028c0dd49bdSEiji Ota } 1029c0dd49bdSEiji Ota op->r_mapped = 1; 1030c0dd49bdSEiji Ota } 1031c0dd49bdSEiji Ota 1032c0dd49bdSEiji Ota /* 1033c0dd49bdSEiji Ota * Instead of knowing how to return a partial rdma read/write 1034c0dd49bdSEiji Ota * we insist that there 1035c0dd49bdSEiji Ota * be enough work requests to send the entire message. 
1036c0dd49bdSEiji Ota */ 1037c0dd49bdSEiji Ota work_alloc = rdsv3_ib_ring_alloc(&ic->i_send_ring, op->r_count, &pos); 1038c0dd49bdSEiji Ota if (work_alloc != op->r_count) { 1039c0dd49bdSEiji Ota rdsv3_ib_ring_unalloc(&ic->i_send_ring, work_alloc); 1040c0dd49bdSEiji Ota rdsv3_ib_stats_inc(s_ib_tx_ring_full); 1041c0dd49bdSEiji Ota return (-ENOMEM); 1042c0dd49bdSEiji Ota } 1043c0dd49bdSEiji Ota 1044*5d5562f5SEiji Ota RDSV3_DPRINTF4("rdsv3_ib_xmit_rdma", "pos %u cnt %u", pos, op->r_count); 1045c0dd49bdSEiji Ota /* 1046c0dd49bdSEiji Ota * take the scatter list and transpose into a list of 1047c0dd49bdSEiji Ota * send wr's each with a scatter list of RDSV3_IB_MAX_SGE 1048c0dd49bdSEiji Ota */ 1049c0dd49bdSEiji Ota scat = &op->r_rdma_sg[0]; 1050c0dd49bdSEiji Ota sent = 0; 1051c0dd49bdSEiji Ota remote_addr = op->r_remote_addr; 1052c0dd49bdSEiji Ota 1053c0dd49bdSEiji Ota for (i = 0, k = 0; i < op->r_nents; i++) { 1054c0dd49bdSEiji Ota left = scat[i].swr.wr_nds; 1055c0dd49bdSEiji Ota for (idx = 0; left > 0; k++) { 1056c0dd49bdSEiji Ota send = &ic->i_sends[pos]; 1057c0dd49bdSEiji Ota send->s_queued = jiffies; 1058c0dd49bdSEiji Ota send->s_opcode = op->r_write ? IBT_WRC_RDMAW : 1059c0dd49bdSEiji Ota IBT_WRC_RDMAR; 1060c0dd49bdSEiji Ota send->s_op = op; 1061c0dd49bdSEiji Ota 1062c0dd49bdSEiji Ota wr = &ic->i_send_wrs[k]; 1063c0dd49bdSEiji Ota wr->wr_flags = 0; 1064*5d5562f5SEiji Ota wr->wr_id = pos | RDSV3_IB_SEND_OP; 1065c0dd49bdSEiji Ota wr->wr_trans = IBT_RC_SRV; 1066c0dd49bdSEiji Ota wr->wr_opcode = op->r_write ? 
IBT_WRC_RDMAW : 1067c0dd49bdSEiji Ota IBT_WRC_RDMAR; 1068c0dd49bdSEiji Ota wr->wr.rc.rcwr.rdma.rdma_raddr = remote_addr; 1069c0dd49bdSEiji Ota wr->wr.rc.rcwr.rdma.rdma_rkey = op->r_key; 1070c0dd49bdSEiji Ota 1071c0dd49bdSEiji Ota if (left > RDSV3_IB_MAX_SGE) { 1072c0dd49bdSEiji Ota count = RDSV3_IB_MAX_SGE; 1073c0dd49bdSEiji Ota left -= RDSV3_IB_MAX_SGE; 1074c0dd49bdSEiji Ota } else { 1075c0dd49bdSEiji Ota count = left; 1076c0dd49bdSEiji Ota left = 0; 1077c0dd49bdSEiji Ota } 1078c0dd49bdSEiji Ota wr->wr_nds = count; 1079c0dd49bdSEiji Ota 1080c0dd49bdSEiji Ota for (j = 0; j < count; j++) { 1081c0dd49bdSEiji Ota sge = &wr->wr_sgl[j]; 1082c0dd49bdSEiji Ota *sge = scat[i].swr.wr_sgl[idx]; 1083c0dd49bdSEiji Ota remote_addr += scat[i].swr.wr_sgl[idx].ds_len; 1084c0dd49bdSEiji Ota sent += scat[i].swr.wr_sgl[idx].ds_len; 1085c0dd49bdSEiji Ota idx++; 1086*5d5562f5SEiji Ota RDSV3_DPRINTF5("xmit_rdma", 1087c0dd49bdSEiji Ota "send_wrs[%d]sgl[%d] va %llx len %x", 1088c0dd49bdSEiji Ota k, j, sge->ds_va, sge->ds_len); 1089c0dd49bdSEiji Ota } 1090*5d5562f5SEiji Ota RDSV3_DPRINTF5("rdsv3_ib_xmit_rdma", 1091c0dd49bdSEiji Ota "wr[%d] %p key: %x code: %d tlen: %d", 1092c0dd49bdSEiji Ota k, wr, wr->wr.rc.rcwr.rdma.rdma_rkey, 1093c0dd49bdSEiji Ota wr->wr_opcode, sent); 1094c0dd49bdSEiji Ota 1095c0dd49bdSEiji Ota /* 1096c0dd49bdSEiji Ota * We want to delay signaling completions just enough 1097c0dd49bdSEiji Ota * to get the batching benefits but not so much that 1098c0dd49bdSEiji Ota * we create dead time on the wire. 
1099c0dd49bdSEiji Ota */ 1100c0dd49bdSEiji Ota if (ic->i_unsignaled_wrs-- == 0) { 1101c0dd49bdSEiji Ota ic->i_unsignaled_wrs = 1102c0dd49bdSEiji Ota rdsv3_ib_sysctl_max_unsig_wrs; 1103c0dd49bdSEiji Ota wr->wr_flags = IBT_WR_SEND_SIGNAL; 1104c0dd49bdSEiji Ota } 1105c0dd49bdSEiji Ota 1106c0dd49bdSEiji Ota pos = (pos + 1) % ic->i_send_ring.w_nr; 1107c0dd49bdSEiji Ota } 1108c0dd49bdSEiji Ota } 1109c0dd49bdSEiji Ota 1110c0dd49bdSEiji Ota status = ibt_post_send(ib_get_ibt_channel_hdl(ic->i_cm_id), 1111c0dd49bdSEiji Ota ic->i_send_wrs, k, &posted); 1112c0dd49bdSEiji Ota if (status != IBT_SUCCESS) { 11136e18d381Sagiri RDSV3_DPRINTF2("rdsv3_ib_xmit_rdma", 1114cadbfdc3SEiji Ota "RDS/IB: rdma ib_post_send to %u.%u.%u.%u " 1115cadbfdc3SEiji Ota "returned %d", NIPQUAD(conn->c_faddr), status); 1116c0dd49bdSEiji Ota rdsv3_ib_ring_unalloc(&ic->i_send_ring, work_alloc); 1117c0dd49bdSEiji Ota } 1118*5d5562f5SEiji Ota RDSV3_DPRINTF4("rdsv3_ib_xmit_rdma", "Ret: %p", ic); 1119c0dd49bdSEiji Ota return (status); 1120c0dd49bdSEiji Ota } 1121c0dd49bdSEiji Ota 1122c0dd49bdSEiji Ota void 1123c0dd49bdSEiji Ota rdsv3_ib_xmit_complete(struct rdsv3_connection *conn) 1124c0dd49bdSEiji Ota { 1125c0dd49bdSEiji Ota struct rdsv3_ib_connection *ic = conn->c_transport_data; 1126c0dd49bdSEiji Ota 1127c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_ib_xmit_complete", "conn: %p", conn); 1128c0dd49bdSEiji Ota 1129c0dd49bdSEiji Ota /* 1130c0dd49bdSEiji Ota * We may have a pending ACK or window update we were unable 1131c0dd49bdSEiji Ota * to send previously (due to flow control). Try again. 1132c0dd49bdSEiji Ota */ 1133c0dd49bdSEiji Ota rdsv3_ib_attempt_ack(ic); 1134c0dd49bdSEiji Ota } 1135