/*-
 * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0
 *
 * Copyright (c) 2009 Mellanox Technologies Ltd.  All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "sdp.h"

#define	sdp_cnt(var) do { (var)++; } while (0)

SDP_MODPARAM_SINT(sdp_keepalive_probes_sent, 0,
	"Total number of keepalive probes sent.");

static int sdp_process_tx_cq(struct sdp_sock *ssk);
static void sdp_poll_tx_timeout(void *data);

int
sdp_xmit_poll(struct sdp_sock *ssk, int force)
{
	int wc_processed = 0;

	SDP_WLOCK_ASSERT(ssk);
	sdp_prf(ssk->socket, NULL, "%s", __func__);

	/*
	 * If we don't have a pending timer, set one up to catch our recent
	 * post in case the interface becomes idle.
	 */
	if (!callout_pending(&ssk->tx_ring.timer))
		callout_reset(&ssk->tx_ring.timer, SDP_TX_POLL_TIMEOUT,
		    sdp_poll_tx_timeout, ssk);

	/* Poll the CQ every SDP_TX_POLL_MODER packets. */
	if (force || (++ssk->tx_ring.poll_cnt & (SDP_TX_POLL_MODER - 1)) == 0)
		wc_processed = sdp_process_tx_cq(ssk);

	return wc_processed;
}

void
sdp_post_send(struct sdp_sock *ssk, struct mbuf *mb)
{
	struct sdp_buf *tx_req;
	struct sdp_bsdh *h;
	unsigned long mseq;
	struct ib_device *dev;
	const struct ib_send_wr *bad_wr;
	struct ib_sge ibsge[SDP_MAX_SEND_SGES];
	struct ib_sge *sge;
	struct ib_send_wr tx_wr = { NULL };
	int i, rc;
	u64 addr;

	if (!ssk->qp_active) {
		m_freem(mb);
		return;
	}

	mseq = ring_head(ssk->tx_ring);
	h = mtod(mb, struct sdp_bsdh *);

	/* Record stats only after 'h' points at the BSDH header. */
	SDPSTATS_COUNTER_MID_INC(post_send, h->mid);
	SDPSTATS_HIST(send_size, mb->m_pkthdr.len);

	ssk->tx_packets++;
	ssk->tx_bytes += mb->m_pkthdr.len;

#ifdef SDP_ZCOPY
	if (unlikely(h->mid == SDP_MID_SRCAVAIL)) {
		struct tx_srcavail_state *tx_sa = TX_SRCAVAIL_STATE(mb);
		if (ssk->tx_sa != tx_sa) {
			sdp_dbg_data(ssk->socket, "SrcAvail cancelled "
			    "before being sent!\n");
			WARN_ON(1);
			m_freem(mb);
			return;
		}
		TX_SRCAVAIL_STATE(mb)->mseq = mseq;
	}
#endif

	if (unlikely(mb->m_flags & M_URG))
		h->flags = SDP_OOB_PRES | SDP_OOB_PEND;
	else
		h->flags = 0;

	mb->m_flags |= M_RDONLY;	/* Don't allow compression once sent. */
	h->bufs = htons(rx_ring_posted(ssk));
	h->len = htonl(mb->m_pkthdr.len);
	h->mseq = htonl(mseq);
	h->mseq_ack = htonl(mseq_ack(ssk));

	sdp_prf1(ssk->socket, mb, "TX: %s bufs: %d mseq:%ld ack:%d",
	    mid2str(h->mid), rx_ring_posted(ssk), mseq,
	    ntohl(h->mseq_ack));

	SDP_DUMP_PACKET(ssk->socket, "TX", mb, h);

	tx_req = &ssk->tx_ring.buffer[mseq & (SDP_TX_SIZE - 1)];
	tx_req->mb = mb;
	dev = ssk->ib_device;
	sge = &ibsge[0];
	for (i = 0; mb != NULL; i++, mb = mb->m_next, sge++) {
		addr = ib_dma_map_single(dev, mb->m_data, mb->m_len,
		    DMA_TO_DEVICE);
		/* TODO: proper error handling */
		BUG_ON(ib_dma_mapping_error(dev, addr));
		BUG_ON(i >= SDP_MAX_SEND_SGES);
		tx_req->mapping[i] = addr;
		sge->addr = addr;
		sge->length = mb->m_len;
		sge->lkey = ssk->sdp_dev->pd->local_dma_lkey;
	}
	tx_wr.next = NULL;
	tx_wr.wr_id = mseq | SDP_OP_SEND;
	tx_wr.sg_list = ibsge;
	tx_wr.num_sge = i;
	tx_wr.opcode = IB_WR_SEND;
	tx_wr.send_flags = IB_SEND_SIGNALED;
	if (unlikely(tx_req->mb->m_flags & M_URG))
		tx_wr.send_flags |= IB_SEND_SOLICITED;

	rc = ib_post_send(ssk->qp, &tx_wr, &bad_wr);
	if (unlikely(rc)) {
		sdp_dbg(ssk->socket,
		    "ib_post_send failed with status %d.\n", rc);

		sdp_cleanup_sdp_buf(ssk, tx_req, DMA_TO_DEVICE);

		sdp_notify(ssk, ECONNRESET);
		m_freem(tx_req->mb);
		return;
	}

	atomic_inc(&ssk->tx_ring.head);
	atomic_dec(&ssk->tx_ring.credits);
	atomic_set(&ssk->remote_credits, rx_ring_posted(ssk));
}
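
/*
 * Reclaim the transmit buffer for a completed send: verify that the
 * completion matches the ring tail, unmap its DMA segments and advance
 * the tail.  Returns the completed mbuf, or NULL on a bogus completion.
 */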
static struct mbuf *
sdp_send_completion(struct sdp_sock *ssk, int mseq)
{
	struct ib_device *dev;
	struct sdp_buf *tx_req;
	struct mbuf *mb = NULL;
	struct sdp_tx_ring *tx_ring = &ssk->tx_ring;

	if (unlikely(mseq != ring_tail(*tx_ring))) {
		printk(KERN_WARNING "Bogus send completion id %d tail %d\n",
		    mseq, ring_tail(*tx_ring));
		goto out;
	}

	dev = ssk->ib_device;
	tx_req = &tx_ring->buffer[mseq & (SDP_TX_SIZE - 1)];
	mb = tx_req->mb;
	sdp_cleanup_sdp_buf(ssk, tx_req, DMA_TO_DEVICE);

#ifdef SDP_ZCOPY
	/* TODO: AIO and real zcopy code; add their context support here */
	if (BZCOPY_STATE(mb))
		BZCOPY_STATE(mb)->busy--;
#endif

	atomic_inc(&tx_ring->tail);

out:
	return mb;
}

static int
sdp_handle_send_comp(struct sdp_sock *ssk, struct ib_wc *wc)
{
	struct mbuf *mb = NULL;
	struct sdp_bsdh *h;

	if (unlikely(wc->status)) {
		if (wc->status != IB_WC_WR_FLUSH_ERR) {
			sdp_prf(ssk->socket, mb, "Send completion with error. "
			    "Status %d", wc->status);
			sdp_dbg_data(ssk->socket, "Send completion with error. "
			    "Status %d\n", wc->status);
			sdp_notify(ssk, ECONNRESET);
		}
	}

	mb = sdp_send_completion(ssk, wc->wr_id);
	if (unlikely(!mb))
		return -1;

	h = mtod(mb, struct sdp_bsdh *);
	sdp_prf1(ssk->socket, mb, "tx completion. mseq:%d", ntohl(h->mseq));
	sdp_dbg(ssk->socket, "tx completion. %p %d mseq:%d",
	    mb, mb->m_pkthdr.len, ntohl(h->mseq));
	m_freem(mb);

	return 0;
}

static inline void
sdp_process_tx_wc(struct sdp_sock *ssk, struct ib_wc *wc)
{

	if (likely(wc->wr_id & SDP_OP_SEND)) {
		sdp_handle_send_comp(ssk, wc);
		return;
	}

#ifdef SDP_ZCOPY
	if (wc->wr_id & SDP_OP_RDMA) {
		/* TODO: handle failed RDMA read cqe */

		sdp_dbg_data(ssk->socket,
		    "TX comp: RDMA read. status: %d\n", wc->status);
		sdp_prf1(ssk->socket, NULL, "TX comp: RDMA read");

		if (!ssk->tx_ring.rdma_inflight) {
			sdp_warn(ssk->socket, "ERROR: unexpected RDMA read\n");
			return;
		}

		if (!ssk->tx_ring.rdma_inflight->busy) {
			sdp_warn(ssk->socket,
			    "ERROR: too many RDMA read completions\n");
			return;
		}

		/*
		 * Only the last RDMA read WR is signalled.  Ordering is
		 * guaranteed, so once the last RDMA read WR has completed,
		 * all the earlier ones have completed too.
		 */
		ssk->tx_ring.rdma_inflight->busy = 0;
		sowwakeup(ssk->socket);
		sdp_dbg_data(ssk->socket, "woke up sleepers\n");
		return;
	}
#endif

	/* Keepalive probe sent cleanup */
	sdp_cnt(sdp_keepalive_probes_sent);

	if (likely(!wc->status))
		return;

	sdp_dbg(ssk->socket, " %s consumes KEEPALIVE status %d\n",
	    __func__, wc->status);

	if (wc->status == IB_WC_WR_FLUSH_ERR)
		return;

	sdp_notify(ssk, ECONNRESET);
}
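
/*
 * Drain the TX completion queue.  Each reaped work completion releases its
 * send buffer (or keepalive/RDMA-read context); if anything was reaped,
 * refill the send queue and wake writers sleeping in sendmsg.
 */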
static int
sdp_process_tx_cq(struct sdp_sock *ssk)
{
	struct ib_wc ibwc[SDP_NUM_WC];
	int n, i;
	int wc_processed = 0;

	SDP_WLOCK_ASSERT(ssk);

	if (!ssk->tx_ring.cq) {
		sdp_dbg(ssk->socket, "tx irq on destroyed tx_cq\n");
		return 0;
	}

	do {
		n = ib_poll_cq(ssk->tx_ring.cq, SDP_NUM_WC, ibwc);
		for (i = 0; i < n; ++i) {
			sdp_process_tx_wc(ssk, ibwc + i);
			wc_processed++;
		}
	} while (n == SDP_NUM_WC);

	if (wc_processed) {
		sdp_post_sends(ssk, M_NOWAIT);
		sdp_prf1(ssk->socket, NULL, "Waking sendmsg. inflight=%d",
		    (u32) tx_ring_posted(ssk));
		sowwakeup(ssk->socket);
	}

	return wc_processed;
}

static void
sdp_poll_tx(struct sdp_sock *ssk)
{
	struct socket *sk = ssk->socket;
	u32 inflight, wc_processed;

	sdp_prf1(ssk->socket, NULL, "TX timeout: inflight=%d, head=%d tail=%d",
	    (u32) tx_ring_posted(ssk),
	    ring_head(ssk->tx_ring), ring_tail(ssk->tx_ring));

	if (unlikely(ssk->state == TCPS_CLOSED)) {
		sdp_warn(sk, "Socket is closed\n");
		goto out;
	}

	wc_processed = sdp_process_tx_cq(ssk);
	if (!wc_processed)
		SDPSTATS_COUNTER_INC(tx_poll_miss);
	else
		SDPSTATS_COUNTER_INC(tx_poll_hit);

	inflight = (u32) tx_ring_posted(ssk);
	sdp_prf1(ssk->socket, NULL, "finished tx processing. inflight = %d",
	    inflight);

	/*
	 * If there are still packets in flight and the timer has not already
	 * been scheduled by the TX routine, schedule it here to guarantee
	 * completion processing of these packets.
	 */
	if (inflight)
		callout_reset(&ssk->tx_ring.timer, SDP_TX_POLL_TIMEOUT,
		    sdp_poll_tx_timeout, ssk);
out:
#ifdef SDP_ZCOPY
	if (ssk->tx_ring.rdma_inflight && ssk->tx_ring.rdma_inflight->busy) {
		sdp_prf1(sk, NULL, "RDMA is inflight - arming irq");
		sdp_arm_tx_cq(ssk);
	}
#endif
	return;
}
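
/*
 * Callout handler armed by sdp_xmit_poll()/sdp_poll_tx().  The callout is
 * initialized on ssk->lock in sdp_tx_ring_create(), so it runs with the
 * socket lock held and may poll the TX CQ directly.
 */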
static void
sdp_poll_tx_timeout(void *data)
{
	struct sdp_sock *ssk = (struct sdp_sock *)data;

	if (!callout_active(&ssk->tx_ring.timer))
		return;
	callout_deactivate(&ssk->tx_ring.timer);
	sdp_poll_tx(ssk);
}

static void
sdp_tx_irq(struct ib_cq *cq, void *cq_context)
{
	struct sdp_sock *ssk;

	ssk = cq_context;
	sdp_prf1(ssk->socket, NULL, "tx irq");
	sdp_dbg_data(ssk->socket, "Got tx comp interrupt\n");
	SDPSTATS_COUNTER_INC(tx_int_count);
	SDP_WLOCK(ssk);
	sdp_poll_tx(ssk);
	SDP_WUNLOCK(ssk);
}

static void
sdp_tx_ring_purge(struct sdp_sock *ssk)
{

	while (tx_ring_posted(ssk)) {
		struct mbuf *mb;

		mb = sdp_send_completion(ssk, ring_tail(ssk->tx_ring));
		if (!mb)
			break;
		m_freem(mb);
	}
}

void
sdp_post_keepalive(struct sdp_sock *ssk)
{
	int rc;
	struct ib_send_wr wr;
	const struct ib_send_wr *bad_wr;

	sdp_dbg(ssk->socket, "%s\n", __func__);

	memset(&wr, 0, sizeof(wr));

	wr.next = NULL;
	wr.wr_id = 0;
	wr.sg_list = NULL;
	wr.num_sge = 0;
	wr.opcode = IB_WR_RDMA_WRITE;

	rc = ib_post_send(ssk->qp, &wr, &bad_wr);
	if (rc) {
		sdp_dbg(ssk->socket,
		    "ib_post_keepalive failed with status %d.\n", rc);
		sdp_notify(ssk, ECONNRESET);
	}

	sdp_cnt(sdp_keepalive_probes_sent);
}

static void
sdp_tx_cq_event_handler(struct ib_event *event, void *data)
{
}

int
sdp_tx_ring_create(struct sdp_sock *ssk, struct ib_device *device)
{
	struct ib_cq_init_attr tx_cq_attr = {
		.cqe = SDP_TX_SIZE,
		.comp_vector = 0,
		.flags = 0,
	};
	struct ib_cq *tx_cq;
	int rc = 0;

	sdp_dbg(ssk->socket, "tx ring create\n");
	callout_init_rw(&ssk->tx_ring.timer, &ssk->lock, 0);
	callout_init_rw(&ssk->nagle_timer, &ssk->lock, 0);
	atomic_set(&ssk->tx_ring.head, 1);
	atomic_set(&ssk->tx_ring.tail, 1);

	ssk->tx_ring.buffer = malloc(sizeof(*ssk->tx_ring.buffer) * SDP_TX_SIZE,
	    M_SDP, M_WAITOK);

	tx_cq = ib_create_cq(device, sdp_tx_irq, sdp_tx_cq_event_handler,
	    ssk, &tx_cq_attr);
	if (IS_ERR(tx_cq)) {
		rc = PTR_ERR(tx_cq);
		sdp_warn(ssk->socket, "Unable to allocate TX CQ: %d.\n", rc);
		goto err_cq;
	}
	ssk->tx_ring.cq = tx_cq;
	ssk->tx_ring.poll_cnt = 0;
	sdp_arm_tx_cq(ssk);

	return 0;

err_cq:
	free(ssk->tx_ring.buffer, M_SDP);
	ssk->tx_ring.buffer = NULL;
	return rc;
}
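
/*
 * Tear down the TX ring: stop and drain the timers, release any mbufs still
 * posted on the ring, free the buffer array and destroy the CQ.  Once the
 * ring has been purged, head and tail are expected to match.
 */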
void
sdp_tx_ring_destroy(struct sdp_sock *ssk)
{

	sdp_dbg(ssk->socket, "tx ring destroy\n");
	SDP_WLOCK(ssk);
	callout_stop(&ssk->tx_ring.timer);
	callout_stop(&ssk->nagle_timer);
	SDP_WUNLOCK(ssk);
	callout_drain(&ssk->tx_ring.timer);
	callout_drain(&ssk->nagle_timer);

	if (ssk->tx_ring.buffer) {
		sdp_tx_ring_purge(ssk);
		free(ssk->tx_ring.buffer, M_SDP);
		ssk->tx_ring.buffer = NULL;
	}

	if (ssk->tx_ring.cq) {
		ib_destroy_cq(ssk->tx_ring.cq);
		ssk->tx_ring.cq = NULL;
	}

	WARN_ON(ring_head(ssk->tx_ring) != ring_tail(ssk->tx_ring));
}