xref: /freebsd/sys/ofed/drivers/infiniband/ulp/sdp/sdp_zcopy.c (revision 2a58b312b62f908ec92311d1bd8536dbaeb8e55b)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0
3  *
4  * Copyright (c) 2006 Mellanox Technologies Ltd.  All rights reserved.
5  *
6  * This software is available to you under a choice of one of two
7  * licenses.  You may choose to be licensed under the terms of the GNU
8  * General Public License (GPL) Version 2, available from the file
9  * COPYING in the main directory of this source tree, or the
10  * OpenIB.org BSD license below:
11  *
12  *     Redistribution and use in source and binary forms, with or
13  *     without modification, are permitted provided that the following
14  *     conditions are met:
15  *
16  *      - Redistributions of source code must retain the above
17  *        copyright notice, this list of conditions and the following
18  *        disclaimer.
19  *
20  *      - Redistributions in binary form must reproduce the above
21  *        copyright notice, this list of conditions and the following
22  *        disclaimer in the documentation and/or other materials
23  *        provided with the distribution.
24  *
25  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
26  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
27  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
28  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
29  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
30  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
31  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
32  * SOFTWARE.
33  */
34 #include <linux/tcp.h>
35 #include <asm/ioctls.h>
36 #include <linux/workqueue.h>
37 #include <linux/net.h>
38 #include <linux/socket.h>
39 #include <net/protocol.h>
40 #include <net/inet_common.h>
41 #include <rdma/rdma_cm.h>
42 #include <rdma/ib_verbs.h>
43 #include <rdma/ib_fmr_pool.h>
44 #include <rdma/ib_umem.h>
45 #include <net/tcp.h> /* for memcpy_toiovec */
46 #include <asm/io.h>
47 #include <asm/uaccess.h>
48 #include <linux/delay.h>
49 #include "sdp.h"
50 
51 static int sdp_post_srcavail(struct socket *sk, struct tx_srcavail_state *tx_sa)
52 {
53 	struct sdp_sock *ssk = sdp_sk(sk);
54 	struct mbuf *mb;
55 	int payload_len;
56 	struct page *payload_pg;
57 	int off, len;
58 	struct ib_umem_chunk *chunk;
59 
60 	WARN_ON(ssk->tx_sa);
61 
62 	BUG_ON(!tx_sa);
63 	BUG_ON(!tx_sa->fmr || !tx_sa->fmr->fmr->lkey);
64 	BUG_ON(!tx_sa->umem);
65 	BUG_ON(!tx_sa->umem->chunk_list.next);
66 
67 	chunk = list_entry(tx_sa->umem->chunk_list.next, struct ib_umem_chunk, list);
68 	BUG_ON(!chunk->nmap);
69 
70 	off = tx_sa->umem->offset;
71 	len = tx_sa->umem->length;
72 
73 	tx_sa->bytes_sent = tx_sa->bytes_acked = 0;
74 
75 	mb = sdp_alloc_mb_srcavail(sk, len, tx_sa->fmr->fmr->lkey, off, 0);
76 	if (!mb) {
77 		return -ENOMEM;
78 	}
79 	sdp_dbg_data(sk, "sending SrcAvail\n");
80 
81 	TX_SRCAVAIL_STATE(mb) = tx_sa; /* tx_sa is hanged on the mb
82 					 * but continue to live after mb is freed */
83 	ssk->tx_sa = tx_sa;
84 
85 	/* must have payload inlined in SrcAvail packet in combined mode */
86 	payload_len = MIN(tx_sa->umem->page_size - off, len);
87 	payload_len = MIN(payload_len, ssk->xmit_size_goal - sizeof(struct sdp_srcah));
88 	payload_pg  = sg_page(&chunk->page_list[0]);
89 	get_page(payload_pg);
90 
91 	sdp_dbg_data(sk, "payload: off: 0x%x, pg: %p, len: 0x%x\n",
92 		off, payload_pg, payload_len);
93 
94 	mb_fill_page_desc(mb, mb_shinfo(mb)->nr_frags,
95 			payload_pg, off, payload_len);
96 
97 	mb->len             += payload_len;
98 	mb->data_len         = payload_len;
99 	mb->truesize        += payload_len;
100 //	sk->sk_wmem_queued   += payload_len;
101 //	sk->sk_forward_alloc -= payload_len;
102 
103 	mb_entail(sk, ssk, mb);
104 
105 	ssk->write_seq += payload_len;
106 	SDP_SKB_CB(mb)->end_seq += payload_len;
107 
108 	tx_sa->bytes_sent = tx_sa->umem->length;
109 	tx_sa->bytes_acked = payload_len;
110 
111 	/* TODO: pushing the mb into the tx_queue should be enough */
112 
113 	return 0;
114 }
115 
116 static int sdp_post_srcavail_cancel(struct socket *sk)
117 {
118 	struct sdp_sock *ssk = sdp_sk(sk);
119 	struct mbuf *mb;
120 
121 	sdp_dbg_data(ssk->socket, "Posting srcavail cancel\n");
122 
123 	mb = sdp_alloc_mb_srcavail_cancel(sk, 0);
124 	mb_entail(sk, ssk, mb);
125 
126 	sdp_post_sends(ssk, 0);
127 
128 	schedule_delayed_work(&ssk->srcavail_cancel_work,
129 			SDP_SRCAVAIL_CANCEL_TIMEOUT);
130 
131 	return 0;
132 }
133 
134 void srcavail_cancel_timeout(struct work_struct *work)
135 {
136 	struct sdp_sock *ssk =
137 		container_of(work, struct sdp_sock, srcavail_cancel_work.work);
138 	struct socket *sk = ssk->socket;
139 
140 	lock_sock(sk);
141 
142 	sdp_dbg_data(sk, "both SrcAvail and SrcAvailCancel timedout."
143 			" closing connection\n");
144 	sdp_set_error(sk, -ECONNRESET);
145 	wake_up(&ssk->wq);
146 
147 	release_sock(sk);
148 }
149 
150 static int sdp_wait_rdmardcompl(struct sdp_sock *ssk, long *timeo_p,
151 		int ignore_signals)
152 {
153 	struct socket *sk = ssk->socket;
154 	int err = 0;
155 	long vm_wait = 0;
156 	long current_timeo = *timeo_p;
157 	struct tx_srcavail_state *tx_sa = ssk->tx_sa;
158 	DEFINE_WAIT(wait);
159 
160 	sdp_dbg_data(sk, "sleep till RdmaRdCompl. timeo = %ld.\n", *timeo_p);
161 	sdp_prf1(sk, NULL, "Going to sleep");
162 	while (ssk->qp_active) {
163 		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
164 
165 		if (unlikely(!*timeo_p)) {
166 			err = -ETIME;
167 			tx_sa->abort_flags |= TX_SA_TIMEDOUT;
168 			sdp_prf1(sk, NULL, "timeout");
169 			SDPSTATS_COUNTER_INC(zcopy_tx_timeout);
170 			break;
171 		}
172 
173 		else if (tx_sa->bytes_acked > tx_sa->bytes_sent) {
174 			err = -EINVAL;
175 			sdp_dbg_data(sk, "acked bytes > sent bytes\n");
176 			tx_sa->abort_flags |= TX_SA_ERROR;
177 			break;
178 		}
179 
180 		if (tx_sa->abort_flags & TX_SA_SENDSM) {
181 			sdp_prf1(sk, NULL, "Aborting SrcAvail sending");
182 			SDPSTATS_COUNTER_INC(zcopy_tx_aborted);
183 			err = -EAGAIN;
184 			break ;
185 		}
186 
187 		if (!ignore_signals) {
188 			if (signal_pending(current)) {
189 				err = -EINTR;
190 				sdp_prf1(sk, NULL, "signalled");
191 				tx_sa->abort_flags |= TX_SA_INTRRUPTED;
192 				break;
193 			}
194 
195 			if (ssk->rx_sa && (tx_sa->bytes_acked < tx_sa->bytes_sent)) {
196 				sdp_dbg_data(sk, "Crossing SrcAvail - aborting this\n");
197 				tx_sa->abort_flags |= TX_SA_CROSS_SEND;
198 				SDPSTATS_COUNTER_INC(zcopy_cross_send);
199 				err = -ETIME;
200 				break ;
201 			}
202 		}
203 
204 		posts_handler_put(ssk);
205 
206 		sk_wait_event(sk, &current_timeo,
207 				tx_sa->abort_flags &&
208 				ssk->rx_sa &&
209 				(tx_sa->bytes_acked < tx_sa->bytes_sent) &&
210 				vm_wait);
211 		sdp_dbg_data(ssk->socket, "woke up sleepers\n");
212 
213 		posts_handler_get(ssk);
214 
215 		if (tx_sa->bytes_acked == tx_sa->bytes_sent)
216 			break;
217 
218 		if (vm_wait) {
219 			vm_wait -= current_timeo;
220 			current_timeo = *timeo_p;
221 			if (current_timeo != MAX_SCHEDULE_TIMEOUT &&
222 			    (current_timeo -= vm_wait) < 0)
223 				current_timeo = 0;
224 			vm_wait = 0;
225 		}
226 		*timeo_p = current_timeo;
227 	}
228 
229 	finish_wait(sk->sk_sleep, &wait);
230 
231 	sdp_dbg_data(sk, "Finished waiting - RdmaRdCompl: %d/%d bytes, flags: 0x%x\n",
232 			tx_sa->bytes_acked, tx_sa->bytes_sent, tx_sa->abort_flags);
233 
234 	if (!ssk->qp_active) {
235 		sdp_dbg(sk, "QP destroyed while waiting\n");
236 		return -EINVAL;
237 	}
238 	return err;
239 }
240 
241 static void sdp_wait_rdma_wr_finished(struct sdp_sock *ssk)
242 {
243 	struct socket *sk = ssk->socket;
244 	long timeo = HZ * 5; /* Timeout for RDMA read */
245 	DEFINE_WAIT(wait);
246 
247 	sdp_dbg_data(sk, "Sleep till RDMA wr finished.\n");
248 	while (1) {
249 		prepare_to_wait(sk->sk_sleep, &wait, TASK_UNINTERRUPTIBLE);
250 
251 		if (!ssk->tx_ring.rdma_inflight->busy) {
252 			sdp_dbg_data(sk, "got rdma cqe\n");
253 			break;
254 		}
255 
256 		if (!ssk->qp_active) {
257 			sdp_dbg_data(sk, "QP destroyed\n");
258 			break;
259 		}
260 
261 		if (!timeo) {
262 			sdp_warn(sk, "Panic: Timed out waiting for RDMA read\n");
263 			WARN_ON(1);
264 			break;
265 		}
266 
267 		posts_handler_put(ssk);
268 
269 		sdp_prf1(sk, NULL, "Going to sleep");
270 		sk_wait_event(sk, &timeo,
271 			!ssk->tx_ring.rdma_inflight->busy);
272 		sdp_prf1(sk, NULL, "Woke up");
273 		sdp_dbg_data(ssk->socket, "woke up sleepers\n");
274 
275 		posts_handler_get(ssk);
276 	}
277 
278 	finish_wait(sk->sk_sleep, &wait);
279 
280 	sdp_dbg_data(sk, "Finished waiting\n");
281 }
282 
283 int sdp_post_rdma_rd_compl(struct sdp_sock *ssk,
284 		struct rx_srcavail_state *rx_sa)
285 {
286 	struct mbuf *mb;
287 	int copied = rx_sa->used - rx_sa->reported;
288 
289 	if (rx_sa->used <= rx_sa->reported)
290 		return 0;
291 
292 	mb = sdp_alloc_mb_rdmardcompl(ssk->socket, copied, 0);
293 
294 	rx_sa->reported += copied;
295 
296 	/* TODO: What if no tx_credits available? */
297 	sdp_post_send(ssk, mb);
298 
299 	return 0;
300 }
301 
/*
 * Allocate a SendSM message and hand it to sdp_post_send().
 *
 * Returns 0 on success, or -ENOMEM if the mbuf cannot be allocated (nothing
 * is posted in that case).
 */
int sdp_post_sendsm(struct socket *sk)
{
	struct mbuf *mb = sdp_alloc_mb_sendsm(sk, 0);

	if (!mb)
		return -ENOMEM;

	sdp_post_send(sdp_sk(sk), mb);

	return 0;
}
310 
311 static int sdp_update_iov_used(struct socket *sk, struct iovec *iov, int len)
312 {
313 	sdp_dbg_data(sk, "updating consumed 0x%x bytes from iov\n", len);
314 	while (len > 0) {
315 		if (iov->iov_len) {
316 			int copy = min_t(unsigned int, iov->iov_len, len);
317 			len -= copy;
318 			iov->iov_len -= copy;
319 			iov->iov_base += copy;
320 		}
321 		iov++;
322 	}
323 
324 	return 0;
325 }
326 
327 static inline int sge_bytes(struct ib_sge *sge, int sge_cnt)
328 {
329 	int bytes = 0;
330 
331 	while (sge_cnt > 0) {
332 		bytes += sge->length;
333 		sge++;
334 		sge_cnt--;
335 	}
336 
337 	return bytes;
338 }
339 void sdp_handle_sendsm(struct sdp_sock *ssk, u32 mseq_ack)
340 {
341 	struct socket *sk = ssk->socket;
342 	unsigned long flags;
343 
344 	spin_lock_irqsave(&ssk->tx_sa_lock, flags);
345 
346 	if (!ssk->tx_sa) {
347 		sdp_prf1(sk, NULL, "SendSM for cancelled/finished SrcAvail");
348 		goto out;
349 	}
350 
351 	if (ssk->tx_sa->mseq > mseq_ack) {
352 		sdp_dbg_data(sk, "SendSM arrived for old SrcAvail. "
353 			"SendSM mseq_ack: 0x%x, SrcAvail mseq: 0x%x\n",
354 			mseq_ack, ssk->tx_sa->mseq);
355 		goto out;
356 	}
357 
358 	sdp_dbg_data(sk, "Got SendSM - aborting SrcAvail\n");
359 
360 	ssk->tx_sa->abort_flags |= TX_SA_SENDSM;
361 	cancel_delayed_work(&ssk->srcavail_cancel_work);
362 
363 	wake_up(sk->sk_sleep);
364 	sdp_dbg_data(sk, "woke up sleepers\n");
365 
366 out:
367 	spin_unlock_irqrestore(&ssk->tx_sa_lock, flags);
368 }
369 
370 void sdp_handle_rdma_read_compl(struct sdp_sock *ssk, u32 mseq_ack,
371 		u32 bytes_completed)
372 {
373 	struct socket *sk = ssk->socket;
374 	unsigned long flags;
375 
376 	sdp_prf1(sk, NULL, "RdmaRdCompl ssk=%p tx_sa=%p", ssk, ssk->tx_sa);
377 	sdp_dbg_data(sk, "RdmaRdCompl ssk=%p tx_sa=%p\n", ssk, ssk->tx_sa);
378 
379 	spin_lock_irqsave(&ssk->tx_sa_lock, flags);
380 
381 	BUG_ON(!ssk);
382 
383 	if (!ssk->tx_sa) {
384 		sdp_dbg_data(sk, "Got RdmaRdCompl for aborted SrcAvail\n");
385 		goto out;
386 	}
387 
388 	if (ssk->tx_sa->mseq > mseq_ack) {
389 		sdp_dbg_data(sk, "RdmaRdCompl arrived for old SrcAvail. "
390 			"SendSM mseq_ack: 0x%x, SrcAvail mseq: 0x%x\n",
391 			mseq_ack, ssk->tx_sa->mseq);
392 		goto out;
393 	}
394 
395 	ssk->tx_sa->bytes_acked += bytes_completed;
396 
397 	wake_up(sk->sk_sleep);
398 	sdp_dbg_data(sk, "woke up sleepers\n");
399 
400 out:
401 	spin_unlock_irqrestore(&ssk->tx_sa_lock, flags);
402 	return;
403 }
404 
405 static unsigned long sdp_get_max_memlockable_bytes(unsigned long offset)
406 {
407 	unsigned long avail;
408 	unsigned long lock_limit;
409 
410 	if (capable(CAP_IPC_LOCK))
411 		return ULONG_MAX;
412 
413 	lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
414 	avail = lock_limit - (current->mm->locked_vm << PAGE_SHIFT);
415 
416 	return avail - offset;
417 }
418 
419 static int sdp_alloc_fmr(struct socket *sk, void *uaddr, size_t len,
420 	struct ib_pool_fmr **_fmr, struct ib_umem **_umem)
421 {
422 	struct ib_pool_fmr *fmr;
423 	struct ib_umem *umem;
424 	struct ib_device *dev;
425 	u64 *pages;
426 	struct ib_umem_chunk *chunk;
427 	int n, j, k;
428 	int rc = 0;
429 	unsigned long max_lockable_bytes;
430 
431 	if (unlikely(len > SDP_MAX_RDMA_READ_LEN)) {
432 		sdp_dbg_data(sk, "len:0x%lx > FMR_SIZE: 0x%lx\n",
433 			len, SDP_MAX_RDMA_READ_LEN);
434 		len = SDP_MAX_RDMA_READ_LEN;
435 	}
436 
437 	max_lockable_bytes = sdp_get_max_memlockable_bytes((unsigned long)uaddr & ~PAGE_MASK);
438 	if (unlikely(len > max_lockable_bytes)) {
439 		sdp_dbg_data(sk, "len:0x%lx > RLIMIT_MEMLOCK available: 0x%lx\n",
440 			len, max_lockable_bytes);
441 		len = max_lockable_bytes;
442 	}
443 
444 	sdp_dbg_data(sk, "user buf: %p, len:0x%lx max_lockable_bytes: 0x%lx\n",
445 			uaddr, len, max_lockable_bytes);
446 
447 	umem = ib_umem_get(&sdp_sk(sk)->context, (unsigned long)uaddr, len,
448 		IB_ACCESS_REMOTE_WRITE, 0);
449 
450 	if (IS_ERR(umem)) {
451 		rc = PTR_ERR(umem);
452 		sdp_warn(sk, "Error doing umem_get 0x%lx bytes: %d\n", len, rc);
453 		sdp_warn(sk, "RLIMIT_MEMLOCK: 0x%lx[cur] 0x%lx[max] CAP_IPC_LOCK: %d\n",
454 				current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur,
455 				current->signal->rlim[RLIMIT_MEMLOCK].rlim_max,
456 				capable(CAP_IPC_LOCK));
457 		goto err_umem_get;
458 	}
459 
460 	sdp_dbg_data(sk, "umem->offset = 0x%x, length = 0x%lx\n",
461 		umem->offset, umem->length);
462 
463 	pages = (u64 *) __get_free_page(GFP_KERNEL);
464 	if (!pages)
465 		goto err_pages_alloc;
466 
467 	n = 0;
468 
469 	dev = sdp_sk(sk)->ib_device;
470 	list_for_each_entry(chunk, &umem->chunk_list, list) {
471 		for (j = 0; j < chunk->nmap; ++j) {
472 			len = ib_sg_dma_len(dev,
473 					&chunk->page_list[j]) >> PAGE_SHIFT;
474 
475 			for (k = 0; k < len; ++k) {
476 				pages[n++] = ib_sg_dma_address(dev,
477 						&chunk->page_list[j]) +
478 					umem->page_size * k;
479 
480 			}
481 		}
482 	}
483 
484 	fmr = ib_fmr_pool_map_phys(sdp_sk(sk)->sdp_dev->fmr_pool, pages, n, 0);
485 	if (IS_ERR(fmr)) {
486 		sdp_warn(sk, "Error allocating fmr: %ld\n", PTR_ERR(fmr));
487 		goto err_fmr_alloc;
488 	}
489 
490 	free_page((unsigned long) pages);
491 
492 	*_umem = umem;
493 	*_fmr = fmr;
494 
495 	return 0;
496 
497 err_fmr_alloc:
498 	free_page((unsigned long) pages);
499 
500 err_pages_alloc:
501 	ib_umem_release(umem);
502 
503 err_umem_get:
504 
505 	return rc;
506 }
507 
508 void sdp_free_fmr(struct socket *sk, struct ib_pool_fmr **_fmr, struct ib_umem **_umem)
509 {
510 	if (!sdp_sk(sk)->qp_active)
511 		return;
512 
513 	ib_fmr_pool_unmap(*_fmr);
514 	*_fmr = NULL;
515 
516 	ib_umem_release(*_umem);
517 	*_umem = NULL;
518 }
519 
520 static int sdp_post_rdma_read(struct socket *sk, struct rx_srcavail_state *rx_sa)
521 {
522 	struct sdp_sock *ssk = sdp_sk(sk);
523 	struct ib_send_wr *bad_wr;
524 	struct ib_send_wr wr = { NULL };
525 	struct ib_sge sge;
526 
527 	wr.opcode = IB_WR_RDMA_READ;
528 	wr.next = NULL;
529 	wr.wr_id = SDP_OP_RDMA;
530 	wr.wr.rdma.rkey = rx_sa->rkey;
531 	wr.send_flags = 0;
532 
533 	ssk->tx_ring.rdma_inflight = rx_sa;
534 
535 	sge.addr = rx_sa->umem->offset;
536 	sge.length = rx_sa->umem->length;
537 	sge.lkey = rx_sa->fmr->fmr->lkey;
538 
539 	wr.wr.rdma.remote_addr = rx_sa->vaddr + rx_sa->used;
540 	wr.num_sge = 1;
541 	wr.sg_list = &sge;
542 	rx_sa->busy++;
543 
544 	wr.send_flags = IB_SEND_SIGNALED;
545 
546 	return ib_post_send(ssk->qp, &wr, &bad_wr);
547 }
548 
549 int sdp_rdma_to_iovec(struct socket *sk, struct iovec *iov, struct mbuf *mb,
550 		unsigned long *used)
551 {
552 	struct sdp_sock *ssk = sdp_sk(sk);
553 	struct rx_srcavail_state *rx_sa = RX_SRCAVAIL_STATE(mb);
554 	int got_srcavail_cancel;
555 	int rc = 0;
556 	int len = *used;
557 	int copied;
558 
559 	sdp_dbg_data(ssk->socket, "preparing RDMA read."
560 		" len: 0x%x. buffer len: 0x%lx\n", len, iov->iov_len);
561 
562 	sock_hold(sk, SOCK_REF_RDMA_RD);
563 
564 	if (len > rx_sa->len) {
565 		sdp_warn(sk, "len:0x%x > rx_sa->len: 0x%x\n", len, rx_sa->len);
566 		WARN_ON(1);
567 		len = rx_sa->len;
568 	}
569 
570 	rc = sdp_alloc_fmr(sk, iov->iov_base, len, &rx_sa->fmr, &rx_sa->umem);
571 	if (rc) {
572 		sdp_warn(sk, "Error allocating fmr: %d\n", rc);
573 		goto err_alloc_fmr;
574 	}
575 
576 	rc = sdp_post_rdma_read(sk, rx_sa);
577 	if (unlikely(rc)) {
578 		sdp_warn(sk, "ib_post_send failed with status %d.\n", rc);
579 		sdp_set_error(ssk->socket, -ECONNRESET);
580 		wake_up(&ssk->wq);
581 		goto err_post_send;
582 	}
583 
584 	sdp_prf(sk, mb, "Finished posting(rc=%d), now to wait", rc);
585 
586 	got_srcavail_cancel = ssk->srcavail_cancel_mseq > rx_sa->mseq;
587 
588 	sdp_arm_tx_cq(sk);
589 
590 	sdp_wait_rdma_wr_finished(ssk);
591 
592 	sdp_prf(sk, mb, "Finished waiting(rc=%d)", rc);
593 	if (!ssk->qp_active) {
594 		sdp_dbg_data(sk, "QP destroyed during RDMA read\n");
595 		rc = -EPIPE;
596 		goto err_post_send;
597 	}
598 
599 	copied = rx_sa->umem->length;
600 
601 	sdp_update_iov_used(sk, iov, copied);
602 	rx_sa->used += copied;
603 	atomic_add(copied, &ssk->rcv_nxt);
604 	*used = copied;
605 
606 	ssk->tx_ring.rdma_inflight = NULL;
607 
608 err_post_send:
609 	sdp_free_fmr(sk, &rx_sa->fmr, &rx_sa->umem);
610 
611 err_alloc_fmr:
612 	if (rc && ssk->qp_active) {
613 		sdp_warn(sk, "Couldn't do RDMA - post sendsm\n");
614 		rx_sa->flags |= RX_SA_ABORTED;
615 	}
616 
617 	sock_put(sk, SOCK_REF_RDMA_RD);
618 
619 	return rc;
620 }
621 
622 static inline int wait_for_sndbuf(struct socket *sk, long *timeo_p)
623 {
624 	struct sdp_sock *ssk = sdp_sk(sk);
625 	int ret = 0;
626 	int credits_needed = 1;
627 
628 	sdp_dbg_data(sk, "Wait for mem\n");
629 
630 	set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
631 
632 	SDPSTATS_COUNTER_INC(send_wait_for_mem);
633 
634 	sdp_do_posts(ssk);
635 
636 	sdp_xmit_poll(ssk, 1);
637 
638 	ret = sdp_tx_wait_memory(ssk, timeo_p, &credits_needed);
639 
640 	return ret;
641 }
642 
643 static int do_sdp_sendmsg_zcopy(struct socket *sk, struct tx_srcavail_state *tx_sa,
644 		struct iovec *iov, long *timeo)
645 {
646 	struct sdp_sock *ssk = sdp_sk(sk);
647 	int rc = 0;
648 	unsigned long lock_flags;
649 
650 	rc = sdp_alloc_fmr(sk, iov->iov_base, iov->iov_len,
651 			&tx_sa->fmr, &tx_sa->umem);
652 	if (rc) {
653 		sdp_warn(sk, "Error allocating fmr: %d\n", rc);
654 		goto err_alloc_fmr;
655 	}
656 
657 	if (tx_slots_free(ssk) == 0) {
658 		rc = wait_for_sndbuf(sk, timeo);
659 		if (rc) {
660 			sdp_warn(sk, "Couldn't get send buffer\n");
661 			goto err_no_tx_slots;
662 		}
663 	}
664 
665 	rc = sdp_post_srcavail(sk, tx_sa);
666 	if (rc) {
667 		sdp_dbg(sk, "Error posting SrcAvail\n");
668 		goto err_abort_send;
669 	}
670 
671 	rc = sdp_wait_rdmardcompl(ssk, timeo, 0);
672 	if (unlikely(rc)) {
673 		enum tx_sa_flag f = tx_sa->abort_flags;
674 
675 		if (f & TX_SA_SENDSM) {
676 			sdp_dbg_data(sk, "Got SendSM. use SEND verb.\n");
677 		} else if (f & TX_SA_ERROR) {
678 			sdp_dbg_data(sk, "SrcAvail error completion\n");
679 			sdp_reset(sk);
680 			SDPSTATS_COUNTER_INC(zcopy_tx_error);
681 		} else if (ssk->qp_active) {
682 			sdp_post_srcavail_cancel(sk);
683 
684 			/* Wait for RdmaRdCompl/SendSM to
685 			 * finish the transaction */
686 			*timeo = 2 * HZ;
687 			sdp_dbg_data(sk, "Waiting for SendSM\n");
688 			sdp_wait_rdmardcompl(ssk, timeo, 1);
689 			sdp_dbg_data(sk, "finished waiting\n");
690 
691 			cancel_delayed_work(&ssk->srcavail_cancel_work);
692 		} else {
693 			sdp_dbg_data(sk, "QP was destroyed while waiting\n");
694 		}
695 	} else {
696 		sdp_dbg_data(sk, "got RdmaRdCompl\n");
697 	}
698 
699 	spin_lock_irqsave(&ssk->tx_sa_lock, lock_flags);
700 	ssk->tx_sa = NULL;
701 	spin_unlock_irqrestore(&ssk->tx_sa_lock, lock_flags);
702 
703 err_abort_send:
704 	sdp_update_iov_used(sk, iov, tx_sa->bytes_acked);
705 
706 err_no_tx_slots:
707 	sdp_free_fmr(sk, &tx_sa->fmr, &tx_sa->umem);
708 
709 err_alloc_fmr:
710 	return rc;
711 }
712 
713 int sdp_sendmsg_zcopy(struct kiocb *iocb, struct socket *sk, struct iovec *iov)
714 {
715 	struct sdp_sock *ssk = sdp_sk(sk);
716 	int rc = 0;
717 	long timeo;
718 	struct tx_srcavail_state *tx_sa;
719 	int offset;
720 	size_t bytes_to_copy = 0;
721 	int copied = 0;
722 
723 	sdp_dbg_data(sk, "Sending iov: %p, iov_len: 0x%lx\n",
724 			iov->iov_base, iov->iov_len);
725 	sdp_prf1(sk, NULL, "sdp_sendmsg_zcopy start");
726 	if (ssk->rx_sa) {
727 		sdp_dbg_data(sk, "Deadlock prevent: crossing SrcAvail\n");
728 		return 0;
729 	}
730 
731 	sock_hold(ssk->socket, SOCK_REF_ZCOPY);
732 
733 	SDPSTATS_COUNTER_INC(sendmsg_zcopy_segment);
734 
735 	timeo = SDP_SRCAVAIL_ADV_TIMEOUT ;
736 
737 	/* Ok commence sending. */
738 	offset = (unsigned long)iov->iov_base & (PAGE_SIZE - 1);
739 
740 	tx_sa = kmalloc(sizeof(struct tx_srcavail_state), GFP_KERNEL);
741 	if (!tx_sa) {
742 		sdp_warn(sk, "Error allocating zcopy context\n");
743 		rc = -EAGAIN; /* Buffer too big - fallback to bcopy */
744 		goto err_alloc_tx_sa;
745 	}
746 
747 	bytes_to_copy = iov->iov_len;
748 	do {
749 		tx_sa_reset(tx_sa);
750 
751 		rc = do_sdp_sendmsg_zcopy(sk, tx_sa, iov, &timeo);
752 
753 		if (iov->iov_len && iov->iov_len < sdp_zcopy_thresh) {
754 			sdp_dbg_data(sk, "0x%lx bytes left, switching to bcopy\n",
755 				iov->iov_len);
756 			break;
757 		}
758 	} while (!rc && iov->iov_len > 0 && !tx_sa->abort_flags);
759 
760 	kfree(tx_sa);
761 err_alloc_tx_sa:
762 	copied = bytes_to_copy - iov->iov_len;
763 
764 	sdp_prf1(sk, NULL, "sdp_sendmsg_zcopy end rc: %d copied: %d", rc, copied);
765 
766 	sock_put(ssk->socket, SOCK_REF_ZCOPY);
767 
768 	if (rc < 0 && rc != -EAGAIN && rc != -ETIME)
769 		return rc;
770 
771 	return copied;
772 }
773 
774 void sdp_abort_srcavail(struct socket *sk)
775 {
776 	struct sdp_sock *ssk = sdp_sk(sk);
777 	struct tx_srcavail_state *tx_sa = ssk->tx_sa;
778 	unsigned long flags;
779 
780 	if (!tx_sa)
781 		return;
782 
783 	cancel_delayed_work(&ssk->srcavail_cancel_work);
784 	flush_scheduled_work();
785 
786 	spin_lock_irqsave(&ssk->tx_sa_lock, flags);
787 
788 	sdp_free_fmr(sk, &tx_sa->fmr, &tx_sa->umem);
789 
790 	ssk->tx_sa = NULL;
791 
792 	spin_unlock_irqrestore(&ssk->tx_sa_lock, flags);
793 }
794 
795 void sdp_abort_rdma_read(struct socket *sk)
796 {
797 	struct sdp_sock *ssk = sdp_sk(sk);
798 	struct rx_srcavail_state *rx_sa = ssk->rx_sa;
799 
800 	if (!rx_sa)
801 		return;
802 
803 	sdp_free_fmr(sk, &rx_sa->fmr, &rx_sa->umem);
804 
805 	ssk->rx_sa = NULL;
806 }
807