xref: /linux/net/sunrpc/xprtrdma/frwr_ops.c (revision 6fdcba32711044c35c0e1b094cbd8f3f0b4472c9)
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2015, 2017 Oracle.  All rights reserved.
 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
 */

/* Lightweight memory registration using Fast Registration Work
 * Requests (FRWR).
 *
 * FRWR features ordered asynchronous registration and invalidation
 * of arbitrarily sized memory regions. This is the fastest and safest
 * but most complex memory registration mode.
 */

/* Normal operation
 *
 * A Memory Region is prepared for RDMA Read or Write using a FAST_REG
 * Work Request (frwr_map). When the RDMA operation is finished, this
 * Memory Region is invalidated using a LOCAL_INV Work Request
 * (frwr_unmap_async and frwr_unmap_sync).
 *
 * Typically FAST_REG Work Requests are not signaled, and neither are
 * RDMA Send Work Requests (with the exception of signaling occasionally
 * to prevent provider work queue overflows). This greatly reduces HCA
 * interrupt workload.
 */

/* Transport recovery
 *
 * frwr_map and frwr_unmap_* cannot run at the same time the transport
 * connect worker is running. The connect worker holds the transport
 * send lock, just as ->send_request does. This prevents frwr_map and
 * the connect worker from running concurrently. When a connection is
 * closed, the Receive completion queue is drained before allowing
 * the connect worker to get control. This prevents frwr_unmap and the
 * connect worker from running concurrently.
 *
 * When the underlying transport disconnects, MRs that are in flight
 * are flushed and are likely unusable. Thus all MRs are destroyed.
 * New MRs are created on demand.
 */

#include <linux/sunrpc/rpc_rdma.h>
#include <linux/sunrpc/svc_rdma.h>

#include "xprt_rdma.h"
#include <trace/events/rpcrdma.h>

#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
# define RPCDBG_FACILITY	RPCDBG_TRANS
#endif

/**
 * frwr_is_supported - Check if device supports FRWR
 * @device: interface adapter to check
 *
 * Returns true if device supports FRWR, otherwise false
 */
bool frwr_is_supported(struct ib_device *device)
{
	struct ib_device_attr *attrs = &device->attrs;

	if (!(attrs->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS))
		goto out_not_supported;
	if (attrs->max_fast_reg_page_list_len == 0)
		goto out_not_supported;
	return true;

out_not_supported:
	pr_info("rpcrdma: 'frwr' mode is not supported by device %s\n",
		device->name);
	return false;
}

/**
 * frwr_release_mr - Destroy one MR
 * @mr: MR allocated by frwr_init_mr
 *
 */
void frwr_release_mr(struct rpcrdma_mr *mr)
{
	int rc;

	rc = ib_dereg_mr(mr->frwr.fr_mr);
	if (rc)
		trace_xprtrdma_frwr_dereg(mr, rc);
	kfree(mr->mr_sg);
	kfree(mr);
}

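/* frwr_mr_recycle - Discard an MR that is no longer usable
 *
 * Called when an MR may be in an indeterminate state, for example
 * after its Work Request flushed. The MR is DMA-unmapped if it is
 * still mapped, unlinked from the transport's list of MRs, and
 * destroyed. A fresh MR is allocated on demand to replace it.
 */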
static void frwr_mr_recycle(struct rpcrdma_mr *mr)
{
	struct rpcrdma_xprt *r_xprt = mr->mr_xprt;

	trace_xprtrdma_mr_recycle(mr);

	if (mr->mr_dir != DMA_NONE) {
		trace_xprtrdma_mr_unmap(mr);
		ib_dma_unmap_sg(r_xprt->rx_ia.ri_id->device,
				mr->mr_sg, mr->mr_nents, mr->mr_dir);
		mr->mr_dir = DMA_NONE;
	}

	spin_lock(&r_xprt->rx_buf.rb_lock);
	list_del(&mr->mr_all);
	r_xprt->rx_stats.mrs_recycled++;
	spin_unlock(&r_xprt->rx_buf.rb_lock);

	frwr_release_mr(mr);
}

/* frwr_reset - Place MRs back on the free list
 * @req: request to reset
 *
 * Used after a failed marshal. For FRWR, this means the MRs
 * don't have to be fully released and recreated.
 *
 * NB: This is safe only as long as none of @req's MRs are
 * involved with an ongoing asynchronous FAST_REG or LOCAL_INV
 * Work Request.
 */
void frwr_reset(struct rpcrdma_req *req)
{
	struct rpcrdma_mr *mr;

	while ((mr = rpcrdma_mr_pop(&req->rl_registered)))
		rpcrdma_mr_put(mr);
}

/**
 * frwr_init_mr - Initialize one MR
 * @ia: interface adapter
 * @mr: generic MR to prepare for FRWR
 *
 * Returns zero if successful. Otherwise a negative errno
 * is returned.
 */
int frwr_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr)
{
	unsigned int depth = ia->ri_max_frwr_depth;
	struct scatterlist *sg;
	struct ib_mr *frmr;
	int rc;

	frmr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype, depth);
	if (IS_ERR(frmr))
		goto out_mr_err;

	sg = kcalloc(depth, sizeof(*sg), GFP_NOFS);
	if (!sg)
		goto out_list_err;

	mr->frwr.fr_mr = frmr;
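	/* DMA_NONE marks this MR as not DMA-mapped. frwr_mr_recycle
	 * relies on this value to decide whether the scatterlist
	 * still needs to be unmapped before the MR is destroyed.
	 */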
	mr->mr_dir = DMA_NONE;
	INIT_LIST_HEAD(&mr->mr_list);
	init_completion(&mr->frwr.fr_linv_done);

	sg_init_table(sg, depth);
	mr->mr_sg = sg;
	return 0;

out_mr_err:
	rc = PTR_ERR(frmr);
	trace_xprtrdma_frwr_alloc(mr, rc);
	return rc;

out_list_err:
	ib_dereg_mr(frmr);
	return -ENOMEM;
}

/**
 * frwr_open - Prepare an endpoint for use with FRWR
 * @ia: interface adapter this endpoint will use
 * @ep: endpoint to prepare
 *
 * On success, sets:
 *	ep->rep_attr.cap.max_send_wr
 *	ep->rep_attr.cap.max_recv_wr
 *	ep->rep_max_requests
 *	ia->ri_max_segs
 *
 * And these FRWR-related fields:
 *	ia->ri_max_frwr_depth
 *	ia->ri_mrtype
 *
 * On failure, a negative errno is returned.
 */
int frwr_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep)
{
	struct ib_device_attr *attrs = &ia->ri_id->device->attrs;
	int max_qp_wr, depth, delta;

	ia->ri_mrtype = IB_MR_TYPE_MEM_REG;
	if (attrs->device_cap_flags & IB_DEVICE_SG_GAPS_REG)
		ia->ri_mrtype = IB_MR_TYPE_SG_GAPS;

	/* Quirk: Some devices advertise a large max_fast_reg_page_list_len
	 * capability, but perform optimally when the MRs are not larger
	 * than a page. When the device reports a max_sge_rd larger than
	 * one, use that value to bound the size of each MR instead.
	 */
	if (attrs->max_sge_rd > 1)
		ia->ri_max_frwr_depth = attrs->max_sge_rd;
	else
		ia->ri_max_frwr_depth = attrs->max_fast_reg_page_list_len;
	if (ia->ri_max_frwr_depth > RPCRDMA_MAX_DATA_SEGS)
		ia->ri_max_frwr_depth = RPCRDMA_MAX_DATA_SEGS;
	dprintk("RPC:       %s: max FR page list depth = %u\n",
		__func__, ia->ri_max_frwr_depth);

	/* Add room for frwr register and invalidate WRs.
	 * 1. FRWR reg WR for head
	 * 2. FRWR invalidate WR for head
	 * 3. N FRWR reg WRs for pagelist
	 * 4. N FRWR invalidate WRs for pagelist
	 * 5. FRWR reg WR for tail
	 * 6. FRWR invalidate WR for tail
	 * 7. The RDMA_SEND WR
	 */
	depth = 7;

	/* Calculate N if the device max FRWR depth is smaller than
	 * RPCRDMA_MAX_DATA_SEGS.
	 */
	if (ia->ri_max_frwr_depth < RPCRDMA_MAX_DATA_SEGS) {
		delta = RPCRDMA_MAX_DATA_SEGS - ia->ri_max_frwr_depth;
		do {
			depth += 2; /* FRWR reg + invalidate */
			delta -= ia->ri_max_frwr_depth;
		} while (delta > 0);
	}
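	/* For example, if RPCRDMA_MAX_DATA_SEGS were 64 and the
	 * device limited ri_max_frwr_depth to 11, delta would start
	 * at 53 and the loop would run five times, leaving
	 * depth = 7 + 5 * 2 = 17.
	 */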

	max_qp_wr = ia->ri_id->device->attrs.max_qp_wr;
	max_qp_wr -= RPCRDMA_BACKWARD_WRS;
	max_qp_wr -= 1;
	if (max_qp_wr < RPCRDMA_MIN_SLOT_TABLE)
		return -ENOMEM;
	if (ep->rep_max_requests > max_qp_wr)
		ep->rep_max_requests = max_qp_wr;
	ep->rep_attr.cap.max_send_wr = ep->rep_max_requests * depth;
	if (ep->rep_attr.cap.max_send_wr > max_qp_wr) {
		ep->rep_max_requests = max_qp_wr / depth;
		if (!ep->rep_max_requests)
			return -EINVAL;
		ep->rep_attr.cap.max_send_wr = ep->rep_max_requests * depth;
	}
	ep->rep_attr.cap.max_send_wr += RPCRDMA_BACKWARD_WRS;
	ep->rep_attr.cap.max_send_wr += 1; /* for ib_drain_sq */
	ep->rep_attr.cap.max_recv_wr = ep->rep_max_requests;
	ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS;
	ep->rep_attr.cap.max_recv_wr += 1; /* for ib_drain_rq */

	ia->ri_max_segs =
		DIV_ROUND_UP(RPCRDMA_MAX_DATA_SEGS, ia->ri_max_frwr_depth);
	/* Reply chunks require segments for head and tail buffers */
	ia->ri_max_segs += 2;
	if (ia->ri_max_segs > RPCRDMA_MAX_HDR_SEGS)
		ia->ri_max_segs = RPCRDMA_MAX_HDR_SEGS;
	return 0;
}

/**
 * frwr_maxpages - Compute size of largest payload
 * @r_xprt: transport
 *
 * Returns maximum size of an RPC message, in pages.
 *
 * FRWR mode conveys a list of pages per chunk segment. The
 * maximum length of that list is the FRWR page list depth.
 */
size_t frwr_maxpages(struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_ia *ia = &r_xprt->rx_ia;

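	/* ri_max_segs includes the two segments that frwr_open
	 * reserved for reply chunk head and tail buffers; subtract
	 * them before converting segments to pages.
	 */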
	return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
		     (ia->ri_max_segs - 2) * ia->ri_max_frwr_depth);
}

/**
 * frwr_map - Register a memory region
 * @r_xprt: controlling transport
 * @seg: memory region coordinates
 * @nsegs: number of segments remaining
 * @writing: true when RDMA Write will be used
 * @xid: XID of RPC using the registered memory
 * @mr: MR to fill in
 *
 * Prepare a REG_MR Work Request to register a memory region
 * for remote access via RDMA READ or RDMA WRITE.
 *
 * Returns the next segment or a negative errno pointer.
 * On success, @mr is filled in.
 */
struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
				struct rpcrdma_mr_seg *seg,
				int nsegs, bool writing, __be32 xid,
				struct rpcrdma_mr *mr)
{
	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
	struct ib_reg_wr *reg_wr;
	struct ib_mr *ibmr;
	int i, n;
	u8 key;

	if (nsegs > ia->ri_max_frwr_depth)
		nsegs = ia->ri_max_frwr_depth;
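	/* Build the scatterlist for this MR. Unless the device can
	 * register SGLs with gaps (IB_MR_TYPE_SG_GAPS), gathering
	 * stops at the first discontinuity: an interior segment
	 * must end, and the segment after it must begin, on a page
	 * boundary.
	 */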
	for (i = 0; i < nsegs;) {
		if (seg->mr_page)
			sg_set_page(&mr->mr_sg[i],
				    seg->mr_page,
				    seg->mr_len,
				    offset_in_page(seg->mr_offset));
		else
			sg_set_buf(&mr->mr_sg[i], seg->mr_offset,
				   seg->mr_len);

		++seg;
		++i;
		if (ia->ri_mrtype == IB_MR_TYPE_SG_GAPS)
			continue;
		if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
		    offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
			break;
	}
	mr->mr_dir = rpcrdma_data_dir(writing);

	mr->mr_nents =
		ib_dma_map_sg(ia->ri_id->device, mr->mr_sg, i, mr->mr_dir);
	if (!mr->mr_nents)
		goto out_dmamap_err;

	ibmr = mr->frwr.fr_mr;
	n = ib_map_mr_sg(ibmr, mr->mr_sg, mr->mr_nents, NULL, PAGE_SIZE);
	if (unlikely(n != mr->mr_nents))
		goto out_mapmr_err;

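	/* Stamp the RPC's XID into the upper 32 bits of the MR's
	 * IOVA so that RDMA operations seen on the wire or in
	 * traces can be matched to the RPC transaction that owns
	 * this MR. Bumping the rkey's low-order "key" octet
	 * distinguishes successive registrations of the same MR.
	 */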
	ibmr->iova &= 0x00000000ffffffff;
	ibmr->iova |= ((u64)be32_to_cpu(xid)) << 32;
	key = (u8)(ibmr->rkey & 0x000000FF);
	ib_update_fast_reg_key(ibmr, ++key);

	reg_wr = &mr->frwr.fr_regwr;
	reg_wr->mr = ibmr;
	reg_wr->key = ibmr->rkey;
	reg_wr->access = writing ?
			 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
			 IB_ACCESS_REMOTE_READ;

	mr->mr_handle = ibmr->rkey;
	mr->mr_length = ibmr->length;
	mr->mr_offset = ibmr->iova;
	trace_xprtrdma_mr_map(mr);

	return seg;

out_dmamap_err:
	mr->mr_dir = DMA_NONE;
	trace_xprtrdma_frwr_sgerr(mr, i);
	return ERR_PTR(-EIO);

out_mapmr_err:
	trace_xprtrdma_frwr_maperr(mr, n);
	return ERR_PTR(-EIO);
}

/**
 * frwr_wc_fastreg - Invoked by RDMA provider for a flushed FastReg WC
 * @cq:	completion queue (ignored)
 * @wc:	completed WR
 *
 */
static void frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc)
{
	struct ib_cqe *cqe = wc->wr_cqe;
	struct rpcrdma_frwr *frwr =
		container_of(cqe, struct rpcrdma_frwr, fr_cqe);

	/* WARNING: Only wr_cqe and status are reliable at this point */
	trace_xprtrdma_wc_fastreg(wc, frwr);
	/* The MR will get recycled when the associated req is retransmitted */
}

/**
 * frwr_send - post Send WR containing the RPC Call message
 * @ia: interface adapter
 * @req: Prepared RPC Call
 *
 * For FRWR, chain any FastReg WRs to the Send WR. Only a
 * single ib_post_send call is needed to register memory
 * and then post the Send WR.
 *
 * Returns the result of ib_post_send.
 */
int frwr_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
{
	struct ib_send_wr *post_wr;
	struct rpcrdma_mr *mr;

	post_wr = &req->rl_wr;
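	/* The chain is built back-to-front: each FastReg WR points
	 * to the WR built before it, and the Send WR sits at the
	 * end. The HCA thus executes every registration before the
	 * Send that advertises those MRs to the server.
	 */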
	list_for_each_entry(mr, &req->rl_registered, mr_list) {
		struct rpcrdma_frwr *frwr;

		frwr = &mr->frwr;

		frwr->fr_cqe.done = frwr_wc_fastreg;
		frwr->fr_regwr.wr.next = post_wr;
		frwr->fr_regwr.wr.wr_cqe = &frwr->fr_cqe;
		frwr->fr_regwr.wr.num_sge = 0;
		frwr->fr_regwr.wr.opcode = IB_WR_REG_MR;
		frwr->fr_regwr.wr.send_flags = 0;

		post_wr = &frwr->fr_regwr.wr;
	}

	return ib_post_send(ia->ri_id->qp, post_wr, NULL);
}

/**
 * frwr_reminv - handle a remotely invalidated MR on the @mrs list
 * @rep: Received reply
 * @mrs: list of MRs to check
 *
 */
void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs)
{
	struct rpcrdma_mr *mr;

	list_for_each_entry(mr, mrs, mr_list)
		if (mr->mr_handle == rep->rr_inv_rkey) {
			list_del_init(&mr->mr_list);
			trace_xprtrdma_mr_remoteinv(mr);
			rpcrdma_mr_put(mr);
			break;	/* only one invalidated MR per RPC */
		}
}

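/* __frwr_release_mr - Return an MR after its invalidation completes
 *
 * If the LOCAL_INV flushed, the MR's rkey state is unknown and the
 * MR cannot safely be reused, so it is destroyed. Otherwise it goes
 * back on the free list.
 */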
static void __frwr_release_mr(struct ib_wc *wc, struct rpcrdma_mr *mr)
{
	if (wc->status != IB_WC_SUCCESS)
		frwr_mr_recycle(mr);
	else
		rpcrdma_mr_put(mr);
}

/**
 * frwr_wc_localinv - Invoked by RDMA provider for a LOCAL_INV WC
 * @cq:	completion queue (ignored)
 * @wc:	completed WR
 *
 */
static void frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc)
{
	struct ib_cqe *cqe = wc->wr_cqe;
	struct rpcrdma_frwr *frwr =
		container_of(cqe, struct rpcrdma_frwr, fr_cqe);
	struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr);

	/* WARNING: Only wr_cqe and status are reliable at this point */
	trace_xprtrdma_wc_li(wc, frwr);
	__frwr_release_mr(wc, mr);
}

/**
 * frwr_wc_localinv_wake - Invoked by RDMA provider for a LOCAL_INV WC
 * @cq:	completion queue (ignored)
 * @wc:	completed WR
 *
 * Awaken anyone waiting for an MR to finish being fenced.
 */
static void frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc)
{
	struct ib_cqe *cqe = wc->wr_cqe;
	struct rpcrdma_frwr *frwr =
		container_of(cqe, struct rpcrdma_frwr, fr_cqe);
	struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr);

	/* WARNING: Only wr_cqe and status are reliable at this point */
	trace_xprtrdma_wc_li_wake(wc, frwr);
	__frwr_release_mr(wc, mr);
	complete(&frwr->fr_linv_done);
}

/**
 * frwr_unmap_sync - invalidate memory regions that were registered for @req
 * @r_xprt: controlling transport instance
 * @req: rpcrdma_req with a non-empty list of MRs to process
 *
 * Sleeps until it is safe for the host CPU to access the previously mapped
 * memory regions. This guarantees that registered MRs are properly fenced
 * from the server before the RPC consumer accesses the data in them. It
 * also ensures proper Send flow control: waking the next RPC waits until
 * this RPC has relinquished all its Send Queue entries.
 */
void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
{
	struct ib_send_wr *first, **prev, *last;
	const struct ib_send_wr *bad_wr;
	struct rpcrdma_frwr *frwr;
	struct rpcrdma_mr *mr;
	int rc;

	/* ORDER: Invalidate all of the MRs first
	 *
	 * Chain the LOCAL_INV Work Requests and post them with
	 * a single ib_post_send() call.
	 */
	frwr = NULL;
	prev = &first;
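	/* @prev holds the address at which the next WR should be
	 * linked in: initially &first, afterwards the previous WR's
	 * next field. The result is a singly linked chain headed
	 * by @first.
	 */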
	while ((mr = rpcrdma_mr_pop(&req->rl_registered))) {

		trace_xprtrdma_mr_localinv(mr);
		r_xprt->rx_stats.local_inv_needed++;

		frwr = &mr->frwr;
		frwr->fr_cqe.done = frwr_wc_localinv;
		last = &frwr->fr_invwr;
		last->next = NULL;
		last->wr_cqe = &frwr->fr_cqe;
		last->sg_list = NULL;
		last->num_sge = 0;
		last->opcode = IB_WR_LOCAL_INV;
		last->send_flags = IB_SEND_SIGNALED;
		last->ex.invalidate_rkey = mr->mr_handle;

		*prev = last;
		prev = &last->next;
	}

	/* Strong send queue ordering guarantees that when the
	 * last WR in the chain completes, all WRs in the chain
	 * are complete.
	 */
	frwr->fr_cqe.done = frwr_wc_localinv_wake;
	reinit_completion(&frwr->fr_linv_done);

	/* Transport disconnect drains the receive CQ before it
	 * replaces the QP. The RPC reply handler won't call us
	 * unless ri_id->qp is a valid pointer.
	 */
	bad_wr = NULL;
	rc = ib_post_send(r_xprt->rx_ia.ri_id->qp, first, &bad_wr);

	/* The final LOCAL_INV WR in the chain is supposed to
	 * do the wake. If it was never posted, the wake will
	 * not happen, so don't wait in that case.
	 */
	if (bad_wr != first)
		wait_for_completion(&frwr->fr_linv_done);
	if (!rc)
		return;

	/* Recycle MRs in the LOCAL_INV chain that did not get posted.
	 */
	trace_xprtrdma_post_linv(req, rc);
	while (bad_wr) {
		frwr = container_of(bad_wr, struct rpcrdma_frwr,
				    fr_invwr);
		mr = container_of(frwr, struct rpcrdma_mr, frwr);
		bad_wr = bad_wr->next;

		list_del_init(&mr->mr_list);
		frwr_mr_recycle(mr);
	}
}

/**
 * frwr_wc_localinv_done - Invoked by RDMA provider for a signaled LOCAL_INV WC
 * @cq:	completion queue (ignored)
 * @wc:	completed WR
 *
 */
static void frwr_wc_localinv_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct ib_cqe *cqe = wc->wr_cqe;
	struct rpcrdma_frwr *frwr =
		container_of(cqe, struct rpcrdma_frwr, fr_cqe);
	struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr);
	struct rpcrdma_rep *rep = mr->mr_req->rl_reply;

	/* WARNING: Only wr_cqe and status are reliable at this point */
	trace_xprtrdma_wc_li_done(wc, frwr);
	__frwr_release_mr(wc, mr);

	/* Ensure @rep is generated before __frwr_release_mr */
	smp_rmb();
	rpcrdma_complete_rqst(rep);
}

/**
 * frwr_unmap_async - invalidate memory regions that were registered for @req
 * @r_xprt: controlling transport instance
 * @req: rpcrdma_req with a non-empty list of MRs to process
 *
 * This guarantees that registered MRs are properly fenced from the
 * server before the RPC consumer accesses the data in them. It also
 * ensures proper Send flow control: waking the next RPC waits until
 * this RPC has relinquished all its Send Queue entries.
 */
void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
{
	struct ib_send_wr *first, *last, **prev;
	const struct ib_send_wr *bad_wr;
	struct rpcrdma_frwr *frwr;
	struct rpcrdma_mr *mr;
	int rc;

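	/* Unlike frwr_unmap_sync, this path does not sleep. The
	 * completion handler of the final WR in the chain
	 * (frwr_wc_localinv_done) invokes rpcrdma_complete_rqst
	 * once all of the MRs have been fenced.
	 */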
	/* Chain the LOCAL_INV Work Requests and post them with
	 * a single ib_post_send() call.
	 */
	frwr = NULL;
	prev = &first;
	while ((mr = rpcrdma_mr_pop(&req->rl_registered))) {

		trace_xprtrdma_mr_localinv(mr);
		r_xprt->rx_stats.local_inv_needed++;

		frwr = &mr->frwr;
		frwr->fr_cqe.done = frwr_wc_localinv;
		last = &frwr->fr_invwr;
		last->next = NULL;
		last->wr_cqe = &frwr->fr_cqe;
		last->sg_list = NULL;
		last->num_sge = 0;
		last->opcode = IB_WR_LOCAL_INV;
		last->send_flags = IB_SEND_SIGNALED;
		last->ex.invalidate_rkey = mr->mr_handle;

		*prev = last;
		prev = &last->next;
	}

	/* Strong send queue ordering guarantees that when the
	 * last WR in the chain completes, all WRs in the chain
	 * are complete. The last completion will wake up the
	 * RPC waiter.
	 */
	frwr->fr_cqe.done = frwr_wc_localinv_done;

	/* Transport disconnect drains the receive CQ before it
	 * replaces the QP. The RPC reply handler won't call us
	 * unless ri_id->qp is a valid pointer.
	 */
	bad_wr = NULL;
	rc = ib_post_send(r_xprt->rx_ia.ri_id->qp, first, &bad_wr);
	if (!rc)
		return;

	/* Recycle MRs in the LOCAL_INV chain that did not get posted.
	 */
	trace_xprtrdma_post_linv(req, rc);
	while (bad_wr) {
		frwr = container_of(bad_wr, struct rpcrdma_frwr, fr_invwr);
		mr = container_of(frwr, struct rpcrdma_mr, frwr);
		bad_wr = bad_wr->next;

		frwr_mr_recycle(mr);
	}

	/* The final LOCAL_INV WR in the chain is supposed to
	 * do the wake. If it was never posted, the wake will
	 * not happen, so wake here in that case.
	 */
	rpcrdma_complete_rqst(req->rl_reply);
}
665