1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Copyright (c) 2016-2018 Oracle. All rights reserved.
4 *
5 * Use the core R/W API to move RPC-over-RDMA Read and Write chunks.
6 */
7
8 #include <rdma/rw.h>
9
10 #include <linux/sunrpc/xdr.h>
11 #include <linux/sunrpc/rpc_rdma.h>
12 #include <linux/sunrpc/svc_rdma.h>
13
14 #include "xprt_rdma.h"
15 #include <trace/events/rpcrdma.h>
16
17 static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc);
18 static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc);
19
20 /* Each R/W context contains state for one chain of RDMA Read or
21 * Write Work Requests.
22 *
23 * Each WR chain handles a single contiguous server-side buffer,
24 * because scatterlist entries after the first have to start on
25 * page alignment. xdr_buf iovecs cannot guarantee alignment.
26 *
27 * Each WR chain handles only one R_key. Each RPC-over-RDMA segment
28 * from a client may contain a unique R_key, so each WR chain moves
29 * up to one segment at a time.
30 *
31 * The scatterlist makes this data structure over 4KB in size. To
32 * make it less likely to fail, and to handle the allocation for
33 * smaller I/O requests without disabling bottom-halves, these
34 * contexts are created on demand, but cached and reused until the
35 * controlling svcxprt_rdma is destroyed.
36 */
37 struct svc_rdma_rw_ctxt {
38 struct llist_node rw_node;
39 struct list_head rw_list;
40 struct rdma_rw_ctx rw_ctx;
41 unsigned int rw_nents;
42 unsigned int rw_first_sgl_nents;
43 struct sg_table rw_sg_table;
44 struct scatterlist rw_first_sgl[];
45 };
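/* Note: rw_first_sgl[] is sized at allocation time to the device's
 * max_send_sge, so small I/O needs no separate scatterlist allocation;
 * sg_alloc_table_chained() chains extra entries only when a request
 * needs more than that.
 */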
46
47 static inline struct svc_rdma_rw_ctxt *
48 svc_rdma_next_ctxt(struct list_head *list)
49 {
50 return list_first_entry_or_null(list, struct svc_rdma_rw_ctxt,
51 rw_list);
52 }
53
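/* Pop a cached R/W context from the transport's free list, or allocate
 * a fresh one on the device's NUMA node. @sges is the number of
 * scatterlist entries the caller needs; returns NULL if no context
 * can be provided.
 */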
54 static struct svc_rdma_rw_ctxt *
55 svc_rdma_get_rw_ctxt(struct svcxprt_rdma *rdma, unsigned int sges)
56 {
57 struct ib_device *dev = rdma->sc_cm_id->device;
58 unsigned int first_sgl_nents = dev->attrs.max_send_sge;
59 struct svc_rdma_rw_ctxt *ctxt;
60 struct llist_node *node;
61
62 spin_lock(&rdma->sc_rw_ctxt_lock);
63 node = llist_del_first(&rdma->sc_rw_ctxts);
64 spin_unlock(&rdma->sc_rw_ctxt_lock);
65 if (node) {
66 ctxt = llist_entry(node, struct svc_rdma_rw_ctxt, rw_node);
67 } else {
68 ctxt = kmalloc_node(struct_size(ctxt, rw_first_sgl, first_sgl_nents),
69 GFP_KERNEL, ibdev_to_node(dev));
70 if (!ctxt)
71 goto out_noctx;
72
73 INIT_LIST_HEAD(&ctxt->rw_list);
74 ctxt->rw_first_sgl_nents = first_sgl_nents;
75 }
76
77 ctxt->rw_sg_table.sgl = ctxt->rw_first_sgl;
78 if (sg_alloc_table_chained(&ctxt->rw_sg_table, sges,
79 ctxt->rw_sg_table.sgl,
80 first_sgl_nents))
81 goto out_free;
82 return ctxt;
83
84 out_free:
85 kfree(ctxt);
86 out_noctx:
87 trace_svcrdma_rwctx_empty(rdma, sges);
88 return NULL;
89 }
90
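/* Release the context's scatterlist and queue the context on @list
 * for later reuse.
 */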
91 static void __svc_rdma_put_rw_ctxt(struct svc_rdma_rw_ctxt *ctxt,
92 struct llist_head *list)
93 {
94 sg_free_table_chained(&ctxt->rw_sg_table, ctxt->rw_first_sgl_nents);
95 llist_add(&ctxt->rw_node, list);
96 }
97
98 static void svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma,
99 struct svc_rdma_rw_ctxt *ctxt)
100 {
101 __svc_rdma_put_rw_ctxt(ctxt, &rdma->sc_rw_ctxts);
102 }
103
104 /**
105 * svc_rdma_destroy_rw_ctxts - Free accumulated R/W contexts
106 * @rdma: transport about to be destroyed
107 *
108 */
109 void svc_rdma_destroy_rw_ctxts(struct svcxprt_rdma *rdma)
110 {
111 struct svc_rdma_rw_ctxt *ctxt;
112 struct llist_node *node;
113
114 while ((node = llist_del_first(&rdma->sc_rw_ctxts)) != NULL) {
115 ctxt = llist_entry(node, struct svc_rdma_rw_ctxt, rw_node);
116 kfree(ctxt);
117 }
118 }
119
120 /**
121 * svc_rdma_rw_ctx_init - Prepare a R/W context for I/O
122 * @rdma: controlling transport instance
123 * @ctxt: R/W context to prepare
124 * @offset: RDMA offset
125 * @handle: RDMA tag/handle
126 * @direction: I/O direction
127 *
128 * Returns the number of WQEs that will be needed on the Send Queue
129 * to post this context's Work Requests, or a negative errno.
130 */
131 static int svc_rdma_rw_ctx_init(struct svcxprt_rdma *rdma,
132 struct svc_rdma_rw_ctxt *ctxt,
133 u64 offset, u32 handle,
134 enum dma_data_direction direction)
135 {
136 int ret;
137
138 ret = rdma_rw_ctx_init(&ctxt->rw_ctx, rdma->sc_qp, rdma->sc_port_num,
139 ctxt->rw_sg_table.sgl, ctxt->rw_nents,
140 0, offset, handle, direction);
141 if (unlikely(ret < 0)) {
142 trace_svcrdma_dma_map_rw_err(rdma, offset, handle,
143 ctxt->rw_nents, ret);
144 svc_rdma_put_rw_ctxt(rdma, ctxt);
145 }
146 return ret;
147 }
148
149 /**
150 * svc_rdma_cc_init - Initialize an svc_rdma_chunk_ctxt
151 * @rdma: controlling transport instance
152 * @cc: svc_rdma_chunk_ctxt to be initialized
153 */
154 void svc_rdma_cc_init(struct svcxprt_rdma *rdma,
155 struct svc_rdma_chunk_ctxt *cc)
156 {
157 struct rpc_rdma_cid *cid = &cc->cc_cid;
158
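/* Assign a completion ID only on first use; the ID then stays stable
 * for the life of the context that embeds this cc.
 */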
159 if (unlikely(!cid->ci_completion_id))
160 svc_rdma_send_cid_init(rdma, cid);
161
162 INIT_LIST_HEAD(&cc->cc_rwctxts);
163 cc->cc_sqecount = 0;
164 }
165
166 /**
167 * svc_rdma_cc_release - Release resources held by a svc_rdma_chunk_ctxt
168 * @rdma: controlling transport instance
169 * @cc: svc_rdma_chunk_ctxt to be released
170 * @dir: DMA direction
171 */
172 void svc_rdma_cc_release(struct svcxprt_rdma *rdma,
173 struct svc_rdma_chunk_ctxt *cc,
174 enum dma_data_direction dir)
175 {
176 struct llist_node *first, *last;
177 struct svc_rdma_rw_ctxt *ctxt;
179
180 trace_svcrdma_cc_release(&cc->cc_cid, cc->cc_sqecount);
181
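/* Chain the released contexts together locally so they can be returned
 * to sc_rw_ctxts with a single llist_add_batch() below.
 */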
182 first = last = NULL;
183 while ((ctxt = svc_rdma_next_ctxt(&cc->cc_rwctxts)) != NULL) {
184 list_del(&ctxt->rw_list);
185
186 rdma_rw_ctx_destroy(&ctxt->rw_ctx, rdma->sc_qp,
187 rdma->sc_port_num, ctxt->rw_sg_table.sgl,
188 ctxt->rw_nents, dir);
189 sg_free_table_chained(&ctxt->rw_sg_table, ctxt->rw_first_sgl_nents);
190
191 ctxt->rw_node.next = first;
192 first = &ctxt->rw_node;
193 if (!last)
194 last = first;
195 }
196 if (first)
197 llist_add_batch(first, last, &rdma->sc_rw_ctxts);
198 }
199
200 static struct svc_rdma_write_info *
201 svc_rdma_write_info_alloc(struct svcxprt_rdma *rdma,
202 const struct svc_rdma_chunk *chunk)
203 {
204 struct svc_rdma_write_info *info;
205
206 info = kzalloc_node(sizeof(*info), GFP_KERNEL,
207 ibdev_to_node(rdma->sc_cm_id->device));
208 if (!info)
209 return info;
210
211 info->wi_rdma = rdma;
212 info->wi_chunk = chunk;
213 svc_rdma_cc_init(rdma, &info->wi_cc);
214 info->wi_cc.cc_cqe.done = svc_rdma_write_done;
215 return info;
216 }
217
218 static void svc_rdma_write_info_free_async(struct work_struct *work)
219 {
220 struct svc_rdma_write_info *info;
221
222 info = container_of(work, struct svc_rdma_write_info, wi_work);
223 svc_rdma_cc_release(info->wi_rdma, &info->wi_cc, DMA_TO_DEVICE);
224 kfree(info);
225 }
226
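/* Releasing the rdma_rw contexts involves DMA unmapping, which can be
 * expensive, so the release is deferred to svcrdma_wq rather than being
 * done directly in the Write completion handler.
 */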
227 static void svc_rdma_write_info_free(struct svc_rdma_write_info *info)
228 {
229 INIT_WORK(&info->wi_work, svc_rdma_write_info_free_async);
230 queue_work(svcrdma_wq, &info->wi_work);
231 }
232
233 /**
234 * svc_rdma_reply_chunk_release - Release Reply chunk I/O resources
235 * @rdma: controlling transport
236 * @ctxt: Send context that is being released
237 */
238 void svc_rdma_reply_chunk_release(struct svcxprt_rdma *rdma,
239 struct svc_rdma_send_ctxt *ctxt)
240 {
241 struct svc_rdma_chunk_ctxt *cc = &ctxt->sc_reply_info.wi_cc;
242
243 if (!cc->cc_sqecount)
244 return;
245 svc_rdma_cc_release(rdma, cc, DMA_TO_DEVICE);
246 }
247
248 /**
249 * svc_rdma_reply_done - Reply chunk Write completion handler
250 * @cq: controlling Completion Queue
251 * @wc: Work Completion report
252 *
253 * Pages under I/O are released by a subsequent Send completion.
254 */
255 static void svc_rdma_reply_done(struct ib_cq *cq, struct ib_wc *wc)
256 {
257 struct ib_cqe *cqe = wc->wr_cqe;
258 struct svc_rdma_chunk_ctxt *cc =
259 container_of(cqe, struct svc_rdma_chunk_ctxt, cc_cqe);
260 struct svcxprt_rdma *rdma = cq->cq_context;
261
262 switch (wc->status) {
263 case IB_WC_SUCCESS:
264 trace_svcrdma_wc_reply(&cc->cc_cid);
265 return;
266 case IB_WC_WR_FLUSH_ERR:
267 trace_svcrdma_wc_reply_flush(wc, &cc->cc_cid);
268 break;
269 default:
270 trace_svcrdma_wc_reply_err(wc, &cc->cc_cid);
271 }
272
273 svc_xprt_deferred_close(&rdma->sc_xprt);
274 }
275
276 /**
277 * svc_rdma_write_done - Write chunk completion
278 * @cq: controlling Completion Queue
279 * @wc: Work Completion
280 *
281 * Pages under I/O are freed by a subsequent Send completion.
282 */
283 static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc)
284 {
285 struct svcxprt_rdma *rdma = cq->cq_context;
286 struct ib_cqe *cqe = wc->wr_cqe;
287 struct svc_rdma_chunk_ctxt *cc =
288 container_of(cqe, struct svc_rdma_chunk_ctxt, cc_cqe);
289 struct svc_rdma_write_info *info =
290 container_of(cc, struct svc_rdma_write_info, wi_cc);
291
292 switch (wc->status) {
293 case IB_WC_SUCCESS:
294 trace_svcrdma_wc_write(&cc->cc_cid);
295 break;
296 case IB_WC_WR_FLUSH_ERR:
297 trace_svcrdma_wc_write_flush(wc, &cc->cc_cid);
298 break;
299 default:
300 trace_svcrdma_wc_write_err(wc, &cc->cc_cid);
301 }
302
303 svc_rdma_wake_send_waiters(rdma, cc->cc_sqecount);
304
305 if (unlikely(wc->status != IB_WC_SUCCESS))
306 svc_xprt_deferred_close(&rdma->sc_xprt);
307
308 svc_rdma_write_info_free(info);
309 }
310
311 /**
312 * svc_rdma_wc_read_done - Handle completion of an RDMA Read ctx
313 * @cq: controlling Completion Queue
314 * @wc: Work Completion
315 *
316 */
317 static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc)
318 {
319 struct svcxprt_rdma *rdma = cq->cq_context;
320 struct ib_cqe *cqe = wc->wr_cqe;
321 struct svc_rdma_chunk_ctxt *cc =
322 container_of(cqe, struct svc_rdma_chunk_ctxt, cc_cqe);
323 struct svc_rdma_recv_ctxt *ctxt;
324
325 svc_rdma_wake_send_waiters(rdma, cc->cc_sqecount);
326
327 ctxt = container_of(cc, struct svc_rdma_recv_ctxt, rc_cc);
328 switch (wc->status) {
329 case IB_WC_SUCCESS:
330 trace_svcrdma_wc_read(wc, &cc->cc_cid, ctxt->rc_readbytes,
331 cc->cc_posttime);
332
333 spin_lock(&rdma->sc_rq_dto_lock);
334 list_add_tail(&ctxt->rc_list, &rdma->sc_read_complete_q);
335 /* the unlock pairs with the smp_rmb in svc_xprt_ready */
336 set_bit(XPT_DATA, &rdma->sc_xprt.xpt_flags);
337 spin_unlock(&rdma->sc_rq_dto_lock);
338 svc_xprt_enqueue(&rdma->sc_xprt);
339 return;
340 case IB_WC_WR_FLUSH_ERR:
341 trace_svcrdma_wc_read_flush(wc, &cc->cc_cid);
342 break;
343 default:
344 trace_svcrdma_wc_read_err(wc, &cc->cc_cid);
345 }
346
347 /* The RDMA Read has flushed, so the incoming RPC message
348 * cannot be constructed and must be dropped. Signal the
349 * loss to the client by closing the connection.
350 */
351 svc_rdma_cc_release(rdma, cc, DMA_FROM_DEVICE);
352 svc_rdma_recv_ctxt_put(rdma, ctxt);
353 svc_xprt_deferred_close(&rdma->sc_xprt);
354 }
355
356 /*
357 * Assumptions:
358 * - If ib_post_send() succeeds, only one completion is expected,
359 * even if one or more WRs are flushed. This is true when posting
360 * an rdma_rw_ctx or when posting a single signaled WR.
361 */
362 static int svc_rdma_post_chunk_ctxt(struct svcxprt_rdma *rdma,
363 struct svc_rdma_chunk_ctxt *cc)
364 {
365 struct ib_send_wr *first_wr;
366 const struct ib_send_wr *bad_wr;
367 struct list_head *tmp;
368 struct ib_cqe *cqe;
369 int ret;
370
371 might_sleep();
372
373 if (cc->cc_sqecount > rdma->sc_sq_depth)
374 return -EINVAL;
375
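/* Build a single WR chain covering every rw_ctx on cc_rwctxts. Only one
 * WR in the chain carries @cqe and is signaled, so one completion
 * accounts for the entire chunk context.
 */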
376 first_wr = NULL;
377 cqe = &cc->cc_cqe;
378 list_for_each(tmp, &cc->cc_rwctxts) {
379 struct svc_rdma_rw_ctxt *ctxt;
380
381 ctxt = list_entry(tmp, struct svc_rdma_rw_ctxt, rw_list);
382 first_wr = rdma_rw_ctx_wrs(&ctxt->rw_ctx, rdma->sc_qp,
383 rdma->sc_port_num, cqe, first_wr);
384 cqe = NULL;
385 }
386
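/* Reserve Send Queue space for the whole chain before posting. If the
 * SQ would be oversubscribed, return the reservation, wait for send
 * completions to release space, and try again.
 */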
387 do {
388 if (atomic_sub_return(cc->cc_sqecount,
389 &rdma->sc_sq_avail) > 0) {
390 cc->cc_posttime = ktime_get();
391 ret = ib_post_send(rdma->sc_qp, first_wr, &bad_wr);
392 if (ret)
393 break;
394 return 0;
395 }
396
397 percpu_counter_inc(&svcrdma_stat_sq_starve);
398 trace_svcrdma_sq_full(rdma, &cc->cc_cid);
399 atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail);
400 wait_event(rdma->sc_send_wait,
401 atomic_read(&rdma->sc_sq_avail) > cc->cc_sqecount);
402 trace_svcrdma_sq_retry(rdma, &cc->cc_cid);
403 } while (1);
404
405 trace_svcrdma_sq_post_err(rdma, &cc->cc_cid, ret);
406 svc_xprt_deferred_close(&rdma->sc_xprt);
407
408 /* If even one was posted, there will be a completion. */
409 if (bad_wr != first_wr)
410 return 0;
411
412 atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail);
413 wake_up(&rdma->sc_send_wait);
414 return -ENOTCONN;
415 }
416
417 /* Build and DMA-map an SGL that covers one kvec in an xdr_buf
418 */
419 static void svc_rdma_vec_to_sg(struct svc_rdma_write_info *info,
420 unsigned int len,
421 struct svc_rdma_rw_ctxt *ctxt)
422 {
423 struct scatterlist *sg = ctxt->rw_sg_table.sgl;
424
425 sg_set_buf(&sg[0], info->wi_base, len);
426 info->wi_base += len;
427
428 ctxt->rw_nents = 1;
429 }
430
431 /* Build and DMA-map an SGL that covers part of an xdr_buf's pagelist.
432 */
433 static void svc_rdma_pagelist_to_sg(struct svc_rdma_write_info *info,
434 unsigned int remaining,
435 struct svc_rdma_rw_ctxt *ctxt)
436 {
437 unsigned int sge_no, sge_bytes, page_off, page_no;
438 const struct xdr_buf *xdr = info->wi_xdr;
439 struct scatterlist *sg;
440 struct page **page;
441
442 page_off = info->wi_next_off + xdr->page_base;
443 page_no = page_off >> PAGE_SHIFT;
444 page_off = offset_in_page(page_off);
445 page = xdr->pages + page_no;
446 info->wi_next_off += remaining;
447 sg = ctxt->rw_sg_table.sgl;
448 sge_no = 0;
449 do {
450 sge_bytes = min_t(unsigned int, remaining,
451 PAGE_SIZE - page_off);
452 sg_set_page(sg, *page, sge_bytes, page_off);
453
454 remaining -= sge_bytes;
455 sg = sg_next(sg);
456 page_off = 0;
457 sge_no++;
458 page++;
459 } while (remaining);
460
461 ctxt->rw_nents = sge_no;
462 }
463
464 /* Construct RDMA Write WRs to send a portion of an xdr_buf containing
465 * an RPC Reply.
466 */
467 static int
468 svc_rdma_build_writes(struct svc_rdma_write_info *info,
469 void (*constructor)(struct svc_rdma_write_info *info,
470 unsigned int len,
471 struct svc_rdma_rw_ctxt *ctxt),
472 unsigned int remaining)
473 {
474 struct svc_rdma_chunk_ctxt *cc = &info->wi_cc;
475 struct svcxprt_rdma *rdma = info->wi_rdma;
476 const struct svc_rdma_segment *seg;
477 struct svc_rdma_rw_ctxt *ctxt;
478 int ret;
479
480 do {
481 unsigned int write_len;
482 u64 offset;
483
484 if (info->wi_seg_no >= info->wi_chunk->ch_segcount)
485 goto out_overflow;
486
487 seg = &info->wi_chunk->ch_segments[info->wi_seg_no];
488 write_len = min(remaining, seg->rs_length - info->wi_seg_off);
489 if (!write_len)
490 goto out_overflow;
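/* Room for one sg entry per full page of payload, plus two more to
 * cover a partially-filled first and last page.
 */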
491 ctxt = svc_rdma_get_rw_ctxt(rdma,
492 (write_len >> PAGE_SHIFT) + 2);
493 if (!ctxt)
494 return -ENOMEM;
495
496 constructor(info, write_len, ctxt);
497 offset = seg->rs_offset + info->wi_seg_off;
498 ret = svc_rdma_rw_ctx_init(rdma, ctxt, offset, seg->rs_handle,
499 DMA_TO_DEVICE);
500 if (ret < 0)
501 return -EIO;
502 percpu_counter_inc(&svcrdma_stat_write);
503
504 list_add(&ctxt->rw_list, &cc->cc_rwctxts);
505 cc->cc_sqecount += ret;
506 if (write_len == seg->rs_length - info->wi_seg_off) {
507 info->wi_seg_no++;
508 info->wi_seg_off = 0;
509 } else {
510 info->wi_seg_off += write_len;
511 }
512 remaining -= write_len;
513 } while (remaining);
514
515 return 0;
516
517 out_overflow:
518 trace_svcrdma_small_wrch_err(&cc->cc_cid, remaining, info->wi_seg_no,
519 info->wi_chunk->ch_segcount);
520 return -E2BIG;
521 }
522
523 /**
524 * svc_rdma_iov_write - Construct RDMA Writes from an iov
525 * @info: pointer to write arguments
526 * @iov: kvec to write
527 *
528 * Returns:
529 * On success, returns zero
530 * %-E2BIG if the client-provided Write chunk is too small
531 * %-ENOMEM if a resource has been exhausted
532 * %-EIO if an rdma-rw error occurred
533 */
534 static int svc_rdma_iov_write(struct svc_rdma_write_info *info,
535 const struct kvec *iov)
536 {
537 info->wi_base = iov->iov_base;
538 return svc_rdma_build_writes(info, svc_rdma_vec_to_sg,
539 iov->iov_len);
540 }
541
542 /**
543 * svc_rdma_pages_write - Construct RDMA Writes from pages
544 * @info: pointer to write arguments
545 * @xdr: xdr_buf with pages to write
546 * @offset: offset into the content of @xdr
547 * @length: number of bytes to write
548 *
549 * Returns:
550 * On success, returns zero
551 * %-E2BIG if the client-provided Write chunk is too small
552 * %-ENOMEM if a resource has been exhausted
553 * %-EIO if an rdma-rw error occurred
554 */
555 static int svc_rdma_pages_write(struct svc_rdma_write_info *info,
556 const struct xdr_buf *xdr,
557 unsigned int offset,
558 unsigned long length)
559 {
560 info->wi_xdr = xdr;
561 info->wi_next_off = offset - xdr->head[0].iov_len;
562 return svc_rdma_build_writes(info, svc_rdma_pagelist_to_sg,
563 length);
564 }
565
566 /**
567 * svc_rdma_xb_write - Construct RDMA Writes to write an xdr_buf
568 * @xdr: xdr_buf to write
569 * @data: pointer to write arguments
570 *
571 * Returns:
572 * On success, returns the number of bytes written (xdr->len)
573 * %-E2BIG if the client-provided Write chunk is too small
574 * %-ENOMEM if a resource has been exhausted
575 * %-EIO if an rdma-rw error occurred
576 */
577 static int svc_rdma_xb_write(const struct xdr_buf *xdr, void *data)
578 {
579 struct svc_rdma_write_info *info = data;
580 int ret;
581
582 if (xdr->head[0].iov_len) {
583 ret = svc_rdma_iov_write(info, &xdr->head[0]);
584 if (ret < 0)
585 return ret;
586 }
587
588 if (xdr->page_len) {
589 ret = svc_rdma_pages_write(info, xdr, xdr->head[0].iov_len,
590 xdr->page_len);
591 if (ret < 0)
592 return ret;
593 }
594
595 if (xdr->tail[0].iov_len) {
596 ret = svc_rdma_iov_write(info, &xdr->tail[0]);
597 if (ret < 0)
598 return ret;
599 }
600
601 return xdr->len;
602 }
603
604 static int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma,
605 const struct svc_rdma_chunk *chunk,
606 const struct xdr_buf *xdr)
607 {
608 struct svc_rdma_write_info *info;
609 struct svc_rdma_chunk_ctxt *cc;
610 struct xdr_buf payload;
611 int ret;
612
613 if (xdr_buf_subsegment(xdr, &payload, chunk->ch_position,
614 chunk->ch_payload_length))
615 return -EMSGSIZE;
616
617 info = svc_rdma_write_info_alloc(rdma, chunk);
618 if (!info)
619 return -ENOMEM;
620 cc = &info->wi_cc;
621
622 ret = svc_rdma_xb_write(&payload, info);
623 if (ret != payload.len)
624 goto out_err;
625
626 trace_svcrdma_post_write_chunk(&cc->cc_cid, cc->cc_sqecount);
627 ret = svc_rdma_post_chunk_ctxt(rdma, cc);
628 if (ret < 0)
629 goto out_err;
630 return 0;
631
632 out_err:
633 svc_rdma_write_info_free(info);
634 return ret;
635 }
636
637 /**
638 * svc_rdma_send_write_list - Send all chunks on the Write list
639 * @rdma: controlling RDMA transport
640 * @rctxt: Write list provisioned by the client
641 * @xdr: xdr_buf containing an RPC Reply message
642 *
643 * Returns zero on success, or a negative errno if one or more
644 * Write chunks could not be sent.
645 */
646 int svc_rdma_send_write_list(struct svcxprt_rdma *rdma,
647 const struct svc_rdma_recv_ctxt *rctxt,
648 const struct xdr_buf *xdr)
649 {
650 struct svc_rdma_chunk *chunk;
651 int ret;
652
653 pcl_for_each_chunk(chunk, &rctxt->rc_write_pcl) {
654 if (!chunk->ch_payload_length)
655 break;
656 ret = svc_rdma_send_write_chunk(rdma, chunk, xdr);
657 if (ret < 0)
658 return ret;
659 }
660 return 0;
661 }
662
663 /**
664 * svc_rdma_prepare_reply_chunk - Construct WR chain for writing the Reply chunk
665 * @rdma: controlling RDMA transport
666 * @write_pcl: Write chunk list provided by client
667 * @reply_pcl: Reply chunk provided by client
668 * @sctxt: Send WR resources
669 * @xdr: xdr_buf containing an RPC Reply
670 *
671 * Returns a non-negative number of bytes the chunk consumed, or
672 * %-E2BIG if the payload was larger than the Reply chunk,
673 * %-EINVAL if client provided too many segments,
674 * %-ENOMEM if rdma_rw context pool was exhausted,
675 * %-ENOTCONN if posting failed (connection is lost),
676 * %-EIO if rdma_rw initialization failed (DMA mapping, etc).
677 */
678 int svc_rdma_prepare_reply_chunk(struct svcxprt_rdma *rdma,
679 const struct svc_rdma_pcl *write_pcl,
680 const struct svc_rdma_pcl *reply_pcl,
681 struct svc_rdma_send_ctxt *sctxt,
682 const struct xdr_buf *xdr)
683 {
684 struct svc_rdma_write_info *info = &sctxt->sc_reply_info;
685 struct svc_rdma_chunk_ctxt *cc = &info->wi_cc;
686 struct ib_send_wr *first_wr;
687 struct list_head *pos;
688 struct ib_cqe *cqe;
689 int ret;
690
691 info->wi_rdma = rdma;
692 info->wi_chunk = pcl_first_chunk(reply_pcl);
693 info->wi_seg_off = 0;
694 info->wi_seg_no = 0;
695 info->wi_cc.cc_cqe.done = svc_rdma_reply_done;
696
697 ret = pcl_process_nonpayloads(write_pcl, xdr,
698 svc_rdma_xb_write, info);
699 if (ret < 0)
700 return ret;
701
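/* Prepend the Reply chunk's RDMA Write WRs to the Send WR chain so the
 * send path can post the Writes and the Send with a single
 * ib_post_send(), and bump sc_sqecount so enough SQ space is reserved
 * for both.
 */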
702 first_wr = sctxt->sc_wr_chain;
703 cqe = &cc->cc_cqe;
704 list_for_each(pos, &cc->cc_rwctxts) {
705 struct svc_rdma_rw_ctxt *rwc;
706
707 rwc = list_entry(pos, struct svc_rdma_rw_ctxt, rw_list);
708 first_wr = rdma_rw_ctx_wrs(&rwc->rw_ctx, rdma->sc_qp,
709 rdma->sc_port_num, cqe, first_wr);
710 cqe = NULL;
711 }
712 sctxt->sc_wr_chain = first_wr;
713 sctxt->sc_sqecount += cc->cc_sqecount;
714
715 trace_svcrdma_post_reply_chunk(&cc->cc_cid, cc->cc_sqecount);
716 return xdr->len;
717 }
718
719 /**
720 * svc_rdma_build_read_segment - Build RDMA Read WQEs to pull one RDMA segment
721 * @rqstp: RPC transaction context
722 * @head: context for ongoing I/O
723 * @segment: co-ordinates of remote memory to be read
724 *
725 * Returns:
726 * %0: the Read WR chain was constructed successfully
727 * %-EINVAL: there were not enough rq_pages to finish
728 * %-ENOMEM: allocating local resources failed
729 * %-EIO: a DMA mapping error occurred
730 */
731 static int svc_rdma_build_read_segment(struct svc_rqst *rqstp,
732 struct svc_rdma_recv_ctxt *head,
733 const struct svc_rdma_segment *segment)
734 {
735 struct svcxprt_rdma *rdma = svc_rdma_rqst_rdma(rqstp);
736 struct svc_rdma_chunk_ctxt *cc = &head->rc_cc;
737 unsigned int sge_no, seg_len, len;
738 struct svc_rdma_rw_ctxt *ctxt;
739 struct scatterlist *sg;
740 int ret;
741
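/* Compute how many sink pages this segment will touch, given the
 * current offset into the first page, and size the sgl to match.
 */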
742 len = segment->rs_length;
743 sge_no = PAGE_ALIGN(head->rc_pageoff + len) >> PAGE_SHIFT;
744 ctxt = svc_rdma_get_rw_ctxt(rdma, sge_no);
745 if (!ctxt)
746 return -ENOMEM;
747 ctxt->rw_nents = sge_no;
748
749 sg = ctxt->rw_sg_table.sgl;
750 for (sge_no = 0; sge_no < ctxt->rw_nents; sge_no++) {
751 seg_len = min_t(unsigned int, len,
752 PAGE_SIZE - head->rc_pageoff);
753
754 if (!head->rc_pageoff)
755 head->rc_page_count++;
756
757 sg_set_page(sg, rqstp->rq_pages[head->rc_curpage],
758 seg_len, head->rc_pageoff);
759 sg = sg_next(sg);
760
761 head->rc_pageoff += seg_len;
762 if (head->rc_pageoff == PAGE_SIZE) {
763 head->rc_curpage++;
764 head->rc_pageoff = 0;
765 }
766 len -= seg_len;
767
768 if (len && ((head->rc_curpage + 1) > ARRAY_SIZE(rqstp->rq_pages)))
769 goto out_overrun;
770 }
771
772 ret = svc_rdma_rw_ctx_init(rdma, ctxt, segment->rs_offset,
773 segment->rs_handle, DMA_FROM_DEVICE);
774 if (ret < 0)
775 return -EIO;
776 percpu_counter_inc(&svcrdma_stat_read);
777
778 list_add(&ctxt->rw_list, &cc->cc_rwctxts);
779 cc->cc_sqecount += ret;
780 return 0;
781
782 out_overrun:
783 trace_svcrdma_page_overrun_err(&cc->cc_cid, head->rc_curpage);
784 return -EINVAL;
785 }
786
787 /**
788 * svc_rdma_build_read_chunk - Build RDMA Read WQEs to pull one RDMA chunk
789 * @rqstp: RPC transaction context
790 * @head: context for ongoing I/O
791 * @chunk: Read chunk to pull
792 *
793 * Return values:
794 * %0: the Read WR chain was constructed successfully
795 * %-EINVAL: there were not enough resources to finish
796 * %-ENOMEM: allocating local resources failed
797 * %-EIO: a DMA mapping error occurred
798 */
799 static int svc_rdma_build_read_chunk(struct svc_rqst *rqstp,
800 struct svc_rdma_recv_ctxt *head,
801 const struct svc_rdma_chunk *chunk)
802 {
803 const struct svc_rdma_segment *segment;
804 int ret;
805
806 ret = -EINVAL;
807 pcl_for_each_segment(segment, chunk) {
808 ret = svc_rdma_build_read_segment(rqstp, head, segment);
809 if (ret < 0)
810 break;
811 head->rc_readbytes += segment->rs_length;
812 }
813 return ret;
814 }
815
816 /**
817 * svc_rdma_copy_inline_range - Copy part of the inline content into pages
818 * @rqstp: RPC transaction context
819 * @head: context for ongoing I/O
820 * @offset: offset into the Receive buffer of region to copy
821 * @remaining: length of region to copy
822 *
823 * Take a page at a time from rqstp->rq_pages and copy the inline
824 * content from the Receive buffer into that page. Update
825 * head->rc_curpage and head->rc_pageoff so that the next RDMA Read
826 * result will land contiguously with the copied content.
827 *
828 * Return values:
829 * %0: Inline content was successfully copied
830 * %-EINVAL: offset or length was incorrect
831 */
832 static int svc_rdma_copy_inline_range(struct svc_rqst *rqstp,
833 struct svc_rdma_recv_ctxt *head,
834 unsigned int offset,
835 unsigned int remaining)
836 {
837 unsigned char *dst, *src = head->rc_recv_buf;
838 unsigned int page_no, numpages;
839
840 numpages = PAGE_ALIGN(head->rc_pageoff + remaining) >> PAGE_SHIFT;
841 for (page_no = 0; page_no < numpages; page_no++) {
842 unsigned int page_len;
843
844 page_len = min_t(unsigned int, remaining,
845 PAGE_SIZE - head->rc_pageoff);
846
847 if (!head->rc_pageoff)
848 head->rc_page_count++;
849
850 dst = page_address(rqstp->rq_pages[head->rc_curpage]);
851 memcpy(dst + head->rc_pageoff, src + offset, page_len);
852
853 head->rc_readbytes += page_len;
854 head->rc_pageoff += page_len;
855 if (head->rc_pageoff == PAGE_SIZE) {
856 head->rc_curpage++;
857 head->rc_pageoff = 0;
858 }
859 remaining -= page_len;
860 offset += page_len;
861 }
862
863 return 0;
864 }
865
866 /**
867 * svc_rdma_read_multiple_chunks - Construct RDMA Reads to pull multiple data item Read chunks
868 * @rqstp: RPC transaction context
869 * @head: context for ongoing I/O
870 *
871 * The chunk data lands in rqstp->rq_arg as a series of contiguous pages,
872 * like an incoming TCP call.
873 *
874 * Return values:
875 * %0: RDMA Read WQEs were successfully built
876 * %-EINVAL: client provided too many chunks or segments,
877 * %-ENOMEM: rdma_rw context pool was exhausted,
878 * %-ENOTCONN: posting failed (connection is lost),
879 * %-EIO: rdma_rw initialization failed (DMA mapping, etc).
880 */
881 static noinline int
882 svc_rdma_read_multiple_chunks(struct svc_rqst *rqstp,
883 struct svc_rdma_recv_ctxt *head)
884 {
885 const struct svc_rdma_pcl *pcl = &head->rc_read_pcl;
886 struct svc_rdma_chunk *chunk, *next;
887 unsigned int start, length;
888 int ret;
889
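/* Alternate between copying the inline content that precedes each chunk
 * (based on its XDR Position) and building Reads for the chunk itself,
 * so the reassembled message is contiguous in rq_pages.
 */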
890 start = 0;
891 chunk = pcl_first_chunk(pcl);
892 length = chunk->ch_position;
893 ret = svc_rdma_copy_inline_range(rqstp, head, start, length);
894 if (ret < 0)
895 return ret;
896
897 pcl_for_each_chunk(chunk, pcl) {
898 ret = svc_rdma_build_read_chunk(rqstp, head, chunk);
899 if (ret < 0)
900 return ret;
901
902 next = pcl_next_chunk(pcl, chunk);
903 if (!next)
904 break;
905
906 start += length;
907 length = next->ch_position - head->rc_readbytes;
908 ret = svc_rdma_copy_inline_range(rqstp, head, start, length);
909 if (ret < 0)
910 return ret;
911 }
912
913 start += length;
914 length = head->rc_byte_len - start;
915 return svc_rdma_copy_inline_range(rqstp, head, start, length);
916 }
917
918 /**
919 * svc_rdma_read_data_item - Construct RDMA Reads to pull data item Read chunks
920 * @rqstp: RPC transaction context
921 * @head: context for ongoing I/O
922 *
923 * The chunk data lands in the page list of rqstp->rq_arg.pages.
924 *
925 * Currently NFSD does not look at the rqstp->rq_arg.tail[0] kvec.
926 * Therefore, XDR round-up of the Read chunk and trailing
927 * inline content must both be added at the end of the pagelist.
928 *
929 * Return values:
930 * %0: RDMA Read WQEs were successfully built
931 * %-EINVAL: client provided too many chunks or segments,
932 * %-ENOMEM: rdma_rw context pool was exhausted,
933 * %-ENOTCONN: posting failed (connection is lost),
934 * %-EIO: rdma_rw initialization failed (DMA mapping, etc).
935 */
936 static int svc_rdma_read_data_item(struct svc_rqst *rqstp,
937 struct svc_rdma_recv_ctxt *head)
938 {
939 return svc_rdma_build_read_chunk(rqstp, head,
940 pcl_first_chunk(&head->rc_read_pcl));
941 }
942
943 /**
944 * svc_rdma_read_chunk_range - Build RDMA Read WRs for portion of a chunk
945 * @rqstp: RPC transaction context
946 * @head: context for ongoing I/O
947 * @chunk: parsed Call chunk to pull
948 * @offset: offset of region to pull
949 * @length: length of region to pull
950 *
951 * Return values:
952 * %0: RDMA Read WQEs were successfully built
953 * %-EINVAL: there were not enough resources to finish
954 * %-ENOMEM: rdma_rw context pool was exhausted,
955 * %-ENOTCONN: posting failed (connection is lost),
956 * %-EIO: rdma_rw initialization failed (DMA mapping, etc).
957 */
958 static int svc_rdma_read_chunk_range(struct svc_rqst *rqstp,
959 struct svc_rdma_recv_ctxt *head,
960 const struct svc_rdma_chunk *chunk,
961 unsigned int offset, unsigned int length)
962 {
963 const struct svc_rdma_segment *segment;
964 int ret;
965
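/* Walk the chunk's segments, skipping whole segments that precede
 * @offset, and build a temporary segment describing only the requested
 * byte range within each remaining segment.
 */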
966 ret = -EINVAL;
967 pcl_for_each_segment(segment, chunk) {
968 struct svc_rdma_segment dummy;
969
970 if (offset > segment->rs_length) {
971 offset -= segment->rs_length;
972 continue;
973 }
974
975 dummy.rs_handle = segment->rs_handle;
976 dummy.rs_length = min_t(u32, length, segment->rs_length) - offset;
977 dummy.rs_offset = segment->rs_offset + offset;
978
979 ret = svc_rdma_build_read_segment(rqstp, head, &dummy);
980 if (ret < 0)
981 break;
982
983 head->rc_readbytes += dummy.rs_length;
984 length -= dummy.rs_length;
985 offset = 0;
986 }
987 return ret;
988 }
989
990 /**
991 * svc_rdma_read_call_chunk - Build RDMA Read WQEs to pull a Long Message
992 * @rqstp: RPC transaction context
993 * @head: context for ongoing I/O
994 *
995 * Return values:
996 * %0: RDMA Read WQEs were successfully built
997 * %-EINVAL: there were not enough resources to finish
998 * %-ENOMEM: rdma_rw context pool was exhausted,
999 * %-ENOTCONN: posting failed (connection is lost),
1000 * %-EIO: rdma_rw initialization failed (DMA mapping, etc).
1001 */
1002 static int svc_rdma_read_call_chunk(struct svc_rqst *rqstp,
1003 struct svc_rdma_recv_ctxt *head)
1004 {
1005 const struct svc_rdma_chunk *call_chunk =
1006 pcl_first_chunk(&head->rc_call_pcl);
1007 const struct svc_rdma_pcl *pcl = &head->rc_read_pcl;
1008 struct svc_rdma_chunk *chunk, *next;
1009 unsigned int start, length;
1010 int ret;
1011
1012 if (pcl_is_empty(pcl))
1013 return svc_rdma_build_read_chunk(rqstp, head, call_chunk);
1014
1015 start = 0;
1016 chunk = pcl_first_chunk(pcl);
1017 length = chunk->ch_position;
1018 ret = svc_rdma_read_chunk_range(rqstp, head, call_chunk,
1019 start, length);
1020 if (ret < 0)
1021 return ret;
1022
1023 pcl_for_each_chunk(chunk, pcl) {
1024 ret = svc_rdma_build_read_chunk(rqstp, head, chunk);
1025 if (ret < 0)
1026 return ret;
1027
1028 next = pcl_next_chunk(pcl, chunk);
1029 if (!next)
1030 break;
1031
1032 start += length;
1033 length = next->ch_position - head->rc_readbytes;
1034 ret = svc_rdma_read_chunk_range(rqstp, head, call_chunk,
1035 start, length);
1036 if (ret < 0)
1037 return ret;
1038 }
1039
1040 start += length;
1041 length = call_chunk->ch_length - start;
1042 return svc_rdma_read_chunk_range(rqstp, head, call_chunk,
1043 start, length);
1044 }
1045
1046 /**
1047 * svc_rdma_read_special - Build RDMA Read WQEs to pull a Long Message
1048 * @rqstp: RPC transaction context
1049 * @head: context for ongoing I/O
1050 *
1051 * The start of the data lands in the first page just after the
1052 * Transport header, and the rest lands in rqstp->rq_arg.pages.
1053 *
1054 * Assumptions:
1055 * - A Position Zero Read Chunk (PZRC) is never sent in an RDMA_MSG message, though it's
1056 * allowed by spec.
1057 *
1058 * Return values:
1059 * %0: RDMA Read WQEs were successfully built
1060 * %-EINVAL: client provided too many chunks or segments,
1061 * %-ENOMEM: rdma_rw context pool was exhausted,
1062 * %-ENOTCONN: posting failed (connection is lost),
1063 * %-EIO: rdma_rw initialization failed (DMA mapping, etc).
1064 */
1065 static noinline int svc_rdma_read_special(struct svc_rqst *rqstp,
1066 struct svc_rdma_recv_ctxt *head)
1067 {
1068 return svc_rdma_read_call_chunk(rqstp, head);
1069 }
1070
1071 /* Pages under I/O have been copied to head->rc_pages. Ensure that
1072 * svc_xprt_release() does not put them when svc_rdma_recvfrom()
1073 * returns. This has to be done after all Read WRs are constructed
1074 * to properly handle a page that happens to be part of I/O on behalf
1075 * of two different RDMA segments.
1076 *
1077 * Note: if the subsequent post_send fails, these pages have already
1078 * been moved to head->rc_pages and thus will be cleaned up by
1079 * svc_rdma_recv_ctxt_put().
1080 */
1081 static void svc_rdma_clear_rqst_pages(struct svc_rqst *rqstp,
1082 struct svc_rdma_recv_ctxt *head)
1083 {
1084 unsigned int i;
1085
1086 for (i = 0; i < head->rc_page_count; i++) {
1087 head->rc_pages[i] = rqstp->rq_pages[i];
1088 rqstp->rq_pages[i] = NULL;
1089 }
1090 }
1091
1092 /**
1093 * svc_rdma_process_read_list - Pull list of Read chunks from the client
1094 * @rdma: controlling RDMA transport
1095 * @rqstp: set of pages to use as Read sink buffers
1096 * @head: pages under I/O collect here
1097 *
1098 * The RPC/RDMA protocol assumes that the upper layer's XDR decoders
1099 * pull each Read chunk as they decode an incoming RPC message.
1100 *
1101 * On Linux, however, the server needs to have a fully-constructed RPC
1102 * message in rqstp->rq_arg when there is a positive return code from
1103 * ->xpo_recvfrom. So the Read list is safety-checked immediately when
1104 * it is received, then here the whole Read list is pulled all at once.
1105 * The ingress RPC message is fully reconstructed once all associated
1106 * RDMA Reads have completed.
1107 *
1108 * Return values:
1109 * %1: all needed RDMA Reads were posted successfully,
1110 * %-EINVAL: client provided too many chunks or segments,
1111 * %-ENOMEM: rdma_rw context pool was exhausted,
1112 * %-ENOTCONN: posting failed (connection is lost),
1113 * %-EIO: rdma_rw initialization failed (DMA mapping, etc).
1114 */
1115 int svc_rdma_process_read_list(struct svcxprt_rdma *rdma,
1116 struct svc_rqst *rqstp,
1117 struct svc_rdma_recv_ctxt *head)
1118 {
1119 struct svc_rdma_chunk_ctxt *cc = &head->rc_cc;
1120 int ret;
1121
1122 cc->cc_cqe.done = svc_rdma_wc_read_done;
1123 cc->cc_sqecount = 0;
1124 head->rc_pageoff = 0;
1125 head->rc_curpage = 0;
1126 head->rc_readbytes = 0;
1127
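/* Three cases: a single data item Read chunk, multiple Read chunks
 * interleaved with inline content, or a Read list whose Position Zero
 * chunk conveys the entire RPC Call message.
 */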
1128 if (pcl_is_empty(&head->rc_call_pcl)) {
1129 if (head->rc_read_pcl.cl_count == 1)
1130 ret = svc_rdma_read_data_item(rqstp, head);
1131 else
1132 ret = svc_rdma_read_multiple_chunks(rqstp, head);
1133 } else
1134 ret = svc_rdma_read_special(rqstp, head);
1135 svc_rdma_clear_rqst_pages(rqstp, head);
1136 if (ret < 0)
1137 return ret;
1138
1139 trace_svcrdma_post_read_chunk(&cc->cc_cid, cc->cc_sqecount);
1140 ret = svc_rdma_post_chunk_ctxt(rdma, cc);
1141 return ret < 0 ? ret : 1;
1142 }
1143