xref: /linux/drivers/infiniband/core/rw.c (revision 7df48e36313029e4c0907b2023905dd7213fd678)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (c) 2016 HGST, a Western Digital Company.
4  */
5 #include <linux/memremap.h>
6 #include <linux/moduleparam.h>
7 #include <linux/slab.h>
8 #include <linux/pci-p2pdma.h>
9 #include <rdma/mr_pool.h>
10 #include <rdma/rw.h>
11 
/* How a rdma_rw_ctx expresses the transfer as work requests. */
enum {
	RDMA_RW_SINGLE_WR,	/* one RDMA WR with a single SGE */
	RDMA_RW_MULTI_WR,	/* chain of RDMA WRs, multiple SGEs each */
	RDMA_RW_MR,		/* REG_MR WR(s) followed by RDMA WR(s) */
	RDMA_RW_SIG_MR,		/* integrity/signature MR + one RDMA WR */
	RDMA_RW_IOVA,		/* single WR over a dma_iova-mapped range */
};
19 
/* Debug knob: force the MR code path even where plain SGEs would suffice. */
static bool rdma_rw_force_mr;
module_param_named(force_mr, rdma_rw_force_mr, bool, 0);
MODULE_PARM_DESC(force_mr, "Force usage of MRs for RDMA READ/WRITE operations");
23 
24 /*
25  * Report whether memory registration should be used. Memory registration must
26  * be used for iWarp devices because of iWARP-specific limitations. Memory
27  * registration is also enabled if registering memory might yield better
28  * performance than using multiple SGE entries, see rdma_rw_io_needs_mr()
29  */
rdma_rw_can_use_mr(struct ib_device * dev,u32 port_num)30 static inline bool rdma_rw_can_use_mr(struct ib_device *dev, u32 port_num)
31 {
32 	if (rdma_protocol_iwarp(dev, port_num))
33 		return true;
34 	if (dev->attrs.max_sgl_rd)
35 		return true;
36 	if (unlikely(rdma_rw_force_mr))
37 		return true;
38 	return false;
39 }
40 
41 /*
42  * Check if the device will use memory registration for this RW operation.
43  * For RDMA READs we must use MRs on iWarp and can optionally use them as an
44  * optimization otherwise.  Additionally we have a debug option to force usage
45  * of MRs to help testing this code path.
46  */
rdma_rw_io_needs_mr(struct ib_device * dev,u32 port_num,enum dma_data_direction dir,int dma_nents)47 static inline bool rdma_rw_io_needs_mr(struct ib_device *dev, u32 port_num,
48 		enum dma_data_direction dir, int dma_nents)
49 {
50 	if (dir == DMA_FROM_DEVICE) {
51 		if (rdma_protocol_iwarp(dev, port_num))
52 			return true;
53 		if (dev->attrs.max_sgl_rd && dma_nents > dev->attrs.max_sgl_rd)
54 			return true;
55 	}
56 	if (unlikely(rdma_rw_force_mr))
57 		return true;
58 	return false;
59 }
60 
rdma_rw_fr_page_list_len(struct ib_device * dev,bool pi_support)61 static inline u32 rdma_rw_fr_page_list_len(struct ib_device *dev,
62 					   bool pi_support)
63 {
64 	u32 max_pages;
65 
66 	if (pi_support)
67 		max_pages = dev->attrs.max_pi_fast_reg_page_list_len;
68 	else
69 		max_pages = dev->attrs.max_fast_reg_page_list_len;
70 
71 	/* arbitrary limit to avoid allocating gigantic resources */
72 	return min_t(u32, max_pages, 256);
73 }
74 
rdma_rw_inv_key(struct rdma_rw_reg_ctx * reg)75 static inline int rdma_rw_inv_key(struct rdma_rw_reg_ctx *reg)
76 {
77 	int count = 0;
78 
79 	if (reg->mr->need_inval) {
80 		reg->inv_wr.opcode = IB_WR_LOCAL_INV;
81 		reg->inv_wr.ex.invalidate_rkey = reg->mr->lkey;
82 		reg->inv_wr.next = &reg->reg_wr.wr;
83 		count++;
84 	} else {
85 		reg->inv_wr.next = NULL;
86 	}
87 
88 	return count;
89 }
90 
/*
 * Grab one MR from the QP's pool, map up to pages_per_mr scatterlist
 * entries into it, and fill in the REG_MR WR plus the local SGE that will
 * be used for the data transfer.
 *
 * Caller must have zero-initialized *reg.
 *
 * Returns the number of WRs consumed (optional LOCAL_INV + REG_MR),
 * -EAGAIN when the MR pool is exhausted, or -EINVAL when mapping fails.
 */
static int rdma_rw_init_one_mr(struct ib_qp *qp, u32 port_num,
		struct rdma_rw_reg_ctx *reg, struct scatterlist *sg,
		u32 sg_cnt, u32 offset)
{
	u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device,
						    qp->integrity_en);
	u32 nents = min(sg_cnt, pages_per_mr);
	int count = 0, ret;

	reg->mr = ib_mr_pool_get(qp, &qp->rdma_mrs);
	if (!reg->mr)
		return -EAGAIN;

	/* may queue a LOCAL_INV WR ahead of the REG_MR WR */
	count += rdma_rw_inv_key(reg);

	/*
	 * A partial mapping (ret < nents) is as unusable as an outright
	 * failure; the explicit ret < 0 test matters because nents is
	 * unsigned and would otherwise absorb negative errnos.
	 */
	ret = ib_map_mr_sg(reg->mr, sg, nents, &offset, PAGE_SIZE);
	if (ret < 0 || ret < nents) {
		ib_mr_pool_put(qp, &qp->rdma_mrs, reg->mr);
		return -EINVAL;
	}

	reg->reg_wr.wr.opcode = IB_WR_REG_MR;
	reg->reg_wr.mr = reg->mr;
	reg->reg_wr.access = IB_ACCESS_LOCAL_WRITE;
	/* iWARP RDMA READ responses are placed via remote write semantics */
	if (rdma_protocol_iwarp(qp->device, port_num))
		reg->reg_wr.access |= IB_ACCESS_REMOTE_WRITE;
	count++;

	/* local SGE describing the freshly registered range */
	reg->sge.addr = reg->mr->iova;
	reg->sge.length = reg->mr->length;
	return count;
}
124 
/*
 * Fill in the RDMA WR for one registered region and splice it behind the
 * previous region's WR chain.  Returns the number of WRs added (always 1).
 */
static int rdma_rw_init_reg_wr(struct rdma_rw_reg_ctx *reg,
		struct rdma_rw_reg_ctx *prev, struct ib_qp *qp, u32 port_num,
		u64 remote_addr, u32 rkey, enum dma_data_direction dir)
{
	/*
	 * Chain behind the previous op, entering at the LOCAL_INV WR when
	 * this MR still needs invalidation, otherwise at the REG_MR WR.
	 */
	if (prev)
		prev->wr.wr.next = reg->mr->need_inval ? &reg->inv_wr :
							 &reg->reg_wr.wr;

	reg->reg_wr.wr.next = &reg->wr.wr;

	reg->wr.wr.sg_list = &reg->sge;
	reg->wr.wr.num_sge = 1;
	reg->wr.remote_addr = remote_addr;
	reg->wr.rkey = rkey;

	if (dir == DMA_TO_DEVICE) {
		reg->wr.wr.opcode = IB_WR_RDMA_WRITE;
	} else if (rdma_cap_read_inv(qp->device, port_num)) {
		/* let the READ response invalidate our lkey for free */
		reg->wr.wr.opcode = IB_WR_RDMA_READ_WITH_INV;
		reg->wr.wr.ex.invalidate_rkey = reg->mr->lkey;
	} else {
		reg->wr.wr.opcode = IB_WR_RDMA_READ;
	}

	return 1;
}
154 
/*
 * Express the RW operation as a chain of REG_MR + RDMA WRs, one MR per
 * pages_per_mr scatterlist entries.  Returns the total number of WQEs
 * needed, or a negative errno (-EAGAIN when the QP's MR pool runs dry).
 */
static int rdma_rw_init_mr_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
		u32 port_num, struct scatterlist *sg, u32 sg_cnt, u32 offset,
		u64 remote_addr, u32 rkey, enum dma_data_direction dir)
{
	struct rdma_rw_reg_ctx *prev = NULL;
	u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device,
						    qp->integrity_en);
	int i, j, ret = 0, count = 0;

	ctx->nr_ops = DIV_ROUND_UP(sg_cnt, pages_per_mr);
	ctx->reg = kzalloc_objs(*ctx->reg, ctx->nr_ops);
	if (!ctx->reg) {
		ret = -ENOMEM;
		goto out;
	}

	for (i = 0; i < ctx->nr_ops; i++) {
		struct rdma_rw_reg_ctx *reg = &ctx->reg[i];
		u32 nents = min(sg_cnt, pages_per_mr);

		ret = rdma_rw_init_one_mr(qp, port_num, reg, sg, sg_cnt,
				offset);
		if (ret < 0)
			goto out_free;
		count += ret;
		count += rdma_rw_init_reg_wr(reg, prev, qp, port_num,
				remote_addr, rkey, dir);
		/* reg->sge.length was set from the mapped MR length */
		remote_addr += reg->sge.length;
		sg_cnt -= nents;
		/* advance entry by entry: the list may be chained */
		for (j = 0; j < nents; j++)
			sg = sg_next(sg);
		prev = reg;
		/* only the very first MR starts at a non-zero byte offset */
		offset = 0;
	}

	/* terminate the WR chain built up by rdma_rw_init_reg_wr() */
	if (prev)
		prev->wr.wr.next = NULL;

	ctx->type = RDMA_RW_MR;
	return count;

out_free:
	/* release only the MRs acquired by the iterations that succeeded */
	while (--i >= 0)
		ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->reg[i].mr);
	kfree(ctx->reg);
out:
	return ret;
}
203 
/*
 * MR-based path for bvec input: build a temporary scatterlist from the
 * bvecs (stored in reg[0].sgt so the destroy path can find and unmap it),
 * DMA-map it once, then register one MR per pages_per_mr mapped entries,
 * mirroring rdma_rw_init_mr_wrs().
 *
 * Returns the number of WQEs needed, or a negative errno.
 */
static int rdma_rw_init_mr_wrs_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
		u32 port_num, const struct bio_vec *bvecs, u32 nr_bvec,
		struct bvec_iter *iter, u64 remote_addr, u32 rkey,
		enum dma_data_direction dir)
{
	struct ib_device *dev = qp->pd->device;
	struct rdma_rw_reg_ctx *prev = NULL;
	u32 pages_per_mr = rdma_rw_fr_page_list_len(dev, qp->integrity_en);
	struct scatterlist *sg;
	int i, ret, count = 0;
	u32 nents = 0;

	/*
	 * Sized from nr_bvec, an upper bound: DMA mapping can only merge
	 * entries, so the post-mapping nr_ops computed below never exceeds
	 * this allocation.
	 */
	ctx->reg = kzalloc_objs(*ctx->reg, DIV_ROUND_UP(nr_bvec, pages_per_mr));
	if (!ctx->reg)
		return -ENOMEM;

	/*
	 * Build scatterlist from bvecs using the iterator. This follows
	 * the pattern from __blk_rq_map_sg.
	 */
	ctx->reg[0].sgt.sgl = kmalloc_objs(*ctx->reg[0].sgt.sgl, nr_bvec);
	if (!ctx->reg[0].sgt.sgl) {
		ret = -ENOMEM;
		goto out_free_reg;
	}
	sg_init_table(ctx->reg[0].sgt.sgl, nr_bvec);

	for (sg = ctx->reg[0].sgt.sgl; iter->bi_size; sg = sg_next(sg)) {
		struct bio_vec bv = mp_bvec_iter_bvec(bvecs, *iter);

		/* guard against an iter that describes more than nr_bvec */
		if (nents >= nr_bvec) {
			ret = -EINVAL;
			goto out_free_sgl;
		}
		sg_set_page(sg, bv.bv_page, bv.bv_len, bv.bv_offset);
		bvec_iter_advance(bvecs, iter, bv.bv_len);
		nents++;
	}
	sg_mark_end(sg_last(ctx->reg[0].sgt.sgl, nents));
	ctx->reg[0].sgt.orig_nents = nents;

	/* DMA map the scatterlist */
	ret = ib_dma_map_sgtable_attrs(dev, &ctx->reg[0].sgt, dir, 0);
	if (ret)
		goto out_free_sgl;

	ctx->nr_ops = DIV_ROUND_UP(ctx->reg[0].sgt.nents, pages_per_mr);

	sg = ctx->reg[0].sgt.sgl;
	nents = ctx->reg[0].sgt.nents;
	for (i = 0; i < ctx->nr_ops; i++) {
		struct rdma_rw_reg_ctx *reg = &ctx->reg[i];
		u32 sge_cnt = min(nents, pages_per_mr);

		ret = rdma_rw_init_one_mr(qp, port_num, reg, sg, sge_cnt, 0);
		if (ret < 0)
			goto out_free_mrs;
		count += ret;
		count += rdma_rw_init_reg_wr(reg, prev, qp, port_num,
				remote_addr, rkey, dir);
		remote_addr += reg->sge.length;
		nents -= sge_cnt;
		/* flat kmalloc'd sg array, so plain pointer math is safe */
		sg += sge_cnt;
		prev = reg;
	}

	/* terminate the WR chain built up by rdma_rw_init_reg_wr() */
	if (prev)
		prev->wr.wr.next = NULL;

	ctx->type = RDMA_RW_MR;
	return count;

out_free_mrs:
	while (--i >= 0)
		ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->reg[i].mr);
	ib_dma_unmap_sgtable_attrs(dev, &ctx->reg[0].sgt, dir, 0);
out_free_sgl:
	kfree(ctx->reg[0].sgt.sgl);
out_free_reg:
	kfree(ctx->reg);
	return ret;
}
286 
/*
 * Express the RW operation as a chain of plain RDMA WRs, packing up to
 * max_sge DMA-mapped scatterlist entries into each WR.  Returns the number
 * of WQEs (ctx->nr_ops), or -ENOMEM on allocation failure.
 */
static int rdma_rw_init_map_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
		struct scatterlist *sg, u32 sg_cnt, u32 offset,
		u64 remote_addr, u32 rkey, enum dma_data_direction dir)
{
	u32 max_sge = dir == DMA_TO_DEVICE ? qp->max_write_sge :
		      qp->max_read_sge;
	struct ib_sge *sge;
	u32 total_len = 0, i, j;

	ctx->nr_ops = DIV_ROUND_UP(sg_cnt, max_sge);

	/* one SGE per scatterlist entry, shared across all the WRs */
	ctx->map.sges = sge = kzalloc_objs(*sge, sg_cnt);
	if (!ctx->map.sges)
		goto out;

	ctx->map.wrs = kzalloc_objs(*ctx->map.wrs, ctx->nr_ops);
	if (!ctx->map.wrs)
		goto out_free_sges;

	for (i = 0; i < ctx->nr_ops; i++) {
		struct ib_rdma_wr *rdma_wr = &ctx->map.wrs[i];
		u32 nr_sge = min(sg_cnt, max_sge);

		if (dir == DMA_TO_DEVICE)
			rdma_wr->wr.opcode = IB_WR_RDMA_WRITE;
		else
			rdma_wr->wr.opcode = IB_WR_RDMA_READ;
		rdma_wr->remote_addr = remote_addr + total_len;
		rdma_wr->rkey = rkey;
		rdma_wr->wr.num_sge = nr_sge;
		rdma_wr->wr.sg_list = sge;

		for (j = 0; j < nr_sge; j++, sg = sg_next(sg)) {
			sge->addr = sg_dma_address(sg) + offset;
			sge->length = sg_dma_len(sg) - offset;
			sge->lkey = qp->pd->local_dma_lkey;

			total_len += sge->length;
			sge++;
			sg_cnt--;
			/* only the first entry starts at a byte offset */
			offset = 0;
		}

		/* link the WRs; the final one terminates the chain */
		rdma_wr->wr.next = i + 1 < ctx->nr_ops ?
			&ctx->map.wrs[i + 1].wr : NULL;
	}

	ctx->type = RDMA_RW_MULTI_WR;
	return ctx->nr_ops;

out_free_sges:
	kfree(ctx->map.sges);
out:
	return -ENOMEM;
}
342 
/*
 * Fast path for a transfer that fits a single DMA-mapped entry: one RDMA
 * WR with one SGE.  Always returns 1, the number of WQEs needed.
 */
static int rdma_rw_init_single_wr(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
		struct scatterlist *sg, u32 offset, u64 remote_addr, u32 rkey,
		enum dma_data_direction dir)
{
	struct ib_rdma_wr *wr = &ctx->single.wr;
	struct ib_sge *sge = &ctx->single.sge;

	ctx->nr_ops = 1;

	sge->lkey = qp->pd->local_dma_lkey;
	sge->addr = sg_dma_address(sg) + offset;
	sge->length = sg_dma_len(sg) - offset;

	memset(wr, 0, sizeof(*wr));
	wr->wr.opcode = dir == DMA_TO_DEVICE ? IB_WR_RDMA_WRITE :
					       IB_WR_RDMA_READ;
	wr->wr.sg_list = sge;
	wr->wr.num_sge = 1;
	wr->remote_addr = remote_addr;
	wr->rkey = rkey;

	ctx->type = RDMA_RW_SINGLE_WR;
	return 1;
}
368 
/*
 * Single-bvec fast path: DMA-map the one bvec directly and describe it
 * with one RDMA WR / one SGE.  Returns 1 on success, -ENOMEM when the
 * DMA mapping fails.
 */
static int rdma_rw_init_single_wr_bvec(struct rdma_rw_ctx *ctx,
		struct ib_qp *qp, const struct bio_vec *bvecs,
		struct bvec_iter *iter, u64 remote_addr, u32 rkey,
		enum dma_data_direction dir)
{
	struct ib_device *dev = qp->pd->device;
	struct ib_rdma_wr *wr = &ctx->single.wr;
	struct bio_vec bv = mp_bvec_iter_bvec(bvecs, *iter);
	u64 dma_addr;

	ctx->nr_ops = 1;

	dma_addr = ib_dma_map_bvec(dev, &bv, dir);
	if (ib_dma_mapping_error(dev, dma_addr))
		return -ENOMEM;

	ctx->single.sge.lkey = qp->pd->local_dma_lkey;
	ctx->single.sge.addr = dma_addr;
	ctx->single.sge.length = bv.bv_len;

	memset(wr, 0, sizeof(*wr));
	wr->wr.opcode = dir == DMA_TO_DEVICE ? IB_WR_RDMA_WRITE :
					       IB_WR_RDMA_READ;
	wr->wr.sg_list = &ctx->single.sge;
	wr->wr.num_sge = 1;
	wr->remote_addr = remote_addr;
	wr->rkey = rkey;

	ctx->type = RDMA_RW_SINGLE_WR;
	return 1;
}
402 
/*
 * Multi-WR path for bvec input: DMA-map each bvec individually and pack up
 * to max_sge of them into each RDMA WR.  The SGE array and WR array share
 * one allocation (sges first, WRs at an aligned offset) so the destroy
 * path only has to free ctx->map.sges.
 *
 * Returns the number of WQEs needed, or -ENOMEM on allocation/mapping
 * failure.
 */
static int rdma_rw_init_map_wrs_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
		const struct bio_vec *bvecs, u32 nr_bvec, struct bvec_iter *iter,
		u64 remote_addr, u32 rkey, enum dma_data_direction dir)
{
	struct ib_device *dev = qp->pd->device;
	u32 max_sge = dir == DMA_TO_DEVICE ? qp->max_write_sge :
		      qp->max_read_sge;
	struct ib_sge *sge;
	u32 total_len = 0, i, j;
	u32 mapped_bvecs = 0;
	u32 nr_ops = DIV_ROUND_UP(nr_bvec, max_sge);
	size_t sges_size = array_size(nr_bvec, sizeof(*ctx->map.sges));
	size_t wrs_offset = ALIGN(sges_size, __alignof__(*ctx->map.wrs));
	size_t wrs_size = array_size(nr_ops, sizeof(*ctx->map.wrs));
	void *mem;

	/* reject any arithmetic overflow in the combined layout */
	if (sges_size == SIZE_MAX || wrs_size == SIZE_MAX ||
	    check_add_overflow(wrs_offset, wrs_size, &wrs_size))
		return -ENOMEM;

	mem = kzalloc(wrs_size, GFP_KERNEL);
	if (!mem)
		return -ENOMEM;

	/* ctx->map.sges == mem, so freeing sges releases both arrays */
	ctx->map.sges = sge = mem;
	ctx->map.wrs = mem + wrs_offset;

	for (i = 0; i < nr_ops; i++) {
		struct ib_rdma_wr *rdma_wr = &ctx->map.wrs[i];
		u32 nr_sge = min(nr_bvec - mapped_bvecs, max_sge);

		if (dir == DMA_TO_DEVICE)
			rdma_wr->wr.opcode = IB_WR_RDMA_WRITE;
		else
			rdma_wr->wr.opcode = IB_WR_RDMA_READ;
		rdma_wr->remote_addr = remote_addr + total_len;
		rdma_wr->rkey = rkey;
		rdma_wr->wr.num_sge = nr_sge;
		rdma_wr->wr.sg_list = sge;

		for (j = 0; j < nr_sge; j++) {
			struct bio_vec bv = mp_bvec_iter_bvec(bvecs, *iter);
			u64 dma_addr;

			dma_addr = ib_dma_map_bvec(dev, &bv, dir);
			if (ib_dma_mapping_error(dev, dma_addr))
				goto out_unmap;

			/* count successful mappings for the unwind path */
			mapped_bvecs++;
			sge->addr = dma_addr;
			sge->length = bv.bv_len;
			sge->lkey = qp->pd->local_dma_lkey;

			total_len += bv.bv_len;
			sge++;

			bvec_iter_advance_single(bvecs, iter, bv.bv_len);
		}

		/* link the WRs; the final one terminates the chain */
		rdma_wr->wr.next = i + 1 < nr_ops ?
			&ctx->map.wrs[i + 1].wr : NULL;
	}

	ctx->nr_ops = nr_ops;
	ctx->type = RDMA_RW_MULTI_WR;
	return nr_ops;

out_unmap:
	/* undo only the bvecs that were successfully DMA-mapped */
	for (i = 0; i < mapped_bvecs; i++)
		ib_dma_unmap_bvec(dev, ctx->map.sges[i].addr,
				  ctx->map.sges[i].length, dir);
	kfree(ctx->map.sges);
	return -ENOMEM;
}
477 
478 /*
479  * Try to use the two-step IOVA API to map bvecs into a contiguous DMA range.
480  * This reduces IOTLB sync overhead by doing one sync at the end instead of
481  * one per bvec, and produces a contiguous DMA address range that can be
482  * described by a single SGE.
483  *
484  * Returns the number of WQEs (always 1) on success, -EOPNOTSUPP if IOVA
485  * mapping is not available, or another negative error code on failure.
486  */
rdma_rw_init_iova_wrs_bvec(struct rdma_rw_ctx * ctx,struct ib_qp * qp,const struct bio_vec * bvec,struct bvec_iter * iter,u64 remote_addr,u32 rkey,enum dma_data_direction dir)487 static int rdma_rw_init_iova_wrs_bvec(struct rdma_rw_ctx *ctx,
488 		struct ib_qp *qp, const struct bio_vec *bvec,
489 		struct bvec_iter *iter, u64 remote_addr, u32 rkey,
490 		enum dma_data_direction dir)
491 {
492 	struct ib_device *dev = qp->pd->device;
493 	struct device *dma_dev = dev->dma_device;
494 	size_t total_len = iter->bi_size;
495 	struct bio_vec first_bv;
496 	size_t mapped_len = 0;
497 	int ret;
498 
499 	/* Virtual DMA devices cannot support IOVA allocators */
500 	if (ib_uses_virt_dma(dev))
501 		return -EOPNOTSUPP;
502 
503 	/* Try to allocate contiguous IOVA space */
504 	first_bv = mp_bvec_iter_bvec(bvec, *iter);
505 	if (!dma_iova_try_alloc(dma_dev, &ctx->iova.state,
506 				bvec_phys(&first_bv), total_len))
507 		return -EOPNOTSUPP;
508 
509 	/* Link all bvecs into the IOVA space */
510 	while (iter->bi_size) {
511 		struct bio_vec bv = mp_bvec_iter_bvec(bvec, *iter);
512 
513 		ret = dma_iova_link(dma_dev, &ctx->iova.state, bvec_phys(&bv),
514 				    mapped_len, bv.bv_len, dir, 0);
515 		if (ret)
516 			goto out_destroy;
517 
518 		mapped_len += bv.bv_len;
519 		bvec_iter_advance(bvec, iter, bv.bv_len);
520 	}
521 
522 	/* Sync the IOTLB once for all linked pages */
523 	ret = dma_iova_sync(dma_dev, &ctx->iova.state, 0, mapped_len);
524 	if (ret)
525 		goto out_destroy;
526 
527 	ctx->iova.mapped_len = mapped_len;
528 
529 	/* Single SGE covers the entire contiguous IOVA range */
530 	ctx->iova.sge.addr = ctx->iova.state.addr;
531 	ctx->iova.sge.length = mapped_len;
532 	ctx->iova.sge.lkey = qp->pd->local_dma_lkey;
533 
534 	/* Single WR for the whole transfer */
535 	memset(&ctx->iova.wr, 0, sizeof(ctx->iova.wr));
536 	if (dir == DMA_TO_DEVICE)
537 		ctx->iova.wr.wr.opcode = IB_WR_RDMA_WRITE;
538 	else
539 		ctx->iova.wr.wr.opcode = IB_WR_RDMA_READ;
540 	ctx->iova.wr.wr.num_sge = 1;
541 	ctx->iova.wr.wr.sg_list = &ctx->iova.sge;
542 	ctx->iova.wr.remote_addr = remote_addr;
543 	ctx->iova.wr.rkey = rkey;
544 
545 	ctx->type = RDMA_RW_IOVA;
546 	ctx->nr_ops = 1;
547 	return 1;
548 
549 out_destroy:
550 	/*
551 	 * dma_iova_destroy() expects the actual mapped length, not the
552 	 * total allocation size. It unlinks only the successfully linked
553 	 * range and frees the entire IOVA allocation.
554 	 */
555 	dma_iova_destroy(dma_dev, &ctx->iova.state, mapped_len, dir, 0);
556 	return ret;
557 }
558 
559 /**
560  * rdma_rw_ctx_init - initialize a RDMA READ/WRITE context
561  * @ctx:	context to initialize
562  * @qp:		queue pair to operate on
563  * @port_num:	port num to which the connection is bound
564  * @sg:		scatterlist to READ/WRITE from/to
565  * @sg_cnt:	number of entries in @sg
566  * @sg_offset:	current byte offset into @sg
567  * @remote_addr:remote address to read/write (relative to @rkey)
568  * @rkey:	remote key to operate on
569  * @dir:	%DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ
570  *
571  * Returns the number of WQEs that will be needed on the workqueue if
572  * successful, or a negative error code.
573  */
rdma_rw_ctx_init(struct rdma_rw_ctx * ctx,struct ib_qp * qp,u32 port_num,struct scatterlist * sg,u32 sg_cnt,u32 sg_offset,u64 remote_addr,u32 rkey,enum dma_data_direction dir)574 int rdma_rw_ctx_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u32 port_num,
575 		struct scatterlist *sg, u32 sg_cnt, u32 sg_offset,
576 		u64 remote_addr, u32 rkey, enum dma_data_direction dir)
577 {
578 	struct ib_device *dev = qp->pd->device;
579 	struct sg_table sgt = {
580 		.sgl = sg,
581 		.orig_nents = sg_cnt,
582 	};
583 	int ret;
584 
585 	ret = ib_dma_map_sgtable_attrs(dev, &sgt, dir, 0);
586 	if (ret)
587 		return ret;
588 	sg_cnt = sgt.nents;
589 
590 	/*
591 	 * Skip to the S/G entry that sg_offset falls into:
592 	 */
593 	for (;;) {
594 		u32 len = sg_dma_len(sg);
595 
596 		if (sg_offset < len)
597 			break;
598 
599 		sg = sg_next(sg);
600 		sg_offset -= len;
601 		sg_cnt--;
602 	}
603 
604 	ret = -EIO;
605 	if (WARN_ON_ONCE(sg_cnt == 0))
606 		goto out_unmap_sg;
607 
608 	if (rdma_rw_io_needs_mr(qp->device, port_num, dir, sg_cnt)) {
609 		ret = rdma_rw_init_mr_wrs(ctx, qp, port_num, sg, sg_cnt,
610 				sg_offset, remote_addr, rkey, dir);
611 		/*
612 		 * If MR init succeeded or failed for a reason other
613 		 * than pool exhaustion, that result is final.
614 		 *
615 		 * Pool exhaustion (-EAGAIN) from the max_sgl_rd
616 		 * optimization is recoverable: fall back to
617 		 * direct SGE posting. iWARP and force_mr require
618 		 * MRs unconditionally, so -EAGAIN is terminal.
619 		 */
620 		if (ret != -EAGAIN ||
621 		    rdma_protocol_iwarp(qp->device, port_num) ||
622 		    unlikely(rdma_rw_force_mr))
623 			goto out;
624 	}
625 
626 	if (sg_cnt > 1)
627 		ret = rdma_rw_init_map_wrs(ctx, qp, sg, sg_cnt, sg_offset,
628 				remote_addr, rkey, dir);
629 	else
630 		ret = rdma_rw_init_single_wr(ctx, qp, sg, sg_offset,
631 				remote_addr, rkey, dir);
632 
633 out:
634 	if (ret < 0)
635 		goto out_unmap_sg;
636 	return ret;
637 
638 out_unmap_sg:
639 	ib_dma_unmap_sgtable_attrs(dev, &sgt, dir, 0);
640 	return ret;
641 }
642 EXPORT_SYMBOL(rdma_rw_ctx_init);
643 
644 /**
645  * rdma_rw_ctx_init_bvec - initialize a RDMA READ/WRITE context from bio_vec
646  * @ctx:	context to initialize
647  * @qp:		queue pair to operate on
648  * @port_num:	port num to which the connection is bound
649  * @bvecs:	bio_vec array to READ/WRITE from/to
650  * @nr_bvec:	number of entries in @bvecs
651  * @iter:	bvec iterator describing offset and length
652  * @remote_addr: remote address to read/write (relative to @rkey)
653  * @rkey:	remote key to operate on
654  * @dir:	%DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ
655  *
656  * Maps the bio_vec array directly, avoiding intermediate scatterlist
657  * conversion. Supports MR registration for iWARP devices and force_mr mode.
658  *
659  * Returns the number of WQEs that will be needed on the workqueue if
660  * successful, or a negative error code:
661  *
662  *   * -EINVAL  - @nr_bvec is zero or @iter.bi_size is zero
663  *   * -ENOMEM - DMA mapping or memory allocation failed
664  */
rdma_rw_ctx_init_bvec(struct rdma_rw_ctx * ctx,struct ib_qp * qp,u32 port_num,const struct bio_vec * bvecs,u32 nr_bvec,struct bvec_iter iter,u64 remote_addr,u32 rkey,enum dma_data_direction dir)665 int rdma_rw_ctx_init_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
666 		u32 port_num, const struct bio_vec *bvecs, u32 nr_bvec,
667 		struct bvec_iter iter, u64 remote_addr, u32 rkey,
668 		enum dma_data_direction dir)
669 {
670 	struct ib_device *dev = qp->pd->device;
671 	int ret;
672 
673 	if (nr_bvec == 0 || iter.bi_size == 0)
674 		return -EINVAL;
675 
676 	/*
677 	 * iWARP requires MR registration for all RDMA READs. The force_mr
678 	 * debug option also mandates MR usage.
679 	 */
680 	if (dir == DMA_FROM_DEVICE && rdma_protocol_iwarp(dev, port_num))
681 		return rdma_rw_init_mr_wrs_bvec(ctx, qp, port_num, bvecs,
682 						nr_bvec, &iter, remote_addr,
683 						rkey, dir);
684 	if (unlikely(rdma_rw_force_mr))
685 		return rdma_rw_init_mr_wrs_bvec(ctx, qp, port_num, bvecs,
686 						nr_bvec, &iter, remote_addr,
687 						rkey, dir);
688 
689 	if (nr_bvec == 1)
690 		return rdma_rw_init_single_wr_bvec(ctx, qp, bvecs, &iter,
691 				remote_addr, rkey, dir);
692 
693 	/*
694 	 * Try IOVA-based mapping first for multi-bvec transfers.
695 	 * IOVA coalesces bvecs into a single DMA-contiguous region,
696 	 * reducing the number of WRs needed and avoiding MR overhead.
697 	 */
698 	ret = rdma_rw_init_iova_wrs_bvec(ctx, qp, bvecs, &iter, remote_addr,
699 			rkey, dir);
700 	if (ret != -EOPNOTSUPP)
701 		return ret;
702 
703 	/*
704 	 * IOVA not available; fall back to the map_wrs path, which maps
705 	 * each bvec as a direct SGE. This is always correct: the MR path
706 	 * is a throughput optimization, not a correctness requirement.
707 	 * (iWARP, which does require MRs, is handled by the check above.)
708 	 *
709 	 * The rdma_rw_io_needs_mr() gate is not used here because nr_bvec
710 	 * is a raw page count that overstates DMA entry demand -- the bvec
711 	 * caller has no post-DMA-coalescing segment count, and feeding the
712 	 * inflated count into the MR path exhausts the pool on RDMA READs.
713 	 */
714 	return rdma_rw_init_map_wrs_bvec(ctx, qp, bvecs, nr_bvec, &iter,
715 			remote_addr, rkey, dir);
716 }
717 EXPORT_SYMBOL(rdma_rw_ctx_init_bvec);
718 
719 /**
720  * rdma_rw_ctx_signature_init - initialize a RW context with signature offload
721  * @ctx:	context to initialize
722  * @qp:		queue pair to operate on
723  * @port_num:	port num to which the connection is bound
724  * @sg:		scatterlist to READ/WRITE from/to
725  * @sg_cnt:	number of entries in @sg
726  * @prot_sg:	scatterlist to READ/WRITE protection information from/to
727  * @prot_sg_cnt: number of entries in @prot_sg
728  * @sig_attrs:	signature offloading algorithms
729  * @remote_addr:remote address to read/write (relative to @rkey)
730  * @rkey:	remote key to operate on
731  * @dir:	%DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ
732  *
733  * Returns the number of WQEs that will be needed on the workqueue if
734  * successful, or a negative error code.
735  */
rdma_rw_ctx_signature_init(struct rdma_rw_ctx * ctx,struct ib_qp * qp,u32 port_num,struct scatterlist * sg,u32 sg_cnt,struct scatterlist * prot_sg,u32 prot_sg_cnt,struct ib_sig_attrs * sig_attrs,u64 remote_addr,u32 rkey,enum dma_data_direction dir)736 int rdma_rw_ctx_signature_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
737 		u32 port_num, struct scatterlist *sg, u32 sg_cnt,
738 		struct scatterlist *prot_sg, u32 prot_sg_cnt,
739 		struct ib_sig_attrs *sig_attrs,
740 		u64 remote_addr, u32 rkey, enum dma_data_direction dir)
741 {
742 	struct ib_device *dev = qp->pd->device;
743 	u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device,
744 						    qp->integrity_en);
745 	struct sg_table sgt = {
746 		.sgl = sg,
747 		.orig_nents = sg_cnt,
748 	};
749 	struct sg_table prot_sgt = {
750 		.sgl = prot_sg,
751 		.orig_nents = prot_sg_cnt,
752 	};
753 	struct ib_rdma_wr *rdma_wr;
754 	int count = 0, ret;
755 
756 	if (sg_cnt > pages_per_mr || prot_sg_cnt > pages_per_mr) {
757 		pr_err("SG count too large: sg_cnt=%u, prot_sg_cnt=%u, pages_per_mr=%u\n",
758 		       sg_cnt, prot_sg_cnt, pages_per_mr);
759 		return -EINVAL;
760 	}
761 
762 	ret = ib_dma_map_sgtable_attrs(dev, &sgt, dir, 0);
763 	if (ret)
764 		return ret;
765 
766 	if (prot_sg_cnt) {
767 		ret = ib_dma_map_sgtable_attrs(dev, &prot_sgt, dir, 0);
768 		if (ret)
769 			goto out_unmap_sg;
770 	}
771 
772 	ctx->type = RDMA_RW_SIG_MR;
773 	ctx->nr_ops = 1;
774 	ctx->reg = kzalloc_obj(*ctx->reg);
775 	if (!ctx->reg) {
776 		ret = -ENOMEM;
777 		goto out_unmap_prot_sg;
778 	}
779 
780 	ctx->reg->mr = ib_mr_pool_get(qp, &qp->sig_mrs);
781 	if (!ctx->reg->mr) {
782 		ret = -EAGAIN;
783 		goto out_free_ctx;
784 	}
785 
786 	count += rdma_rw_inv_key(ctx->reg);
787 
788 	memcpy(ctx->reg->mr->sig_attrs, sig_attrs, sizeof(struct ib_sig_attrs));
789 
790 	ret = ib_map_mr_sg_pi(ctx->reg->mr, sg, sgt.nents, NULL, prot_sg,
791 			      prot_sgt.nents, NULL, SZ_4K);
792 	if (unlikely(ret)) {
793 		pr_err("failed to map PI sg (%u)\n",
794 		       sgt.nents + prot_sgt.nents);
795 		goto out_destroy_sig_mr;
796 	}
797 
798 	ctx->reg->reg_wr.wr.opcode = IB_WR_REG_MR_INTEGRITY;
799 	ctx->reg->reg_wr.wr.wr_cqe = NULL;
800 	ctx->reg->reg_wr.wr.num_sge = 0;
801 	ctx->reg->reg_wr.wr.send_flags = 0;
802 	ctx->reg->reg_wr.access = IB_ACCESS_LOCAL_WRITE;
803 	if (rdma_protocol_iwarp(qp->device, port_num))
804 		ctx->reg->reg_wr.access |= IB_ACCESS_REMOTE_WRITE;
805 	ctx->reg->reg_wr.mr = ctx->reg->mr;
806 	ctx->reg->reg_wr.key = ctx->reg->mr->lkey;
807 	count++;
808 
809 	ctx->reg->sge.addr = ctx->reg->mr->iova;
810 	ctx->reg->sge.length = ctx->reg->mr->length;
811 	if (sig_attrs->wire.sig_type == IB_SIG_TYPE_NONE)
812 		ctx->reg->sge.length -= ctx->reg->mr->sig_attrs->meta_length;
813 
814 	rdma_wr = &ctx->reg->wr;
815 	rdma_wr->wr.sg_list = &ctx->reg->sge;
816 	rdma_wr->wr.num_sge = 1;
817 	rdma_wr->remote_addr = remote_addr;
818 	rdma_wr->rkey = rkey;
819 	if (dir == DMA_TO_DEVICE)
820 		rdma_wr->wr.opcode = IB_WR_RDMA_WRITE;
821 	else
822 		rdma_wr->wr.opcode = IB_WR_RDMA_READ;
823 	ctx->reg->reg_wr.wr.next = &rdma_wr->wr;
824 	count++;
825 
826 	return count;
827 
828 out_destroy_sig_mr:
829 	ib_mr_pool_put(qp, &qp->sig_mrs, ctx->reg->mr);
830 out_free_ctx:
831 	kfree(ctx->reg);
832 out_unmap_prot_sg:
833 	if (prot_sgt.nents)
834 		ib_dma_unmap_sgtable_attrs(dev, &prot_sgt, dir, 0);
835 out_unmap_sg:
836 	ib_dma_unmap_sgtable_attrs(dev, &sgt, dir, 0);
837 	return ret;
838 }
839 EXPORT_SYMBOL(rdma_rw_ctx_signature_init);
840 
841 /*
842  * Now that we are going to post the WRs we can update the lkey and need_inval
843  * state on the MRs.  If we were doing this at init time, we would get double
844  * or missing invalidations if a context was initialized but not actually
845  * posted.
846  */
rdma_rw_update_lkey(struct rdma_rw_reg_ctx * reg,bool need_inval)847 static void rdma_rw_update_lkey(struct rdma_rw_reg_ctx *reg, bool need_inval)
848 {
849 	reg->mr->need_inval = need_inval;
850 	ib_update_fast_reg_key(reg->mr, ib_inc_rkey(reg->mr->lkey));
851 	reg->reg_wr.key = reg->mr->lkey;
852 	reg->sge.lkey = reg->mr->lkey;
853 }
854 
855 /**
856  * rdma_rw_ctx_wrs - return chain of WRs for a RDMA READ or WRITE operation
857  * @ctx:	context to operate on
858  * @qp:		queue pair to operate on
859  * @port_num:	port num to which the connection is bound
860  * @cqe:	completion queue entry for the last WR
861  * @chain_wr:	WR to append to the posted chain
862  *
863  * Return the WR chain for the set of RDMA READ/WRITE operations described by
864  * @ctx, as well as any memory registration operations needed.  If @chain_wr
865  * is non-NULL the WR it points to will be appended to the chain of WRs posted.
866  * If @chain_wr is not set @cqe must be set so that the caller gets a
867  * completion notification.
868  */
rdma_rw_ctx_wrs(struct rdma_rw_ctx * ctx,struct ib_qp * qp,u32 port_num,struct ib_cqe * cqe,struct ib_send_wr * chain_wr)869 struct ib_send_wr *rdma_rw_ctx_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
870 		u32 port_num, struct ib_cqe *cqe, struct ib_send_wr *chain_wr)
871 {
872 	struct ib_send_wr *first_wr, *last_wr;
873 	int i;
874 
875 	switch (ctx->type) {
876 	case RDMA_RW_SIG_MR:
877 	case RDMA_RW_MR:
878 		for (i = 0; i < ctx->nr_ops; i++) {
879 			rdma_rw_update_lkey(&ctx->reg[i],
880 				ctx->reg[i].wr.wr.opcode !=
881 					IB_WR_RDMA_READ_WITH_INV);
882 		}
883 
884 		if (ctx->reg[0].inv_wr.next)
885 			first_wr = &ctx->reg[0].inv_wr;
886 		else
887 			first_wr = &ctx->reg[0].reg_wr.wr;
888 		last_wr = &ctx->reg[ctx->nr_ops - 1].wr.wr;
889 		break;
890 	case RDMA_RW_IOVA:
891 		first_wr = &ctx->iova.wr.wr;
892 		last_wr = &ctx->iova.wr.wr;
893 		break;
894 	case RDMA_RW_MULTI_WR:
895 		first_wr = &ctx->map.wrs[0].wr;
896 		last_wr = &ctx->map.wrs[ctx->nr_ops - 1].wr;
897 		break;
898 	case RDMA_RW_SINGLE_WR:
899 		first_wr = &ctx->single.wr.wr;
900 		last_wr = &ctx->single.wr.wr;
901 		break;
902 	default:
903 		BUG();
904 	}
905 
906 	if (chain_wr) {
907 		last_wr->next = chain_wr;
908 	} else {
909 		last_wr->wr_cqe = cqe;
910 		last_wr->send_flags |= IB_SEND_SIGNALED;
911 	}
912 
913 	return first_wr;
914 }
915 EXPORT_SYMBOL(rdma_rw_ctx_wrs);
916 
917 /**
918  * rdma_rw_ctx_post - post a RDMA READ or RDMA WRITE operation
919  * @ctx:	context to operate on
920  * @qp:		queue pair to operate on
921  * @port_num:	port num to which the connection is bound
922  * @cqe:	completion queue entry for the last WR
923  * @chain_wr:	WR to append to the posted chain
924  *
925  * Post the set of RDMA READ/WRITE operations described by @ctx, as well as
926  * any memory registration operations needed.  If @chain_wr is non-NULL the
927  * WR it points to will be appended to the chain of WRs posted.  If @chain_wr
928  * is not set @cqe must be set so that the caller gets a completion
929  * notification.
930  */
rdma_rw_ctx_post(struct rdma_rw_ctx * ctx,struct ib_qp * qp,u32 port_num,struct ib_cqe * cqe,struct ib_send_wr * chain_wr)931 int rdma_rw_ctx_post(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u32 port_num,
932 		struct ib_cqe *cqe, struct ib_send_wr *chain_wr)
933 {
934 	struct ib_send_wr *first_wr;
935 
936 	first_wr = rdma_rw_ctx_wrs(ctx, qp, port_num, cqe, chain_wr);
937 	return ib_post_send(qp, first_wr, NULL);
938 }
939 EXPORT_SYMBOL(rdma_rw_ctx_post);
940 
941 /**
942  * rdma_rw_ctx_destroy - release all resources allocated by rdma_rw_ctx_init
943  * @ctx:	context to release
944  * @qp:		queue pair to operate on
945  * @port_num:	port num to which the connection is bound
946  * @sg:		scatterlist that was used for the READ/WRITE
947  * @sg_cnt:	number of entries in @sg
948  * @dir:	%DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ
949  */
rdma_rw_ctx_destroy(struct rdma_rw_ctx * ctx,struct ib_qp * qp,u32 port_num,struct scatterlist * sg,u32 sg_cnt,enum dma_data_direction dir)950 void rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
951 			 u32 port_num, struct scatterlist *sg, u32 sg_cnt,
952 			 enum dma_data_direction dir)
953 {
954 	int i;
955 
956 	switch (ctx->type) {
957 	case RDMA_RW_MR:
958 		/* Bvec MR contexts must use rdma_rw_ctx_destroy_bvec() */
959 		WARN_ON_ONCE(ctx->reg[0].sgt.sgl);
960 		for (i = 0; i < ctx->nr_ops; i++)
961 			ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->reg[i].mr);
962 		kfree(ctx->reg);
963 		break;
964 	case RDMA_RW_MULTI_WR:
965 		kfree(ctx->map.wrs);
966 		kfree(ctx->map.sges);
967 		break;
968 	case RDMA_RW_SINGLE_WR:
969 		break;
970 	case RDMA_RW_IOVA:
971 		/* IOVA contexts must use rdma_rw_ctx_destroy_bvec() */
972 		WARN_ON_ONCE(1);
973 		return;
974 	default:
975 		BUG();
976 		break;
977 	}
978 
979 	ib_dma_unmap_sg(qp->pd->device, sg, sg_cnt, dir);
980 }
981 EXPORT_SYMBOL(rdma_rw_ctx_destroy);
982 
983 /**
984  * rdma_rw_ctx_destroy_bvec - release resources from rdma_rw_ctx_init_bvec
985  * @ctx:	context to release
986  * @qp:		queue pair to operate on
987  * @port_num:	port num to which the connection is bound (unused)
988  * @bvecs:	bio_vec array that was used for the READ/WRITE (unused)
989  * @nr_bvec:	number of entries in @bvecs
990  * @dir:	%DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ
991  *
992  * Releases all resources allocated by a successful rdma_rw_ctx_init_bvec()
993  * call. Must not be called if rdma_rw_ctx_init_bvec() returned an error.
994  *
995  * The @port_num and @bvecs parameters are unused but present for API
996  * symmetry with rdma_rw_ctx_destroy().
997  */
rdma_rw_ctx_destroy_bvec(struct rdma_rw_ctx * ctx,struct ib_qp * qp,u32 __maybe_unused port_num,const struct bio_vec __maybe_unused * bvecs,u32 nr_bvec,enum dma_data_direction dir)998 void rdma_rw_ctx_destroy_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
999 		u32 __maybe_unused port_num,
1000 		const struct bio_vec __maybe_unused *bvecs,
1001 		u32 nr_bvec, enum dma_data_direction dir)
1002 {
1003 	struct ib_device *dev = qp->pd->device;
1004 	u32 i;
1005 
1006 	switch (ctx->type) {
1007 	case RDMA_RW_MR:
1008 		for (i = 0; i < ctx->nr_ops; i++)
1009 			ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->reg[i].mr);
1010 		ib_dma_unmap_sgtable_attrs(dev, &ctx->reg[0].sgt, dir, 0);
1011 		kfree(ctx->reg[0].sgt.sgl);
1012 		kfree(ctx->reg);
1013 		break;
1014 	case RDMA_RW_IOVA:
1015 		dma_iova_destroy(dev->dma_device, &ctx->iova.state,
1016 				 ctx->iova.mapped_len, dir, 0);
1017 		break;
1018 	case RDMA_RW_MULTI_WR:
1019 		for (i = 0; i < nr_bvec; i++)
1020 			ib_dma_unmap_bvec(dev, ctx->map.sges[i].addr,
1021 					  ctx->map.sges[i].length, dir);
1022 		kfree(ctx->map.sges);
1023 		break;
1024 	case RDMA_RW_SINGLE_WR:
1025 		ib_dma_unmap_bvec(dev, ctx->single.sge.addr,
1026 				  ctx->single.sge.length, dir);
1027 		break;
1028 	default:
1029 		WARN_ON_ONCE(1);
1030 		return;
1031 	}
1032 }
1033 EXPORT_SYMBOL(rdma_rw_ctx_destroy_bvec);
1034 
1035 /**
1036  * rdma_rw_ctx_destroy_signature - release all resources allocated by
1037  *	rdma_rw_ctx_signature_init
1038  * @ctx:	context to release
1039  * @qp:		queue pair to operate on
1040  * @port_num:	port num to which the connection is bound
1041  * @sg:		scatterlist that was used for the READ/WRITE
1042  * @sg_cnt:	number of entries in @sg
1043  * @prot_sg:	scatterlist that was used for the READ/WRITE of the PI
1044  * @prot_sg_cnt: number of entries in @prot_sg
1045  * @dir:	%DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ
1046  */
rdma_rw_ctx_destroy_signature(struct rdma_rw_ctx * ctx,struct ib_qp * qp,u32 port_num,struct scatterlist * sg,u32 sg_cnt,struct scatterlist * prot_sg,u32 prot_sg_cnt,enum dma_data_direction dir)1047 void rdma_rw_ctx_destroy_signature(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
1048 		u32 port_num, struct scatterlist *sg, u32 sg_cnt,
1049 		struct scatterlist *prot_sg, u32 prot_sg_cnt,
1050 		enum dma_data_direction dir)
1051 {
1052 	if (WARN_ON_ONCE(ctx->type != RDMA_RW_SIG_MR))
1053 		return;
1054 
1055 	ib_mr_pool_put(qp, &qp->sig_mrs, ctx->reg->mr);
1056 	kfree(ctx->reg);
1057 
1058 	if (prot_sg_cnt)
1059 		ib_dma_unmap_sg(qp->pd->device, prot_sg, prot_sg_cnt, dir);
1060 	ib_dma_unmap_sg(qp->pd->device, sg, sg_cnt, dir);
1061 }
1062 EXPORT_SYMBOL(rdma_rw_ctx_destroy_signature);
1063 
1064 /**
1065  * rdma_rw_mr_factor - return number of MRs required for a payload
1066  * @device:	device handling the connection
1067  * @port_num:	port num to which the connection is bound
1068  * @maxpages:	maximum payload pages per rdma_rw_ctx
1069  *
1070  * Returns the number of MRs the device requires to move @maxpayload
1071  * bytes. The returned value is used during transport creation to
1072  * compute max_rdma_ctxts and the size of the transport's Send and
1073  * Send Completion Queues.
1074  */
rdma_rw_mr_factor(struct ib_device * device,u32 port_num,unsigned int maxpages)1075 unsigned int rdma_rw_mr_factor(struct ib_device *device, u32 port_num,
1076 			       unsigned int maxpages)
1077 {
1078 	unsigned int mr_pages;
1079 
1080 	if (rdma_rw_can_use_mr(device, port_num))
1081 		mr_pages = rdma_rw_fr_page_list_len(device, false);
1082 	else
1083 		mr_pages = device->attrs.max_sge_rd;
1084 	return DIV_ROUND_UP(maxpages, mr_pages);
1085 }
1086 EXPORT_SYMBOL(rdma_rw_mr_factor);
1087 
1088 /**
1089  * rdma_rw_max_send_wr - compute max Send WRs needed for RDMA R/W contexts
1090  * @dev: RDMA device
1091  * @port_num: port number
1092  * @max_rdma_ctxs: number of rdma_rw_ctx structures
1093  * @create_flags: QP create flags (pass IB_QP_CREATE_INTEGRITY_EN if
1094  *                data integrity will be enabled on the QP)
1095  *
1096  * Returns the total number of Send Queue entries needed for
1097  * @max_rdma_ctxs. The result accounts for memory registration and
1098  * invalidation work requests when the device requires them.
1099  *
1100  * ULPs use this to size Send Queues and Send CQs before creating a
1101  * Queue Pair.
1102  */
rdma_rw_max_send_wr(struct ib_device * dev,u32 port_num,unsigned int max_rdma_ctxs,u32 create_flags)1103 unsigned int rdma_rw_max_send_wr(struct ib_device *dev, u32 port_num,
1104 				 unsigned int max_rdma_ctxs, u32 create_flags)
1105 {
1106 	unsigned int factor = 1;
1107 	unsigned int result;
1108 
1109 	if (create_flags & IB_QP_CREATE_INTEGRITY_EN ||
1110 	    rdma_rw_can_use_mr(dev, port_num))
1111 		factor += 2;	/* reg + inv */
1112 
1113 	if (check_mul_overflow(factor, max_rdma_ctxs, &result))
1114 		return UINT_MAX;
1115 	return result;
1116 }
1117 EXPORT_SYMBOL(rdma_rw_max_send_wr);
1118 
rdma_rw_init_qp(struct ib_device * dev,struct ib_qp_init_attr * attr)1119 void rdma_rw_init_qp(struct ib_device *dev, struct ib_qp_init_attr *attr)
1120 {
1121 	unsigned int factor = 1;
1122 
1123 	WARN_ON_ONCE(attr->port_num == 0);
1124 
1125 	/*
1126 	 * If the device uses MRs to perform RDMA READ or WRITE operations,
1127 	 * or if data integrity is enabled, account for registration and
1128 	 * invalidation work requests.
1129 	 */
1130 	if (attr->create_flags & IB_QP_CREATE_INTEGRITY_EN ||
1131 	    rdma_rw_can_use_mr(dev, attr->port_num))
1132 		factor += 2;	/* reg + inv */
1133 
1134 	attr->cap.max_send_wr += factor * attr->cap.max_rdma_ctxs;
1135 
1136 	/*
1137 	 * The device might not support all we need, and we'll have to
1138 	 * live with what we get.
1139 	 */
1140 	attr->cap.max_send_wr =
1141 		min_t(u32, attr->cap.max_send_wr, dev->attrs.max_qp_wr);
1142 }
1143 
rdma_rw_init_mrs(struct ib_qp * qp,struct ib_qp_init_attr * attr)1144 int rdma_rw_init_mrs(struct ib_qp *qp, struct ib_qp_init_attr *attr)
1145 {
1146 	struct ib_device *dev = qp->pd->device;
1147 	u32 nr_mrs = 0, nr_sig_mrs = 0, max_num_sg = 0;
1148 	int ret = 0;
1149 
1150 	if (attr->create_flags & IB_QP_CREATE_INTEGRITY_EN) {
1151 		nr_sig_mrs = attr->cap.max_rdma_ctxs;
1152 		nr_mrs = attr->cap.max_rdma_ctxs;
1153 		max_num_sg = rdma_rw_fr_page_list_len(dev, true);
1154 	} else if (rdma_rw_can_use_mr(dev, attr->port_num)) {
1155 		nr_mrs = attr->cap.max_rdma_ctxs;
1156 		max_num_sg = rdma_rw_fr_page_list_len(dev, false);
1157 	}
1158 
1159 	if (nr_mrs) {
1160 		ret = ib_mr_pool_init(qp, &qp->rdma_mrs, nr_mrs,
1161 				IB_MR_TYPE_MEM_REG,
1162 				max_num_sg, 0);
1163 		if (ret) {
1164 			pr_err("%s: failed to allocated %u MRs\n",
1165 				__func__, nr_mrs);
1166 			return ret;
1167 		}
1168 	}
1169 
1170 	if (nr_sig_mrs) {
1171 		ret = ib_mr_pool_init(qp, &qp->sig_mrs, nr_sig_mrs,
1172 				IB_MR_TYPE_INTEGRITY, max_num_sg, max_num_sg);
1173 		if (ret) {
1174 			pr_err("%s: failed to allocated %u SIG MRs\n",
1175 				__func__, nr_sig_mrs);
1176 			goto out_free_rdma_mrs;
1177 		}
1178 	}
1179 
1180 	return 0;
1181 
1182 out_free_rdma_mrs:
1183 	ib_mr_pool_destroy(qp, &qp->rdma_mrs);
1184 	return ret;
1185 }
1186 
/*
 * Tear down the MR pools created by rdma_rw_init_mrs(), in the reverse
 * order of their creation (sig_mrs was initialized last).
 */
void rdma_rw_cleanup_mrs(struct ib_qp *qp)
{
	ib_mr_pool_destroy(qp, &qp->sig_mrs);
	ib_mr_pool_destroy(qp, &qp->rdma_mrs);
}
1192