xref: /linux/drivers/infiniband/core/rw.c (revision c17ee635fd3a482b2ad2bf5e269755c2eae5f25e)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (c) 2016 HGST, a Western Digital Company.
4  */
5 #include <linux/memremap.h>
6 #include <linux/moduleparam.h>
7 #include <linux/slab.h>
8 #include <linux/pci-p2pdma.h>
9 #include <rdma/mr_pool.h>
10 #include <rdma/rw.h>
11 
/* Strategy selected by rdma_rw_ctx_init*() to carry out one transfer. */
enum {
	RDMA_RW_SINGLE_WR,	/* one WR with a single SGE */
	RDMA_RW_MULTI_WR,	/* chain of WRs, multiple SGEs each */
	RDMA_RW_MR,		/* fast-reg MR based, one MR per chunk */
	RDMA_RW_SIG_MR,		/* MR with signature/integrity offload */
	RDMA_RW_IOVA,		/* single WR over a contiguous DMA IOVA range */
};
19 
/* Debug knob: force the MR path even where plain SGE WRs would suffice. */
static bool rdma_rw_force_mr;
module_param_named(force_mr, rdma_rw_force_mr, bool, 0);
MODULE_PARM_DESC(force_mr, "Force usage of MRs for RDMA READ/WRITE operations");
23 
24 /*
25  * Report whether memory registration should be used. Memory registration must
26  * be used for iWarp devices because of iWARP-specific limitations. Memory
27  * registration is also enabled if registering memory might yield better
28  * performance than using multiple SGE entries, see rdma_rw_io_needs_mr()
29  */
30 static inline bool rdma_rw_can_use_mr(struct ib_device *dev, u32 port_num)
31 {
32 	if (rdma_protocol_iwarp(dev, port_num))
33 		return true;
34 	if (dev->attrs.max_sgl_rd)
35 		return true;
36 	if (unlikely(rdma_rw_force_mr))
37 		return true;
38 	return false;
39 }
40 
41 /*
42  * Check if the device will use memory registration for this RW operation.
43  * For RDMA READs we must use MRs on iWarp and can optionally use them as an
44  * optimization otherwise.  Additionally we have a debug option to force usage
45  * of MRs to help testing this code path.
46  */
47 static inline bool rdma_rw_io_needs_mr(struct ib_device *dev, u32 port_num,
48 		enum dma_data_direction dir, int dma_nents)
49 {
50 	if (dir == DMA_FROM_DEVICE) {
51 		if (rdma_protocol_iwarp(dev, port_num))
52 			return true;
53 		if (dev->attrs.max_sgl_rd && dma_nents > dev->attrs.max_sgl_rd)
54 			return true;
55 	}
56 	if (unlikely(rdma_rw_force_mr))
57 		return true;
58 	return false;
59 }
60 
61 static inline u32 rdma_rw_fr_page_list_len(struct ib_device *dev,
62 					   bool pi_support)
63 {
64 	u32 max_pages;
65 
66 	if (pi_support)
67 		max_pages = dev->attrs.max_pi_fast_reg_page_list_len;
68 	else
69 		max_pages = dev->attrs.max_fast_reg_page_list_len;
70 
71 	/* arbitrary limit to avoid allocating gigantic resources */
72 	return min_t(u32, max_pages, 256);
73 }
74 
75 static inline int rdma_rw_inv_key(struct rdma_rw_reg_ctx *reg)
76 {
77 	int count = 0;
78 
79 	if (reg->mr->need_inval) {
80 		reg->inv_wr.opcode = IB_WR_LOCAL_INV;
81 		reg->inv_wr.ex.invalidate_rkey = reg->mr->lkey;
82 		reg->inv_wr.next = &reg->reg_wr.wr;
83 		count++;
84 	} else {
85 		reg->inv_wr.next = NULL;
86 	}
87 
88 	return count;
89 }
90 
/*
 * Map one chunk of the scatterlist into a fast-reg MR from the QP's pool
 * and set up the REG_MR (and possibly LOCAL_INV) work requests for it.
 * Returns the number of WRs queued into *reg, or a negative errno.
 *
 * Caller must have zero-initialized *reg.
 */
static int rdma_rw_init_one_mr(struct ib_qp *qp, u32 port_num,
		struct rdma_rw_reg_ctx *reg, struct scatterlist *sg,
		u32 sg_cnt, u32 offset)
{
	u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device,
						    qp->integrity_en);
	/* Cap this chunk at what a single MR can cover. */
	u32 nents = min(sg_cnt, pages_per_mr);
	int count = 0, ret;

	reg->mr = ib_mr_pool_get(qp, &qp->rdma_mrs);
	if (!reg->mr)
		return -EAGAIN;	/* pool exhausted, caller may retry later */

	/* Invalidate a stale key first, if needed. */
	count += rdma_rw_inv_key(reg);

	ret = ib_map_mr_sg(reg->mr, sg, nents, &offset, PAGE_SIZE);
	if (ret < 0 || ret < nents) {
		/* A partial mapping is unusable here; return the MR. */
		ib_mr_pool_put(qp, &qp->rdma_mrs, reg->mr);
		return -EINVAL;
	}

	reg->reg_wr.wr.opcode = IB_WR_REG_MR;
	reg->reg_wr.mr = reg->mr;
	reg->reg_wr.access = IB_ACCESS_LOCAL_WRITE;
	/* iWARP requires REMOTE_WRITE access on RDMA READ targets. */
	if (rdma_protocol_iwarp(qp->device, port_num))
		reg->reg_wr.access |= IB_ACCESS_REMOTE_WRITE;
	count++;

	/* SGE describing the registered range for the data transfer WR. */
	reg->sge.addr = reg->mr->iova;
	reg->sge.length = reg->mr->length;
	return count;
}
124 
125 static int rdma_rw_init_reg_wr(struct rdma_rw_reg_ctx *reg,
126 		struct rdma_rw_reg_ctx *prev, struct ib_qp *qp, u32 port_num,
127 		u64 remote_addr, u32 rkey, enum dma_data_direction dir)
128 {
129 	if (prev) {
130 		if (reg->mr->need_inval)
131 			prev->wr.wr.next = &reg->inv_wr;
132 		else
133 			prev->wr.wr.next = &reg->reg_wr.wr;
134 	}
135 
136 	reg->reg_wr.wr.next = &reg->wr.wr;
137 
138 	reg->wr.wr.sg_list = &reg->sge;
139 	reg->wr.wr.num_sge = 1;
140 	reg->wr.remote_addr = remote_addr;
141 	reg->wr.rkey = rkey;
142 
143 	if (dir == DMA_TO_DEVICE) {
144 		reg->wr.wr.opcode = IB_WR_RDMA_WRITE;
145 	} else if (!rdma_cap_read_inv(qp->device, port_num)) {
146 		reg->wr.wr.opcode = IB_WR_RDMA_READ;
147 	} else {
148 		reg->wr.wr.opcode = IB_WR_RDMA_READ_WITH_INV;
149 		reg->wr.wr.ex.invalidate_rkey = reg->mr->lkey;
150 	}
151 
152 	return 1;
153 }
154 
/*
 * Split a scatterlist transfer into one MR-sized operation per
 * pages_per_mr entries and build the full WR chain for it.
 * Returns the total number of WRs on success or a negative errno.
 */
static int rdma_rw_init_mr_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
		u32 port_num, struct scatterlist *sg, u32 sg_cnt, u32 offset,
		u64 remote_addr, u32 rkey, enum dma_data_direction dir)
{
	struct rdma_rw_reg_ctx *prev = NULL;
	u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device,
						    qp->integrity_en);
	int i, j, ret = 0, count = 0;

	ctx->nr_ops = DIV_ROUND_UP(sg_cnt, pages_per_mr);
	ctx->reg = kzalloc_objs(*ctx->reg, ctx->nr_ops);
	if (!ctx->reg) {
		ret = -ENOMEM;
		goto out;
	}

	for (i = 0; i < ctx->nr_ops; i++) {
		struct rdma_rw_reg_ctx *reg = &ctx->reg[i];
		u32 nents = min(sg_cnt, pages_per_mr);

		ret = rdma_rw_init_one_mr(qp, port_num, reg, sg, sg_cnt,
				offset);
		if (ret < 0)
			goto out_free;
		count += ret;
		count += rdma_rw_init_reg_wr(reg, prev, qp, port_num,
				remote_addr, rkey, dir);
		/* Advance remote address and local scatterlist position. */
		remote_addr += reg->sge.length;
		sg_cnt -= nents;
		for (j = 0; j < nents; j++)
			sg = sg_next(sg);
		prev = reg;
		offset = 0;	/* only the first chunk starts at an offset */
	}

	if (prev)
		prev->wr.wr.next = NULL;	/* terminate the chain */

	ctx->type = RDMA_RW_MR;
	return count;

out_free:
	/* The failed iteration released its own MR; unwind the rest. */
	while (--i >= 0)
		ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->reg[i].mr);
	kfree(ctx->reg);
out:
	return ret;
}
203 
/*
 * MR-based transfer setup for a bio_vec array: build a scatterlist from
 * the bvecs, DMA-map it, then register it in pages_per_mr-sized chunks.
 * The scatterlist is stashed in ctx->reg[0].sgt so that
 * rdma_rw_ctx_destroy_bvec() can unmap and free it.
 * Returns the total number of WRs on success or a negative errno.
 */
static int rdma_rw_init_mr_wrs_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
		u32 port_num, const struct bio_vec *bvecs, u32 nr_bvec,
		struct bvec_iter *iter, u64 remote_addr, u32 rkey,
		enum dma_data_direction dir)
{
	struct ib_device *dev = qp->pd->device;
	struct rdma_rw_reg_ctx *prev = NULL;
	u32 pages_per_mr = rdma_rw_fr_page_list_len(dev, qp->integrity_en);
	struct scatterlist *sg;
	int i, ret, count = 0;
	u32 nents = 0;

	/* Worst case: every bvec maps and no coalescing happens. */
	ctx->reg = kzalloc_objs(*ctx->reg, DIV_ROUND_UP(nr_bvec, pages_per_mr));
	if (!ctx->reg)
		return -ENOMEM;

	/*
	 * Build scatterlist from bvecs using the iterator. This follows
	 * the pattern from __blk_rq_map_sg.
	 */
	ctx->reg[0].sgt.sgl = kmalloc_objs(*ctx->reg[0].sgt.sgl, nr_bvec);
	if (!ctx->reg[0].sgt.sgl) {
		ret = -ENOMEM;
		goto out_free_reg;
	}
	sg_init_table(ctx->reg[0].sgt.sgl, nr_bvec);

	for (sg = ctx->reg[0].sgt.sgl; iter->bi_size; sg = sg_next(sg)) {
		struct bio_vec bv = mp_bvec_iter_bvec(bvecs, *iter);

		/* Guard against an iter describing more than nr_bvec entries. */
		if (nents >= nr_bvec) {
			ret = -EINVAL;
			goto out_free_sgl;
		}
		sg_set_page(sg, bv.bv_page, bv.bv_len, bv.bv_offset);
		bvec_iter_advance(bvecs, iter, bv.bv_len);
		nents++;
	}
	sg_mark_end(sg_last(ctx->reg[0].sgt.sgl, nents));
	ctx->reg[0].sgt.orig_nents = nents;

	/* DMA map the scatterlist */
	ret = ib_dma_map_sgtable_attrs(dev, &ctx->reg[0].sgt, dir, 0);
	if (ret)
		goto out_free_sgl;

	/* Mapping may coalesce entries, so size nr_ops from sgt.nents. */
	ctx->nr_ops = DIV_ROUND_UP(ctx->reg[0].sgt.nents, pages_per_mr);

	sg = ctx->reg[0].sgt.sgl;
	nents = ctx->reg[0].sgt.nents;
	for (i = 0; i < ctx->nr_ops; i++) {
		struct rdma_rw_reg_ctx *reg = &ctx->reg[i];
		u32 sge_cnt = min(nents, pages_per_mr);

		ret = rdma_rw_init_one_mr(qp, port_num, reg, sg, sge_cnt, 0);
		if (ret < 0)
			goto out_free_mrs;
		count += ret;
		count += rdma_rw_init_reg_wr(reg, prev, qp, port_num,
				remote_addr, rkey, dir);
		remote_addr += reg->sge.length;
		nents -= sge_cnt;
		sg += sge_cnt;	/* sgl is one contiguous array, so this is OK */
		prev = reg;
	}

	if (prev)
		prev->wr.wr.next = NULL;	/* terminate the chain */

	ctx->type = RDMA_RW_MR;
	return count;

out_free_mrs:
	/* The failed iteration released its own MR; unwind the rest. */
	while (--i >= 0)
		ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->reg[i].mr);
	ib_dma_unmap_sgtable_attrs(dev, &ctx->reg[0].sgt, dir, 0);
out_free_sgl:
	kfree(ctx->reg[0].sgt.sgl);
out_free_reg:
	kfree(ctx->reg);
	return ret;
}
286 
/*
 * Build a chain of plain RDMA WRs referencing the DMA-mapped
 * scatterlist directly, with up to max_sge SGEs per WR.
 * Returns the number of WRs, or -ENOMEM on allocation failure.
 */
static int rdma_rw_init_map_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
		struct scatterlist *sg, u32 sg_cnt, u32 offset,
		u64 remote_addr, u32 rkey, enum dma_data_direction dir)
{
	u32 max_sge = dir == DMA_TO_DEVICE ? qp->max_write_sge :
		      qp->max_read_sge;
	struct ib_sge *sge;
	u32 total_len = 0, i, j;

	ctx->nr_ops = DIV_ROUND_UP(sg_cnt, max_sge);

	ctx->map.sges = sge = kzalloc_objs(*sge, sg_cnt);
	if (!ctx->map.sges)
		goto out;

	ctx->map.wrs = kzalloc_objs(*ctx->map.wrs, ctx->nr_ops);
	if (!ctx->map.wrs)
		goto out_free_sges;

	for (i = 0; i < ctx->nr_ops; i++) {
		struct ib_rdma_wr *rdma_wr = &ctx->map.wrs[i];
		u32 nr_sge = min(sg_cnt, max_sge);

		if (dir == DMA_TO_DEVICE)
			rdma_wr->wr.opcode = IB_WR_RDMA_WRITE;
		else
			rdma_wr->wr.opcode = IB_WR_RDMA_READ;
		/* Each WR targets the remote region just past the last one. */
		rdma_wr->remote_addr = remote_addr + total_len;
		rdma_wr->rkey = rkey;
		rdma_wr->wr.num_sge = nr_sge;
		rdma_wr->wr.sg_list = sge;

		for (j = 0; j < nr_sge; j++, sg = sg_next(sg)) {
			sge->addr = sg_dma_address(sg) + offset;
			sge->length = sg_dma_len(sg) - offset;
			sge->lkey = qp->pd->local_dma_lkey;

			total_len += sge->length;
			sge++;
			sg_cnt--;
			offset = 0;	/* offset only applies to the first SGE */
		}

		/* Chain to the next WR, or terminate the list. */
		rdma_wr->wr.next = i + 1 < ctx->nr_ops ?
			&ctx->map.wrs[i + 1].wr : NULL;
	}

	ctx->type = RDMA_RW_MULTI_WR;
	return ctx->nr_ops;

out_free_sges:
	kfree(ctx->map.sges);
out:
	return -ENOMEM;
}
342 
343 static int rdma_rw_init_single_wr(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
344 		struct scatterlist *sg, u32 offset, u64 remote_addr, u32 rkey,
345 		enum dma_data_direction dir)
346 {
347 	struct ib_rdma_wr *rdma_wr = &ctx->single.wr;
348 
349 	ctx->nr_ops = 1;
350 
351 	ctx->single.sge.lkey = qp->pd->local_dma_lkey;
352 	ctx->single.sge.addr = sg_dma_address(sg) + offset;
353 	ctx->single.sge.length = sg_dma_len(sg) - offset;
354 
355 	memset(rdma_wr, 0, sizeof(*rdma_wr));
356 	if (dir == DMA_TO_DEVICE)
357 		rdma_wr->wr.opcode = IB_WR_RDMA_WRITE;
358 	else
359 		rdma_wr->wr.opcode = IB_WR_RDMA_READ;
360 	rdma_wr->wr.sg_list = &ctx->single.sge;
361 	rdma_wr->wr.num_sge = 1;
362 	rdma_wr->remote_addr = remote_addr;
363 	rdma_wr->rkey = rkey;
364 
365 	ctx->type = RDMA_RW_SINGLE_WR;
366 	return 1;
367 }
368 
369 static int rdma_rw_init_single_wr_bvec(struct rdma_rw_ctx *ctx,
370 		struct ib_qp *qp, const struct bio_vec *bvecs,
371 		struct bvec_iter *iter, u64 remote_addr, u32 rkey,
372 		enum dma_data_direction dir)
373 {
374 	struct ib_device *dev = qp->pd->device;
375 	struct ib_rdma_wr *rdma_wr = &ctx->single.wr;
376 	struct bio_vec bv = mp_bvec_iter_bvec(bvecs, *iter);
377 	u64 dma_addr;
378 
379 	ctx->nr_ops = 1;
380 
381 	dma_addr = ib_dma_map_bvec(dev, &bv, dir);
382 	if (ib_dma_mapping_error(dev, dma_addr))
383 		return -ENOMEM;
384 
385 	ctx->single.sge.lkey = qp->pd->local_dma_lkey;
386 	ctx->single.sge.addr = dma_addr;
387 	ctx->single.sge.length = bv.bv_len;
388 
389 	memset(rdma_wr, 0, sizeof(*rdma_wr));
390 	if (dir == DMA_TO_DEVICE)
391 		rdma_wr->wr.opcode = IB_WR_RDMA_WRITE;
392 	else
393 		rdma_wr->wr.opcode = IB_WR_RDMA_READ;
394 	rdma_wr->wr.sg_list = &ctx->single.sge;
395 	rdma_wr->wr.num_sge = 1;
396 	rdma_wr->remote_addr = remote_addr;
397 	rdma_wr->rkey = rkey;
398 
399 	ctx->type = RDMA_RW_SINGLE_WR;
400 	return 1;
401 }
402 
/*
 * Build a chain of plain RDMA WRs directly from a bio_vec array,
 * mapping each bvec to one SGE, with up to max_sge SGEs per WR.
 * SGEs and WRs live in a single allocation (sges first, wrs after,
 * suitably aligned); freeing ctx->map.sges releases both.
 * Returns the number of WRs or -ENOMEM.
 */
static int rdma_rw_init_map_wrs_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
		const struct bio_vec *bvecs, u32 nr_bvec, struct bvec_iter *iter,
		u64 remote_addr, u32 rkey, enum dma_data_direction dir)
{
	struct ib_device *dev = qp->pd->device;
	u32 max_sge = dir == DMA_TO_DEVICE ? qp->max_write_sge :
		      qp->max_read_sge;
	struct ib_sge *sge;
	u32 total_len = 0, i, j;
	u32 mapped_bvecs = 0;
	u32 nr_ops = DIV_ROUND_UP(nr_bvec, max_sge);
	size_t sges_size = array_size(nr_bvec, sizeof(*ctx->map.sges));
	size_t wrs_offset = ALIGN(sges_size, __alignof__(*ctx->map.wrs));
	size_t wrs_size = array_size(nr_ops, sizeof(*ctx->map.wrs));
	void *mem;

	/* Reject arithmetic overflow in the combined allocation size. */
	if (sges_size == SIZE_MAX || wrs_size == SIZE_MAX ||
	    check_add_overflow(wrs_offset, wrs_size, &wrs_size))
		return -ENOMEM;

	mem = kzalloc(wrs_size, GFP_KERNEL);
	if (!mem)
		return -ENOMEM;

	ctx->map.sges = sge = mem;
	ctx->map.wrs = mem + wrs_offset;

	for (i = 0; i < nr_ops; i++) {
		struct ib_rdma_wr *rdma_wr = &ctx->map.wrs[i];
		u32 nr_sge = min(nr_bvec - mapped_bvecs, max_sge);

		if (dir == DMA_TO_DEVICE)
			rdma_wr->wr.opcode = IB_WR_RDMA_WRITE;
		else
			rdma_wr->wr.opcode = IB_WR_RDMA_READ;
		/* Each WR targets the remote region just past the last one. */
		rdma_wr->remote_addr = remote_addr + total_len;
		rdma_wr->rkey = rkey;
		rdma_wr->wr.num_sge = nr_sge;
		rdma_wr->wr.sg_list = sge;

		for (j = 0; j < nr_sge; j++) {
			struct bio_vec bv = mp_bvec_iter_bvec(bvecs, *iter);
			u64 dma_addr;

			dma_addr = ib_dma_map_bvec(dev, &bv, dir);
			if (ib_dma_mapping_error(dev, dma_addr))
				goto out_unmap;

			mapped_bvecs++;
			sge->addr = dma_addr;
			sge->length = bv.bv_len;
			sge->lkey = qp->pd->local_dma_lkey;

			total_len += bv.bv_len;
			sge++;

			bvec_iter_advance_single(bvecs, iter, bv.bv_len);
		}

		/* Chain to the next WR, or terminate the list. */
		rdma_wr->wr.next = i + 1 < nr_ops ?
			&ctx->map.wrs[i + 1].wr : NULL;
	}

	ctx->nr_ops = nr_ops;
	ctx->type = RDMA_RW_MULTI_WR;
	return nr_ops;

out_unmap:
	/* Only undo the bvecs that were successfully mapped so far. */
	for (i = 0; i < mapped_bvecs; i++)
		ib_dma_unmap_bvec(dev, ctx->map.sges[i].addr,
				  ctx->map.sges[i].length, dir);
	kfree(ctx->map.sges);	/* frees the wrs too (same allocation) */
	return -ENOMEM;
}
477 
478 /*
479  * Try to use the two-step IOVA API to map bvecs into a contiguous DMA range.
480  * This reduces IOTLB sync overhead by doing one sync at the end instead of
481  * one per bvec, and produces a contiguous DMA address range that can be
482  * described by a single SGE.
483  *
484  * Returns the number of WQEs (always 1) on success, -EOPNOTSUPP if IOVA
485  * mapping is not available, or another negative error code on failure.
486  */
static int rdma_rw_init_iova_wrs_bvec(struct rdma_rw_ctx *ctx,
		struct ib_qp *qp, const struct bio_vec *bvec,
		struct bvec_iter *iter, u64 remote_addr, u32 rkey,
		enum dma_data_direction dir)
{
	struct ib_device *dev = qp->pd->device;
	struct device *dma_dev = dev->dma_device;
	size_t total_len = iter->bi_size;
	struct bio_vec first_bv;
	size_t mapped_len = 0;
	int ret;

	/* Virtual DMA devices cannot support IOVA allocators */
	if (ib_uses_virt_dma(dev))
		return -EOPNOTSUPP;

	/* Try to allocate contiguous IOVA space */
	first_bv = mp_bvec_iter_bvec(bvec, *iter);
	if (!dma_iova_try_alloc(dma_dev, &ctx->iova.state,
				bvec_phys(&first_bv), total_len))
		return -EOPNOTSUPP;

	/* Link all bvecs into the IOVA space */
	while (iter->bi_size) {
		struct bio_vec bv = mp_bvec_iter_bvec(bvec, *iter);

		ret = dma_iova_link(dma_dev, &ctx->iova.state, bvec_phys(&bv),
				    mapped_len, bv.bv_len, dir, 0);
		if (ret)
			goto out_destroy;

		mapped_len += bv.bv_len;
		bvec_iter_advance(bvec, iter, bv.bv_len);
	}

	/* Sync the IOTLB once for all linked pages */
	ret = dma_iova_sync(dma_dev, &ctx->iova.state, 0, mapped_len);
	if (ret)
		goto out_destroy;

	/* Remembered so rdma_rw_ctx_destroy_bvec() can unlink exactly this much. */
	ctx->iova.mapped_len = mapped_len;

	/* Single SGE covers the entire contiguous IOVA range */
	ctx->iova.sge.addr = ctx->iova.state.addr;
	ctx->iova.sge.length = mapped_len;
	ctx->iova.sge.lkey = qp->pd->local_dma_lkey;

	/* Single WR for the whole transfer */
	memset(&ctx->iova.wr, 0, sizeof(ctx->iova.wr));
	if (dir == DMA_TO_DEVICE)
		ctx->iova.wr.wr.opcode = IB_WR_RDMA_WRITE;
	else
		ctx->iova.wr.wr.opcode = IB_WR_RDMA_READ;
	ctx->iova.wr.wr.num_sge = 1;
	ctx->iova.wr.wr.sg_list = &ctx->iova.sge;
	ctx->iova.wr.remote_addr = remote_addr;
	ctx->iova.wr.rkey = rkey;

	ctx->type = RDMA_RW_IOVA;
	ctx->nr_ops = 1;
	return 1;

out_destroy:
	/*
	 * dma_iova_destroy() expects the actual mapped length, not the
	 * total allocation size. It unlinks only the successfully linked
	 * range and frees the entire IOVA allocation.
	 */
	dma_iova_destroy(dma_dev, &ctx->iova.state, mapped_len, dir, 0);
	return ret;
}
558 
559 /**
560  * rdma_rw_ctx_init - initialize a RDMA READ/WRITE context
561  * @ctx:	context to initialize
562  * @qp:		queue pair to operate on
563  * @port_num:	port num to which the connection is bound
564  * @sg:		scatterlist to READ/WRITE from/to
565  * @sg_cnt:	number of entries in @sg
566  * @sg_offset:	current byte offset into @sg
567  * @remote_addr:remote address to read/write (relative to @rkey)
568  * @rkey:	remote key to operate on
569  * @dir:	%DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ
570  *
571  * Returns the number of WQEs that will be needed on the workqueue if
572  * successful, or a negative error code.
573  */
int rdma_rw_ctx_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u32 port_num,
		struct scatterlist *sg, u32 sg_cnt, u32 sg_offset,
		u64 remote_addr, u32 rkey, enum dma_data_direction dir)
{
	struct ib_device *dev = qp->pd->device;
	struct sg_table sgt = {
		.sgl = sg,
		.orig_nents = sg_cnt,
	};
	int ret;

	ret = ib_dma_map_sgtable_attrs(dev, &sgt, dir, 0);
	if (ret)
		return ret;
	sg_cnt = sgt.nents;	/* mapping may have coalesced entries */

	/*
	 * Skip to the S/G entry that sg_offset falls into:
	 */
	for (;;) {
		u32 len = sg_dma_len(sg);

		if (sg_offset < len)
			break;

		sg = sg_next(sg);
		sg_offset -= len;
		sg_cnt--;
	}

	/* Offset consumed the whole list: caller passed bogus arguments. */
	ret = -EIO;
	if (WARN_ON_ONCE(sg_cnt == 0))
		goto out_unmap_sg;

	/* Pick the cheapest strategy the device supports for this I/O. */
	if (rdma_rw_io_needs_mr(qp->device, port_num, dir, sg_cnt)) {
		ret = rdma_rw_init_mr_wrs(ctx, qp, port_num, sg, sg_cnt,
				sg_offset, remote_addr, rkey, dir);
	} else if (sg_cnt > 1) {
		ret = rdma_rw_init_map_wrs(ctx, qp, sg, sg_cnt, sg_offset,
				remote_addr, rkey, dir);
	} else {
		ret = rdma_rw_init_single_wr(ctx, qp, sg, sg_offset,
				remote_addr, rkey, dir);
	}

	if (ret < 0)
		goto out_unmap_sg;
	return ret;

out_unmap_sg:
	ib_dma_unmap_sgtable_attrs(dev, &sgt, dir, 0);
	return ret;
}
627 EXPORT_SYMBOL(rdma_rw_ctx_init);
628 
629 /**
630  * rdma_rw_ctx_init_bvec - initialize a RDMA READ/WRITE context from bio_vec
631  * @ctx:	context to initialize
632  * @qp:		queue pair to operate on
633  * @port_num:	port num to which the connection is bound
634  * @bvecs:	bio_vec array to READ/WRITE from/to
635  * @nr_bvec:	number of entries in @bvecs
636  * @iter:	bvec iterator describing offset and length
637  * @remote_addr: remote address to read/write (relative to @rkey)
638  * @rkey:	remote key to operate on
639  * @dir:	%DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ
640  *
641  * Maps the bio_vec array directly, avoiding intermediate scatterlist
642  * conversion. Supports MR registration for iWARP devices and force_mr mode.
643  *
644  * Returns the number of WQEs that will be needed on the workqueue if
645  * successful, or a negative error code:
646  *
647  *   * -EINVAL  - @nr_bvec is zero or @iter.bi_size is zero
648  *   * -ENOMEM - DMA mapping or memory allocation failed
649  */
650 int rdma_rw_ctx_init_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
651 		u32 port_num, const struct bio_vec *bvecs, u32 nr_bvec,
652 		struct bvec_iter iter, u64 remote_addr, u32 rkey,
653 		enum dma_data_direction dir)
654 {
655 	struct ib_device *dev = qp->pd->device;
656 	int ret;
657 
658 	if (nr_bvec == 0 || iter.bi_size == 0)
659 		return -EINVAL;
660 
661 	/*
662 	 * iWARP requires MR registration for all RDMA READs. The force_mr
663 	 * debug option also mandates MR usage.
664 	 */
665 	if (dir == DMA_FROM_DEVICE && rdma_protocol_iwarp(dev, port_num))
666 		return rdma_rw_init_mr_wrs_bvec(ctx, qp, port_num, bvecs,
667 						nr_bvec, &iter, remote_addr,
668 						rkey, dir);
669 	if (unlikely(rdma_rw_force_mr))
670 		return rdma_rw_init_mr_wrs_bvec(ctx, qp, port_num, bvecs,
671 						nr_bvec, &iter, remote_addr,
672 						rkey, dir);
673 
674 	if (nr_bvec == 1)
675 		return rdma_rw_init_single_wr_bvec(ctx, qp, bvecs, &iter,
676 				remote_addr, rkey, dir);
677 
678 	/*
679 	 * Try IOVA-based mapping first for multi-bvec transfers.
680 	 * IOVA coalesces bvecs into a single DMA-contiguous region,
681 	 * reducing the number of WRs needed and avoiding MR overhead.
682 	 */
683 	ret = rdma_rw_init_iova_wrs_bvec(ctx, qp, bvecs, &iter, remote_addr,
684 			rkey, dir);
685 	if (ret != -EOPNOTSUPP)
686 		return ret;
687 
688 	/*
689 	 * IOVA mapping not available. Check if MR registration provides
690 	 * better performance than multiple SGE entries.
691 	 */
692 	if (rdma_rw_io_needs_mr(dev, port_num, dir, nr_bvec))
693 		return rdma_rw_init_mr_wrs_bvec(ctx, qp, port_num, bvecs,
694 						nr_bvec, &iter, remote_addr,
695 						rkey, dir);
696 
697 	return rdma_rw_init_map_wrs_bvec(ctx, qp, bvecs, nr_bvec, &iter,
698 			remote_addr, rkey, dir);
699 }
700 EXPORT_SYMBOL(rdma_rw_ctx_init_bvec);
701 
702 /**
703  * rdma_rw_ctx_signature_init - initialize a RW context with signature offload
704  * @ctx:	context to initialize
705  * @qp:		queue pair to operate on
706  * @port_num:	port num to which the connection is bound
707  * @sg:		scatterlist to READ/WRITE from/to
708  * @sg_cnt:	number of entries in @sg
709  * @prot_sg:	scatterlist to READ/WRITE protection information from/to
710  * @prot_sg_cnt: number of entries in @prot_sg
711  * @sig_attrs:	signature offloading algorithms
712  * @remote_addr:remote address to read/write (relative to @rkey)
713  * @rkey:	remote key to operate on
714  * @dir:	%DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ
715  *
716  * Returns the number of WQEs that will be needed on the workqueue if
717  * successful, or a negative error code.
718  */
int rdma_rw_ctx_signature_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
		u32 port_num, struct scatterlist *sg, u32 sg_cnt,
		struct scatterlist *prot_sg, u32 prot_sg_cnt,
		struct ib_sig_attrs *sig_attrs,
		u64 remote_addr, u32 rkey, enum dma_data_direction dir)
{
	struct ib_device *dev = qp->pd->device;
	u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device,
						    qp->integrity_en);
	struct sg_table sgt = {
		.sgl = sg,
		.orig_nents = sg_cnt,
	};
	struct sg_table prot_sgt = {
		.sgl = prot_sg,
		.orig_nents = prot_sg_cnt,
	};
	struct ib_rdma_wr *rdma_wr;
	int count = 0, ret;

	/* The signature path uses exactly one MR: both lists must fit it. */
	if (sg_cnt > pages_per_mr || prot_sg_cnt > pages_per_mr) {
		pr_err("SG count too large: sg_cnt=%u, prot_sg_cnt=%u, pages_per_mr=%u\n",
		       sg_cnt, prot_sg_cnt, pages_per_mr);
		return -EINVAL;
	}

	ret = ib_dma_map_sgtable_attrs(dev, &sgt, dir, 0);
	if (ret)
		return ret;

	if (prot_sg_cnt) {
		ret = ib_dma_map_sgtable_attrs(dev, &prot_sgt, dir, 0);
		if (ret)
			goto out_unmap_sg;
	}

	ctx->type = RDMA_RW_SIG_MR;
	ctx->nr_ops = 1;
	ctx->reg = kzalloc_obj(*ctx->reg);
	if (!ctx->reg) {
		ret = -ENOMEM;
		goto out_unmap_prot_sg;
	}

	ctx->reg->mr = ib_mr_pool_get(qp, &qp->sig_mrs);
	if (!ctx->reg->mr) {
		ret = -EAGAIN;	/* pool exhausted, caller may retry later */
		goto out_free_ctx;
	}

	/* Queue a LOCAL_INV first if the MR still carries a stale key. */
	count += rdma_rw_inv_key(ctx->reg);

	memcpy(ctx->reg->mr->sig_attrs, sig_attrs, sizeof(struct ib_sig_attrs));

	/* Register data and protection lists in one PI-enabled MR. */
	ret = ib_map_mr_sg_pi(ctx->reg->mr, sg, sgt.nents, NULL, prot_sg,
			      prot_sgt.nents, NULL, SZ_4K);
	if (unlikely(ret)) {
		pr_err("failed to map PI sg (%u)\n",
		       sgt.nents + prot_sgt.nents);
		goto out_destroy_sig_mr;
	}

	ctx->reg->reg_wr.wr.opcode = IB_WR_REG_MR_INTEGRITY;
	ctx->reg->reg_wr.wr.wr_cqe = NULL;
	ctx->reg->reg_wr.wr.num_sge = 0;
	ctx->reg->reg_wr.wr.send_flags = 0;
	ctx->reg->reg_wr.access = IB_ACCESS_LOCAL_WRITE;
	/* iWARP requires REMOTE_WRITE access on RDMA READ targets. */
	if (rdma_protocol_iwarp(qp->device, port_num))
		ctx->reg->reg_wr.access |= IB_ACCESS_REMOTE_WRITE;
	ctx->reg->reg_wr.mr = ctx->reg->mr;
	ctx->reg->reg_wr.key = ctx->reg->mr->lkey;
	count++;

	ctx->reg->sge.addr = ctx->reg->mr->iova;
	ctx->reg->sge.length = ctx->reg->mr->length;
	/* No wire-side PI: don't transfer the appended metadata bytes. */
	if (sig_attrs->wire.sig_type == IB_SIG_TYPE_NONE)
		ctx->reg->sge.length -= ctx->reg->mr->sig_attrs->meta_length;

	rdma_wr = &ctx->reg->wr;
	rdma_wr->wr.sg_list = &ctx->reg->sge;
	rdma_wr->wr.num_sge = 1;
	rdma_wr->remote_addr = remote_addr;
	rdma_wr->rkey = rkey;
	if (dir == DMA_TO_DEVICE)
		rdma_wr->wr.opcode = IB_WR_RDMA_WRITE;
	else
		rdma_wr->wr.opcode = IB_WR_RDMA_READ;
	ctx->reg->reg_wr.wr.next = &rdma_wr->wr;
	count++;

	return count;

out_destroy_sig_mr:
	ib_mr_pool_put(qp, &qp->sig_mrs, ctx->reg->mr);
out_free_ctx:
	kfree(ctx->reg);
out_unmap_prot_sg:
	if (prot_sgt.nents)
		ib_dma_unmap_sgtable_attrs(dev, &prot_sgt, dir, 0);
out_unmap_sg:
	ib_dma_unmap_sgtable_attrs(dev, &sgt, dir, 0);
	return ret;
}
822 EXPORT_SYMBOL(rdma_rw_ctx_signature_init);
823 
824 /*
825  * Now that we are going to post the WRs we can update the lkey and need_inval
826  * state on the MRs.  If we were doing this at init time, we would get double
827  * or missing invalidations if a context was initialized but not actually
828  * posted.
829  */
830 static void rdma_rw_update_lkey(struct rdma_rw_reg_ctx *reg, bool need_inval)
831 {
832 	reg->mr->need_inval = need_inval;
833 	ib_update_fast_reg_key(reg->mr, ib_inc_rkey(reg->mr->lkey));
834 	reg->reg_wr.key = reg->mr->lkey;
835 	reg->sge.lkey = reg->mr->lkey;
836 }
837 
838 /**
839  * rdma_rw_ctx_wrs - return chain of WRs for a RDMA READ or WRITE operation
840  * @ctx:	context to operate on
841  * @qp:		queue pair to operate on
842  * @port_num:	port num to which the connection is bound
843  * @cqe:	completion queue entry for the last WR
844  * @chain_wr:	WR to append to the posted chain
845  *
846  * Return the WR chain for the set of RDMA READ/WRITE operations described by
847  * @ctx, as well as any memory registration operations needed.  If @chain_wr
848  * is non-NULL the WR it points to will be appended to the chain of WRs posted.
849  * If @chain_wr is not set @cqe must be set so that the caller gets a
850  * completion notification.
851  */
struct ib_send_wr *rdma_rw_ctx_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
		u32 port_num, struct ib_cqe *cqe, struct ib_send_wr *chain_wr)
{
	struct ib_send_wr *first_wr, *last_wr;
	int i;

	/* Pick the head and tail of the pre-built chain for this ctx type. */
	switch (ctx->type) {
	case RDMA_RW_SIG_MR:
	case RDMA_RW_MR:
		/* READ_WITH_INV invalidates on completion; others need it later. */
		for (i = 0; i < ctx->nr_ops; i++) {
			rdma_rw_update_lkey(&ctx->reg[i],
				ctx->reg[i].wr.wr.opcode !=
					IB_WR_RDMA_READ_WITH_INV);
		}

		/* Start at the LOCAL_INV WR if one was queued. */
		if (ctx->reg[0].inv_wr.next)
			first_wr = &ctx->reg[0].inv_wr;
		else
			first_wr = &ctx->reg[0].reg_wr.wr;
		last_wr = &ctx->reg[ctx->nr_ops - 1].wr.wr;
		break;
	case RDMA_RW_IOVA:
		first_wr = &ctx->iova.wr.wr;
		last_wr = &ctx->iova.wr.wr;
		break;
	case RDMA_RW_MULTI_WR:
		first_wr = &ctx->map.wrs[0].wr;
		last_wr = &ctx->map.wrs[ctx->nr_ops - 1].wr;
		break;
	case RDMA_RW_SINGLE_WR:
		first_wr = &ctx->single.wr.wr;
		last_wr = &ctx->single.wr.wr;
		break;
	default:
		BUG();
	}

	if (chain_wr) {
		last_wr->next = chain_wr;
	} else {
		/* No chained WR: signal completion on the last WR instead. */
		last_wr->wr_cqe = cqe;
		last_wr->send_flags |= IB_SEND_SIGNALED;
	}

	return first_wr;
}
898 EXPORT_SYMBOL(rdma_rw_ctx_wrs);
899 
900 /**
901  * rdma_rw_ctx_post - post a RDMA READ or RDMA WRITE operation
902  * @ctx:	context to operate on
903  * @qp:		queue pair to operate on
904  * @port_num:	port num to which the connection is bound
905  * @cqe:	completion queue entry for the last WR
906  * @chain_wr:	WR to append to the posted chain
907  *
908  * Post the set of RDMA READ/WRITE operations described by @ctx, as well as
909  * any memory registration operations needed.  If @chain_wr is non-NULL the
910  * WR it points to will be appended to the chain of WRs posted.  If @chain_wr
911  * is not set @cqe must be set so that the caller gets a completion
912  * notification.
913  */
914 int rdma_rw_ctx_post(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u32 port_num,
915 		struct ib_cqe *cqe, struct ib_send_wr *chain_wr)
916 {
917 	struct ib_send_wr *first_wr;
918 
919 	first_wr = rdma_rw_ctx_wrs(ctx, qp, port_num, cqe, chain_wr);
920 	return ib_post_send(qp, first_wr, NULL);
921 }
922 EXPORT_SYMBOL(rdma_rw_ctx_post);
923 
924 /**
925  * rdma_rw_ctx_destroy - release all resources allocated by rdma_rw_ctx_init
926  * @ctx:	context to release
927  * @qp:		queue pair to operate on
928  * @port_num:	port num to which the connection is bound
929  * @sg:		scatterlist that was used for the READ/WRITE
930  * @sg_cnt:	number of entries in @sg
931  * @dir:	%DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ
932  */
void rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
			 u32 port_num, struct scatterlist *sg, u32 sg_cnt,
			 enum dma_data_direction dir)
{
	int i;

	switch (ctx->type) {
	case RDMA_RW_MR:
		/* Bvec MR contexts must use rdma_rw_ctx_destroy_bvec() */
		WARN_ON_ONCE(ctx->reg[0].sgt.sgl);
		for (i = 0; i < ctx->nr_ops; i++)
			ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->reg[i].mr);
		kfree(ctx->reg);
		break;
	case RDMA_RW_MULTI_WR:
		kfree(ctx->map.wrs);
		kfree(ctx->map.sges);
		break;
	case RDMA_RW_SINGLE_WR:
		/* Nothing was allocated; only the unmap below is needed. */
		break;
	case RDMA_RW_IOVA:
		/* IOVA contexts must use rdma_rw_ctx_destroy_bvec() */
		WARN_ON_ONCE(1);
		return;
	default:
		BUG();
		break;
	}

	/* Undo the mapping done by rdma_rw_ctx_init(). */
	ib_dma_unmap_sg(qp->pd->device, sg, sg_cnt, dir);
}
964 EXPORT_SYMBOL(rdma_rw_ctx_destroy);
965 
966 /**
967  * rdma_rw_ctx_destroy_bvec - release resources from rdma_rw_ctx_init_bvec
968  * @ctx:	context to release
969  * @qp:		queue pair to operate on
970  * @port_num:	port num to which the connection is bound (unused)
971  * @bvecs:	bio_vec array that was used for the READ/WRITE (unused)
972  * @nr_bvec:	number of entries in @bvecs
973  * @dir:	%DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ
974  *
975  * Releases all resources allocated by a successful rdma_rw_ctx_init_bvec()
976  * call. Must not be called if rdma_rw_ctx_init_bvec() returned an error.
977  *
978  * The @port_num and @bvecs parameters are unused but present for API
979  * symmetry with rdma_rw_ctx_destroy().
980  */
void rdma_rw_ctx_destroy_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
		u32 __maybe_unused port_num,
		const struct bio_vec __maybe_unused *bvecs,
		u32 nr_bvec, enum dma_data_direction dir)
{
	struct ib_device *dev = qp->pd->device;
	u32 i;

	/* Mirror the setup done by the matching rdma_rw_init_*_bvec(). */
	switch (ctx->type) {
	case RDMA_RW_MR:
		for (i = 0; i < ctx->nr_ops; i++)
			ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->reg[i].mr);
		/* The built scatterlist is stashed in reg[0].sgt. */
		ib_dma_unmap_sgtable_attrs(dev, &ctx->reg[0].sgt, dir, 0);
		kfree(ctx->reg[0].sgt.sgl);
		kfree(ctx->reg);
		break;
	case RDMA_RW_IOVA:
		/* Unlinks the mapped range and frees the IOVA allocation. */
		dma_iova_destroy(dev->dma_device, &ctx->iova.state,
				 ctx->iova.mapped_len, dir, 0);
		break;
	case RDMA_RW_MULTI_WR:
		/* One SGE was created per bvec, so nr_bvec bounds the unmap. */
		for (i = 0; i < nr_bvec; i++)
			ib_dma_unmap_bvec(dev, ctx->map.sges[i].addr,
					  ctx->map.sges[i].length, dir);
		kfree(ctx->map.sges);	/* frees the wrs too (same allocation) */
		break;
	case RDMA_RW_SINGLE_WR:
		ib_dma_unmap_bvec(dev, ctx->single.sge.addr,
				  ctx->single.sge.length, dir);
		break;
	default:
		WARN_ON_ONCE(1);
		return;
	}
}
1016 EXPORT_SYMBOL(rdma_rw_ctx_destroy_bvec);
1017 
1018 /**
1019  * rdma_rw_ctx_destroy_signature - release all resources allocated by
1020  *	rdma_rw_ctx_signature_init
1021  * @ctx:	context to release
1022  * @qp:		queue pair to operate on
1023  * @port_num:	port num to which the connection is bound
1024  * @sg:		scatterlist that was used for the READ/WRITE
1025  * @sg_cnt:	number of entries in @sg
1026  * @prot_sg:	scatterlist that was used for the READ/WRITE of the PI
1027  * @prot_sg_cnt: number of entries in @prot_sg
1028  * @dir:	%DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ
1029  */
1030 void rdma_rw_ctx_destroy_signature(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
1031 		u32 port_num, struct scatterlist *sg, u32 sg_cnt,
1032 		struct scatterlist *prot_sg, u32 prot_sg_cnt,
1033 		enum dma_data_direction dir)
1034 {
1035 	if (WARN_ON_ONCE(ctx->type != RDMA_RW_SIG_MR))
1036 		return;
1037 
1038 	ib_mr_pool_put(qp, &qp->sig_mrs, ctx->reg->mr);
1039 	kfree(ctx->reg);
1040 
1041 	if (prot_sg_cnt)
1042 		ib_dma_unmap_sg(qp->pd->device, prot_sg, prot_sg_cnt, dir);
1043 	ib_dma_unmap_sg(qp->pd->device, sg, sg_cnt, dir);
1044 }
1045 EXPORT_SYMBOL(rdma_rw_ctx_destroy_signature);
1046 
1047 /**
1048  * rdma_rw_mr_factor - return number of MRs required for a payload
1049  * @device:	device handling the connection
1050  * @port_num:	port num to which the connection is bound
1051  * @maxpages:	maximum payload pages per rdma_rw_ctx
1052  *
 * Returns the number of MRs the device requires to move @maxpages
 * pages. The returned value is used during transport creation to
1055  * compute max_rdma_ctxts and the size of the transport's Send and
1056  * Send Completion Queues.
1057  */
1058 unsigned int rdma_rw_mr_factor(struct ib_device *device, u32 port_num,
1059 			       unsigned int maxpages)
1060 {
1061 	unsigned int mr_pages;
1062 
1063 	if (rdma_rw_can_use_mr(device, port_num))
1064 		mr_pages = rdma_rw_fr_page_list_len(device, false);
1065 	else
1066 		mr_pages = device->attrs.max_sge_rd;
1067 	return DIV_ROUND_UP(maxpages, mr_pages);
1068 }
1069 EXPORT_SYMBOL(rdma_rw_mr_factor);
1070 
1071 /**
1072  * rdma_rw_max_send_wr - compute max Send WRs needed for RDMA R/W contexts
1073  * @dev: RDMA device
1074  * @port_num: port number
1075  * @max_rdma_ctxs: number of rdma_rw_ctx structures
1076  * @create_flags: QP create flags (pass IB_QP_CREATE_INTEGRITY_EN if
1077  *                data integrity will be enabled on the QP)
1078  *
1079  * Returns the total number of Send Queue entries needed for
1080  * @max_rdma_ctxs. The result accounts for memory registration and
1081  * invalidation work requests when the device requires them.
1082  *
1083  * ULPs use this to size Send Queues and Send CQs before creating a
1084  * Queue Pair.
1085  */
1086 unsigned int rdma_rw_max_send_wr(struct ib_device *dev, u32 port_num,
1087 				 unsigned int max_rdma_ctxs, u32 create_flags)
1088 {
1089 	unsigned int factor = 1;
1090 	unsigned int result;
1091 
1092 	if (create_flags & IB_QP_CREATE_INTEGRITY_EN ||
1093 	    rdma_rw_can_use_mr(dev, port_num))
1094 		factor += 2;	/* reg + inv */
1095 
1096 	if (check_mul_overflow(factor, max_rdma_ctxs, &result))
1097 		return UINT_MAX;
1098 	return result;
1099 }
1100 EXPORT_SYMBOL(rdma_rw_max_send_wr);
1101 
1102 void rdma_rw_init_qp(struct ib_device *dev, struct ib_qp_init_attr *attr)
1103 {
1104 	unsigned int factor = 1;
1105 
1106 	WARN_ON_ONCE(attr->port_num == 0);
1107 
1108 	/*
1109 	 * If the device uses MRs to perform RDMA READ or WRITE operations,
1110 	 * or if data integrity is enabled, account for registration and
1111 	 * invalidation work requests.
1112 	 */
1113 	if (attr->create_flags & IB_QP_CREATE_INTEGRITY_EN ||
1114 	    rdma_rw_can_use_mr(dev, attr->port_num))
1115 		factor += 2;	/* reg + inv */
1116 
1117 	attr->cap.max_send_wr += factor * attr->cap.max_rdma_ctxs;
1118 
1119 	/*
1120 	 * The device might not support all we need, and we'll have to
1121 	 * live with what we get.
1122 	 */
1123 	attr->cap.max_send_wr =
1124 		min_t(u32, attr->cap.max_send_wr, dev->attrs.max_qp_wr);
1125 }
1126 
1127 int rdma_rw_init_mrs(struct ib_qp *qp, struct ib_qp_init_attr *attr)
1128 {
1129 	struct ib_device *dev = qp->pd->device;
1130 	u32 nr_mrs = 0, nr_sig_mrs = 0, max_num_sg = 0;
1131 	int ret = 0;
1132 
1133 	if (attr->create_flags & IB_QP_CREATE_INTEGRITY_EN) {
1134 		nr_sig_mrs = attr->cap.max_rdma_ctxs;
1135 		nr_mrs = attr->cap.max_rdma_ctxs;
1136 		max_num_sg = rdma_rw_fr_page_list_len(dev, true);
1137 	} else if (rdma_rw_can_use_mr(dev, attr->port_num)) {
1138 		nr_mrs = attr->cap.max_rdma_ctxs;
1139 		max_num_sg = rdma_rw_fr_page_list_len(dev, false);
1140 	}
1141 
1142 	if (nr_mrs) {
1143 		ret = ib_mr_pool_init(qp, &qp->rdma_mrs, nr_mrs,
1144 				IB_MR_TYPE_MEM_REG,
1145 				max_num_sg, 0);
1146 		if (ret) {
1147 			pr_err("%s: failed to allocated %u MRs\n",
1148 				__func__, nr_mrs);
1149 			return ret;
1150 		}
1151 	}
1152 
1153 	if (nr_sig_mrs) {
1154 		ret = ib_mr_pool_init(qp, &qp->sig_mrs, nr_sig_mrs,
1155 				IB_MR_TYPE_INTEGRITY, max_num_sg, max_num_sg);
1156 		if (ret) {
1157 			pr_err("%s: failed to allocated %u SIG MRs\n",
1158 				__func__, nr_sig_mrs);
1159 			goto out_free_rdma_mrs;
1160 		}
1161 	}
1162 
1163 	return 0;
1164 
1165 out_free_rdma_mrs:
1166 	ib_mr_pool_destroy(qp, &qp->rdma_mrs);
1167 	return ret;
1168 }
1169 
/*
 * Free the MR pools allocated by rdma_rw_init_mrs(), in the reverse
 * order of their creation there: signature MRs first, then the plain
 * registration MRs.
 */
void rdma_rw_cleanup_mrs(struct ib_qp *qp)
{
	ib_mr_pool_destroy(qp, &qp->sig_mrs);
	ib_mr_pool_destroy(qp, &qp->rdma_mrs);
}
1175