xref: /linux/drivers/infiniband/core/rw.c (revision bf4afc53b77aeaa48b5409da5c8da6bb4eff7f43)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (c) 2016 HGST, a Western Digital Company.
4  */
5 #include <linux/memremap.h>
6 #include <linux/moduleparam.h>
7 #include <linux/slab.h>
8 #include <linux/pci-p2pdma.h>
9 #include <rdma/mr_pool.h>
10 #include <rdma/rw.h>
11 
/*
 * Transfer types stored in rdma_rw_ctx::type; they select how the context
 * was built and therefore how it must be posted and torn down.
 */
enum {
	RDMA_RW_SINGLE_WR,	/* one WR with a single SGE */
	RDMA_RW_MULTI_WR,	/* chain of WRs, multiple SGEs each */
	RDMA_RW_MR,		/* memory registration based transfer */
	RDMA_RW_SIG_MR,		/* MR based transfer with signature offload */
	RDMA_RW_IOVA,		/* single contiguous IOVA mapping (bvec path) */
};
19 
/* Debug knob: force the MR code path even where plain SGEs would suffice. */
static bool rdma_rw_force_mr;
module_param_named(force_mr, rdma_rw_force_mr, bool, 0);
MODULE_PARM_DESC(force_mr, "Force usage of MRs for RDMA READ/WRITE operations");
23 
24 /*
25  * Report whether memory registration should be used. Memory registration must
26  * be used for iWarp devices because of iWARP-specific limitations. Memory
27  * registration is also enabled if registering memory might yield better
28  * performance than using multiple SGE entries, see rdma_rw_io_needs_mr()
29  */
30 static inline bool rdma_rw_can_use_mr(struct ib_device *dev, u32 port_num)
31 {
32 	if (rdma_protocol_iwarp(dev, port_num))
33 		return true;
34 	if (dev->attrs.max_sgl_rd)
35 		return true;
36 	if (unlikely(rdma_rw_force_mr))
37 		return true;
38 	return false;
39 }
40 
41 /*
42  * Check if the device will use memory registration for this RW operation.
43  * For RDMA READs we must use MRs on iWarp and can optionally use them as an
44  * optimization otherwise.  Additionally we have a debug option to force usage
45  * of MRs to help testing this code path.
46  */
47 static inline bool rdma_rw_io_needs_mr(struct ib_device *dev, u32 port_num,
48 		enum dma_data_direction dir, int dma_nents)
49 {
50 	if (dir == DMA_FROM_DEVICE) {
51 		if (rdma_protocol_iwarp(dev, port_num))
52 			return true;
53 		if (dev->attrs.max_sgl_rd && dma_nents > dev->attrs.max_sgl_rd)
54 			return true;
55 	}
56 	if (unlikely(rdma_rw_force_mr))
57 		return true;
58 	return false;
59 }
60 
61 static inline u32 rdma_rw_fr_page_list_len(struct ib_device *dev,
62 					   bool pi_support)
63 {
64 	u32 max_pages;
65 
66 	if (pi_support)
67 		max_pages = dev->attrs.max_pi_fast_reg_page_list_len;
68 	else
69 		max_pages = dev->attrs.max_fast_reg_page_list_len;
70 
71 	/* arbitrary limit to avoid allocating gigantic resources */
72 	return min_t(u32, max_pages, 256);
73 }
74 
75 static inline int rdma_rw_inv_key(struct rdma_rw_reg_ctx *reg)
76 {
77 	int count = 0;
78 
79 	if (reg->mr->need_inval) {
80 		reg->inv_wr.opcode = IB_WR_LOCAL_INV;
81 		reg->inv_wr.ex.invalidate_rkey = reg->mr->lkey;
82 		reg->inv_wr.next = &reg->reg_wr.wr;
83 		count++;
84 	} else {
85 		reg->inv_wr.next = NULL;
86 	}
87 
88 	return count;
89 }
90 
/* Caller must have zero-initialized *reg. */
static int rdma_rw_init_one_mr(struct ib_qp *qp, u32 port_num,
		struct rdma_rw_reg_ctx *reg, struct scatterlist *sg,
		u32 sg_cnt, u32 offset)
{
	/*
	 * Map at most one MR worth of pages; callers loop over the
	 * scatterlist for larger transfers.
	 */
	u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device,
						    qp->integrity_en);
	u32 nents = min(sg_cnt, pages_per_mr);
	int count = 0, ret;

	reg->mr = ib_mr_pool_get(qp, &qp->rdma_mrs);
	if (!reg->mr)
		return -EAGAIN;	/* MR pool exhausted */

	/* counts a LOCAL_INV WR if the MR still needs invalidation */
	count += rdma_rw_inv_key(reg);

	ret = ib_map_mr_sg(reg->mr, sg, nents, &offset, PAGE_SIZE);
	if (ret < 0 || ret < nents) {
		/* a partial mapping is unusable; return the MR to the pool */
		ib_mr_pool_put(qp, &qp->rdma_mrs, reg->mr);
		return -EINVAL;
	}

	reg->reg_wr.wr.opcode = IB_WR_REG_MR;
	reg->reg_wr.mr = reg->mr;
	reg->reg_wr.access = IB_ACCESS_LOCAL_WRITE;
	/* iWARP additionally needs remote write access on the MR */
	if (rdma_protocol_iwarp(qp->device, port_num))
		reg->reg_wr.access |= IB_ACCESS_REMOTE_WRITE;
	count++;

	reg->sge.addr = reg->mr->iova;
	reg->sge.length = reg->mr->length;
	return count;	/* number of WRs queued (inv + reg) */
}
124 
/*
 * Fill in the RDMA READ/WRITE WR for one registration context and chain it
 * behind @prev (NULL for the first MR of a transfer).  Returns the number
 * of WRs added, always 1: the READ/WRITE itself (the LOCAL_INV/REG_MR WRs
 * were counted by rdma_rw_init_one_mr()).
 */
static int rdma_rw_init_reg_wr(struct rdma_rw_reg_ctx *reg,
		struct rdma_rw_reg_ctx *prev, struct ib_qp *qp, u32 port_num,
		u64 remote_addr, u32 rkey, enum dma_data_direction dir)
{
	if (prev) {
		/* chain through the invalidate WR first if one is pending */
		if (reg->mr->need_inval)
			prev->wr.wr.next = &reg->inv_wr;
		else
			prev->wr.wr.next = &reg->reg_wr.wr;
	}

	reg->reg_wr.wr.next = &reg->wr.wr;

	reg->wr.wr.sg_list = &reg->sge;
	reg->wr.wr.num_sge = 1;
	reg->wr.remote_addr = remote_addr;
	reg->wr.rkey = rkey;

	if (dir == DMA_TO_DEVICE) {
		reg->wr.wr.opcode = IB_WR_RDMA_WRITE;
	} else if (!rdma_cap_read_inv(qp->device, port_num)) {
		reg->wr.wr.opcode = IB_WR_RDMA_READ;
	} else {
		/* device can invalidate the MR as part of the READ itself */
		reg->wr.wr.opcode = IB_WR_RDMA_READ_WITH_INV;
		reg->wr.wr.ex.invalidate_rkey = reg->mr->lkey;
	}

	return 1;
}
154 
155 static int rdma_rw_init_mr_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
156 		u32 port_num, struct scatterlist *sg, u32 sg_cnt, u32 offset,
157 		u64 remote_addr, u32 rkey, enum dma_data_direction dir)
158 {
159 	struct rdma_rw_reg_ctx *prev = NULL;
160 	u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device,
161 						    qp->integrity_en);
162 	int i, j, ret = 0, count = 0;
163 
164 	ctx->nr_ops = DIV_ROUND_UP(sg_cnt, pages_per_mr);
165 	ctx->reg = kzalloc_objs(*ctx->reg, ctx->nr_ops);
166 	if (!ctx->reg) {
167 		ret = -ENOMEM;
168 		goto out;
169 	}
170 
171 	for (i = 0; i < ctx->nr_ops; i++) {
172 		struct rdma_rw_reg_ctx *reg = &ctx->reg[i];
173 		u32 nents = min(sg_cnt, pages_per_mr);
174 
175 		ret = rdma_rw_init_one_mr(qp, port_num, reg, sg, sg_cnt,
176 				offset);
177 		if (ret < 0)
178 			goto out_free;
179 		count += ret;
180 		count += rdma_rw_init_reg_wr(reg, prev, qp, port_num,
181 				remote_addr, rkey, dir);
182 		remote_addr += reg->sge.length;
183 		sg_cnt -= nents;
184 		for (j = 0; j < nents; j++)
185 			sg = sg_next(sg);
186 		prev = reg;
187 		offset = 0;
188 	}
189 
190 	if (prev)
191 		prev->wr.wr.next = NULL;
192 
193 	ctx->type = RDMA_RW_MR;
194 	return count;
195 
196 out_free:
197 	while (--i >= 0)
198 		ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->reg[i].mr);
199 	kfree(ctx->reg);
200 out:
201 	return ret;
202 }
203 
/*
 * MR based transfer setup for a bio_vec array: build a temporary flat
 * scatterlist from the bvecs, DMA map it once, then register one MR per
 * pages_per_mr worth of mapped entries.  The scatterlist and its mapping
 * are stashed in ctx->reg[0].sgt so rdma_rw_ctx_destroy_bvec() can undo
 * them.  Returns the number of WRs needed or a negative errno.
 */
static int rdma_rw_init_mr_wrs_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
		u32 port_num, const struct bio_vec *bvecs, u32 nr_bvec,
		struct bvec_iter *iter, u64 remote_addr, u32 rkey,
		enum dma_data_direction dir)
{
	struct ib_device *dev = qp->pd->device;
	struct rdma_rw_reg_ctx *prev = NULL;
	u32 pages_per_mr = rdma_rw_fr_page_list_len(dev, qp->integrity_en);
	struct scatterlist *sg;
	int i, ret, count = 0;
	u32 nents = 0;

	ctx->reg = kzalloc_objs(*ctx->reg, DIV_ROUND_UP(nr_bvec, pages_per_mr),
				GFP_KERNEL);
	if (!ctx->reg)
		return -ENOMEM;

	/*
	 * Build scatterlist from bvecs using the iterator. This follows
	 * the pattern from __blk_rq_map_sg.
	 */
	ctx->reg[0].sgt.sgl = kmalloc_objs(*ctx->reg[0].sgt.sgl, nr_bvec,
					   GFP_KERNEL);
	if (!ctx->reg[0].sgt.sgl) {
		ret = -ENOMEM;
		goto out_free_reg;
	}
	sg_init_table(ctx->reg[0].sgt.sgl, nr_bvec);

	for (sg = ctx->reg[0].sgt.sgl; iter->bi_size; sg = sg_next(sg)) {
		struct bio_vec bv = mp_bvec_iter_bvec(bvecs, *iter);

		/* more iterator data than bvec slots would overrun the sgl */
		if (nents >= nr_bvec) {
			ret = -EINVAL;
			goto out_free_sgl;
		}
		sg_set_page(sg, bv.bv_page, bv.bv_len, bv.bv_offset);
		bvec_iter_advance(bvecs, iter, bv.bv_len);
		nents++;
	}
	sg_mark_end(sg_last(ctx->reg[0].sgt.sgl, nents));
	ctx->reg[0].sgt.orig_nents = nents;

	/* DMA map the scatterlist */
	ret = ib_dma_map_sgtable_attrs(dev, &ctx->reg[0].sgt, dir, 0);
	if (ret)
		goto out_free_sgl;

	/* size the MR ops on sgt.nents: the mapped count may differ from orig_nents */
	ctx->nr_ops = DIV_ROUND_UP(ctx->reg[0].sgt.nents, pages_per_mr);

	sg = ctx->reg[0].sgt.sgl;
	nents = ctx->reg[0].sgt.nents;
	for (i = 0; i < ctx->nr_ops; i++) {
		struct rdma_rw_reg_ctx *reg = &ctx->reg[i];
		u32 sge_cnt = min(nents, pages_per_mr);

		ret = rdma_rw_init_one_mr(qp, port_num, reg, sg, sge_cnt, 0);
		if (ret < 0)
			goto out_free_mrs;
		count += ret;
		count += rdma_rw_init_reg_wr(reg, prev, qp, port_num,
				remote_addr, rkey, dir);
		remote_addr += reg->sge.length;
		nents -= sge_cnt;
		/* flat table allocated above, so plain pointer math is fine */
		sg += sge_cnt;
		prev = reg;
	}

	if (prev)
		prev->wr.wr.next = NULL;

	ctx->type = RDMA_RW_MR;
	return count;

out_free_mrs:
	while (--i >= 0)
		ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->reg[i].mr);
	ib_dma_unmap_sgtable_attrs(dev, &ctx->reg[0].sgt, dir, 0);
out_free_sgl:
	kfree(ctx->reg[0].sgt.sgl);
out_free_reg:
	kfree(ctx->reg);
	return ret;
}
288 
289 static int rdma_rw_init_map_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
290 		struct scatterlist *sg, u32 sg_cnt, u32 offset,
291 		u64 remote_addr, u32 rkey, enum dma_data_direction dir)
292 {
293 	u32 max_sge = dir == DMA_TO_DEVICE ? qp->max_write_sge :
294 		      qp->max_read_sge;
295 	struct ib_sge *sge;
296 	u32 total_len = 0, i, j;
297 
298 	ctx->nr_ops = DIV_ROUND_UP(sg_cnt, max_sge);
299 
300 	ctx->map.sges = sge = kzalloc_objs(*sge, sg_cnt);
301 	if (!ctx->map.sges)
302 		goto out;
303 
304 	ctx->map.wrs = kzalloc_objs(*ctx->map.wrs, ctx->nr_ops);
305 	if (!ctx->map.wrs)
306 		goto out_free_sges;
307 
308 	for (i = 0; i < ctx->nr_ops; i++) {
309 		struct ib_rdma_wr *rdma_wr = &ctx->map.wrs[i];
310 		u32 nr_sge = min(sg_cnt, max_sge);
311 
312 		if (dir == DMA_TO_DEVICE)
313 			rdma_wr->wr.opcode = IB_WR_RDMA_WRITE;
314 		else
315 			rdma_wr->wr.opcode = IB_WR_RDMA_READ;
316 		rdma_wr->remote_addr = remote_addr + total_len;
317 		rdma_wr->rkey = rkey;
318 		rdma_wr->wr.num_sge = nr_sge;
319 		rdma_wr->wr.sg_list = sge;
320 
321 		for (j = 0; j < nr_sge; j++, sg = sg_next(sg)) {
322 			sge->addr = sg_dma_address(sg) + offset;
323 			sge->length = sg_dma_len(sg) - offset;
324 			sge->lkey = qp->pd->local_dma_lkey;
325 
326 			total_len += sge->length;
327 			sge++;
328 			sg_cnt--;
329 			offset = 0;
330 		}
331 
332 		rdma_wr->wr.next = i + 1 < ctx->nr_ops ?
333 			&ctx->map.wrs[i + 1].wr : NULL;
334 	}
335 
336 	ctx->type = RDMA_RW_MULTI_WR;
337 	return ctx->nr_ops;
338 
339 out_free_sges:
340 	kfree(ctx->map.sges);
341 out:
342 	return -ENOMEM;
343 }
344 
345 static int rdma_rw_init_single_wr(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
346 		struct scatterlist *sg, u32 offset, u64 remote_addr, u32 rkey,
347 		enum dma_data_direction dir)
348 {
349 	struct ib_rdma_wr *rdma_wr = &ctx->single.wr;
350 
351 	ctx->nr_ops = 1;
352 
353 	ctx->single.sge.lkey = qp->pd->local_dma_lkey;
354 	ctx->single.sge.addr = sg_dma_address(sg) + offset;
355 	ctx->single.sge.length = sg_dma_len(sg) - offset;
356 
357 	memset(rdma_wr, 0, sizeof(*rdma_wr));
358 	if (dir == DMA_TO_DEVICE)
359 		rdma_wr->wr.opcode = IB_WR_RDMA_WRITE;
360 	else
361 		rdma_wr->wr.opcode = IB_WR_RDMA_READ;
362 	rdma_wr->wr.sg_list = &ctx->single.sge;
363 	rdma_wr->wr.num_sge = 1;
364 	rdma_wr->remote_addr = remote_addr;
365 	rdma_wr->rkey = rkey;
366 
367 	ctx->type = RDMA_RW_SINGLE_WR;
368 	return 1;
369 }
370 
371 static int rdma_rw_init_single_wr_bvec(struct rdma_rw_ctx *ctx,
372 		struct ib_qp *qp, const struct bio_vec *bvecs,
373 		struct bvec_iter *iter, u64 remote_addr, u32 rkey,
374 		enum dma_data_direction dir)
375 {
376 	struct ib_device *dev = qp->pd->device;
377 	struct ib_rdma_wr *rdma_wr = &ctx->single.wr;
378 	struct bio_vec bv = mp_bvec_iter_bvec(bvecs, *iter);
379 	u64 dma_addr;
380 
381 	ctx->nr_ops = 1;
382 
383 	dma_addr = ib_dma_map_bvec(dev, &bv, dir);
384 	if (ib_dma_mapping_error(dev, dma_addr))
385 		return -ENOMEM;
386 
387 	ctx->single.sge.lkey = qp->pd->local_dma_lkey;
388 	ctx->single.sge.addr = dma_addr;
389 	ctx->single.sge.length = bv.bv_len;
390 
391 	memset(rdma_wr, 0, sizeof(*rdma_wr));
392 	if (dir == DMA_TO_DEVICE)
393 		rdma_wr->wr.opcode = IB_WR_RDMA_WRITE;
394 	else
395 		rdma_wr->wr.opcode = IB_WR_RDMA_READ;
396 	rdma_wr->wr.sg_list = &ctx->single.sge;
397 	rdma_wr->wr.num_sge = 1;
398 	rdma_wr->remote_addr = remote_addr;
399 	rdma_wr->rkey = rkey;
400 
401 	ctx->type = RDMA_RW_SINGLE_WR;
402 	return 1;
403 }
404 
/*
 * Plain (non-MR) transfer setup for a bio_vec array, DMA mapping each bvec
 * individually.  The SGE array and the WR array live in one allocation
 * (SGEs first, WRs at an aligned offset), so the destroy path only has to
 * free ctx->map.sges.  Returns the number of WRs needed or -ENOMEM.
 */
static int rdma_rw_init_map_wrs_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
		const struct bio_vec *bvecs, u32 nr_bvec, struct bvec_iter *iter,
		u64 remote_addr, u32 rkey, enum dma_data_direction dir)
{
	struct ib_device *dev = qp->pd->device;
	u32 max_sge = dir == DMA_TO_DEVICE ? qp->max_write_sge :
		      qp->max_read_sge;
	struct ib_sge *sge;
	u32 total_len = 0, i, j;
	u32 mapped_bvecs = 0;
	u32 nr_ops = DIV_ROUND_UP(nr_bvec, max_sge);
	size_t sges_size = array_size(nr_bvec, sizeof(*ctx->map.sges));
	size_t wrs_offset = ALIGN(sges_size, __alignof__(*ctx->map.wrs));
	size_t wrs_size = array_size(nr_ops, sizeof(*ctx->map.wrs));
	void *mem;

	/* reject arithmetic overflow when sizing the shared allocation */
	if (sges_size == SIZE_MAX || wrs_size == SIZE_MAX ||
	    check_add_overflow(wrs_offset, wrs_size, &wrs_size))
		return -ENOMEM;

	mem = kzalloc(wrs_size, GFP_KERNEL);
	if (!mem)
		return -ENOMEM;

	ctx->map.sges = sge = mem;
	ctx->map.wrs = mem + wrs_offset;

	for (i = 0; i < nr_ops; i++) {
		struct ib_rdma_wr *rdma_wr = &ctx->map.wrs[i];
		u32 nr_sge = min(nr_bvec - mapped_bvecs, max_sge);

		if (dir == DMA_TO_DEVICE)
			rdma_wr->wr.opcode = IB_WR_RDMA_WRITE;
		else
			rdma_wr->wr.opcode = IB_WR_RDMA_READ;
		rdma_wr->remote_addr = remote_addr + total_len;
		rdma_wr->rkey = rkey;
		rdma_wr->wr.num_sge = nr_sge;
		rdma_wr->wr.sg_list = sge;

		for (j = 0; j < nr_sge; j++) {
			struct bio_vec bv = mp_bvec_iter_bvec(bvecs, *iter);
			u64 dma_addr;

			dma_addr = ib_dma_map_bvec(dev, &bv, dir);
			if (ib_dma_mapping_error(dev, dma_addr))
				goto out_unmap;

			mapped_bvecs++;
			sge->addr = dma_addr;
			sge->length = bv.bv_len;
			sge->lkey = qp->pd->local_dma_lkey;

			total_len += bv.bv_len;
			sge++;

			bvec_iter_advance_single(bvecs, iter, bv.bv_len);
		}

		rdma_wr->wr.next = i + 1 < nr_ops ?
			&ctx->map.wrs[i + 1].wr : NULL;
	}

	ctx->nr_ops = nr_ops;
	ctx->type = RDMA_RW_MULTI_WR;
	return nr_ops;

out_unmap:
	/* undo the bvecs mapped so far; freeing sges frees the WRs too */
	for (i = 0; i < mapped_bvecs; i++)
		ib_dma_unmap_bvec(dev, ctx->map.sges[i].addr,
				  ctx->map.sges[i].length, dir);
	kfree(ctx->map.sges);
	return -ENOMEM;
}
479 
/*
 * Try to use the two-step IOVA API to map bvecs into a contiguous DMA range.
 * This reduces IOTLB sync overhead by doing one sync at the end instead of
 * one per bvec, and produces a contiguous DMA address range that can be
 * described by a single SGE.
 *
 * Returns the number of WQEs (always 1) on success, -EOPNOTSUPP if IOVA
 * mapping is not available, or another negative error code on failure.
 */
static int rdma_rw_init_iova_wrs_bvec(struct rdma_rw_ctx *ctx,
		struct ib_qp *qp, const struct bio_vec *bvec,
		struct bvec_iter *iter, u64 remote_addr, u32 rkey,
		enum dma_data_direction dir)
{
	struct ib_device *dev = qp->pd->device;
	struct device *dma_dev = dev->dma_device;
	size_t total_len = iter->bi_size;
	struct bio_vec first_bv;
	size_t mapped_len = 0;
	int ret;

	/* Virtual DMA devices cannot support IOVA allocators */
	if (ib_uses_virt_dma(dev))
		return -EOPNOTSUPP;

	/* Try to allocate contiguous IOVA space */
	first_bv = mp_bvec_iter_bvec(bvec, *iter);
	if (!dma_iova_try_alloc(dma_dev, &ctx->iova.state,
				bvec_phys(&first_bv), total_len))
		return -EOPNOTSUPP;

	/* Link all bvecs into the IOVA space */
	while (iter->bi_size) {
		struct bio_vec bv = mp_bvec_iter_bvec(bvec, *iter);

		ret = dma_iova_link(dma_dev, &ctx->iova.state, bvec_phys(&bv),
				    mapped_len, bv.bv_len, dir, 0);
		if (ret)
			goto out_destroy;

		mapped_len += bv.bv_len;
		bvec_iter_advance(bvec, iter, bv.bv_len);
	}

	/* Sync the IOTLB once for all linked pages */
	ret = dma_iova_sync(dma_dev, &ctx->iova.state, 0, mapped_len);
	if (ret)
		goto out_destroy;

	/* saved so rdma_rw_ctx_destroy_bvec() can undo exactly this much */
	ctx->iova.mapped_len = mapped_len;

	/* Single SGE covers the entire contiguous IOVA range */
	ctx->iova.sge.addr = ctx->iova.state.addr;
	ctx->iova.sge.length = mapped_len;
	ctx->iova.sge.lkey = qp->pd->local_dma_lkey;

	/* Single WR for the whole transfer */
	memset(&ctx->iova.wr, 0, sizeof(ctx->iova.wr));
	if (dir == DMA_TO_DEVICE)
		ctx->iova.wr.wr.opcode = IB_WR_RDMA_WRITE;
	else
		ctx->iova.wr.wr.opcode = IB_WR_RDMA_READ;
	ctx->iova.wr.wr.num_sge = 1;
	ctx->iova.wr.wr.sg_list = &ctx->iova.sge;
	ctx->iova.wr.remote_addr = remote_addr;
	ctx->iova.wr.rkey = rkey;

	ctx->type = RDMA_RW_IOVA;
	ctx->nr_ops = 1;
	return 1;

out_destroy:
	/*
	 * dma_iova_destroy() expects the actual mapped length, not the
	 * total allocation size. It unlinks only the successfully linked
	 * range and frees the entire IOVA allocation.
	 */
	dma_iova_destroy(dma_dev, &ctx->iova.state, mapped_len, dir, 0);
	return ret;
}
560 
/**
 * rdma_rw_ctx_init - initialize a RDMA READ/WRITE context
 * @ctx:	context to initialize
 * @qp:		queue pair to operate on
 * @port_num:	port num to which the connection is bound
 * @sg:		scatterlist to READ/WRITE from/to
 * @sg_cnt:	number of entries in @sg
 * @sg_offset:	current byte offset into @sg
 * @remote_addr:remote address to read/write (relative to @rkey)
 * @rkey:	remote key to operate on
 * @dir:	%DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ
 *
 * Returns the number of WQEs that will be needed on the workqueue if
 * successful, or a negative error code.
 */
int rdma_rw_ctx_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u32 port_num,
		struct scatterlist *sg, u32 sg_cnt, u32 sg_offset,
		u64 remote_addr, u32 rkey, enum dma_data_direction dir)
{
	struct ib_device *dev = qp->pd->device;
	struct sg_table sgt = {
		.sgl = sg,
		.orig_nents = sg_cnt,
	};
	int ret;

	ret = ib_dma_map_sgtable_attrs(dev, &sgt, dir, 0);
	if (ret)
		return ret;
	/* from here on work with the mapped entry count */
	sg_cnt = sgt.nents;

	/*
	 * Skip to the S/G entry that sg_offset falls into:
	 */
	for (;;) {
		u32 len = sg_dma_len(sg);

		if (sg_offset < len)
			break;

		sg = sg_next(sg);
		sg_offset -= len;
		sg_cnt--;
	}

	ret = -EIO;
	/* ran out of entries while skipping sg_offset */
	if (WARN_ON_ONCE(sg_cnt == 0))
		goto out_unmap_sg;

	if (rdma_rw_io_needs_mr(qp->device, port_num, dir, sg_cnt)) {
		ret = rdma_rw_init_mr_wrs(ctx, qp, port_num, sg, sg_cnt,
				sg_offset, remote_addr, rkey, dir);
	} else if (sg_cnt > 1) {
		ret = rdma_rw_init_map_wrs(ctx, qp, sg, sg_cnt, sg_offset,
				remote_addr, rkey, dir);
	} else {
		ret = rdma_rw_init_single_wr(ctx, qp, sg, sg_offset,
				remote_addr, rkey, dir);
	}

	if (ret < 0)
		goto out_unmap_sg;
	return ret;

out_unmap_sg:
	ib_dma_unmap_sgtable_attrs(dev, &sgt, dir, 0);
	return ret;
}
EXPORT_SYMBOL(rdma_rw_ctx_init);
630 
631 /**
632  * rdma_rw_ctx_init_bvec - initialize a RDMA READ/WRITE context from bio_vec
633  * @ctx:	context to initialize
634  * @qp:		queue pair to operate on
635  * @port_num:	port num to which the connection is bound
636  * @bvecs:	bio_vec array to READ/WRITE from/to
637  * @nr_bvec:	number of entries in @bvecs
638  * @iter:	bvec iterator describing offset and length
639  * @remote_addr: remote address to read/write (relative to @rkey)
640  * @rkey:	remote key to operate on
641  * @dir:	%DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ
642  *
643  * Maps the bio_vec array directly, avoiding intermediate scatterlist
644  * conversion. Supports MR registration for iWARP devices and force_mr mode.
645  *
646  * Returns the number of WQEs that will be needed on the workqueue if
647  * successful, or a negative error code:
648  *
649  *   * -EINVAL  - @nr_bvec is zero or @iter.bi_size is zero
650  *   * -ENOMEM - DMA mapping or memory allocation failed
651  */
652 int rdma_rw_ctx_init_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
653 		u32 port_num, const struct bio_vec *bvecs, u32 nr_bvec,
654 		struct bvec_iter iter, u64 remote_addr, u32 rkey,
655 		enum dma_data_direction dir)
656 {
657 	struct ib_device *dev = qp->pd->device;
658 	int ret;
659 
660 	if (nr_bvec == 0 || iter.bi_size == 0)
661 		return -EINVAL;
662 
663 	/*
664 	 * iWARP requires MR registration for all RDMA READs. The force_mr
665 	 * debug option also mandates MR usage.
666 	 */
667 	if (dir == DMA_FROM_DEVICE && rdma_protocol_iwarp(dev, port_num))
668 		return rdma_rw_init_mr_wrs_bvec(ctx, qp, port_num, bvecs,
669 						nr_bvec, &iter, remote_addr,
670 						rkey, dir);
671 	if (unlikely(rdma_rw_force_mr))
672 		return rdma_rw_init_mr_wrs_bvec(ctx, qp, port_num, bvecs,
673 						nr_bvec, &iter, remote_addr,
674 						rkey, dir);
675 
676 	if (nr_bvec == 1)
677 		return rdma_rw_init_single_wr_bvec(ctx, qp, bvecs, &iter,
678 				remote_addr, rkey, dir);
679 
680 	/*
681 	 * Try IOVA-based mapping first for multi-bvec transfers.
682 	 * IOVA coalesces bvecs into a single DMA-contiguous region,
683 	 * reducing the number of WRs needed and avoiding MR overhead.
684 	 */
685 	ret = rdma_rw_init_iova_wrs_bvec(ctx, qp, bvecs, &iter, remote_addr,
686 			rkey, dir);
687 	if (ret != -EOPNOTSUPP)
688 		return ret;
689 
690 	/*
691 	 * IOVA mapping not available. Check if MR registration provides
692 	 * better performance than multiple SGE entries.
693 	 */
694 	if (rdma_rw_io_needs_mr(dev, port_num, dir, nr_bvec))
695 		return rdma_rw_init_mr_wrs_bvec(ctx, qp, port_num, bvecs,
696 						nr_bvec, &iter, remote_addr,
697 						rkey, dir);
698 
699 	return rdma_rw_init_map_wrs_bvec(ctx, qp, bvecs, nr_bvec, &iter,
700 			remote_addr, rkey, dir);
701 }
702 EXPORT_SYMBOL(rdma_rw_ctx_init_bvec);
703 
704 /**
705  * rdma_rw_ctx_signature_init - initialize a RW context with signature offload
706  * @ctx:	context to initialize
707  * @qp:		queue pair to operate on
708  * @port_num:	port num to which the connection is bound
709  * @sg:		scatterlist to READ/WRITE from/to
710  * @sg_cnt:	number of entries in @sg
711  * @prot_sg:	scatterlist to READ/WRITE protection information from/to
712  * @prot_sg_cnt: number of entries in @prot_sg
713  * @sig_attrs:	signature offloading algorithms
714  * @remote_addr:remote address to read/write (relative to @rkey)
715  * @rkey:	remote key to operate on
716  * @dir:	%DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ
717  *
718  * Returns the number of WQEs that will be needed on the workqueue if
719  * successful, or a negative error code.
720  */
721 int rdma_rw_ctx_signature_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
722 		u32 port_num, struct scatterlist *sg, u32 sg_cnt,
723 		struct scatterlist *prot_sg, u32 prot_sg_cnt,
724 		struct ib_sig_attrs *sig_attrs,
725 		u64 remote_addr, u32 rkey, enum dma_data_direction dir)
726 {
727 	struct ib_device *dev = qp->pd->device;
728 	u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device,
729 						    qp->integrity_en);
730 	struct sg_table sgt = {
731 		.sgl = sg,
732 		.orig_nents = sg_cnt,
733 	};
734 	struct sg_table prot_sgt = {
735 		.sgl = prot_sg,
736 		.orig_nents = prot_sg_cnt,
737 	};
738 	struct ib_rdma_wr *rdma_wr;
739 	int count = 0, ret;
740 
741 	if (sg_cnt > pages_per_mr || prot_sg_cnt > pages_per_mr) {
742 		pr_err("SG count too large: sg_cnt=%u, prot_sg_cnt=%u, pages_per_mr=%u\n",
743 		       sg_cnt, prot_sg_cnt, pages_per_mr);
744 		return -EINVAL;
745 	}
746 
747 	ret = ib_dma_map_sgtable_attrs(dev, &sgt, dir, 0);
748 	if (ret)
749 		return ret;
750 
751 	if (prot_sg_cnt) {
752 		ret = ib_dma_map_sgtable_attrs(dev, &prot_sgt, dir, 0);
753 		if (ret)
754 			goto out_unmap_sg;
755 	}
756 
757 	ctx->type = RDMA_RW_SIG_MR;
758 	ctx->nr_ops = 1;
759 	ctx->reg = kzalloc_obj(*ctx->reg);
760 	if (!ctx->reg) {
761 		ret = -ENOMEM;
762 		goto out_unmap_prot_sg;
763 	}
764 
765 	ctx->reg->mr = ib_mr_pool_get(qp, &qp->sig_mrs);
766 	if (!ctx->reg->mr) {
767 		ret = -EAGAIN;
768 		goto out_free_ctx;
769 	}
770 
771 	count += rdma_rw_inv_key(ctx->reg);
772 
773 	memcpy(ctx->reg->mr->sig_attrs, sig_attrs, sizeof(struct ib_sig_attrs));
774 
775 	ret = ib_map_mr_sg_pi(ctx->reg->mr, sg, sgt.nents, NULL, prot_sg,
776 			      prot_sgt.nents, NULL, SZ_4K);
777 	if (unlikely(ret)) {
778 		pr_err("failed to map PI sg (%u)\n",
779 		       sgt.nents + prot_sgt.nents);
780 		goto out_destroy_sig_mr;
781 	}
782 
783 	ctx->reg->reg_wr.wr.opcode = IB_WR_REG_MR_INTEGRITY;
784 	ctx->reg->reg_wr.wr.wr_cqe = NULL;
785 	ctx->reg->reg_wr.wr.num_sge = 0;
786 	ctx->reg->reg_wr.wr.send_flags = 0;
787 	ctx->reg->reg_wr.access = IB_ACCESS_LOCAL_WRITE;
788 	if (rdma_protocol_iwarp(qp->device, port_num))
789 		ctx->reg->reg_wr.access |= IB_ACCESS_REMOTE_WRITE;
790 	ctx->reg->reg_wr.mr = ctx->reg->mr;
791 	ctx->reg->reg_wr.key = ctx->reg->mr->lkey;
792 	count++;
793 
794 	ctx->reg->sge.addr = ctx->reg->mr->iova;
795 	ctx->reg->sge.length = ctx->reg->mr->length;
796 	if (sig_attrs->wire.sig_type == IB_SIG_TYPE_NONE)
797 		ctx->reg->sge.length -= ctx->reg->mr->sig_attrs->meta_length;
798 
799 	rdma_wr = &ctx->reg->wr;
800 	rdma_wr->wr.sg_list = &ctx->reg->sge;
801 	rdma_wr->wr.num_sge = 1;
802 	rdma_wr->remote_addr = remote_addr;
803 	rdma_wr->rkey = rkey;
804 	if (dir == DMA_TO_DEVICE)
805 		rdma_wr->wr.opcode = IB_WR_RDMA_WRITE;
806 	else
807 		rdma_wr->wr.opcode = IB_WR_RDMA_READ;
808 	ctx->reg->reg_wr.wr.next = &rdma_wr->wr;
809 	count++;
810 
811 	return count;
812 
813 out_destroy_sig_mr:
814 	ib_mr_pool_put(qp, &qp->sig_mrs, ctx->reg->mr);
815 out_free_ctx:
816 	kfree(ctx->reg);
817 out_unmap_prot_sg:
818 	if (prot_sgt.nents)
819 		ib_dma_unmap_sgtable_attrs(dev, &prot_sgt, dir, 0);
820 out_unmap_sg:
821 	ib_dma_unmap_sgtable_attrs(dev, &sgt, dir, 0);
822 	return ret;
823 }
824 EXPORT_SYMBOL(rdma_rw_ctx_signature_init);
825 
/*
 * Now that we are going to post the WRs we can update the lkey and need_inval
 * state on the MRs.  If we were doing this at init time, we would get double
 * or missing invalidations if a context was initialized but not actually
 * posted.
 */
static void rdma_rw_update_lkey(struct rdma_rw_reg_ctx *reg, bool need_inval)
{
	reg->mr->need_inval = need_inval;
	/* advance to a fresh key and propagate it to the reg WR and SGE */
	ib_update_fast_reg_key(reg->mr, ib_inc_rkey(reg->mr->lkey));
	reg->reg_wr.key = reg->mr->lkey;
	reg->sge.lkey = reg->mr->lkey;
}
839 
/**
 * rdma_rw_ctx_wrs - return chain of WRs for a RDMA READ or WRITE operation
 * @ctx:	context to operate on
 * @qp:		queue pair to operate on
 * @port_num:	port num to which the connection is bound
 * @cqe:	completion queue entry for the last WR
 * @chain_wr:	WR to append to the posted chain
 *
 * Return the WR chain for the set of RDMA READ/WRITE operations described by
 * @ctx, as well as any memory registration operations needed.  If @chain_wr
 * is non-NULL the WR it points to will be appended to the chain of WRs posted.
 * If @chain_wr is not set @cqe must be set so that the caller gets a
 * completion notification.
 */
struct ib_send_wr *rdma_rw_ctx_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
		u32 port_num, struct ib_cqe *cqe, struct ib_send_wr *chain_wr)
{
	struct ib_send_wr *first_wr, *last_wr;
	int i;

	switch (ctx->type) {
	case RDMA_RW_SIG_MR:
	case RDMA_RW_MR:
		/*
		 * Refresh the keys now that the WRs are actually posted.
		 * READ_WITH_INV invalidates the MR as part of the READ, so
		 * only the other opcodes leave need_inval set.
		 */
		for (i = 0; i < ctx->nr_ops; i++) {
			rdma_rw_update_lkey(&ctx->reg[i],
				ctx->reg[i].wr.wr.opcode !=
					IB_WR_RDMA_READ_WITH_INV);
		}

		/* start with the LOCAL_INV WR when one was queued */
		if (ctx->reg[0].inv_wr.next)
			first_wr = &ctx->reg[0].inv_wr;
		else
			first_wr = &ctx->reg[0].reg_wr.wr;
		last_wr = &ctx->reg[ctx->nr_ops - 1].wr.wr;
		break;
	case RDMA_RW_IOVA:
		/* a single WR covers the whole contiguous IOVA range */
		first_wr = &ctx->iova.wr.wr;
		last_wr = &ctx->iova.wr.wr;
		break;
	case RDMA_RW_MULTI_WR:
		first_wr = &ctx->map.wrs[0].wr;
		last_wr = &ctx->map.wrs[ctx->nr_ops - 1].wr;
		break;
	case RDMA_RW_SINGLE_WR:
		first_wr = &ctx->single.wr.wr;
		last_wr = &ctx->single.wr.wr;
		break;
	default:
		BUG();
	}

	if (chain_wr) {
		last_wr->next = chain_wr;
	} else {
		/* no continuation: request a completion on the last WR */
		last_wr->wr_cqe = cqe;
		last_wr->send_flags |= IB_SEND_SIGNALED;
	}

	return first_wr;
}
EXPORT_SYMBOL(rdma_rw_ctx_wrs);
901 
902 /**
903  * rdma_rw_ctx_post - post a RDMA READ or RDMA WRITE operation
904  * @ctx:	context to operate on
905  * @qp:		queue pair to operate on
906  * @port_num:	port num to which the connection is bound
907  * @cqe:	completion queue entry for the last WR
908  * @chain_wr:	WR to append to the posted chain
909  *
910  * Post the set of RDMA READ/WRITE operations described by @ctx, as well as
911  * any memory registration operations needed.  If @chain_wr is non-NULL the
912  * WR it points to will be appended to the chain of WRs posted.  If @chain_wr
913  * is not set @cqe must be set so that the caller gets a completion
914  * notification.
915  */
916 int rdma_rw_ctx_post(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u32 port_num,
917 		struct ib_cqe *cqe, struct ib_send_wr *chain_wr)
918 {
919 	struct ib_send_wr *first_wr;
920 
921 	first_wr = rdma_rw_ctx_wrs(ctx, qp, port_num, cqe, chain_wr);
922 	return ib_post_send(qp, first_wr, NULL);
923 }
924 EXPORT_SYMBOL(rdma_rw_ctx_post);
925 
/**
 * rdma_rw_ctx_destroy - release all resources allocated by rdma_rw_ctx_init
 * @ctx:	context to release
 * @qp:		queue pair to operate on
 * @port_num:	port num to which the connection is bound
 * @sg:		scatterlist that was used for the READ/WRITE
 * @sg_cnt:	number of entries in @sg
 * @dir:	%DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ
 */
void rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
			 u32 port_num, struct scatterlist *sg, u32 sg_cnt,
			 enum dma_data_direction dir)
{
	int i;

	switch (ctx->type) {
	case RDMA_RW_MR:
		/* Bvec MR contexts must use rdma_rw_ctx_destroy_bvec() */
		WARN_ON_ONCE(ctx->reg[0].sgt.sgl);
		for (i = 0; i < ctx->nr_ops; i++)
			ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->reg[i].mr);
		kfree(ctx->reg);
		break;
	case RDMA_RW_MULTI_WR:
		kfree(ctx->map.wrs);
		kfree(ctx->map.sges);
		break;
	case RDMA_RW_SINGLE_WR:
		/* nothing was allocated beyond the context itself */
		break;
	case RDMA_RW_IOVA:
		/* IOVA contexts must use rdma_rw_ctx_destroy_bvec() */
		WARN_ON_ONCE(1);
		return;
	default:
		BUG();
		break;
	}

	/* undo the mapping done by rdma_rw_ctx_init() */
	ib_dma_unmap_sg(qp->pd->device, sg, sg_cnt, dir);
}
EXPORT_SYMBOL(rdma_rw_ctx_destroy);
967 
/**
 * rdma_rw_ctx_destroy_bvec - release resources from rdma_rw_ctx_init_bvec
 * @ctx:	context to release
 * @qp:		queue pair to operate on
 * @port_num:	port num to which the connection is bound (unused)
 * @bvecs:	bio_vec array that was used for the READ/WRITE (unused)
 * @nr_bvec:	number of entries in @bvecs
 * @dir:	%DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ
 *
 * Releases all resources allocated by a successful rdma_rw_ctx_init_bvec()
 * call. Must not be called if rdma_rw_ctx_init_bvec() returned an error.
 *
 * The @port_num and @bvecs parameters are unused but present for API
 * symmetry with rdma_rw_ctx_destroy().
 */
void rdma_rw_ctx_destroy_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
		u32 __maybe_unused port_num,
		const struct bio_vec __maybe_unused *bvecs,
		u32 nr_bvec, enum dma_data_direction dir)
{
	struct ib_device *dev = qp->pd->device;
	u32 i;

	switch (ctx->type) {
	case RDMA_RW_MR:
		for (i = 0; i < ctx->nr_ops; i++)
			ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->reg[i].mr);
		/* the bvec MR path keeps its scatterlist in reg[0].sgt */
		ib_dma_unmap_sgtable_attrs(dev, &ctx->reg[0].sgt, dir, 0);
		kfree(ctx->reg[0].sgt.sgl);
		kfree(ctx->reg);
		break;
	case RDMA_RW_IOVA:
		/* unlinks the mapped range and frees the IOVA allocation */
		dma_iova_destroy(dev->dma_device, &ctx->iova.state,
				 ctx->iova.mapped_len, dir, 0);
		break;
	case RDMA_RW_MULTI_WR:
		for (i = 0; i < nr_bvec; i++)
			ib_dma_unmap_bvec(dev, ctx->map.sges[i].addr,
					  ctx->map.sges[i].length, dir);
		/* sges and wrs share one allocation; this frees both */
		kfree(ctx->map.sges);
		break;
	case RDMA_RW_SINGLE_WR:
		ib_dma_unmap_bvec(dev, ctx->single.sge.addr,
				  ctx->single.sge.length, dir);
		break;
	default:
		WARN_ON_ONCE(1);
		return;
	}
}
EXPORT_SYMBOL(rdma_rw_ctx_destroy_bvec);
1019 
1020 /**
1021  * rdma_rw_ctx_destroy_signature - release all resources allocated by
1022  *	rdma_rw_ctx_signature_init
1023  * @ctx:	context to release
1024  * @qp:		queue pair to operate on
1025  * @port_num:	port num to which the connection is bound
1026  * @sg:		scatterlist that was used for the READ/WRITE
1027  * @sg_cnt:	number of entries in @sg
1028  * @prot_sg:	scatterlist that was used for the READ/WRITE of the PI
1029  * @prot_sg_cnt: number of entries in @prot_sg
1030  * @dir:	%DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ
1031  */
1032 void rdma_rw_ctx_destroy_signature(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
1033 		u32 port_num, struct scatterlist *sg, u32 sg_cnt,
1034 		struct scatterlist *prot_sg, u32 prot_sg_cnt,
1035 		enum dma_data_direction dir)
1036 {
1037 	if (WARN_ON_ONCE(ctx->type != RDMA_RW_SIG_MR))
1038 		return;
1039 
1040 	ib_mr_pool_put(qp, &qp->sig_mrs, ctx->reg->mr);
1041 	kfree(ctx->reg);
1042 
1043 	if (prot_sg_cnt)
1044 		ib_dma_unmap_sg(qp->pd->device, prot_sg, prot_sg_cnt, dir);
1045 	ib_dma_unmap_sg(qp->pd->device, sg, sg_cnt, dir);
1046 }
1047 EXPORT_SYMBOL(rdma_rw_ctx_destroy_signature);
1048 
1049 /**
1050  * rdma_rw_mr_factor - return number of MRs required for a payload
1051  * @device:	device handling the connection
1052  * @port_num:	port num to which the connection is bound
1053  * @maxpages:	maximum payload pages per rdma_rw_ctx
1054  *
1055  * Returns the number of MRs the device requires to move @maxpayload
1056  * bytes. The returned value is used during transport creation to
1057  * compute max_rdma_ctxts and the size of the transport's Send and
1058  * Send Completion Queues.
1059  */
1060 unsigned int rdma_rw_mr_factor(struct ib_device *device, u32 port_num,
1061 			       unsigned int maxpages)
1062 {
1063 	unsigned int mr_pages;
1064 
1065 	if (rdma_rw_can_use_mr(device, port_num))
1066 		mr_pages = rdma_rw_fr_page_list_len(device, false);
1067 	else
1068 		mr_pages = device->attrs.max_sge_rd;
1069 	return DIV_ROUND_UP(maxpages, mr_pages);
1070 }
1071 EXPORT_SYMBOL(rdma_rw_mr_factor);
1072 
1073 /**
1074  * rdma_rw_max_send_wr - compute max Send WRs needed for RDMA R/W contexts
1075  * @dev: RDMA device
1076  * @port_num: port number
1077  * @max_rdma_ctxs: number of rdma_rw_ctx structures
1078  * @create_flags: QP create flags (pass IB_QP_CREATE_INTEGRITY_EN if
1079  *                data integrity will be enabled on the QP)
1080  *
1081  * Returns the total number of Send Queue entries needed for
1082  * @max_rdma_ctxs. The result accounts for memory registration and
1083  * invalidation work requests when the device requires them.
1084  *
1085  * ULPs use this to size Send Queues and Send CQs before creating a
1086  * Queue Pair.
1087  */
1088 unsigned int rdma_rw_max_send_wr(struct ib_device *dev, u32 port_num,
1089 				 unsigned int max_rdma_ctxs, u32 create_flags)
1090 {
1091 	unsigned int factor = 1;
1092 	unsigned int result;
1093 
1094 	if (create_flags & IB_QP_CREATE_INTEGRITY_EN ||
1095 	    rdma_rw_can_use_mr(dev, port_num))
1096 		factor += 2;	/* reg + inv */
1097 
1098 	if (check_mul_overflow(factor, max_rdma_ctxs, &result))
1099 		return UINT_MAX;
1100 	return result;
1101 }
1102 EXPORT_SYMBOL(rdma_rw_max_send_wr);
1103 
1104 void rdma_rw_init_qp(struct ib_device *dev, struct ib_qp_init_attr *attr)
1105 {
1106 	unsigned int factor = 1;
1107 
1108 	WARN_ON_ONCE(attr->port_num == 0);
1109 
1110 	/*
1111 	 * If the device uses MRs to perform RDMA READ or WRITE operations,
1112 	 * or if data integrity is enabled, account for registration and
1113 	 * invalidation work requests.
1114 	 */
1115 	if (attr->create_flags & IB_QP_CREATE_INTEGRITY_EN ||
1116 	    rdma_rw_can_use_mr(dev, attr->port_num))
1117 		factor += 2;	/* reg + inv */
1118 
1119 	attr->cap.max_send_wr += factor * attr->cap.max_rdma_ctxs;
1120 
1121 	/*
1122 	 * The device might not support all we need, and we'll have to
1123 	 * live with what we get.
1124 	 */
1125 	attr->cap.max_send_wr =
1126 		min_t(u32, attr->cap.max_send_wr, dev->attrs.max_qp_wr);
1127 }
1128 
1129 int rdma_rw_init_mrs(struct ib_qp *qp, struct ib_qp_init_attr *attr)
1130 {
1131 	struct ib_device *dev = qp->pd->device;
1132 	u32 nr_mrs = 0, nr_sig_mrs = 0, max_num_sg = 0;
1133 	int ret = 0;
1134 
1135 	if (attr->create_flags & IB_QP_CREATE_INTEGRITY_EN) {
1136 		nr_sig_mrs = attr->cap.max_rdma_ctxs;
1137 		nr_mrs = attr->cap.max_rdma_ctxs;
1138 		max_num_sg = rdma_rw_fr_page_list_len(dev, true);
1139 	} else if (rdma_rw_can_use_mr(dev, attr->port_num)) {
1140 		nr_mrs = attr->cap.max_rdma_ctxs;
1141 		max_num_sg = rdma_rw_fr_page_list_len(dev, false);
1142 	}
1143 
1144 	if (nr_mrs) {
1145 		ret = ib_mr_pool_init(qp, &qp->rdma_mrs, nr_mrs,
1146 				IB_MR_TYPE_MEM_REG,
1147 				max_num_sg, 0);
1148 		if (ret) {
1149 			pr_err("%s: failed to allocated %u MRs\n",
1150 				__func__, nr_mrs);
1151 			return ret;
1152 		}
1153 	}
1154 
1155 	if (nr_sig_mrs) {
1156 		ret = ib_mr_pool_init(qp, &qp->sig_mrs, nr_sig_mrs,
1157 				IB_MR_TYPE_INTEGRITY, max_num_sg, max_num_sg);
1158 		if (ret) {
1159 			pr_err("%s: failed to allocated %u SIG MRs\n",
1160 				__func__, nr_sig_mrs);
1161 			goto out_free_rdma_mrs;
1162 		}
1163 	}
1164 
1165 	return 0;
1166 
1167 out_free_rdma_mrs:
1168 	ib_mr_pool_destroy(qp, &qp->rdma_mrs);
1169 	return ret;
1170 }
1171 
/* Tear down the MR pools created by rdma_rw_init_mrs(). */
void rdma_rw_cleanup_mrs(struct ib_qp *qp)
{
	/* Destroyed in reverse order of creation in rdma_rw_init_mrs(). */
	ib_mr_pool_destroy(qp, &qp->sig_mrs);
	ib_mr_pool_destroy(qp, &qp->rdma_mrs);
}
1177