xref: /linux/block/blk-mq-dma.c (revision 55a42f78ffd386e01a5404419f8c5ded7db70a21)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (C) 2025 Christoph Hellwig
4  */
5 #include <linux/blk-integrity.h>
6 #include <linux/blk-mq-dma.h>
7 #include "blk.h"
8 
9 struct phys_vec {
10 	phys_addr_t	paddr;
11 	u32		len;
12 };
13 
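/*
 * Advance to the next bio in the chain once the per-bio iterator is exhausted.
 * Returns %true if there is more data to map in the current or the next bio.
 */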
14 static bool __blk_map_iter_next(struct blk_map_iter *iter)
15 {
16 	if (iter->iter.bi_size)
17 		return true;
18 	if (!iter->bio || !iter->bio->bi_next)
19 		return false;
20 
21 	iter->bio = iter->bio->bi_next;
22 	if (iter->is_integrity) {
23 		iter->iter = bio_integrity(iter->bio)->bip_iter;
24 		iter->bvecs = bio_integrity(iter->bio)->bip_vec;
25 	} else {
26 		iter->iter = iter->bio->bi_iter;
27 		iter->bvecs = iter->bio->bi_io_vec;
28 	}
29 	return true;
30 }
31 
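/*
 * Fill @vec with the next physical segment of @req.  Physically contiguous
 * bvecs, including ones in subsequent bios, are merged up to the queue's
 * maximum segment size.  Returns %false once the request is fully iterated.
 */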
32 static bool blk_map_iter_next(struct request *req, struct blk_map_iter *iter,
33 			      struct phys_vec *vec)
34 {
35 	unsigned int max_size;
36 	struct bio_vec bv;
37 
38 	if (!iter->iter.bi_size)
39 		return false;
40 
41 	bv = mp_bvec_iter_bvec(iter->bvecs, iter->iter);
42 	vec->paddr = bvec_phys(&bv);
43 	max_size = get_max_segment_size(&req->q->limits, vec->paddr, UINT_MAX);
44 	bv.bv_len = min(bv.bv_len, max_size);
45 	bvec_iter_advance_single(iter->bvecs, &iter->iter, bv.bv_len);
46 
47 	/*
48 	 * If we are entirely done with this bi_io_vec entry, check if the next
49 	 * one could be merged into it.  This typically happens when moving to
50 	 * the next bio, but some callers also don't pack bvecs tightly.
51 	 */
52 	while (!iter->iter.bi_size || !iter->iter.bi_bvec_done) {
53 		struct bio_vec next;
54 
55 		if (!__blk_map_iter_next(iter))
56 			break;
57 
58 		next = mp_bvec_iter_bvec(iter->bvecs, iter->iter);
59 		if (bv.bv_len + next.bv_len > max_size ||
60 		    !biovec_phys_mergeable(req->q, &bv, &next))
61 			break;
62 
63 		bv.bv_len += next.bv_len;
64 		bvec_iter_advance_single(iter->bvecs, &iter->iter, next.bv_len);
65 	}
66 
67 	vec->len = bv.bv_len;
68 	return true;
69 }
70 
71 /*
72  * The IOVA-based DMA API wants to be able to coalesce at the minimal IOMMU page
73  * size granularity (which is guaranteed to be <= PAGE_SIZE and usually 4k), so
74  * we need to ensure our segments are aligned to this as well.
75  *
76  * Note that there is no point in using the slightly more complicated IOVA based
77  * path for single segment mappings.
78  */
79 static inline bool blk_can_dma_map_iova(struct request *req,
80 		struct device *dma_dev)
81 {
82 	return !((queue_virt_boundary(req->q) + 1) &
83 		dma_get_merge_boundary(dma_dev));
84 }
85 
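/*
 * P2P transfers mapped as PCI bus addresses bypass the host's DMA mapping and
 * only need the physical address translated to a bus address.
 */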
86 static bool blk_dma_map_bus(struct blk_dma_iter *iter, struct phys_vec *vec)
87 {
88 	iter->addr = pci_p2pdma_bus_addr_map(&iter->p2pdma, vec->paddr);
89 	iter->len = vec->len;
90 	return true;
91 }
92 
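/*
 * Map a single segment using the conventional per-segment DMA API.  A mapping
 * failure is reported as BLK_STS_RESOURCE so that the caller can retry later.
 */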
93 static bool blk_dma_map_direct(struct request *req, struct device *dma_dev,
94 		struct blk_dma_iter *iter, struct phys_vec *vec)
95 {
96 	iter->addr = dma_map_page(dma_dev, phys_to_page(vec->paddr),
97 			offset_in_page(vec->paddr), vec->len, rq_dma_dir(req));
98 	if (dma_mapping_error(dma_dev, iter->addr)) {
99 		iter->status = BLK_STS_RESOURCE;
100 		return false;
101 	}
102 	iter->len = vec->len;
103 	return true;
104 }
105 
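/*
 * Link every segment of the request into a single pre-allocated IOVA range so
 * that the device sees one contiguous DMA address.  This consumes the whole
 * iterator and returns a single address/length pair in @iter.
 */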
106 static bool blk_rq_dma_map_iova(struct request *req, struct device *dma_dev,
107 		struct dma_iova_state *state, struct blk_dma_iter *iter,
108 		struct phys_vec *vec)
109 {
110 	enum dma_data_direction dir = rq_dma_dir(req);
111 	unsigned int mapped = 0;
112 	int error;
113 
114 	iter->addr = state->addr;
115 	iter->len = dma_iova_size(state);
116 
117 	do {
118 		error = dma_iova_link(dma_dev, state, vec->paddr, mapped,
119 				vec->len, dir, 0);
120 		if (error)
121 			break;
122 		mapped += vec->len;
123 	} while (blk_map_iter_next(req, &iter->iter, vec));
124 
125 	error = dma_iova_sync(dma_dev, state, 0, mapped);
126 	if (error) {
127 		iter->status = errno_to_blk_status(error);
128 		return false;
129 	}
130 
131 	return true;
132 }
133 
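/*
 * Set up the mapping iterator for the data payload of @rq.  Requests with a
 * special payload (e.g. driver-built discard payloads) map the single
 * special_vec instead, and bio-less internal requests get an empty iterator.
 */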
134 static inline void blk_rq_map_iter_init(struct request *rq,
135 					struct blk_map_iter *iter)
136 {
137 	struct bio *bio = rq->bio;
138 
139 	if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) {
140 		*iter = (struct blk_map_iter) {
141 			.bvecs = &rq->special_vec,
142 			.iter = {
143 				.bi_size = rq->special_vec.bv_len,
144 			}
145 		};
146 	} else if (bio) {
147 		*iter = (struct blk_map_iter) {
148 			.bio = bio,
149 			.bvecs = bio->bi_io_vec,
150 			.iter = bio->bi_iter,
151 		};
152 	} else {
153 		/* the internal flush request may not have a bio attached */
154 		*iter = (struct blk_map_iter) {};
155 	}
156 }
157 
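/*
 * Common start path for data and integrity payloads: grab the first segment,
 * determine the P2P DMA mapping type, and then either map the whole request
 * into a single IOVA range or fall back to mapping the first segment directly.
 */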
158 static bool blk_dma_map_iter_start(struct request *req, struct device *dma_dev,
159 		struct dma_iova_state *state, struct blk_dma_iter *iter,
160 		unsigned int total_len)
161 {
162 	struct phys_vec vec;
163 
164 	memset(&iter->p2pdma, 0, sizeof(iter->p2pdma));
165 	iter->status = BLK_STS_OK;
166 
167 	/*
168 	 * Grab the first segment ASAP because we'll need it to check for P2P
169 	 * transfers.
170 	 */
171 	if (!blk_map_iter_next(req, &iter->iter, &vec))
172 		return false;
173 
174 	switch (pci_p2pdma_state(&iter->p2pdma, dma_dev,
175 				 phys_to_page(vec.paddr))) {
176 	case PCI_P2PDMA_MAP_BUS_ADDR:
177 		if (iter->iter.is_integrity)
178 			bio_integrity(req->bio)->bip_flags |= BIP_P2P_DMA;
179 		else
180 			req->cmd_flags |= REQ_P2PDMA;
181 		return blk_dma_map_bus(iter, &vec);
182 	case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
183 		/*
184 		 * P2P transfers through the host bridge are treated the
185 		 * same as non-P2P transfers below and during unmap.
186 		 */
187 	case PCI_P2PDMA_MAP_NONE:
188 		break;
189 	default:
190 		iter->status = BLK_STS_INVAL;
191 		return false;
192 	}
193 
194 	if (blk_can_dma_map_iova(req, dma_dev) &&
195 	    dma_iova_try_alloc(dma_dev, state, vec.paddr, total_len))
196 		return blk_rq_dma_map_iova(req, dma_dev, state, iter, &vec);
197 	return blk_dma_map_direct(req, dma_dev, iter, &vec);
198 }
199 
200 /**
201  * blk_rq_dma_map_iter_start - map the first DMA segment for a request
202  * @req:	request to map
203  * @dma_dev:	device to map to
204  * @state:	DMA IOVA state
205  * @iter:	block layer DMA iterator
206  *
207  * Start DMA mapping @req to @dma_dev.  @state and @iter are provided by the
208  * caller and don't need to be initialized.  @state needs to be stored for use
209  * at unmap time, @iter is only needed at map time.
210  *
211  * Returns %false if there is no segment to map, including due to an error, or
212  * %true if it did map a segment.
213  *
214  * If a segment was mapped, the DMA address for it is returned in @iter.addr and
215  * the length in @iter.len.  If no segment was mapped the status code is
216  * returned in @iter.status.
217  *
218  * The caller can call blk_rq_dma_map_coalesce() to check if further segments
219  * need to be mapped after this, or go straight to blk_rq_dma_map_iter_next()
220  * to try to map the following segments.
221  */
222 bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev,
223 		struct dma_iova_state *state, struct blk_dma_iter *iter)
224 {
225 	blk_rq_map_iter_init(req, &iter->iter);
226 	return blk_dma_map_iter_start(req, dma_dev, state, iter,
227 				      blk_rq_payload_bytes(req));
228 }
229 EXPORT_SYMBOL_GPL(blk_rq_dma_map_iter_start);
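
/*
 * Example usage, as an illustrative sketch only: driver_add_segment() stands
 * in for whatever driver-specific code consumes iter.addr and iter.len (e.g.
 * building an SGL or PRP list).
 *
 *	struct dma_iova_state state;
 *	struct blk_dma_iter iter;
 *
 *	if (!blk_rq_dma_map_iter_start(req, dma_dev, &state, &iter))
 *		return iter.status;
 *	do {
 *		driver_add_segment(iter.addr, iter.len);
 *	} while (blk_rq_dma_map_iter_next(req, dma_dev, &state, &iter));
 *	if (iter.status != BLK_STS_OK)
 *		return iter.status;
 */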
230 
231 /**
232  * blk_rq_dma_map_iter_next - map the next DMA segment for a request
233  * @req:	request to map
234  * @dma_dev:	device to map to
235  * @state:	DMA IOVA state
236  * @iter:	block layer DMA iterator
237  *
238  * Iterate to the next mapping after a previous call to
239  * blk_rq_dma_map_iter_start().  See there for a detailed description of the
240  * arguments.
241  *
242  * Returns %false if there is no segment to map, including due to an error, or
243  * %true if it did map a segment.
244  *
245  * If a segment was mapped, the DMA address for it is returned in @iter.addr and
246  * the length in @iter.len.  If no segment was mapped the status code is
247  * returned in @iter.status.
248  */
249 bool blk_rq_dma_map_iter_next(struct request *req, struct device *dma_dev,
250 		struct dma_iova_state *state, struct blk_dma_iter *iter)
251 {
252 	struct phys_vec vec;
253 
254 	if (!blk_map_iter_next(req, &iter->iter, &vec))
255 		return false;
256 
257 	if (iter->p2pdma.map == PCI_P2PDMA_MAP_BUS_ADDR)
258 		return blk_dma_map_bus(iter, &vec);
259 	return blk_dma_map_direct(req, dma_dev, iter, &vec);
260 }
261 EXPORT_SYMBOL_GPL(blk_rq_dma_map_iter_next);
262 
263 static inline struct scatterlist *
264 blk_next_sg(struct scatterlist **sg, struct scatterlist *sglist)
265 {
266 	if (!*sg)
267 		return sglist;
268 
269 	/*
270 	 * If the driver previously mapped a shorter list, we could see a
271 	 * termination bit prematurely unless it fully inits the sg table
272 	 * on each mapping. We KNOW that there must be more entries here
273 	 * or the driver would be buggy, so force clear the termination bit
274 	 * to avoid doing a full sg_init_table() in drivers for each command.
275 	 */
276 	sg_unmark_end(*sg);
277 	return sg_next(*sg);
278 }
279 
280 /*
281  * Map a request to scatterlist, return number of sg entries setup. Caller
282  * must make sure sg can hold rq->nr_phys_segments entries.
283  */
284 int __blk_rq_map_sg(struct request *rq, struct scatterlist *sglist,
285 		    struct scatterlist **last_sg)
286 {
287 	struct blk_map_iter iter;
288 	struct phys_vec vec;
289 	int nsegs = 0;
290 
291 	blk_rq_map_iter_init(rq, &iter);
292 	while (blk_map_iter_next(rq, &iter, &vec)) {
293 		*last_sg = blk_next_sg(last_sg, sglist);
294 		sg_set_page(*last_sg, phys_to_page(vec.paddr), vec.len,
295 				offset_in_page(vec.paddr));
296 		nsegs++;
297 	}
298 
299 	if (*last_sg)
300 		sg_mark_end(*last_sg);
301 
302 	/*
303 	 * Something must have gone wrong if the computed number of
304 	 * segments is bigger than the number of the request's physical segments
305 	 */
306 	WARN_ON(nsegs > blk_rq_nr_phys_segments(rq));
307 
308 	return nsegs;
309 }
310 EXPORT_SYMBOL(__blk_rq_map_sg);
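
/*
 * Example usage, as an illustrative sketch only.  It assumes sglist was
 * allocated with at least blk_rq_nr_phys_segments(rq) entries and that
 * dma_dev is the device performing the DMA:
 *
 *	struct scatterlist *last_sg = NULL;
 *	int nents;
 *
 *	nents = __blk_rq_map_sg(rq, sglist, &last_sg);
 *	nents = dma_map_sg(dma_dev, sglist, nents, rq_dma_dir(rq));
 *	// a return value of 0 from dma_map_sg() means the mapping failed
 */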
311 
312 #ifdef CONFIG_BLK_DEV_INTEGRITY
313 /**
314  * blk_rq_integrity_dma_map_iter_start - map the first integrity DMA segment
315  * 					 for a request
316  * @req:	request to map
317  * @dma_dev:	device to map to
318  * @state:	DMA IOVA state
319  * @iter:	block layer DMA iterator
320  *
321  * Start DMA mapping @req integrity data to @dma_dev.  @state and @iter are
322  * provided by the caller and don't need to be initialized.  @state needs to be
323  * stored for use at unmap time, @iter is only needed at map time.
324  *
325  * Returns %false if there is no segment to map, including due to an error, or
326  * %true if it did map a segment.
327  *
328  * If a segment was mapped, the DMA address for it is returned in @iter.addr
329  * and the length in @iter.len.  If no segment was mapped the status code is
330  * returned in @iter.status.
331  *
332  * The caller can call blk_rq_dma_map_coalesce() to check if further segments
333  * need to be mapped after this, or go straight to blk_rq_dma_map_iter_next()
334  * to try to map the following segments.
335  */
336 bool blk_rq_integrity_dma_map_iter_start(struct request *req,
337 		struct device *dma_dev, struct dma_iova_state *state,
338 		struct blk_dma_iter *iter)
339 {
340 	unsigned len = bio_integrity_bytes(&req->q->limits.integrity,
341 					   blk_rq_sectors(req));
342 	struct bio *bio = req->bio;
343 
344 	iter->iter = (struct blk_map_iter) {
345 		.bio = bio,
346 		.iter = bio_integrity(bio)->bip_iter,
347 		.bvecs = bio_integrity(bio)->bip_vec,
348 		.is_integrity = true,
349 	};
350 	return blk_dma_map_iter_start(req, dma_dev, state, iter, len);
351 }
352 EXPORT_SYMBOL_GPL(blk_rq_integrity_dma_map_iter_start);
353 
354 /**
355  * blk_rq_integrity_dma_map_iter_next - map the next integrity DMA segment for
356  * 					 a request
357  * @req:	request to map
358  * @dma_dev:	device to map to
360  * @iter:	block layer DMA iterator
361  *
362  * Iterate to the next integrity mapping after a previous call to
363  * blk_rq_integrity_dma_map_iter_start().  See there for a detailed description
364  * of the arguments.
365  *
366  * Returns %false if there is no segment to map, including due to an error, or
367  * %true if it did map a segment.
368  *
369  * If a segment was mapped, the DMA address for it is returned in @iter.addr and
370  * the length in @iter.len.  If no segment was mapped the status code is
371  * returned in @iter.status.
372  */
373 bool blk_rq_integrity_dma_map_iter_next(struct request *req,
374 		struct device *dma_dev, struct blk_dma_iter *iter)
375 {
376 	struct phys_vec vec;
377 
378 	if (!blk_map_iter_next(req, &iter->iter, &vec))
379 		return false;
380 
381 	if (iter->p2pdma.map == PCI_P2PDMA_MAP_BUS_ADDR)
382 		return blk_dma_map_bus(iter, &vec);
383 	return blk_dma_map_direct(req, dma_dev, iter, &vec);
384 }
385 EXPORT_SYMBOL_GPL(blk_rq_integrity_dma_map_iter_next);
386 
387 /**
388  * blk_rq_map_integrity_sg - Map integrity metadata into a scatterlist
389  * @rq:		request to map
390  * @sglist:	target scatterlist
391  *
392  * Description: Map the integrity vectors in the request into a
393  * scatterlist.  The scatterlist must be big enough to hold all
394  * elements, i.e. sized using blk_rq_count_integrity_sg() or
395  * rq->nr_integrity_segments.
396  */
397 int blk_rq_map_integrity_sg(struct request *rq, struct scatterlist *sglist)
398 {
399 	struct request_queue *q = rq->q;
400 	struct scatterlist *sg = NULL;
401 	struct bio *bio = rq->bio;
402 	unsigned int segments = 0;
403 	struct phys_vec vec;
404 
405 	struct blk_map_iter iter = {
406 		.bio = bio,
407 		.iter = bio_integrity(bio)->bip_iter,
408 		.bvecs = bio_integrity(bio)->bip_vec,
409 		.is_integrity = true,
410 	};
411 
412 	while (blk_map_iter_next(rq, &iter, &vec)) {
413 		sg = blk_next_sg(&sg, sglist);
414 		sg_set_page(sg, phys_to_page(vec.paddr), vec.len,
415 				offset_in_page(vec.paddr));
416 		segments++;
417 	}
418 
419 	if (sg)
420 		sg_mark_end(sg);
421 
422 	/*
423 	 * Something must have gone wrong if the computed number of segments
424 	 * is bigger than the number of the request's integrity segments
425 	 */
426 	BUG_ON(segments > rq->nr_integrity_segments);
427 	BUG_ON(segments > queue_max_integrity_segments(q));
428 	return segments;
429 }
430 EXPORT_SYMBOL(blk_rq_map_integrity_sg);
431 #endif
432