// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2025 Christoph Hellwig
 */
#include <linux/blk-integrity.h>
#include <linux/blk-mq-dma.h>
#include "blk.h"

static bool __blk_map_iter_next(struct blk_map_iter *iter)
{
	if (iter->iter.bi_size)
		return true;
	if (!iter->bio || !iter->bio->bi_next)
		return false;

	iter->bio = iter->bio->bi_next;
	if (iter->is_integrity) {
		iter->iter = bio_integrity(iter->bio)->bip_iter;
		iter->bvecs = bio_integrity(iter->bio)->bip_vec;
	} else {
		iter->iter = iter->bio->bi_iter;
		iter->bvecs = iter->bio->bi_io_vec;
	}
	return true;
}

static bool blk_map_iter_next(struct request *req, struct blk_map_iter *iter,
			      struct phys_vec *vec)
{
	unsigned int max_size;
	struct bio_vec bv;

	if (!iter->iter.bi_size)
		return false;

	bv = mp_bvec_iter_bvec(iter->bvecs, iter->iter);
	vec->paddr = bvec_phys(&bv);
	max_size = get_max_segment_size(&req->q->limits, vec->paddr, UINT_MAX);
	bv.bv_len = min(bv.bv_len, max_size);
	bvec_iter_advance_single(iter->bvecs, &iter->iter, bv.bv_len);

	/*
	 * If we are entirely done with this bi_io_vec entry, check if the next
	 * one could be merged into it.  This typically happens when moving to
	 * the next bio, but some callers also don't pack bvecs tightly.
	 */
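	/*
	 * Illustrative example (not from the original source): two physically
	 * contiguous 512-byte bvecs, e.g. at the boundary between two chained
	 * bios, are folded into a single 1k segment here, provided the
	 * combined length stays within max_size and biovec_phys_mergeable()
	 * agrees.
	 */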
	while (!iter->iter.bi_size || !iter->iter.bi_bvec_done) {
		struct bio_vec next;

		if (!__blk_map_iter_next(iter))
			break;

		next = mp_bvec_iter_bvec(iter->bvecs, iter->iter);
		if (bv.bv_len + next.bv_len > max_size ||
		    !biovec_phys_mergeable(req->q, &bv, &next))
			break;

		bv.bv_len += next.bv_len;
		bvec_iter_advance_single(iter->bvecs, &iter->iter, next.bv_len);
	}

	vec->len = bv.bv_len;
	return true;
}

/*
 * The IOVA-based DMA API wants to be able to coalesce at the minimal IOMMU page
 * size granularity (which is guaranteed to be <= PAGE_SIZE and usually 4k), so
 * we need to ensure our segments are aligned to this as well.
 *
 * Note that there is no point in using the slightly more complicated IOVA based
 * path for single segment mappings.
 */
static inline bool blk_can_dma_map_iova(struct request *req,
		struct device *dma_dev)
{
	return !(req_phys_gap_mask(req) & dma_get_merge_boundary(dma_dev));
}
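
/*
 * Illustrative example (not from the original source): with a 4k IOMMU
 * granule, dma_get_merge_boundary() returns 0xfff, so any segment boundary
 * in the request that is not 4k-aligned shows up in req_phys_gap_mask()
 * and disqualifies the IOVA path.
 */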

static bool blk_dma_map_bus(struct blk_dma_iter *iter, struct phys_vec *vec)
{
	iter->addr = pci_p2pdma_bus_addr_map(iter->p2pdma.mem, vec->paddr);
	iter->len = vec->len;
	return true;
}

static bool blk_dma_map_direct(struct request *req, struct device *dma_dev,
		struct blk_dma_iter *iter, struct phys_vec *vec)
{
	unsigned int attrs = 0;

	if (iter->p2pdma.map == PCI_P2PDMA_MAP_THRU_HOST_BRIDGE)
		attrs |= DMA_ATTR_MMIO;

	iter->addr = dma_map_phys(dma_dev, vec->paddr, vec->len,
			rq_dma_dir(req), attrs);
	if (dma_mapping_error(dma_dev, iter->addr)) {
		iter->status = BLK_STS_RESOURCE;
		return false;
	}
	iter->len = vec->len;
	return true;
}

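/*
 * Link all segments produced by the iterator into a single IOVA mapping.  On
 * success, iter->addr and iter->len describe one contiguous DMA range; on
 * failure, everything linked so far is torn down again via
 * dma_iova_destroy().
 */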
static bool blk_rq_dma_map_iova(struct request *req, struct device *dma_dev,
		struct dma_iova_state *state, struct blk_dma_iter *iter,
		struct phys_vec *vec)
{
	enum dma_data_direction dir = rq_dma_dir(req);
	unsigned int attrs = 0;
	size_t mapped = 0;
	int error;

	iter->addr = state->addr;
	iter->len = dma_iova_size(state);

	if (iter->p2pdma.map == PCI_P2PDMA_MAP_THRU_HOST_BRIDGE)
		attrs |= DMA_ATTR_MMIO;

	do {
		error = dma_iova_link(dma_dev, state, vec->paddr, mapped,
				vec->len, dir, attrs);
		if (error)
			goto out_unlink;
		mapped += vec->len;
	} while (blk_map_iter_next(req, &iter->iter, vec));

	error = dma_iova_sync(dma_dev, state, 0, mapped);
	if (error)
		goto out_unlink;

	return true;

out_unlink:
	dma_iova_destroy(dma_dev, state, mapped, dir, attrs);
	iter->status = errno_to_blk_status(error);
	return false;
}

static inline void blk_rq_map_iter_init(struct request *rq,
					struct blk_map_iter *iter)
{
	struct bio *bio = rq->bio;

	if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) {
		*iter = (struct blk_map_iter) {
			.bvecs = &rq->special_vec,
			.iter = {
				.bi_size = rq->special_vec.bv_len,
			}
		};
	} else if (bio) {
		*iter = (struct blk_map_iter) {
			.bio = bio,
			.bvecs = bio->bi_io_vec,
			.iter = bio->bi_iter,
		};
	} else {
		/* the internal flush request may not have a bio attached */
		*iter = (struct blk_map_iter) {};
	}
}

static bool blk_dma_map_iter_start(struct request *req, struct device *dma_dev,
		struct dma_iova_state *state, struct blk_dma_iter *iter,
		unsigned int total_len)
{
	struct phys_vec vec;

	memset(&iter->p2pdma, 0, sizeof(iter->p2pdma));
	iter->status = BLK_STS_OK;
	iter->p2pdma.map = PCI_P2PDMA_MAP_NONE;

	/*
	 * Grab the first segment ASAP because we'll need it to check for P2P
	 * transfers.
	 */
	if (!blk_map_iter_next(req, &iter->iter, &vec))
		return false;

	switch (pci_p2pdma_state(&iter->p2pdma, dma_dev,
				 phys_to_page(vec.paddr))) {
	case PCI_P2PDMA_MAP_BUS_ADDR:
		return blk_dma_map_bus(iter, &vec);
	case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
		/*
		 * P2P transfers through the host bridge are treated the
		 * same as non-P2P transfers below and during unmap.
		 */
	case PCI_P2PDMA_MAP_NONE:
		break;
	default:
		iter->status = BLK_STS_INVAL;
		return false;
	}

	if (blk_can_dma_map_iova(req, dma_dev) &&
	    dma_iova_try_alloc(dma_dev, state, vec.paddr, total_len))
		return blk_rq_dma_map_iova(req, dma_dev, state, iter, &vec);
	memset(state, 0, sizeof(*state));
	return blk_dma_map_direct(req, dma_dev, iter, &vec);
}

/**
 * blk_rq_dma_map_iter_start - map the first DMA segment for a request
 * @req:	request to map
 * @dma_dev:	device to map to
 * @state:	DMA IOVA state
 * @iter:	block layer DMA iterator
 *
 * Start DMA mapping @req to @dma_dev.  @state and @iter are provided by the
 * caller and don't need to be initialized.  @state needs to be stored for use
 * at unmap time, @iter is only needed at map time.
 *
 * Returns %false if there is no segment to map, including due to an error, or
 * %true if it did map a segment.
 *
 * If a segment was mapped, the DMA address for it is returned in @iter.addr and
 * the length in @iter.len.  If no segment was mapped the status code is
 * returned in @iter.status.
 *
 * The caller can call blk_rq_dma_map_coalesce() to check if further segments
 * need to be mapped after this, or go straight to blk_rq_dma_map_iter_next()
 * to try to map the following segments.
 */
bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev,
		struct dma_iova_state *state, struct blk_dma_iter *iter)
{
	blk_rq_map_iter_init(req, &iter->iter);
	return blk_dma_map_iter_start(req, dma_dev, state, iter,
				      blk_rq_payload_bytes(req));
}
EXPORT_SYMBOL_GPL(blk_rq_dma_map_iter_start);

/**
 * blk_rq_dma_map_iter_next - map the next DMA segment for a request
 * @req:	request to map
 * @dma_dev:	device to map to
 * @iter:	block layer DMA iterator
 *
 * Iterate to the next mapping after a previous call to
 * blk_rq_dma_map_iter_start().  See there for a detailed description of the
 * arguments.
 *
 * Returns %false if there is no segment to map, including due to an error, or
 * %true if it did map a segment.
 *
 * If a segment was mapped, the DMA address for it is returned in @iter.addr and
 * the length in @iter.len.  If no segment was mapped the status code is
 * returned in @iter.status.
 */
bool blk_rq_dma_map_iter_next(struct request *req, struct device *dma_dev,
		struct blk_dma_iter *iter)
{
	struct phys_vec vec;

	if (!blk_map_iter_next(req, &iter->iter, &vec))
		return false;

	if (iter->p2pdma.map == PCI_P2PDMA_MAP_BUS_ADDR)
		return blk_dma_map_bus(iter, &vec);
	return blk_dma_map_direct(req, dma_dev, iter, &vec);
}
EXPORT_SYMBOL_GPL(blk_rq_dma_map_iter_next);
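
/*
 * Example usage (a hedged sketch, not part of the original file): a driver
 * would normally start the iteration, program each returned segment, and
 * check @iter.status once the iterator stops:
 *
 *	struct dma_iova_state state;
 *	struct blk_dma_iter iter;
 *
 *	if (!blk_rq_dma_map_iter_start(req, dma_dev, &state, &iter))
 *		return iter.status;
 *	do {
 *		... program iter.addr and iter.len into the device SGL ...
 *	} while (blk_rq_dma_map_iter_next(req, dma_dev, &iter));
 *	if (iter.status != BLK_STS_OK)
 *		return iter.status;
 *
 * If blk_rq_dma_map_coalesce() reports a coalesced mapping, the single
 * address/length pair from the first call already covers the whole request.
 */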

static inline struct scatterlist *
blk_next_sg(struct scatterlist **sg, struct scatterlist *sglist)
{
	if (!*sg)
		return sglist;

	/*
	 * If the driver previously mapped a shorter list, we could see a
	 * termination bit prematurely unless it fully inits the sg table
	 * on each mapping. We KNOW that there must be more entries here
	 * or the driver would be buggy, so force clear the termination bit
	 * to avoid doing a full sg_init_table() in drivers for each command.
	 */
	sg_unmark_end(*sg);
	return sg_next(*sg);
}

/*
 * Map a request to scatterlist, return the number of sg entries set up.
 * Caller must make sure sg can hold rq->nr_phys_segments entries.
 */
int __blk_rq_map_sg(struct request *rq, struct scatterlist *sglist,
		    struct scatterlist **last_sg)
{
	struct blk_map_iter iter;
	struct phys_vec vec;
	int nsegs = 0;

	blk_rq_map_iter_init(rq, &iter);
	while (blk_map_iter_next(rq, &iter, &vec)) {
		*last_sg = blk_next_sg(last_sg, sglist);

		WARN_ON_ONCE(overflows_type(vec.len, unsigned int));
		sg_set_page(*last_sg, phys_to_page(vec.paddr), vec.len,
				offset_in_page(vec.paddr));
		nsegs++;
	}

	if (*last_sg)
		sg_mark_end(*last_sg);

	/*
	 * Something must be wrong if the calculated number of segments is
	 * bigger than the number of the request's physical segments.
	 */
	WARN_ON(nsegs > blk_rq_nr_phys_segments(rq));

	return nsegs;
}
EXPORT_SYMBOL(__blk_rq_map_sg);
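
/*
 * Example (a hedged sketch, not from the original file): a caller passes a
 * table sized for the request and a cursor for the last used entry:
 *
 *	struct scatterlist *last_sg = NULL;
 *	int nsegs = __blk_rq_map_sg(rq, sglist, &last_sg);
 *
 * where sglist holds at least blk_rq_nr_phys_segments(rq) entries and only
 * needs a full sg_init_table() once at allocation time (see blk_next_sg()
 * above for why not per command).
 */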

#ifdef CONFIG_BLK_DEV_INTEGRITY
/**
 * blk_rq_integrity_dma_map_iter_start - map the first integrity DMA segment
 *					 for a request
 * @req:	request to map
 * @dma_dev:	device to map to
 * @state:	DMA IOVA state
 * @iter:	block layer DMA iterator
 *
 * Start DMA mapping @req integrity data to @dma_dev.  @state and @iter are
 * provided by the caller and don't need to be initialized.  @state needs to be
 * stored for use at unmap time, @iter is only needed at map time.
 *
 * Returns %false if there is no segment to map, including due to an error, or
 * %true if it did map a segment.
 *
 * If a segment was mapped, the DMA address for it is returned in @iter.addr
 * and the length in @iter.len.  If no segment was mapped the status code is
 * returned in @iter.status.
 *
 * The caller can call blk_rq_dma_map_coalesce() to check if further segments
 * need to be mapped after this, or go straight to
 * blk_rq_integrity_dma_map_iter_next() to try to map the following segments.
 */
bool blk_rq_integrity_dma_map_iter_start(struct request *req,
		struct device *dma_dev, struct dma_iova_state *state,
		struct blk_dma_iter *iter)
{
	unsigned len = bio_integrity_bytes(&req->q->limits.integrity,
					   blk_rq_sectors(req));
	struct bio *bio = req->bio;

	iter->iter = (struct blk_map_iter) {
		.bio = bio,
		.iter = bio_integrity(bio)->bip_iter,
		.bvecs = bio_integrity(bio)->bip_vec,
		.is_integrity = true,
	};
	return blk_dma_map_iter_start(req, dma_dev, state, iter, len);
}
EXPORT_SYMBOL_GPL(blk_rq_integrity_dma_map_iter_start);

/**
 * blk_rq_integrity_dma_map_iter_next - map the next integrity DMA segment for
 *					a request
 * @req:	request to map
 * @dma_dev:	device to map to
 * @iter:	block layer DMA iterator
 *
 * Iterate to the next integrity mapping after a previous call to
 * blk_rq_integrity_dma_map_iter_start().  See there for a detailed description
 * of the arguments.
 *
 * Returns %false if there is no segment to map, including due to an error, or
 * %true if it did map a segment.
 *
 * If a segment was mapped, the DMA address for it is returned in @iter.addr and
 * the length in @iter.len.  If no segment was mapped the status code is
 * returned in @iter.status.
 */
bool blk_rq_integrity_dma_map_iter_next(struct request *req,
		struct device *dma_dev, struct blk_dma_iter *iter)
{
	struct phys_vec vec;

	if (!blk_map_iter_next(req, &iter->iter, &vec))
		return false;

	if (iter->p2pdma.map == PCI_P2PDMA_MAP_BUS_ADDR)
		return blk_dma_map_bus(iter, &vec);
	return blk_dma_map_direct(req, dma_dev, iter, &vec);
}
EXPORT_SYMBOL_GPL(blk_rq_integrity_dma_map_iter_next);
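
/*
 * The integrity iterator is driven exactly like the data iterator above
 * (a hedged sketch, not part of the original file):
 *
 *	if (!blk_rq_integrity_dma_map_iter_start(req, dma_dev, &state, &iter))
 *		return iter.status;
 *	do {
 *		... program iter.addr and iter.len into the metadata SGL ...
 *	} while (blk_rq_integrity_dma_map_iter_next(req, dma_dev, &iter));
 *	if (iter.status != BLK_STS_OK)
 *		return iter.status;
 */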

/**
 * blk_rq_map_integrity_sg - Map integrity metadata into a scatterlist
 * @rq:		request to map
 * @sglist:	target scatterlist
 *
 * Description: Map the integrity vectors in request into a
 * scatterlist.  The scatterlist must be big enough to hold all
 * elements.  I.e. sized using blk_rq_count_integrity_sg() or
 * rq->nr_integrity_segments.
 */
int blk_rq_map_integrity_sg(struct request *rq, struct scatterlist *sglist)
{
	struct request_queue *q = rq->q;
	struct scatterlist *sg = NULL;
	struct bio *bio = rq->bio;
	unsigned int segments = 0;
	struct phys_vec vec;

	struct blk_map_iter iter = {
		.bio = bio,
		.iter = bio_integrity(bio)->bip_iter,
		.bvecs = bio_integrity(bio)->bip_vec,
		.is_integrity = true,
	};

	while (blk_map_iter_next(rq, &iter, &vec)) {
		sg = blk_next_sg(&sg, sglist);

		WARN_ON_ONCE(overflows_type(vec.len, unsigned int));
		sg_set_page(sg, phys_to_page(vec.paddr), vec.len,
				offset_in_page(vec.paddr));
		segments++;
	}

	if (sg)
		sg_mark_end(sg);

	/*
	 * Something must be wrong if the calculated number of segments is
	 * bigger than the number of the request's integrity segments.
	 */
	BUG_ON(segments > rq->nr_integrity_segments);
	BUG_ON(segments > queue_max_integrity_segments(q));
	return segments;
}
EXPORT_SYMBOL(blk_rq_map_integrity_sg);
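
/*
 * Example (a hedged sketch, not from the original file): the scatterlist is
 * sized from the precalculated integrity segment count before mapping:
 *
 *	sg_init_table(int_sgl, rq->nr_integrity_segments);
 *	nsegs = blk_rq_map_integrity_sg(rq, int_sgl);
 *
 * where int_sgl and nsegs are hypothetical caller-side names.
 */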
#endif /* CONFIG_BLK_DEV_INTEGRITY */
439