xref: /linux/block/blk-mq-dma.c (revision c31f4aa8fed048fa70e742c4bb49bb48dc489ab3)
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2025 Christoph Hellwig
 */
#include <linux/blk-integrity.h>
#include <linux/blk-mq-dma.h>
#include "blk.h"

struct phys_vec {
	phys_addr_t	paddr;
	u32		len;
};

static bool __blk_map_iter_next(struct blk_map_iter *iter)
{
	if (iter->iter.bi_size)
		return true;
	if (!iter->bio || !iter->bio->bi_next)
		return false;

	iter->bio = iter->bio->bi_next;
	if (iter->is_integrity) {
		iter->iter = bio_integrity(iter->bio)->bip_iter;
		iter->bvecs = bio_integrity(iter->bio)->bip_vec;
	} else {
		iter->iter = iter->bio->bi_iter;
		iter->bvecs = iter->bio->bi_io_vec;
	}
	return true;
}

static bool blk_map_iter_next(struct request *req, struct blk_map_iter *iter,
			      struct phys_vec *vec)
{
	unsigned int max_size;
	struct bio_vec bv;

	if (!iter->iter.bi_size)
		return false;

	bv = mp_bvec_iter_bvec(iter->bvecs, iter->iter);
	vec->paddr = bvec_phys(&bv);
	max_size = get_max_segment_size(&req->q->limits, vec->paddr, UINT_MAX);
	bv.bv_len = min(bv.bv_len, max_size);
	bvec_iter_advance_single(iter->bvecs, &iter->iter, bv.bv_len);
	/*
	 * If we are entirely done with this bi_io_vec entry, check if the next
	 * one could be merged into it.  This typically happens when moving to
	 * the next bio, but some callers also don't pack bvecs tightly.
	 */
	while (!iter->iter.bi_size || !iter->iter.bi_bvec_done) {
		struct bio_vec next;

		if (!__blk_map_iter_next(iter))
			break;

		next = mp_bvec_iter_bvec(iter->bvecs, iter->iter);
		if (bv.bv_len + next.bv_len > max_size ||
		    !biovec_phys_mergeable(req->q, &bv, &next))
			break;

		bv.bv_len += next.bv_len;
		bvec_iter_advance_single(iter->bvecs, &iter->iter, next.bv_len);
	}

	vec->len = bv.bv_len;
	return true;
}
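
/*
 * Worked example (illustrative numbers only, not tied to any real device):
 * two 4k bvecs that sit at physical addresses 0x1000 and 0x2000 but belong to
 * two different bios come back from blk_map_iter_next() as a single phys_vec
 * { .paddr = 0x1000, .len = 0x2000 }, provided the queue limits allow an 8k
 * segment and biovec_phys_mergeable() agrees.
 */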

/*
 * The IOVA-based DMA API wants to be able to coalesce at the minimal IOMMU page
 * size granularity (which is guaranteed to be <= PAGE_SIZE and usually 4k), so
 * we need to ensure our segments are aligned to this as well.
 *
 * Note that there is no point in using the slightly more complicated IOVA based
 * path for single segment mappings.
 */
static inline bool blk_can_dma_map_iova(struct request *req,
		struct device *dma_dev)
{
	return !(req_phys_gap_mask(req) & dma_get_merge_boundary(dma_dev));
}
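
/*
 * Illustration (a sketch; the exact mask depends on the IOMMU driver): with a
 * 4k IOMMU granule dma_get_merge_boundary() typically returns 0xfff, so the
 * check above only allows the IOVA path when every boundary between the
 * request's physical segments is 4k aligned.  Without IOMMU-backed merging the
 * boundary is 0 and the check trivially passes, but dma_iova_try_alloc() below
 * is then expected to fail so that the direct-mapping path is used instead.
 */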

static bool blk_dma_map_bus(struct blk_dma_iter *iter, struct phys_vec *vec)
{
	iter->addr = pci_p2pdma_bus_addr_map(iter->p2pdma.mem, vec->paddr);
	iter->len = vec->len;
	return true;
}

static bool blk_dma_map_direct(struct request *req, struct device *dma_dev,
		struct blk_dma_iter *iter, struct phys_vec *vec)
{
	unsigned int attrs = 0;

	if (iter->p2pdma.map == PCI_P2PDMA_MAP_THRU_HOST_BRIDGE)
		attrs |= DMA_ATTR_MMIO;

	iter->addr = dma_map_phys(dma_dev, vec->paddr, vec->len,
			rq_dma_dir(req), attrs);
	if (dma_mapping_error(dma_dev, iter->addr)) {
		iter->status = BLK_STS_RESOURCE;
		return false;
	}
	iter->len = vec->len;
	return true;
}

static bool blk_rq_dma_map_iova(struct request *req, struct device *dma_dev,
		struct dma_iova_state *state, struct blk_dma_iter *iter,
		struct phys_vec *vec)
{
	enum dma_data_direction dir = rq_dma_dir(req);
	unsigned int mapped = 0;
	unsigned int attrs = 0;
	int error;

	iter->addr = state->addr;
	iter->len = dma_iova_size(state);

	if (iter->p2pdma.map == PCI_P2PDMA_MAP_THRU_HOST_BRIDGE)
		attrs |= DMA_ATTR_MMIO;

	do {
		error = dma_iova_link(dma_dev, state, vec->paddr, mapped,
				vec->len, dir, attrs);
		if (error)
			break;
		mapped += vec->len;
	} while (blk_map_iter_next(req, &iter->iter, vec));

	error = dma_iova_sync(dma_dev, state, 0, mapped);
	if (error) {
		iter->status = errno_to_blk_status(error);
		return false;
	}

	return true;
}

static inline void blk_rq_map_iter_init(struct request *rq,
					struct blk_map_iter *iter)
{
	struct bio *bio = rq->bio;

	if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) {
		*iter = (struct blk_map_iter) {
			.bvecs = &rq->special_vec,
			.iter = {
				.bi_size = rq->special_vec.bv_len,
			}
		};
	} else if (bio) {
		*iter = (struct blk_map_iter) {
			.bio = bio,
			.bvecs = bio->bi_io_vec,
			.iter = bio->bi_iter,
		};
	} else {
		/* the internal flush request may not have a bio attached */
		*iter = (struct blk_map_iter) {};
	}
}

static bool blk_dma_map_iter_start(struct request *req, struct device *dma_dev,
		struct dma_iova_state *state, struct blk_dma_iter *iter,
		unsigned int total_len)
{
	struct phys_vec vec;

	memset(&iter->p2pdma, 0, sizeof(iter->p2pdma));
	iter->status = BLK_STS_OK;
	iter->p2pdma.map = PCI_P2PDMA_MAP_NONE;

	/*
	 * Grab the first segment ASAP because we'll need it to check for P2P
	 * transfers.
	 */
	if (!blk_map_iter_next(req, &iter->iter, &vec))
		return false;

	switch (pci_p2pdma_state(&iter->p2pdma, dma_dev,
				 phys_to_page(vec.paddr))) {
	case PCI_P2PDMA_MAP_BUS_ADDR:
		return blk_dma_map_bus(iter, &vec);
	case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
		/*
		 * P2P transfers through the host bridge are treated the
		 * same as non-P2P transfers below and during unmap.
		 */
	case PCI_P2PDMA_MAP_NONE:
		break;
	default:
		iter->status = BLK_STS_INVAL;
		return false;
	}

	if (blk_can_dma_map_iova(req, dma_dev) &&
	    dma_iova_try_alloc(dma_dev, state, vec.paddr, total_len))
		return blk_rq_dma_map_iova(req, dma_dev, state, iter, &vec);
	memset(state, 0, sizeof(*state));
	return blk_dma_map_direct(req, dma_dev, iter, &vec);
}

/**
 * blk_rq_dma_map_iter_start - map the first DMA segment for a request
 * @req:	request to map
 * @dma_dev:	device to map to
 * @state:	DMA IOVA state
 * @iter:	block layer DMA iterator
 *
 * Start DMA mapping @req to @dma_dev.  @state and @iter are provided by the
 * caller and don't need to be initialized.  @state needs to be stored for use
 * at unmap time, @iter is only needed at map time.
 *
 * Returns %false if there is no segment to map, including due to an error, or
 * %true if it did map a segment.
 *
 * If a segment was mapped, the DMA address for it is returned in @iter.addr and
 * the length in @iter.len.  If no segment was mapped the status code is
 * returned in @iter.status.
 *
 * The caller can call blk_rq_dma_map_coalesce() to check if further segments
 * need to be mapped after this, or go straight to blk_rq_dma_map_iter_next()
 * to try to map the following segments.
 */
bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev,
		struct dma_iova_state *state, struct blk_dma_iter *iter)
{
	blk_rq_map_iter_init(req, &iter->iter);
	return blk_dma_map_iter_start(req, dma_dev, state, iter,
				      blk_rq_payload_bytes(req));
}
EXPORT_SYMBOL_GPL(blk_rq_dma_map_iter_start);

/**
 * blk_rq_dma_map_iter_next - map the next DMA segment for a request
 * @req:	request to map
 * @dma_dev:	device to map to
 * @state:	DMA IOVA state
 * @iter:	block layer DMA iterator
 *
 * Iterate to the next mapping after a previous call to
 * blk_rq_dma_map_iter_start().  See there for a detailed description of the
 * arguments.
 *
 * Returns %false if there is no segment to map, including due to an error, or
 * %true if it did map a segment.
 *
 * If a segment was mapped, the DMA address for it is returned in @iter.addr and
 * the length in @iter.len.  If no segment was mapped the status code is
 * returned in @iter.status.
 */
bool blk_rq_dma_map_iter_next(struct request *req, struct device *dma_dev,
		struct dma_iova_state *state, struct blk_dma_iter *iter)
{
	struct phys_vec vec;

	if (!blk_map_iter_next(req, &iter->iter, &vec))
		return false;

	if (iter->p2pdma.map == PCI_P2PDMA_MAP_BUS_ADDR)
		return blk_dma_map_bus(iter, &vec);
	return blk_dma_map_direct(req, dma_dev, iter, &vec);
}
EXPORT_SYMBOL_GPL(blk_rq_dma_map_iter_next);
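
/*
 * Usage sketch for the two helpers above.  This is illustrative only and not
 * taken from an in-tree driver; "dev", "hwreq" and its "sge" array are
 * hypothetical driver-side names.  The same loop works whether or not the
 * IOVA path coalesced the request: in the coalesced case the first call
 * already returns a single address/length pair covering the whole payload and
 * the following call returns false.  Drivers that pick a hardware descriptor
 * format up front can additionally look at blk_rq_dma_map_coalesce(state).
 *
 *	struct blk_dma_iter iter;
 *	int nr = 0;
 *
 *	if (!blk_rq_dma_map_iter_start(req, dev, &hwreq->dma_state, &iter))
 *		return iter.status;
 *	do {
 *		hwreq->sge[nr].addr = iter.addr;
 *		hwreq->sge[nr].len = iter.len;
 *		nr++;
 *	} while (blk_rq_dma_map_iter_next(req, dev, &hwreq->dma_state, &iter));
 *	if (iter.status != BLK_STS_OK)
 *		return iter.status;
 */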

static inline struct scatterlist *
blk_next_sg(struct scatterlist **sg, struct scatterlist *sglist)
{
	if (!*sg)
		return sglist;

	/*
	 * If the driver previously mapped a shorter list, we could see a
	 * termination bit prematurely unless it fully inits the sg table
	 * on each mapping. We KNOW that there must be more entries here
	 * or the driver would be buggy, so force clear the termination bit
	 * to avoid doing a full sg_init_table() in drivers for each command.
	 */
	sg_unmark_end(*sg);
	return sg_next(*sg);
}

/*
 * Map a request to a scatterlist, returning the number of sg entries set up.
 * The caller must make sure sg can hold rq->nr_phys_segments entries.
 */
int __blk_rq_map_sg(struct request *rq, struct scatterlist *sglist,
		    struct scatterlist **last_sg)
{
	struct blk_map_iter iter;
	struct phys_vec vec;
	int nsegs = 0;

	blk_rq_map_iter_init(rq, &iter);
	while (blk_map_iter_next(rq, &iter, &vec)) {
		*last_sg = blk_next_sg(last_sg, sglist);
		sg_set_page(*last_sg, phys_to_page(vec.paddr), vec.len,
				offset_in_page(vec.paddr));
		nsegs++;
	}

	if (*last_sg)
		sg_mark_end(*last_sg);

	/*
	 * Something must have gone wrong if the number of segments we computed
	 * is bigger than the number of the request's physical segments.
	 */
	WARN_ON(nsegs > blk_rq_nr_phys_segments(rq));

	return nsegs;
}
EXPORT_SYMBOL(__blk_rq_map_sg);
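
/*
 * Caller sketch (hypothetical; "sgl" and "dev" are placeholders): the sg table
 * must have room for blk_rq_nr_phys_segments(rq) entries before mapping, and
 * the result is then handed to the DMA API as usual.
 *
 *	struct scatterlist *last_sg = NULL;
 *	int nents;
 *
 *	nents = __blk_rq_map_sg(rq, sgl, &last_sg);
 *	nents = dma_map_sg(dev, sgl, nents, rq_dma_dir(rq));
 *	if (!nents)
 *		return BLK_STS_RESOURCE;
 */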

#ifdef CONFIG_BLK_DEV_INTEGRITY
/**
 * blk_rq_integrity_dma_map_iter_start - map the first integrity DMA segment
 * 					 for a request
 * @req:	request to map
 * @dma_dev:	device to map to
 * @state:	DMA IOVA state
 * @iter:	block layer DMA iterator
 *
 * Start DMA mapping @req integrity data to @dma_dev.  @state and @iter are
 * provided by the caller and don't need to be initialized.  @state needs to be
 * stored for use at unmap time, @iter is only needed at map time.
 *
 * Returns %false if there is no segment to map, including due to an error, or
 * %true if it did map a segment.
 *
 * If a segment was mapped, the DMA address for it is returned in @iter.addr
 * and the length in @iter.len.  If no segment was mapped the status code is
 * returned in @iter.status.
 *
 * The caller can call blk_rq_dma_map_coalesce() to check if further segments
 * need to be mapped after this, or go straight to
 * blk_rq_integrity_dma_map_iter_next() to try to map the following segments.
 */
bool blk_rq_integrity_dma_map_iter_start(struct request *req,
		struct device *dma_dev, struct dma_iova_state *state,
		struct blk_dma_iter *iter)
{
	unsigned len = bio_integrity_bytes(&req->q->limits.integrity,
					   blk_rq_sectors(req));
	struct bio *bio = req->bio;

	iter->iter = (struct blk_map_iter) {
		.bio = bio,
		.iter = bio_integrity(bio)->bip_iter,
		.bvecs = bio_integrity(bio)->bip_vec,
		.is_integrity = true,
	};
	return blk_dma_map_iter_start(req, dma_dev, state, iter, len);
}
EXPORT_SYMBOL_GPL(blk_rq_integrity_dma_map_iter_start);

/**
 * blk_rq_integrity_dma_map_iter_next - map the next integrity DMA segment for
 * 					 a request
 * @req:	request to map
 * @dma_dev:	device to map to
 * @iter:	block layer DMA iterator
 *
 * Iterate to the next integrity mapping after a previous call to
 * blk_rq_integrity_dma_map_iter_start().  See there for a detailed description
 * of the arguments.
 *
 * Returns %false if there is no segment to map, including due to an error, or
 * %true if it did map a segment.
 *
 * If a segment was mapped, the DMA address for it is returned in @iter.addr and
 * the length in @iter.len.  If no segment was mapped the status code is
 * returned in @iter.status.
 */
bool blk_rq_integrity_dma_map_iter_next(struct request *req,
		struct device *dma_dev, struct blk_dma_iter *iter)
{
	struct phys_vec vec;

	if (!blk_map_iter_next(req, &iter->iter, &vec))
		return false;

	if (iter->p2pdma.map == PCI_P2PDMA_MAP_BUS_ADDR)
		return blk_dma_map_bus(iter, &vec);
	return blk_dma_map_direct(req, dma_dev, iter, &vec);
}
EXPORT_SYMBOL_GPL(blk_rq_integrity_dma_map_iter_next);
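
/*
 * The integrity helpers follow the same calling pattern as the data-mapping
 * helpers above; a made-up caller (placeholder names "dev", "meta_sge" and
 * "n") mapping the metadata of a request might look like:
 *
 *	if (blk_rq_integrity_dma_map_iter_start(req, dev, &state, &iter)) {
 *		do {
 *			meta_sge[n].addr = iter.addr;
 *			meta_sge[n].len = iter.len;
 *			n++;
 *		} while (blk_rq_integrity_dma_map_iter_next(req, dev, &iter));
 *	}
 *	if (iter.status != BLK_STS_OK)
 *		return iter.status;
 */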

/**
 * blk_rq_map_integrity_sg - Map integrity metadata into a scatterlist
 * @rq:		request to map
 * @sglist:	target scatterlist
 *
 * Description: Map the integrity vectors of the request into a scatterlist.
 * The scatterlist must be big enough to hold all elements, i.e. it must be
 * sized using blk_rq_count_integrity_sg() or rq->nr_integrity_segments.
 */
int blk_rq_map_integrity_sg(struct request *rq, struct scatterlist *sglist)
{
	struct request_queue *q = rq->q;
	struct scatterlist *sg = NULL;
	struct bio *bio = rq->bio;
	unsigned int segments = 0;
	struct phys_vec vec;

	struct blk_map_iter iter = {
		.bio = bio,
		.iter = bio_integrity(bio)->bip_iter,
		.bvecs = bio_integrity(bio)->bip_vec,
		.is_integrity = true,
	};

	while (blk_map_iter_next(rq, &iter, &vec)) {
		sg = blk_next_sg(&sg, sglist);
		sg_set_page(sg, phys_to_page(vec.paddr), vec.len,
				offset_in_page(vec.paddr));
		segments++;
	}

	if (sg)
		sg_mark_end(sg);

	/*
	 * Something must have gone wrong if the number of segments we computed
	 * is bigger than the number of the request's physical integrity
	 * segments.
	 */
	BUG_ON(segments > rq->nr_integrity_segments);
	BUG_ON(segments > queue_max_integrity_segments(q));
	return segments;
}
EXPORT_SYMBOL(blk_rq_map_integrity_sg);
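
/*
 * Caller sketch (placeholder names "sgl" and "dev"): size the table from
 * rq->nr_integrity_segments, map the metadata, then hand the list to the DMA
 * API.
 *
 *	sg_init_table(sgl, rq->nr_integrity_segments);
 *	nents = blk_rq_map_integrity_sg(rq, sgl);
 *	nents = dma_map_sg(dev, sgl, nents, rq_dma_dir(rq));
 */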
#endif /* CONFIG_BLK_DEV_INTEGRITY */