xref: /illumos-gate/usr/src/uts/common/os/dkioc_free_util.c (revision f73e1ebf60792a8bdb2d559097c3131b68c09318)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2017 Nexenta Inc.  All rights reserved.
14  * Copyright 2020 Joyent, Inc.
15  */
16 
17 /* needed when building libzpool */
18 #ifndef	_KERNEL
19 #include <sys/zfs_context.h>
20 #endif
21 
22 #include <sys/sunddi.h>
23 #include <sys/dkio.h>
24 #include <sys/dkioc_free_util.h>
25 #include <sys/sysmacros.h>
26 #include <sys/file.h>
27 #include <sys/sdt.h>
28 
29 static int adjust_exts(dkioc_free_list_t *, const dkioc_free_info_t *,
30     uint64_t len_blk);
31 static int split_extent(dkioc_free_list_t *, const dkioc_free_info_t *,
32     uint64_t, dfl_iter_fn_t, void *, int);
33 static int process_range(dkioc_free_list_t *, uint64_t, uint64_t,
34     dfl_iter_fn_t, void *, int);
35 
36 /*
37  * Copy-in convenience function for variable-length dkioc_free_list_t
38  * structures. The pointer to be copied from is in `arg' (may be a pointer
39  * to userspace). A new buffer is allocated and a pointer to it is placed
40  * in `out'. `ddi_flags' indicates whether the pointer is from user-
41  * or kernelspace (FKIOCTL) and `kmflags' are the flags passed to
42  * kmem_zalloc when allocating the new structure.
43  * Returns 0 on success, or an errno on failure.
44  */
45 int
46 dfl_copyin(void *arg, dkioc_free_list_t **out, int ddi_flags, int kmflags)
47 {
48 	dkioc_free_list_t *dfl;
49 
50 	if (ddi_flags & FKIOCTL) {
51 		dkioc_free_list_t *dfl_in = arg;
52 
53 		if (dfl_in->dfl_num_exts == 0 ||
54 		    dfl_in->dfl_num_exts > DFL_COPYIN_MAX_EXTS)
55 			return (SET_ERROR(EINVAL));
56 		dfl = kmem_alloc(DFL_SZ(dfl_in->dfl_num_exts), kmflags);
57 		if (dfl == NULL)
58 			return (SET_ERROR(ENOMEM));
59 		bcopy(dfl_in, dfl, DFL_SZ(dfl_in->dfl_num_exts));
60 	} else {
61 		uint64_t num_exts;
62 
63 		if (ddi_copyin(((uint8_t *)arg) + offsetof(dkioc_free_list_t,
64 		    dfl_num_exts), &num_exts, sizeof (num_exts),
65 		    ddi_flags) != 0)
66 			return (SET_ERROR(EFAULT));
67 		if (num_exts == 0 || num_exts > DFL_COPYIN_MAX_EXTS)
68 			return (SET_ERROR(EINVAL));
69 		dfl = kmem_alloc(DFL_SZ(num_exts), kmflags);
70 		if (dfl == NULL)
71 			return (SET_ERROR(ENOMEM));
72 		if (ddi_copyin(arg, dfl, DFL_SZ(num_exts), ddi_flags) != 0 ||
73 		    dfl->dfl_num_exts != num_exts) {
74 			kmem_free(dfl, DFL_SZ(num_exts));
75 			return (SET_ERROR(EFAULT));
76 		}
77 	}
78 
79 	*out = dfl;
80 	return (0);
81 }
82 
83 /* Frees a variable-length dkioc_free_list_t structure. */
84 void
85 dfl_free(dkioc_free_list_t *dfl)
86 {
87 	kmem_free(dfl, DFL_SZ(dfl->dfl_num_exts));
88 }
89 
90 /*
91  * Convenience function to resize and segment the array of extents in
92  * a DKIOCFREE request as required by a driver.
93  *
94  * Some devices that implement DKIOCFREE (e.g. vioblk) have limits
95  * on either the number of extents that can be submitted in a single request,
96  * or the total number of blocks that can be submitted in a single request.
97  * In addition, devices may have alignment requirements on the starting
98  * address stricter than the device block size.
99  *
100  * Since there is currently no mechanism for callers of DKIOCFREE to discover
101  * such restrictions, instead of rejecting any requests that do not conform to
102  * some undiscoverable (to the caller) set of requirements, a driver can use
103  * dfl_iter() to adjust and resegment the extents from a DKIOCFREE call as
104  * required to conform to its requirements.
105  *
106  * The original request is passed as 'dfl' and the alignment requirements
107  * are passed in 'dfi'. Additionally the maximum offset of the device allowed
108  * in bytes) is passed as max_off -- this allows a driver with
109  * multiple instances of different sizes but similar requirements (e.g.
110  * a partitioned blkdev device) to not construct a separate dkioc_free_info_t
111  * struct for each device.
112  *
113  * dfl_iter() will call 'func' with a dkioc_free_list_t and the value of
114  * arg passed to it as needed. If the extents in the dkioc_free_list_t passed
115  * to dfl_iter() meet all the requirements in 'dfi', the dkioc_free_list_t will
116  * be passed on to 'func' unmodified. If any of the extents passed to dfl_iter()
117  * do not meet the requirements, dfl_iter() will allocate new dkioc_free_list_t
118  * instances and populate them with the adjusted extents that do conform to the
119  * requirements in 'dfi'. dfl_iter() will also free the dkioc_free_list_t
120  * passed to it when this occurs. The net result is that 'func' can always
121  * assume it will be called with a dkioc_free_list_t with extents that
122  * comply with the requirements in 'dfi'. 'func' is also responsible for
123  * freeing the dkioc_free_list_t passed to it (likely via a completion
124  * callback).
125  *
126  * Combined with the behavior described above, dfl_iter() can be viewed as
127  * consuming the dkioc_free_list_t passed to it. Either it will pass it along
128  * to 'func' (and let 'func' handle freeing it), or it will free it and
129  * allocate one or more new dkioc_free_list_ts to pass to 'func' (while still
130  * letting 'func' handle freeing the new instances). This way neither the
131  * dfl_iter() caller nor nor the driver need to worry about treating
132  * conforming and non-conforming requests differently.
133  *
134  * Unfortunately, the DKIOCFREE ioctl provides no method for communicating
135  * any notion of partial completion -- either it returns success (0) or
136  * an error. It's not clear if such a notion would even be possible while
137  * supporting multiple types of devices (NVMe, SCSI, etc.) with the same
138  * interface. As such, there's little benefit to providing more detailed error
139  * semantics beyond what DKIOCFREE can handle.
140  *
141  * Due to this, a somewhat simplistic approach is taken to error handling. The
142  * original list of extents is first checked to make sure they all appear
143  * valid -- that is they do not start or extend beyond the end of the device.
144  * Any request that contains such extents is always rejected in it's entirety.
145  * It is possible after applying any needed adjustments to the original list
146  * of extents that the result is not acceptable to the driver. For example,
147  * a device with a 512 byte block size that tries to free the range 513-1023
148  * (bytes) would not be able to be processed. Such extents will be silently
149  * ignored. If the original request consists of nothing but such requests,
150  * dfl_iter() will never call 'func' and will merely return 0.
151  */
152 int
153 dfl_iter(dkioc_free_list_t *dfl, const dkioc_free_info_t *dfi, uint64_t max_off,
154     dfl_iter_fn_t func, void *arg, int kmflag)
155 {
156 	dkioc_free_list_ext_t *ext;
157 	uint64_t n_bytes, n_segs, start_idx, i;
158 	uint_t bsize = 1U << dfi->dfi_bshift;
159 	int r = 0;
160 	boolean_t need_copy = B_FALSE;
161 
162 	/*
163 	 * Make sure the block size derived from dfi_bshift is at least 512
164 	 * (1U << DEV_BSHIFT) bytes and less than 2^30. The lower bound is
165 	 * to prevent any problems with other parts of the system that might
166 	 * assume a minimum block size of 512, and the upper bound is just
167 	 * to prevent overflow when creating the block size from dfi_bshift
168 	 * (though it seems unlikely we'll have _block_ sizes near a GiB
169 	 * any time soon).
170 	 */
171 	if (dfi->dfi_bshift < DEV_BSHIFT || dfi->dfi_bshift > 30) {
172 		r = SET_ERROR(EINVAL);
173 		goto done;
174 	}
175 
176 	/* Max bytes must be a multiple of the block size */
177 	if (!IS_P2ALIGNED(dfi->dfi_max_bytes, bsize)) {
178 		r = SET_ERROR(EINVAL);
179 		goto done;
180 	}
181 
182 	/* Start offset alignment must also be a multiple of the block size */
183 	if (dfi->dfi_align == 0 || !IS_P2ALIGNED(dfi->dfi_align, bsize)) {
184 		r = SET_ERROR(EINVAL);
185 		goto done;
186 	}
187 
188 	/* Max bytes in an extent must be a multiple of the block size */
189 	if (!IS_P2ALIGNED(dfi->dfi_max_ext_bytes, bsize)) {
190 		r = SET_ERROR(EINVAL);
191 		goto done;
192 	}
193 
194 	/*
195 	 * It makes no sense to allow a single extent to be larger than the
196 	 * total allowed for an entire request.
197 	 */
198 	if (dfi->dfi_max_ext_bytes > 0 &&
199 	    dfi->dfi_max_ext_bytes > dfi->dfi_max_bytes) {
200 		r = SET_ERROR(EINVAL);
201 		goto done;
202 	}
203 
204 	/*
205 	 * The first pass, align everything as needed and make sure all the
206 	 * extents look valid.
207 	 */
208 	if ((r = adjust_exts(dfl, dfi, max_off)) != 0) {
209 		goto done;
210 	}
211 
212 	/*
213 	 * Go through and split things up as needed. The general idea is to
214 	 * split along the original extent boundaries when needed. We only
215 	 * split an extent from the original request into multiple extents
216 	 * if the original extent is by itself too big for the device to
217 	 * process in a single request.
218 	 */
219 	start_idx = 0;
220 	n_bytes = n_segs = 0;
221 	ext = dfl->dfl_exts;
222 	for (i = 0; i < dfl->dfl_num_exts; i++, ext++) {
223 		uint64_t start = dfl->dfl_offset + ext->dfle_start;
224 		uint64_t len = ext->dfle_length;
225 
226 		if (len == 0) {
227 			/*
228 			 * If we encounter a zero length extent, we're going
229 			 * to create a new copy of dfl no matter what --
230 			 * the size of dfl is determined by dfl_num_exts so
231 			 * we cannot do things like shift the contents and
232 			 * reduce dfl_num_exts to get a contiguous array
233 			 * of non-zero length extents.
234 			 */
235 			need_copy = B_TRUE;
236 			continue;
237 		}
238 
239 		if (dfi->dfi_max_ext_bytes > 0 &&
240 		    len > dfi->dfi_max_ext_bytes) {
241 			/*
242 			 * An extent that's too large. Dispatch what we've
243 			 * accumulated, and then split this extent into
244 			 * smaller ones the device can accept.
245 			 */
246 			if ((r = process_range(dfl, start_idx, i - start_idx,
247 			    func, arg, kmflag)) != 0) {
248 				goto done;
249 			}
250 
251 			if ((r = split_extent(dfl, dfi, i, func, arg,
252 			    kmflag)) != 0) {
253 				goto done;
254 			}
255 			start_idx = i + 1;
256 			n_segs = 0;
257 			n_bytes = 0;
258 			continue;
259 		}
260 
261 		if (dfi->dfi_max_bytes > 0 &&
262 		    n_bytes + len > dfi->dfi_max_bytes) {
263 			/*
264 			 * This extent would put us over the limit for total
265 			 * bytes that can be trimmed in one request.
266 			 * Dispatch what we've accumulated. Then deal
267 			 * with this extent.
268 			 */
269 			if ((r = process_range(dfl, start_idx, i - start_idx,
270 			    func, arg, kmflag)) != 0) {
271 				goto done;
272 			}
273 
274 			if (len < dfi->dfi_max_bytes) {
275 				/*
276 				 * After dispatching what we've accumulated,
277 				 * this extent can fit in a new request
278 				 * Just add it to the accumulated list of
279 				 * extents and move on.
280 				 */
281 				start_idx = i;
282 				n_segs = 1;
283 				n_bytes = len;
284 				continue;
285 			}
286 
287 			/*
288 			 * Even after starting a new request, this extent
289 			 * is too big. Split it until it fits.
290 			 */
291 			if ((r = split_extent(dfl, dfi, i, func, arg,
292 			    kmflag)) != 0) {
293 				goto done;
294 			}
295 
296 			start_idx = i + 1;
297 			n_segs = 0;
298 			n_bytes = 0;
299 			continue;
300 		}
301 
302 		if (dfi->dfi_max_ext > 0 && n_segs + 1 > dfi->dfi_max_ext) {
303 			/*
304 			 * This extent will put us over the limit on the number
305 			 * of extents the device can accept. Dispatch what
306 			 * we've accumulated so far, start a new
307 			 * request, then re-evaluate this extent (in case
308 			 * the extent on it's own is too large).
309 			 */
310 			if ((r = process_range(dfl, start_idx, i - start_idx,
311 			    func, arg, kmflag)) != 0) {
312 				goto done;
313 			}
314 
315 			start_idx = i;
316 			n_segs = 0;
317 			n_bytes = 0;
318 			continue;
319 		}
320 
321 		n_segs++;
322 		n_bytes += len;
323 	}
324 
325 	/*
326 	 * If a copy wasn't required, and we haven't processed a subset of
327 	 * the extents already, we can just use the original request.
328 	 */
329 	if (!need_copy && start_idx == 0) {
330 		return (func(dfl, arg, kmflag));
331 	}
332 
333 	r = process_range(dfl, start_idx, i - start_idx, func, arg, kmflag);
334 
335 done:
336 	dfl_free(dfl);
337 	return (r);
338 }
339 
340 /*
341  * Adjust the start and length of each extent in dfl so that it conforms to
342  * the requirements in dfi. It also verifies that no extent extends beyond
343  * the end of the device (given by len_blk).
344  *
345  * Returns 0 on success, or an error value.
346  */
347 static int
348 adjust_exts(dkioc_free_list_t *dfl, const dkioc_free_info_t *dfi,
349     uint64_t max_off)
350 {
351 	dkioc_free_list_ext_t *exts = dfl->dfl_exts;
352 	/*
353 	 * These must be uint64_t to prevent the P2 macros from truncating
354 	 * the result.
355 	 */
356 	const uint64_t align = dfi->dfi_align;
357 	const uint64_t bsize = (uint64_t)1 << dfi->dfi_bshift;
358 
359 	for (uint64_t i = 0; i < dfl->dfl_num_exts; i++, exts++) {
360 		/*
361 		 * Since there are no known requirements on the value of
362 		 * dfl_offset, it's possible (though odd) to have a scenario
363 		 * where dfl_offset == 1, and dfle_start == 511 (resulting
364 		 * in an actual start offset of 512). As such, we always
365 		 * apply the offset and find the resulting starting offset
366 		 * and length (in bytes) first, then apply any rounding
367 		 * and alignment.
368 		 */
369 		uint64_t start = exts->dfle_start + dfl->dfl_offset;
370 		uint64_t end = start + exts->dfle_length;
371 
372 		/*
373 		 * Make sure after applying dfl->dfl_offset and any alignment
374 		 * adjustments that the results don't overflow.
375 		 */
376 		if (start < dfl->dfl_offset || start > (UINT64_MAX - bsize)) {
377 			return (SET_ERROR(EOVERFLOW));
378 		}
379 
380 		if (end < start) {
381 			return (SET_ERROR(EOVERFLOW));
382 		}
383 
384 		/*
385 		 * Make sure we don't extend past the end of the device
386 		 */
387 		if (end > max_off) {
388 			return (SET_ERROR(EINVAL));
389 		}
390 
391 		start = P2ROUNDUP(start, align);
392 		end = P2ALIGN(end, bsize);
393 
394 		/*
395 		 * Remove the offset so that when it's later applied again,
396 		 * the correct start value is obtained.
397 		 */
398 		exts->dfle_start = start - dfl->dfl_offset;
399 
400 		/*
401 		 * If the original length was less than the block size
402 		 * of the device, we can end up with end < start. If that
403 		 * happens we just set the length to zero.
404 		 */
405 		exts->dfle_length = (end < start) ? 0 : end - start;
406 	}
407 
408 	return (0);
409 }
410 
411 /*
412  * Take a subset of extents from dfl (starting at start_idx, with n entries)
413  * and create a new dkioc_free_list_t, passing that to func.
414  */
415 static int
416 process_range(dkioc_free_list_t *dfl, uint64_t start_idx, uint64_t n,
417     dfl_iter_fn_t func, void *arg, int kmflag)
418 {
419 	dkioc_free_list_t *new_dfl = NULL;
420 	dkioc_free_list_ext_t *new_exts = NULL;
421 	dkioc_free_list_ext_t *exts = dfl->dfl_exts + start_idx;
422 	size_t actual_n = n;
423 	int r = 0;
424 
425 	if (n == 0) {
426 		return (0);
427 	}
428 
429 	/*
430 	 * Ignore any zero length extents. No known devices attach any
431 	 * semantic meaning to such extents, and are likely just a result of
432 	 * narrowing the range of the extent to fit the device alignment
433 	 * requirements. It is possible the original caller submitted a
434 	 * zero length extent, but we ignore those as well. Since we can't
435 	 * communicate partial results back to the caller anyway, it's
436 	 * unclear whether reporting that one of potentially many exents was
437 	 * too small (without being able to identify which one) to the caller
438 	 * of the DKIOCFREE request would be useful.
439 	 */
440 	for (uint64_t i = 0; i < n; i++) {
441 		if (exts[i].dfle_length == 0 && --actual_n == 0) {
442 			return (0);
443 		}
444 	}
445 
446 	new_dfl = kmem_zalloc(DFL_SZ(actual_n), kmflag);
447 	if (new_dfl == NULL) {
448 		return (SET_ERROR(ENOMEM));
449 	}
450 
451 	new_dfl->dfl_flags = dfl->dfl_flags;
452 	new_dfl->dfl_num_exts = actual_n;
453 	new_dfl->dfl_offset = dfl->dfl_offset;
454 	new_exts = new_dfl->dfl_exts;
455 
456 	for (uint64_t i = 0; i < n; i++) {
457 		if (exts[i].dfle_length == 0) {
458 			continue;
459 		}
460 
461 		*new_exts++ = exts[i];
462 	}
463 
464 	return (func(new_dfl, arg, kmflag));
465 }
466 
467 /*
468  * If dfi_max_ext_bytes is set, use as the max segment length,
469  * otherwise use dfi_max_bytes if set, otherwise fallback to UINT64_MAX
470  */
471 #define	MAX_SEGLEN(dfi) \
472 	(((dfi)->dfi_max_ext_bytes > 0) ? (dfi)->dfi_max_ext_bytes :	\
473 	((dfi)->dfi_max_bytes > 0) ? (dfi)->dfi_max_bytes : UINT64_MAX)
474 
475 /*
476  * Split the extent at idx into multiple lists (calling func for each one).
477  */
478 static int
479 split_extent(dkioc_free_list_t *dfl, const dkioc_free_info_t *dfi, uint64_t idx,
480     dfl_iter_fn_t func, void *arg, int kmflag)
481 {
482 	ASSERT3U(idx, <, dfl->dfl_num_exts);
483 
484 	const uint64_t		maxlen = MAX_SEGLEN(dfi);
485 	dkioc_free_list_ext_t	*ext = dfl->dfl_exts + idx;
486 	uint64_t		remain = ext->dfle_length;
487 	int			r;
488 
489 	/*
490 	 * Break the extent into as many single requests as needed. While it
491 	 * would be possible in some circumstances to combine the final chunk
492 	 * of the extent (after splitting) with the remaining extents in the
493 	 * original request, it's not clear there's much benefit from the
494 	 * added complexity. Such behavior could be added in the future if
495 	 * it's determined to be worthwhile.
496 	 */
497 	while (remain > 0) {
498 		uint64_t start = dfl->dfl_offset + ext->dfle_start;
499 		uint64_t len = remain;
500 
501 		/*
502 		 * If we know we have at least one more segment left after
503 		 * the current iteration of this loop, split it so that
504 		 * the next segment starts on an aligned boundary.
505 		 */
506 		if (len > maxlen) {
507 			uint64_t end = P2ALIGN(start + maxlen, dfi->dfi_align);
508 			len = end - start;
509 		}
510 
511 		ext->dfle_length = len;
512 
513 		if ((r = process_range(dfl, idx, 1, func, arg, kmflag)) != 0) {
514 			return (r);
515 		}
516 
517 		ext->dfle_start += len;
518 		remain -= len;
519 	}
520 
521 	return (0);
522 }
523