1 /*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
6 *
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
10 */
11
12 /*
13 * Copyright 2021 Tintri by DDN, Inc. All rights reserved.
14 * Copyright 2020 Joyent, Inc.
15 */
16
17 /* needed when building libzpool */
18 #ifndef _KERNEL
19 #include <sys/zfs_context.h>
20 #endif
21
22 #include <sys/sunddi.h>
23 #include <sys/dkio.h>
24 #include <sys/dkioc_free_util.h>
25 #include <sys/sysmacros.h>
26 #include <sys/file.h>
27 #include <sys/sdt.h>
28
29 static int adjust_exts(dkioc_free_list_t *, const dkioc_free_info_t *,
30 uint64_t len_blk);
31 static int split_extent(dkioc_free_list_t *, const dkioc_free_info_t *,
32 uint64_t, dfl_iter_fn_t, void *, int);
33 static int process_range(dkioc_free_list_t *, uint64_t, uint64_t,
34 dfl_iter_fn_t, void *, int);
35
36 /*
37 * Copy-in convenience function for variable-length dkioc_free_list_t
38 * structures. The pointer to be copied from is in `arg' (may be a pointer
39 * to userspace). A new buffer is allocated and a pointer to it is placed
40 * in `out'. `ddi_flags' indicates whether the pointer is from user-
41 * or kernelspace (FKIOCTL) and `kmflags' are the flags passed to
 * kmem_alloc when allocating the new structure.
43 * Returns 0 on success, or an errno on failure.
44 */
int
dfl_copyin(void *arg, dkioc_free_list_t **out, int ddi_flags, int kmflags)
{
	dkioc_free_list_t *dfl;

	if (ddi_flags & FKIOCTL) {
		/* Kernel caller: `arg' is already a kernel address. */
		dkioc_free_list_t *dfl_in = arg;

		/* Reject empty or absurdly large extent counts up front. */
		if (dfl_in->dfl_num_exts == 0 ||
		    dfl_in->dfl_num_exts > DFL_COPYIN_MAX_EXTS)
			return (SET_ERROR(EINVAL));
		dfl = kmem_alloc(DFL_SZ(dfl_in->dfl_num_exts), kmflags);
		if (dfl == NULL)
			return (SET_ERROR(ENOMEM));
		bcopy(dfl_in, dfl, DFL_SZ(dfl_in->dfl_num_exts));
	} else {
		uint64_t num_exts;

		/*
		 * Userspace caller: the structure is variable-length, so
		 * first copy in just dfl_num_exts to learn how large a
		 * buffer to allocate, then copy in the whole structure.
		 */
		if (ddi_copyin(((uint8_t *)arg) + offsetof(dkioc_free_list_t,
		    dfl_num_exts), &num_exts, sizeof (num_exts),
		    ddi_flags) != 0)
			return (SET_ERROR(EFAULT));
		if (num_exts == 0 || num_exts > DFL_COPYIN_MAX_EXTS)
			return (SET_ERROR(EINVAL));
		dfl = kmem_alloc(DFL_SZ(num_exts), kmflags);
		if (dfl == NULL)
			return (SET_ERROR(ENOMEM));
		/*
		 * Re-check dfl_num_exts after the second copyin: if the
		 * caller raced and changed it between the two copyins, the
		 * allocated size no longer matches, so fail rather than
		 * operate on a mis-sized buffer.
		 */
		if (ddi_copyin(arg, dfl, DFL_SZ(num_exts), ddi_flags) != 0 ||
		    dfl->dfl_num_exts != num_exts) {
			kmem_free(dfl, DFL_SZ(num_exts));
			return (SET_ERROR(EFAULT));
		}
	}

	*out = dfl;
	return (0);
}
82
83 /* Frees a variable-length dkioc_free_list_t structure. */
84 void
dfl_free(dkioc_free_list_t * dfl)85 dfl_free(dkioc_free_list_t *dfl)
86 {
87 kmem_free(dfl, DFL_SZ(dfl->dfl_num_exts));
88 }
89
90 /*
91 * Convenience function to resize and segment the array of extents in
92 * a DKIOCFREE request as required by a driver.
93 *
94 * Some devices that implement DKIOCFREE (e.g. vioblk) have limits
95 * on either the number of extents that can be submitted in a single request,
96 * or the total number of blocks that can be submitted in a single request.
97 * In addition, devices may have alignment requirements on the starting
98 * address stricter than the device block size.
99 *
100 * Since there is currently no mechanism for callers of DKIOCFREE to discover
101 * such restrictions, instead of rejecting any requests that do not conform to
102 * some undiscoverable (to the caller) set of requirements, a driver can use
103 * dfl_iter() to adjust and resegment the extents from a DKIOCFREE call as
104 * required to conform to its requirements.
105 *
106 * The original request is passed as 'dfl' and the alignment requirements
 * are passed in 'dfi'. Additionally the maximum offset of the device allowed
 * (in bytes) is passed as max_off -- this allows a driver with
109 * multiple instances of different sizes but similar requirements (e.g.
110 * a partitioned blkdev device) to not construct a separate dkioc_free_info_t
111 * struct for each device.
112 *
113 * dfl_iter() will call 'func' with a dkioc_free_list_t and the value of
114 * arg passed to it as needed. If the extents in the dkioc_free_list_t passed
115 * to dfl_iter() meet all the requirements in 'dfi', the dkioc_free_list_t will
116 * be passed on to 'func' unmodified. If any of the extents passed to dfl_iter()
117 * do not meet the requirements, dfl_iter() will allocate new dkioc_free_list_t
118 * instances and populate them with the adjusted extents that do conform to the
119 * requirements in 'dfi'. dfl_iter() will also free the dkioc_free_list_t
120 * passed to it when this occurs. The net result is that 'func' can always
121 * assume it will be called with a dkioc_free_list_t with extents that
122 * comply with the requirements in 'dfi'. 'func' is also responsible for
123 * freeing the dkioc_free_list_t passed to it (likely via a completion
124 * callback).
125 *
126 * Combined with the behavior described above, dfl_iter() can be viewed as
127 * consuming the dkioc_free_list_t passed to it. Either it will pass it along
128 * to 'func' (and let 'func' handle freeing it), or it will free it and
129 * allocate one or more new dkioc_free_list_ts to pass to 'func' (while still
130 * letting 'func' handle freeing the new instances). This way neither the
 * dfl_iter() caller nor the driver need to worry about treating
132 * conforming and non-conforming requests differently.
133 *
134 * Unfortunately, the DKIOCFREE ioctl provides no method for communicating
135 * any notion of partial completion -- either it returns success (0) or
136 * an error. It's not clear if such a notion would even be possible while
137 * supporting multiple types of devices (NVMe, SCSI, etc.) with the same
138 * interface. As such, there's little benefit to providing more detailed error
139 * semantics beyond what DKIOCFREE can handle.
140 *
141 * Due to this, a somewhat simplistic approach is taken to error handling. The
142 * original list of extents is first checked to make sure they all appear
143 * valid -- that is they do not start or extend beyond the end of the device.
 * Any request that contains such extents is always rejected in its entirety.
145 * It is possible after applying any needed adjustments to the original list
146 * of extents that the result is not acceptable to the driver. For example,
147 * a device with a 512 byte block size that tries to free the range 513-1023
148 * (bytes) would not be able to be processed. Such extents will be silently
149 * ignored. If the original request consists of nothing but such requests,
150 * dfl_iter() will never call 'func' and will merely return 0.
151 */
152 int
dfl_iter(dkioc_free_list_t * dfl,const dkioc_free_info_t * dfi,uint64_t max_off,dfl_iter_fn_t func,void * arg,int kmflag)153 dfl_iter(dkioc_free_list_t *dfl, const dkioc_free_info_t *dfi, uint64_t max_off,
154 dfl_iter_fn_t func, void *arg, int kmflag)
155 {
156 dkioc_free_list_ext_t *ext;
157 uint64_t n_bytes, n_segs, start_idx, i;
158 uint_t bsize = 1U << dfi->dfi_bshift;
159 int r = 0;
160 boolean_t need_copy = B_FALSE;
161
162 /*
163 * Make sure the block size derived from dfi_bshift is at least 512
164 * (1U << DEV_BSHIFT) bytes and less than 2^30. The lower bound is
165 * to prevent any problems with other parts of the system that might
166 * assume a minimum block size of 512, and the upper bound is just
167 * to prevent overflow when creating the block size from dfi_bshift
168 * (though it seems unlikely we'll have _block_ sizes near a GiB
169 * any time soon).
170 */
171 if (dfi->dfi_bshift < DEV_BSHIFT || dfi->dfi_bshift > 30) {
172 r = SET_ERROR(EINVAL);
173 goto done;
174 }
175
176 /* Max bytes must be a multiple of the block size */
177 if (!IS_P2ALIGNED(dfi->dfi_max_bytes, bsize)) {
178 r = SET_ERROR(EINVAL);
179 goto done;
180 }
181
182 /* Start offset alignment must also be a multiple of the block size */
183 if (dfi->dfi_align == 0 || !IS_P2ALIGNED(dfi->dfi_align, bsize)) {
184 r = SET_ERROR(EINVAL);
185 goto done;
186 }
187
188 /* Max bytes in an extent must be a multiple of the block size */
189 if (!IS_P2ALIGNED(dfi->dfi_max_ext_bytes, bsize)) {
190 r = SET_ERROR(EINVAL);
191 goto done;
192 }
193
194 /*
195 * It makes no sense to allow a single extent to be larger than the
196 * total allowed for an entire request.
197 */
198 if (dfi->dfi_max_ext_bytes > 0 &&
199 dfi->dfi_max_ext_bytes > dfi->dfi_max_bytes) {
200 r = SET_ERROR(EINVAL);
201 goto done;
202 }
203
204 /*
205 * The first pass, align everything as needed and make sure all the
206 * extents look valid.
207 */
208 if ((r = adjust_exts(dfl, dfi, max_off)) != 0) {
209 goto done;
210 }
211
212 /*
213 * Go through and split things up as needed. The general idea is to
214 * split along the original extent boundaries when needed. We only
215 * split an extent from the original request into multiple extents
216 * if the original extent is by itself too big for the device to
217 * process in a single request.
218 */
219 start_idx = 0;
220 n_bytes = n_segs = 0;
221 ext = dfl->dfl_exts;
222 for (i = 0; i < dfl->dfl_num_exts; i++, ext++) {
223 uint64_t start = dfl->dfl_offset + ext->dfle_start;
224 uint64_t len = ext->dfle_length;
225
226 if (len == 0) {
227 /*
228 * If we encounter a zero length extent, we're going
229 * to create a new copy of dfl no matter what --
230 * the size of dfl is determined by dfl_num_exts so
231 * we cannot do things like shift the contents and
232 * reduce dfl_num_exts to get a contiguous array
233 * of non-zero length extents.
234 */
235 need_copy = B_TRUE;
236 continue;
237 }
238
239 if (dfi->dfi_max_ext_bytes > 0 &&
240 len > dfi->dfi_max_ext_bytes) {
241 /*
242 * An extent that's too large. Dispatch what we've
243 * accumulated, and then split this extent into
244 * smaller ones the device can accept.
245 */
246 if ((r = process_range(dfl, start_idx, i - start_idx,
247 func, arg, kmflag)) != 0) {
248 goto done;
249 }
250
251 if ((r = split_extent(dfl, dfi, i, func, arg,
252 kmflag)) != 0) {
253 goto done;
254 }
255 start_idx = i + 1;
256 n_segs = 0;
257 n_bytes = 0;
258 continue;
259 }
260
261 if (dfi->dfi_max_bytes > 0 &&
262 n_bytes + len > dfi->dfi_max_bytes) {
263 /*
264 * This extent would put us over the limit for total
265 * bytes that can be trimmed in one request.
266 * Dispatch what we've accumulated. Then deal
267 * with this extent.
268 */
269 if ((r = process_range(dfl, start_idx, i - start_idx,
270 func, arg, kmflag)) != 0) {
271 goto done;
272 }
273
274 if (len < dfi->dfi_max_bytes) {
275 /*
276 * After dispatching what we've accumulated,
277 * this extent can fit in a new request
278 * Just add it to the accumulated list of
279 * extents and move on.
280 */
281 start_idx = i;
282 n_segs = 1;
283 n_bytes = len;
284 continue;
285 }
286
287 /*
288 * Even after starting a new request, this extent
289 * is too big. Split it until it fits.
290 */
291 if ((r = split_extent(dfl, dfi, i, func, arg,
292 kmflag)) != 0) {
293 goto done;
294 }
295
296 start_idx = i + 1;
297 n_segs = 0;
298 n_bytes = 0;
299 continue;
300 }
301
302 if (dfi->dfi_max_ext > 0 && n_segs + 1 > dfi->dfi_max_ext) {
303 /*
304 * This extent will put us over the limit on the number
305 * of extents the device can accept. Dispatch what
306 * we've accumulated so far.
307 */
308 if ((r = process_range(dfl, start_idx, i - start_idx,
309 func, arg, kmflag)) != 0) {
310 goto done;
311 }
312
313 start_idx = i;
314 n_segs = 1;
315 n_bytes = len;
316 continue;
317 }
318
319 n_segs++;
320 n_bytes += len;
321 }
322
323 /*
324 * If a copy wasn't required, and we haven't processed a subset of
325 * the extents already, we can just use the original request.
326 */
327 if (!need_copy && start_idx == 0) {
328 return (func(dfl, arg, kmflag));
329 }
330
331 r = process_range(dfl, start_idx, i - start_idx, func, arg, kmflag);
332
333 done:
334 dfl_free(dfl);
335 return (r);
336 }
337
338 /*
339 * Adjust the start and length of each extent in dfl so that it conforms to
340 * the requirements in dfi. It also verifies that no extent extends beyond
 * the end of the device (given by max_off).
342 *
343 * Returns 0 on success, or an error value.
344 */
static int
adjust_exts(dkioc_free_list_t *dfl, const dkioc_free_info_t *dfi,
    uint64_t max_off)
{
	dkioc_free_list_ext_t *exts = dfl->dfl_exts;
	/*
	 * These must be uint64_t to prevent the P2 macros from truncating
	 * the result.
	 */
	const uint64_t align = dfi->dfi_align;
	const uint64_t bsize = (uint64_t)1 << dfi->dfi_bshift;

	for (uint64_t i = 0; i < dfl->dfl_num_exts; i++, exts++) {
		/*
		 * Since there are no known requirements on the value of
		 * dfl_offset, it's possible (though odd) to have a scenario
		 * where dfl_offset == 1, and dfle_start == 511 (resulting
		 * in an actual start offset of 512). As such, we always
		 * apply the offset and find the resulting starting offset
		 * and length (in bytes) first, then apply any rounding
		 * and alignment.
		 */
		uint64_t start = exts->dfle_start + dfl->dfl_offset;
		uint64_t end = start + exts->dfle_length;

		/*
		 * Make sure after applying dfl->dfl_offset and any alignment
		 * adjustments that the results don't overflow.
		 *
		 * NOTE(review): the headroom check uses bsize, but
		 * P2ROUNDUP below rounds up to align, which may be larger
		 * than bsize -- confirm a start within (UINT64_MAX - align,
		 * UINT64_MAX - bsize] cannot occur (in practice the
		 * end > max_off check below bounds start well away from
		 * UINT64_MAX for any real device).
		 */
		if (start < dfl->dfl_offset || start > (UINT64_MAX - bsize)) {
			return (SET_ERROR(EOVERFLOW));
		}

		if (end < start) {
			return (SET_ERROR(EOVERFLOW));
		}

		/*
		 * Make sure we don't extend past the end of the device
		 */
		if (end > max_off) {
			return (SET_ERROR(EINVAL));
		}

		/*
		 * Round start up and end down so the adjusted extent never
		 * covers bytes outside the originally requested range.
		 */
		start = P2ROUNDUP(start, align);
		end = P2ALIGN(end, bsize);

		/*
		 * Remove the offset so that when it's later applied again,
		 * the correct start value is obtained.
		 */
		exts->dfle_start = start - dfl->dfl_offset;

		/*
		 * If the original length was less than the block size
		 * of the device, we can end up with end < start. If that
		 * happens we just set the length to zero.
		 */
		exts->dfle_length = (end < start) ? 0 : end - start;
	}

	return (0);
}
408
409 /*
410 * Take a subset of extents from dfl (starting at start_idx, with n entries)
411 * and create a new dkioc_free_list_t, passing that to func.
412 */
413 static int
process_range(dkioc_free_list_t * dfl,uint64_t start_idx,uint64_t n,dfl_iter_fn_t func,void * arg,int kmflag)414 process_range(dkioc_free_list_t *dfl, uint64_t start_idx, uint64_t n,
415 dfl_iter_fn_t func, void *arg, int kmflag)
416 {
417 dkioc_free_list_t *new_dfl = NULL;
418 dkioc_free_list_ext_t *new_exts = NULL;
419 dkioc_free_list_ext_t *exts = dfl->dfl_exts + start_idx;
420 size_t actual_n = n;
421 int r = 0;
422
423 if (n == 0) {
424 return (0);
425 }
426
427 /*
428 * Ignore any zero length extents. No known devices attach any
429 * semantic meaning to such extents, and are likely just a result of
430 * narrowing the range of the extent to fit the device alignment
431 * requirements. It is possible the original caller submitted a
432 * zero length extent, but we ignore those as well. Since we can't
433 * communicate partial results back to the caller anyway, it's
434 * unclear whether reporting that one of potentially many exents was
435 * too small (without being able to identify which one) to the caller
436 * of the DKIOCFREE request would be useful.
437 */
438 for (uint64_t i = 0; i < n; i++) {
439 if (exts[i].dfle_length == 0 && --actual_n == 0) {
440 return (0);
441 }
442 }
443
444 new_dfl = kmem_zalloc(DFL_SZ(actual_n), kmflag);
445 if (new_dfl == NULL) {
446 return (SET_ERROR(ENOMEM));
447 }
448
449 new_dfl->dfl_flags = dfl->dfl_flags;
450 new_dfl->dfl_num_exts = actual_n;
451 new_dfl->dfl_offset = dfl->dfl_offset;
452 new_exts = new_dfl->dfl_exts;
453
454 for (uint64_t i = 0; i < n; i++) {
455 if (exts[i].dfle_length == 0) {
456 continue;
457 }
458
459 *new_exts++ = exts[i];
460 }
461
462 return (func(new_dfl, arg, kmflag));
463 }
464
465 /*
466 * If dfi_max_ext_bytes is set, use as the max segment length,
467 * otherwise use dfi_max_bytes if set, otherwise fallback to UINT64_MAX
468 */
#define	MAX_SEGLEN(dfi)	\
	(((dfi)->dfi_max_ext_bytes > 0) ? (dfi)->dfi_max_ext_bytes :	\
	(((dfi)->dfi_max_bytes > 0) ? (dfi)->dfi_max_bytes : UINT64_MAX))
472
473 /*
474 * Split the extent at idx into multiple lists (calling func for each one).
475 */
static int
split_extent(dkioc_free_list_t *dfl, const dkioc_free_info_t *dfi, uint64_t idx,
    dfl_iter_fn_t func, void *arg, int kmflag)
{
	ASSERT3U(idx, <, dfl->dfl_num_exts);

	const uint64_t maxlen = MAX_SEGLEN(dfi);
	dkioc_free_list_ext_t *ext = dfl->dfl_exts + idx;
	uint64_t remain = ext->dfle_length;
	int r;

	/*
	 * Break the extent into as many single requests as needed. While it
	 * would be possible in some circumstances to combine the final chunk
	 * of the extent (after splitting) with the remaining extents in the
	 * original request, it's not clear there's much benefit from the
	 * added complexity. Such behavior could be added in the future if
	 * it's determined to be worthwhile.
	 */
	while (remain > 0) {
		uint64_t start = dfl->dfl_offset + ext->dfle_start;
		uint64_t len = remain;

		/*
		 * If we know we have at least one more segment left after
		 * the current iteration of this loop, split it so that
		 * the next segment starts on an aligned boundary.
		 *
		 * NOTE(review): this assumes start is already aligned to
		 * dfi_align (adjust_exts() rounds it up) and that
		 * dfi_align <= maxlen; if dfi_align ever exceeded maxlen,
		 * len could become 0 and the loop would not terminate --
		 * confirm callers cannot supply that combination.
		 */
		if (len > maxlen) {
			uint64_t end = P2ALIGN(start + maxlen, dfi->dfi_align);
			len = end - start;
		}

		/* Temporarily narrow the extent to the chunk being issued. */
		ext->dfle_length = len;

		if ((r = process_range(dfl, idx, 1, func, arg, kmflag)) != 0) {
			return (r);
		}

		/* Advance past the chunk just dispatched. */
		ext->dfle_start += len;
		remain -= len;
	}

	return (0);
}
521