1 /* 2 * This file and its contents are supplied under the terms of the 3 * Common Development and Distribution License ("CDDL"), version 1.0. 4 * You may only use this file in accordance with the terms of version 5 * 1.0 of the CDDL. 6 * 7 * A full copy of the text of the CDDL should have accompanied this 8 * source. A copy of the CDDL is also available via the Internet at 9 * http://www.illumos.org/license/CDDL. 10 */ 11 12 /* 13 * Copyright 2021 Tintri by DDN, Inc. All rights reserved. 14 * Copyright 2020 Joyent, Inc. 15 */ 16 17 /* needed when building libzpool */ 18 #ifndef _KERNEL 19 #include <sys/zfs_context.h> 20 #endif 21 22 #include <sys/sunddi.h> 23 #include <sys/dkio.h> 24 #include <sys/dkioc_free_util.h> 25 #include <sys/sysmacros.h> 26 #include <sys/file.h> 27 #include <sys/sdt.h> 28 29 static int adjust_exts(dkioc_free_list_t *, const dkioc_free_info_t *, 30 uint64_t len_blk); 31 static int split_extent(dkioc_free_list_t *, const dkioc_free_info_t *, 32 uint64_t, dfl_iter_fn_t, void *, int); 33 static int process_range(dkioc_free_list_t *, uint64_t, uint64_t, 34 dfl_iter_fn_t, void *, int); 35 36 /* 37 * Copy-in convenience function for variable-length dkioc_free_list_t 38 * structures. The pointer to be copied from is in `arg' (may be a pointer 39 * to userspace). A new buffer is allocated and a pointer to it is placed 40 * in `out'. `ddi_flags' indicates whether the pointer is from user- 41 * or kernelspace (FKIOCTL) and `kmflags' are the flags passed to 42 * kmem_zalloc when allocating the new structure. 43 * Returns 0 on success, or an errno on failure. 44 */ 45 int 46 dfl_copyin(void *arg, dkioc_free_list_t **out, int ddi_flags, int kmflags) 47 { 48 dkioc_free_list_t *dfl; 49 50 if (ddi_flags & FKIOCTL) { 51 dkioc_free_list_t *dfl_in = arg; 52 53 if (dfl_in->dfl_num_exts == 0 || 54 dfl_in->dfl_num_exts > DFL_COPYIN_MAX_EXTS) 55 return (SET_ERROR(EINVAL)); 56 dfl = kmem_alloc(DFL_SZ(dfl_in->dfl_num_exts), kmflags); 57 if (dfl == NULL) 58 return (SET_ERROR(ENOMEM)); 59 bcopy(dfl_in, dfl, DFL_SZ(dfl_in->dfl_num_exts)); 60 } else { 61 uint64_t num_exts; 62 63 if (ddi_copyin(((uint8_t *)arg) + offsetof(dkioc_free_list_t, 64 dfl_num_exts), &num_exts, sizeof (num_exts), 65 ddi_flags) != 0) 66 return (SET_ERROR(EFAULT)); 67 if (num_exts == 0 || num_exts > DFL_COPYIN_MAX_EXTS) 68 return (SET_ERROR(EINVAL)); 69 dfl = kmem_alloc(DFL_SZ(num_exts), kmflags); 70 if (dfl == NULL) 71 return (SET_ERROR(ENOMEM)); 72 if (ddi_copyin(arg, dfl, DFL_SZ(num_exts), ddi_flags) != 0 || 73 dfl->dfl_num_exts != num_exts) { 74 kmem_free(dfl, DFL_SZ(num_exts)); 75 return (SET_ERROR(EFAULT)); 76 } 77 } 78 79 *out = dfl; 80 return (0); 81 } 82 83 /* Frees a variable-length dkioc_free_list_t structure. */ 84 void 85 dfl_free(dkioc_free_list_t *dfl) 86 { 87 kmem_free(dfl, DFL_SZ(dfl->dfl_num_exts)); 88 } 89 90 /* 91 * Convenience function to resize and segment the array of extents in 92 * a DKIOCFREE request as required by a driver. 93 * 94 * Some devices that implement DKIOCFREE (e.g. vioblk) have limits 95 * on either the number of extents that can be submitted in a single request, 96 * or the total number of blocks that can be submitted in a single request. 97 * In addition, devices may have alignment requirements on the starting 98 * address stricter than the device block size. 99 * 100 * Since there is currently no mechanism for callers of DKIOCFREE to discover 101 * such restrictions, instead of rejecting any requests that do not conform to 102 * some undiscoverable (to the caller) set of requirements, a driver can use 103 * dfl_iter() to adjust and resegment the extents from a DKIOCFREE call as 104 * required to conform to its requirements. 105 * 106 * The original request is passed as 'dfl' and the alignment requirements 107 * are passed in 'dfi'. Additionally the maximum offset of the device allowed 108 * in bytes) is passed as max_off -- this allows a driver with 109 * multiple instances of different sizes but similar requirements (e.g. 110 * a partitioned blkdev device) to not construct a separate dkioc_free_info_t 111 * struct for each device. 112 * 113 * dfl_iter() will call 'func' with a dkioc_free_list_t and the value of 114 * arg passed to it as needed. If the extents in the dkioc_free_list_t passed 115 * to dfl_iter() meet all the requirements in 'dfi', the dkioc_free_list_t will 116 * be passed on to 'func' unmodified. If any of the extents passed to dfl_iter() 117 * do not meet the requirements, dfl_iter() will allocate new dkioc_free_list_t 118 * instances and populate them with the adjusted extents that do conform to the 119 * requirements in 'dfi'. dfl_iter() will also free the dkioc_free_list_t 120 * passed to it when this occurs. The net result is that 'func' can always 121 * assume it will be called with a dkioc_free_list_t with extents that 122 * comply with the requirements in 'dfi'. 'func' is also responsible for 123 * freeing the dkioc_free_list_t passed to it (likely via a completion 124 * callback). 125 * 126 * Combined with the behavior described above, dfl_iter() can be viewed as 127 * consuming the dkioc_free_list_t passed to it. Either it will pass it along 128 * to 'func' (and let 'func' handle freeing it), or it will free it and 129 * allocate one or more new dkioc_free_list_ts to pass to 'func' (while still 130 * letting 'func' handle freeing the new instances). This way neither the 131 * dfl_iter() caller nor nor the driver need to worry about treating 132 * conforming and non-conforming requests differently. 133 * 134 * Unfortunately, the DKIOCFREE ioctl provides no method for communicating 135 * any notion of partial completion -- either it returns success (0) or 136 * an error. It's not clear if such a notion would even be possible while 137 * supporting multiple types of devices (NVMe, SCSI, etc.) with the same 138 * interface. As such, there's little benefit to providing more detailed error 139 * semantics beyond what DKIOCFREE can handle. 140 * 141 * Due to this, a somewhat simplistic approach is taken to error handling. The 142 * original list of extents is first checked to make sure they all appear 143 * valid -- that is they do not start or extend beyond the end of the device. 144 * Any request that contains such extents is always rejected in it's entirety. 145 * It is possible after applying any needed adjustments to the original list 146 * of extents that the result is not acceptable to the driver. For example, 147 * a device with a 512 byte block size that tries to free the range 513-1023 148 * (bytes) would not be able to be processed. Such extents will be silently 149 * ignored. If the original request consists of nothing but such requests, 150 * dfl_iter() will never call 'func' and will merely return 0. 151 */ 152 int 153 dfl_iter(dkioc_free_list_t *dfl, const dkioc_free_info_t *dfi, uint64_t max_off, 154 dfl_iter_fn_t func, void *arg, int kmflag) 155 { 156 dkioc_free_list_ext_t *ext; 157 uint64_t n_bytes, n_segs, start_idx, i; 158 uint_t bsize = 1U << dfi->dfi_bshift; 159 int r = 0; 160 boolean_t need_copy = B_FALSE; 161 162 /* 163 * Make sure the block size derived from dfi_bshift is at least 512 164 * (1U << DEV_BSHIFT) bytes and less than 2^30. The lower bound is 165 * to prevent any problems with other parts of the system that might 166 * assume a minimum block size of 512, and the upper bound is just 167 * to prevent overflow when creating the block size from dfi_bshift 168 * (though it seems unlikely we'll have _block_ sizes near a GiB 169 * any time soon). 170 */ 171 if (dfi->dfi_bshift < DEV_BSHIFT || dfi->dfi_bshift > 30) { 172 r = SET_ERROR(EINVAL); 173 goto done; 174 } 175 176 /* Max bytes must be a multiple of the block size */ 177 if (!IS_P2ALIGNED(dfi->dfi_max_bytes, bsize)) { 178 r = SET_ERROR(EINVAL); 179 goto done; 180 } 181 182 /* Start offset alignment must also be a multiple of the block size */ 183 if (dfi->dfi_align == 0 || !IS_P2ALIGNED(dfi->dfi_align, bsize)) { 184 r = SET_ERROR(EINVAL); 185 goto done; 186 } 187 188 /* Max bytes in an extent must be a multiple of the block size */ 189 if (!IS_P2ALIGNED(dfi->dfi_max_ext_bytes, bsize)) { 190 r = SET_ERROR(EINVAL); 191 goto done; 192 } 193 194 /* 195 * It makes no sense to allow a single extent to be larger than the 196 * total allowed for an entire request. 197 */ 198 if (dfi->dfi_max_ext_bytes > 0 && 199 dfi->dfi_max_ext_bytes > dfi->dfi_max_bytes) { 200 r = SET_ERROR(EINVAL); 201 goto done; 202 } 203 204 /* 205 * The first pass, align everything as needed and make sure all the 206 * extents look valid. 207 */ 208 if ((r = adjust_exts(dfl, dfi, max_off)) != 0) { 209 goto done; 210 } 211 212 /* 213 * Go through and split things up as needed. The general idea is to 214 * split along the original extent boundaries when needed. We only 215 * split an extent from the original request into multiple extents 216 * if the original extent is by itself too big for the device to 217 * process in a single request. 218 */ 219 start_idx = 0; 220 n_bytes = n_segs = 0; 221 ext = dfl->dfl_exts; 222 for (i = 0; i < dfl->dfl_num_exts; i++, ext++) { 223 uint64_t start = dfl->dfl_offset + ext->dfle_start; 224 uint64_t len = ext->dfle_length; 225 226 if (len == 0) { 227 /* 228 * If we encounter a zero length extent, we're going 229 * to create a new copy of dfl no matter what -- 230 * the size of dfl is determined by dfl_num_exts so 231 * we cannot do things like shift the contents and 232 * reduce dfl_num_exts to get a contiguous array 233 * of non-zero length extents. 234 */ 235 need_copy = B_TRUE; 236 continue; 237 } 238 239 if (dfi->dfi_max_ext_bytes > 0 && 240 len > dfi->dfi_max_ext_bytes) { 241 /* 242 * An extent that's too large. Dispatch what we've 243 * accumulated, and then split this extent into 244 * smaller ones the device can accept. 245 */ 246 if ((r = process_range(dfl, start_idx, i - start_idx, 247 func, arg, kmflag)) != 0) { 248 goto done; 249 } 250 251 if ((r = split_extent(dfl, dfi, i, func, arg, 252 kmflag)) != 0) { 253 goto done; 254 } 255 start_idx = i + 1; 256 n_segs = 0; 257 n_bytes = 0; 258 continue; 259 } 260 261 if (dfi->dfi_max_bytes > 0 && 262 n_bytes + len > dfi->dfi_max_bytes) { 263 /* 264 * This extent would put us over the limit for total 265 * bytes that can be trimmed in one request. 266 * Dispatch what we've accumulated. Then deal 267 * with this extent. 268 */ 269 if ((r = process_range(dfl, start_idx, i - start_idx, 270 func, arg, kmflag)) != 0) { 271 goto done; 272 } 273 274 if (len < dfi->dfi_max_bytes) { 275 /* 276 * After dispatching what we've accumulated, 277 * this extent can fit in a new request 278 * Just add it to the accumulated list of 279 * extents and move on. 280 */ 281 start_idx = i; 282 n_segs = 1; 283 n_bytes = len; 284 continue; 285 } 286 287 /* 288 * Even after starting a new request, this extent 289 * is too big. Split it until it fits. 290 */ 291 if ((r = split_extent(dfl, dfi, i, func, arg, 292 kmflag)) != 0) { 293 goto done; 294 } 295 296 start_idx = i + 1; 297 n_segs = 0; 298 n_bytes = 0; 299 continue; 300 } 301 302 if (dfi->dfi_max_ext > 0 && n_segs + 1 > dfi->dfi_max_ext) { 303 /* 304 * This extent will put us over the limit on the number 305 * of extents the device can accept. Dispatch what 306 * we've accumulated so far. 307 */ 308 if ((r = process_range(dfl, start_idx, i - start_idx, 309 func, arg, kmflag)) != 0) { 310 goto done; 311 } 312 313 start_idx = i; 314 n_segs = 1; 315 n_bytes = len; 316 continue; 317 } 318 319 n_segs++; 320 n_bytes += len; 321 } 322 323 /* 324 * If a copy wasn't required, and we haven't processed a subset of 325 * the extents already, we can just use the original request. 326 */ 327 if (!need_copy && start_idx == 0) { 328 return (func(dfl, arg, kmflag)); 329 } 330 331 r = process_range(dfl, start_idx, i - start_idx, func, arg, kmflag); 332 333 done: 334 dfl_free(dfl); 335 return (r); 336 } 337 338 /* 339 * Adjust the start and length of each extent in dfl so that it conforms to 340 * the requirements in dfi. It also verifies that no extent extends beyond 341 * the end of the device (given by len_blk). 342 * 343 * Returns 0 on success, or an error value. 344 */ 345 static int 346 adjust_exts(dkioc_free_list_t *dfl, const dkioc_free_info_t *dfi, 347 uint64_t max_off) 348 { 349 dkioc_free_list_ext_t *exts = dfl->dfl_exts; 350 /* 351 * These must be uint64_t to prevent the P2 macros from truncating 352 * the result. 353 */ 354 const uint64_t align = dfi->dfi_align; 355 const uint64_t bsize = (uint64_t)1 << dfi->dfi_bshift; 356 357 for (uint64_t i = 0; i < dfl->dfl_num_exts; i++, exts++) { 358 /* 359 * Since there are no known requirements on the value of 360 * dfl_offset, it's possible (though odd) to have a scenario 361 * where dfl_offset == 1, and dfle_start == 511 (resulting 362 * in an actual start offset of 512). As such, we always 363 * apply the offset and find the resulting starting offset 364 * and length (in bytes) first, then apply any rounding 365 * and alignment. 366 */ 367 uint64_t start = exts->dfle_start + dfl->dfl_offset; 368 uint64_t end = start + exts->dfle_length; 369 370 /* 371 * Make sure after applying dfl->dfl_offset and any alignment 372 * adjustments that the results don't overflow. 373 */ 374 if (start < dfl->dfl_offset || start > (UINT64_MAX - bsize)) { 375 return (SET_ERROR(EOVERFLOW)); 376 } 377 378 if (end < start) { 379 return (SET_ERROR(EOVERFLOW)); 380 } 381 382 /* 383 * Make sure we don't extend past the end of the device 384 */ 385 if (end > max_off) { 386 return (SET_ERROR(EINVAL)); 387 } 388 389 start = P2ROUNDUP(start, align); 390 end = P2ALIGN(end, bsize); 391 392 /* 393 * Remove the offset so that when it's later applied again, 394 * the correct start value is obtained. 395 */ 396 exts->dfle_start = start - dfl->dfl_offset; 397 398 /* 399 * If the original length was less than the block size 400 * of the device, we can end up with end < start. If that 401 * happens we just set the length to zero. 402 */ 403 exts->dfle_length = (end < start) ? 0 : end - start; 404 } 405 406 return (0); 407 } 408 409 /* 410 * Take a subset of extents from dfl (starting at start_idx, with n entries) 411 * and create a new dkioc_free_list_t, passing that to func. 412 */ 413 static int 414 process_range(dkioc_free_list_t *dfl, uint64_t start_idx, uint64_t n, 415 dfl_iter_fn_t func, void *arg, int kmflag) 416 { 417 dkioc_free_list_t *new_dfl = NULL; 418 dkioc_free_list_ext_t *new_exts = NULL; 419 dkioc_free_list_ext_t *exts = dfl->dfl_exts + start_idx; 420 size_t actual_n = n; 421 int r = 0; 422 423 if (n == 0) { 424 return (0); 425 } 426 427 /* 428 * Ignore any zero length extents. No known devices attach any 429 * semantic meaning to such extents, and are likely just a result of 430 * narrowing the range of the extent to fit the device alignment 431 * requirements. It is possible the original caller submitted a 432 * zero length extent, but we ignore those as well. Since we can't 433 * communicate partial results back to the caller anyway, it's 434 * unclear whether reporting that one of potentially many exents was 435 * too small (without being able to identify which one) to the caller 436 * of the DKIOCFREE request would be useful. 437 */ 438 for (uint64_t i = 0; i < n; i++) { 439 if (exts[i].dfle_length == 0 && --actual_n == 0) { 440 return (0); 441 } 442 } 443 444 new_dfl = kmem_zalloc(DFL_SZ(actual_n), kmflag); 445 if (new_dfl == NULL) { 446 return (SET_ERROR(ENOMEM)); 447 } 448 449 new_dfl->dfl_flags = dfl->dfl_flags; 450 new_dfl->dfl_num_exts = actual_n; 451 new_dfl->dfl_offset = dfl->dfl_offset; 452 new_exts = new_dfl->dfl_exts; 453 454 for (uint64_t i = 0; i < n; i++) { 455 if (exts[i].dfle_length == 0) { 456 continue; 457 } 458 459 *new_exts++ = exts[i]; 460 } 461 462 return (func(new_dfl, arg, kmflag)); 463 } 464 465 /* 466 * If dfi_max_ext_bytes is set, use as the max segment length, 467 * otherwise use dfi_max_bytes if set, otherwise fallback to UINT64_MAX 468 */ 469 #define MAX_SEGLEN(dfi) \ 470 (((dfi)->dfi_max_ext_bytes > 0) ? (dfi)->dfi_max_ext_bytes : \ 471 ((dfi)->dfi_max_bytes > 0) ? (dfi)->dfi_max_bytes : UINT64_MAX) 472 473 /* 474 * Split the extent at idx into multiple lists (calling func for each one). 475 */ 476 static int 477 split_extent(dkioc_free_list_t *dfl, const dkioc_free_info_t *dfi, uint64_t idx, 478 dfl_iter_fn_t func, void *arg, int kmflag) 479 { 480 ASSERT3U(idx, <, dfl->dfl_num_exts); 481 482 const uint64_t maxlen = MAX_SEGLEN(dfi); 483 dkioc_free_list_ext_t *ext = dfl->dfl_exts + idx; 484 uint64_t remain = ext->dfle_length; 485 int r; 486 487 /* 488 * Break the extent into as many single requests as needed. While it 489 * would be possible in some circumstances to combine the final chunk 490 * of the extent (after splitting) with the remaining extents in the 491 * original request, it's not clear there's much benefit from the 492 * added complexity. Such behavior could be added in the future if 493 * it's determined to be worthwhile. 494 */ 495 while (remain > 0) { 496 uint64_t start = dfl->dfl_offset + ext->dfle_start; 497 uint64_t len = remain; 498 499 /* 500 * If we know we have at least one more segment left after 501 * the current iteration of this loop, split it so that 502 * the next segment starts on an aligned boundary. 503 */ 504 if (len > maxlen) { 505 uint64_t end = P2ALIGN(start + maxlen, dfi->dfi_align); 506 len = end - start; 507 } 508 509 ext->dfle_length = len; 510 511 if ((r = process_range(dfl, idx, 1, func, arg, kmflag)) != 0) { 512 return (r); 513 } 514 515 ext->dfle_start += len; 516 remain -= len; 517 } 518 519 return (0); 520 } 521