/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2017 Nexenta Inc. All rights reserved.
 * Copyright 2020 Joyent, Inc.
 */

/* needed when building libzpool */
#ifndef _KERNEL
#include <sys/zfs_context.h>
#endif

#include <sys/sunddi.h>
#include <sys/dkio.h>
#include <sys/dkioc_free_util.h>
#include <sys/sysmacros.h>
#include <sys/file.h>
#include <sys/sdt.h>

static int adjust_exts(dkioc_free_list_t *, const dkioc_free_info_t *,
    uint64_t max_off);
static int split_extent(dkioc_free_list_t *, const dkioc_free_info_t *,
    uint64_t, dfl_iter_fn_t, void *, int);
static int process_range(dkioc_free_list_t *, uint64_t, uint64_t,
    dfl_iter_fn_t, void *, int);

/*
 * Copy-in convenience function for variable-length dkioc_free_list_t
 * structures. The pointer to be copied from is in `arg' (may be a pointer
 * to userspace). A new buffer is allocated and a pointer to it is placed
 * in `out'. `ddi_flags' indicates whether the pointer is from user- or
 * kernelspace (FKIOCTL) and `kmflags' are the flags passed to kmem_alloc()
 * when allocating the new structure.
 * Returns 0 on success, or an errno on failure.
 */
int
dfl_copyin(void *arg, dkioc_free_list_t **out, int ddi_flags, int kmflags)
{
	dkioc_free_list_t *dfl;

	if (ddi_flags & FKIOCTL) {
		dkioc_free_list_t *dfl_in = arg;

		if (dfl_in->dfl_num_exts == 0 ||
		    dfl_in->dfl_num_exts > DFL_COPYIN_MAX_EXTS)
			return (SET_ERROR(EINVAL));
		dfl = kmem_alloc(DFL_SZ(dfl_in->dfl_num_exts), kmflags);
		if (dfl == NULL)
			return (SET_ERROR(ENOMEM));
		bcopy(dfl_in, dfl, DFL_SZ(dfl_in->dfl_num_exts));
	} else {
		uint64_t num_exts;

		if (ddi_copyin(((uint8_t *)arg) + offsetof(dkioc_free_list_t,
		    dfl_num_exts), &num_exts, sizeof (num_exts),
		    ddi_flags) != 0)
			return (SET_ERROR(EFAULT));
		if (num_exts == 0 || num_exts > DFL_COPYIN_MAX_EXTS)
			return (SET_ERROR(EINVAL));
		dfl = kmem_alloc(DFL_SZ(num_exts), kmflags);
		if (dfl == NULL)
			return (SET_ERROR(ENOMEM));
		if (ddi_copyin(arg, dfl, DFL_SZ(num_exts), ddi_flags) != 0 ||
		    dfl->dfl_num_exts != num_exts) {
			kmem_free(dfl, DFL_SZ(num_exts));
			return (SET_ERROR(EFAULT));
		}
	}

	*out = dfl;
	return (0);
}

/* Frees a variable-length dkioc_free_list_t structure. */
void
dfl_free(dkioc_free_list_t *dfl)
{
	kmem_free(dfl, DFL_SZ(dfl->dfl_num_exts));
}
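
/*
 * An illustrative sketch (the xd_* names are hypothetical and not part of
 * this file) of how a driver's DKIOCFREE handling might use dfl_copyin()
 * and dfl_free():
 *
 *	static int
 *	xd_dkiocfree(dev_t dev, intptr_t arg, int flag, cred_t *credp)
 *	{
 *		dkioc_free_list_t *dfl;
 *		int err;
 *
 *		err = dfl_copyin((void *)arg, &dfl, flag, KM_SLEEP);
 *		if (err != 0)
 *			return (err);
 *
 *		err = xd_issue_free(dev, dfl);
 *		dfl_free(dfl);
 *		return (err);
 *	}
 *
 * Here xd_issue_free() is assumed not to consume the list; a driver that
 * hands the list to dfl_iter() instead transfers ownership, as described
 * below.
 */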

/*
 * Convenience function to resize and segment the array of extents in
 * a DKIOCFREE request as required by a driver.
 *
 * Some devices that implement DKIOCFREE (e.g. vioblk) have limits
 * on either the number of extents that can be submitted in a single request,
 * or the total number of blocks that can be submitted in a single request.
 * In addition, devices may have alignment requirements on the starting
 * address stricter than the device block size.
 *
 * Since there is currently no mechanism for callers of DKIOCFREE to discover
 * such restrictions, instead of rejecting any requests that do not conform to
 * some undiscoverable (to the caller) set of requirements, a driver can use
 * dfl_iter() to adjust and resegment the extents from a DKIOCFREE call as
 * required to conform to its requirements.
 *
 * The original request is passed as 'dfl' and the alignment requirements
 * are passed in 'dfi'. Additionally, the maximum offset of the device
 * (in bytes) is passed as 'max_off' -- this allows a driver with
 * multiple instances of different sizes but similar requirements (e.g.
 * a partitioned blkdev device) to avoid constructing a separate
 * dkioc_free_info_t struct for each device.
 *
 * dfl_iter() will call 'func' with a dkioc_free_list_t and the value of
 * 'arg' passed to it as needed. If the extents in the dkioc_free_list_t passed
 * to dfl_iter() meet all the requirements in 'dfi', the dkioc_free_list_t will
 * be passed on to 'func' unmodified. If any of the extents passed to dfl_iter()
 * do not meet the requirements, dfl_iter() will allocate new dkioc_free_list_t
 * instances and populate them with the adjusted extents that do conform to the
 * requirements in 'dfi'. dfl_iter() will also free the dkioc_free_list_t
 * passed to it when this occurs. The net result is that 'func' can always
 * assume it will be called with a dkioc_free_list_t whose extents
 * comply with the requirements in 'dfi'. 'func' is also responsible for
 * freeing the dkioc_free_list_t passed to it (likely via a completion
 * callback).
 *
 * Combined with the behavior described above, dfl_iter() can be viewed as
 * consuming the dkioc_free_list_t passed to it. Either it will pass it along
 * to 'func' (and let 'func' handle freeing it), or it will free it and
 * allocate one or more new dkioc_free_list_ts to pass to 'func' (while still
 * letting 'func' handle freeing the new instances). This way neither the
 * dfl_iter() caller nor the driver needs to worry about treating
 * conforming and non-conforming requests differently.
 *
 * Unfortunately, the DKIOCFREE ioctl provides no method for communicating
 * any notion of partial completion -- either it returns success (0) or
 * an error. It's not clear if such a notion would even be possible while
 * supporting multiple types of devices (NVMe, SCSI, etc.) with the same
 * interface. As such, there's little benefit to providing more detailed error
 * semantics beyond what DKIOCFREE can handle.
 *
 * Due to this, a somewhat simplistic approach is taken to error handling. The
 * original list of extents is first checked to make sure they all appear
 * valid -- that is, they do not start or extend beyond the end of the device.
 * Any request that contains such extents is always rejected in its entirety.
 * It is possible that after applying any needed adjustments to the original
 * list of extents, the result is not acceptable to the driver. For example,
 * on a device with a 512 byte block size, a request to free the range
 * 513-1023 (bytes) cannot be processed. Such extents will be silently
 * ignored. If the original request consists of nothing but such extents,
 * dfl_iter() will never call 'func' and will merely return 0.
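 *
 * As an illustrative sketch only (the xd_* names below are hypothetical),
 * a driver's DKIOCFREE path might look like:
 *
 *	static int
 *	xd_trim_cb(dkioc_free_list_t *dfl, void *arg, int kmflag)
 *	{
 *		xd_state_t *xd = arg;
 *
 *		return (xd_submit_trim(xd, dfl, kmflag));
 *	}
 *
 * where xd_submit_trim() issues the actual trim/unmap commands and arranges
 * for dfl_free(dfl) to be called once they complete, and the DKIOCFREE
 * handler simply does:
 *
 *	return (dfl_iter(dfl, &xd->xd_free_info, xd->xd_nbytes,
 *	    xd_trim_cb, xd, KM_SLEEP));
 *
 * after which it must not reference or free 'dfl' itself.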
 */
int
dfl_iter(dkioc_free_list_t *dfl, const dkioc_free_info_t *dfi, uint64_t max_off,
    dfl_iter_fn_t func, void *arg, int kmflag)
{
	dkioc_free_list_ext_t *ext;
	uint64_t n_bytes, n_segs, start_idx, i;
	uint_t bsize = 1U << dfi->dfi_bshift;
	int r = 0;
	boolean_t need_copy = B_FALSE;

	/*
	 * Make sure the block size derived from dfi_bshift is at least 512
	 * (1U << DEV_BSHIFT) bytes and no larger than 2^30. The lower bound
	 * is to prevent any problems with other parts of the system that
	 * might assume a minimum block size of 512, and the upper bound is
	 * just to prevent overflow when creating the block size from
	 * dfi_bshift (though it seems unlikely we'll have _block_ sizes near
	 * a GiB any time soon).
	 */
	if (dfi->dfi_bshift < DEV_BSHIFT || dfi->dfi_bshift > 30) {
		r = SET_ERROR(EINVAL);
		goto done;
	}

	/* Max bytes must be a multiple of the block size */
	if (!IS_P2ALIGNED(dfi->dfi_max_bytes, bsize)) {
		r = SET_ERROR(EINVAL);
		goto done;
	}

	/* Start offset alignment must also be a multiple of the block size */
	if (dfi->dfi_align == 0 || !IS_P2ALIGNED(dfi->dfi_align, bsize)) {
		r = SET_ERROR(EINVAL);
		goto done;
	}

	/* Max bytes in an extent must be a multiple of the block size */
	if (!IS_P2ALIGNED(dfi->dfi_max_ext_bytes, bsize)) {
		r = SET_ERROR(EINVAL);
		goto done;
	}

	/*
	 * It makes no sense to allow a single extent to be larger than the
	 * total allowed for an entire request.
	 */
	if (dfi->dfi_max_ext_bytes > 0 &&
	    dfi->dfi_max_ext_bytes > dfi->dfi_max_bytes) {
		r = SET_ERROR(EINVAL);
		goto done;
	}

	/*
	 * In the first pass, align everything as needed and make sure all
	 * the extents look valid.
	 */
	if ((r = adjust_exts(dfl, dfi, max_off)) != 0) {
		goto done;
	}

	/*
	 * Go through and split things up as needed. The general idea is to
	 * split along the original extent boundaries when needed. We only
	 * split an extent from the original request into multiple extents
	 * if the original extent is by itself too big for the device to
	 * process in a single request.
	 */
	start_idx = 0;
	n_bytes = n_segs = 0;
	ext = dfl->dfl_exts;
	for (i = 0; i < dfl->dfl_num_exts; i++, ext++) {
		uint64_t start = dfl->dfl_offset + ext->dfle_start;
		uint64_t len = ext->dfle_length;

		if (len == 0) {
			/*
			 * If we encounter a zero length extent, we're going
			 * to create a new copy of dfl no matter what --
			 * the size of dfl is determined by dfl_num_exts so
			 * we cannot do things like shift the contents and
			 * reduce dfl_num_exts to get a contiguous array
			 * of non-zero length extents.
			 */
			need_copy = B_TRUE;
			continue;
		}

		if (dfi->dfi_max_ext_bytes > 0 &&
		    len > dfi->dfi_max_ext_bytes) {
			/*
			 * An extent that's too large. Dispatch what we've
			 * accumulated, and then split this extent into
			 * smaller ones the device can accept.
			 */
			if ((r = process_range(dfl, start_idx, i - start_idx,
			    func, arg, kmflag)) != 0) {
				goto done;
			}

			if ((r = split_extent(dfl, dfi, i, func, arg,
			    kmflag)) != 0) {
				goto done;
			}
			start_idx = i + 1;
			n_segs = 0;
			n_bytes = 0;
			continue;
		}

		if (dfi->dfi_max_bytes > 0 &&
		    n_bytes + len > dfi->dfi_max_bytes) {
			/*
			 * This extent would put us over the limit for total
			 * bytes that can be trimmed in one request.
			 * Dispatch what we've accumulated. Then deal
			 * with this extent.
			 */
			if ((r = process_range(dfl, start_idx, i - start_idx,
			    func, arg, kmflag)) != 0) {
				goto done;
			}

			if (len < dfi->dfi_max_bytes) {
				/*
				 * After dispatching what we've accumulated,
				 * this extent can fit in a new request.
				 * Just add it to the accumulated list of
				 * extents and move on.
				 */
				start_idx = i;
				n_segs = 1;
				n_bytes = len;
				continue;
			}

			/*
			 * Even after starting a new request, this extent
			 * is too big. Split it until it fits.
			 */
			if ((r = split_extent(dfl, dfi, i, func, arg,
			    kmflag)) != 0) {
				goto done;
			}

			start_idx = i + 1;
			n_segs = 0;
			n_bytes = 0;
			continue;
		}

		if (dfi->dfi_max_ext > 0 && n_segs + 1 > dfi->dfi_max_ext) {
			/*
			 * This extent will put us over the limit on the number
			 * of extents the device can accept. Dispatch what
			 * we've accumulated so far, start a new
			 * request, then re-evaluate this extent (in case
			 * the extent on its own is too large).
			 */
			if ((r = process_range(dfl, start_idx, i - start_idx,
			    func, arg, kmflag)) != 0) {
				goto done;
			}

			start_idx = i;
			n_segs = 0;
			n_bytes = 0;
			continue;
		}

		n_segs++;
		n_bytes += len;
	}

	/*
	 * If a copy wasn't required, and we haven't processed a subset of
	 * the extents already, we can just use the original request.
	 */
	if (!need_copy && start_idx == 0) {
		return (func(dfl, arg, kmflag));
	}

	r = process_range(dfl, start_idx, i - start_idx, func, arg, kmflag);

done:
	dfl_free(dfl);
	return (r);
}

/*
 * Adjust the start and length of each extent in dfl so that it conforms to
 * the requirements in dfi. It also verifies that no extent extends beyond
 * the end of the device (given by max_off).
 *
 * Returns 0 on success, or an error value.
 */
static int
adjust_exts(dkioc_free_list_t *dfl, const dkioc_free_info_t *dfi,
    uint64_t max_off)
{
	dkioc_free_list_ext_t *exts = dfl->dfl_exts;
	/*
	 * These must be uint64_t to prevent the P2 macros from truncating
	 * the result.
	 */
	const uint64_t align = dfi->dfi_align;
	const uint64_t bsize = (uint64_t)1 << dfi->dfi_bshift;

	for (uint64_t i = 0; i < dfl->dfl_num_exts; i++, exts++) {
		/*
		 * Since there are no known requirements on the value of
		 * dfl_offset, it's possible (though odd) to have a scenario
		 * where dfl_offset == 1 and dfle_start == 511 (resulting
		 * in an actual start offset of 512). As such, we always
		 * apply the offset and find the resulting starting offset
		 * and length (in bytes) first, then apply any rounding
		 * and alignment.
		 */
		uint64_t start = exts->dfle_start + dfl->dfl_offset;
		uint64_t end = start + exts->dfle_length;

		/*
		 * Make sure that after applying dfl->dfl_offset and any
		 * alignment adjustments, the results don't overflow.
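		 * (For example, a dfle_start near UINT64_MAX combined with a
		 * non-zero dfl_offset would wrap around and appear to be a
		 * small, otherwise-valid starting offset.)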
		 */
		if (start < dfl->dfl_offset || start > (UINT64_MAX - bsize)) {
			return (SET_ERROR(EOVERFLOW));
		}

		if (end < start) {
			return (SET_ERROR(EOVERFLOW));
		}

		/*
		 * Make sure we don't extend past the end of the device.
		 */
		if (end > max_off) {
			return (SET_ERROR(EINVAL));
		}

		start = P2ROUNDUP(start, align);
		end = P2ALIGN(end, bsize);

		/*
		 * Remove the offset so that when it's later applied again,
		 * the correct start value is obtained.
		 */
		exts->dfle_start = start - dfl->dfl_offset;

		/*
		 * If the original length was less than the block size
		 * of the device, we can end up with end < start. If that
		 * happens we just set the length to zero.
		 */
		exts->dfle_length = (end < start) ? 0 : end - start;
	}

	return (0);
}

/*
 * Take a subset of extents from dfl (starting at start_idx, with n entries)
 * and create a new dkioc_free_list_t, passing that to func.
 */
static int
process_range(dkioc_free_list_t *dfl, uint64_t start_idx, uint64_t n,
    dfl_iter_fn_t func, void *arg, int kmflag)
{
	dkioc_free_list_t *new_dfl = NULL;
	dkioc_free_list_ext_t *new_exts = NULL;
	dkioc_free_list_ext_t *exts = dfl->dfl_exts + start_idx;
	size_t actual_n = n;
	int r = 0;

	if (n == 0) {
		return (0);
	}

	/*
	 * Ignore any zero length extents. No known devices attach any
	 * semantic meaning to such extents, which are likely just a result
	 * of narrowing the range of an extent to fit the device alignment
	 * requirements. It is possible the original caller submitted a
	 * zero length extent, but we ignore those as well. Since we can't
	 * communicate partial results back to the caller anyway, it's
	 * unclear whether reporting to the caller of the DKIOCFREE request
	 * that one of potentially many extents was too small (without being
	 * able to identify which one) would be useful.
	 */
	for (uint64_t i = 0; i < n; i++) {
		if (exts[i].dfle_length == 0 && --actual_n == 0) {
			return (0);
		}
	}

	new_dfl = kmem_zalloc(DFL_SZ(actual_n), kmflag);
	if (new_dfl == NULL) {
		return (SET_ERROR(ENOMEM));
	}

	new_dfl->dfl_flags = dfl->dfl_flags;
	new_dfl->dfl_num_exts = actual_n;
	new_dfl->dfl_offset = dfl->dfl_offset;
	new_exts = new_dfl->dfl_exts;

	for (uint64_t i = 0; i < n; i++) {
		if (exts[i].dfle_length == 0) {
			continue;
		}

		*new_exts++ = exts[i];
	}

	return (func(new_dfl, arg, kmflag));
}

/*
 * If dfi_max_ext_bytes is set, use it as the max segment length,
 * otherwise use dfi_max_bytes if set, otherwise fall back to UINT64_MAX.
 */
#define	MAX_SEGLEN(dfi) \
	(((dfi)->dfi_max_ext_bytes > 0) ? (dfi)->dfi_max_ext_bytes : \
	((dfi)->dfi_max_bytes > 0) ? (dfi)->dfi_max_bytes : UINT64_MAX)

/*
 * Split the extent at idx into multiple lists (calling func for each one).
 */
static int
split_extent(dkioc_free_list_t *dfl, const dkioc_free_info_t *dfi, uint64_t idx,
    dfl_iter_fn_t func, void *arg, int kmflag)
{
	ASSERT3U(idx, <, dfl->dfl_num_exts);

	const uint64_t maxlen = MAX_SEGLEN(dfi);
	dkioc_free_list_ext_t *ext = dfl->dfl_exts + idx;
	uint64_t remain = ext->dfle_length;
	int r;
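
	/*
	 * As an illustration, assume dfi_max_ext_bytes is 1 MiB, dfi_align
	 * is 4096, and the extent to split starts on an aligned boundary
	 * and is 2.5 MiB long: the loop below emits three requests of
	 * 1 MiB, 1 MiB, and 0.5 MiB, with each chunk after the first
	 * beginning on a 4 KiB aligned boundary.
	 */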

	/*
	 * Break the extent into as many single requests as needed. While it
	 * would be possible in some circumstances to combine the final chunk
	 * of the extent (after splitting) with the remaining extents in the
	 * original request, it's not clear there's much benefit from the
	 * added complexity. Such behavior could be added in the future if
	 * it's determined to be worthwhile.
	 */
	while (remain > 0) {
		uint64_t start = dfl->dfl_offset + ext->dfle_start;
		uint64_t len = remain;

		/*
		 * If we know we have at least one more segment left after
		 * the current iteration of this loop, split it so that
		 * the next segment starts on an aligned boundary.
		 */
		if (len > maxlen) {
			uint64_t end = P2ALIGN(start + maxlen, dfi->dfi_align);
			len = end - start;
		}

		ext->dfle_length = len;

		if ((r = process_range(dfl, idx, 1, func, arg, kmflag)) != 0) {
			return (r);
		}

		ext->dfle_start += len;
		remain -= len;
	}

	return (0);
}