1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright (C) 2017 Western Digital Corporation or its affiliates. 4 * 5 * This file is released under the GPL. 6 */ 7 8 #include "dm-zoned.h" 9 10 #include <linux/module.h> 11 12 #define DM_MSG_PREFIX "zoned reclaim" 13 14 struct dmz_reclaim { 15 struct dmz_metadata *metadata; 16 17 struct delayed_work work; 18 struct workqueue_struct *wq; 19 20 struct dm_kcopyd_client *kc; 21 struct dm_kcopyd_throttle kc_throttle; 22 int kc_err; 23 24 int dev_idx; 25 26 unsigned long flags; 27 28 /* Last target access time */ 29 unsigned long atime; 30 }; 31 32 /* 33 * Reclaim state flags. 34 */ 35 enum { 36 DMZ_RECLAIM_KCOPY, 37 }; 38 39 /* 40 * Number of seconds of target BIO inactivity to consider the target idle. 41 */ 42 #define DMZ_IDLE_PERIOD (10UL * HZ) 43 44 /* 45 * Percentage of unmapped (free) random zones below which reclaim starts 46 * even if the target is busy. 47 */ 48 #define DMZ_RECLAIM_LOW_UNMAP_ZONES 30 49 50 /* 51 * Percentage of unmapped (free) random zones above which reclaim will 52 * stop if the target is busy. 53 */ 54 #define DMZ_RECLAIM_HIGH_UNMAP_ZONES 50 55 56 /* 57 * Align a sequential zone write pointer to chunk_block. 58 */ 59 static int dmz_reclaim_align_wp(struct dmz_reclaim *zrc, struct dm_zone *zone, 60 sector_t block) 61 { 62 struct dmz_metadata *zmd = zrc->metadata; 63 struct dmz_dev *dev = zone->dev; 64 sector_t wp_block = zone->wp_block; 65 unsigned int nr_blocks; 66 int ret; 67 68 if (wp_block == block) 69 return 0; 70 71 if (wp_block > block) 72 return -EIO; 73 74 /* 75 * Zeroout the space between the write 76 * pointer and the requested position. 77 */ 78 nr_blocks = block - wp_block; 79 ret = blkdev_issue_zeroout(dev->bdev, 80 dmz_start_sect(zmd, zone) + dmz_blk2sect(wp_block), 81 dmz_blk2sect(nr_blocks), GFP_NOIO, 0); 82 if (ret) { 83 dmz_dev_err(dev, 84 "Align zone %u wp %llu to %llu (wp+%u) blocks failed %d", 85 zone->id, (unsigned long long)wp_block, 86 (unsigned long long)block, nr_blocks, ret); 87 dmz_check_bdev(dev); 88 return ret; 89 } 90 91 zone->wp_block = block; 92 93 return 0; 94 } 95 96 /* 97 * dm_kcopyd_copy end notification. 98 */ 99 static void dmz_reclaim_kcopy_end(int read_err, unsigned long write_err, 100 void *context) 101 { 102 struct dmz_reclaim *zrc = context; 103 104 if (read_err || write_err) 105 zrc->kc_err = -EIO; 106 else 107 zrc->kc_err = 0; 108 109 clear_bit_unlock(DMZ_RECLAIM_KCOPY, &zrc->flags); 110 smp_mb__after_atomic(); 111 wake_up_bit(&zrc->flags, DMZ_RECLAIM_KCOPY); 112 } 113 114 /* 115 * Copy valid blocks of src_zone into dst_zone. 116 */ 117 static int dmz_reclaim_copy(struct dmz_reclaim *zrc, 118 struct dm_zone *src_zone, struct dm_zone *dst_zone) 119 { 120 struct dmz_metadata *zmd = zrc->metadata; 121 struct dm_io_region src, dst; 122 sector_t block = 0, end_block; 123 sector_t nr_blocks; 124 sector_t src_zone_block; 125 sector_t dst_zone_block; 126 unsigned long flags = 0; 127 int ret; 128 129 if (dmz_is_seq(src_zone)) 130 end_block = src_zone->wp_block; 131 else 132 end_block = dmz_zone_nr_blocks(zmd); 133 src_zone_block = dmz_start_block(zmd, src_zone); 134 dst_zone_block = dmz_start_block(zmd, dst_zone); 135 136 if (dmz_is_seq(dst_zone)) 137 set_bit(DM_KCOPYD_WRITE_SEQ, &flags); 138 139 while (block < end_block) { 140 if (src_zone->dev->flags & DMZ_BDEV_DYING) 141 return -EIO; 142 if (dst_zone->dev->flags & DMZ_BDEV_DYING) 143 return -EIO; 144 145 if (dmz_reclaim_should_terminate(src_zone)) 146 return -EINTR; 147 148 /* Get a valid region from the source zone */ 149 ret = dmz_first_valid_block(zmd, src_zone, &block); 150 if (ret <= 0) 151 return ret; 152 nr_blocks = ret; 153 154 /* 155 * If we are writing in a sequential zone, we must make sure 156 * that writes are sequential. So Zeroout any eventual hole 157 * between writes. 158 */ 159 if (dmz_is_seq(dst_zone)) { 160 ret = dmz_reclaim_align_wp(zrc, dst_zone, block); 161 if (ret) 162 return ret; 163 } 164 165 src.bdev = src_zone->dev->bdev; 166 src.sector = dmz_blk2sect(src_zone_block + block); 167 src.count = dmz_blk2sect(nr_blocks); 168 169 dst.bdev = dst_zone->dev->bdev; 170 dst.sector = dmz_blk2sect(dst_zone_block + block); 171 dst.count = src.count; 172 173 /* Copy the valid region */ 174 set_bit(DMZ_RECLAIM_KCOPY, &zrc->flags); 175 dm_kcopyd_copy(zrc->kc, &src, 1, &dst, flags, 176 dmz_reclaim_kcopy_end, zrc); 177 178 /* Wait for copy to complete */ 179 wait_on_bit_io(&zrc->flags, DMZ_RECLAIM_KCOPY, 180 TASK_UNINTERRUPTIBLE); 181 if (zrc->kc_err) 182 return zrc->kc_err; 183 184 block += nr_blocks; 185 if (dmz_is_seq(dst_zone)) 186 dst_zone->wp_block = block; 187 } 188 189 return 0; 190 } 191 192 /* 193 * Move valid blocks of dzone buffer zone into dzone (after its write pointer) 194 * and free the buffer zone. 195 */ 196 static int dmz_reclaim_buf(struct dmz_reclaim *zrc, struct dm_zone *dzone) 197 { 198 struct dm_zone *bzone = dzone->bzone; 199 sector_t chunk_block = dzone->wp_block; 200 struct dmz_metadata *zmd = zrc->metadata; 201 int ret; 202 203 DMDEBUG("(%s/%u): Chunk %u, move buf zone %u (weight %u) to data zone %u (weight %u)", 204 dmz_metadata_label(zmd), zrc->dev_idx, 205 dzone->chunk, bzone->id, dmz_weight(bzone), 206 dzone->id, dmz_weight(dzone)); 207 208 /* Flush data zone into the buffer zone */ 209 ret = dmz_reclaim_copy(zrc, bzone, dzone); 210 if (ret < 0) 211 return ret; 212 213 dmz_lock_flush(zmd); 214 215 /* Validate copied blocks */ 216 ret = dmz_merge_valid_blocks(zmd, bzone, dzone, chunk_block); 217 if (ret == 0) { 218 /* Free the buffer zone */ 219 dmz_invalidate_blocks(zmd, bzone, 0, dmz_zone_nr_blocks(zmd)); 220 dmz_lock_map(zmd); 221 dmz_unmap_zone(zmd, bzone); 222 dmz_unlock_zone_reclaim(dzone); 223 dmz_free_zone(zmd, bzone); 224 dmz_unlock_map(zmd); 225 } 226 227 dmz_unlock_flush(zmd); 228 229 return ret; 230 } 231 232 /* 233 * Merge valid blocks of dzone into its buffer zone and free dzone. 234 */ 235 static int dmz_reclaim_seq_data(struct dmz_reclaim *zrc, struct dm_zone *dzone) 236 { 237 unsigned int chunk = dzone->chunk; 238 struct dm_zone *bzone = dzone->bzone; 239 struct dmz_metadata *zmd = zrc->metadata; 240 int ret = 0; 241 242 DMDEBUG("(%s/%u): Chunk %u, move data zone %u (weight %u) to buf zone %u (weight %u)", 243 dmz_metadata_label(zmd), zrc->dev_idx, 244 chunk, dzone->id, dmz_weight(dzone), 245 bzone->id, dmz_weight(bzone)); 246 247 /* Flush data zone into the buffer zone */ 248 ret = dmz_reclaim_copy(zrc, dzone, bzone); 249 if (ret < 0) 250 return ret; 251 252 dmz_lock_flush(zmd); 253 254 /* Validate copied blocks */ 255 ret = dmz_merge_valid_blocks(zmd, dzone, bzone, 0); 256 if (ret == 0) { 257 /* 258 * Free the data zone and remap the chunk to 259 * the buffer zone. 260 */ 261 dmz_invalidate_blocks(zmd, dzone, 0, dmz_zone_nr_blocks(zmd)); 262 dmz_lock_map(zmd); 263 dmz_unmap_zone(zmd, bzone); 264 dmz_unmap_zone(zmd, dzone); 265 dmz_unlock_zone_reclaim(dzone); 266 dmz_free_zone(zmd, dzone); 267 dmz_map_zone(zmd, bzone, chunk); 268 dmz_unlock_map(zmd); 269 } 270 271 dmz_unlock_flush(zmd); 272 273 return ret; 274 } 275 276 /* 277 * Move valid blocks of the random data zone dzone into a free sequential zone. 278 * Once blocks are moved, remap the zone chunk to the sequential zone. 279 */ 280 static int dmz_reclaim_rnd_data(struct dmz_reclaim *zrc, struct dm_zone *dzone) 281 { 282 unsigned int chunk = dzone->chunk; 283 struct dm_zone *szone = NULL; 284 struct dmz_metadata *zmd = zrc->metadata; 285 int ret; 286 int alloc_flags = DMZ_ALLOC_SEQ; 287 288 /* Get a free random or sequential zone */ 289 dmz_lock_map(zmd); 290 again: 291 szone = dmz_alloc_zone(zmd, zrc->dev_idx, 292 alloc_flags | DMZ_ALLOC_RECLAIM); 293 if (!szone && alloc_flags == DMZ_ALLOC_SEQ && dmz_nr_cache_zones(zmd)) { 294 alloc_flags = DMZ_ALLOC_RND; 295 goto again; 296 } 297 dmz_unlock_map(zmd); 298 if (!szone) 299 return -ENOSPC; 300 301 DMDEBUG("(%s/%u): Chunk %u, move %s zone %u (weight %u) to %s zone %u", 302 dmz_metadata_label(zmd), zrc->dev_idx, chunk, 303 dmz_is_cache(dzone) ? "cache" : "rnd", 304 dzone->id, dmz_weight(dzone), 305 dmz_is_rnd(szone) ? "rnd" : "seq", szone->id); 306 307 /* Flush the random data zone into the sequential zone */ 308 ret = dmz_reclaim_copy(zrc, dzone, szone); 309 310 dmz_lock_flush(zmd); 311 312 if (ret == 0) { 313 /* Validate copied blocks */ 314 ret = dmz_copy_valid_blocks(zmd, dzone, szone); 315 } 316 if (ret) { 317 /* Free the sequential zone */ 318 dmz_lock_map(zmd); 319 dmz_free_zone(zmd, szone); 320 dmz_unlock_map(zmd); 321 } else { 322 /* Free the data zone and remap the chunk */ 323 dmz_invalidate_blocks(zmd, dzone, 0, dmz_zone_nr_blocks(zmd)); 324 dmz_lock_map(zmd); 325 dmz_unmap_zone(zmd, dzone); 326 dmz_unlock_zone_reclaim(dzone); 327 dmz_free_zone(zmd, dzone); 328 dmz_map_zone(zmd, szone, chunk); 329 dmz_unlock_map(zmd); 330 } 331 332 dmz_unlock_flush(zmd); 333 334 return ret; 335 } 336 337 /* 338 * Reclaim an empty zone. 339 */ 340 static void dmz_reclaim_empty(struct dmz_reclaim *zrc, struct dm_zone *dzone) 341 { 342 struct dmz_metadata *zmd = zrc->metadata; 343 344 dmz_lock_flush(zmd); 345 dmz_lock_map(zmd); 346 dmz_unmap_zone(zmd, dzone); 347 dmz_unlock_zone_reclaim(dzone); 348 dmz_free_zone(zmd, dzone); 349 dmz_unlock_map(zmd); 350 dmz_unlock_flush(zmd); 351 } 352 353 /* 354 * Test if the target device is idle. 355 */ 356 static inline int dmz_target_idle(struct dmz_reclaim *zrc) 357 { 358 return time_is_before_jiffies(zrc->atime + DMZ_IDLE_PERIOD); 359 } 360 361 /* 362 * Find a candidate zone for reclaim and process it. 363 */ 364 static int dmz_do_reclaim(struct dmz_reclaim *zrc) 365 { 366 struct dmz_metadata *zmd = zrc->metadata; 367 struct dm_zone *dzone; 368 struct dm_zone *rzone; 369 unsigned long start; 370 int ret; 371 372 /* Get a data zone */ 373 dzone = dmz_get_zone_for_reclaim(zmd, zrc->dev_idx, 374 dmz_target_idle(zrc)); 375 if (!dzone) { 376 DMDEBUG("(%s/%u): No zone found to reclaim", 377 dmz_metadata_label(zmd), zrc->dev_idx); 378 return -EBUSY; 379 } 380 rzone = dzone; 381 382 start = jiffies; 383 if (dmz_is_cache(dzone) || dmz_is_rnd(dzone)) { 384 if (!dmz_weight(dzone)) { 385 /* Empty zone */ 386 dmz_reclaim_empty(zrc, dzone); 387 ret = 0; 388 } else { 389 /* 390 * Reclaim the random data zone by moving its 391 * valid data blocks to a free sequential zone. 392 */ 393 ret = dmz_reclaim_rnd_data(zrc, dzone); 394 } 395 } else { 396 struct dm_zone *bzone = dzone->bzone; 397 sector_t chunk_block = 0; 398 399 ret = dmz_first_valid_block(zmd, bzone, &chunk_block); 400 if (ret < 0) 401 goto out; 402 403 if (ret == 0 || chunk_block >= dzone->wp_block) { 404 /* 405 * The buffer zone is empty or its valid blocks are 406 * after the data zone write pointer. 407 */ 408 ret = dmz_reclaim_buf(zrc, dzone); 409 rzone = bzone; 410 } else { 411 /* 412 * Reclaim the data zone by merging it into the 413 * buffer zone so that the buffer zone itself can 414 * be later reclaimed. 415 */ 416 ret = dmz_reclaim_seq_data(zrc, dzone); 417 } 418 } 419 out: 420 if (ret) { 421 if (ret == -EINTR) 422 DMDEBUG("(%s/%u): reclaim zone %u interrupted", 423 dmz_metadata_label(zmd), zrc->dev_idx, 424 rzone->id); 425 else 426 DMDEBUG("(%s/%u): Failed to reclaim zone %u, err %d", 427 dmz_metadata_label(zmd), zrc->dev_idx, 428 rzone->id, ret); 429 dmz_unlock_zone_reclaim(dzone); 430 return ret; 431 } 432 433 ret = dmz_flush_metadata(zrc->metadata); 434 if (ret) { 435 DMDEBUG("(%s/%u): Metadata flush for zone %u failed, err %d", 436 dmz_metadata_label(zmd), zrc->dev_idx, rzone->id, ret); 437 return ret; 438 } 439 440 DMDEBUG("(%s/%u): Reclaimed zone %u in %u ms", 441 dmz_metadata_label(zmd), zrc->dev_idx, 442 rzone->id, jiffies_to_msecs(jiffies - start)); 443 return 0; 444 } 445 446 static unsigned int dmz_reclaim_percentage(struct dmz_reclaim *zrc) 447 { 448 struct dmz_metadata *zmd = zrc->metadata; 449 unsigned int nr_cache = dmz_nr_cache_zones(zmd); 450 unsigned int nr_unmap, nr_zones; 451 452 if (nr_cache) { 453 nr_zones = nr_cache; 454 nr_unmap = dmz_nr_unmap_cache_zones(zmd); 455 } else { 456 nr_zones = dmz_nr_rnd_zones(zmd, zrc->dev_idx); 457 nr_unmap = dmz_nr_unmap_rnd_zones(zmd, zrc->dev_idx); 458 } 459 if (nr_unmap <= 1) 460 return 0; 461 return nr_unmap * 100 / nr_zones; 462 } 463 464 /* 465 * Test if reclaim is necessary. 466 */ 467 static bool dmz_should_reclaim(struct dmz_reclaim *zrc, unsigned int p_unmap) 468 { 469 unsigned int nr_reclaim; 470 471 nr_reclaim = dmz_nr_rnd_zones(zrc->metadata, zrc->dev_idx); 472 473 if (dmz_nr_cache_zones(zrc->metadata)) { 474 /* 475 * The first device in a multi-device 476 * setup only contains cache zones, so 477 * never start reclaim there. 478 */ 479 if (zrc->dev_idx == 0) 480 return false; 481 nr_reclaim += dmz_nr_cache_zones(zrc->metadata); 482 } 483 484 /* Reclaim when idle */ 485 if (dmz_target_idle(zrc) && nr_reclaim) 486 return true; 487 488 /* If there are still plenty of cache zones, do not reclaim */ 489 if (p_unmap >= DMZ_RECLAIM_HIGH_UNMAP_ZONES) 490 return false; 491 492 /* 493 * If the percentage of unmapped cache zones is low, 494 * reclaim even if the target is busy. 495 */ 496 return p_unmap <= DMZ_RECLAIM_LOW_UNMAP_ZONES; 497 } 498 499 /* 500 * Reclaim work function. 501 */ 502 static void dmz_reclaim_work(struct work_struct *work) 503 { 504 struct dmz_reclaim *zrc = container_of(work, struct dmz_reclaim, work.work); 505 struct dmz_metadata *zmd = zrc->metadata; 506 unsigned int p_unmap; 507 int ret; 508 509 if (dmz_dev_is_dying(zmd)) 510 return; 511 512 p_unmap = dmz_reclaim_percentage(zrc); 513 if (!dmz_should_reclaim(zrc, p_unmap)) { 514 mod_delayed_work(zrc->wq, &zrc->work, DMZ_IDLE_PERIOD); 515 return; 516 } 517 518 /* 519 * We need to start reclaiming random zones: set up zone copy 520 * throttling to either go fast if we are very low on random zones 521 * and slower if there are still some free random zones to avoid 522 * as much as possible to negatively impact the user workload. 523 */ 524 if (dmz_target_idle(zrc) || p_unmap < DMZ_RECLAIM_LOW_UNMAP_ZONES / 2) { 525 /* Idle or very low percentage: go fast */ 526 zrc->kc_throttle.throttle = 100; 527 } else { 528 /* Busy but we still have some random zone: throttle */ 529 zrc->kc_throttle.throttle = min(75U, 100U - p_unmap / 2); 530 } 531 532 DMDEBUG("(%s/%u): Reclaim (%u): %s, %u%% free zones (%u/%u cache %u/%u random)", 533 dmz_metadata_label(zmd), zrc->dev_idx, 534 zrc->kc_throttle.throttle, 535 (dmz_target_idle(zrc) ? "Idle" : "Busy"), 536 p_unmap, dmz_nr_unmap_cache_zones(zmd), 537 dmz_nr_cache_zones(zmd), 538 dmz_nr_unmap_rnd_zones(zmd, zrc->dev_idx), 539 dmz_nr_rnd_zones(zmd, zrc->dev_idx)); 540 541 ret = dmz_do_reclaim(zrc); 542 if (ret && ret != -EINTR) { 543 if (!dmz_check_dev(zmd)) 544 return; 545 } 546 547 dmz_schedule_reclaim(zrc); 548 } 549 550 /* 551 * Initialize reclaim. 552 */ 553 int dmz_ctr_reclaim(struct dmz_metadata *zmd, 554 struct dmz_reclaim **reclaim, int idx) 555 { 556 struct dmz_reclaim *zrc; 557 int ret; 558 559 zrc = kzalloc(sizeof(struct dmz_reclaim), GFP_KERNEL); 560 if (!zrc) 561 return -ENOMEM; 562 563 zrc->metadata = zmd; 564 zrc->atime = jiffies; 565 zrc->dev_idx = idx; 566 567 /* Reclaim kcopyd client */ 568 zrc->kc = dm_kcopyd_client_create(&zrc->kc_throttle); 569 if (IS_ERR(zrc->kc)) { 570 ret = PTR_ERR(zrc->kc); 571 zrc->kc = NULL; 572 goto err; 573 } 574 575 /* Reclaim work */ 576 INIT_DELAYED_WORK(&zrc->work, dmz_reclaim_work); 577 zrc->wq = alloc_ordered_workqueue("dmz_rwq_%s_%d", WQ_MEM_RECLAIM, 578 dmz_metadata_label(zmd), idx); 579 if (!zrc->wq) { 580 ret = -ENOMEM; 581 goto err; 582 } 583 584 *reclaim = zrc; 585 queue_delayed_work(zrc->wq, &zrc->work, 0); 586 587 return 0; 588 err: 589 if (zrc->kc) 590 dm_kcopyd_client_destroy(zrc->kc); 591 kfree(zrc); 592 593 return ret; 594 } 595 596 /* 597 * Terminate reclaim. 598 */ 599 void dmz_dtr_reclaim(struct dmz_reclaim *zrc) 600 { 601 cancel_delayed_work_sync(&zrc->work); 602 destroy_workqueue(zrc->wq); 603 dm_kcopyd_client_destroy(zrc->kc); 604 kfree(zrc); 605 } 606 607 /* 608 * Suspend reclaim. 609 */ 610 void dmz_suspend_reclaim(struct dmz_reclaim *zrc) 611 { 612 cancel_delayed_work_sync(&zrc->work); 613 } 614 615 /* 616 * Resume reclaim. 617 */ 618 void dmz_resume_reclaim(struct dmz_reclaim *zrc) 619 { 620 queue_delayed_work(zrc->wq, &zrc->work, DMZ_IDLE_PERIOD); 621 } 622 623 /* 624 * BIO accounting. 625 */ 626 void dmz_reclaim_bio_acc(struct dmz_reclaim *zrc) 627 { 628 zrc->atime = jiffies; 629 } 630 631 /* 632 * Start reclaim if necessary. 633 */ 634 void dmz_schedule_reclaim(struct dmz_reclaim *zrc) 635 { 636 unsigned int p_unmap = dmz_reclaim_percentage(zrc); 637 638 if (dmz_should_reclaim(zrc, p_unmap)) 639 mod_delayed_work(zrc->wq, &zrc->work, 0); 640 } 641