xref: /linux/drivers/md/dm-zoned-reclaim.c (revision 1517d90cfafe0f95fd7863d04e1596f7beb7dfa8)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (C) 2017 Western Digital Corporation or its affiliates.
4  *
5  * This file is released under the GPL.
6  */
7 
8 #include "dm-zoned.h"
9 
10 #include <linux/module.h>
11 
12 #define	DM_MSG_PREFIX		"zoned reclaim"
13 
14 struct dmz_reclaim {
15 	struct dmz_metadata     *metadata;
16 	struct dmz_dev		*dev;
17 
18 	struct delayed_work	work;
19 	struct workqueue_struct *wq;
20 
21 	struct dm_kcopyd_client	*kc;
22 	struct dm_kcopyd_throttle kc_throttle;
23 	int			kc_err;
24 
25 	unsigned long		flags;
26 
27 	/* Last target access time */
28 	unsigned long		atime;
29 };
30 
/*
 * Reclaim state flags (bit numbers in the flags field of struct dmz_reclaim).
 */
enum {
	/* Set before a kcopyd copy is issued, cleared when the copy ends */
	DMZ_RECLAIM_KCOPY,
};

/*
 * Duration of target BIO inactivity (10 seconds, in jiffies) after which
 * the target is considered idle.
 */
#define DMZ_IDLE_PERIOD			(10UL * HZ)

/*
 * Percentage of unmapped (free) random zones at or below which reclaim
 * runs even if the target is busy.
 */
#define DMZ_RECLAIM_LOW_UNMAP_RND	30

/*
 * Percentage of unmapped (free) random zones at or above which reclaim
 * does not run if the target is busy.
 */
#define DMZ_RECLAIM_HIGH_UNMAP_RND	50
54 
55 /*
56  * Align a sequential zone write pointer to chunk_block.
57  */
58 static int dmz_reclaim_align_wp(struct dmz_reclaim *zrc, struct dm_zone *zone,
59 				sector_t block)
60 {
61 	struct dmz_metadata *zmd = zrc->metadata;
62 	sector_t wp_block = zone->wp_block;
63 	unsigned int nr_blocks;
64 	int ret;
65 
66 	if (wp_block == block)
67 		return 0;
68 
69 	if (wp_block > block)
70 		return -EIO;
71 
72 	/*
73 	 * Zeroout the space between the write
74 	 * pointer and the requested position.
75 	 */
76 	nr_blocks = block - wp_block;
77 	ret = blkdev_issue_zeroout(zrc->dev->bdev,
78 				   dmz_start_sect(zmd, zone) + dmz_blk2sect(wp_block),
79 				   dmz_blk2sect(nr_blocks), GFP_NOIO, 0);
80 	if (ret) {
81 		dmz_dev_err(zrc->dev,
82 			    "Align zone %u wp %llu to %llu (wp+%u) blocks failed %d",
83 			    dmz_id(zmd, zone), (unsigned long long)wp_block,
84 			    (unsigned long long)block, nr_blocks, ret);
85 		return ret;
86 	}
87 
88 	zone->wp_block = block;
89 
90 	return 0;
91 }
92 
93 /*
94  * dm_kcopyd_copy end notification.
95  */
96 static void dmz_reclaim_kcopy_end(int read_err, unsigned long write_err,
97 				  void *context)
98 {
99 	struct dmz_reclaim *zrc = context;
100 
101 	if (read_err || write_err)
102 		zrc->kc_err = -EIO;
103 	else
104 		zrc->kc_err = 0;
105 
106 	clear_bit_unlock(DMZ_RECLAIM_KCOPY, &zrc->flags);
107 	smp_mb__after_atomic();
108 	wake_up_bit(&zrc->flags, DMZ_RECLAIM_KCOPY);
109 }
110 
111 /*
112  * Copy valid blocks of src_zone into dst_zone.
113  */
114 static int dmz_reclaim_copy(struct dmz_reclaim *zrc,
115 			    struct dm_zone *src_zone, struct dm_zone *dst_zone)
116 {
117 	struct dmz_metadata *zmd = zrc->metadata;
118 	struct dmz_dev *dev = zrc->dev;
119 	struct dm_io_region src, dst;
120 	sector_t block = 0, end_block;
121 	sector_t nr_blocks;
122 	sector_t src_zone_block;
123 	sector_t dst_zone_block;
124 	unsigned long flags = 0;
125 	int ret;
126 
127 	if (dmz_is_seq(src_zone))
128 		end_block = src_zone->wp_block;
129 	else
130 		end_block = dev->zone_nr_blocks;
131 	src_zone_block = dmz_start_block(zmd, src_zone);
132 	dst_zone_block = dmz_start_block(zmd, dst_zone);
133 
134 	if (dmz_is_seq(dst_zone))
135 		set_bit(DM_KCOPYD_WRITE_SEQ, &flags);
136 
137 	while (block < end_block) {
138 		if (dev->flags & DMZ_BDEV_DYING)
139 			return -EIO;
140 
141 		/* Get a valid region from the source zone */
142 		ret = dmz_first_valid_block(zmd, src_zone, &block);
143 		if (ret <= 0)
144 			return ret;
145 		nr_blocks = ret;
146 
147 		/*
148 		 * If we are writing in a sequential zone, we must make sure
149 		 * that writes are sequential. So Zeroout any eventual hole
150 		 * between writes.
151 		 */
152 		if (dmz_is_seq(dst_zone)) {
153 			ret = dmz_reclaim_align_wp(zrc, dst_zone, block);
154 			if (ret)
155 				return ret;
156 		}
157 
158 		src.bdev = dev->bdev;
159 		src.sector = dmz_blk2sect(src_zone_block + block);
160 		src.count = dmz_blk2sect(nr_blocks);
161 
162 		dst.bdev = dev->bdev;
163 		dst.sector = dmz_blk2sect(dst_zone_block + block);
164 		dst.count = src.count;
165 
166 		/* Copy the valid region */
167 		set_bit(DMZ_RECLAIM_KCOPY, &zrc->flags);
168 		dm_kcopyd_copy(zrc->kc, &src, 1, &dst, flags,
169 			       dmz_reclaim_kcopy_end, zrc);
170 
171 		/* Wait for copy to complete */
172 		wait_on_bit_io(&zrc->flags, DMZ_RECLAIM_KCOPY,
173 			       TASK_UNINTERRUPTIBLE);
174 		if (zrc->kc_err)
175 			return zrc->kc_err;
176 
177 		block += nr_blocks;
178 		if (dmz_is_seq(dst_zone))
179 			dst_zone->wp_block = block;
180 	}
181 
182 	return 0;
183 }
184 
185 /*
186  * Move valid blocks of dzone buffer zone into dzone (after its write pointer)
187  * and free the buffer zone.
188  */
189 static int dmz_reclaim_buf(struct dmz_reclaim *zrc, struct dm_zone *dzone)
190 {
191 	struct dm_zone *bzone = dzone->bzone;
192 	sector_t chunk_block = dzone->wp_block;
193 	struct dmz_metadata *zmd = zrc->metadata;
194 	int ret;
195 
196 	dmz_dev_debug(zrc->dev,
197 		      "Chunk %u, move buf zone %u (weight %u) to data zone %u (weight %u)",
198 		      dzone->chunk, dmz_id(zmd, bzone), dmz_weight(bzone),
199 		      dmz_id(zmd, dzone), dmz_weight(dzone));
200 
201 	/* Flush data zone into the buffer zone */
202 	ret = dmz_reclaim_copy(zrc, bzone, dzone);
203 	if (ret < 0)
204 		return ret;
205 
206 	dmz_lock_flush(zmd);
207 
208 	/* Validate copied blocks */
209 	ret = dmz_merge_valid_blocks(zmd, bzone, dzone, chunk_block);
210 	if (ret == 0) {
211 		/* Free the buffer zone */
212 		dmz_invalidate_blocks(zmd, bzone, 0, zrc->dev->zone_nr_blocks);
213 		dmz_lock_map(zmd);
214 		dmz_unmap_zone(zmd, bzone);
215 		dmz_unlock_zone_reclaim(dzone);
216 		dmz_free_zone(zmd, bzone);
217 		dmz_unlock_map(zmd);
218 	}
219 
220 	dmz_unlock_flush(zmd);
221 
222 	return ret;
223 }
224 
225 /*
226  * Merge valid blocks of dzone into its buffer zone and free dzone.
227  */
228 static int dmz_reclaim_seq_data(struct dmz_reclaim *zrc, struct dm_zone *dzone)
229 {
230 	unsigned int chunk = dzone->chunk;
231 	struct dm_zone *bzone = dzone->bzone;
232 	struct dmz_metadata *zmd = zrc->metadata;
233 	int ret = 0;
234 
235 	dmz_dev_debug(zrc->dev,
236 		      "Chunk %u, move data zone %u (weight %u) to buf zone %u (weight %u)",
237 		      chunk, dmz_id(zmd, dzone), dmz_weight(dzone),
238 		      dmz_id(zmd, bzone), dmz_weight(bzone));
239 
240 	/* Flush data zone into the buffer zone */
241 	ret = dmz_reclaim_copy(zrc, dzone, bzone);
242 	if (ret < 0)
243 		return ret;
244 
245 	dmz_lock_flush(zmd);
246 
247 	/* Validate copied blocks */
248 	ret = dmz_merge_valid_blocks(zmd, dzone, bzone, 0);
249 	if (ret == 0) {
250 		/*
251 		 * Free the data zone and remap the chunk to
252 		 * the buffer zone.
253 		 */
254 		dmz_invalidate_blocks(zmd, dzone, 0, zrc->dev->zone_nr_blocks);
255 		dmz_lock_map(zmd);
256 		dmz_unmap_zone(zmd, bzone);
257 		dmz_unmap_zone(zmd, dzone);
258 		dmz_unlock_zone_reclaim(dzone);
259 		dmz_free_zone(zmd, dzone);
260 		dmz_map_zone(zmd, bzone, chunk);
261 		dmz_unlock_map(zmd);
262 	}
263 
264 	dmz_unlock_flush(zmd);
265 
266 	return ret;
267 }
268 
269 /*
270  * Move valid blocks of the random data zone dzone into a free sequential zone.
271  * Once blocks are moved, remap the zone chunk to the sequential zone.
272  */
273 static int dmz_reclaim_rnd_data(struct dmz_reclaim *zrc, struct dm_zone *dzone)
274 {
275 	unsigned int chunk = dzone->chunk;
276 	struct dm_zone *szone = NULL;
277 	struct dmz_metadata *zmd = zrc->metadata;
278 	int ret;
279 
280 	/* Get a free sequential zone */
281 	dmz_lock_map(zmd);
282 	szone = dmz_alloc_zone(zmd, DMZ_ALLOC_RECLAIM);
283 	dmz_unlock_map(zmd);
284 	if (!szone)
285 		return -ENOSPC;
286 
287 	dmz_dev_debug(zrc->dev,
288 		      "Chunk %u, move rnd zone %u (weight %u) to seq zone %u",
289 		      chunk, dmz_id(zmd, dzone), dmz_weight(dzone),
290 		      dmz_id(zmd, szone));
291 
292 	/* Flush the random data zone into the sequential zone */
293 	ret = dmz_reclaim_copy(zrc, dzone, szone);
294 
295 	dmz_lock_flush(zmd);
296 
297 	if (ret == 0) {
298 		/* Validate copied blocks */
299 		ret = dmz_copy_valid_blocks(zmd, dzone, szone);
300 	}
301 	if (ret) {
302 		/* Free the sequential zone */
303 		dmz_lock_map(zmd);
304 		dmz_free_zone(zmd, szone);
305 		dmz_unlock_map(zmd);
306 	} else {
307 		/* Free the data zone and remap the chunk */
308 		dmz_invalidate_blocks(zmd, dzone, 0, zrc->dev->zone_nr_blocks);
309 		dmz_lock_map(zmd);
310 		dmz_unmap_zone(zmd, dzone);
311 		dmz_unlock_zone_reclaim(dzone);
312 		dmz_free_zone(zmd, dzone);
313 		dmz_map_zone(zmd, szone, chunk);
314 		dmz_unlock_map(zmd);
315 	}
316 
317 	dmz_unlock_flush(zmd);
318 
319 	return ret;
320 }
321 
322 /*
323  * Reclaim an empty zone.
324  */
325 static void dmz_reclaim_empty(struct dmz_reclaim *zrc, struct dm_zone *dzone)
326 {
327 	struct dmz_metadata *zmd = zrc->metadata;
328 
329 	dmz_lock_flush(zmd);
330 	dmz_lock_map(zmd);
331 	dmz_unmap_zone(zmd, dzone);
332 	dmz_unlock_zone_reclaim(dzone);
333 	dmz_free_zone(zmd, dzone);
334 	dmz_unlock_map(zmd);
335 	dmz_unlock_flush(zmd);
336 }
337 
338 /*
339  * Find a candidate zone for reclaim and process it.
340  */
341 static int dmz_do_reclaim(struct dmz_reclaim *zrc)
342 {
343 	struct dmz_metadata *zmd = zrc->metadata;
344 	struct dm_zone *dzone;
345 	struct dm_zone *rzone;
346 	unsigned long start;
347 	int ret;
348 
349 	/* Get a data zone */
350 	dzone = dmz_get_zone_for_reclaim(zmd);
351 	if (IS_ERR(dzone))
352 		return PTR_ERR(dzone);
353 
354 	start = jiffies;
355 
356 	if (dmz_is_rnd(dzone)) {
357 		if (!dmz_weight(dzone)) {
358 			/* Empty zone */
359 			dmz_reclaim_empty(zrc, dzone);
360 			ret = 0;
361 		} else {
362 			/*
363 			 * Reclaim the random data zone by moving its
364 			 * valid data blocks to a free sequential zone.
365 			 */
366 			ret = dmz_reclaim_rnd_data(zrc, dzone);
367 		}
368 		rzone = dzone;
369 
370 	} else {
371 		struct dm_zone *bzone = dzone->bzone;
372 		sector_t chunk_block = 0;
373 
374 		ret = dmz_first_valid_block(zmd, bzone, &chunk_block);
375 		if (ret < 0)
376 			goto out;
377 
378 		if (ret == 0 || chunk_block >= dzone->wp_block) {
379 			/*
380 			 * The buffer zone is empty or its valid blocks are
381 			 * after the data zone write pointer.
382 			 */
383 			ret = dmz_reclaim_buf(zrc, dzone);
384 			rzone = bzone;
385 		} else {
386 			/*
387 			 * Reclaim the data zone by merging it into the
388 			 * buffer zone so that the buffer zone itself can
389 			 * be later reclaimed.
390 			 */
391 			ret = dmz_reclaim_seq_data(zrc, dzone);
392 			rzone = dzone;
393 		}
394 	}
395 out:
396 	if (ret) {
397 		dmz_unlock_zone_reclaim(dzone);
398 		return ret;
399 	}
400 
401 	ret = dmz_flush_metadata(zrc->metadata);
402 	if (ret) {
403 		dmz_dev_debug(zrc->dev,
404 			      "Metadata flush for zone %u failed, err %d\n",
405 			      dmz_id(zmd, rzone), ret);
406 		return ret;
407 	}
408 
409 	dmz_dev_debug(zrc->dev, "Reclaimed zone %u in %u ms",
410 		      dmz_id(zmd, rzone), jiffies_to_msecs(jiffies - start));
411 	return 0;
412 }
413 
414 /*
415  * Test if the target device is idle.
416  */
417 static inline int dmz_target_idle(struct dmz_reclaim *zrc)
418 {
419 	return time_is_before_jiffies(zrc->atime + DMZ_IDLE_PERIOD);
420 }
421 
422 /*
423  * Test if reclaim is necessary.
424  */
425 static bool dmz_should_reclaim(struct dmz_reclaim *zrc)
426 {
427 	struct dmz_metadata *zmd = zrc->metadata;
428 	unsigned int nr_rnd = dmz_nr_rnd_zones(zmd);
429 	unsigned int nr_unmap_rnd = dmz_nr_unmap_rnd_zones(zmd);
430 	unsigned int p_unmap_rnd = nr_unmap_rnd * 100 / nr_rnd;
431 
432 	/* Reclaim when idle */
433 	if (dmz_target_idle(zrc) && nr_unmap_rnd < nr_rnd)
434 		return true;
435 
436 	/* If there are still plenty of random zones, do not reclaim */
437 	if (p_unmap_rnd >= DMZ_RECLAIM_HIGH_UNMAP_RND)
438 		return false;
439 
440 	/*
441 	 * If the percentage of unmapped random zones is low,
442 	 * reclaim even if the target is busy.
443 	 */
444 	return p_unmap_rnd <= DMZ_RECLAIM_LOW_UNMAP_RND;
445 }
446 
447 /*
448  * Reclaim work function.
449  */
450 static void dmz_reclaim_work(struct work_struct *work)
451 {
452 	struct dmz_reclaim *zrc = container_of(work, struct dmz_reclaim, work.work);
453 	struct dmz_metadata *zmd = zrc->metadata;
454 	unsigned int nr_rnd, nr_unmap_rnd;
455 	unsigned int p_unmap_rnd;
456 	int ret;
457 
458 	if (dmz_bdev_is_dying(zrc->dev))
459 		return;
460 
461 	if (!dmz_should_reclaim(zrc)) {
462 		mod_delayed_work(zrc->wq, &zrc->work, DMZ_IDLE_PERIOD);
463 		return;
464 	}
465 
466 	/*
467 	 * We need to start reclaiming random zones: set up zone copy
468 	 * throttling to either go fast if we are very low on random zones
469 	 * and slower if there are still some free random zones to avoid
470 	 * as much as possible to negatively impact the user workload.
471 	 */
472 	nr_rnd = dmz_nr_rnd_zones(zmd);
473 	nr_unmap_rnd = dmz_nr_unmap_rnd_zones(zmd);
474 	p_unmap_rnd = nr_unmap_rnd * 100 / nr_rnd;
475 	if (dmz_target_idle(zrc) || p_unmap_rnd < DMZ_RECLAIM_LOW_UNMAP_RND / 2) {
476 		/* Idle or very low percentage: go fast */
477 		zrc->kc_throttle.throttle = 100;
478 	} else {
479 		/* Busy but we still have some random zone: throttle */
480 		zrc->kc_throttle.throttle = min(75U, 100U - p_unmap_rnd / 2);
481 	}
482 
483 	dmz_dev_debug(zrc->dev,
484 		      "Reclaim (%u): %s, %u%% free rnd zones (%u/%u)",
485 		      zrc->kc_throttle.throttle,
486 		      (dmz_target_idle(zrc) ? "Idle" : "Busy"),
487 		      p_unmap_rnd, nr_unmap_rnd, nr_rnd);
488 
489 	ret = dmz_do_reclaim(zrc);
490 	if (ret) {
491 		dmz_dev_debug(zrc->dev, "Reclaim error %d\n", ret);
492 		if (ret == -EIO)
493 			/*
494 			 * LLD might be performing some error handling sequence
495 			 * at the underlying device. To not interfere, do not
496 			 * attempt to schedule the next reclaim run immediately.
497 			 */
498 			return;
499 	}
500 
501 	dmz_schedule_reclaim(zrc);
502 }
503 
504 /*
505  * Initialize reclaim.
506  */
507 int dmz_ctr_reclaim(struct dmz_dev *dev, struct dmz_metadata *zmd,
508 		    struct dmz_reclaim **reclaim)
509 {
510 	struct dmz_reclaim *zrc;
511 	int ret;
512 
513 	zrc = kzalloc(sizeof(struct dmz_reclaim), GFP_KERNEL);
514 	if (!zrc)
515 		return -ENOMEM;
516 
517 	zrc->dev = dev;
518 	zrc->metadata = zmd;
519 	zrc->atime = jiffies;
520 
521 	/* Reclaim kcopyd client */
522 	zrc->kc = dm_kcopyd_client_create(&zrc->kc_throttle);
523 	if (IS_ERR(zrc->kc)) {
524 		ret = PTR_ERR(zrc->kc);
525 		zrc->kc = NULL;
526 		goto err;
527 	}
528 
529 	/* Reclaim work */
530 	INIT_DELAYED_WORK(&zrc->work, dmz_reclaim_work);
531 	zrc->wq = alloc_ordered_workqueue("dmz_rwq_%s", WQ_MEM_RECLAIM,
532 					  dev->name);
533 	if (!zrc->wq) {
534 		ret = -ENOMEM;
535 		goto err;
536 	}
537 
538 	*reclaim = zrc;
539 	queue_delayed_work(zrc->wq, &zrc->work, 0);
540 
541 	return 0;
542 err:
543 	if (zrc->kc)
544 		dm_kcopyd_client_destroy(zrc->kc);
545 	kfree(zrc);
546 
547 	return ret;
548 }
549 
550 /*
551  * Terminate reclaim.
552  */
553 void dmz_dtr_reclaim(struct dmz_reclaim *zrc)
554 {
555 	cancel_delayed_work_sync(&zrc->work);
556 	destroy_workqueue(zrc->wq);
557 	dm_kcopyd_client_destroy(zrc->kc);
558 	kfree(zrc);
559 }
560 
561 /*
562  * Suspend reclaim.
563  */
564 void dmz_suspend_reclaim(struct dmz_reclaim *zrc)
565 {
566 	cancel_delayed_work_sync(&zrc->work);
567 }
568 
569 /*
570  * Resume reclaim.
571  */
572 void dmz_resume_reclaim(struct dmz_reclaim *zrc)
573 {
574 	queue_delayed_work(zrc->wq, &zrc->work, DMZ_IDLE_PERIOD);
575 }
576 
577 /*
578  * BIO accounting.
579  */
580 void dmz_reclaim_bio_acc(struct dmz_reclaim *zrc)
581 {
582 	zrc->atime = jiffies;
583 }
584 
585 /*
586  * Start reclaim if necessary.
587  */
588 void dmz_schedule_reclaim(struct dmz_reclaim *zrc)
589 {
590 	if (dmz_should_reclaim(zrc))
591 		mod_delayed_work(zrc->wq, &zrc->work, 0);
592 }
593 
594