xref: /linux/drivers/md/dm-zoned-reclaim.c (revision b7019ac550eb3916f34d79db583e9b7ea2524afa)
1 /*
2  * Copyright (C) 2017 Western Digital Corporation or its affiliates.
3  *
4  * This file is released under the GPL.
5  */
6 
7 #include "dm-zoned.h"
8 
9 #include <linux/module.h>
10 
11 #define	DM_MSG_PREFIX		"zoned reclaim"
12 
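/*
 * Reclaim context, one per target: reclaim runs from a dedicated ordered
 * workqueue and uses a kcopyd client to copy the valid blocks of the zone
 * being reclaimed.
 *
 * Roughly how the dm-zoned target drives reclaim (a sketch only, see
 * dm-zoned-target.c for the exact call sites):
 *
 *	dmz_ctr_reclaim(dev, zmd, &dmz->reclaim);	constructor
 *	dmz_reclaim_bio_acc(dmz->reclaim);		on every incoming BIO
 *	dmz_schedule_reclaim(dmz->reclaim);		when free zones run low
 *	dmz_suspend_reclaim(dmz->reclaim);		suspend
 *	dmz_resume_reclaim(dmz->reclaim);		resume
 *	dmz_dtr_reclaim(dmz->reclaim);			destructor
 */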
13 struct dmz_reclaim {
14 	struct dmz_metadata     *metadata;
15 	struct dmz_dev		*dev;
16 
17 	struct delayed_work	work;
18 	struct workqueue_struct *wq;
19 
20 	struct dm_kcopyd_client	*kc;
21 	struct dm_kcopyd_throttle kc_throttle;
22 	int			kc_err;
23 
24 	unsigned long		flags;
25 
26 	/* Last target access time */
27 	unsigned long		atime;
28 };
29 
30 /*
31  * Reclaim state flags.
32  */
33 enum {
34 	DMZ_RECLAIM_KCOPY,	/* A zone copy submitted to kcopyd is in flight */
35 };
36 
37 /*
38  * Number of seconds of target BIO inactivity after which the target is considered idle.
39  */
40 #define DMZ_IDLE_PERIOD		(10UL * HZ)
41 
42 /*
43  * Percentage of unmapped (free) random zones below which reclaim starts
44  * even if the target is busy.
45  */
46 #define DMZ_RECLAIM_LOW_UNMAP_RND	30
47 
48 /*
49  * Percentage of unmapped (free) random zones above which reclaim will
50  * stop if the target is busy.
51  */
52 #define DMZ_RECLAIM_HIGH_UNMAP_RND	50
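
/*
 * Example with 100 random zones: a busy target starts reclaiming once no
 * more than 30 of them are unmapped, and stops reclaiming once at least
 * 50 of them are unmapped again (see dmz_should_reclaim()).
 */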
53 
54 /*
55  * Align a sequential zone write pointer to the specified chunk block.
56  */
57 static int dmz_reclaim_align_wp(struct dmz_reclaim *zrc, struct dm_zone *zone,
58 				sector_t block)
59 {
60 	struct dmz_metadata *zmd = zrc->metadata;
61 	sector_t wp_block = zone->wp_block;
62 	unsigned int nr_blocks;
63 	int ret;
64 
65 	if (wp_block == block)
66 		return 0;
67 
68 	if (wp_block > block)
69 		return -EIO;
70 
71 	/*
72 	 * Zero out the space between the write
73 	 * pointer and the requested position.
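	 * For example, if wp_block is 1000 and block is 1004, blocks 1000
	 * to 1003 are zeroed so that the device write pointer ends up at
	 * block 1004 before the copy of that block is issued.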
74 	 */
75 	nr_blocks = block - wp_block;
76 	ret = blkdev_issue_zeroout(zrc->dev->bdev,
77 				   dmz_start_sect(zmd, zone) + dmz_blk2sect(wp_block),
78 				   dmz_blk2sect(nr_blocks), GFP_NOIO, 0);
79 	if (ret) {
80 		dmz_dev_err(zrc->dev,
81 			    "Align zone %u wp %llu to %llu (wp+%u) blocks failed %d",
82 			    dmz_id(zmd, zone), (unsigned long long)wp_block,
83 			    (unsigned long long)block, nr_blocks, ret);
84 		return ret;
85 	}
86 
87 	zone->wp_block = block;
88 
89 	return 0;
90 }
91 
92 /*
93  * dm_kcopyd_copy end notification.
94  */
95 static void dmz_reclaim_kcopy_end(int read_err, unsigned long write_err,
96 				  void *context)
97 {
98 	struct dmz_reclaim *zrc = context;
99 
100 	if (read_err || write_err)
101 		zrc->kc_err = -EIO;
102 	else
103 		zrc->kc_err = 0;
104 
105 	clear_bit_unlock(DMZ_RECLAIM_KCOPY, &zrc->flags);
106 	smp_mb__after_atomic();
107 	wake_up_bit(&zrc->flags, DMZ_RECLAIM_KCOPY);
108 }
109 
110 /*
111  * Copy valid blocks of src_zone into dst_zone.
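 *
 * Valid blocks are copied one extent at a time using kcopyd, and each copy
 * is waited for synchronously (the DMZ_RECLAIM_KCOPY flag is set while a
 * copy is in flight). When the destination is a sequential zone, its write
 * pointer is first aligned to the copy position so writes stay sequential.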
112  */
113 static int dmz_reclaim_copy(struct dmz_reclaim *zrc,
114 			    struct dm_zone *src_zone, struct dm_zone *dst_zone)
115 {
116 	struct dmz_metadata *zmd = zrc->metadata;
117 	struct dmz_dev *dev = zrc->dev;
118 	struct dm_io_region src, dst;
119 	sector_t block = 0, end_block;
120 	sector_t nr_blocks;
121 	sector_t src_zone_block;
122 	sector_t dst_zone_block;
123 	unsigned long flags = 0;
124 	int ret;
125 
126 	if (dmz_is_seq(src_zone))
127 		end_block = src_zone->wp_block;
128 	else
129 		end_block = dev->zone_nr_blocks;
130 	src_zone_block = dmz_start_block(zmd, src_zone);
131 	dst_zone_block = dmz_start_block(zmd, dst_zone);
132 
133 	if (dmz_is_seq(dst_zone))
134 		set_bit(DM_KCOPYD_WRITE_SEQ, &flags);
135 
136 	while (block < end_block) {
137 		/* Get a valid region from the source zone */
138 		ret = dmz_first_valid_block(zmd, src_zone, &block);
139 		if (ret <= 0)
140 			return ret;
141 		nr_blocks = ret;
142 
143 		/*
144 		 * If we are writing in a sequential zone, we must make sure
145 	 * that writes are sequential, so zero out any hole between
146 	 * consecutive writes.
147 		 */
148 		if (dmz_is_seq(dst_zone)) {
149 			ret = dmz_reclaim_align_wp(zrc, dst_zone, block);
150 			if (ret)
151 				return ret;
152 		}
153 
154 		src.bdev = dev->bdev;
155 		src.sector = dmz_blk2sect(src_zone_block + block);
156 		src.count = dmz_blk2sect(nr_blocks);
157 
158 		dst.bdev = dev->bdev;
159 		dst.sector = dmz_blk2sect(dst_zone_block + block);
160 		dst.count = src.count;
161 
162 		/* Copy the valid region */
163 		set_bit(DMZ_RECLAIM_KCOPY, &zrc->flags);
164 		dm_kcopyd_copy(zrc->kc, &src, 1, &dst, flags,
165 			       dmz_reclaim_kcopy_end, zrc);
166 
167 		/* Wait for copy to complete */
168 		wait_on_bit_io(&zrc->flags, DMZ_RECLAIM_KCOPY,
169 			       TASK_UNINTERRUPTIBLE);
170 		if (zrc->kc_err)
171 			return zrc->kc_err;
172 
173 		block += nr_blocks;
174 		if (dmz_is_seq(dst_zone))
175 			dst_zone->wp_block = block;
176 	}
177 
178 	return 0;
179 }
180 
181 /*
182  * Move valid blocks of dzone's buffer zone into dzone (after its write pointer)
183  * and free the buffer zone.
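 *
 * This is used when all the valid blocks of the buffer zone are located at
 * or after the data zone write pointer, so that they can simply be appended
 * to the data zone before the buffer zone is unmapped and freed.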
184  */
185 static int dmz_reclaim_buf(struct dmz_reclaim *zrc, struct dm_zone *dzone)
186 {
187 	struct dm_zone *bzone = dzone->bzone;
188 	sector_t chunk_block = dzone->wp_block;
189 	struct dmz_metadata *zmd = zrc->metadata;
190 	int ret;
191 
192 	dmz_dev_debug(zrc->dev,
193 		      "Chunk %u, move buf zone %u (weight %u) to data zone %u (weight %u)",
194 		      dzone->chunk, dmz_id(zmd, bzone), dmz_weight(bzone),
195 		      dmz_id(zmd, dzone), dmz_weight(dzone));
196 
197 	/* Copy the buffer zone valid blocks into the data zone */
198 	ret = dmz_reclaim_copy(zrc, bzone, dzone);
199 	if (ret < 0)
200 		return ret;
201 
202 	dmz_lock_flush(zmd);
203 
204 	/* Validate copied blocks */
205 	ret = dmz_merge_valid_blocks(zmd, bzone, dzone, chunk_block);
206 	if (ret == 0) {
207 		/* Free the buffer zone */
208 		dmz_invalidate_blocks(zmd, bzone, 0, zrc->dev->zone_nr_blocks);
209 		dmz_lock_map(zmd);
210 		dmz_unmap_zone(zmd, bzone);
211 		dmz_unlock_zone_reclaim(dzone);
212 		dmz_free_zone(zmd, bzone);
213 		dmz_unlock_map(zmd);
214 	}
215 
216 	dmz_unlock_flush(zmd);
217 
218 	return 0;
219 }
220 
221 /*
222  * Merge valid blocks of dzone into its buffer zone and free dzone.
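 *
 * Once the blocks are merged, the chunk is remapped to the buffer zone and
 * the sequential data zone is unmapped and freed.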
223  */
224 static int dmz_reclaim_seq_data(struct dmz_reclaim *zrc, struct dm_zone *dzone)
225 {
226 	unsigned int chunk = dzone->chunk;
227 	struct dm_zone *bzone = dzone->bzone;
228 	struct dmz_metadata *zmd = zrc->metadata;
229 	int ret = 0;
230 
231 	dmz_dev_debug(zrc->dev,
232 		      "Chunk %u, move data zone %u (weight %u) to buf zone %u (weight %u)",
233 		      chunk, dmz_id(zmd, dzone), dmz_weight(dzone),
234 		      dmz_id(zmd, bzone), dmz_weight(bzone));
235 
236 	/* Flush data zone into the buffer zone */
237 	ret = dmz_reclaim_copy(zrc, dzone, bzone);
238 	if (ret < 0)
239 		return ret;
240 
241 	dmz_lock_flush(zmd);
242 
243 	/* Validate copied blocks */
244 	ret = dmz_merge_valid_blocks(zmd, dzone, bzone, 0);
245 	if (ret == 0) {
246 		/*
247 		 * Free the data zone and remap the chunk to
248 		 * the buffer zone.
249 		 */
250 		dmz_invalidate_blocks(zmd, dzone, 0, zrc->dev->zone_nr_blocks);
251 		dmz_lock_map(zmd);
252 		dmz_unmap_zone(zmd, bzone);
253 		dmz_unmap_zone(zmd, dzone);
254 		dmz_unlock_zone_reclaim(dzone);
255 		dmz_free_zone(zmd, dzone);
256 		dmz_map_zone(zmd, bzone, chunk);
257 		dmz_unlock_map(zmd);
258 	}
259 
260 	dmz_unlock_flush(zmd);
261 
262 	return 0;
263 }
264 
265 /*
266  * Move valid blocks of the random data zone dzone into a free sequential zone.
267  * Once blocks are moved, remap the zone chunk to the sequential zone.
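 *
 * On failure, the newly allocated sequential zone is freed again and the
 * chunk keeps its current random data zone mapping.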
268  */
269 static int dmz_reclaim_rnd_data(struct dmz_reclaim *zrc, struct dm_zone *dzone)
270 {
271 	unsigned int chunk = dzone->chunk;
272 	struct dm_zone *szone = NULL;
273 	struct dmz_metadata *zmd = zrc->metadata;
274 	int ret;
275 
276 	/* Get a free sequential zone */
277 	dmz_lock_map(zmd);
278 	szone = dmz_alloc_zone(zmd, DMZ_ALLOC_RECLAIM);
279 	dmz_unlock_map(zmd);
280 	if (!szone)
281 		return -ENOSPC;
282 
283 	dmz_dev_debug(zrc->dev,
284 		      "Chunk %u, move rnd zone %u (weight %u) to seq zone %u",
285 		      chunk, dmz_id(zmd, dzone), dmz_weight(dzone),
286 		      dmz_id(zmd, szone));
287 
288 	/* Flush the random data zone into the sequential zone */
289 	ret = dmz_reclaim_copy(zrc, dzone, szone);
290 
291 	dmz_lock_flush(zmd);
292 
293 	if (ret == 0) {
294 		/* Validate copied blocks */
295 		ret = dmz_copy_valid_blocks(zmd, dzone, szone);
296 	}
297 	if (ret) {
298 		/* Free the sequential zone */
299 		dmz_lock_map(zmd);
300 		dmz_free_zone(zmd, szone);
301 		dmz_unlock_map(zmd);
302 	} else {
303 		/* Free the data zone and remap the chunk */
304 		dmz_invalidate_blocks(zmd, dzone, 0, zrc->dev->zone_nr_blocks);
305 		dmz_lock_map(zmd);
306 		dmz_unmap_zone(zmd, dzone);
307 		dmz_unlock_zone_reclaim(dzone);
308 		dmz_free_zone(zmd, dzone);
309 		dmz_map_zone(zmd, szone, chunk);
310 		dmz_unlock_map(zmd);
311 	}
312 
313 	dmz_unlock_flush(zmd);
314 
315 	return 0;
316 }
317 
318 /*
319  * Reclaim an empty zone.
320  */
321 static void dmz_reclaim_empty(struct dmz_reclaim *zrc, struct dm_zone *dzone)
322 {
323 	struct dmz_metadata *zmd = zrc->metadata;
324 
325 	dmz_lock_flush(zmd);
326 	dmz_lock_map(zmd);
327 	dmz_unmap_zone(zmd, dzone);
328 	dmz_unlock_zone_reclaim(dzone);
329 	dmz_free_zone(zmd, dzone);
330 	dmz_unlock_map(zmd);
331 	dmz_unlock_flush(zmd);
332 }
333 
334 /*
335  * Find a candidate zone for reclaim and process it.
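 *
 * An empty random zone is simply unmapped and freed. A random zone with
 * valid blocks is moved to a newly allocated sequential zone. For a
 * sequential data zone, either the buffer zone is folded back into the data
 * zone, or the data zone is merged into its buffer zone, depending on where
 * the buffer zone valid blocks are relative to the data zone write pointer.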
336  */
337 static void dmz_reclaim(struct dmz_reclaim *zrc)
338 {
339 	struct dmz_metadata *zmd = zrc->metadata;
340 	struct dm_zone *dzone;
341 	struct dm_zone *rzone;
342 	unsigned long start;
343 	int ret;
344 
345 	/* Get a data zone */
346 	dzone = dmz_get_zone_for_reclaim(zmd);
347 	if (!dzone)
348 		return;
349 
350 	start = jiffies;
351 
352 	if (dmz_is_rnd(dzone)) {
353 		if (!dmz_weight(dzone)) {
354 			/* Empty zone */
355 			dmz_reclaim_empty(zrc, dzone);
356 			ret = 0;
357 		} else {
358 			/*
359 			 * Reclaim the random data zone by moving its
360 			 * valid data blocks to a free sequential zone.
361 			 */
362 			ret = dmz_reclaim_rnd_data(zrc, dzone);
363 		}
364 		rzone = dzone;
365 
366 	} else {
367 		struct dm_zone *bzone = dzone->bzone;
368 		sector_t chunk_block = 0;
369 
370 		ret = dmz_first_valid_block(zmd, bzone, &chunk_block);
371 		if (ret < 0)
372 			goto out;
373 
374 		if (ret == 0 || chunk_block >= dzone->wp_block) {
375 			/*
376 			 * The buffer zone is empty or its valid blocks are
377 			 * after the data zone write pointer.
378 			 */
379 			ret = dmz_reclaim_buf(zrc, dzone);
380 			rzone = bzone;
381 		} else {
382 			/*
383 			 * Reclaim the data zone by merging it into the
384 			 * buffer zone so that the buffer zone itself can
385 			 * be later reclaimed.
386 			 */
387 			ret = dmz_reclaim_seq_data(zrc, dzone);
388 			rzone = dzone;
389 		}
390 	}
391 out:
392 	if (ret) {
393 		dmz_unlock_zone_reclaim(dzone);
394 		return;
395 	}
396 
397 	(void) dmz_flush_metadata(zrc->metadata);
398 
399 	dmz_dev_debug(zrc->dev, "Reclaimed zone %u in %u ms",
400 		      dmz_id(zmd, rzone), jiffies_to_msecs(jiffies - start));
401 }
402 
403 /*
404  * Test if the target device is idle.
405  */
406 static inline int dmz_target_idle(struct dmz_reclaim *zrc)
407 {
408 	return time_is_before_jiffies(zrc->atime + DMZ_IDLE_PERIOD);
409 }
410 
411 /*
412  * Test if reclaim is necessary.
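 *
 * Reclaim is needed when the target is idle and at least one random zone is
 * mapped, or when the percentage of unmapped random zones has dropped to
 * DMZ_RECLAIM_LOW_UNMAP_RND or below.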
413  */
414 static bool dmz_should_reclaim(struct dmz_reclaim *zrc)
415 {
416 	struct dmz_metadata *zmd = zrc->metadata;
417 	unsigned int nr_rnd = dmz_nr_rnd_zones(zmd);
418 	unsigned int nr_unmap_rnd = dmz_nr_unmap_rnd_zones(zmd);
419 	unsigned int p_unmap_rnd = nr_unmap_rnd * 100 / nr_rnd;
420 
421 	/* Reclaim when idle */
422 	if (dmz_target_idle(zrc) && nr_unmap_rnd < nr_rnd)
423 		return true;
424 
425 	/* If there are still plenty of random zones, do not reclaim */
426 	if (p_unmap_rnd >= DMZ_RECLAIM_HIGH_UNMAP_RND)
427 		return false;
428 
429 	/*
430 	 * If the percentage of unmapped random zones is low,
431 	 * reclaim even if the target is busy.
432 	 */
433 	return p_unmap_rnd <= DMZ_RECLAIM_LOW_UNMAP_RND;
434 }
435 
436 /*
437  * Reclaim work function.
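 *
 * Check whether reclaim is needed, set the kcopyd throttle according to the
 * target activity, reclaim one zone and reschedule the work.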
438  */
439 static void dmz_reclaim_work(struct work_struct *work)
440 {
441 	struct dmz_reclaim *zrc = container_of(work, struct dmz_reclaim, work.work);
442 	struct dmz_metadata *zmd = zrc->metadata;
443 	unsigned int nr_rnd, nr_unmap_rnd;
444 	unsigned int p_unmap_rnd;
445 
446 	if (!dmz_should_reclaim(zrc)) {
447 		mod_delayed_work(zrc->wq, &zrc->work, DMZ_IDLE_PERIOD);
448 		return;
449 	}
450 
451 	/*
452 	 * We need to start reclaiming random zones: set up zone copy
453 	 * throttling to go fast if we are very low on free random zones,
454 	 * and slower if some free random zones remain, to limit the impact
455 	 * on the user workload as much as possible.
456 	 */
457 	nr_rnd = dmz_nr_rnd_zones(zmd);
458 	nr_unmap_rnd = dmz_nr_unmap_rnd_zones(zmd);
459 	p_unmap_rnd = nr_unmap_rnd * 100 / nr_rnd;
460 	if (dmz_target_idle(zrc) || p_unmap_rnd < DMZ_RECLAIM_LOW_UNMAP_RND / 2) {
461 		/* Idle or very low percentage: go fast */
462 		zrc->kc_throttle.throttle = 100;
463 	} else {
464 		/* Busy but we still have some free random zones: throttle */
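		/* e.g. p_unmap_rnd == 20 gives min(75U, 100U - 10U) == 75 */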
465 		zrc->kc_throttle.throttle = min(75U, 100U - p_unmap_rnd / 2);
466 	}
467 
468 	dmz_dev_debug(zrc->dev,
469 		      "Reclaim (%u): %s, %u%% free rnd zones (%u/%u)",
470 		      zrc->kc_throttle.throttle,
471 		      (dmz_target_idle(zrc) ? "Idle" : "Busy"),
472 		      p_unmap_rnd, nr_unmap_rnd, nr_rnd);
473 
474 	dmz_reclaim(zrc);
475 
476 	dmz_schedule_reclaim(zrc);
477 }
478 
479 /*
480  * Initialize reclaim.
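 *
 * Allocate the reclaim context, create the kcopyd client and the ordered
 * reclaim workqueue, and queue the reclaim work immediately.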
481  */
482 int dmz_ctr_reclaim(struct dmz_dev *dev, struct dmz_metadata *zmd,
483 		    struct dmz_reclaim **reclaim)
484 {
485 	struct dmz_reclaim *zrc;
486 	int ret;
487 
488 	zrc = kzalloc(sizeof(struct dmz_reclaim), GFP_KERNEL);
489 	if (!zrc)
490 		return -ENOMEM;
491 
492 	zrc->dev = dev;
493 	zrc->metadata = zmd;
494 	zrc->atime = jiffies;
495 
496 	/* Reclaim kcopyd client */
497 	zrc->kc = dm_kcopyd_client_create(&zrc->kc_throttle);
498 	if (IS_ERR(zrc->kc)) {
499 		ret = PTR_ERR(zrc->kc);
500 		zrc->kc = NULL;
501 		goto err;
502 	}
503 
504 	/* Reclaim work */
505 	INIT_DELAYED_WORK(&zrc->work, dmz_reclaim_work);
506 	zrc->wq = alloc_ordered_workqueue("dmz_rwq_%s", WQ_MEM_RECLAIM,
507 					  dev->name);
508 	if (!zrc->wq) {
509 		ret = -ENOMEM;
510 		goto err;
511 	}
512 
513 	*reclaim = zrc;
514 	queue_delayed_work(zrc->wq, &zrc->work, 0);
515 
516 	return 0;
517 err:
518 	if (zrc->kc)
519 		dm_kcopyd_client_destroy(zrc->kc);
520 	kfree(zrc);
521 
522 	return ret;
523 }
524 
525 /*
526  * Terminate reclaim.
527  */
528 void dmz_dtr_reclaim(struct dmz_reclaim *zrc)
529 {
530 	cancel_delayed_work_sync(&zrc->work);
531 	destroy_workqueue(zrc->wq);
532 	dm_kcopyd_client_destroy(zrc->kc);
533 	kfree(zrc);
534 }
535 
536 /*
537  * Suspend reclaim.
538  */
539 void dmz_suspend_reclaim(struct dmz_reclaim *zrc)
540 {
541 	cancel_delayed_work_sync(&zrc->work);
542 }
543 
544 /*
545  * Resume reclaim.
546  */
547 void dmz_resume_reclaim(struct dmz_reclaim *zrc)
548 {
549 	queue_delayed_work(zrc->wq, &zrc->work, DMZ_IDLE_PERIOD);
550 }
551 
552 /*
553  * BIO accounting.
554  */
555 void dmz_reclaim_bio_acc(struct dmz_reclaim *zrc)
556 {
557 	zrc->atime = jiffies;
558 }
559 
560 /*
561  * Start reclaim if necessary.
562  */
563 void dmz_schedule_reclaim(struct dmz_reclaim *zrc)
564 {
565 	if (dmz_should_reclaim(zrc))
566 		mod_delayed_work(zrc->wq, &zrc->work, 0);
567 }
568 
569