/*
 * background writeback - scan btree for dirty data and write it to the backing
 * device
 *
 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */

#include "bcache.h"
#include "btree.h"
#include "debug.h"
#include "writeback.h"

#include <trace/events/bcache.h>

static struct workqueue_struct *dirty_wq;

static void read_dirty(struct closure *);

struct dirty_io {
	struct closure		cl;
	struct cached_dev	*dc;
	struct bio		bio;
};

/* Rate limiting */

/*
 * Adjust the writeback rate with a proportional-derivative (PD) controller:
 * the error term is how far the amount of dirty data on this device is from
 * its share of the dirty data target, damped by the smoothed rate at which
 * the amount of dirty data is changing.
 */
static void __update_writeback_rate(struct cached_dev *dc)
{
	struct cache_set *c = dc->disk.c;
	uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size;
	uint64_t cache_dirty_target =
		div_u64(cache_sectors * dc->writeback_percent, 100);

	int64_t target = div64_u64(cache_dirty_target * bdev_sectors(dc->bdev),
				   c->cached_dev_sectors);

	/* PD controller */

	int change = 0;
	int64_t error;
	int64_t dirty = bcache_dev_sectors_dirty(&dc->disk);
	int64_t derivative = dirty - dc->disk.sectors_dirty_last;

	dc->disk.sectors_dirty_last = dirty;

	derivative *= dc->writeback_rate_d_term;
	derivative = clamp(derivative, -dirty, dirty);

	derivative = ewma_add(dc->disk.sectors_dirty_derivative, derivative,
			      dc->writeback_rate_d_smooth, 0);

	/* Avoid divide by zero */
	if (!target)
		goto out;

	error = div64_s64((dirty + derivative - target) << 8, target);

	change = div_s64((dc->writeback_rate.rate * error) >> 8,
			 dc->writeback_rate_p_term_inverse);

	/* Don't increase writeback rate if the device isn't keeping up */
	if (change > 0 &&
	    time_after64(local_clock(),
			 dc->writeback_rate.next + 10 * NSEC_PER_MSEC))
		change = 0;

	dc->writeback_rate.rate =
		clamp_t(int64_t, dc->writeback_rate.rate + change,
			1, NSEC_PER_MSEC);
out:
	dc->writeback_rate_derivative = derivative;
	dc->writeback_rate_change = change;
	dc->writeback_rate_target = target;

	schedule_delayed_work(&dc->writeback_rate_update,
			      dc->writeback_rate_update_seconds * HZ);
}

static void update_writeback_rate(struct work_struct *work)
{
	struct cached_dev *dc = container_of(to_delayed_work(work),
					     struct cached_dev,
					     writeback_rate_update);

	down_read(&dc->writeback_lock);

	if (atomic_read(&dc->has_dirty) &&
	    dc->writeback_percent)
		__update_writeback_rate(dc);

	up_read(&dc->writeback_lock);
}

static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors)
{
	if (atomic_read(&dc->disk.detaching) ||
	    !dc->writeback_percent)
		return 0;

	return bch_next_delay(&dc->writeback_rate, sectors * 10000000ULL);
}

/* Background writeback */

static bool dirty_pred(struct keybuf *buf, struct bkey *k)
{
	return KEY_DIRTY(k);
}

/*
 * Like dirty_pred(), but only accepts keys for which every backing device
 * stripe the key touches is completely dirty.
 */
static bool dirty_full_stripe_pred(struct keybuf *buf, struct bkey *k)
{
	uint64_t stripe;
	unsigned nr_sectors = KEY_SIZE(k);
	struct cached_dev *dc = container_of(buf, struct cached_dev,
					     writeback_keys);
	unsigned stripe_size = 1 << dc->disk.stripe_size_bits;

	if (!KEY_DIRTY(k))
		return false;

	stripe = KEY_START(k) >> dc->disk.stripe_size_bits;
	while (1) {
		if (atomic_read(dc->disk.stripe_sectors_dirty + stripe) !=
		    stripe_size)
			return false;

		if (nr_sectors <= stripe_size)
			return true;

		nr_sectors -= stripe_size;
		stripe++;
	}
}

static void dirty_init(struct keybuf_key *w)
{
	struct dirty_io *io = w->private;
	struct bio *bio = &io->bio;

	bio_init(bio);
	if (!io->dc->writeback_percent)
		bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));

	bio->bi_size = KEY_SIZE(&w->key) << 9;
	bio->bi_max_vecs = DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS);
	bio->bi_private = w;
	bio->bi_io_vec = bio->bi_inline_vecs;
	bch_bio_map(bio, NULL);
}

static void refill_dirty(struct closure *cl)
{
	struct cached_dev *dc = container_of(cl, struct cached_dev,
					     writeback.cl);
	struct keybuf *buf = &dc->writeback_keys;
	bool searched_from_start = false;
	struct bkey end = MAX_KEY;
	SET_KEY_INODE(&end, dc->disk.id);

	if (!atomic_read(&dc->disk.detaching) &&
	    !dc->writeback_running)
		closure_return(cl);

	down_write(&dc->writeback_lock);

	if (!atomic_read(&dc->has_dirty)) {
		SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);
		bch_write_bdev_super(dc, NULL);

		up_write(&dc->writeback_lock);
		closure_return(cl);
	}

	if (bkey_cmp(&buf->last_scanned, &end) >= 0) {
		buf->last_scanned = KEY(dc->disk.id, 0, 0);
		searched_from_start = true;
	}

	if (dc->partial_stripes_expensive) {
		uint64_t i;

		for (i = 0; i < dc->disk.nr_stripes; i++)
			if (atomic_read(dc->disk.stripe_sectors_dirty + i) ==
			    1 << dc->disk.stripe_size_bits)
				goto full_stripes;

		goto normal_refill;
full_stripes:
		bch_refill_keybuf(dc->disk.c, buf, &end,
				  dirty_full_stripe_pred);
	} else {
normal_refill:
		bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred);
	}

	if (bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start) {
		/* Searched the entire btree - delay awhile */

		if (RB_EMPTY_ROOT(&buf->keys)) {
			atomic_set(&dc->has_dirty, 0);
			cached_dev_put(dc);
		}

		if (!atomic_read(&dc->disk.detaching))
			closure_delay(&dc->writeback, dc->writeback_delay * HZ);
	}

	up_write(&dc->writeback_lock);

	ratelimit_reset(&dc->writeback_rate);

	/* Punt to workqueue only so we don't recurse and blow the stack */
	continue_at(cl, read_dirty, dirty_wq);
}

void bch_writeback_queue(struct cached_dev *dc)
{
	if (closure_trylock(&dc->writeback.cl, &dc->disk.cl)) {
		if (!atomic_read(&dc->disk.detaching))
			closure_delay(&dc->writeback, dc->writeback_delay * HZ);

		continue_at(&dc->writeback.cl, refill_dirty, dirty_wq);
	}
}

void bch_writeback_add(struct cached_dev *dc)
{
	if (!atomic_read(&dc->has_dirty) &&
	    !atomic_xchg(&dc->has_dirty, 1)) {
		atomic_inc(&dc->count);

		if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) {
			SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY);
			/* XXX: should do this synchronously */
			bch_write_bdev_super(dc, NULL);
		}

		bch_writeback_queue(dc);

		if (dc->writeback_percent)
			schedule_delayed_work(&dc->writeback_rate_update,
				      dc->writeback_rate_update_seconds * HZ);
	}
}

/*
 * Update the per-stripe dirty sector counts for a backing device;
 * nr_sectors may be negative when previously dirty sectors become clean.
 */
void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode,
				  uint64_t offset, int nr_sectors)
{
	struct bcache_device *d = c->devices[inode];
	unsigned stripe_size, stripe_offset;
	uint64_t stripe;

	if (!d)
		return;

	stripe_size = 1 << d->stripe_size_bits;
	stripe = offset >> d->stripe_size_bits;
	stripe_offset = offset & (stripe_size - 1);

	while (nr_sectors) {
		int s = min_t(unsigned, abs(nr_sectors),
			      stripe_size - stripe_offset);

		if (nr_sectors < 0)
			s = -s;

		atomic_add(s, d->stripe_sectors_dirty + stripe);
		nr_sectors -= s;
		stripe_offset = 0;
		stripe++;
	}
}

/* Background writeback - IO loop */

static void dirty_io_destructor(struct closure *cl)
{
	struct dirty_io *io = container_of(cl, struct dirty_io, cl);
	kfree(io);
}

static void write_dirty_finish(struct closure *cl)
{
	struct dirty_io *io = container_of(cl, struct dirty_io, cl);
	struct keybuf_key *w = io->bio.bi_private;
	struct cached_dev *dc = io->dc;
	struct bio_vec *bv;
	int i;

	bio_for_each_segment_all(bv, &io->bio, i)
		__free_page(bv->bv_page);

	/* This is kind of a dumb way of signalling errors. */
	if (KEY_DIRTY(&w->key)) {
		unsigned i;
		struct btree_op op;
		bch_btree_op_init_stack(&op);

		op.type = BTREE_REPLACE;
		bkey_copy(&op.replace, &w->key);

		SET_KEY_DIRTY(&w->key, false);
		bch_keylist_add(&op.keys, &w->key);

		for (i = 0; i < KEY_PTRS(&w->key); i++)
			atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin);

		bch_btree_insert(&op, dc->disk.c);
		closure_sync(&op.cl);

		if (op.insert_collision)
			trace_bcache_writeback_collision(&w->key);

		atomic_long_inc(op.insert_collision
				? &dc->disk.c->writeback_keys_failed
				: &dc->disk.c->writeback_keys_done);
	}

	bch_keybuf_del(&dc->writeback_keys, w);
	atomic_dec_bug(&dc->in_flight);

	closure_wake_up(&dc->writeback_wait);

	closure_return_with_destructor(cl, dirty_io_destructor);
}

static void dirty_endio(struct bio *bio, int error)
{
	struct keybuf_key *w = bio->bi_private;
	struct dirty_io *io = w->private;

	if (error)
		SET_KEY_DIRTY(&w->key, false);

	closure_put(&io->cl);
}

static void write_dirty(struct closure *cl)
{
	struct dirty_io *io = container_of(cl, struct dirty_io, cl);
	struct keybuf_key *w = io->bio.bi_private;

	dirty_init(w);
	io->bio.bi_rw = WRITE;
	io->bio.bi_sector = KEY_START(&w->key);
	io->bio.bi_bdev = io->dc->bdev;
	io->bio.bi_end_io = dirty_endio;

	closure_bio_submit(&io->bio, cl, &io->dc->disk);

	continue_at(cl, write_dirty_finish, dirty_wq);
}

static void read_dirty_endio(struct bio *bio, int error)
{
	struct keybuf_key *w = bio->bi_private;
	struct dirty_io *io = w->private;

	bch_count_io_errors(PTR_CACHE(io->dc->disk.c, &w->key, 0),
			    error, "reading dirty data from cache");

	dirty_endio(bio, error);
}

static void read_dirty_submit(struct closure *cl)
{
	struct dirty_io *io = container_of(cl, struct dirty_io, cl);

	closure_bio_submit(&io->bio, cl, &io->dc->disk);

	continue_at(cl, write_dirty, dirty_wq);
}

static void read_dirty(struct closure *cl)
{
	struct cached_dev *dc = container_of(cl, struct cached_dev,
					     writeback.cl);
	unsigned delay = writeback_delay(dc, 0);
	struct keybuf_key *w;
	struct dirty_io *io;

	/*
	 * XXX: if we error, background writeback just spins. Should use some
	 * mempools.
	 */

	while (1) {
		w = bch_keybuf_next(&dc->writeback_keys);
		if (!w)
			break;

		BUG_ON(ptr_stale(dc->disk.c, &w->key, 0));

		if (delay > 0 &&
		    (KEY_START(&w->key) != dc->last_read ||
		     jiffies_to_msecs(delay) > 50)) {
			w->private = NULL;

			closure_delay(&dc->writeback, delay);
			continue_at(cl, read_dirty, dirty_wq);
		}

		dc->last_read = KEY_OFFSET(&w->key);

		io = kzalloc(sizeof(struct dirty_io) + sizeof(struct bio_vec)
			     * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
			     GFP_KERNEL);
		if (!io)
			goto err;

		w->private = io;
		io->dc = dc;

		dirty_init(w);
		io->bio.bi_sector = PTR_OFFSET(&w->key, 0);
		io->bio.bi_bdev = PTR_CACHE(dc->disk.c,
					    &w->key, 0)->bdev;
		io->bio.bi_rw = READ;
		io->bio.bi_end_io = read_dirty_endio;

		if (bio_alloc_pages(&io->bio, GFP_KERNEL))
			goto err_free;

		trace_bcache_writeback(&w->key);

		closure_call(&io->cl, read_dirty_submit, NULL, &dc->disk.cl);

		delay = writeback_delay(dc, KEY_SIZE(&w->key));

		atomic_inc(&dc->in_flight);

		if (!closure_wait_event(&dc->writeback_wait, cl,
					atomic_read(&dc->in_flight) < 64))
			continue_at(cl, read_dirty, dirty_wq);
	}

	if (0) {
err_free:
		kfree(w->private);
err:
		bch_keybuf_del(&dc->writeback_keys, w);
	}

	refill_dirty(cl);
}

/* Init */

static int bch_btree_sectors_dirty_init(struct btree *b, struct btree_op *op,
					struct cached_dev *dc)
{
	struct bkey *k;
	struct btree_iter iter;

	bch_btree_iter_init(b, &iter, &KEY(dc->disk.id, 0, 0));
	while ((k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad)))
		if (!b->level) {
			if (KEY_INODE(k) > dc->disk.id)
				break;

			if (KEY_DIRTY(k))
				bcache_dev_sectors_dirty_add(b->c, dc->disk.id,
							     KEY_START(k),
							     KEY_SIZE(k));
		} else {
			btree(sectors_dirty_init, k, b, op, dc);
			if (KEY_INODE(k) > dc->disk.id)
				break;

			cond_resched();
		}

	return 0;
}

void bch_sectors_dirty_init(struct cached_dev *dc)
{
	struct btree_op op;

	bch_btree_op_init_stack(&op);
	btree_root(sectors_dirty_init, dc->disk.c, &op, dc);
}

void bch_cached_dev_writeback_init(struct cached_dev *dc)
{
	closure_init_unlocked(&dc->writeback);
	init_rwsem(&dc->writeback_lock);

	bch_keybuf_init(&dc->writeback_keys);

	dc->writeback_metadata = true;
	dc->writeback_running = true;
	dc->writeback_percent = 10;
	dc->writeback_delay = 30;
	dc->writeback_rate.rate = 1024;

	dc->writeback_rate_update_seconds = 30;
	dc->writeback_rate_d_term = 16;
	dc->writeback_rate_p_term_inverse = 64;
	dc->writeback_rate_d_smooth = 8;

	INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate);
	schedule_delayed_work(&dc->writeback_rate_update,
			      dc->writeback_rate_update_seconds * HZ);
}

void bch_writeback_exit(void)
{
	if (dirty_wq)
		destroy_workqueue(dirty_wq);
}

int __init bch_writeback_init(void)
{
	dirty_wq = create_singlethread_workqueue("bcache_writeback");
	if (!dirty_wq)
		return -ENOMEM;

	return 0;
}