1 /* -*- mode: c; c-basic-offset: 8; -*- 2 * vim: noexpandtab sw=8 ts=8 sts=0: 3 * 4 * uptodate.c 5 * 6 * Tracking the up-to-date-ness of a local buffer_head with respect to 7 * the cluster. 8 * 9 * Copyright (C) 2002, 2004, 2005 Oracle. All rights reserved. 10 * 11 * This program is free software; you can redistribute it and/or 12 * modify it under the terms of the GNU General Public 13 * License as published by the Free Software Foundation; either 14 * version 2 of the License, or (at your option) any later version. 15 * 16 * This program is distributed in the hope that it will be useful, 17 * but WITHOUT ANY WARRANTY; without even the implied warranty of 18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 19 * General Public License for more details. 20 * 21 * You should have received a copy of the GNU General Public 22 * License along with this program; if not, write to the 23 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 24 * Boston, MA 021110-1307, USA. 25 * 26 * Standard buffer head caching flags (uptodate, etc) are insufficient 27 * in a clustered environment - a buffer may be marked up to date on 28 * our local node but could have been modified by another cluster 29 * member. As a result an additional (and performant) caching scheme 30 * is required. A further requirement is that we consume as little 31 * memory as possible - we never pin buffer_head structures in order 32 * to cache them. 33 * 34 * We track the existence of up to date buffers on the inodes which 35 * are associated with them. Because we don't want to pin 36 * buffer_heads, this is only a (strong) hint and several other checks 37 * are made in the I/O path to ensure that we don't use a stale or 38 * invalid buffer without going to disk: 39 * - buffer_jbd is used liberally - if a bh is in the journal on 40 * this node then it *must* be up to date. 
41 * - the standard buffer_uptodate() macro is used to detect buffers 42 * which may be invalid (even if we have an up to date tracking 43 * item for them) 44 * 45 * For a full understanding of how this code works together, one 46 * should read the callers in dlmglue.c, the I/O functions in 47 * buffer_head_io.c and ocfs2_journal_access in journal.c 48 */ 49 50 #include <linux/fs.h> 51 #include <linux/types.h> 52 #include <linux/slab.h> 53 #include <linux/highmem.h> 54 #include <linux/buffer_head.h> 55 #include <linux/rbtree.h> 56 #include <linux/jbd.h> 57 58 #define MLOG_MASK_PREFIX ML_UPTODATE 59 60 #include <cluster/masklog.h> 61 62 #include "ocfs2.h" 63 64 #include "inode.h" 65 #include "uptodate.h" 66 67 struct ocfs2_meta_cache_item { 68 struct rb_node c_node; 69 sector_t c_block; 70 }; 71 72 static kmem_cache_t *ocfs2_uptodate_cachep = NULL; 73 74 void ocfs2_metadata_cache_init(struct inode *inode) 75 { 76 struct ocfs2_inode_info *oi = OCFS2_I(inode); 77 struct ocfs2_caching_info *ci = &oi->ip_metadata_cache; 78 79 oi->ip_flags |= OCFS2_INODE_CACHE_INLINE; 80 ci->ci_num_cached = 0; 81 } 82 83 /* No lock taken here as 'root' is not expected to be visible to other 84 * processes. */ 85 static unsigned int ocfs2_purge_copied_metadata_tree(struct rb_root *root) 86 { 87 unsigned int purged = 0; 88 struct rb_node *node; 89 struct ocfs2_meta_cache_item *item; 90 91 while ((node = rb_last(root)) != NULL) { 92 item = rb_entry(node, struct ocfs2_meta_cache_item, c_node); 93 94 mlog(0, "Purge item %llu\n", 95 (unsigned long long) item->c_block); 96 97 rb_erase(&item->c_node, root); 98 kmem_cache_free(ocfs2_uptodate_cachep, item); 99 100 purged++; 101 } 102 return purged; 103 } 104 105 /* Called from locking and called from ocfs2_clear_inode. Dump the 106 * cache for a given inode. 
107 * 108 * This function is a few more lines longer than necessary due to some 109 * accounting done here, but I think it's worth tracking down those 110 * bugs sooner -- Mark */ 111 void ocfs2_metadata_cache_purge(struct inode *inode) 112 { 113 struct ocfs2_inode_info *oi = OCFS2_I(inode); 114 unsigned int tree, to_purge, purged; 115 struct ocfs2_caching_info *ci = &oi->ip_metadata_cache; 116 struct rb_root root = RB_ROOT; 117 118 spin_lock(&oi->ip_lock); 119 tree = !(oi->ip_flags & OCFS2_INODE_CACHE_INLINE); 120 to_purge = ci->ci_num_cached; 121 122 mlog(0, "Purge %u %s items from Inode %llu\n", to_purge, 123 tree ? "array" : "tree", (unsigned long long)oi->ip_blkno); 124 125 /* If we're a tree, save off the root so that we can safely 126 * initialize the cache. We do the work to free tree members 127 * without the spinlock. */ 128 if (tree) 129 root = ci->ci_cache.ci_tree; 130 131 ocfs2_metadata_cache_init(inode); 132 spin_unlock(&oi->ip_lock); 133 134 purged = ocfs2_purge_copied_metadata_tree(&root); 135 /* If possible, track the number wiped so that we can more 136 * easily detect counting errors. Unfortunately, this is only 137 * meaningful for trees. */ 138 if (tree && purged != to_purge) 139 mlog(ML_ERROR, "Inode %llu, count = %u, purged = %u\n", 140 (unsigned long long)oi->ip_blkno, to_purge, purged); 141 } 142 143 /* Returns the index in the cache array, -1 if not found. 144 * Requires ip_lock. */ 145 static int ocfs2_search_cache_array(struct ocfs2_caching_info *ci, 146 sector_t item) 147 { 148 int i; 149 150 for (i = 0; i < ci->ci_num_cached; i++) { 151 if (item == ci->ci_cache.ci_array[i]) 152 return i; 153 } 154 155 return -1; 156 } 157 158 /* Returns the cache item if found, otherwise NULL. 159 * Requires ip_lock. 
*/ 160 static struct ocfs2_meta_cache_item * 161 ocfs2_search_cache_tree(struct ocfs2_caching_info *ci, 162 sector_t block) 163 { 164 struct rb_node * n = ci->ci_cache.ci_tree.rb_node; 165 struct ocfs2_meta_cache_item *item = NULL; 166 167 while (n) { 168 item = rb_entry(n, struct ocfs2_meta_cache_item, c_node); 169 170 if (block < item->c_block) 171 n = n->rb_left; 172 else if (block > item->c_block) 173 n = n->rb_right; 174 else 175 return item; 176 } 177 178 return NULL; 179 } 180 181 static int ocfs2_buffer_cached(struct ocfs2_inode_info *oi, 182 struct buffer_head *bh) 183 { 184 int index = -1; 185 struct ocfs2_meta_cache_item *item = NULL; 186 187 spin_lock(&oi->ip_lock); 188 189 mlog(0, "Inode %llu, query block %llu (inline = %u)\n", 190 (unsigned long long)oi->ip_blkno, 191 (unsigned long long) bh->b_blocknr, 192 !!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE)); 193 194 if (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) 195 index = ocfs2_search_cache_array(&oi->ip_metadata_cache, 196 bh->b_blocknr); 197 else 198 item = ocfs2_search_cache_tree(&oi->ip_metadata_cache, 199 bh->b_blocknr); 200 201 spin_unlock(&oi->ip_lock); 202 203 mlog(0, "index = %d, item = %p\n", index, item); 204 205 return (index != -1) || (item != NULL); 206 } 207 208 /* Warning: even if it returns true, this does *not* guarantee that 209 * the block is stored in our inode metadata cache. */ 210 int ocfs2_buffer_uptodate(struct inode *inode, 211 struct buffer_head *bh) 212 { 213 /* Doesn't matter if the bh is in our cache or not -- if it's 214 * not marked uptodate then we know it can't have correct 215 * data. */ 216 if (!buffer_uptodate(bh)) 217 return 0; 218 219 /* OCFS2 does not allow multiple nodes to be changing the same 220 * block at the same time. */ 221 if (buffer_jbd(bh)) 222 return 1; 223 224 /* Ok, locally the buffer is marked as up to date, now search 225 * our cache to see if we can trust that. 
*/ 226 return ocfs2_buffer_cached(OCFS2_I(inode), bh); 227 } 228 229 /* Requires ip_lock */ 230 static void ocfs2_append_cache_array(struct ocfs2_caching_info *ci, 231 sector_t block) 232 { 233 BUG_ON(ci->ci_num_cached >= OCFS2_INODE_MAX_CACHE_ARRAY); 234 235 mlog(0, "block %llu takes position %u\n", (unsigned long long) block, 236 ci->ci_num_cached); 237 238 ci->ci_cache.ci_array[ci->ci_num_cached] = block; 239 ci->ci_num_cached++; 240 } 241 242 /* By now the caller should have checked that the item does *not* 243 * exist in the tree. 244 * Requires ip_lock. */ 245 static void __ocfs2_insert_cache_tree(struct ocfs2_caching_info *ci, 246 struct ocfs2_meta_cache_item *new) 247 { 248 sector_t block = new->c_block; 249 struct rb_node *parent = NULL; 250 struct rb_node **p = &ci->ci_cache.ci_tree.rb_node; 251 struct ocfs2_meta_cache_item *tmp; 252 253 mlog(0, "Insert block %llu num = %u\n", (unsigned long long) block, 254 ci->ci_num_cached); 255 256 while(*p) { 257 parent = *p; 258 259 tmp = rb_entry(parent, struct ocfs2_meta_cache_item, c_node); 260 261 if (block < tmp->c_block) 262 p = &(*p)->rb_left; 263 else if (block > tmp->c_block) 264 p = &(*p)->rb_right; 265 else { 266 /* This should never happen! */ 267 mlog(ML_ERROR, "Duplicate block %llu cached!\n", 268 (unsigned long long) block); 269 BUG(); 270 } 271 } 272 273 rb_link_node(&new->c_node, parent, p); 274 rb_insert_color(&new->c_node, &ci->ci_cache.ci_tree); 275 ci->ci_num_cached++; 276 } 277 278 static inline int ocfs2_insert_can_use_array(struct ocfs2_inode_info *oi, 279 struct ocfs2_caching_info *ci) 280 { 281 assert_spin_locked(&oi->ip_lock); 282 283 return (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) && 284 (ci->ci_num_cached < OCFS2_INODE_MAX_CACHE_ARRAY); 285 } 286 287 /* tree should be exactly OCFS2_INODE_MAX_CACHE_ARRAY wide. NULL the 288 * pointers in tree after we use them - this allows caller to detect 289 * when to free in case of error. 
 */
static void ocfs2_expand_cache(struct ocfs2_inode_info *oi,
			       struct ocfs2_meta_cache_item **tree)
{
	int i;
	struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;

	/* Callers must pass a full inline array (and a matching set of
	 * preallocated items in 'tree') - anything else indicates a
	 * logic error elsewhere. */
	mlog_bug_on_msg(ci->ci_num_cached != OCFS2_INODE_MAX_CACHE_ARRAY,
			"Inode %llu, num cached = %u, should be %u\n",
			(unsigned long long)oi->ip_blkno, ci->ci_num_cached,
			OCFS2_INODE_MAX_CACHE_ARRAY);
	mlog_bug_on_msg(!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE),
			"Inode %llu not marked as inline anymore!\n",
			(unsigned long long)oi->ip_blkno);
	assert_spin_locked(&oi->ip_lock);

	/* Be careful to initialize the tree members *first* because
	 * once the ci_tree is used, the array is junk... (ci_array and
	 * ci_tree share storage, so copying must finish before the
	 * representation is switched below). */
	for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++)
		tree[i]->c_block = ci->ci_cache.ci_array[i];

	/* Switch representations: from here on the cache is a tree. */
	oi->ip_flags &= ~OCFS2_INODE_CACHE_INLINE;
	ci->ci_cache.ci_tree = RB_ROOT;
	/* this will be set again by __ocfs2_insert_cache_tree */
	ci->ci_num_cached = 0;

	/* Insert the copied blocks; consumed slots are NULLed so the
	 * caller can tell which items remain to be freed on error. */
	for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++) {
		__ocfs2_insert_cache_tree(ci, tree[i]);
		tree[i] = NULL;
	}

	mlog(0, "Expanded %llu to a tree cache: flags 0x%x, num = %u\n",
	     (unsigned long long)oi->ip_blkno, oi->ip_flags, ci->ci_num_cached);
}

/* Slow path function - memory allocation is necessary. See the
 * comment above ocfs2_set_buffer_uptodate for more information.
*/ 326 static void __ocfs2_set_buffer_uptodate(struct ocfs2_inode_info *oi, 327 sector_t block, 328 int expand_tree) 329 { 330 int i; 331 struct ocfs2_caching_info *ci = &oi->ip_metadata_cache; 332 struct ocfs2_meta_cache_item *new = NULL; 333 struct ocfs2_meta_cache_item *tree[OCFS2_INODE_MAX_CACHE_ARRAY] = 334 { NULL, }; 335 336 mlog(0, "Inode %llu, block %llu, expand = %d\n", 337 (unsigned long long)oi->ip_blkno, 338 (unsigned long long)block, expand_tree); 339 340 new = kmem_cache_alloc(ocfs2_uptodate_cachep, GFP_NOFS); 341 if (!new) { 342 mlog_errno(-ENOMEM); 343 return; 344 } 345 new->c_block = block; 346 347 if (expand_tree) { 348 /* Do *not* allocate an array here - the removal code 349 * has no way of tracking that. */ 350 for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++) { 351 tree[i] = kmem_cache_alloc(ocfs2_uptodate_cachep, 352 GFP_NOFS); 353 if (!tree[i]) { 354 mlog_errno(-ENOMEM); 355 goto out_free; 356 } 357 358 /* These are initialized in ocfs2_expand_cache! */ 359 } 360 } 361 362 spin_lock(&oi->ip_lock); 363 if (ocfs2_insert_can_use_array(oi, ci)) { 364 mlog(0, "Someone cleared the tree underneath us\n"); 365 /* Ok, items were removed from the cache in between 366 * locks. Detect this and revert back to the fast path */ 367 ocfs2_append_cache_array(ci, block); 368 spin_unlock(&oi->ip_lock); 369 goto out_free; 370 } 371 372 if (expand_tree) 373 ocfs2_expand_cache(oi, tree); 374 375 __ocfs2_insert_cache_tree(ci, new); 376 spin_unlock(&oi->ip_lock); 377 378 new = NULL; 379 out_free: 380 if (new) 381 kmem_cache_free(ocfs2_uptodate_cachep, new); 382 383 /* If these were used, then ocfs2_expand_cache re-set them to 384 * NULL for us. 
*/ 385 if (tree[0]) { 386 for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++) 387 if (tree[i]) 388 kmem_cache_free(ocfs2_uptodate_cachep, 389 tree[i]); 390 } 391 } 392 393 /* Item insertion is guarded by ip_io_mutex, so the insertion path takes 394 * advantage of this by not rechecking for a duplicate insert during 395 * the slow case. Additionally, if the cache needs to be bumped up to 396 * a tree, the code will not recheck after acquiring the lock -- 397 * multiple paths cannot be expanding to a tree at the same time. 398 * 399 * The slow path takes into account that items can be removed 400 * (including the whole tree wiped and reset) when this process it out 401 * allocating memory. In those cases, it reverts back to the fast 402 * path. 403 * 404 * Note that this function may actually fail to insert the block if 405 * memory cannot be allocated. This is not fatal however (but may 406 * result in a performance penalty) */ 407 void ocfs2_set_buffer_uptodate(struct inode *inode, 408 struct buffer_head *bh) 409 { 410 int expand; 411 struct ocfs2_inode_info *oi = OCFS2_I(inode); 412 struct ocfs2_caching_info *ci = &oi->ip_metadata_cache; 413 414 /* The block may very well exist in our cache already, so avoid 415 * doing any more work in that case. */ 416 if (ocfs2_buffer_cached(oi, bh)) 417 return; 418 419 mlog(0, "Inode %llu, inserting block %llu\n", 420 (unsigned long long)oi->ip_blkno, 421 (unsigned long long)bh->b_blocknr); 422 423 /* No need to recheck under spinlock - insertion is guarded by 424 * ip_io_mutex */ 425 spin_lock(&oi->ip_lock); 426 if (ocfs2_insert_can_use_array(oi, ci)) { 427 /* Fast case - it's an array and there's a free 428 * spot. */ 429 ocfs2_append_cache_array(ci, bh->b_blocknr); 430 spin_unlock(&oi->ip_lock); 431 return; 432 } 433 434 expand = 0; 435 if (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) { 436 /* We need to bump things up to a tree. 
*/ 437 expand = 1; 438 } 439 spin_unlock(&oi->ip_lock); 440 441 __ocfs2_set_buffer_uptodate(oi, bh->b_blocknr, expand); 442 } 443 444 /* Called against a newly allocated buffer. Most likely nobody should 445 * be able to read this sort of metadata while it's still being 446 * allocated, but this is careful to take ip_io_mutex anyway. */ 447 void ocfs2_set_new_buffer_uptodate(struct inode *inode, 448 struct buffer_head *bh) 449 { 450 struct ocfs2_inode_info *oi = OCFS2_I(inode); 451 452 /* This should definitely *not* exist in our cache */ 453 BUG_ON(ocfs2_buffer_cached(oi, bh)); 454 455 set_buffer_uptodate(bh); 456 457 mutex_lock(&oi->ip_io_mutex); 458 ocfs2_set_buffer_uptodate(inode, bh); 459 mutex_unlock(&oi->ip_io_mutex); 460 } 461 462 /* Requires ip_lock. */ 463 static void ocfs2_remove_metadata_array(struct ocfs2_caching_info *ci, 464 int index) 465 { 466 sector_t *array = ci->ci_cache.ci_array; 467 int bytes; 468 469 BUG_ON(index < 0 || index >= OCFS2_INODE_MAX_CACHE_ARRAY); 470 BUG_ON(index >= ci->ci_num_cached); 471 BUG_ON(!ci->ci_num_cached); 472 473 mlog(0, "remove index %d (num_cached = %u\n", index, 474 ci->ci_num_cached); 475 476 ci->ci_num_cached--; 477 478 /* don't need to copy if the array is now empty, or if we 479 * removed at the tail */ 480 if (ci->ci_num_cached && index < ci->ci_num_cached) { 481 bytes = sizeof(sector_t) * (ci->ci_num_cached - index); 482 memmove(&array[index], &array[index + 1], bytes); 483 } 484 } 485 486 /* Requires ip_lock. */ 487 static void ocfs2_remove_metadata_tree(struct ocfs2_caching_info *ci, 488 struct ocfs2_meta_cache_item *item) 489 { 490 mlog(0, "remove block %llu from tree\n", 491 (unsigned long long) item->c_block); 492 493 rb_erase(&item->c_node, &ci->ci_cache.ci_tree); 494 ci->ci_num_cached--; 495 } 496 497 /* Called when we remove a chunk of metadata from an inode. We don't 498 * bother reverting things to an inlined array in the case of a remove 499 * which moves us back under the limit. 
*/ 500 void ocfs2_remove_from_cache(struct inode *inode, 501 struct buffer_head *bh) 502 { 503 int index; 504 sector_t block = bh->b_blocknr; 505 struct ocfs2_meta_cache_item *item = NULL; 506 struct ocfs2_inode_info *oi = OCFS2_I(inode); 507 struct ocfs2_caching_info *ci = &oi->ip_metadata_cache; 508 509 spin_lock(&oi->ip_lock); 510 mlog(0, "Inode %llu, remove %llu, items = %u, array = %u\n", 511 (unsigned long long)oi->ip_blkno, 512 (unsigned long long) block, ci->ci_num_cached, 513 oi->ip_flags & OCFS2_INODE_CACHE_INLINE); 514 515 if (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) { 516 index = ocfs2_search_cache_array(ci, block); 517 if (index != -1) 518 ocfs2_remove_metadata_array(ci, index); 519 } else { 520 item = ocfs2_search_cache_tree(ci, block); 521 if (item) 522 ocfs2_remove_metadata_tree(ci, item); 523 } 524 spin_unlock(&oi->ip_lock); 525 526 if (item) 527 kmem_cache_free(ocfs2_uptodate_cachep, item); 528 } 529 530 int __init init_ocfs2_uptodate_cache(void) 531 { 532 ocfs2_uptodate_cachep = kmem_cache_create("ocfs2_uptodate", 533 sizeof(struct ocfs2_meta_cache_item), 534 0, SLAB_HWCACHE_ALIGN, NULL, NULL); 535 if (!ocfs2_uptodate_cachep) 536 return -ENOMEM; 537 538 mlog(0, "%u inlined cache items per inode.\n", 539 OCFS2_INODE_MAX_CACHE_ARRAY); 540 541 return 0; 542 } 543 544 void exit_ocfs2_uptodate_cache(void) 545 { 546 if (ocfs2_uptodate_cachep) 547 kmem_cache_destroy(ocfs2_uptodate_cachep); 548 } 549