/*
 * Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; see the file COPYING.  If not, write to
 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * fsnotify inode mark locking/lifetime/refcounting
 *
 * REFCNT:
 * The group->refcnt and mark->refcnt tell how many "things" in the kernel
 * currently are referencing the objects. Both kinds of objects typically will
 * live inside the kernel with a refcnt of 2, one for their creation and one
 * for the reference a group and a mark hold to each other.
 * If you are holding the appropriate locks, you can take a reference and the
 * object itself is guaranteed to survive until the reference is dropped.
 *
 * LOCKING:
 * There are 3 locks involved with fsnotify inode marks and they MUST be taken
 * in order as follows:
 *
 * group->mark_mutex
 * mark->lock
 * inode->i_lock
 *
 * group->mark_mutex protects the marks_list anchored inside a given group and
 * each mark is hooked via the g_list. It also protects the group's private
 * data (e.g. group limits).
 *
 * mark->lock protects the mark's attributes like its masks and flags.
 * Furthermore it protects the access to a reference of the group that the
 * mark is assigned to as well as the access to a reference of the
 * inode/vfsmount that is being watched by the mark.
 *
 * inode->i_lock protects the i_fsnotify_marks list anchored inside a
 * given inode and each mark is hooked via the i_list (and sorta the
 * free_list, which is only used during teardown).
 *
 *
 * LIFETIME:
 * Inode marks survive between when they are added to an inode and when their
 * refcnt==0.
 *
 * The inode mark can be cleared for a number of different reasons including:
 * - The inode is unlinked for the last time.  (fsnotify_inode_remove)
 * - The inode is being evicted from cache. (fsnotify_inode_delete)
 * - The fs the inode is on is unmounted.  (fsnotify_inode_delete/fsnotify_unmount_inodes)
 * - Something explicitly requests that it be removed.  (fsnotify_destroy_mark)
 * - The fsnotify_group associated with the mark is going away and all such
 *   marks need to be cleaned up.  (fsnotify_clear_marks_by_group)
 *
 * Worst case we are given an inode and need to clean up all the marks on that
 * inode.  We take i_lock and walk the i_fsnotify_marks safely.  For each
 * mark on the list we take a reference (so the mark can't disappear under
 * us).  We remove that mark from the inode's list of marks and add it to a
 * private list anchored on the stack using the mark's free_list; we then
 * walk that private list and before we destroy each mark we make sure that
 * we don't race with a concurrent destroy_group by getting a ref to the
 * mark's group and taking the group's mutex.
 *
 * Freeing by group works very similarly, except the marks are moved off the
 * group's marks_list via g_list onto a private list.
 *
 * This has the very interesting property of being able to run concurrently
 * with any (or all) of the other teardown paths.
 */
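/*
 * Purely illustrative sketch of the rule above (hypothetical caller, not
 * part of this file): code that really needs all three locks must nest
 * them in exactly this order:
 *
 *	mutex_lock(&group->mark_mutex);
 *	spin_lock(&mark->lock);
 *	spin_lock(&inode->i_lock);
 *	...
 *	spin_unlock(&inode->i_lock);
 *	spin_unlock(&mark->lock);
 *	mutex_unlock(&group->mark_mutex);
 */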
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/srcu.h>

#include <linux/atomic.h>

#include <linux/fsnotify_backend.h>
#include "fsnotify.h"

struct srcu_struct fsnotify_mark_srcu;
static DEFINE_SPINLOCK(destroy_lock);
static LIST_HEAD(destroy_list);
static DECLARE_WAIT_QUEUE_HEAD(destroy_waitq);

/* Pin a mark; it is guaranteed to live until the matching put. */
void fsnotify_get_mark(struct fsnotify_mark *mark)
{
	atomic_inc(&mark->refcnt);
}

/*
 * Drop a reference.  The final put also drops the mark's reference to its
 * group and returns the memory through the free_mark() callback the
 * backend supplied at init time.
 */
void fsnotify_put_mark(struct fsnotify_mark *mark)
{
	if (atomic_dec_and_test(&mark->refcnt)) {
		if (mark->group)
			fsnotify_put_group(mark->group);
		mark->free_mark(mark);
	}
}

/* Calculate mask of events for a list of marks */
u32 fsnotify_recalc_mask(struct hlist_head *head)
{
	u32 new_mask = 0;
	struct fsnotify_mark *mark;

	hlist_for_each_entry(mark, head, obj_list)
		new_mask |= mark->mask;
	return new_mask;
}

/*
 * Any time a mark is getting freed we end up here.
 * The caller had better be holding a reference to this mark so we don't
 * actually do the final put under the mark->lock.
 */
void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark,
				  struct fsnotify_group *group)
{
	struct inode *inode = NULL;

	BUG_ON(!mutex_is_locked(&group->mark_mutex));

	spin_lock(&mark->lock);

	/* something else already called this function on this mark */
	if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) {
		spin_unlock(&mark->lock);
		return;
	}

	mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;

	if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) {
		inode = mark->inode;
		fsnotify_destroy_inode_mark(mark);
	} else if (mark->flags & FSNOTIFY_MARK_FLAG_VFSMOUNT)
		fsnotify_destroy_vfsmount_mark(mark);
	else
		BUG();

	list_del_init(&mark->g_list);

	spin_unlock(&mark->lock);

	if (inode && (mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED))
		iput(inode);
	/* release lock temporarily */
	mutex_unlock(&group->mark_mutex);

	spin_lock(&destroy_lock);
	list_add(&mark->g_list, &destroy_list);
	spin_unlock(&destroy_lock);
	wake_up(&destroy_waitq);
	/*
	 * We don't necessarily have a ref on mark from the caller, so the
	 * above destroy may have actually freed it, unless this group
	 * provides a 'freeing_mark' function which must be holding a
	 * reference.
	 */

	/*
	 * Some groups like to know that marks are being freed.  This is a
	 * callback to the group function to let it know that this mark
	 * is being freed.
	 */
	if (group->ops->freeing_mark)
		group->ops->freeing_mark(mark, group);

	/*
	 * __fsnotify_update_child_dentry_flags(inode);
	 *
	 * I really want to call that, but we can't, we have no idea if the
	 * inode still exists the second we drop the mark->lock.
	 *
	 * The next time an event arrives at this inode from one of its
	 * children, __fsnotify_parent will see that the inode doesn't care
	 * about its children and will update all of these flags then.  So
	 * really this is just a lazy update (and could be a perf win...)
	 */

	atomic_dec(&group->num_marks);

	mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
}

void fsnotify_destroy_mark(struct fsnotify_mark *mark,
			   struct fsnotify_group *group)
{
	mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
	fsnotify_destroy_mark_locked(mark, group);
	mutex_unlock(&group->mark_mutex);
}
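/*
 * Typical caller pattern (an illustrative sketch only, cf.
 * fsnotify_clear_marks_by_group_flags() below): pin the mark first so it
 * cannot be freed underneath us, and drop the pin once it is destroyed:
 *
 *	fsnotify_get_mark(mark);
 *	fsnotify_destroy_mark(mark, group);
 *	fsnotify_put_mark(mark);
 */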
/*
 * Destroy all marks in the given list. The marks must be already detached
 * from the original inode / vfsmount.
 */
void fsnotify_destroy_marks(struct list_head *to_free)
{
	struct fsnotify_mark *mark, *lmark;
	struct fsnotify_group *group;

	list_for_each_entry_safe(mark, lmark, to_free, free_list) {
		spin_lock(&mark->lock);
		fsnotify_get_group(mark->group);
		group = mark->group;
		spin_unlock(&mark->lock);

		fsnotify_destroy_mark(mark, group);
		fsnotify_put_mark(mark);
		fsnotify_put_group(group);
	}
}

/*
 * Set the mask of a mark.  For an inode mark this may also pin the watched
 * inode in core - see fsnotify_set_inode_mark_mask_locked().
 */
void fsnotify_set_mark_mask_locked(struct fsnotify_mark *mark, __u32 mask)
{
	assert_spin_locked(&mark->lock);

	mark->mask = mask;

	if (mark->flags & FSNOTIFY_MARK_FLAG_INODE)
		fsnotify_set_inode_mark_mask_locked(mark, mask);
}

/* Set the mask of events this mark wants to ignore. */
void fsnotify_set_mark_ignored_mask_locked(struct fsnotify_mark *mark, __u32 mask)
{
	assert_spin_locked(&mark->lock);

	mark->ignored_mask = mask;
}

/*
 * Sorting function for lists of fsnotify marks.
 *
 * Fanotify supports different notification classes (reflected as priority of
 * notification group). Events shall be passed to notification groups in
 * decreasing priority order. To achieve this, marks in notification lists
 * for inodes and vfsmounts are sorted so that priorities of corresponding
 * groups are descending.
 *
 * Furthermore, correct handling of the ignore mask requires processing inode
 * and vfsmount marks of each group together. Using the group address as
 * further sort criterion provides a unique sorting order and thus we can
 * merge inode and vfsmount lists of marks in linear time and find groups
 * present in both lists.
 *
 * A return value of 1 signifies that b has priority over a.
 * A return value of 0 signifies that the two marks have to be handled together.
 * A return value of -1 signifies that a has priority over b.
 */
int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b)
{
	if (a == b)
		return 0;
	if (!a)
		return 1;
	if (!b)
		return -1;
	if (a->priority < b->priority)
		return 1;
	if (a->priority > b->priority)
		return -1;
	if (a < b)
		return 1;
	return -1;
}

/* Add mark into proper place in given list of marks */
int fsnotify_add_mark_list(struct hlist_head *head, struct fsnotify_mark *mark,
			   int allow_dups)
{
	struct fsnotify_mark *lmark, *last = NULL;
	int cmp;

	/* is mark the first mark? */
	if (hlist_empty(head)) {
		hlist_add_head_rcu(&mark->obj_list, head);
		return 0;
	}

	/* should mark be in the middle of the current list? */
	hlist_for_each_entry(lmark, head, obj_list) {
		last = lmark;

		if ((lmark->group == mark->group) && !allow_dups)
			return -EEXIST;

		cmp = fsnotify_compare_groups(lmark->group, mark->group);
		if (cmp >= 0) {
			hlist_add_before_rcu(&mark->obj_list, &lmark->obj_list);
			return 0;
		}
	}

	BUG_ON(last == NULL);
	/* mark should be the last entry.  last is the current last entry */
	hlist_add_behind_rcu(&mark->obj_list, &last->obj_list);
	return 0;
}
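/*
 * Sketch of how a backend typically attaches a mark (illustrative only;
 * FS_MODIFY stands in for whatever events the backend cares about, and
 * backend_free_mark is a placeholder for its destructor):
 *
 *	fsnotify_init_mark(mark, backend_free_mark);
 *	mark->mask = FS_MODIFY;
 *	ret = fsnotify_add_mark(mark, group, inode, NULL, 0);
 */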
/*
 * Attach an initialized mark to a given group and fs object.
 * These marks may be used for the fsnotify backend to determine which
 * event types should be delivered to which group.
 */
int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
			     struct fsnotify_group *group, struct inode *inode,
			     struct vfsmount *mnt, int allow_dups)
{
	int ret = 0;

	BUG_ON(inode && mnt);
	BUG_ON(!inode && !mnt);
	BUG_ON(!mutex_is_locked(&group->mark_mutex));

	/*
	 * LOCKING ORDER!!!!
	 * group->mark_mutex
	 * mark->lock
	 * inode->i_lock
	 */
	spin_lock(&mark->lock);
	mark->flags |= FSNOTIFY_MARK_FLAG_ALIVE;

	fsnotify_get_group(group);
	mark->group = group;
	list_add(&mark->g_list, &group->marks_list);
	atomic_inc(&group->num_marks);
	fsnotify_get_mark(mark); /* for i_list and g_list */

	if (inode) {
		ret = fsnotify_add_inode_mark(mark, group, inode, allow_dups);
		if (ret)
			goto err;
	} else if (mnt) {
		ret = fsnotify_add_vfsmount_mark(mark, group, mnt, allow_dups);
		if (ret)
			goto err;
	} else {
		BUG();
	}

	/* this will pin the object if appropriate */
	fsnotify_set_mark_mask_locked(mark, mark->mask);
	spin_unlock(&mark->lock);

	if (inode)
		__fsnotify_update_child_dentry_flags(inode);

	return ret;
err:
	mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;
	list_del_init(&mark->g_list);
	fsnotify_put_group(group);
	mark->group = NULL;
	atomic_dec(&group->num_marks);

	spin_unlock(&mark->lock);

	spin_lock(&destroy_lock);
	list_add(&mark->g_list, &destroy_list);
	spin_unlock(&destroy_lock);
	wake_up(&destroy_waitq);

	return ret;
}

int fsnotify_add_mark(struct fsnotify_mark *mark, struct fsnotify_group *group,
		      struct inode *inode, struct vfsmount *mnt, int allow_dups)
{
	int ret;

	mutex_lock(&group->mark_mutex);
	ret = fsnotify_add_mark_locked(mark, group, inode, mnt, allow_dups);
	mutex_unlock(&group->mark_mutex);
	return ret;
}

/*
 * Given a list of marks, find the mark associated with given group. If found
 * take a reference to that mark and return it, else return NULL.
 */
struct fsnotify_mark *fsnotify_find_mark(struct hlist_head *head,
					 struct fsnotify_group *group)
{
	struct fsnotify_mark *mark;

	hlist_for_each_entry(mark, head, obj_list) {
		if (mark->group == group) {
			fsnotify_get_mark(mark);
			return mark;
		}
	}
	return NULL;
}
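/*
 * e.g. (a sketch): a backend looking up its own mark on an inode.  Per the
 * locking rules above, the i_fsnotify_marks list is walked under i_lock,
 * and the reference returned by fsnotify_find_mark() must be dropped:
 *
 *	spin_lock(&inode->i_lock);
 *	mark = fsnotify_find_mark(&inode->i_fsnotify_marks, group);
 *	spin_unlock(&inode->i_lock);
 *	if (mark) {
 *		...
 *		fsnotify_put_mark(mark);
 *	}
 */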
/*
 * Clear any marks in a group in which mark->flags & flags is true.
 */
void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group,
					 unsigned int flags)
{
	struct fsnotify_mark *lmark, *mark;
	LIST_HEAD(to_free);

	/*
	 * We have to be really careful here. Anytime we drop mark_mutex, e.g.
	 * fsnotify_clear_marks_by_inode() can come and free marks. That can
	 * even happen to marks on our to_free list, so we have to hold
	 * mark_mutex even when accessing that list. And freeing a mark
	 * requires us to drop mark_mutex. So we can reliably free only the
	 * first mark in the list. That's why we first move the marks to be
	 * freed onto the to_free list in one go and then free them one by
	 * one.
	 */
	mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
	list_for_each_entry_safe(mark, lmark, &group->marks_list, g_list) {
		if (mark->flags & flags)
			list_move(&mark->g_list, &to_free);
	}
	mutex_unlock(&group->mark_mutex);

	while (1) {
		mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
		if (list_empty(&to_free)) {
			mutex_unlock(&group->mark_mutex);
			break;
		}
		mark = list_first_entry(&to_free, struct fsnotify_mark, g_list);
		fsnotify_get_mark(mark);
		fsnotify_destroy_mark_locked(mark, group);
		mutex_unlock(&group->mark_mutex);
		fsnotify_put_mark(mark);
	}
}

/*
 * Given a group, destroy all of the marks associated with that group.
 */
void fsnotify_clear_marks_by_group(struct fsnotify_group *group)
{
	fsnotify_clear_marks_by_group_flags(group, (unsigned int)-1);
}

/*
 * Copy the watched object, group, mask and destructor from an existing
 * mark; takes a new reference on the group.
 */
void fsnotify_duplicate_mark(struct fsnotify_mark *new, struct fsnotify_mark *old)
{
	assert_spin_locked(&old->lock);
	new->inode = old->inode;
	new->mnt = old->mnt;
	if (old->group)
		fsnotify_get_group(old->group);
	new->group = old->group;
	new->mask = old->mask;
	new->free_mark = old->free_mark;
}

/*
 * Nothing fancy, just initialize lists and locks and counters.
 */
void fsnotify_init_mark(struct fsnotify_mark *mark,
			void (*free_mark)(struct fsnotify_mark *mark))
{
	memset(mark, 0, sizeof(*mark));
	spin_lock_init(&mark->lock);
	atomic_set(&mark->refcnt, 1);
	mark->free_mark = free_mark;
}

/*
 * Kthread that drops the final reference on marks queued on destroy_list.
 * The synchronize_srcu() makes sure no event handler can still be walking
 * an SRCU-protected mark list and looking at a mark before we free it.
 */
static int fsnotify_mark_destroy(void *ignored)
{
	struct fsnotify_mark *mark, *next;
	struct list_head private_destroy_list;

	for (;;) {
		spin_lock(&destroy_lock);
		/* exchange the list head */
		list_replace_init(&destroy_list, &private_destroy_list);
		spin_unlock(&destroy_lock);

		synchronize_srcu(&fsnotify_mark_srcu);

		list_for_each_entry_safe(mark, next, &private_destroy_list, g_list) {
			list_del_init(&mark->g_list);
			fsnotify_put_mark(mark);
		}

		wait_event_interruptible(destroy_waitq, !list_empty(&destroy_list));
	}

	return 0;
}

static int __init fsnotify_mark_init(void)
{
	struct task_struct *thread;

	thread = kthread_run(fsnotify_mark_destroy, NULL,
			     "fsnotify_mark");
	if (IS_ERR(thread))
		panic("unable to start fsnotify mark destruction thread.");

	return 0;
}
device_initcall(fsnotify_mark_init);