/*
 * Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; see the file COPYING.  If not, write to
 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * fsnotify inode mark locking, lifetime and refcounting
 *
 * REFCNT:
 * The group->refcnt and mark->refcnt tell how many "things" in the kernel
 * currently are referencing the objects. Both kinds of objects typically will
 * live inside the kernel with a refcnt of 2, one for its creation and one for
 * the reference a group and a mark hold to each other.
 * If you are holding the appropriate locks, you can take a reference and the
 * object itself is guaranteed to survive until the reference is dropped.
 *
 * LOCKING:
 * There are 3 locks involved with fsnotify inode marks and they MUST be taken
 * in order as follows:
 *
 * group->mark_mutex
 * mark->lock
 * inode->i_lock
 *
 * group->mark_mutex protects the marks_list anchored inside a given group and
 * each mark is hooked via the g_list. It also protects the group's private
 * data (i.e. group limits).
 *
 * mark->lock protects the mark's attributes like its masks and flags.
 * Furthermore it protects the access to a reference of the group that the mark
 * is assigned to as well as the access to a reference of the inode/vfsmount
 * that is being watched by the mark.
 *
 * inode->i_lock protects the i_fsnotify_marks list anchored inside a
 * given inode and each mark is hooked via the i_list. (and sorta the
 * free_i_list)
 *
 *
 * LIFETIME:
 * Inode marks survive between when they are added to an inode and when their
 * refcnt==0.
 *
 * The inode mark can be cleared for a number of different reasons including:
 * - The inode is unlinked for the last time.  (fsnotify_inode_remove)
 * - The inode is being evicted from cache. (fsnotify_inode_delete)
 * - The fs the inode is on is unmounted.  (fsnotify_inode_delete/fsnotify_unmount_inodes)
 * - Something explicitly requests that it be removed.  (fsnotify_destroy_mark)
 * - The fsnotify_group associated with the mark is going away and all such marks
 *   need to be cleaned up.  (fsnotify_clear_marks_by_group)
 *
 * In the worst case we are given an inode and need to clean up all the marks
 * on that inode.  We take i_lock and walk the i_fsnotify_marks safely.  For
 * each mark on the list we take a reference (so the mark can't disappear
 * under us).  We remove that mark from the inode's list of marks and we add
 * this mark to a private list anchored on the stack using i_free_list; we walk
 * i_free_list and before we destroy the mark we make sure that we don't race
 * with a concurrent destroy_group by getting a ref to the mark's group and
 * taking the group's mutex.
 *
 * Very similarly for freeing by group, except we use free_g_list.
 *
 * This has the very interesting property of being able to run concurrently
 * with any (or all) of the other cleanup paths.
 */
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/srcu.h>

#include <linux/atomic.h>

#include <linux/fsnotify_backend.h>
#include "fsnotify.h"

struct srcu_struct fsnotify_mark_srcu;
static DEFINE_SPINLOCK(destroy_lock);
static LIST_HEAD(destroy_list);
static DECLARE_WAIT_QUEUE_HEAD(destroy_waitq);

void fsnotify_get_mark(struct fsnotify_mark *mark)
{
	atomic_inc(&mark->refcnt);
}

void fsnotify_put_mark(struct fsnotify_mark *mark)
{
	if (atomic_dec_and_test(&mark->refcnt)) {
		if (mark->group)
			fsnotify_put_group(mark->group);
		mark->free_mark(mark);
	}
}

/* Calculate mask of events for a list of marks */
u32 fsnotify_recalc_mask(struct hlist_head *head)
{
	u32 new_mask = 0;
	struct fsnotify_mark *mark;

	hlist_for_each_entry(mark, head, obj_list)
		new_mask |= mark->mask;
	return new_mask;
}

/*
 * Remove mark from inode / vfsmount list, group list, drop inode reference
 * if we got one.
 *
 * Must be called with group->mark_mutex held.
 */
void fsnotify_detach_mark(struct fsnotify_mark *mark)
{
	struct inode *inode = NULL;
	struct fsnotify_group *group = mark->group;

	BUG_ON(!mutex_is_locked(&group->mark_mutex));

	spin_lock(&mark->lock);

	/* something else already called this function on this mark */
	if (!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) {
		spin_unlock(&mark->lock);
		return;
	}

	mark->flags &= ~FSNOTIFY_MARK_FLAG_ATTACHED;

	if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) {
		inode = mark->inode;
		fsnotify_destroy_inode_mark(mark);
	} else if (mark->flags & FSNOTIFY_MARK_FLAG_VFSMOUNT)
		fsnotify_destroy_vfsmount_mark(mark);
	else
		BUG();
	/*
	 * Note that we didn't update flags telling whether inode cares about
	 * what's happening with children. We update these flags from
	 * __fsnotify_parent() lazily when next event happens on one of our
	 * children.
	 */

	list_del_init(&mark->g_list);

	spin_unlock(&mark->lock);

	if (inode && (mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED))
		iput(inode);

	atomic_dec(&group->num_marks);
}

/*
 * Free fsnotify mark. The freeing is actually happening from a kthread which
 * first waits for srcu period end. Caller must have a reference to the mark
 * or be protected by fsnotify_mark_srcu.
 */
void fsnotify_free_mark(struct fsnotify_mark *mark)
{
	struct fsnotify_group *group = mark->group;

	spin_lock(&mark->lock);
	/* something else already called this function on this mark */
	if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) {
		spin_unlock(&mark->lock);
		return;
	}
	mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;
	spin_unlock(&mark->lock);

	spin_lock(&destroy_lock);
	list_add(&mark->g_list, &destroy_list);
	spin_unlock(&destroy_lock);
	wake_up(&destroy_waitq);

	/*
	 * Some groups like to know that marks are being freed. This is a
	 * callback to the group function to let it know that this mark
	 * is being freed.
	 */
	if (group->ops->freeing_mark)
		group->ops->freeing_mark(mark, group);
}
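/*
 * Illustrative sketch (not built; "example_mark_entry" and
 * "example_free_mark" are hypothetical names): the free_mark() callback
 * that fsnotify_put_mark() above invokes once the last reference is
 * dropped.  Backends embed struct fsnotify_mark in their own per-watch
 * structure and free that container here, in the style of inotify's
 * mark callback.
 */
#if 0
struct example_mark_entry {
	struct fsnotify_mark fsn_mark;	/* embedded, refcounted mark */
	int cookie;			/* hypothetical per-watch state */
};

static void example_free_mark(struct fsnotify_mark *mark)
{
	struct example_mark_entry *entry;

	/* recover the container and release it; the mark dies with it */
	entry = container_of(mark, struct example_mark_entry, fsn_mark);
	kfree(entry);
}
#endif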
void fsnotify_destroy_mark(struct fsnotify_mark *mark,
			   struct fsnotify_group *group)
{
	mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
	fsnotify_detach_mark(mark);
	mutex_unlock(&group->mark_mutex);
	fsnotify_free_mark(mark);
}

void fsnotify_destroy_marks(struct hlist_head *head, spinlock_t *lock)
{
	struct fsnotify_mark *mark;

	while (1) {
		/*
		 * We have to be careful since we can race with e.g.
		 * fsnotify_clear_marks_by_group() and once we drop 'lock',
		 * mark can get removed from the obj_list and destroyed. But
		 * we are holding mark reference so mark cannot be freed and
		 * calling fsnotify_destroy_mark() more than once is fine.
		 */
		spin_lock(lock);
		if (hlist_empty(head)) {
			spin_unlock(lock);
			break;
		}
		mark = hlist_entry(head->first, struct fsnotify_mark, obj_list);
		/*
		 * We don't update i_fsnotify_mask / mnt_fsnotify_mask here
		 * since inode / mount is going away anyway. So just remove
		 * mark from the list.
		 */
		hlist_del_init_rcu(&mark->obj_list);
		fsnotify_get_mark(mark);
		spin_unlock(lock);
		fsnotify_destroy_mark(mark, mark->group);
		fsnotify_put_mark(mark);
	}
}
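/*
 * Illustrative caller (a sketch; the real inode-side wrapper lives in
 * inode_mark.c): tearing down every mark on an inode amounts to handing
 * fsnotify_destroy_marks() the inode's mark list together with the lock
 * that protects it.
 */
#if 0
static void example_clear_marks_by_inode(struct inode *inode)
{
	fsnotify_destroy_marks(&inode->i_fsnotify_marks, &inode->i_lock);
}
#endif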
void fsnotify_set_mark_mask_locked(struct fsnotify_mark *mark, __u32 mask)
{
	assert_spin_locked(&mark->lock);

	mark->mask = mask;

	if (mark->flags & FSNOTIFY_MARK_FLAG_INODE)
		fsnotify_set_inode_mark_mask_locked(mark, mask);
}

void fsnotify_set_mark_ignored_mask_locked(struct fsnotify_mark *mark, __u32 mask)
{
	assert_spin_locked(&mark->lock);

	mark->ignored_mask = mask;
}

/*
 * Sorting function for lists of fsnotify marks.
 *
 * Fanotify supports different notification classes (reflected as priority of
 * notification group). Events shall be passed to notification groups in
 * decreasing priority order. To achieve this marks in notification lists for
 * inodes and vfsmounts are sorted so that priorities of corresponding groups
 * are descending.
 *
 * Furthermore correct handling of the ignore mask requires processing inode
 * and vfsmount marks of each group together. Using the group address as
 * further sort criterion provides a unique sorting order and thus we can
 * merge inode and vfsmount lists of marks in linear time and find groups
 * present in both lists.
 *
 * A return value of 1 signifies that b has priority over a.
 * A return value of 0 signifies that the two marks have to be handled together.
 * A return value of -1 signifies that a has priority over b.
 */
int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b)
{
	if (a == b)
		return 0;
	if (!a)
		return 1;
	if (!b)
		return -1;
	if (a->priority < b->priority)
		return 1;
	if (a->priority > b->priority)
		return -1;
	if (a < b)
		return 1;
	return -1;
}
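/*
 * Worked example (hypothetical groups ga, gb, gc): if ga->priority == 2
 * and gb->priority == gc->priority == 1, a list kept sorted with
 * fsnotify_compare_groups() walks ga first; the tie between gb and gc is
 * broken by their addresses, so the inode and vfsmount lists agree on a
 * single total order and can be merged in one linear pass.  A NULL group
 * always sorts last, which lets the merge treat an exhausted list
 * uniformly.
 */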
/* Add mark into proper place in given list of marks */
int fsnotify_add_mark_list(struct hlist_head *head, struct fsnotify_mark *mark,
			   int allow_dups)
{
	struct fsnotify_mark *lmark, *last = NULL;
	int cmp;

	/* is mark the first mark? */
	if (hlist_empty(head)) {
		hlist_add_head_rcu(&mark->obj_list, head);
		return 0;
	}

	/* should mark be in the middle of the current list? */
	hlist_for_each_entry(lmark, head, obj_list) {
		last = lmark;

		if ((lmark->group == mark->group) && !allow_dups)
			return -EEXIST;

		cmp = fsnotify_compare_groups(lmark->group, mark->group);
		if (cmp >= 0) {
			hlist_add_before_rcu(&mark->obj_list, &lmark->obj_list);
			return 0;
		}
	}

	BUG_ON(last == NULL);
	/* mark should be the last entry.  last is the current last entry */
	hlist_add_behind_rcu(&mark->obj_list, &last->obj_list);
	return 0;
}

/*
 * Attach an initialized mark to a given group and fs object.
 * These marks may be used for the fsnotify backend to determine which
 * event types should be delivered to which group.
 */
int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
			     struct fsnotify_group *group, struct inode *inode,
			     struct vfsmount *mnt, int allow_dups)
{
	int ret = 0;

	BUG_ON(inode && mnt);
	BUG_ON(!inode && !mnt);
	BUG_ON(!mutex_is_locked(&group->mark_mutex));

	/*
	 * LOCKING ORDER!!!!
	 * group->mark_mutex
	 * mark->lock
	 * inode->i_lock
	 */
	spin_lock(&mark->lock);
	mark->flags |= FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_ATTACHED;

	fsnotify_get_group(group);
	mark->group = group;
	list_add(&mark->g_list, &group->marks_list);
	atomic_inc(&group->num_marks);
	fsnotify_get_mark(mark); /* for i_list and g_list */

	if (inode) {
		ret = fsnotify_add_inode_mark(mark, group, inode, allow_dups);
		if (ret)
			goto err;
	} else if (mnt) {
		ret = fsnotify_add_vfsmount_mark(mark, group, mnt, allow_dups);
		if (ret)
			goto err;
	} else {
		BUG();
	}

	/* this will pin the object if appropriate */
	fsnotify_set_mark_mask_locked(mark, mark->mask);
	spin_unlock(&mark->lock);

	if (inode)
		__fsnotify_update_child_dentry_flags(inode);

	return ret;
err:
	mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;
	list_del_init(&mark->g_list);
	fsnotify_put_group(group);
	mark->group = NULL;
	atomic_dec(&group->num_marks);

	spin_unlock(&mark->lock);

	spin_lock(&destroy_lock);
	list_add(&mark->g_list, &destroy_list);
	spin_unlock(&destroy_lock);
	wake_up(&destroy_waitq);

	return ret;
}

int fsnotify_add_mark(struct fsnotify_mark *mark, struct fsnotify_group *group,
		      struct inode *inode, struct vfsmount *mnt, int allow_dups)
{
	int ret;

	mutex_lock(&group->mark_mutex);
	ret = fsnotify_add_mark_locked(mark, group, inode, mnt, allow_dups);
	mutex_unlock(&group->mark_mutex);
	return ret;
}

/*
 * Given a list of marks, find the mark associated with given group. If found
 * take a reference to that mark and return it, else return NULL.
 */
struct fsnotify_mark *fsnotify_find_mark(struct hlist_head *head,
					 struct fsnotify_group *group)
{
	struct fsnotify_mark *mark;

	hlist_for_each_entry(mark, head, obj_list) {
		if (mark->group == group) {
			fsnotify_get_mark(mark);
			return mark;
		}
	}
	return NULL;
}

/*
 * Clear any marks in a group in which mark->flags & flags is true.
 */
void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group,
					 unsigned int flags)
{
	struct fsnotify_mark *lmark, *mark;
	LIST_HEAD(to_free);

	/*
	 * We have to be really careful here. Anytime we drop mark_mutex, e.g.
	 * fsnotify_clear_marks_by_inode() can come and free marks, even ones
	 * already on our to_free list, so we have to use mark_mutex even when
	 * accessing that list. And freeing a mark requires us to drop
	 * mark_mutex. So we can reliably free only the first mark in the
	 * list. That's why we first move the marks to be freed to the to_free
	 * list in one go and then free the marks in to_free one by one.
	 */
	mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
	list_for_each_entry_safe(mark, lmark, &group->marks_list, g_list) {
		if (mark->flags & flags)
			list_move(&mark->g_list, &to_free);
	}
	mutex_unlock(&group->mark_mutex);

	while (1) {
		mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
		if (list_empty(&to_free)) {
			mutex_unlock(&group->mark_mutex);
			break;
		}
		mark = list_first_entry(&to_free, struct fsnotify_mark, g_list);
		fsnotify_get_mark(mark);
		fsnotify_detach_mark(mark);
		mutex_unlock(&group->mark_mutex);
		fsnotify_free_mark(mark);
		fsnotify_put_mark(mark);
	}
}

/*
 * Given a group, destroy all of the marks associated with that group.
 */
void fsnotify_clear_marks_by_group(struct fsnotify_group *group)
{
	fsnotify_clear_marks_by_group_flags(group, (unsigned int)-1);
}

void fsnotify_duplicate_mark(struct fsnotify_mark *new, struct fsnotify_mark *old)
{
	assert_spin_locked(&old->lock);
	new->inode = old->inode;
	new->mnt = old->mnt;
	if (old->group)
		fsnotify_get_group(old->group);
	new->group = old->group;
	new->mask = old->mask;
	new->free_mark = old->free_mark;
}

/*
 * Nothing fancy, just initialize lists and locks and counters.
 */
void fsnotify_init_mark(struct fsnotify_mark *mark,
			void (*free_mark)(struct fsnotify_mark *mark))
{
	memset(mark, 0, sizeof(*mark));
	spin_lock_init(&mark->lock);
	atomic_set(&mark->refcnt, 1);
	mark->free_mark = free_mark;
}

static int fsnotify_mark_destroy(void *ignored)
{
	struct fsnotify_mark *mark, *next;
	struct list_head private_destroy_list;

	for (;;) {
		spin_lock(&destroy_lock);
		/* exchange the list head */
		list_replace_init(&destroy_list, &private_destroy_list);
		spin_unlock(&destroy_lock);

		synchronize_srcu(&fsnotify_mark_srcu);

		list_for_each_entry_safe(mark, next, &private_destroy_list, g_list) {
			list_del_init(&mark->g_list);
			fsnotify_put_mark(mark);
		}

		wait_event_interruptible(destroy_waitq, !list_empty(&destroy_list));
	}

	return 0;
}

static int __init fsnotify_mark_init(void)
{
	struct task_struct *thread;

	thread = kthread_run(fsnotify_mark_destroy, NULL,
			     "fsnotify_mark");
	if (IS_ERR(thread))
		panic("unable to start fsnotify mark destruction thread.");

	return 0;
}
device_initcall(fsnotify_mark_init);
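/*
 * End-to-end sketch (illustrative, not built): the lifetime contract as a
 * hypothetical backend would exercise it, reusing the hypothetical
 * example_mark_entry and example_free_mark from the sketch above.
 * fsnotify_init_mark() leaves the caller with the creation reference;
 * fsnotify_add_mark() takes its own reference for the lists; on failure,
 * as after fsnotify_destroy_mark(), that list reference is dropped by the
 * destroy thread, and the caller still owes one final fsnotify_put_mark()
 * for the creation reference.
 */
#if 0
static int example_watch_inode(struct fsnotify_group *group,
			       struct inode *inode)
{
	struct example_mark_entry *entry;
	int ret;

	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
	if (!entry)
		return -ENOMEM;

	fsnotify_init_mark(&entry->fsn_mark, example_free_mark);
	entry->fsn_mark.mask = FS_MODIFY;	/* events this watch wants */

	ret = fsnotify_add_mark(&entry->fsn_mark, group, inode, NULL, 0);
	if (ret)
		fsnotify_put_mark(&entry->fsn_mark); /* drop creation ref */
	return ret;
}

static void example_unwatch(struct fsnotify_mark *mark,
			    struct fsnotify_group *group)
{
	fsnotify_destroy_mark(mark, group);
	fsnotify_put_mark(mark);	/* creation ref; free_mark runs last */
}
#endif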