1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
4 * All Rights Reserved.
5 */
6 #include "xfs.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_log_format.h"
11 #include "xfs_trans_resv.h"
12 #include "xfs_mount.h"
13 #include "xfs_inode.h"
14 #include "xfs_trans.h"
15 #include "xfs_trans_priv.h"
16 #include "xfs_inode_item.h"
17 #include "xfs_quota.h"
18 #include "xfs_trace.h"
19 #include "xfs_icache.h"
20 #include "xfs_bmap_util.h"
21 #include "xfs_dquot_item.h"
22 #include "xfs_dquot.h"
23 #include "xfs_reflink.h"
24 #include "xfs_ialloc.h"
25 #include "xfs_ag.h"
26 #include "xfs_log_priv.h"
27 #include "xfs_health.h"
28 #include "xfs_da_format.h"
29 #include "xfs_dir2.h"
30 #include "xfs_metafile.h"
31
32 #include <linux/iversion.h>
33
34 /* Radix tree tags for incore inode tree. */
35
36 /* inode is to be reclaimed */
37 #define XFS_ICI_RECLAIM_TAG 0
38 /* Inode has speculative preallocations (posteof or cow) to clean. */
39 #define XFS_ICI_BLOCKGC_TAG 1
40
41 /*
42 * The goal for walking incore inodes. These can correspond with incore inode
43 * radix tree tags when convenient. Avoid existing XFS_IWALK namespace.
44 */
45 enum xfs_icwalk_goal {
46 /* Goals directly associated with tagged inodes. */
47 XFS_ICWALK_BLOCKGC = XFS_ICI_BLOCKGC_TAG,
48 XFS_ICWALK_RECLAIM = XFS_ICI_RECLAIM_TAG,
49 };
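/*
 * Because the goal values are numerically identical to the radix tree tags,
 * the walk code below can pass a goal directly to
 * radix_tree_gang_lookup_tag() as the lookup tag.
 */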
50
51 static int xfs_icwalk(struct xfs_mount *mp,
52 enum xfs_icwalk_goal goal, struct xfs_icwalk *icw);
53 static int xfs_icwalk_ag(struct xfs_perag *pag,
54 enum xfs_icwalk_goal goal, struct xfs_icwalk *icw);
55
56 /*
57 * Private inode cache walk flags for struct xfs_icwalk. Must not
58 * coincide with XFS_ICWALK_FLAGS_VALID.
59 */
60
61 /* Stop scanning after icw_scan_limit inodes. */
62 #define XFS_ICWALK_FLAG_SCAN_LIMIT (1U << 28)
63
64 #define XFS_ICWALK_FLAG_RECLAIM_SICK (1U << 27)
65 #define XFS_ICWALK_FLAG_UNION (1U << 26) /* union filter algorithm */
66
67 #define XFS_ICWALK_PRIVATE_FLAGS (XFS_ICWALK_FLAG_SCAN_LIMIT | \
68 XFS_ICWALK_FLAG_RECLAIM_SICK | \
69 XFS_ICWALK_FLAG_UNION)
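/*
 * The BUILD_BUG_ON() at the end of xfs_icwalk() checks at compile time that
 * these private bits never overlap the user-visible XFS_ICWALK_FLAGS_VALID
 * space.
 */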
70
71 /* Marks for the perag xarray */
72 #define XFS_PERAG_RECLAIM_MARK XA_MARK_0
73 #define XFS_PERAG_BLOCKGC_MARK XA_MARK_1
74
75 static inline xa_mark_t ici_tag_to_mark(unsigned int tag)
76 {
77 if (tag == XFS_ICI_RECLAIM_TAG)
78 return XFS_PERAG_RECLAIM_MARK;
79 ASSERT(tag == XFS_ICI_BLOCKGC_TAG);
80 return XFS_PERAG_BLOCKGC_MARK;
81 }
82
83 /*
84 * Allocate and initialise an xfs_inode.
85 */
86 struct xfs_inode *
87 xfs_inode_alloc(
88 struct xfs_mount *mp,
89 xfs_ino_t ino)
90 {
91 struct xfs_inode *ip;
92
93 /*
94 * XXX: If this didn't occur in transactions, we could drop GFP_NOFAIL
95 * and return NULL here on ENOMEM.
96 */
97 ip = alloc_inode_sb(mp->m_super, xfs_inode_cache, GFP_KERNEL | __GFP_NOFAIL);
98
99 if (inode_init_always(mp->m_super, VFS_I(ip))) {
100 kmem_cache_free(xfs_inode_cache, ip);
101 return NULL;
102 }
103
104 /* VFS doesn't initialise i_mode! */
105 VFS_I(ip)->i_mode = 0;
106 mapping_set_folio_min_order(VFS_I(ip)->i_mapping,
107 M_IGEO(mp)->min_folio_order);
108
109 XFS_STATS_INC(mp, vn_active);
110 ASSERT(atomic_read(&ip->i_pincount) == 0);
111 ASSERT(ip->i_ino == 0);
112
113 /* initialise the xfs inode */
114 ip->i_ino = ino;
115 ip->i_mount = mp;
116 memset(&ip->i_imap, 0, sizeof(struct xfs_imap));
117 ip->i_cowfp = NULL;
118 memset(&ip->i_af, 0, sizeof(ip->i_af));
119 ip->i_af.if_format = XFS_DINODE_FMT_EXTENTS;
120 memset(&ip->i_df, 0, sizeof(ip->i_df));
121 ip->i_flags = 0;
122 ip->i_delayed_blks = 0;
123 ip->i_diflags2 = mp->m_ino_geo.new_diflags2;
124 ip->i_nblocks = 0;
125 ip->i_forkoff = 0;
126 ip->i_sick = 0;
127 ip->i_checked = 0;
128 INIT_WORK(&ip->i_ioend_work, xfs_end_io);
129 INIT_LIST_HEAD(&ip->i_ioend_list);
130 spin_lock_init(&ip->i_ioend_lock);
131 ip->i_next_unlinked = NULLAGINO;
132 ip->i_prev_unlinked = 0;
133
134 return ip;
135 }
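/*
 * Note that nothing is read from disk here; callers such as
 * xfs_iget_cache_miss() below are responsible for filling in the on-disk
 * state and inserting the inode into the per-AG radix tree.
 */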
136
137 STATIC void
138 xfs_inode_free_callback(
139 struct rcu_head *head)
140 {
141 struct inode *inode = container_of(head, struct inode, i_rcu);
142 struct xfs_inode *ip = XFS_I(inode);
143
144 switch (VFS_I(ip)->i_mode & S_IFMT) {
145 case S_IFREG:
146 case S_IFDIR:
147 case S_IFLNK:
148 xfs_idestroy_fork(&ip->i_df);
149 break;
150 }
151
152 xfs_ifork_zap_attr(ip);
153
154 if (ip->i_cowfp) {
155 xfs_idestroy_fork(ip->i_cowfp);
156 kmem_cache_free(xfs_ifork_cache, ip->i_cowfp);
157 }
158 if (ip->i_itemp) {
159 ASSERT(!test_bit(XFS_LI_IN_AIL,
160 &ip->i_itemp->ili_item.li_flags));
161 xfs_inode_item_destroy(ip);
162 ip->i_itemp = NULL;
163 }
164
165 kmem_cache_free(xfs_inode_cache, ip);
166 }
167
168 static void
169 __xfs_inode_free(
170 struct xfs_inode *ip)
171 {
172 /* asserts to verify all state is correct here */
173 ASSERT(atomic_read(&ip->i_pincount) == 0);
174 ASSERT(!ip->i_itemp || list_empty(&ip->i_itemp->ili_item.li_bio_list));
175 XFS_STATS_DEC(ip->i_mount, vn_active);
176
177 call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
178 }
179
180 void
181 xfs_inode_free(
182 struct xfs_inode *ip)
183 {
184 ASSERT(!xfs_iflags_test(ip, XFS_IFLUSHING));
185
186 /*
187 * Because we use RCU freeing we need to ensure the inode always
188 * appears to be reclaimed with an invalid inode number when in the
189 * free state. The ip->i_flags_lock provides the barrier against lookup
190 * races.
191 */
192 spin_lock(&ip->i_flags_lock);
193 ip->i_flags = XFS_IRECLAIM;
194 ip->i_ino = 0;
195 spin_unlock(&ip->i_flags_lock);
196
197 __xfs_inode_free(ip);
198 }
199
200 /*
201 * Queue background inode reclaim work if there are reclaimable inodes and there
202 * isn't reclaim work already scheduled or in progress.
203 */
204 static void
205 xfs_reclaim_work_queue(
206 struct xfs_mount *mp)
207 {
208
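	/*
	 * The reclaim work runs at 1/6th of the xfs_syncd_centisecs period:
	 * (centisecs / 6) * 10 converts that fraction to milliseconds, i.e.
	 * every 5 seconds with the usual 30 second sync interval default.
	 */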
209 rcu_read_lock();
210 if (xfs_group_marked(mp, XG_TYPE_AG, XFS_PERAG_RECLAIM_MARK)) {
211 queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work,
212 msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
213 }
214 rcu_read_unlock();
215 }
216
217 /*
218 * Background scanning to trim preallocated space. This is queued based on the
219 * 'speculative_prealloc_lifetime' tunable (5m by default).
220 */
221 static inline void
222 xfs_blockgc_queue(
223 struct xfs_perag *pag)
224 {
225 struct xfs_mount *mp = pag_mount(pag);
226
227 if (!xfs_is_blockgc_enabled(mp))
228 return;
229
230 rcu_read_lock();
231 if (radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG))
232 queue_delayed_work(mp->m_blockgc_wq, &pag->pag_blockgc_work,
233 secs_to_jiffies(xfs_blockgc_secs));
234 rcu_read_unlock();
235 }
236
237 /* Set a tag on both the AG incore inode tree and the AG radix tree. */
238 static void
239 xfs_perag_set_inode_tag(
240 struct xfs_perag *pag,
241 xfs_agino_t agino,
242 unsigned int tag)
243 {
244 bool was_tagged;
245
246 lockdep_assert_held(&pag->pag_ici_lock);
247
248 was_tagged = radix_tree_tagged(&pag->pag_ici_root, tag);
249 radix_tree_tag_set(&pag->pag_ici_root, agino, tag);
250
251 if (tag == XFS_ICI_RECLAIM_TAG)
252 pag->pag_ici_reclaimable++;
253
254 if (was_tagged)
255 return;
256
257 /* propagate the tag up into the pag xarray tree */
258 xfs_group_set_mark(pag_group(pag), ici_tag_to_mark(tag));
259
260 /* start background work */
261 switch (tag) {
262 case XFS_ICI_RECLAIM_TAG:
263 xfs_reclaim_work_queue(pag_mount(pag));
264 break;
265 case XFS_ICI_BLOCKGC_TAG:
266 xfs_blockgc_queue(pag);
267 break;
268 }
269
270 trace_xfs_perag_set_inode_tag(pag, _RET_IP_);
271 }
272
273 /* Clear a tag on both the AG incore inode tree and the AG radix tree. */
274 static void
275 xfs_perag_clear_inode_tag(
276 struct xfs_perag *pag,
277 xfs_agino_t agino,
278 unsigned int tag)
279 {
280 lockdep_assert_held(&pag->pag_ici_lock);
281
282 /*
283 * Reclaim can signal (with a null agino) that it cleared its own tag
284 * by removing the inode from the radix tree.
285 */
286 if (agino != NULLAGINO)
287 radix_tree_tag_clear(&pag->pag_ici_root, agino, tag);
288 else
289 ASSERT(tag == XFS_ICI_RECLAIM_TAG);
290
291 if (tag == XFS_ICI_RECLAIM_TAG)
292 pag->pag_ici_reclaimable--;
293
294 if (radix_tree_tagged(&pag->pag_ici_root, tag))
295 return;
296
297 /* clear the tag from the pag xarray */
298 xfs_group_clear_mark(pag_group(pag), ici_tag_to_mark(tag));
299 trace_xfs_perag_clear_inode_tag(pag, _RET_IP_);
300 }
301
302 /*
303 * Find the next AG after @pag, or the first AG if @pag is NULL.
304 */
305 static struct xfs_perag *
306 xfs_perag_grab_next_tag(
307 struct xfs_mount *mp,
308 struct xfs_perag *pag,
309 int tag)
310 {
311 return to_perag(xfs_group_grab_next_mark(mp,
312 pag ? pag_group(pag) : NULL,
313 ici_tag_to_mark(tag), XG_TYPE_AG));
314 }
315
316 /*
317 * When we recycle a reclaimable inode, we need to re-initialise the VFS inode
318 * part of the structure. This is made more complex by the fact we store
319 * information about the on-disk values in the VFS inode and so we can't just
320 * overwrite the values unconditionally. Hence we save the parameters we
321 * need to retain across reinitialisation, and rewrite them into the VFS inode
322 * after reinitialisation even if it fails.
323 */
324 static int
325 xfs_reinit_inode(
326 struct xfs_mount *mp,
327 struct inode *inode)
328 {
329 int error;
330 uint32_t nlink = inode->i_nlink;
331 uint32_t generation = inode->i_generation;
332 uint64_t version = inode_peek_iversion(inode);
333 umode_t mode = inode->i_mode;
334 dev_t dev = inode->i_rdev;
335 kuid_t uid = inode->i_uid;
336 kgid_t gid = inode->i_gid;
337 unsigned long state = inode->i_state;
338
339 error = inode_init_always(mp->m_super, inode);
340
341 set_nlink(inode, nlink);
342 inode->i_generation = generation;
343 inode_set_iversion_queried(inode, version);
344 inode->i_mode = mode;
345 inode->i_rdev = dev;
346 inode->i_uid = uid;
347 inode->i_gid = gid;
348 inode->i_state = state;
349 mapping_set_folio_min_order(inode->i_mapping,
350 M_IGEO(mp)->min_folio_order);
351 return error;
352 }
353
354 /*
355 * Carefully nudge an inode whose VFS state has been torn down back into a
356 * usable state. Drops the i_flags_lock and the rcu read lock.
357 */
358 static int
359 xfs_iget_recycle(
360 struct xfs_perag *pag,
361 struct xfs_inode *ip) __releases(&ip->i_flags_lock)
362 {
363 struct xfs_mount *mp = ip->i_mount;
364 struct inode *inode = VFS_I(ip);
365 int error;
366
367 trace_xfs_iget_recycle(ip);
368
369 if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
370 return -EAGAIN;
371
372 /*
373 * We need to make it look like the inode is being reclaimed to prevent
374 * the actual reclaim workers from stomping over us while we recycle
375 * the inode. We can't clear the radix tree tag yet as it requires
376 * pag_ici_lock to be held exclusive.
377 */
378 ip->i_flags |= XFS_IRECLAIM;
379
380 spin_unlock(&ip->i_flags_lock);
381 rcu_read_unlock();
382
383 ASSERT(!rwsem_is_locked(&inode->i_rwsem));
384 error = xfs_reinit_inode(mp, inode);
385 xfs_iunlock(ip, XFS_ILOCK_EXCL);
386 if (error) {
387 /*
388 * Re-initializing the inode failed, and we are in deep
389 * trouble. Try to re-add it to the reclaim list.
390 */
391 rcu_read_lock();
392 spin_lock(&ip->i_flags_lock);
393 ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM);
394 ASSERT(ip->i_flags & XFS_IRECLAIMABLE);
395 spin_unlock(&ip->i_flags_lock);
396 rcu_read_unlock();
397
398 trace_xfs_iget_recycle_fail(ip);
399 return error;
400 }
401
402 spin_lock(&pag->pag_ici_lock);
403 spin_lock(&ip->i_flags_lock);
404
405 /*
406 * Clear the per-lifetime state in the inode as we are now effectively
407 * a new inode and need to return to the initial state before reuse
408 * occurs.
409 */
410 ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
411 ip->i_flags |= XFS_INEW;
412 xfs_perag_clear_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
413 XFS_ICI_RECLAIM_TAG);
414 inode->i_state = I_NEW;
415 spin_unlock(&ip->i_flags_lock);
416 spin_unlock(&pag->pag_ici_lock);
417
418 return 0;
419 }
420
421 /*
422 * If we are allocating a new inode, then check what was returned is
423 * actually a free, empty inode. If we are not allocating an inode,
424 * then check we didn't find a free inode.
425 *
426 * Returns:
427 * 0 if the inode free state matches the lookup context
428 * -ENOENT if the inode is free and we are not allocating
429 * -EFSCORRUPTED if there is any state mismatch at all
430 */
431 static int
432 xfs_iget_check_free_state(
433 struct xfs_inode *ip,
434 int flags)
435 {
436 if (flags & XFS_IGET_CREATE) {
437 /* should be a free inode */
438 if (VFS_I(ip)->i_mode != 0) {
439 xfs_warn(ip->i_mount,
440 "Corruption detected! Free inode 0x%llx not marked free! (mode 0x%x)",
441 ip->i_ino, VFS_I(ip)->i_mode);
442 xfs_agno_mark_sick(ip->i_mount,
443 XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
444 XFS_SICK_AG_INOBT);
445 return -EFSCORRUPTED;
446 }
447
448 if (ip->i_nblocks != 0) {
449 xfs_warn(ip->i_mount,
450 "Corruption detected! Free inode 0x%llx has blocks allocated!",
451 ip->i_ino);
452 xfs_agno_mark_sick(ip->i_mount,
453 XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
454 XFS_SICK_AG_INOBT);
455 return -EFSCORRUPTED;
456 }
457 return 0;
458 }
459
460 /* should be an allocated inode */
461 if (VFS_I(ip)->i_mode == 0)
462 return -ENOENT;
463
464 return 0;
465 }
466
467 /* Make all pending inactivation work start immediately. */
468 static bool
469 xfs_inodegc_queue_all(
470 struct xfs_mount *mp)
471 {
472 struct xfs_inodegc *gc;
473 int cpu;
474 bool ret = false;
475
476 for_each_cpu(cpu, &mp->m_inodegc_cpumask) {
477 gc = per_cpu_ptr(mp->m_inodegc, cpu);
478 if (!llist_empty(&gc->list)) {
479 mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0);
480 ret = true;
481 }
482 }
483
484 return ret;
485 }
486
487 /* Wait for all queued work and collect errors */
488 static int
489 xfs_inodegc_wait_all(
490 struct xfs_mount *mp)
491 {
492 int cpu;
493 int error = 0;
494
495 flush_workqueue(mp->m_inodegc_wq);
496 for_each_cpu(cpu, &mp->m_inodegc_cpumask) {
497 struct xfs_inodegc *gc;
498
499 gc = per_cpu_ptr(mp->m_inodegc, cpu);
500 if (gc->error && !error)
501 error = gc->error;
502 gc->error = 0;
503 }
504
505 return error;
506 }
507
508 /*
509 * Check the validity of the inode we just found in the cache
510 */
511 static int
512 xfs_iget_cache_hit(
513 struct xfs_perag *pag,
514 struct xfs_inode *ip,
515 xfs_ino_t ino,
516 int flags,
517 int lock_flags) __releases(RCU)
518 {
519 struct inode *inode = VFS_I(ip);
520 struct xfs_mount *mp = ip->i_mount;
521 int error;
522
523 /*
524 * check for re-use of an inode within an RCU grace period due to the
525 * radix tree nodes not being updated yet. We monitor for this by
526 * setting the inode number to zero before freeing the inode structure.
527 * If the inode has been reallocated and set up, then the inode number
528 * will not match, so check for that, too.
529 */
530 spin_lock(&ip->i_flags_lock);
531 if (ip->i_ino != ino)
532 goto out_skip;
533
534 /*
535 * If we are racing with another cache hit that is currently
536 * instantiating this inode or currently recycling it out of
537 * reclaimable state, wait for the initialisation to complete
538 * before continuing.
539 *
540 * If we're racing with the inactivation worker we also want to wait.
541 * If we're creating a new file, it's possible that the worker
542 * previously marked the inode as free on disk but hasn't finished
543 * updating the incore state yet. The AGI buffer will be dirty and
544 * locked to the icreate transaction, so a synchronous push of the
545 * inodegc workers would result in deadlock. For a regular iget, the
546 * worker is running already, so we might as well wait.
547 *
548 * XXX(hch): eventually we should do something equivalent to
549 * wait_on_inode to wait for these flags to be cleared
550 * instead of polling for it.
551 */
552 if (ip->i_flags & (XFS_INEW | XFS_IRECLAIM | XFS_INACTIVATING))
553 goto out_skip;
554
555 if (ip->i_flags & XFS_NEED_INACTIVE) {
556 /* Unlinked inodes cannot be re-grabbed. */
557 if (VFS_I(ip)->i_nlink == 0) {
558 error = -ENOENT;
559 goto out_error;
560 }
561 goto out_inodegc_flush;
562 }
563
564 /*
565 * Check the inode free state is valid. This also detects lookup
566 * racing with unlinks.
567 */
568 error = xfs_iget_check_free_state(ip, flags);
569 if (error)
570 goto out_error;
571
572 /* Skip inodes that have no vfs state. */
573 if ((flags & XFS_IGET_INCORE) &&
574 (ip->i_flags & XFS_IRECLAIMABLE))
575 goto out_skip;
576
577 /* The inode fits the selection criteria; process it. */
578 if (ip->i_flags & XFS_IRECLAIMABLE) {
579 /* Drops i_flags_lock and RCU read lock. */
580 error = xfs_iget_recycle(pag, ip);
581 if (error == -EAGAIN)
582 goto out_skip;
583 if (error)
584 return error;
585 } else {
586 /* If the VFS inode is being torn down, pause and try again. */
587 if (!igrab(inode))
588 goto out_skip;
589
590 /* We've got a live one. */
591 spin_unlock(&ip->i_flags_lock);
592 rcu_read_unlock();
593 trace_xfs_iget_hit(ip);
594 }
595
596 if (lock_flags != 0)
597 xfs_ilock(ip, lock_flags);
598
599 if (!(flags & XFS_IGET_INCORE))
600 xfs_iflags_clear(ip, XFS_ISTALE);
601 XFS_STATS_INC(mp, xs_ig_found);
602
603 return 0;
604
605 out_skip:
606 trace_xfs_iget_skip(ip);
607 XFS_STATS_INC(mp, xs_ig_frecycle);
608 error = -EAGAIN;
609 out_error:
610 spin_unlock(&ip->i_flags_lock);
611 rcu_read_unlock();
612 return error;
613
614 out_inodegc_flush:
615 spin_unlock(&ip->i_flags_lock);
616 rcu_read_unlock();
617 /*
618 * Do not wait for the workers, because the caller could hold an AGI
619 * buffer lock. We're just going to sleep in a loop anyway.
620 */
621 if (xfs_is_inodegc_enabled(mp))
622 xfs_inodegc_queue_all(mp);
623 return -EAGAIN;
624 }
625
626 static int
627 xfs_iget_cache_miss(
628 struct xfs_mount *mp,
629 struct xfs_perag *pag,
630 xfs_trans_t *tp,
631 xfs_ino_t ino,
632 struct xfs_inode **ipp,
633 int flags,
634 int lock_flags)
635 {
636 struct xfs_inode *ip;
637 int error;
638 xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino);
639
640 ip = xfs_inode_alloc(mp, ino);
641 if (!ip)
642 return -ENOMEM;
643
644 error = xfs_imap(pag, tp, ip->i_ino, &ip->i_imap, flags);
645 if (error)
646 goto out_destroy;
647
648 /*
649 * For version 5 superblocks, if we are initialising a new inode, we
650 * simply build the new inode core with a random generation number.
651 *
652 * For version 4 (and older) superblocks, log recovery is dependent on
653 * the i_flushiter field being initialised from the current on-disk
654 * value and hence we must also read the inode off disk even when
655 * initializing new inodes.
656 */
657 if (xfs_has_v3inodes(mp) && (flags & XFS_IGET_CREATE)) {
658 VFS_I(ip)->i_generation = get_random_u32();
659 } else {
660 struct xfs_buf *bp;
661
662 error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp);
663 if (error)
664 goto out_destroy;
665
666 error = xfs_inode_from_disk(ip,
667 xfs_buf_offset(bp, ip->i_imap.im_boffset));
668 if (!error)
669 xfs_buf_set_ref(bp, XFS_INO_REF);
670 else
671 xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
672 xfs_trans_brelse(tp, bp);
673
674 if (error)
675 goto out_destroy;
676 }
677
678 trace_xfs_iget_miss(ip);
679
680 /*
681 * Check the inode free state is valid. This also detects lookup
682 * racing with unlinks.
683 */
684 error = xfs_iget_check_free_state(ip, flags);
685 if (error)
686 goto out_destroy;
687
688 /*
689 * Preload the radix tree so we can insert safely under the
690 * write spinlock. Note that we cannot sleep inside the preload
691 * region.
692 */
693 if (radix_tree_preload(GFP_KERNEL | __GFP_NOLOCKDEP)) {
694 error = -EAGAIN;
695 goto out_destroy;
696 }
697
698 /*
699 * Because the inode hasn't been added to the radix-tree yet it can't
700 * be found by another thread, so we can do the non-sleeping lock here.
701 */
702 if (lock_flags) {
703 if (!xfs_ilock_nowait(ip, lock_flags))
704 BUG();
705 }
706
707 /*
708 * These values must be set before inserting the inode into the radix
709 * tree as the moment it is inserted a concurrent lookup (allowed by the
710 * RCU locking mechanism) can find it and that lookup must see that this
711 * is an inode currently under construction (i.e. that XFS_INEW is set).
712 * The ip->i_flags_lock that protects the XFS_INEW flag forms the
713 * memory barrier that ensures this detection works correctly at lookup
714 * time.
715 */
716 if (flags & XFS_IGET_DONTCACHE)
717 d_mark_dontcache(VFS_I(ip));
718 ip->i_udquot = NULL;
719 ip->i_gdquot = NULL;
720 ip->i_pdquot = NULL;
721 xfs_iflags_set(ip, XFS_INEW);
722
723 /* insert the new inode */
724 spin_lock(&pag->pag_ici_lock);
725 error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
726 if (unlikely(error)) {
727 WARN_ON(error != -EEXIST);
728 XFS_STATS_INC(mp, xs_ig_dup);
729 error = -EAGAIN;
730 goto out_preload_end;
731 }
732 spin_unlock(&pag->pag_ici_lock);
733 radix_tree_preload_end();
734
735 *ipp = ip;
736 return 0;
737
738 out_preload_end:
739 spin_unlock(&pag->pag_ici_lock);
740 radix_tree_preload_end();
741 if (lock_flags)
742 xfs_iunlock(ip, lock_flags);
743 out_destroy:
744 __destroy_inode(VFS_I(ip));
745 xfs_inode_free(ip);
746 return error;
747 }
748
749 /*
750 * Look up an inode by number in the given file system. The inode is looked up
751 * in the cache held in each AG. If the inode is found in the cache, initialise
752 * the vfs inode if necessary.
753 *
754 * If it is not in core, read it in from the file system's device, add it to the
755 * cache and initialise the vfs inode.
756 *
757 * The inode is locked according to the value of the lock_flags parameter.
758 * Inode lookup is only done during metadata operations and not as part of the
759 * data IO path. Hence we only allow locking of the XFS_ILOCK during lookup.
760 */
761 int
762 xfs_iget(
763 struct xfs_mount *mp,
764 struct xfs_trans *tp,
765 xfs_ino_t ino,
766 uint flags,
767 uint lock_flags,
768 struct xfs_inode **ipp)
769 {
770 struct xfs_inode *ip;
771 struct xfs_perag *pag;
772 xfs_agino_t agino;
773 int error;
774
775 ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0);
776
777 /* reject inode numbers outside existing AGs */
778 if (!xfs_verify_ino(mp, ino))
779 return -EINVAL;
780
781 XFS_STATS_INC(mp, xs_ig_attempts);
782
783 /* get the perag structure and ensure that it's inode capable */
784 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
785 agino = XFS_INO_TO_AGINO(mp, ino);
786
787 again:
788 error = 0;
789 rcu_read_lock();
790 ip = radix_tree_lookup(&pag->pag_ici_root, agino);
791
792 if (ip) {
793 error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
794 if (error)
795 goto out_error_or_again;
796 } else {
797 rcu_read_unlock();
798 if (flags & XFS_IGET_INCORE) {
799 error = -ENODATA;
800 goto out_error_or_again;
801 }
802 XFS_STATS_INC(mp, xs_ig_missed);
803
804 error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
805 flags, lock_flags);
806 if (error)
807 goto out_error_or_again;
808 }
809 xfs_perag_put(pag);
810
811 *ipp = ip;
812
813 /*
814 * If we have a real type for an on-disk inode, we can setup the inode
815 * now. If it's a new inode being created, xfs_init_new_inode will
816 * handle it.
817 */
818 if (xfs_iflags_test(ip, XFS_INEW) && VFS_I(ip)->i_mode != 0)
819 xfs_setup_existing_inode(ip);
820 return 0;
821
822 out_error_or_again:
823 if (!(flags & (XFS_IGET_INCORE | XFS_IGET_NORETRY)) &&
824 error == -EAGAIN) {
825 delay(1);
826 goto again;
827 }
828 xfs_perag_put(pag);
829 return error;
830 }
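/*
 * Illustrative caller pattern (a sketch, not code from this file): metadata
 * operations typically grab and lock the inode, do their work, then unlock
 * and release it:
 *
 *	error = xfs_iget(mp, tp, ino, 0, XFS_ILOCK_EXCL, &ip);
 *	if (error)
 *		return error;
 *	...
 *	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 *	xfs_irele(ip);
 */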
831
832 /*
833 * Get a metadata inode.
834 *
835 * The metafile type must match the file mode exactly, and for files in the
836 * metadata directory tree, it must match the inode's metatype exactly.
837 */
838 int
839 xfs_trans_metafile_iget(
840 struct xfs_trans *tp,
841 xfs_ino_t ino,
842 enum xfs_metafile_type metafile_type,
843 struct xfs_inode **ipp)
844 {
845 struct xfs_mount *mp = tp->t_mountp;
846 struct xfs_inode *ip;
847 umode_t mode;
848 int error;
849
850 error = xfs_iget(mp, tp, ino, 0, 0, &ip);
851 if (error == -EFSCORRUPTED || error == -EINVAL)
852 goto whine;
853 if (error)
854 return error;
855
856 if (VFS_I(ip)->i_nlink == 0)
857 goto bad_rele;
858
859 if (metafile_type == XFS_METAFILE_DIR)
860 mode = S_IFDIR;
861 else
862 mode = S_IFREG;
863 if (inode_wrong_type(VFS_I(ip), mode))
864 goto bad_rele;
865 if (xfs_has_metadir(mp)) {
866 if (!xfs_is_metadir_inode(ip))
867 goto bad_rele;
868 if (metafile_type != ip->i_metatype)
869 goto bad_rele;
870 }
871
872 *ipp = ip;
873 return 0;
874 bad_rele:
875 xfs_irele(ip);
876 whine:
877 xfs_err(mp, "metadata inode 0x%llx type %u is corrupt", ino,
878 metafile_type);
879 xfs_fs_mark_sick(mp, XFS_SICK_FS_METADIR);
880 return -EFSCORRUPTED;
881 }
882
883 /* Grab a metadata file if the caller doesn't already have a transaction. */
884 int
885 xfs_metafile_iget(
886 struct xfs_mount *mp,
887 xfs_ino_t ino,
888 enum xfs_metafile_type metafile_type,
889 struct xfs_inode **ipp)
890 {
891 struct xfs_trans *tp;
892 int error;
893
894 tp = xfs_trans_alloc_empty(mp);
895 error = xfs_trans_metafile_iget(tp, ino, metafile_type, ipp);
896 xfs_trans_cancel(tp);
897 return error;
898 }
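/*
 * Example (a sketch, not taken from this file): mount-time code can load a
 * metadata inode such as the realtime bitmap with something like
 *
 *	error = xfs_metafile_iget(mp, ino, XFS_METAFILE_RTBITMAP, &ip);
 *
 * where the metafile type must match how the inode was created.
 */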
899
900 /*
901 * Grab the inode for reclaim exclusively.
902 *
903 * We have found this inode via a lookup under RCU, so the inode may have
904 * already been freed, or it may be in the process of being recycled by
905 * xfs_iget(). In both cases, the inode will have XFS_IRECLAIM set. If the inode
906 * has been fully recycled by the time we get the i_flags_lock, XFS_IRECLAIMABLE
907 * will not be set. Hence we need to check for both these flag conditions to
908 * avoid inodes that are no longer reclaim candidates.
909 *
910 * Note: checking for other state flags here, under the i_flags_lock or not, is
911 * racy and should be avoided. Those races should be resolved only after we have
912 * ensured that we are able to reclaim this inode and the world can see that we
913 * are going to reclaim it.
914 *
915 * Return true if we grabbed it, false otherwise.
916 */
917 static bool
918 xfs_reclaim_igrab(
919 struct xfs_inode *ip,
920 struct xfs_icwalk *icw)
921 {
922 ASSERT(rcu_read_lock_held());
923
924 spin_lock(&ip->i_flags_lock);
925 if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
926 __xfs_iflags_test(ip, XFS_IRECLAIM)) {
927 /* not a reclaim candidate. */
928 spin_unlock(&ip->i_flags_lock);
929 return false;
930 }
931
932 /* Don't reclaim a sick inode unless the caller asked for it. */
933 if (ip->i_sick &&
934 (!icw || !(icw->icw_flags & XFS_ICWALK_FLAG_RECLAIM_SICK))) {
935 spin_unlock(&ip->i_flags_lock);
936 return false;
937 }
938
939 __xfs_iflags_set(ip, XFS_IRECLAIM);
940 spin_unlock(&ip->i_flags_lock);
941 return true;
942 }
943
944 /*
945 * Inode reclaim is non-blocking, so the default action if progress cannot be
946 * made is to "requeue" the inode for reclaim by unlocking it and clearing the
947 * XFS_IRECLAIM flag. If we are in a shutdown state, we don't care about
948 * blocking anymore and hence we can wait for the inode to be able to reclaim
949 * it.
950 *
951 * We do no IO here - if callers require inodes to be cleaned they must push the
952 * AIL first to trigger writeback of dirty inodes. This enables writeback to be
953 * done in the background in a non-blocking manner, and enables memory reclaim
954 * to make progress without blocking.
955 */
956 static void
957 xfs_reclaim_inode(
958 struct xfs_inode *ip,
959 struct xfs_perag *pag)
960 {
961 xfs_ino_t ino = ip->i_ino; /* for radix_tree_delete */
962
963 if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
964 goto out;
965 if (xfs_iflags_test_and_set(ip, XFS_IFLUSHING))
966 goto out_iunlock;
967
968 /*
969 * Check for log shutdown because aborting the inode can move the log
970 * tail and corrupt in memory state. This is fine if the log is shut
971 * down, but if the log is still active and only the mount is shut down
972 * then the in-memory log tail movement caused by the abort can be
973 * incorrectly propagated to disk.
974 */
975 if (xlog_is_shutdown(ip->i_mount->m_log)) {
976 xfs_iunpin_wait(ip);
977 /*
978 * Avoid a ABBA deadlock on the inode cluster buffer vs
979 * concurrent xfs_ifree_cluster() trying to mark the inode
980 * stale. We don't need the inode locked to run the flush abort
981 * code, but the flush abort needs to lock the cluster buffer.
982 */
983 xfs_iunlock(ip, XFS_ILOCK_EXCL);
984 xfs_iflush_shutdown_abort(ip);
985 xfs_ilock(ip, XFS_ILOCK_EXCL);
986 goto reclaim;
987 }
988 if (xfs_ipincount(ip))
989 goto out_clear_flush;
990 if (!xfs_inode_clean(ip))
991 goto out_clear_flush;
992
993 xfs_iflags_clear(ip, XFS_IFLUSHING);
994 reclaim:
995 trace_xfs_inode_reclaiming(ip);
996
997 /*
998 * Because we use RCU freeing we need to ensure the inode always appears
999 * to be reclaimed with an invalid inode number when in the free state.
1000 * We do this as early as possible under the ILOCK so that
1001 * xfs_iflush_cluster() and xfs_ifree_cluster() can be guaranteed to
1002 * detect races with us here. By doing this, we guarantee that once
1003 * xfs_iflush_cluster() or xfs_ifree_cluster() has locked XFS_ILOCK that
1004 * it will see either a valid inode that will serialise correctly, or it
1005 * will see an invalid inode that it can skip.
1006 */
1007 spin_lock(&ip->i_flags_lock);
1008 ip->i_flags = XFS_IRECLAIM;
1009 ip->i_ino = 0;
1010 ip->i_sick = 0;
1011 ip->i_checked = 0;
1012 spin_unlock(&ip->i_flags_lock);
1013
1014 ASSERT(!ip->i_itemp || ip->i_itemp->ili_item.li_buf == NULL);
1015 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1016
1017 XFS_STATS_INC(ip->i_mount, xs_ig_reclaims);
1018 /*
1019 * Remove the inode from the per-AG radix tree.
1020 *
1021 * Because radix_tree_delete won't complain even if the item was never
1022 * added to the tree, assert that it's been there before to catch
1023 * problems with the inode lifetime early on.
1024 */
1025 spin_lock(&pag->pag_ici_lock);
1026 if (!radix_tree_delete(&pag->pag_ici_root,
1027 XFS_INO_TO_AGINO(ip->i_mount, ino)))
1028 ASSERT(0);
1029 xfs_perag_clear_inode_tag(pag, NULLAGINO, XFS_ICI_RECLAIM_TAG);
1030 spin_unlock(&pag->pag_ici_lock);
1031
1032 /*
1033 * Here we do an (almost) spurious inode lock in order to coordinate
1034 * with inode cache radix tree lookups. This is because the lookup
1035 * can reference the inodes in the cache without taking references.
1036 *
1037 * We make that OK here by ensuring that we wait until the inode is
1038 * unlocked after the lookup before we go ahead and free it.
1039 */
1040 xfs_ilock(ip, XFS_ILOCK_EXCL);
1041 ASSERT(!ip->i_udquot && !ip->i_gdquot && !ip->i_pdquot);
1042 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1043 ASSERT(xfs_inode_clean(ip));
1044
1045 __xfs_inode_free(ip);
1046 return;
1047
1048 out_clear_flush:
1049 xfs_iflags_clear(ip, XFS_IFLUSHING);
1050 out_iunlock:
1051 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1052 out:
1053 xfs_iflags_clear(ip, XFS_IRECLAIM);
1054 }
1055
1056 /* Reclaim sick inodes if we're unmounting or the fs went down. */
1057 static inline bool
1058 xfs_want_reclaim_sick(
1059 struct xfs_mount *mp)
1060 {
1061 return xfs_is_unmounting(mp) || xfs_has_norecovery(mp) ||
1062 xfs_is_shutdown(mp);
1063 }
1064
1065 void
1066 xfs_reclaim_inodes(
1067 struct xfs_mount *mp)
1068 {
1069 struct xfs_icwalk icw = {
1070 .icw_flags = 0,
1071 };
1072
1073 if (xfs_want_reclaim_sick(mp))
1074 icw.icw_flags |= XFS_ICWALK_FLAG_RECLAIM_SICK;
1075
1076 while (xfs_group_marked(mp, XG_TYPE_AG, XFS_PERAG_RECLAIM_MARK)) {
1077 xfs_ail_push_all_sync(mp->m_ail);
1078 xfs_icwalk(mp, XFS_ICWALK_RECLAIM, &icw);
1079 }
1080 }
1081
1082 /*
1083 * The shrinker infrastructure determines how many inodes we should scan for
1084 * reclaim. We want as many clean inodes ready to reclaim as possible, so we
1085 * push the AIL here. We also want to proactively free up memory if we can to
1086 * minimise the amount of work memory reclaim has to do so we kick the
1087 * background reclaim if it isn't already scheduled.
1088 */
1089 long
1090 xfs_reclaim_inodes_nr(
1091 struct xfs_mount *mp,
1092 unsigned long nr_to_scan)
1093 {
1094 struct xfs_icwalk icw = {
1095 .icw_flags = XFS_ICWALK_FLAG_SCAN_LIMIT,
1096 .icw_scan_limit = min_t(unsigned long, LONG_MAX, nr_to_scan),
1097 };
1098
1099 if (xfs_want_reclaim_sick(mp))
1100 icw.icw_flags |= XFS_ICWALK_FLAG_RECLAIM_SICK;
1101
1102 /* kick background reclaimer and push the AIL */
1103 xfs_reclaim_work_queue(mp);
1104 xfs_ail_push_all(mp->m_ail);
1105
1106 xfs_icwalk(mp, XFS_ICWALK_RECLAIM, &icw);
1107 return 0;
1108 }
1109
1110 /*
1111 * Return the number of reclaimable inodes in the filesystem for
1112 * the shrinker to determine how much to reclaim.
1113 */
1114 long
1115 xfs_reclaim_inodes_count(
1116 struct xfs_mount *mp)
1117 {
1118 XA_STATE (xas, &mp->m_groups[XG_TYPE_AG].xa, 0);
1119 long reclaimable = 0;
1120 struct xfs_perag *pag;
1121
1122 rcu_read_lock();
1123 xas_for_each_marked(&xas, pag, ULONG_MAX, XFS_PERAG_RECLAIM_MARK) {
1124 trace_xfs_reclaim_inodes_count(pag, _THIS_IP_);
1125 reclaimable += pag->pag_ici_reclaimable;
1126 }
1127 rcu_read_unlock();
1128
1129 return reclaimable;
1130 }
1131
1132 STATIC bool
1133 xfs_icwalk_match_id(
1134 struct xfs_inode *ip,
1135 struct xfs_icwalk *icw)
1136 {
1137 if ((icw->icw_flags & XFS_ICWALK_FLAG_UID) &&
1138 !uid_eq(VFS_I(ip)->i_uid, icw->icw_uid))
1139 return false;
1140
1141 if ((icw->icw_flags & XFS_ICWALK_FLAG_GID) &&
1142 !gid_eq(VFS_I(ip)->i_gid, icw->icw_gid))
1143 return false;
1144
1145 if ((icw->icw_flags & XFS_ICWALK_FLAG_PRID) &&
1146 ip->i_projid != icw->icw_prid)
1147 return false;
1148
1149 return true;
1150 }
1151
1152 /*
1153 * A union-based inode filtering algorithm. Process the inode if any of the
1154 * criteria match. This is for global/internal scans only.
1155 */
1156 STATIC bool
1157 xfs_icwalk_match_id_union(
1158 struct xfs_inode *ip,
1159 struct xfs_icwalk *icw)
1160 {
1161 if ((icw->icw_flags & XFS_ICWALK_FLAG_UID) &&
1162 uid_eq(VFS_I(ip)->i_uid, icw->icw_uid))
1163 return true;
1164
1165 if ((icw->icw_flags & XFS_ICWALK_FLAG_GID) &&
1166 gid_eq(VFS_I(ip)->i_gid, icw->icw_gid))
1167 return true;
1168
1169 if ((icw->icw_flags & XFS_ICWALK_FLAG_PRID) &&
1170 ip->i_projid == icw->icw_prid)
1171 return true;
1172
1173 return false;
1174 }
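/*
 * Example: a scan that sets both XFS_ICWALK_FLAG_UID and XFS_ICWALK_FLAG_PRID
 * matches inodes owned by that uid *or* belonging to that project here,
 * whereas xfs_icwalk_match_id() above requires every supplied id to match.
 */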
1175
1176 /*
1177 * Is this inode @ip eligible for eof/cow block reclamation, given some
1178 * filtering parameters @icw? The inode is eligible if @icw is null or
1179 * if the predicate functions match.
1180 */
1181 static bool
1182 xfs_icwalk_match(
1183 struct xfs_inode *ip,
1184 struct xfs_icwalk *icw)
1185 {
1186 bool match;
1187
1188 if (!icw)
1189 return true;
1190
1191 if (icw->icw_flags & XFS_ICWALK_FLAG_UNION)
1192 match = xfs_icwalk_match_id_union(ip, icw);
1193 else
1194 match = xfs_icwalk_match_id(ip, icw);
1195 if (!match)
1196 return false;
1197
1198 /* skip the inode if the file size is too small */
1199 if ((icw->icw_flags & XFS_ICWALK_FLAG_MINFILESIZE) &&
1200 XFS_ISIZE(ip) < icw->icw_min_file_size)
1201 return false;
1202
1203 return true;
1204 }
1205
1206 /*
1207 * This is a fast pass over the inode cache to try to get reclaim moving on as
1208 * many inodes as possible in a short period of time. It kicks itself every few
1209 * seconds, as well as being kicked by the inode cache shrinker when memory
1210 * goes low.
1211 */
1212 void
1213 xfs_reclaim_worker(
1214 struct work_struct *work)
1215 {
1216 struct xfs_mount *mp = container_of(to_delayed_work(work),
1217 struct xfs_mount, m_reclaim_work);
1218
1219 xfs_icwalk(mp, XFS_ICWALK_RECLAIM, NULL);
1220 xfs_reclaim_work_queue(mp);
1221 }
1222
1223 STATIC int
1224 xfs_inode_free_eofblocks(
1225 struct xfs_inode *ip,
1226 struct xfs_icwalk *icw,
1227 unsigned int *lockflags)
1228 {
1229 bool wait;
1230
1231 wait = icw && (icw->icw_flags & XFS_ICWALK_FLAG_SYNC);
1232
1233 if (!xfs_iflags_test(ip, XFS_IEOFBLOCKS))
1234 return 0;
1235
1236 /*
1237 * If the mapping is dirty the operation can block and wait for some
1238 * time. Unless we are waiting, skip it.
1239 */
1240 if (!wait && mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY))
1241 return 0;
1242
1243 if (!xfs_icwalk_match(ip, icw))
1244 return 0;
1245
1246 /*
1247 * If the caller is waiting, return -EAGAIN to keep the background
1248 * scanner moving and revisit the inode in a subsequent pass.
1249 */
1250 if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
1251 if (wait)
1252 return -EAGAIN;
1253 return 0;
1254 }
1255 *lockflags |= XFS_IOLOCK_EXCL;
1256
1257 if (xfs_can_free_eofblocks(ip))
1258 return xfs_free_eofblocks(ip);
1259
1260 /* inode could be preallocated */
1261 trace_xfs_inode_free_eofblocks_invalid(ip);
1262 xfs_inode_clear_eofblocks_tag(ip);
1263 return 0;
1264 }
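/*
 * Note: *lockflags accumulates the locks taken above so that
 * xfs_blockgc_scan_inode() can drop them in one xfs_iunlock() call after the
 * cowblocks pass has also run.
 */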
1265
1266 static void
1267 xfs_blockgc_set_iflag(
1268 struct xfs_inode *ip,
1269 unsigned long iflag)
1270 {
1271 struct xfs_mount *mp = ip->i_mount;
1272 struct xfs_perag *pag;
1273
1274 ASSERT((iflag & ~(XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0);
1275
1276 /*
1277 * Don't bother locking the AG and looking up in the radix trees
1278 * if we already know that we have the tag set.
1279 */
1280 if (ip->i_flags & iflag)
1281 return;
1282 spin_lock(&ip->i_flags_lock);
1283 ip->i_flags |= iflag;
1284 spin_unlock(&ip->i_flags_lock);
1285
1286 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
1287 spin_lock(&pag->pag_ici_lock);
1288
1289 xfs_perag_set_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
1290 XFS_ICI_BLOCKGC_TAG);
1291
1292 spin_unlock(&pag->pag_ici_lock);
1293 xfs_perag_put(pag);
1294 }
1295
1296 void
1297 xfs_inode_set_eofblocks_tag(
1298 xfs_inode_t *ip)
1299 {
1300 trace_xfs_inode_set_eofblocks_tag(ip);
1301 return xfs_blockgc_set_iflag(ip, XFS_IEOFBLOCKS);
1302 }
1303
1304 static void
1305 xfs_blockgc_clear_iflag(
1306 struct xfs_inode *ip,
1307 unsigned long iflag)
1308 {
1309 struct xfs_mount *mp = ip->i_mount;
1310 struct xfs_perag *pag;
1311 bool clear_tag;
1312
1313 ASSERT((iflag & ~(XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0);
1314
1315 spin_lock(&ip->i_flags_lock);
1316 ip->i_flags &= ~iflag;
1317 clear_tag = (ip->i_flags & (XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0;
1318 spin_unlock(&ip->i_flags_lock);
1319
1320 if (!clear_tag)
1321 return;
1322
1323 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
1324 spin_lock(&pag->pag_ici_lock);
1325
1326 xfs_perag_clear_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
1327 XFS_ICI_BLOCKGC_TAG);
1328
1329 spin_unlock(&pag->pag_ici_lock);
1330 xfs_perag_put(pag);
1331 }
1332
1333 void
1334 xfs_inode_clear_eofblocks_tag(
1335 xfs_inode_t *ip)
1336 {
1337 trace_xfs_inode_clear_eofblocks_tag(ip);
1338 return xfs_blockgc_clear_iflag(ip, XFS_IEOFBLOCKS);
1339 }
1340
1341 /*
1342 * Prepare to free COW fork blocks from an inode.
1343 */
1344 static bool
1345 xfs_prep_free_cowblocks(
1346 struct xfs_inode *ip,
1347 struct xfs_icwalk *icw)
1348 {
1349 bool sync;
1350
1351 sync = icw && (icw->icw_flags & XFS_ICWALK_FLAG_SYNC);
1352
1353 /*
1354 * Just clear the tag if we have an empty cow fork or none at all. It's
1355 * possible the inode was fully unshared since it was originally tagged.
1356 */
1357 if (!xfs_inode_has_cow_data(ip)) {
1358 trace_xfs_inode_free_cowblocks_invalid(ip);
1359 xfs_inode_clear_cowblocks_tag(ip);
1360 return false;
1361 }
1362
1363 /*
1364 * A cowblocks trim of an inode can have a significant effect on
1365 * fragmentation even when a reasonable COW extent size hint is set.
1366 * Therefore, we prefer to not process cowblocks unless they are clean
1367 * and idle. We can never process a cowblocks inode that is dirty or has
1368 * in-flight I/O under any circumstances, because outstanding writeback
1369 * or dio expects targeted COW fork blocks exist through write
1370 * completion where they can be remapped into the data fork.
1371 *
1372 * Therefore, the heuristic used here is to never process inodes
1373 * currently opened for write from background (i.e. non-sync) scans. For
1374 * sync scans, use the pagecache/dio state of the inode to ensure we
1375 * never free COW fork blocks out from under pending I/O.
1376 */
1377 if (!sync && inode_is_open_for_write(VFS_I(ip)))
1378 return false;
1379 return xfs_can_free_cowblocks(ip);
1380 }
1381
1382 /*
1383 * Automatic CoW Reservation Freeing
1384 *
1385 * These functions automatically garbage collect leftover CoW reservations
1386 * that were made on behalf of a cowextsize hint when we start to run out
1387 * of quota or when the reservations sit around for too long. If the file
1388 * has dirty pages or is undergoing writeback, its CoW reservations will
1389 * be retained.
1390 *
1391 * The actual garbage collection piggybacks off the same code that runs
1392 * the speculative EOF preallocation garbage collector.
1393 */
1394 STATIC int
1395 xfs_inode_free_cowblocks(
1396 struct xfs_inode *ip,
1397 struct xfs_icwalk *icw,
1398 unsigned int *lockflags)
1399 {
1400 bool wait;
1401 int ret = 0;
1402
1403 wait = icw && (icw->icw_flags & XFS_ICWALK_FLAG_SYNC);
1404
1405 if (!xfs_iflags_test(ip, XFS_ICOWBLOCKS))
1406 return 0;
1407
1408 if (!xfs_prep_free_cowblocks(ip, icw))
1409 return 0;
1410
1411 if (!xfs_icwalk_match(ip, icw))
1412 return 0;
1413
1414 /*
1415 * If the caller is waiting, return -EAGAIN to keep the background
1416 * scanner moving and revisit the inode in a subsequent pass.
1417 */
1418 if (!(*lockflags & XFS_IOLOCK_EXCL) &&
1419 !xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
1420 if (wait)
1421 return -EAGAIN;
1422 return 0;
1423 }
1424 *lockflags |= XFS_IOLOCK_EXCL;
1425
1426 if (!xfs_ilock_nowait(ip, XFS_MMAPLOCK_EXCL)) {
1427 if (wait)
1428 return -EAGAIN;
1429 return 0;
1430 }
1431 *lockflags |= XFS_MMAPLOCK_EXCL;
1432
1433 /*
1434 * Check again, nobody else should be able to dirty blocks or change
1435 * the reflink iflag now that we have the first two locks held.
1436 */
1437 if (xfs_prep_free_cowblocks(ip, icw))
1438 ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, false);
1439 return ret;
1440 }
1441
1442 void
1443 xfs_inode_set_cowblocks_tag(
1444 xfs_inode_t *ip)
1445 {
1446 trace_xfs_inode_set_cowblocks_tag(ip);
1447 return xfs_blockgc_set_iflag(ip, XFS_ICOWBLOCKS);
1448 }
1449
1450 void
1451 xfs_inode_clear_cowblocks_tag(
1452 xfs_inode_t *ip)
1453 {
1454 trace_xfs_inode_clear_cowblocks_tag(ip);
1455 return xfs_blockgc_clear_iflag(ip, XFS_ICOWBLOCKS);
1456 }
1457
1458 /* Disable post-EOF and CoW block auto-reclamation. */
1459 void
1460 xfs_blockgc_stop(
1461 struct xfs_mount *mp)
1462 {
1463 struct xfs_perag *pag = NULL;
1464
1465 if (!xfs_clear_blockgc_enabled(mp))
1466 return;
1467
1468 while ((pag = xfs_perag_next(mp, pag)))
1469 cancel_delayed_work_sync(&pag->pag_blockgc_work);
1470 trace_xfs_blockgc_stop(mp, __return_address);
1471 }
1472
1473 /* Enable post-EOF and CoW block auto-reclamation. */
1474 void
1475 xfs_blockgc_start(
1476 struct xfs_mount *mp)
1477 {
1478 struct xfs_perag *pag = NULL;
1479
1480 if (xfs_set_blockgc_enabled(mp))
1481 return;
1482
1483 trace_xfs_blockgc_start(mp, __return_address);
1484 while ((pag = xfs_perag_grab_next_tag(mp, pag, XFS_ICI_BLOCKGC_TAG)))
1485 xfs_blockgc_queue(pag);
1486 }
1487
1488 /* Don't try to run block gc on an inode that's in any of these states. */
1489 #define XFS_BLOCKGC_NOGRAB_IFLAGS (XFS_INEW | \
1490 XFS_NEED_INACTIVE | \
1491 XFS_INACTIVATING | \
1492 XFS_IRECLAIMABLE | \
1493 XFS_IRECLAIM)
1494 /*
1495 * Decide if the given @ip is eligible for garbage collection of speculative
1496 * preallocations, and grab it if so. Returns true if it's ready to go or
1497 * false if we should just ignore it.
1498 */
1499 static bool
1500 xfs_blockgc_igrab(
1501 struct xfs_inode *ip)
1502 {
1503 struct inode *inode = VFS_I(ip);
1504
1505 ASSERT(rcu_read_lock_held());
1506
1507 /* Check for stale RCU freed inode */
1508 spin_lock(&ip->i_flags_lock);
1509 if (!ip->i_ino)
1510 goto out_unlock_noent;
1511
1512 if (ip->i_flags & XFS_BLOCKGC_NOGRAB_IFLAGS)
1513 goto out_unlock_noent;
1514 spin_unlock(&ip->i_flags_lock);
1515
1516 /* nothing to sync during shutdown */
1517 if (xfs_is_shutdown(ip->i_mount))
1518 return false;
1519
1520 /* If we can't grab the inode, it must be on its way to reclaim. */
1521 if (!igrab(inode))
1522 return false;
1523
1524 /* inode is valid */
1525 return true;
1526
1527 out_unlock_noent:
1528 spin_unlock(&ip->i_flags_lock);
1529 return false;
1530 }
1531
1532 /* Scan one incore inode for block preallocations that we can remove. */
1533 static int
1534 xfs_blockgc_scan_inode(
1535 struct xfs_inode *ip,
1536 struct xfs_icwalk *icw)
1537 {
1538 unsigned int lockflags = 0;
1539 int error;
1540
1541 error = xfs_inode_free_eofblocks(ip, icw, &lockflags);
1542 if (error)
1543 goto unlock;
1544
1545 error = xfs_inode_free_cowblocks(ip, icw, &lockflags);
1546 unlock:
1547 if (lockflags)
1548 xfs_iunlock(ip, lockflags);
1549 xfs_irele(ip);
1550 return error;
1551 }
1552
1553 /* Background worker that trims preallocated space. */
1554 void
1555 xfs_blockgc_worker(
1556 struct work_struct *work)
1557 {
1558 struct xfs_perag *pag = container_of(to_delayed_work(work),
1559 struct xfs_perag, pag_blockgc_work);
1560 struct xfs_mount *mp = pag_mount(pag);
1561 int error;
1562
1563 trace_xfs_blockgc_worker(mp, __return_address);
1564
1565 error = xfs_icwalk_ag(pag, XFS_ICWALK_BLOCKGC, NULL);
1566 if (error)
1567 xfs_info(mp, "AG %u preallocation gc worker failed, err=%d",
1568 pag_agno(pag), error);
1569 xfs_blockgc_queue(pag);
1570 }
1571
1572 /*
1573 * Try to free space in the filesystem by purging inactive inodes, eofblocks
1574 * and cowblocks.
1575 */
1576 int
1577 xfs_blockgc_free_space(
1578 struct xfs_mount *mp,
1579 struct xfs_icwalk *icw)
1580 {
1581 int error;
1582
1583 trace_xfs_blockgc_free_space(mp, icw, _RET_IP_);
1584
1585 error = xfs_icwalk(mp, XFS_ICWALK_BLOCKGC, icw);
1586 if (error)
1587 return error;
1588
1589 return xfs_inodegc_flush(mp);
1590 }
1591
1592 /*
1593 * Reclaim all the free space that we can by scheduling the background blockgc
1594 * and inodegc workers immediately and waiting for them all to clear.
1595 */
1596 int
1597 xfs_blockgc_flush_all(
1598 struct xfs_mount *mp)
1599 {
1600 struct xfs_perag *pag = NULL;
1601
1602 trace_xfs_blockgc_flush_all(mp, __return_address);
1603
1604 /*
1605 * For each blockgc worker, move its queue time up to now. If it wasn't
1606 * queued, it will not be requeued. Then flush whatever is left.
1607 */
1608 while ((pag = xfs_perag_grab_next_tag(mp, pag, XFS_ICI_BLOCKGC_TAG)))
1609 mod_delayed_work(mp->m_blockgc_wq, &pag->pag_blockgc_work, 0);
1610
1611 while ((pag = xfs_perag_grab_next_tag(mp, pag, XFS_ICI_BLOCKGC_TAG)))
1612 flush_delayed_work(&pag->pag_blockgc_work);
1613
1614 return xfs_inodegc_flush(mp);
1615 }
1616
1617 /*
1618 * Run cow/eofblocks scans on the supplied dquots. We don't know exactly which
1619 * quota caused an allocation failure, so we make a best effort by including
1620 * each quota under low free space conditions (less than 1% free space) in the
1621 * scan.
1622 *
1623 * Callers must not hold any inode's ILOCK. If requesting a synchronous scan
1624 * (XFS_ICWALK_FLAG_SYNC), the caller also must not hold any inode's IOLOCK or
1625 * MMAPLOCK.
1626 */
1627 int
1628 xfs_blockgc_free_dquots(
1629 struct xfs_mount *mp,
1630 struct xfs_dquot *udqp,
1631 struct xfs_dquot *gdqp,
1632 struct xfs_dquot *pdqp,
1633 unsigned int iwalk_flags)
1634 {
1635 struct xfs_icwalk icw = {0};
1636 bool do_work = false;
1637
1638 if (!udqp && !gdqp && !pdqp)
1639 return 0;
1640
1641 /*
1642 * Run a scan to free blocks using the union filter to cover all
1643 * applicable quotas in a single scan.
1644 */
1645 icw.icw_flags = XFS_ICWALK_FLAG_UNION | iwalk_flags;
1646
1647 if (XFS_IS_UQUOTA_ENFORCED(mp) && udqp && xfs_dquot_lowsp(udqp)) {
1648 icw.icw_uid = make_kuid(mp->m_super->s_user_ns, udqp->q_id);
1649 icw.icw_flags |= XFS_ICWALK_FLAG_UID;
1650 do_work = true;
1651 }
1652
1653 if (XFS_IS_GQUOTA_ENFORCED(mp) && gdqp && xfs_dquot_lowsp(gdqp)) {
1654 icw.icw_gid = make_kgid(mp->m_super->s_user_ns, gdqp->q_id);
1655 icw.icw_flags |= XFS_ICWALK_FLAG_GID;
1656 do_work = true;
1657 }
1658
1659 if (XFS_IS_PQUOTA_ENFORCED(mp) && pdqp && xfs_dquot_lowsp(pdqp)) {
1660 icw.icw_prid = pdqp->q_id;
1661 icw.icw_flags |= XFS_ICWALK_FLAG_PRID;
1662 do_work = true;
1663 }
1664
1665 if (!do_work)
1666 return 0;
1667
1668 return xfs_blockgc_free_space(mp, &icw);
1669 }
1670
1671 /* Run cow/eofblocks scans on the quotas attached to the inode. */
1672 int
1673 xfs_blockgc_free_quota(
1674 struct xfs_inode *ip,
1675 unsigned int iwalk_flags)
1676 {
1677 return xfs_blockgc_free_dquots(ip->i_mount,
1678 xfs_inode_dquot(ip, XFS_DQTYPE_USER),
1679 xfs_inode_dquot(ip, XFS_DQTYPE_GROUP),
1680 xfs_inode_dquot(ip, XFS_DQTYPE_PROJ), iwalk_flags);
1681 }
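/*
 * Typical caller pattern (a sketch, not code from this file): buffered write
 * paths that hit EDQUOT or ENOSPC run one of these scans and then retry the
 * allocation once, e.g.
 *
 *	error = xfs_blockgc_free_quota(ip, XFS_ICWALK_FLAG_SYNC);
 *	if (!error)
 *		goto retry;
 */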
1682
1683 /* XFS Inode Cache Walking Code */
1684
1685 /*
1686 * The inode lookup is done in batches to keep the amount of lock traffic and
1687 * radix tree lookups to a minimum. The batch size is a trade-off between
1688 * lookup reduction and stack usage. This is in the reclaim path, so we can't
1689 * be too greedy.
1690 */
1691 #define XFS_LOOKUP_BATCH 32
1692
1693
1694 /*
1695 * Decide if we want to grab this inode in anticipation of doing work towards
1696 * the goal.
1697 */
1698 static inline bool
1699 xfs_icwalk_igrab(
1700 enum xfs_icwalk_goal goal,
1701 struct xfs_inode *ip,
1702 struct xfs_icwalk *icw)
1703 {
1704 switch (goal) {
1705 case XFS_ICWALK_BLOCKGC:
1706 return xfs_blockgc_igrab(ip);
1707 case XFS_ICWALK_RECLAIM:
1708 return xfs_reclaim_igrab(ip, icw);
1709 default:
1710 return false;
1711 }
1712 }
1713
1714 /*
1715 * Process an inode. Each processing function must handle any state changes
1716 * made by the icwalk igrab function. Return -EAGAIN to skip an inode.
1717 */
1718 static inline int
1719 xfs_icwalk_process_inode(
1720 enum xfs_icwalk_goal goal,
1721 struct xfs_inode *ip,
1722 struct xfs_perag *pag,
1723 struct xfs_icwalk *icw)
1724 {
1725 int error = 0;
1726
1727 switch (goal) {
1728 case XFS_ICWALK_BLOCKGC:
1729 error = xfs_blockgc_scan_inode(ip, icw);
1730 break;
1731 case XFS_ICWALK_RECLAIM:
1732 xfs_reclaim_inode(ip, pag);
1733 break;
1734 }
1735 return error;
1736 }
1737
1738 /*
1739 * For a given per-AG structure @pag and a goal, grab qualifying inodes and
1740 * process them in some manner.
1741 */
1742 static int
1743 xfs_icwalk_ag(
1744 struct xfs_perag *pag,
1745 enum xfs_icwalk_goal goal,
1746 struct xfs_icwalk *icw)
1747 {
1748 struct xfs_mount *mp = pag_mount(pag);
1749 uint32_t first_index;
1750 int last_error = 0;
1751 int skipped;
1752 bool done;
1753 int nr_found;
1754
1755 restart:
1756 done = false;
1757 skipped = 0;
1758 if (goal == XFS_ICWALK_RECLAIM)
1759 first_index = READ_ONCE(pag->pag_ici_reclaim_cursor);
1760 else
1761 first_index = 0;
1762 nr_found = 0;
1763 do {
1764 struct xfs_inode *batch[XFS_LOOKUP_BATCH];
1765 int error = 0;
1766 int i;
1767
1768 rcu_read_lock();
1769
1770 nr_found = radix_tree_gang_lookup_tag(&pag->pag_ici_root,
1771 (void **) batch, first_index,
1772 XFS_LOOKUP_BATCH, goal);
1773 if (!nr_found) {
1774 done = true;
1775 rcu_read_unlock();
1776 break;
1777 }
1778
1779 /*
1780 * Grab the inodes before we drop the lock. If we found
1781 * nothing, nr_found == 0 and the loop will be skipped.
1782 */
1783 for (i = 0; i < nr_found; i++) {
1784 struct xfs_inode *ip = batch[i];
1785
1786 if (done || !xfs_icwalk_igrab(goal, ip, icw))
1787 batch[i] = NULL;
1788
1789 /*
1790 * Update the index for the next lookup. Catch
1791 * overflows into the next AG range which can occur if
1792 * we have inodes in the last block of the AG and we
1793 * are currently pointing to the last inode.
1794 *
1795 * Because we may see inodes that are from the wrong AG
1796 * due to RCU freeing and reallocation, only update the
1797 * index if it lies in this AG. It was a race that led
1798 * us to see this inode, so another lookup from the
1799 * same index will not find it again.
1800 */
1801 if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag_agno(pag))
1802 continue;
1803 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
1804 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
1805 done = true;
1806 }
1807
1808 /* unlock now we've grabbed the inodes. */
1809 rcu_read_unlock();
1810
1811 for (i = 0; i < nr_found; i++) {
1812 if (!batch[i])
1813 continue;
1814 error = xfs_icwalk_process_inode(goal, batch[i], pag,
1815 icw);
1816 if (error == -EAGAIN) {
1817 skipped++;
1818 continue;
1819 }
1820 if (error && last_error != -EFSCORRUPTED)
1821 last_error = error;
1822 }
1823
1824 /* bail out if the filesystem is corrupted. */
1825 if (error == -EFSCORRUPTED)
1826 break;
1827
1828 cond_resched();
1829
1830 if (icw && (icw->icw_flags & XFS_ICWALK_FLAG_SCAN_LIMIT)) {
1831 icw->icw_scan_limit -= XFS_LOOKUP_BATCH;
1832 if (icw->icw_scan_limit <= 0)
1833 break;
1834 }
1835 } while (nr_found && !done);
1836
1837 if (goal == XFS_ICWALK_RECLAIM) {
1838 if (done)
1839 first_index = 0;
1840 WRITE_ONCE(pag->pag_ici_reclaim_cursor, first_index);
1841 }
1842
1843 if (skipped) {
1844 delay(1);
1845 goto restart;
1846 }
1847 return last_error;
1848 }
1849
1850 /* Walk all incore inodes to achieve a given goal. */
1851 static int
1852 xfs_icwalk(
1853 struct xfs_mount *mp,
1854 enum xfs_icwalk_goal goal,
1855 struct xfs_icwalk *icw)
1856 {
1857 struct xfs_perag *pag = NULL;
1858 int error = 0;
1859 int last_error = 0;
1860
1861 while ((pag = xfs_perag_grab_next_tag(mp, pag, goal))) {
1862 error = xfs_icwalk_ag(pag, goal, icw);
1863 if (error) {
1864 last_error = error;
1865 if (error == -EFSCORRUPTED) {
1866 xfs_perag_rele(pag);
1867 break;
1868 }
1869 }
1870 }
1871 return last_error;
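	/*
	 * Compile-time check that the private icwalk flags do not overlap the
	 * XFS_ICWALK_FLAGS_VALID mask.
	 */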
	BUILD_BUG_ON(XFS_ICWALK_PRIVATE_FLAGS & XFS_ICWALK_FLAGS_VALID);
}

#ifdef DEBUG
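/*
 * Warn about any delayed allocation extents left in the given fork; by the
 * time an inode is headed for reclaim there should be none.
 */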
static void
xfs_check_delalloc(
	struct xfs_inode	*ip,
	int			whichfork)
{
	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
	struct xfs_bmbt_irec	got;
	struct xfs_iext_cursor	icur;

	if (!ifp || !xfs_iext_lookup_extent(ip, ifp, 0, &icur, &got))
		return;
	do {
		if (isnullstartblock(got.br_startblock)) {
			xfs_warn(ip->i_mount,
	"ino %llx %s fork has delalloc extent at [0x%llx:0x%llx]",
				ip->i_ino,
				whichfork == XFS_DATA_FORK ? "data" : "cow",
				got.br_startoff, got.br_blockcount);
		}
	} while (xfs_iext_next_extent(ifp, &icur, &got));
}
#else
#define xfs_check_delalloc(ip, whichfork)	do { } while (0)
#endif

/* Schedule the inode for reclaim. */
static void
xfs_inodegc_set_reclaimable(
	struct xfs_inode	*ip)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_perag	*pag;

	if (!xfs_is_shutdown(mp) && ip->i_delayed_blks) {
		xfs_check_delalloc(ip, XFS_DATA_FORK);
		xfs_check_delalloc(ip, XFS_COW_FORK);
		ASSERT(0);
	}

	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
	spin_lock(&pag->pag_ici_lock);
	spin_lock(&ip->i_flags_lock);

	trace_xfs_inode_set_reclaimable(ip);
	ip->i_flags &= ~(XFS_NEED_INACTIVE | XFS_INACTIVATING);
	ip->i_flags |= XFS_IRECLAIMABLE;
	xfs_perag_set_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
			XFS_ICI_RECLAIM_TAG);

	spin_unlock(&ip->i_flags_lock);
	spin_unlock(&pag->pag_ici_lock);
	xfs_perag_put(pag);
}

/*
 * Free all speculative preallocations and possibly even the inode itself.
 * This is the last chance to make changes to an otherwise unreferenced file
 * before incore reclamation happens.
 */
static int
xfs_inodegc_inactivate(
	struct xfs_inode	*ip)
{
	int			error;

	trace_xfs_inode_inactivating(ip);
	error = xfs_inactive(ip);
	xfs_inodegc_set_reclaimable(ip);
	return error;
}

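/*
 * Process this CPU's queue of deferred inode inactivations: clear the cpumask
 * bit, drain the lockless list, inactivate each inode on it, and record the
 * first error encountered in gc->error.
 */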
void
xfs_inodegc_worker(
	struct work_struct	*work)
{
	struct xfs_inodegc	*gc = container_of(to_delayed_work(work),
						struct xfs_inodegc, work);
	struct llist_node	*node = llist_del_all(&gc->list);
	struct xfs_inode	*ip, *n;
	struct xfs_mount	*mp = gc->mp;
	unsigned int		nofs_flag;

	/*
	 * Clear the cpu mask bit and ensure that we have seen the latest
	 * update of the gc structure associated with this CPU. This matches
	 * with the release semantics used when setting the cpumask bit in
	 * xfs_inodegc_queue.
	 */
	cpumask_clear_cpu(gc->cpu, &mp->m_inodegc_cpumask);
	smp_mb__after_atomic();

	WRITE_ONCE(gc->items, 0);

	if (!node)
		return;

	/*
	 * We can allocate memory here while doing writeback on behalf of
	 * memory reclaim. To avoid memory allocation deadlocks set the
	 * task-wide nofs context for the following operations.
	 */
	nofs_flag = memalloc_nofs_save();

	ip = llist_entry(node, struct xfs_inode, i_gclist);
	trace_xfs_inodegc_worker(mp, READ_ONCE(gc->shrinker_hits));

	WRITE_ONCE(gc->shrinker_hits, 0);
	llist_for_each_entry_safe(ip, n, node, i_gclist) {
		int	error;

		xfs_iflags_set(ip, XFS_INACTIVATING);
		error = xfs_inodegc_inactivate(ip);
		if (error && !gc->error)
			gc->error = error;
	}

	memalloc_nofs_restore(nofs_flag);
}

/*
 * Expedite all pending inodegc work to run immediately. This does not wait for
 * completion of the work.
 */
void
xfs_inodegc_push(
	struct xfs_mount	*mp)
{
	if (!xfs_is_inodegc_enabled(mp))
		return;
	trace_xfs_inodegc_push(mp, __return_address);
	xfs_inodegc_queue_all(mp);
}

/*
 * Force all currently queued inode inactivation work to run immediately and
 * wait for the work to finish.
 */
int
xfs_inodegc_flush(
	struct xfs_mount	*mp)
{
	xfs_inodegc_push(mp);
	trace_xfs_inodegc_flush(mp, __return_address);
	return xfs_inodegc_wait_all(mp);
}

/*
 * Flush all the pending work and then disable the inode inactivation
 * background workers and wait for them to stop. Caller must hold sb->s_umount
 * to coordinate changes in the inodegc_enabled state.
 */
void
xfs_inodegc_stop(
	struct xfs_mount	*mp)
{
	bool			rerun;

	if (!xfs_clear_inodegc_enabled(mp))
		return;

	/*
	 * Drain all pending inodegc work, including inodes that could be
	 * queued by racing xfs_inodegc_queue or xfs_inodegc_shrinker_scan
	 * threads that sample the inodegc state just prior to us clearing it.
	 * The inodegc flag state prevents new threads from queuing more
	 * inodes, so we queue pending work items and flush the workqueue until
	 * all inodegc lists are empty. IOWs, we cannot use drain_workqueue
	 * here because it does not allow other unserialized mechanisms to
	 * reschedule inodegc work while this draining is in progress.
	 */
	xfs_inodegc_queue_all(mp);
	do {
		flush_workqueue(mp->m_inodegc_wq);
		rerun = xfs_inodegc_queue_all(mp);
	} while (rerun);

	trace_xfs_inodegc_stop(mp, __return_address);
}

/*
 * Enable the inode inactivation background workers and schedule deferred inode
 * inactivation work if there is any. Caller must hold sb->s_umount to
 * coordinate changes in the inodegc_enabled state.
 */
void
xfs_inodegc_start(
	struct xfs_mount	*mp)
{
	if (xfs_set_inodegc_enabled(mp))
		return;

	trace_xfs_inodegc_start(mp, __return_address);
	xfs_inodegc_queue_all(mp);
}

#ifdef CONFIG_XFS_RT
static inline bool
xfs_inodegc_want_queue_rt_file(
	struct xfs_inode	*ip)
{
	struct xfs_mount	*mp = ip->i_mount;

	if (!XFS_IS_REALTIME_INODE(ip) || xfs_has_zoned(mp))
		return false;

	if (xfs_compare_freecounter(mp, XC_FREE_RTEXTENTS,
			mp->m_low_rtexts[XFS_LOWSP_5_PCNT],
			XFS_FDBLOCKS_BATCH) < 0)
		return true;

	return false;
}
#else
# define xfs_inodegc_want_queue_rt_file(ip)	(false)
#endif /* CONFIG_XFS_RT */

/*
 * Schedule the inactivation worker when:
 *
 *  - We've accumulated more than one inode cluster buffer's worth of inodes.
 *  - There is less than 5% free space left.
 *  - Any of the quotas for this inode are near an enforcement limit.
 */
static inline bool
xfs_inodegc_want_queue_work(
	struct xfs_inode	*ip,
	unsigned int		items)
{
	struct xfs_mount	*mp = ip->i_mount;

	if (items > mp->m_ino_geo.inodes_per_cluster)
		return true;

	if (xfs_compare_freecounter(mp, XC_FREE_BLOCKS,
			mp->m_low_space[XFS_LOWSP_5_PCNT],
			XFS_FDBLOCKS_BATCH) < 0)
		return true;

	if (xfs_inodegc_want_queue_rt_file(ip))
		return true;

	if (xfs_inode_near_dquot_enforcement(ip, XFS_DQTYPE_USER))
		return true;

	if (xfs_inode_near_dquot_enforcement(ip, XFS_DQTYPE_GROUP))
		return true;

	if (xfs_inode_near_dquot_enforcement(ip, XFS_DQTYPE_PROJ))
		return true;

	return false;
}

/*
 * Upper bound on the number of inodes in each AG that can be queued for
 * inactivation at any given time, to avoid monopolizing the workqueue.
 */
#define XFS_INODEGC_MAX_BACKLOG	(4 * XFS_INODES_PER_CHUNK)

/*
 * Make the frontend wait for inactivations when:
 *
 * - Memory shrinkers queued the inactivation worker and it hasn't finished.
 * - The queue depth exceeds the maximum allowable percpu backlog.
 *
 * Note: If we are in a NOFS context here (e.g. current thread is running a
 * transaction) then we don't want to block, as inodegc may require filesystem
 * resources we hold in order to make progress and that could result in a
 * deadlock. Hence we skip out of here if we are in a scoped NOFS context.
 */
static inline bool
xfs_inodegc_want_flush_work(
	struct xfs_inode	*ip,
	unsigned int		items,
	unsigned int		shrinker_hits)
{
	if (current->flags & PF_MEMALLOC_NOFS)
		return false;

	if (shrinker_hits > 0)
		return true;

	if (items > XFS_INODEGC_MAX_BACKLOG)
		return true;

	return false;
}

/*
 * Queue a background inactivation worker if there are inodes that need to be
 * inactivated and higher level xfs code hasn't disabled the background
 * workers.
 */
static void
xfs_inodegc_queue(
	struct xfs_inode	*ip)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_inodegc	*gc;
	int			items;
	unsigned int		shrinker_hits;
	unsigned int		cpu_nr;
	unsigned long		queue_delay = 1;

	trace_xfs_inode_set_need_inactive(ip);
	spin_lock(&ip->i_flags_lock);
	ip->i_flags |= XFS_NEED_INACTIVE;
	spin_unlock(&ip->i_flags_lock);

	cpu_nr = get_cpu();
	gc = this_cpu_ptr(mp->m_inodegc);
	llist_add(&ip->i_gclist, &gc->list);
	items = READ_ONCE(gc->items);
	WRITE_ONCE(gc->items, items + 1);
	shrinker_hits = READ_ONCE(gc->shrinker_hits);

	/*
	 * Ensure the list add is always seen by anyone who finds the cpumask
	 * bit set. This effectively gives the cpumask bit set operation
	 * release ordering semantics.
	 */
	smp_mb__before_atomic();
	if (!cpumask_test_cpu(cpu_nr, &mp->m_inodegc_cpumask))
		cpumask_test_and_set_cpu(cpu_nr, &mp->m_inodegc_cpumask);

	/*
	 * We queue the work while holding the current CPU so that the work
	 * is scheduled to run on this CPU.
	 */
	if (!xfs_is_inodegc_enabled(mp)) {
		put_cpu();
		return;
	}

	if (xfs_inodegc_want_queue_work(ip, items))
		queue_delay = 0;

	trace_xfs_inodegc_queue(mp, __return_address);
	mod_delayed_work_on(current_cpu(), mp->m_inodegc_wq, &gc->work,
			queue_delay);
	put_cpu();

	if (xfs_inodegc_want_flush_work(ip, items, shrinker_hits)) {
		trace_xfs_inodegc_throttle(mp, __return_address);
		flush_delayed_work(&gc->work);
	}
}

/*
 * We set the inode flag atomically with the radix tree tag. Once we get tag
 * lookups on the radix tree, this inode flag can go away.
 *
 * We always use background reclaim here because even if the inode is clean, it
 * still may be under IO and hence we have to wait for IO completion to occur
 * before we can reclaim the inode. The background reclaim path handles this
 * more efficiently than we can here, so simply let background reclaim tear
 * down all inodes.
 */
void
xfs_inode_mark_reclaimable(
	struct xfs_inode	*ip)
{
	struct xfs_mount	*mp = ip->i_mount;
	bool			need_inactive;

	XFS_STATS_INC(mp, vn_reclaim);

	/*
	 * We should never get here with any of the reclaim flags already set.
	 */
	ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_ALL_IRECLAIM_FLAGS));

	need_inactive = xfs_inode_needs_inactive(ip);
	if (need_inactive) {
		xfs_inodegc_queue(ip);
		return;
	}

	/* Going straight to reclaim, so drop the dquots. */
	xfs_qm_dqdetach(ip);
	xfs_inodegc_set_reclaimable(ip);
}

/*
 * Register a phony shrinker so that we can run background inodegc sooner when
 * there's memory pressure. Inactivation does not itself free any memory but
 * it does make inodes reclaimable, which eventually frees memory.
 *
 * The count function, seek value, and batch value are crafted to trigger the
 * scan function during the second round of scanning. Hopefully this means
 * that we reclaimed enough memory that initiating metadata transactions won't
 * make things worse.
 */
#define XFS_INODEGC_SHRINKER_COUNT	(1UL << DEF_PRIORITY)
#define XFS_INODEGC_SHRINKER_BATCH	((XFS_INODEGC_SHRINKER_COUNT / 2) + 1)

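/*
 * Report a full count whenever any per-cpu queue holds inodes awaiting
 * inactivation so that the VM invokes our scan function; otherwise report
 * that there is nothing to scan.
 */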
static unsigned long
xfs_inodegc_shrinker_count(
	struct shrinker		*shrink,
	struct shrink_control	*sc)
{
	struct xfs_mount	*mp = shrink->private_data;
	struct xfs_inodegc	*gc;
	int			cpu;

	if (!xfs_is_inodegc_enabled(mp))
		return 0;

	for_each_cpu(cpu, &mp->m_inodegc_cpumask) {
		gc = per_cpu_ptr(mp->m_inodegc, cpu);
		if (!llist_empty(&gc->list))
			return XFS_INODEGC_SHRINKER_COUNT;
	}

	return 0;
}

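/*
 * Bump the shrinker hit count for every per-cpu queue that holds inodes and
 * kick its worker to run immediately; the hit count makes threads queueing
 * new inodes wait for the worker (see xfs_inodegc_want_flush_work).
 */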
static unsigned long
xfs_inodegc_shrinker_scan(
	struct shrinker		*shrink,
	struct shrink_control	*sc)
{
	struct xfs_mount	*mp = shrink->private_data;
	struct xfs_inodegc	*gc;
	int			cpu;
	bool			no_items = true;

	if (!xfs_is_inodegc_enabled(mp))
		return SHRINK_STOP;

	trace_xfs_inodegc_shrinker_scan(mp, sc, __return_address);

	for_each_cpu(cpu, &mp->m_inodegc_cpumask) {
		gc = per_cpu_ptr(mp->m_inodegc, cpu);
		if (!llist_empty(&gc->list)) {
			unsigned int	h = READ_ONCE(gc->shrinker_hits);

			WRITE_ONCE(gc->shrinker_hits, h + 1);
			mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0);
			no_items = false;
		}
	}

	/*
	 * If there are no inodes to inactivate, we don't want the shrinker
	 * to think there's deferred work to call us back about.
	 */
	if (no_items)
		return LONG_MAX;

	return SHRINK_STOP;
}

/* Register a shrinker so we can accelerate inodegc and throttle queuing. */
int
xfs_inodegc_register_shrinker(
	struct xfs_mount	*mp)
{
	mp->m_inodegc_shrinker = shrinker_alloc(SHRINKER_NONSLAB,
						"xfs-inodegc:%s",
						mp->m_super->s_id);
	if (!mp->m_inodegc_shrinker)
		return -ENOMEM;

	mp->m_inodegc_shrinker->count_objects = xfs_inodegc_shrinker_count;
	mp->m_inodegc_shrinker->scan_objects = xfs_inodegc_shrinker_scan;
	mp->m_inodegc_shrinker->seeks = 0;
	mp->m_inodegc_shrinker->batch = XFS_INODEGC_SHRINKER_BATCH;
	mp->m_inodegc_shrinker->private_data = mp;

	shrinker_register(mp->m_inodegc_shrinker);

	return 0;
}