xref: /linux/fs/xfs/libxfs/xfs_inode_util.c (revision 9208c05f9fdfd927ea160b97dfef3c379049fff2)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (c) 2000-2006 Silicon Graphics, Inc.
4  * All Rights Reserved.
5  */
6 #include <linux/iversion.h>
7 #include "xfs.h"
8 #include "xfs_fs.h"
9 #include "xfs_shared.h"
10 #include "xfs_format.h"
11 #include "xfs_log_format.h"
12 #include "xfs_trans_resv.h"
13 #include "xfs_sb.h"
14 #include "xfs_mount.h"
15 #include "xfs_inode.h"
16 #include "xfs_inode_util.h"
17 #include "xfs_trans.h"
18 #include "xfs_ialloc.h"
19 #include "xfs_health.h"
20 #include "xfs_bmap.h"
21 #include "xfs_error.h"
22 #include "xfs_trace.h"
23 #include "xfs_ag.h"
24 #include "xfs_iunlink_item.h"
25 #include "xfs_inode_item.h"
26 
27 uint16_t
28 xfs_flags2diflags(
29 	struct xfs_inode	*ip,
30 	unsigned int		xflags)
31 {
32 	/* can't set PREALLOC this way, just preserve it */
33 	uint16_t		di_flags =
34 		(ip->i_diflags & XFS_DIFLAG_PREALLOC);
35 
36 	if (xflags & FS_XFLAG_IMMUTABLE)
37 		di_flags |= XFS_DIFLAG_IMMUTABLE;
38 	if (xflags & FS_XFLAG_APPEND)
39 		di_flags |= XFS_DIFLAG_APPEND;
40 	if (xflags & FS_XFLAG_SYNC)
41 		di_flags |= XFS_DIFLAG_SYNC;
42 	if (xflags & FS_XFLAG_NOATIME)
43 		di_flags |= XFS_DIFLAG_NOATIME;
44 	if (xflags & FS_XFLAG_NODUMP)
45 		di_flags |= XFS_DIFLAG_NODUMP;
46 	if (xflags & FS_XFLAG_NODEFRAG)
47 		di_flags |= XFS_DIFLAG_NODEFRAG;
48 	if (xflags & FS_XFLAG_FILESTREAM)
49 		di_flags |= XFS_DIFLAG_FILESTREAM;
50 	if (S_ISDIR(VFS_I(ip)->i_mode)) {
51 		if (xflags & FS_XFLAG_RTINHERIT)
52 			di_flags |= XFS_DIFLAG_RTINHERIT;
53 		if (xflags & FS_XFLAG_NOSYMLINKS)
54 			di_flags |= XFS_DIFLAG_NOSYMLINKS;
55 		if (xflags & FS_XFLAG_EXTSZINHERIT)
56 			di_flags |= XFS_DIFLAG_EXTSZINHERIT;
57 		if (xflags & FS_XFLAG_PROJINHERIT)
58 			di_flags |= XFS_DIFLAG_PROJINHERIT;
59 	} else if (S_ISREG(VFS_I(ip)->i_mode)) {
60 		if (xflags & FS_XFLAG_REALTIME)
61 			di_flags |= XFS_DIFLAG_REALTIME;
62 		if (xflags & FS_XFLAG_EXTSIZE)
63 			di_flags |= XFS_DIFLAG_EXTSIZE;
64 	}
65 
66 	return di_flags;
67 }
68 
69 uint64_t
70 xfs_flags2diflags2(
71 	struct xfs_inode	*ip,
72 	unsigned int		xflags)
73 {
74 	uint64_t		di_flags2 =
75 		(ip->i_diflags2 & (XFS_DIFLAG2_REFLINK |
76 				   XFS_DIFLAG2_BIGTIME |
77 				   XFS_DIFLAG2_NREXT64));
78 
79 	if (xflags & FS_XFLAG_DAX)
80 		di_flags2 |= XFS_DIFLAG2_DAX;
81 	if (xflags & FS_XFLAG_COWEXTSIZE)
82 		di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
83 
84 	return di_flags2;
85 }
86 
87 uint32_t
88 xfs_ip2xflags(
89 	struct xfs_inode	*ip)
90 {
91 	uint32_t		flags = 0;
92 
93 	if (ip->i_diflags & XFS_DIFLAG_ANY) {
94 		if (ip->i_diflags & XFS_DIFLAG_REALTIME)
95 			flags |= FS_XFLAG_REALTIME;
96 		if (ip->i_diflags & XFS_DIFLAG_PREALLOC)
97 			flags |= FS_XFLAG_PREALLOC;
98 		if (ip->i_diflags & XFS_DIFLAG_IMMUTABLE)
99 			flags |= FS_XFLAG_IMMUTABLE;
100 		if (ip->i_diflags & XFS_DIFLAG_APPEND)
101 			flags |= FS_XFLAG_APPEND;
102 		if (ip->i_diflags & XFS_DIFLAG_SYNC)
103 			flags |= FS_XFLAG_SYNC;
104 		if (ip->i_diflags & XFS_DIFLAG_NOATIME)
105 			flags |= FS_XFLAG_NOATIME;
106 		if (ip->i_diflags & XFS_DIFLAG_NODUMP)
107 			flags |= FS_XFLAG_NODUMP;
108 		if (ip->i_diflags & XFS_DIFLAG_RTINHERIT)
109 			flags |= FS_XFLAG_RTINHERIT;
110 		if (ip->i_diflags & XFS_DIFLAG_PROJINHERIT)
111 			flags |= FS_XFLAG_PROJINHERIT;
112 		if (ip->i_diflags & XFS_DIFLAG_NOSYMLINKS)
113 			flags |= FS_XFLAG_NOSYMLINKS;
114 		if (ip->i_diflags & XFS_DIFLAG_EXTSIZE)
115 			flags |= FS_XFLAG_EXTSIZE;
116 		if (ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT)
117 			flags |= FS_XFLAG_EXTSZINHERIT;
118 		if (ip->i_diflags & XFS_DIFLAG_NODEFRAG)
119 			flags |= FS_XFLAG_NODEFRAG;
120 		if (ip->i_diflags & XFS_DIFLAG_FILESTREAM)
121 			flags |= FS_XFLAG_FILESTREAM;
122 	}
123 
124 	if (ip->i_diflags2 & XFS_DIFLAG2_ANY) {
125 		if (ip->i_diflags2 & XFS_DIFLAG2_DAX)
126 			flags |= FS_XFLAG_DAX;
127 		if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE)
128 			flags |= FS_XFLAG_COWEXTSIZE;
129 	}
130 
131 	if (xfs_inode_has_attr_fork(ip))
132 		flags |= FS_XFLAG_HASATTR;
133 	return flags;
134 }
135 
136 prid_t
137 xfs_get_initial_prid(struct xfs_inode *dp)
138 {
139 	if (dp->i_diflags & XFS_DIFLAG_PROJINHERIT)
140 		return dp->i_projid;
141 
142 	/* Assign to the root project by default. */
143 	return 0;
144 }
145 
146 /* Propagate di_flags from a parent inode to a child inode. */
147 static inline void
148 xfs_inode_inherit_flags(
149 	struct xfs_inode	*ip,
150 	const struct xfs_inode	*pip)
151 {
152 	unsigned int		di_flags = 0;
153 	xfs_failaddr_t		failaddr;
154 	umode_t			mode = VFS_I(ip)->i_mode;
155 
156 	if (S_ISDIR(mode)) {
157 		if (pip->i_diflags & XFS_DIFLAG_RTINHERIT)
158 			di_flags |= XFS_DIFLAG_RTINHERIT;
159 		if (pip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) {
160 			di_flags |= XFS_DIFLAG_EXTSZINHERIT;
161 			ip->i_extsize = pip->i_extsize;
162 		}
163 		if (pip->i_diflags & XFS_DIFLAG_PROJINHERIT)
164 			di_flags |= XFS_DIFLAG_PROJINHERIT;
165 	} else if (S_ISREG(mode)) {
166 		if ((pip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
167 		    xfs_has_realtime(ip->i_mount))
168 			di_flags |= XFS_DIFLAG_REALTIME;
169 		if (pip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) {
170 			di_flags |= XFS_DIFLAG_EXTSIZE;
171 			ip->i_extsize = pip->i_extsize;
172 		}
173 	}
174 	if ((pip->i_diflags & XFS_DIFLAG_NOATIME) &&
175 	    xfs_inherit_noatime)
176 		di_flags |= XFS_DIFLAG_NOATIME;
177 	if ((pip->i_diflags & XFS_DIFLAG_NODUMP) &&
178 	    xfs_inherit_nodump)
179 		di_flags |= XFS_DIFLAG_NODUMP;
180 	if ((pip->i_diflags & XFS_DIFLAG_SYNC) &&
181 	    xfs_inherit_sync)
182 		di_flags |= XFS_DIFLAG_SYNC;
183 	if ((pip->i_diflags & XFS_DIFLAG_NOSYMLINKS) &&
184 	    xfs_inherit_nosymlinks)
185 		di_flags |= XFS_DIFLAG_NOSYMLINKS;
186 	if ((pip->i_diflags & XFS_DIFLAG_NODEFRAG) &&
187 	    xfs_inherit_nodefrag)
188 		di_flags |= XFS_DIFLAG_NODEFRAG;
189 	if (pip->i_diflags & XFS_DIFLAG_FILESTREAM)
190 		di_flags |= XFS_DIFLAG_FILESTREAM;
191 
192 	ip->i_diflags |= di_flags;
193 
194 	/*
195 	 * Inode verifiers on older kernels only check that the extent size
196 	 * hint is an integer multiple of the rt extent size on realtime files.
197 	 * They did not check the hint alignment on a directory with both
198 	 * rtinherit and extszinherit flags set.  If the misaligned hint is
199 	 * propagated from a directory into a new realtime file, new file
200 	 * allocations will fail due to math errors in the rt allocator and/or
201 	 * trip the verifiers.  Validate the hint settings in the new file so
202 	 * that we don't let broken hints propagate.
203 	 */
204 	failaddr = xfs_inode_validate_extsize(ip->i_mount, ip->i_extsize,
205 			VFS_I(ip)->i_mode, ip->i_diflags);
206 	if (failaddr) {
207 		ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE |
208 				   XFS_DIFLAG_EXTSZINHERIT);
209 		ip->i_extsize = 0;
210 	}
211 }
212 
213 /* Propagate di_flags2 from a parent inode to a child inode. */
214 static inline void
215 xfs_inode_inherit_flags2(
216 	struct xfs_inode	*ip,
217 	const struct xfs_inode	*pip)
218 {
219 	xfs_failaddr_t		failaddr;
220 
221 	if (pip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) {
222 		ip->i_diflags2 |= XFS_DIFLAG2_COWEXTSIZE;
223 		ip->i_cowextsize = pip->i_cowextsize;
224 	}
225 	if (pip->i_diflags2 & XFS_DIFLAG2_DAX)
226 		ip->i_diflags2 |= XFS_DIFLAG2_DAX;
227 	if (xfs_is_metadir_inode(pip))
228 		ip->i_diflags2 |= XFS_DIFLAG2_METADATA;
229 
230 	/* Don't let invalid cowextsize hints propagate. */
231 	failaddr = xfs_inode_validate_cowextsize(ip->i_mount, ip->i_cowextsize,
232 			VFS_I(ip)->i_mode, ip->i_diflags, ip->i_diflags2);
233 	if (failaddr) {
234 		ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE;
235 		ip->i_cowextsize = 0;
236 	}
237 }
238 
239 /*
240  * If we need to create attributes immediately after allocating the inode,
241  * initialise an empty attribute fork right now. We use the default fork offset
242  * for attributes here as we don't know exactly what size or how many
243  * attributes we might be adding. We can do this safely here because we know
244  * the data fork is completely empty and this saves us from needing to run a
245  * separate transaction to set the fork offset in the immediate future.
246  *
247  * If we have parent pointers and the caller hasn't told us that the file will
248  * never be linked into a directory tree, we /must/ create the attr fork.
249  */
250 static inline bool
251 xfs_icreate_want_attrfork(
252 	struct xfs_mount		*mp,
253 	const struct xfs_icreate_args	*args)
254 {
255 	if (args->flags & XFS_ICREATE_INIT_XATTRS)
256 		return true;
257 
258 	if (!(args->flags & XFS_ICREATE_UNLINKABLE) && xfs_has_parent(mp))
259 		return true;
260 
261 	return false;
262 }
263 
264 /* Initialise an inode's attributes. */
265 void
266 xfs_inode_init(
267 	struct xfs_trans	*tp,
268 	const struct xfs_icreate_args *args,
269 	struct xfs_inode	*ip)
270 {
271 	struct xfs_inode	*pip = args->pip;
272 	struct inode		*dir = pip ? VFS_I(pip) : NULL;
273 	struct xfs_mount	*mp = tp->t_mountp;
274 	struct inode		*inode = VFS_I(ip);
275 	unsigned int		flags;
276 	int			times = XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG |
277 					XFS_ICHGTIME_ACCESS;
278 
279 	if (args->flags & XFS_ICREATE_TMPFILE)
280 		set_nlink(inode, 0);
281 	else if (S_ISDIR(args->mode))
282 		set_nlink(inode, 2);
283 	else
284 		set_nlink(inode, 1);
285 	inode->i_rdev = args->rdev;
286 
287 	if (!args->idmap || pip == NULL) {
288 		/* creating a tree root, sb rooted, or detached file */
289 		inode->i_uid = GLOBAL_ROOT_UID;
290 		inode->i_gid = GLOBAL_ROOT_GID;
291 		ip->i_projid = 0;
292 		inode->i_mode = args->mode;
293 	} else {
294 		/* creating a child in the directory tree */
295 		if (dir && !(dir->i_mode & S_ISGID) && xfs_has_grpid(mp)) {
296 			inode_fsuid_set(inode, args->idmap);
297 			inode->i_gid = dir->i_gid;
298 			inode->i_mode = args->mode;
299 		} else {
300 			inode_init_owner(args->idmap, inode, dir, args->mode);
301 		}
302 
303 		/*
304 		 * If the group ID of the new file does not match the effective
305 		 * group ID or one of the supplementary group IDs, the S_ISGID
306 		 * bit is cleared (and only if the irix_sgid_inherit
307 		 * compatibility variable is set).
308 		 */
309 		if (irix_sgid_inherit && (inode->i_mode & S_ISGID) &&
310 		    !vfsgid_in_group_p(i_gid_into_vfsgid(args->idmap, inode)))
311 			inode->i_mode &= ~S_ISGID;
312 
313 		ip->i_projid = xfs_get_initial_prid(pip);
314 	}
315 
316 	ip->i_disk_size = 0;
317 	ip->i_df.if_nextents = 0;
318 	ASSERT(ip->i_nblocks == 0);
319 
320 	ip->i_extsize = 0;
321 	ip->i_diflags = 0;
322 
323 	if (xfs_has_v3inodes(mp)) {
324 		inode_set_iversion(inode, 1);
325 		ip->i_cowextsize = 0;
326 		times |= XFS_ICHGTIME_CREATE;
327 	}
328 
329 	xfs_trans_ichgtime(tp, ip, times);
330 
331 	flags = XFS_ILOG_CORE;
332 	switch (args->mode & S_IFMT) {
333 	case S_IFIFO:
334 	case S_IFCHR:
335 	case S_IFBLK:
336 	case S_IFSOCK:
337 		ip->i_df.if_format = XFS_DINODE_FMT_DEV;
338 		flags |= XFS_ILOG_DEV;
339 		break;
340 	case S_IFREG:
341 	case S_IFDIR:
342 		if (pip && (pip->i_diflags & XFS_DIFLAG_ANY))
343 			xfs_inode_inherit_flags(ip, pip);
344 		if (pip && (pip->i_diflags2 & XFS_DIFLAG2_ANY))
345 			xfs_inode_inherit_flags2(ip, pip);
346 		fallthrough;
347 	case S_IFLNK:
348 		ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS;
349 		ip->i_df.if_bytes = 0;
350 		ip->i_df.if_data = NULL;
351 		break;
352 	default:
353 		ASSERT(0);
354 	}
355 
356 	if (xfs_icreate_want_attrfork(mp, args)) {
357 		ip->i_forkoff = xfs_default_attroffset(ip) >> 3;
358 		xfs_ifork_init_attr(ip, XFS_DINODE_FMT_EXTENTS, 0);
359 
360 		if (!xfs_has_attr(mp)) {
361 			spin_lock(&mp->m_sb_lock);
362 			xfs_add_attr(mp);
363 			spin_unlock(&mp->m_sb_lock);
364 			xfs_log_sb(tp);
365 		}
366 	}
367 
368 	xfs_trans_log_inode(tp, ip, flags);
369 }
370 
371 /*
372  * In-Core Unlinked List Lookups
373  * =============================
374  *
375  * Every inode is supposed to be reachable from some other piece of metadata
376  * with the exception of the root directory.  Inodes with a connection to a
377  * file descriptor but not linked from anywhere in the on-disk directory tree
378  * are collectively known as unlinked inodes, though the filesystem itself
379  * maintains links to these inodes so that on-disk metadata are consistent.
380  *
381  * XFS implements a per-AG on-disk hash table of unlinked inodes.  The AGI
382  * header contains a number of buckets that point to an inode, and each inode
383  * record has a pointer to the next inode in the hash chain.  This
384  * singly-linked list causes scaling problems in the iunlink remove function
385  * because we must walk that list to find the inode that points to the inode
386  * being removed from the unlinked hash bucket list.
387  *
388  * Hence we keep an in-memory double linked list to link each inode on an
389  * unlinked list. Because there are 64 unlinked lists per AGI, keeping pointer
390  * based lists would require having 64 list heads in the perag, one for each
391  * list. This is expensive in terms of memory (think millions of AGs) and cache
392  * misses on lookups. Instead, use the fact that inodes on the unlinked list
393  * must be referenced at the VFS level to keep them on the list and hence we
394  * have an existence guarantee for inodes on the unlinked list.
395  *
396  * Given we have an existence guarantee, we can use lockless inode cache lookups
397  * to resolve aginos to xfs inodes. This means we only need 8 bytes per inode
398  * for the double linked unlinked list, and we don't need any extra locking to
399  * keep the list safe as all manipulations are done under the AGI buffer lock.
400  * Keeping the list up to date does not require memory allocation, just finding
401  * the XFS inode and updating the next/prev unlinked list aginos.
402  */
403 
404 /*
405  * Update the prev pointer of the next agino.  Returns -ENOLINK if the inode
406  * is not in cache.
407  */
408 static int
409 xfs_iunlink_update_backref(
410 	struct xfs_perag	*pag,
411 	xfs_agino_t		prev_agino,
412 	xfs_agino_t		next_agino)
413 {
414 	struct xfs_inode	*ip;
415 
416 	/* No update necessary if we are at the end of the list. */
417 	if (next_agino == NULLAGINO)
418 		return 0;
419 
420 	ip = xfs_iunlink_lookup(pag, next_agino);
421 	if (!ip)
422 		return -ENOLINK;
423 
424 	ip->i_prev_unlinked = prev_agino;
425 	return 0;
426 }
427 
428 /*
429  * Point the AGI unlinked bucket at an inode and log the results.  The caller
430  * is responsible for validating the old value.
431  */
432 STATIC int
433 xfs_iunlink_update_bucket(
434 	struct xfs_trans	*tp,
435 	struct xfs_perag	*pag,
436 	struct xfs_buf		*agibp,
437 	unsigned int		bucket_index,
438 	xfs_agino_t		new_agino)
439 {
440 	struct xfs_agi		*agi = agibp->b_addr;
441 	xfs_agino_t		old_value;
442 	int			offset;
443 
444 	ASSERT(xfs_verify_agino_or_null(pag, new_agino));
445 
446 	old_value = be32_to_cpu(agi->agi_unlinked[bucket_index]);
447 	trace_xfs_iunlink_update_bucket(pag, bucket_index, old_value,
448 			new_agino);
449 
450 	/*
451 	 * We should never find the head of the list already set to the value
452 	 * passed in because either we're adding or removing ourselves from the
453 	 * head of the list.
454 	 */
455 	if (old_value == new_agino) {
456 		xfs_buf_mark_corrupt(agibp);
457 		xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI);
458 		return -EFSCORRUPTED;
459 	}
460 
461 	agi->agi_unlinked[bucket_index] = cpu_to_be32(new_agino);
462 	offset = offsetof(struct xfs_agi, agi_unlinked) +
463 			(sizeof(xfs_agino_t) * bucket_index);
464 	xfs_trans_log_buf(tp, agibp, offset, offset + sizeof(xfs_agino_t) - 1);
465 	return 0;
466 }
467 
468 static int
469 xfs_iunlink_insert_inode(
470 	struct xfs_trans	*tp,
471 	struct xfs_perag	*pag,
472 	struct xfs_buf		*agibp,
473 	struct xfs_inode	*ip)
474 {
475 	struct xfs_mount	*mp = tp->t_mountp;
476 	struct xfs_agi		*agi = agibp->b_addr;
477 	xfs_agino_t		next_agino;
478 	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
479 	short			bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
480 	int			error;
481 
482 	/*
483 	 * Get the index into the agi hash table for the list this inode will
484 	 * go on.  Make sure the pointer isn't garbage and that this inode
485 	 * isn't already on the list.
486 	 */
487 	next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
488 	if (next_agino == agino ||
489 	    !xfs_verify_agino_or_null(pag, next_agino)) {
490 		xfs_buf_mark_corrupt(agibp);
491 		xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI);
492 		return -EFSCORRUPTED;
493 	}
494 
495 	/*
496 	 * Update the prev pointer in the next inode to point back to this
497 	 * inode.
498 	 */
499 	error = xfs_iunlink_update_backref(pag, agino, next_agino);
500 	if (error == -ENOLINK)
501 		error = xfs_iunlink_reload_next(tp, agibp, agino, next_agino);
502 	if (error)
503 		return error;
504 
505 	if (next_agino != NULLAGINO) {
506 		/*
507 		 * There is already another inode in the bucket, so point this
508 		 * inode to the current head of the list.
509 		 */
510 		error = xfs_iunlink_log_inode(tp, ip, pag, next_agino);
511 		if (error)
512 			return error;
513 		ip->i_next_unlinked = next_agino;
514 	}
515 
516 	/* Point the head of the list to point to this inode. */
517 	ip->i_prev_unlinked = NULLAGINO;
518 	return xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index, agino);
519 }
520 
521 /*
522  * This is called when the inode's link count has gone to 0 or we are creating
523  * a tmpfile via O_TMPFILE.  The inode @ip must have nlink == 0.
524  *
525  * We place the on-disk inode on a list in the AGI.  It will be pulled from this
526  * list when the inode is freed.
527  */
528 int
529 xfs_iunlink(
530 	struct xfs_trans	*tp,
531 	struct xfs_inode	*ip)
532 {
533 	struct xfs_mount	*mp = tp->t_mountp;
534 	struct xfs_perag	*pag;
535 	struct xfs_buf		*agibp;
536 	int			error;
537 
538 	ASSERT(VFS_I(ip)->i_nlink == 0);
539 	ASSERT(VFS_I(ip)->i_mode != 0);
540 	trace_xfs_iunlink(ip);
541 
542 	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
543 
544 	/* Get the agi buffer first.  It ensures lock ordering on the list. */
545 	error = xfs_read_agi(pag, tp, 0, &agibp);
546 	if (error)
547 		goto out;
548 
549 	error = xfs_iunlink_insert_inode(tp, pag, agibp, ip);
550 out:
551 	xfs_perag_put(pag);
552 	return error;
553 }
554 
555 static int
556 xfs_iunlink_remove_inode(
557 	struct xfs_trans	*tp,
558 	struct xfs_perag	*pag,
559 	struct xfs_buf		*agibp,
560 	struct xfs_inode	*ip)
561 {
562 	struct xfs_mount	*mp = tp->t_mountp;
563 	struct xfs_agi		*agi = agibp->b_addr;
564 	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
565 	xfs_agino_t		head_agino;
566 	short			bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
567 	int			error;
568 
569 	trace_xfs_iunlink_remove(ip);
570 
571 	/*
572 	 * Get the index into the agi hash table for the list this inode will
573 	 * go on.  Make sure the head pointer isn't garbage.
574 	 */
575 	head_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
576 	if (!xfs_verify_agino(pag, head_agino)) {
577 		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
578 				agi, sizeof(*agi));
579 		xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI);
580 		return -EFSCORRUPTED;
581 	}
582 
583 	/*
584 	 * Set our inode's next_unlinked pointer to NULL and then return
585 	 * the old pointer value so that we can update whatever was previous
586 	 * to us in the list to point to whatever was next in the list.
587 	 */
588 	error = xfs_iunlink_log_inode(tp, ip, pag, NULLAGINO);
589 	if (error)
590 		return error;
591 
592 	/*
593 	 * Update the prev pointer in the next inode to point back to previous
594 	 * inode in the chain.
595 	 */
596 	error = xfs_iunlink_update_backref(pag, ip->i_prev_unlinked,
597 			ip->i_next_unlinked);
598 	if (error == -ENOLINK)
599 		error = xfs_iunlink_reload_next(tp, agibp, ip->i_prev_unlinked,
600 				ip->i_next_unlinked);
601 	if (error)
602 		return error;
603 
604 	if (head_agino != agino) {
605 		struct xfs_inode	*prev_ip;
606 
607 		prev_ip = xfs_iunlink_lookup(pag, ip->i_prev_unlinked);
608 		if (!prev_ip) {
609 			xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
610 			return -EFSCORRUPTED;
611 		}
612 
613 		error = xfs_iunlink_log_inode(tp, prev_ip, pag,
614 				ip->i_next_unlinked);
615 		prev_ip->i_next_unlinked = ip->i_next_unlinked;
616 	} else {
617 		/* Point the head of the list to the next unlinked inode. */
618 		error = xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index,
619 				ip->i_next_unlinked);
620 	}
621 
622 	ip->i_next_unlinked = NULLAGINO;
623 	ip->i_prev_unlinked = 0;
624 	return error;
625 }
626 
/*
 * Pull the on-disk inode @ip from the AGI unlinked list of @pag.
 *
 * Returns 0 on success or a negative errno from reading the AGI or from the
 * list removal itself.
 */
int
xfs_iunlink_remove(
	struct xfs_trans	*tp,
	struct xfs_perag	*pag,
	struct xfs_inode	*ip)
{
	struct xfs_buf		*agibp;
	int			error;

	trace_xfs_iunlink_remove(ip);

	/* Get the agi buffer first.  It ensures lock ordering on the list. */
	error = xfs_read_agi(pag, tp, 0, &agibp);
	if (!error)
		error = xfs_iunlink_remove_inode(tp, pag, agibp, ip);

	return error;
}
648 
649 /*
650  * Decrement the link count on an inode & log the change.  If this causes the
651  * link count to go to zero, move the inode to AGI unlinked list so that it can
652  * be freed when the last active reference goes away via xfs_inactive().
653  */
654 int
655 xfs_droplink(
656 	struct xfs_trans	*tp,
657 	struct xfs_inode	*ip)
658 {
659 	struct inode		*inode = VFS_I(ip);
660 
661 	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
662 
663 	if (inode->i_nlink == 0) {
664 		xfs_info_ratelimited(tp->t_mountp,
665  "Inode 0x%llx link count dropped below zero.  Pinning link count.",
666 				ip->i_ino);
667 		set_nlink(inode, XFS_NLINK_PINNED);
668 	}
669 	if (inode->i_nlink != XFS_NLINK_PINNED)
670 		drop_nlink(inode);
671 
672 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
673 
674 	if (inode->i_nlink)
675 		return 0;
676 
677 	return xfs_iunlink(tp, ip);
678 }
679 
680 /*
681  * Increment the link count on an inode & log the change.
682  */
683 void
684 xfs_bumplink(
685 	struct xfs_trans	*tp,
686 	struct xfs_inode	*ip)
687 {
688 	struct inode		*inode = VFS_I(ip);
689 
690 	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
691 
692 	if (inode->i_nlink == XFS_NLINK_PINNED - 1)
693 		xfs_info_ratelimited(tp->t_mountp,
694  "Inode 0x%llx link count exceeded maximum.  Pinning link count.",
695 				ip->i_ino);
696 	if (inode->i_nlink != XFS_NLINK_PINNED)
697 		inc_nlink(inode);
698 
699 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
700 }
701 
702 /* Free an inode in the ondisk index and zero it out. */
703 int
704 xfs_inode_uninit(
705 	struct xfs_trans	*tp,
706 	struct xfs_perag	*pag,
707 	struct xfs_inode	*ip,
708 	struct xfs_icluster	*xic)
709 {
710 	struct xfs_mount	*mp = ip->i_mount;
711 	int			error;
712 
713 	/*
714 	 * Free the inode first so that we guarantee that the AGI lock is going
715 	 * to be taken before we remove the inode from the unlinked list. This
716 	 * makes the AGI lock -> unlinked list modification order the same as
717 	 * used in O_TMPFILE creation.
718 	 */
719 	error = xfs_difree(tp, pag, ip->i_ino, xic);
720 	if (error)
721 		return error;
722 
723 	error = xfs_iunlink_remove(tp, pag, ip);
724 	if (error)
725 		return error;
726 
727 	/*
728 	 * Free any local-format data sitting around before we reset the
729 	 * data fork to extents format.  Note that the attr fork data has
730 	 * already been freed by xfs_attr_inactive.
731 	 */
732 	if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
733 		kfree(ip->i_df.if_data);
734 		ip->i_df.if_data = NULL;
735 		ip->i_df.if_bytes = 0;
736 	}
737 
738 	VFS_I(ip)->i_mode = 0;		/* mark incore inode as free */
739 	ip->i_diflags = 0;
740 	ip->i_diflags2 = mp->m_ino_geo.new_diflags2;
741 	ip->i_forkoff = 0;		/* mark the attr fork not in use */
742 	ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS;
743 
744 	/*
745 	 * Bump the generation count so no one will be confused
746 	 * by reincarnations of this inode.
747 	 */
748 	VFS_I(ip)->i_generation++;
749 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
750 	return 0;
751 }
752