xref: /linux/fs/xfs/xfs_pnfs.c (revision 25489a4f556414445d342951615178368ee45cde)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (c) 2014 Christoph Hellwig.
4  */
5 #include "xfs.h"
6 #include "xfs_shared.h"
7 #include "xfs_format.h"
8 #include "xfs_log_format.h"
9 #include "xfs_trans_resv.h"
10 #include "xfs_mount.h"
11 #include "xfs_inode.h"
12 #include "xfs_trans.h"
13 #include "xfs_bmap.h"
14 #include "xfs_iomap.h"
15 #include "xfs_pnfs.h"
16 
17 /*
18  * Ensure that we do not have any outstanding pNFS layouts that can be used by
19  * clients to directly read from or write to this inode.  This must be called
20  * before every operation that can remove blocks from the extent map.
21  * Additionally we call it during the write operation, where aren't concerned
22  * about exposing unallocated blocks but just want to provide basic
23  * synchronization between a local writer and pNFS clients.  mmap writes would
24  * also benefit from this sort of synchronization, but due to the tricky locking
25  * rules in the page fault path we don't bother.
26  */
27 int
28 xfs_break_leased_layouts(
29 	struct inode		*inode,
30 	uint			*iolock,
31 	bool			*did_unlock)
32 {
33 	struct xfs_inode	*ip = XFS_I(inode);
34 	int			error;
35 
36 	while ((error = break_layout(inode, false)) == -EWOULDBLOCK) {
37 		xfs_iunlock(ip, *iolock);
38 		*did_unlock = true;
39 		error = break_layout(inode, true);
40 		*iolock &= ~XFS_IOLOCK_SHARED;
41 		*iolock |= XFS_IOLOCK_EXCL;
42 		xfs_ilock(ip, *iolock);
43 	}
44 
45 	return error;
46 }
47 
48 /*
49  * Get a unique ID including its location so that the client can identify
50  * the exported device.
51  */
52 int
53 xfs_fs_get_uuid(
54 	struct super_block	*sb,
55 	u8			*buf,
56 	u32			*len,
57 	u64			*offset)
58 {
59 	struct xfs_mount	*mp = XFS_M(sb);
60 
61 	if (*len < sizeof(uuid_t))
62 		return -EINVAL;
63 
64 	memcpy(buf, &mp->m_sb.sb_uuid, sizeof(uuid_t));
65 	*len = sizeof(uuid_t);
66 	*offset = offsetof(struct xfs_dsb, sb_uuid);
67 	return 0;
68 }
69 
70 /*
71  * We cannot use file based VFS helpers such as file_modified() to update
72  * inode state as we modify the data/metadata in the inode here. Hence we have
73  * to open code the timestamp updates and SUID/SGID stripping. We also need
74  * to set the inode prealloc flag to ensure that the extents we allocate are not
75  * removed if the inode is reclaimed from memory before xfs_fs_block_commit()
76  * is from the client to indicate that data has been written and the file size
77  * can be extended.
78  */
79 static int
80 xfs_fs_map_update_inode(
81 	struct xfs_inode	*ip)
82 {
83 	struct xfs_trans	*tp;
84 	int			error;
85 
86 	error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_writeid,
87 			0, 0, 0, &tp);
88 	if (error)
89 		return error;
90 
91 	xfs_ilock(ip, XFS_ILOCK_EXCL);
92 	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
93 
94 	VFS_I(ip)->i_mode &= ~S_ISUID;
95 	if (VFS_I(ip)->i_mode & S_IXGRP)
96 		VFS_I(ip)->i_mode &= ~S_ISGID;
97 	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
98 	ip->i_diflags |= XFS_DIFLAG_PREALLOC;
99 
100 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
101 	return xfs_trans_commit(tp);
102 }
103 
104 /*
105  * Get a layout for the pNFS client.
106  */
107 int
108 xfs_fs_map_blocks(
109 	struct inode		*inode,
110 	loff_t			offset,
111 	u64			length,
112 	struct iomap		*iomap,
113 	bool			write,
114 	u32			*device_generation)
115 {
116 	struct xfs_inode	*ip = XFS_I(inode);
117 	struct xfs_mount	*mp = ip->i_mount;
118 	struct xfs_bmbt_irec	imap;
119 	xfs_fileoff_t		offset_fsb, end_fsb;
120 	loff_t			limit;
121 	int			bmapi_flags = XFS_BMAPI_ENTIRE;
122 	int			nimaps = 1;
123 	uint			lock_flags;
124 	int			error = 0;
125 	u64			seq;
126 
127 	if (xfs_is_shutdown(mp))
128 		return -EIO;
129 
130 	/*
131 	 * We can't export inodes residing on the realtime device.  The realtime
132 	 * device doesn't have a UUID to identify it, so the client has no way
133 	 * to find it.
134 	 */
135 	if (XFS_IS_REALTIME_INODE(ip))
136 		return -ENXIO;
137 
138 	/*
139 	 * The pNFS block layout spec actually supports reflink like
140 	 * functionality, but the Linux pNFS server doesn't implement it yet.
141 	 */
142 	if (xfs_is_reflink_inode(ip))
143 		return -ENXIO;
144 
145 	/*
146 	 * Lock out any other I/O before we flush and invalidate the pagecache,
147 	 * and then hand out a layout to the remote system.  This is very
148 	 * similar to direct I/O, except that the synchronization is much more
149 	 * complicated.  See the comment near xfs_break_leased_layouts
150 	 * for a detailed explanation.
151 	 */
152 	xfs_ilock(ip, XFS_IOLOCK_EXCL);
153 
154 	error = -EINVAL;
155 	limit = mp->m_super->s_maxbytes;
156 	if (!write)
157 		limit = max(limit, round_up(i_size_read(inode),
158 				     inode->i_sb->s_blocksize));
159 	if (offset > limit)
160 		goto out_unlock;
161 	if (offset > limit - length)
162 		length = limit - offset;
163 
164 	error = filemap_write_and_wait(inode->i_mapping);
165 	if (error)
166 		goto out_unlock;
167 	error = invalidate_inode_pages2(inode->i_mapping);
168 	if (WARN_ON_ONCE(error))
169 		goto out_unlock;
170 
171 	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + length);
172 	offset_fsb = XFS_B_TO_FSBT(mp, offset);
173 
174 	lock_flags = xfs_ilock_data_map_shared(ip);
175 	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
176 				&imap, &nimaps, bmapi_flags);
177 	seq = xfs_iomap_inode_sequence(ip, 0);
178 
179 	ASSERT(!nimaps || imap.br_startblock != DELAYSTARTBLOCK);
180 
181 	if (!error && write &&
182 	    (!nimaps || imap.br_startblock == HOLESTARTBLOCK)) {
183 		if (offset + length > XFS_ISIZE(ip))
184 			end_fsb = xfs_iomap_eof_align_last_fsb(ip, end_fsb);
185 		else if (nimaps && imap.br_startblock == HOLESTARTBLOCK)
186 			end_fsb = min(end_fsb, imap.br_startoff +
187 					       imap.br_blockcount);
188 		xfs_iunlock(ip, lock_flags);
189 
190 		error = xfs_iomap_write_direct(ip, offset_fsb,
191 				end_fsb - offset_fsb, 0, &imap, &seq);
192 		if (error)
193 			goto out_unlock;
194 
195 		/*
196 		 * Ensure the next transaction is committed synchronously so
197 		 * that the blocks allocated and handed out to the client are
198 		 * guaranteed to be present even after a server crash.
199 		 */
200 		error = xfs_fs_map_update_inode(ip);
201 		if (!error)
202 			error = xfs_log_force_inode(ip);
203 		if (error)
204 			goto out_unlock;
205 
206 	} else {
207 		xfs_iunlock(ip, lock_flags);
208 	}
209 	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
210 
211 	error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0, 0, seq);
212 	*device_generation = mp->m_generation;
213 	return error;
214 out_unlock:
215 	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
216 	return error;
217 }
218 
219 /*
220  * Ensure the size update falls into a valid allocated block.
221  */
222 static int
223 xfs_pnfs_validate_isize(
224 	struct xfs_inode	*ip,
225 	xfs_off_t		isize)
226 {
227 	struct xfs_bmbt_irec	imap;
228 	int			nimaps = 1;
229 	int			error = 0;
230 
231 	xfs_ilock(ip, XFS_ILOCK_SHARED);
232 	error = xfs_bmapi_read(ip, XFS_B_TO_FSBT(ip->i_mount, isize - 1), 1,
233 				&imap, &nimaps, 0);
234 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
235 	if (error)
236 		return error;
237 
238 	if (imap.br_startblock == HOLESTARTBLOCK ||
239 	    imap.br_startblock == DELAYSTARTBLOCK ||
240 	    imap.br_state == XFS_EXT_UNWRITTEN)
241 		return -EIO;
242 	return 0;
243 }
244 
245 /*
246  * Make sure the blocks described by maps are stable on disk.  This includes
247  * converting any unwritten extents, flushing the disk cache and updating the
248  * time stamps.
249  *
250  * Note that we rely on the caller to always send us a timestamp update so that
251  * we always commit a transaction here.  If that stops being true we will have
252  * to manually flush the cache here similar to what the fsync code path does
253  * for datasyncs on files that have no dirty metadata.
254  */
255 int
256 xfs_fs_commit_blocks(
257 	struct inode		*inode,
258 	struct iomap		*maps,
259 	int			nr_maps,
260 	struct iattr		*iattr)
261 {
262 	struct xfs_inode	*ip = XFS_I(inode);
263 	struct xfs_mount	*mp = ip->i_mount;
264 	struct xfs_trans	*tp;
265 	bool			update_isize = false;
266 	int			error, i;
267 	loff_t			size;
268 
269 	ASSERT(iattr->ia_valid & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME));
270 
271 	xfs_ilock(ip, XFS_IOLOCK_EXCL);
272 
273 	size = i_size_read(inode);
274 	if ((iattr->ia_valid & ATTR_SIZE) && iattr->ia_size > size) {
275 		update_isize = true;
276 		size = iattr->ia_size;
277 	}
278 
279 	for (i = 0; i < nr_maps; i++) {
280 		u64 start, length, end;
281 
282 		start = maps[i].offset;
283 		if (start > size)
284 			continue;
285 
286 		end = start + maps[i].length;
287 		if (end > size)
288 			end = size;
289 
290 		length = end - start;
291 		if (!length)
292 			continue;
293 
294 		/*
295 		 * Make sure reads through the pagecache see the new data.
296 		 */
297 		error = invalidate_inode_pages2_range(inode->i_mapping,
298 					start >> PAGE_SHIFT,
299 					(end - 1) >> PAGE_SHIFT);
300 		WARN_ON_ONCE(error);
301 
302 		error = xfs_iomap_write_unwritten(ip, start, length, false);
303 		if (error)
304 			goto out_drop_iolock;
305 	}
306 
307 	if (update_isize) {
308 		error = xfs_pnfs_validate_isize(ip, size);
309 		if (error)
310 			goto out_drop_iolock;
311 	}
312 
313 	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
314 	if (error)
315 		goto out_drop_iolock;
316 
317 	xfs_ilock(ip, XFS_ILOCK_EXCL);
318 	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
319 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
320 
321 	ASSERT(!(iattr->ia_valid & (ATTR_UID | ATTR_GID)));
322 	setattr_copy(&nop_mnt_idmap, inode, iattr);
323 	if (update_isize) {
324 		i_size_write(inode, iattr->ia_size);
325 		ip->i_disk_size = iattr->ia_size;
326 	}
327 
328 	xfs_trans_set_sync(tp);
329 	error = xfs_trans_commit(tp);
330 
331 out_drop_iolock:
332 	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
333 	return error;
334 }
335