// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2011, Lawrence Livermore National Security, LLC.
 * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
 */


#ifdef CONFIG_COMPAT
#include <linux/compat.h>
#endif
#include <linux/fs.h>
#include <linux/migrate.h>
#include <sys/file.h>
#include <sys/dmu_objset.h>
#include <sys/zfs_znode.h>
#include <sys/zfs_vfsops.h>
#include <sys/zfs_vnops.h>
#include <sys/zfs_project.h>
#include <linux/pagemap_compat.h>
#include <linux/fadvise.h>
#ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO
#include <linux/writeback.h>
#endif

/*
 * When using fallocate(2) to preallocate space, inflate the requested
 * capacity check by 10% to account for the required metadata blocks.
 */
static unsigned int zfs_fallocate_reserve_percent = 110;
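
/*
 * A worked example of the check this enables (illustrative figures only):
 * with the default of 110, preallocating 1 GiB requires roughly
 * 1 GiB * 110 / 100 = ~1.1 GiB of available capacity, since
 * zpl_fallocate_common() scales the available bytes reported by
 * zfs_statvfs() down by 100 / zfs_fallocate_reserve_percent before
 * comparing them against the requested length.
 */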

static int
zpl_open(struct inode *ip, struct file *filp)
{
	cred_t *cr = CRED();
	int error;
	fstrans_cookie_t cookie;

	error = generic_file_open(ip, filp);
	if (error)
		return (error);

	crhold(cr);
	cookie = spl_fstrans_mark();
	error = -zfs_open(ip, filp->f_mode, filp->f_flags, cr);
	spl_fstrans_unmark(cookie);
	crfree(cr);
	ASSERT3S(error, <=, 0);

	return (error);
}

static int
zpl_release(struct inode *ip, struct file *filp)
{
	cred_t *cr = CRED();
	int error;
	fstrans_cookie_t cookie;

	cookie = spl_fstrans_mark();
	if (ITOZ(ip)->z_atime_dirty)
		zfs_mark_inode_dirty(ip);

	crhold(cr);
	error = -zfs_close(ip, filp->f_flags, cr);
	spl_fstrans_unmark(cookie);
	crfree(cr);
	ASSERT3S(error, <=, 0);

	return (error);
}

static int
zpl_iterate(struct file *filp, struct dir_context *ctx)
{
	cred_t *cr = CRED();
	int error;
	fstrans_cookie_t cookie;

	crhold(cr);
	cookie = spl_fstrans_mark();
	error = -zfs_readdir(file_inode(filp), ctx, cr);
	spl_fstrans_unmark(cookie);
	crfree(cr);
	ASSERT3S(error, <=, 0);

	return (error);
}

static int
zpl_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
{
	struct inode *inode = filp->f_mapping->host;
	znode_t *zp = ITOZ(inode);
	cred_t *cr = CRED();
	int error;
	fstrans_cookie_t cookie;

	error = filemap_write_and_wait_range(inode->i_mapping, start, end);
	if (error)
		return (error);

	crhold(cr);
	cookie = spl_fstrans_mark();
	error = -zfs_fsync(zp, datasync, cr);
	spl_fstrans_unmark(cookie);
	crfree(cr);
	ASSERT3S(error, <=, 0);

	return (error);
}

static inline int
zfs_io_flags(struct kiocb *kiocb)
{
	int flags = 0;

#if defined(IOCB_DSYNC)
	if (kiocb->ki_flags & IOCB_DSYNC)
		flags |= O_DSYNC;
#endif
#if defined(IOCB_SYNC)
	if (kiocb->ki_flags & IOCB_SYNC)
		flags |= O_SYNC;
#endif
#if defined(IOCB_APPEND)
	if (kiocb->ki_flags & IOCB_APPEND)
		flags |= O_APPEND;
#endif
#if defined(IOCB_DIRECT)
	if (kiocb->ki_flags & IOCB_DIRECT)
		flags |= O_DIRECT;
#endif
	return (flags);
}

/*
 * If relatime is enabled, call file_accessed() only if
 * zfs_relatime_need_update() is true.  This is needed because datasets
 * with an inherited "relatime" property aren't necessarily mounted with
 * the MNT_RELATIME flag (e.g. after `zfs set relatime=...`), which is
 * what the VFS relatime test in relatime_need_update() is based on.
 */
static inline void
zpl_file_accessed(struct file *filp)
{
	struct inode *ip = filp->f_mapping->host;

	if (!IS_NOATIME(ip) && ITOZSB(ip)->z_relatime) {
		if (zfs_relatime_need_update(ip))
			file_accessed(filp);
	} else {
		file_accessed(filp);
	}
}

static ssize_t
zpl_iter_read(struct kiocb *kiocb, struct iov_iter *to)
{
	cred_t *cr = CRED();
	fstrans_cookie_t cookie;
	struct file *filp = kiocb->ki_filp;
	ssize_t count = iov_iter_count(to);
	zfs_uio_t uio;

	zfs_uio_iov_iter_init(&uio, to, kiocb->ki_pos, count);

	crhold(cr);
	cookie = spl_fstrans_mark();

	ssize_t ret = -zfs_read(ITOZ(filp->f_mapping->host), &uio,
	    filp->f_flags | zfs_io_flags(kiocb), cr);

	spl_fstrans_unmark(cookie);
	crfree(cr);

	if (ret < 0)
		return (ret);

	ssize_t read = count - uio.uio_resid;
	kiocb->ki_pos += read;

	zpl_file_accessed(filp);

	return (read);
}

static inline ssize_t
zpl_generic_write_checks(struct kiocb *kiocb, struct iov_iter *from,
    size_t *countp)
{
	ssize_t ret = generic_write_checks(kiocb, from);
	if (ret <= 0)
		return (ret);

	*countp = ret;

	return (0);
}

static ssize_t
zpl_iter_write(struct kiocb *kiocb, struct iov_iter *from)
{
	cred_t *cr = CRED();
	fstrans_cookie_t cookie;
	struct file *filp = kiocb->ki_filp;
	struct inode *ip = filp->f_mapping->host;
	zfs_uio_t uio;
	size_t count = 0;
	ssize_t ret;

	ret = zpl_generic_write_checks(kiocb, from, &count);
	if (ret)
		return (ret);

	zfs_uio_iov_iter_init(&uio, from, kiocb->ki_pos, count);

	crhold(cr);
	cookie = spl_fstrans_mark();

	ret = -zfs_write(ITOZ(ip), &uio,
	    filp->f_flags | zfs_io_flags(kiocb), cr);

	spl_fstrans_unmark(cookie);
	crfree(cr);

	if (ret < 0)
		return (ret);

	ssize_t wrote = count - uio.uio_resid;
	kiocb->ki_pos += wrote;

	return (wrote);
}

static ssize_t
zpl_direct_IO(struct kiocb *kiocb, struct iov_iter *iter)
{
	/*
	 * All O_DIRECT requests should be handled by
	 * zpl_iter_{read,write}().  There is no way generic kernel code
	 * should call the direct_IO address_space_operations function.
	 * We set this code path to be fatal if it is executed.
	 */
	PANIC(0);
	return (0);
}

static loff_t
zpl_llseek(struct file *filp, loff_t offset, int whence)
{
#if defined(SEEK_HOLE) && defined(SEEK_DATA)
	fstrans_cookie_t cookie;

	if (whence == SEEK_DATA || whence == SEEK_HOLE) {
		struct inode *ip = filp->f_mapping->host;
		loff_t maxbytes = ip->i_sb->s_maxbytes;
		loff_t error;

		spl_inode_lock_shared(ip);
		cookie = spl_fstrans_mark();
		error = -zfs_holey(ITOZ(ip), whence, &offset);
		spl_fstrans_unmark(cookie);
		if (error == 0)
			error = lseek_execute(filp, ip, offset, maxbytes);
		spl_inode_unlock_shared(ip);

		return (error);
	}
#endif /* SEEK_HOLE && SEEK_DATA */

	return (generic_file_llseek(filp, offset, whence));
}

/*
 * It's worth taking a moment to describe how mmap is implemented
 * for zfs because it differs considerably from other Linux filesystems.
 * However, this issue is handled the same way under OpenSolaris.
 *
 * The issue is that by design zfs bypasses the Linux page cache and
 * leaves all caching up to the ARC.  This has been shown to work
 * well for the common read(2)/write(2) case.  However, mmap(2)
 * is a problem because it relies on being tightly integrated with the
 * page cache.  To handle this we cache mmap'ed files twice, once in
 * the ARC and a second time in the page cache.  The code is careful
 * to keep both copies synchronized.
 *
 * When a file with an mmap'ed region is written to using write(2)
 * both the data in the ARC and existing pages in the page cache
 * are updated.  For a read(2) data will be read first from the page
 * cache then the ARC if needed.  Neither a write(2) nor a read(2)
 * will ever result in new pages being added to the page cache.
 *
 * New pages are added to the page cache only via .readpage() which
 * is called when the vfs needs to read a page off disk to back the
 * virtual memory region.  These pages may be modified without
 * notifying the ARC and will be written out periodically via
 * .writepage().  This will occur due to either a sync or the usual
 * page aging behavior.  Note that because a read(2) of a mmap'ed file
 * will always check the page cache first, correct data will still be
 * returned even when the ARC is out of date.
 *
 * While this implementation ensures correct behavior it does have
 * some drawbacks.  The most obvious of which is that it increases
 * the required memory footprint when accessing mmap'ed files.  It
 * also adds additional complexity to the code, keeping both caches
 * synchronized.
 *
 * Longer term it may be possible to cleanly resolve this wart by
 * mapping page cache pages directly onto the ARC buffers.  The
 * Linux address space operations are flexible enough to allow
 * selection of which pages back a particular index.  The trick
 * would be working out the details of which subsystem is in
 * charge, the ARC, the page cache, or both.  It may also prove
 * helpful to move the ARC buffers to scatter-gather lists
 * rather than a vmalloc'ed region.
 */
static int
zpl_mmap(struct file *filp, struct vm_area_struct *vma)
{
	struct inode *ip = filp->f_mapping->host;
	int error;
	fstrans_cookie_t cookie;

	cookie = spl_fstrans_mark();
	error = -zfs_map(ip, vma->vm_pgoff, (caddr_t *)vma->vm_start,
	    (size_t)(vma->vm_end - vma->vm_start), vma->vm_flags);
	spl_fstrans_unmark(cookie);

	if (error)
		return (error);

	return (generic_file_mmap(filp, vma));
}

/*
 * Populate a page with data for the Linux page cache.  This function is
 * only used to support mmap(2).  There will be an identical copy of the
 * data in the ARC which is kept up to date via .write() and .writepage().
 */
static inline int
zpl_readpage_common(struct page *pp)
{
	fstrans_cookie_t cookie;

	ASSERT(PageLocked(pp));

	cookie = spl_fstrans_mark();
	int error = -zfs_getpage(pp->mapping->host, pp);
	spl_fstrans_unmark(cookie);

	unlock_page(pp);

	return (error);
}

#ifdef HAVE_VFS_READ_FOLIO
static int
zpl_read_folio(struct file *filp, struct folio *folio)
{
	return (zpl_readpage_common(&folio->page));
}
#else
static int
zpl_readpage(struct file *filp, struct page *pp)
{
	return (zpl_readpage_common(pp));
}
#endif

static int
zpl_readpage_filler(void *data, struct page *pp)
{
	return (zpl_readpage_common(pp));
}

/*
 * Populate a set of pages with data for the Linux page cache.  This
 * function will only be called for read ahead and never for demand
 * paging.  For simplicity, the code relies on read_cache_pages() to
 * correctly lock each page for IO and call zpl_readpage_filler().
 */
#ifdef HAVE_VFS_READPAGES
static int
zpl_readpages(struct file *filp, struct address_space *mapping,
    struct list_head *pages, unsigned nr_pages)
{
	return (read_cache_pages(mapping, pages, zpl_readpage_filler, NULL));
}
#else
static void
zpl_readahead(struct readahead_control *ractl)
{
	struct page *page;

	while ((page = readahead_page(ractl)) != NULL) {
		int ret;

		ret = zpl_readpage_filler(NULL, page);
		put_page(page);
		if (ret)
			break;
	}
}
#endif

static int
zpl_putpage(struct page *pp, struct writeback_control *wbc, void *data)
{
	boolean_t *for_sync = data;
	fstrans_cookie_t cookie;
	int ret;

	ASSERT(PageLocked(pp));
	ASSERT(!PageWriteback(pp));

	cookie = spl_fstrans_mark();
	ret = zfs_putpage(pp->mapping->host, pp, wbc, *for_sync);
	spl_fstrans_unmark(cookie);

	return (ret);
}

#ifdef HAVE_WRITEPAGE_T_FOLIO
static int
zpl_putfolio(struct folio *pp, struct writeback_control *wbc, void *data)
{
	return (zpl_putpage(&pp->page, wbc, data));
}
#endif

static inline int
zpl_write_cache_pages(struct address_space *mapping,
    struct writeback_control *wbc, void *data)
{
	int result;

#ifdef HAVE_WRITEPAGE_T_FOLIO
	result = write_cache_pages(mapping, wbc, zpl_putfolio, data);
#else
	result = write_cache_pages(mapping, wbc, zpl_putpage, data);
#endif
	return (result);
}

static int
zpl_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
	znode_t		*zp = ITOZ(mapping->host);
	zfsvfs_t	*zfsvfs = ITOZSB(mapping->host);
	enum writeback_sync_modes sync_mode;
	int result;

	if ((result = zpl_enter(zfsvfs, FTAG)) != 0)
		return (result);
	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		wbc->sync_mode = WB_SYNC_ALL;
	zpl_exit(zfsvfs, FTAG);
	sync_mode = wbc->sync_mode;

	/*
	 * We don't want to run write_cache_pages() in SYNC mode here, because
	 * that would make putpage() wait for a single page to be committed to
	 * disk every single time, resulting in atrocious performance. Instead
	 * we run it once in non-SYNC mode so that the ZIL gets all the data,
	 * and then we commit it all in one go.
	 */
	boolean_t for_sync = (sync_mode == WB_SYNC_ALL);
	wbc->sync_mode = WB_SYNC_NONE;
	result = zpl_write_cache_pages(mapping, wbc, &for_sync);
	if (sync_mode != wbc->sync_mode) {
		if ((result = zpl_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
			return (result);
		if (zfsvfs->z_log != NULL)
			zil_commit(zfsvfs->z_log, zp->z_id);
		zpl_exit(zfsvfs, FTAG);

		/*
		 * We need to call write_cache_pages() again (we can't just
		 * return after the commit) because the previous call in
		 * non-SYNC mode does not guarantee that we got all the dirty
		 * pages (see the implementation of write_cache_pages() for
		 * details). That being said, this is a no-op in most cases.
		 */
		wbc->sync_mode = sync_mode;
		result = zpl_write_cache_pages(mapping, wbc, &for_sync);
	}
	return (result);
}

#ifdef HAVE_VFS_WRITEPAGE
/*
 * Write out dirty pages to the ARC.  This function is only required to
 * support mmap(2).  Mapped pages may be dirtied by memory operations
 * which never call .write().  These dirty pages are kept in sync with
 * the ARC buffers via this hook.
 */
static int
zpl_writepage(struct page *pp, struct writeback_control *wbc)
{
	if (ITOZSB(pp->mapping->host)->z_os->os_sync == ZFS_SYNC_ALWAYS)
		wbc->sync_mode = WB_SYNC_ALL;

	boolean_t for_sync = (wbc->sync_mode == WB_SYNC_ALL);

	return (zpl_putpage(pp, wbc, &for_sync));
}
#endif

/*
 * The flag combination which matches the behavior of zfs_space() is
 * FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE.  The FALLOC_FL_PUNCH_HOLE
 * flag was introduced in the 2.6.38 kernel.
 *
 * The original mode=0 (allocate space) behavior can be reasonably emulated
 * by checking if enough space exists and creating a sparse file, as real
 * persistent space reservation is not possible due to COW, snapshots, etc.
 */
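/*
 * For illustration, a userland caller (a sketch, not part of this file)
 * reaches the two paths below as follows:
 *
 *	// punch a hole; handled by zfs_space(F_FREESP)
 *	fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, off, len);
 *
 *	// preallocate; emulated via the capacity check and zfs_freesp()
 *	fallocate(fd, 0, off, len);
 */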
static long
zpl_fallocate_common(struct inode *ip, int mode, loff_t offset, loff_t len)
{
	cred_t *cr = CRED();
	loff_t olen;
	fstrans_cookie_t cookie;
	int error = 0;

	int test_mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE;

	if ((mode & ~(FALLOC_FL_KEEP_SIZE | test_mode)) != 0)
		return (-EOPNOTSUPP);

	if (offset < 0 || len <= 0)
		return (-EINVAL);

	spl_inode_lock(ip);
	olen = i_size_read(ip);

	crhold(cr);
	cookie = spl_fstrans_mark();
	if (mode & (test_mode)) {
		flock64_t bf;

		if (mode & FALLOC_FL_KEEP_SIZE) {
			if (offset > olen)
				goto out_unmark;

			if (offset + len > olen)
				len = olen - offset;
		}
		bf.l_type = F_WRLCK;
		bf.l_whence = SEEK_SET;
		bf.l_start = offset;
		bf.l_len = len;
		bf.l_pid = 0;

		error = -zfs_space(ITOZ(ip), F_FREESP, &bf, O_RDWR, offset, cr);
	} else if ((mode & ~FALLOC_FL_KEEP_SIZE) == 0) {
		unsigned int percent = zfs_fallocate_reserve_percent;
		struct kstatfs statfs;

		/* Legacy mode, disable fallocate compatibility. */
		if (percent == 0) {
			error = -EOPNOTSUPP;
			goto out_unmark;
		}

		/*
		 * Use zfs_statvfs() instead of dmu_objset_space() since it
		 * also checks project quota limits, which are relevant here.
		 */
		error = zfs_statvfs(ip, &statfs);
		if (error)
			goto out_unmark;

		/*
		 * Shrink available space a bit to account for overhead/races.
		 * We know the product previously fit into availbytes from
		 * dmu_objset_space(), so the smaller product will also fit.
		 */
		if (len > statfs.f_bavail * (statfs.f_bsize * 100 / percent)) {
			error = -ENOSPC;
			goto out_unmark;
		}
		if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > olen)
			error = zfs_freesp(ITOZ(ip), offset + len, 0, 0, FALSE);
	}
out_unmark:
	spl_fstrans_unmark(cookie);
	spl_inode_unlock(ip);

	crfree(cr);

	return (error);
}

static long
zpl_fallocate(struct file *filp, int mode, loff_t offset, loff_t len)
{
	return (zpl_fallocate_common(file_inode(filp),
	    mode, offset, len));
}

static int
zpl_ioctl_getversion(struct file *filp, void __user *arg)
{
	uint32_t generation = file_inode(filp)->i_generation;

	return (copy_to_user(arg, &generation, sizeof (generation)));
}

static int
zpl_fadvise(struct file *filp, loff_t offset, loff_t len, int advice)
{
	struct inode *ip = file_inode(filp);
	znode_t *zp = ITOZ(ip);
	zfsvfs_t *zfsvfs = ITOZSB(ip);
	objset_t *os = zfsvfs->z_os;
	int error = 0;

	if (S_ISFIFO(ip->i_mode))
		return (-ESPIPE);

	if (offset < 0 || len < 0)
		return (-EINVAL);

	if ((error = zpl_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (error);

	switch (advice) {
	case POSIX_FADV_SEQUENTIAL:
	case POSIX_FADV_WILLNEED:
#ifdef HAVE_GENERIC_FADVISE
		if (zn_has_cached_data(zp, offset, offset + len - 1))
			error = generic_fadvise(filp, offset, len, advice);
#endif
		/*
		 * Pass on the caller's size directly, but note that
		 * dmu_prefetch_max will effectively cap it.  If there
		 * really is a larger sequential access pattern, perhaps
		 * dmu_zfetch will detect it.
		 */
		if (len == 0)
			len = i_size_read(ip) - offset;

		dmu_prefetch(os, zp->z_id, 0, offset, len,
		    ZIO_PRIORITY_ASYNC_READ);
		break;
	case POSIX_FADV_NORMAL:
	case POSIX_FADV_RANDOM:
	case POSIX_FADV_DONTNEED:
	case POSIX_FADV_NOREUSE:
		/* ignored for now */
		break;
	default:
		error = -EINVAL;
		break;
	}

	zfs_exit(zfsvfs, FTAG);

	return (error);
}

#define	ZFS_FL_USER_VISIBLE	(FS_FL_USER_VISIBLE | ZFS_PROJINHERIT_FL)
#define	ZFS_FL_USER_MODIFIABLE	(FS_FL_USER_MODIFIABLE | ZFS_PROJINHERIT_FL)

static uint32_t
__zpl_ioctl_getflags(struct inode *ip)
{
	uint64_t zfs_flags = ITOZ(ip)->z_pflags;
	uint32_t ioctl_flags = 0;

	if (zfs_flags & ZFS_IMMUTABLE)
		ioctl_flags |= FS_IMMUTABLE_FL;

	if (zfs_flags & ZFS_APPENDONLY)
		ioctl_flags |= FS_APPEND_FL;

	if (zfs_flags & ZFS_NODUMP)
		ioctl_flags |= FS_NODUMP_FL;

	if (zfs_flags & ZFS_PROJINHERIT)
		ioctl_flags |= ZFS_PROJINHERIT_FL;

	return (ioctl_flags & ZFS_FL_USER_VISIBLE);
}

/*
 * Map zfs file z_pflags (xvattr_t) to linux file attributes. Only file
 * attributes common to both Linux and Solaris are mapped.
 */
static int
zpl_ioctl_getflags(struct file *filp, void __user *arg)
{
	uint32_t flags;
	int err;

	flags = __zpl_ioctl_getflags(file_inode(filp));
	err = copy_to_user(arg, &flags, sizeof (flags));

	return (err);
}

/*
 * fchange() is a helper macro to detect if we have been asked to change a
 * flag. This is ugly, but the requirement that we do this is a consequence of
 * how the Linux file attribute interface was designed. Another consequence is
 * that concurrent modification of files suffers from a TOCTOU race. Neither
 * are things we can fix without modifying the kernel-userland interface, which
 * is outside of our jurisdiction.
 */

#define	fchange(f0, f1, b0, b1) (!((f0) & (b0)) != !((f1) & (b1)))
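
/*
 * For example (an illustrative note, not used by the code): the macro
 * normalizes each flag word to a boolean before comparing, so
 *
 *	fchange(ioctl_flags, zfs_flags, FS_APPEND_FL, ZFS_APPENDONLY)
 *
 * evaluates true exactly when the requested FS_APPEND_FL state differs
 * from the current ZFS_APPENDONLY state, i.e. the caller is toggling the
 * append-only flag in either direction.
 */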

static int
__zpl_ioctl_setflags(struct inode *ip, uint32_t ioctl_flags, xvattr_t *xva)
{
	uint64_t zfs_flags = ITOZ(ip)->z_pflags;
	xoptattr_t *xoap;

	if (ioctl_flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NODUMP_FL |
	    ZFS_PROJINHERIT_FL))
		return (-EOPNOTSUPP);

	if (ioctl_flags & ~ZFS_FL_USER_MODIFIABLE)
		return (-EACCES);

	if ((fchange(ioctl_flags, zfs_flags, FS_IMMUTABLE_FL, ZFS_IMMUTABLE) ||
	    fchange(ioctl_flags, zfs_flags, FS_APPEND_FL, ZFS_APPENDONLY)) &&
	    !capable(CAP_LINUX_IMMUTABLE))
		return (-EPERM);

	if (!zpl_inode_owner_or_capable(zfs_init_idmap, ip))
		return (-EACCES);

	xva_init(xva);
	xoap = xva_getxoptattr(xva);

#define	FLAG_CHANGE(iflag, zflag, xflag, xfield)	do {	\
	if (((ioctl_flags & (iflag)) && !(zfs_flags & (zflag))) ||	\
	    ((zfs_flags & (zflag)) && !(ioctl_flags & (iflag)))) {	\
		XVA_SET_REQ(xva, (xflag));	\
		(xfield) = ((ioctl_flags & (iflag)) != 0);	\
	}	\
} while (0)

	FLAG_CHANGE(FS_IMMUTABLE_FL, ZFS_IMMUTABLE, XAT_IMMUTABLE,
	    xoap->xoa_immutable);
	FLAG_CHANGE(FS_APPEND_FL, ZFS_APPENDONLY, XAT_APPENDONLY,
	    xoap->xoa_appendonly);
	FLAG_CHANGE(FS_NODUMP_FL, ZFS_NODUMP, XAT_NODUMP,
	    xoap->xoa_nodump);
	FLAG_CHANGE(ZFS_PROJINHERIT_FL, ZFS_PROJINHERIT, XAT_PROJINHERIT,
	    xoap->xoa_projinherit);

#undef	FLAG_CHANGE

	return (0);
}

static int
zpl_ioctl_setflags(struct file *filp, void __user *arg)
{
	struct inode *ip = file_inode(filp);
	uint32_t flags;
	cred_t *cr = CRED();
	xvattr_t xva;
	int err;
	fstrans_cookie_t cookie;

	if (copy_from_user(&flags, arg, sizeof (flags)))
		return (-EFAULT);

	err = __zpl_ioctl_setflags(ip, flags, &xva);
	if (err)
		return (err);

	crhold(cr);
	cookie = spl_fstrans_mark();
	err = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr, zfs_init_idmap);
	spl_fstrans_unmark(cookie);
	crfree(cr);

	return (err);
}

static int
zpl_ioctl_getxattr(struct file *filp, void __user *arg)
{
	zfsxattr_t fsx = { 0 };
	struct inode *ip = file_inode(filp);
	int err;

	fsx.fsx_xflags = __zpl_ioctl_getflags(ip);
	fsx.fsx_projid = ITOZ(ip)->z_projid;
	err = copy_to_user(arg, &fsx, sizeof (fsx));

	return (err);
}

static int
zpl_ioctl_setxattr(struct file *filp, void __user *arg)
{
	struct inode *ip = file_inode(filp);
	zfsxattr_t fsx;
	cred_t *cr = CRED();
	xvattr_t xva;
	xoptattr_t *xoap;
	int err;
	fstrans_cookie_t cookie;

	if (copy_from_user(&fsx, arg, sizeof (fsx)))
		return (-EFAULT);

	if (!zpl_is_valid_projid(fsx.fsx_projid))
		return (-EINVAL);

	err = __zpl_ioctl_setflags(ip, fsx.fsx_xflags, &xva);
	if (err)
		return (err);

	xoap = xva_getxoptattr(&xva);
	XVA_SET_REQ(&xva, XAT_PROJID);
	xoap->xoa_projid = fsx.fsx_projid;

	crhold(cr);
	cookie = spl_fstrans_mark();
	err = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr, zfs_init_idmap);
	spl_fstrans_unmark(cookie);
	crfree(cr);

	return (err);
}

/*
 * Expose additional file-level attributes of ZFS.
 */
static int
zpl_ioctl_getdosflags(struct file *filp, void __user *arg)
{
	struct inode *ip = file_inode(filp);
	uint64_t dosflags = ITOZ(ip)->z_pflags;
	dosflags &= ZFS_DOS_FL_USER_VISIBLE;
	int err = copy_to_user(arg, &dosflags, sizeof (dosflags));

	return (err);
}

static int
__zpl_ioctl_setdosflags(struct inode *ip, uint64_t ioctl_flags, xvattr_t *xva)
{
	uint64_t zfs_flags = ITOZ(ip)->z_pflags;
	xoptattr_t *xoap;

	if (ioctl_flags & (~ZFS_DOS_FL_USER_VISIBLE))
		return (-EOPNOTSUPP);

	if ((fchange(ioctl_flags, zfs_flags, ZFS_IMMUTABLE, ZFS_IMMUTABLE) ||
	    fchange(ioctl_flags, zfs_flags, ZFS_APPENDONLY, ZFS_APPENDONLY)) &&
	    !capable(CAP_LINUX_IMMUTABLE))
		return (-EPERM);

	if (!zpl_inode_owner_or_capable(zfs_init_idmap, ip))
		return (-EACCES);

	xva_init(xva);
	xoap = xva_getxoptattr(xva);

#define	FLAG_CHANGE(iflag, xflag, xfield)	do {	\
	if (((ioctl_flags & (iflag)) && !(zfs_flags & (iflag))) ||	\
	    ((zfs_flags & (iflag)) && !(ioctl_flags & (iflag)))) {	\
		XVA_SET_REQ(xva, (xflag));	\
		(xfield) = ((ioctl_flags & (iflag)) != 0);	\
	}	\
} while (0)

	FLAG_CHANGE(ZFS_IMMUTABLE, XAT_IMMUTABLE, xoap->xoa_immutable);
	FLAG_CHANGE(ZFS_APPENDONLY, XAT_APPENDONLY, xoap->xoa_appendonly);
	FLAG_CHANGE(ZFS_NODUMP, XAT_NODUMP, xoap->xoa_nodump);
	FLAG_CHANGE(ZFS_READONLY, XAT_READONLY, xoap->xoa_readonly);
	FLAG_CHANGE(ZFS_HIDDEN, XAT_HIDDEN, xoap->xoa_hidden);
	FLAG_CHANGE(ZFS_SYSTEM, XAT_SYSTEM, xoap->xoa_system);
	FLAG_CHANGE(ZFS_ARCHIVE, XAT_ARCHIVE, xoap->xoa_archive);
	FLAG_CHANGE(ZFS_NOUNLINK, XAT_NOUNLINK, xoap->xoa_nounlink);
	FLAG_CHANGE(ZFS_REPARSE, XAT_REPARSE, xoap->xoa_reparse);
	FLAG_CHANGE(ZFS_OFFLINE, XAT_OFFLINE, xoap->xoa_offline);
	FLAG_CHANGE(ZFS_SPARSE, XAT_SPARSE, xoap->xoa_sparse);

#undef	FLAG_CHANGE

	return (0);
}

/*
 * Set additional file-level attributes of ZFS.
 */
static int
zpl_ioctl_setdosflags(struct file *filp, void __user *arg)
{
	struct inode *ip = file_inode(filp);
	uint64_t dosflags;
	cred_t *cr = CRED();
	xvattr_t xva;
	int err;
	fstrans_cookie_t cookie;

	if (copy_from_user(&dosflags, arg, sizeof (dosflags)))
		return (-EFAULT);

	err = __zpl_ioctl_setdosflags(ip, dosflags, &xva);
	if (err)
		return (err);

	crhold(cr);
	cookie = spl_fstrans_mark();
	err = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr, zfs_init_idmap);
	spl_fstrans_unmark(cookie);
	crfree(cr);

	return (err);
}

static int
zpl_ioctl_rewrite(struct file *filp, void __user *arg)
{
	struct inode *ip = file_inode(filp);
	zfs_rewrite_args_t args;
	fstrans_cookie_t cookie;
	int err;

	if (copy_from_user(&args, arg, sizeof (args)))
		return (-EFAULT);

	if (unlikely(!(filp->f_mode & FMODE_WRITE)))
		return (-EBADF);

	cookie = spl_fstrans_mark();
	err = -zfs_rewrite(ITOZ(ip), args.off, args.len, args.flags, args.arg);
	spl_fstrans_unmark(cookie);

	return (err);
}

static long
zpl_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
	switch (cmd) {
	case FS_IOC_GETVERSION:
		return (zpl_ioctl_getversion(filp, (void *)arg));
	case FS_IOC_GETFLAGS:
		return (zpl_ioctl_getflags(filp, (void *)arg));
	case FS_IOC_SETFLAGS:
		return (zpl_ioctl_setflags(filp, (void *)arg));
	case ZFS_IOC_FSGETXATTR:
		return (zpl_ioctl_getxattr(filp, (void *)arg));
	case ZFS_IOC_FSSETXATTR:
		return (zpl_ioctl_setxattr(filp, (void *)arg));
	case ZFS_IOC_GETDOSFLAGS:
		return (zpl_ioctl_getdosflags(filp, (void *)arg));
	case ZFS_IOC_SETDOSFLAGS:
		return (zpl_ioctl_setdosflags(filp, (void *)arg));
	case ZFS_IOC_REWRITE:
		return (zpl_ioctl_rewrite(filp, (void *)arg));
	default:
		return (-ENOTTY);
	}
}
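
/*
 * Userland reaches the handlers above through the regular ioctl(2) entry
 * point, e.g. (a sketch; the fd and flags variables are hypothetical, but
 * the uint32_t width matches zpl_ioctl_getflags() above):
 *
 *	unsigned int flags;
 *	if (ioctl(fd, FS_IOC_GETFLAGS, &flags) == 0)
 *		flags &= ~FS_NODUMP_FL;
 */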

#ifdef CONFIG_COMPAT
static long
zpl_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
	switch (cmd) {
	case FS_IOC32_GETVERSION:
		cmd = FS_IOC_GETVERSION;
		break;
	case FS_IOC32_GETFLAGS:
		cmd = FS_IOC_GETFLAGS;
		break;
	case FS_IOC32_SETFLAGS:
		cmd = FS_IOC_SETFLAGS;
		break;
	default:
		return (-ENOTTY);
	}
	return (zpl_ioctl(filp, cmd, (unsigned long)compat_ptr(arg)));
}
#endif /* CONFIG_COMPAT */

const struct address_space_operations zpl_address_space_operations = {
#ifdef HAVE_VFS_READPAGES
	.readpages	= zpl_readpages,
#else
	.readahead	= zpl_readahead,
#endif
#ifdef HAVE_VFS_READ_FOLIO
	.read_folio	= zpl_read_folio,
#else
	.readpage	= zpl_readpage,
#endif
#ifdef HAVE_VFS_WRITEPAGE
	.writepage	= zpl_writepage,
#endif
	.writepages	= zpl_writepages,
	.direct_IO	= zpl_direct_IO,
#ifdef HAVE_VFS_SET_PAGE_DIRTY_NOBUFFERS
	.set_page_dirty = __set_page_dirty_nobuffers,
#endif
#ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO
	.dirty_folio	= filemap_dirty_folio,
#endif
#ifdef HAVE_VFS_MIGRATE_FOLIO
	.migrate_folio	= migrate_folio,
#elif defined(HAVE_VFS_MIGRATEPAGE)
	.migratepage	= migrate_page,
#endif
};

const struct file_operations zpl_file_operations = {
	.open		= zpl_open,
	.release	= zpl_release,
	.llseek		= zpl_llseek,
	.read_iter	= zpl_iter_read,
	.write_iter	= zpl_iter_write,
#ifdef HAVE_COPY_SPLICE_READ
	.splice_read	= copy_splice_read,
#else
	.splice_read	= generic_file_splice_read,
#endif
	.splice_write	= iter_file_splice_write,
	.mmap		= zpl_mmap,
	.fsync		= zpl_fsync,
	.fallocate	= zpl_fallocate,
	.copy_file_range	= zpl_copy_file_range,
#ifdef HAVE_VFS_CLONE_FILE_RANGE
	.clone_file_range	= zpl_clone_file_range,
#endif
#ifdef HAVE_VFS_REMAP_FILE_RANGE
	.remap_file_range	= zpl_remap_file_range,
#endif
#ifdef HAVE_VFS_DEDUPE_FILE_RANGE
	.dedupe_file_range	= zpl_dedupe_file_range,
#endif
	.fadvise	= zpl_fadvise,
	.unlocked_ioctl	= zpl_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= zpl_compat_ioctl,
#endif
};

const struct file_operations zpl_dir_file_operations = {
	.llseek		= generic_file_llseek,
	.read		= generic_read_dir,
	.iterate_shared	= zpl_iterate,
	.fsync		= zpl_fsync,
	.unlocked_ioctl = zpl_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl   = zpl_compat_ioctl,
#endif
};

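/*
 * The reserve percentage can be tuned at runtime, e.g. (a sketch, assuming
 * the standard module-parameter sysfs path for the zfs module):
 *
 *	echo 150 > /sys/module/zfs/parameters/zfs_fallocate_reserve_percent
 *
 * Setting it to 0 disables fallocate(mode=0) preallocation entirely,
 * restoring the legacy -EOPNOTSUPP behavior.
 */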
module_param(zfs_fallocate_reserve_percent, uint, 0644);
MODULE_PARM_DESC(zfs_fallocate_reserve_percent,
	"Percentage of length to use for the available capacity check");