xref: /freebsd/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c (revision e2eeea75eb8b6dd50c1298067a0655880d186734)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2011, Lawrence Livermore National Security, LLC.
23  * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
24  */
25 
26 
27 #ifdef CONFIG_COMPAT
28 #include <linux/compat.h>
29 #endif
30 #include <sys/file.h>
31 #include <sys/dmu_objset.h>
32 #include <sys/zfs_znode.h>
33 #include <sys/zfs_vfsops.h>
34 #include <sys/zfs_vnops.h>
35 #include <sys/zfs_project.h>
36 
/*
 * When using fallocate(2) to preallocate space, inflate the requested
 * capacity check by 10% to account for the required metadata blocks.
 *
 * The value is a percentage of the requested length (110 means the
 * length plus 10%).  A value of 0 makes mode-0 fallocate requests fail
 * with EOPNOTSUPP; see zpl_fallocate_common().  It is exported as a
 * module parameter at the end of this file.
 */
unsigned int zfs_fallocate_reserve_percent = 110;
42 
43 static int
44 zpl_open(struct inode *ip, struct file *filp)
45 {
46 	cred_t *cr = CRED();
47 	int error;
48 	fstrans_cookie_t cookie;
49 
50 	error = generic_file_open(ip, filp);
51 	if (error)
52 		return (error);
53 
54 	crhold(cr);
55 	cookie = spl_fstrans_mark();
56 	error = -zfs_open(ip, filp->f_mode, filp->f_flags, cr);
57 	spl_fstrans_unmark(cookie);
58 	crfree(cr);
59 	ASSERT3S(error, <=, 0);
60 
61 	return (error);
62 }
63 
64 static int
65 zpl_release(struct inode *ip, struct file *filp)
66 {
67 	cred_t *cr = CRED();
68 	int error;
69 	fstrans_cookie_t cookie;
70 
71 	cookie = spl_fstrans_mark();
72 	if (ITOZ(ip)->z_atime_dirty)
73 		zfs_mark_inode_dirty(ip);
74 
75 	crhold(cr);
76 	error = -zfs_close(ip, filp->f_flags, cr);
77 	spl_fstrans_unmark(cookie);
78 	crfree(cr);
79 	ASSERT3S(error, <=, 0);
80 
81 	return (error);
82 }
83 
84 static int
85 zpl_iterate(struct file *filp, zpl_dir_context_t *ctx)
86 {
87 	cred_t *cr = CRED();
88 	int error;
89 	fstrans_cookie_t cookie;
90 
91 	crhold(cr);
92 	cookie = spl_fstrans_mark();
93 	error = -zfs_readdir(file_inode(filp), ctx, cr);
94 	spl_fstrans_unmark(cookie);
95 	crfree(cr);
96 	ASSERT3S(error, <=, 0);
97 
98 	return (error);
99 }
100 
101 #if !defined(HAVE_VFS_ITERATE) && !defined(HAVE_VFS_ITERATE_SHARED)
102 static int
103 zpl_readdir(struct file *filp, void *dirent, filldir_t filldir)
104 {
105 	zpl_dir_context_t ctx =
106 	    ZPL_DIR_CONTEXT_INIT(dirent, filldir, filp->f_pos);
107 	int error;
108 
109 	error = zpl_iterate(filp, &ctx);
110 	filp->f_pos = ctx.pos;
111 
112 	return (error);
113 }
114 #endif /* !HAVE_VFS_ITERATE && !HAVE_VFS_ITERATE_SHARED */
115 
116 #if defined(HAVE_FSYNC_WITHOUT_DENTRY)
117 /*
118  * Linux 2.6.35 - 3.0 API,
119  * As of 2.6.35 the dentry argument to the fops->fsync() hook was deemed
120  * redundant.  The dentry is still accessible via filp->f_path.dentry,
121  * and we are guaranteed that filp will never be NULL.
122  */
123 static int
124 zpl_fsync(struct file *filp, int datasync)
125 {
126 	struct inode *inode = filp->f_mapping->host;
127 	cred_t *cr = CRED();
128 	int error;
129 	fstrans_cookie_t cookie;
130 
131 	crhold(cr);
132 	cookie = spl_fstrans_mark();
133 	error = -zfs_fsync(ITOZ(inode), datasync, cr);
134 	spl_fstrans_unmark(cookie);
135 	crfree(cr);
136 	ASSERT3S(error, <=, 0);
137 
138 	return (error);
139 }
140 
141 #ifdef HAVE_FILE_AIO_FSYNC
142 static int
143 zpl_aio_fsync(struct kiocb *kiocb, int datasync)
144 {
145 	return (zpl_fsync(kiocb->ki_filp, datasync));
146 }
147 #endif
148 
149 #elif defined(HAVE_FSYNC_RANGE)
150 /*
151  * Linux 3.1 - 3.x API,
152  * As of 3.1 the responsibility to call filemap_write_and_wait_range() has
153  * been pushed down in to the .fsync() vfs hook.  Additionally, the i_mutex
154  * lock is no longer held by the caller, for zfs we don't require the lock
155  * to be held so we don't acquire it.
156  */
157 static int
158 zpl_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
159 {
160 	struct inode *inode = filp->f_mapping->host;
161 	cred_t *cr = CRED();
162 	int error;
163 	fstrans_cookie_t cookie;
164 
165 	error = filemap_write_and_wait_range(inode->i_mapping, start, end);
166 	if (error)
167 		return (error);
168 
169 	crhold(cr);
170 	cookie = spl_fstrans_mark();
171 	error = -zfs_fsync(ITOZ(inode), datasync, cr);
172 	spl_fstrans_unmark(cookie);
173 	crfree(cr);
174 	ASSERT3S(error, <=, 0);
175 
176 	return (error);
177 }
178 
179 #ifdef HAVE_FILE_AIO_FSYNC
180 static int
181 zpl_aio_fsync(struct kiocb *kiocb, int datasync)
182 {
183 	return (zpl_fsync(kiocb->ki_filp, kiocb->ki_pos, -1, datasync));
184 }
185 #endif
186 
187 #else
188 #error "Unsupported fops->fsync() implementation"
189 #endif
190 
/*
 * Translate the IOCB_* bits set in kiocb->ki_flags into the matching O_*
 * open flags.  Each bit is only examined when the kernel headers define it.
 */
static inline int
zfs_io_flags(struct kiocb *kiocb)
{
	int oflags = 0;

#if defined(IOCB_DSYNC)
	oflags |= (kiocb->ki_flags & IOCB_DSYNC) ? O_DSYNC : 0;
#endif
#if defined(IOCB_SYNC)
	oflags |= (kiocb->ki_flags & IOCB_SYNC) ? O_SYNC : 0;
#endif
#if defined(IOCB_APPEND)
	oflags |= (kiocb->ki_flags & IOCB_APPEND) ? O_APPEND : 0;
#endif
#if defined(IOCB_DIRECT)
	oflags |= (kiocb->ki_flags & IOCB_DIRECT) ? O_DIRECT : 0;
#endif
	return (oflags);
}
214 
215 static ssize_t
216 zpl_read_common_iovec(struct inode *ip, const struct iovec *iovp, size_t count,
217     unsigned long nr_segs, loff_t *ppos, uio_seg_t segment, int flags,
218     cred_t *cr, size_t skip)
219 {
220 	ssize_t read;
221 	uio_t uio = { { 0 }, 0 };
222 	int error;
223 	fstrans_cookie_t cookie;
224 
225 	uio.uio_iov = iovp;
226 	uio.uio_iovcnt = nr_segs;
227 	uio.uio_loffset = *ppos;
228 	uio.uio_segflg = segment;
229 	uio.uio_limit = MAXOFFSET_T;
230 	uio.uio_resid = count;
231 	uio.uio_skip = skip;
232 
233 	cookie = spl_fstrans_mark();
234 	error = -zfs_read(ip, &uio, flags, cr);
235 	spl_fstrans_unmark(cookie);
236 	if (error < 0)
237 		return (error);
238 
239 	read = count - uio.uio_resid;
240 	*ppos += read;
241 
242 	return (read);
243 }
244 
245 inline ssize_t
246 zpl_read_common(struct inode *ip, const char *buf, size_t len, loff_t *ppos,
247     uio_seg_t segment, int flags, cred_t *cr)
248 {
249 	struct iovec iov;
250 
251 	iov.iov_base = (void *)buf;
252 	iov.iov_len = len;
253 
254 	return (zpl_read_common_iovec(ip, &iov, len, 1, ppos, segment,
255 	    flags, cr, 0));
256 }
257 
258 static ssize_t
259 zpl_iter_read_common(struct kiocb *kiocb, const struct iovec *iovp,
260     unsigned long nr_segs, size_t count, uio_seg_t seg, size_t skip)
261 {
262 	cred_t *cr = CRED();
263 	struct file *filp = kiocb->ki_filp;
264 	struct inode *ip = filp->f_mapping->host;
265 	zfsvfs_t *zfsvfs = ZTOZSB(ITOZ(ip));
266 	ssize_t read;
267 	unsigned int f_flags = filp->f_flags;
268 
269 	f_flags |= zfs_io_flags(kiocb);
270 	crhold(cr);
271 	read = zpl_read_common_iovec(filp->f_mapping->host, iovp, count,
272 	    nr_segs, &kiocb->ki_pos, seg, f_flags, cr, skip);
273 	crfree(cr);
274 
275 	/*
276 	 * If relatime is enabled, call file_accessed() only if
277 	 * zfs_relatime_need_update() is true.  This is needed since datasets
278 	 * with inherited "relatime" property aren't necessarily mounted with
279 	 * MNT_RELATIME flag (e.g. after `zfs set relatime=...`), which is what
280 	 * relatime test in VFS by relatime_need_update() is based on.
281 	 */
282 	if (!IS_NOATIME(ip) && zfsvfs->z_relatime) {
283 		if (zfs_relatime_need_update(ip))
284 			file_accessed(filp);
285 	} else {
286 		file_accessed(filp);
287 	}
288 
289 	return (read);
290 }
291 
292 #if defined(HAVE_VFS_RW_ITERATE)
293 static ssize_t
294 zpl_iter_read(struct kiocb *kiocb, struct iov_iter *to)
295 {
296 	ssize_t ret;
297 	uio_seg_t seg = UIO_USERSPACE;
298 	if (to->type & ITER_KVEC)
299 		seg = UIO_SYSSPACE;
300 	if (to->type & ITER_BVEC)
301 		seg = UIO_BVEC;
302 	ret = zpl_iter_read_common(kiocb, to->iov, to->nr_segs,
303 	    iov_iter_count(to), seg, to->iov_offset);
304 	if (ret > 0)
305 		iov_iter_advance(to, ret);
306 	return (ret);
307 }
308 #else
309 static ssize_t
310 zpl_aio_read(struct kiocb *kiocb, const struct iovec *iovp,
311     unsigned long nr_segs, loff_t pos)
312 {
313 	ssize_t ret;
314 	size_t count;
315 
316 	ret = generic_segment_checks(iovp, &nr_segs, &count, VERIFY_WRITE);
317 	if (ret)
318 		return (ret);
319 
320 	return (zpl_iter_read_common(kiocb, iovp, nr_segs, count,
321 	    UIO_USERSPACE, 0));
322 }
323 #endif /* HAVE_VFS_RW_ITERATE */
324 
325 static ssize_t
326 zpl_write_common_iovec(struct inode *ip, const struct iovec *iovp, size_t count,
327     unsigned long nr_segs, loff_t *ppos, uio_seg_t segment, int flags,
328     cred_t *cr, size_t skip)
329 {
330 	ssize_t wrote;
331 	uio_t uio = { { 0 }, 0 };
332 	int error;
333 	fstrans_cookie_t cookie;
334 
335 	if (flags & O_APPEND)
336 		*ppos = i_size_read(ip);
337 
338 	uio.uio_iov = iovp;
339 	uio.uio_iovcnt = nr_segs;
340 	uio.uio_loffset = *ppos;
341 	uio.uio_segflg = segment;
342 	uio.uio_limit = MAXOFFSET_T;
343 	uio.uio_resid = count;
344 	uio.uio_skip = skip;
345 
346 	cookie = spl_fstrans_mark();
347 	error = -zfs_write(ip, &uio, flags, cr);
348 	spl_fstrans_unmark(cookie);
349 	if (error < 0)
350 		return (error);
351 
352 	wrote = count - uio.uio_resid;
353 	*ppos += wrote;
354 
355 	return (wrote);
356 }
357 
358 inline ssize_t
359 zpl_write_common(struct inode *ip, const char *buf, size_t len, loff_t *ppos,
360     uio_seg_t segment, int flags, cred_t *cr)
361 {
362 	struct iovec iov;
363 
364 	iov.iov_base = (void *)buf;
365 	iov.iov_len = len;
366 
367 	return (zpl_write_common_iovec(ip, &iov, len, 1, ppos, segment,
368 	    flags, cr, 0));
369 }
370 
371 static ssize_t
372 zpl_iter_write_common(struct kiocb *kiocb, const struct iovec *iovp,
373     unsigned long nr_segs, size_t count, uio_seg_t seg, size_t skip)
374 {
375 	cred_t *cr = CRED();
376 	struct file *filp = kiocb->ki_filp;
377 	ssize_t wrote;
378 	unsigned int f_flags = filp->f_flags;
379 
380 	f_flags |= zfs_io_flags(kiocb);
381 	crhold(cr);
382 	wrote = zpl_write_common_iovec(filp->f_mapping->host, iovp, count,
383 	    nr_segs, &kiocb->ki_pos, seg, f_flags, cr, skip);
384 	crfree(cr);
385 
386 	return (wrote);
387 }
388 
389 #if defined(HAVE_VFS_RW_ITERATE)
390 static ssize_t
391 zpl_iter_write(struct kiocb *kiocb, struct iov_iter *from)
392 {
393 	size_t count;
394 	ssize_t ret;
395 	uio_seg_t seg = UIO_USERSPACE;
396 
397 #ifndef HAVE_GENERIC_WRITE_CHECKS_KIOCB
398 	struct file *file = kiocb->ki_filp;
399 	struct address_space *mapping = file->f_mapping;
400 	struct inode *ip = mapping->host;
401 	int isblk = S_ISBLK(ip->i_mode);
402 
403 	count = iov_iter_count(from);
404 	ret = generic_write_checks(file, &kiocb->ki_pos, &count, isblk);
405 	if (ret)
406 		return (ret);
407 #else
408 	/*
409 	 * XXX - ideally this check should be in the same lock region with
410 	 * write operations, so that there's no TOCTTOU race when doing
411 	 * append and someone else grow the file.
412 	 */
413 	ret = generic_write_checks(kiocb, from);
414 	if (ret <= 0)
415 		return (ret);
416 	count = ret;
417 #endif
418 
419 	if (from->type & ITER_KVEC)
420 		seg = UIO_SYSSPACE;
421 	if (from->type & ITER_BVEC)
422 		seg = UIO_BVEC;
423 
424 	ret = zpl_iter_write_common(kiocb, from->iov, from->nr_segs,
425 	    count, seg, from->iov_offset);
426 	if (ret > 0)
427 		iov_iter_advance(from, ret);
428 
429 	return (ret);
430 }
431 #else
432 static ssize_t
433 zpl_aio_write(struct kiocb *kiocb, const struct iovec *iovp,
434     unsigned long nr_segs, loff_t pos)
435 {
436 	struct file *file = kiocb->ki_filp;
437 	struct address_space *mapping = file->f_mapping;
438 	struct inode *ip = mapping->host;
439 	int isblk = S_ISBLK(ip->i_mode);
440 	size_t count;
441 	ssize_t ret;
442 
443 	ret = generic_segment_checks(iovp, &nr_segs, &count, VERIFY_READ);
444 	if (ret)
445 		return (ret);
446 
447 	ret = generic_write_checks(file, &pos, &count, isblk);
448 	if (ret)
449 		return (ret);
450 
451 	return (zpl_iter_write_common(kiocb, iovp, nr_segs, count,
452 	    UIO_USERSPACE, 0));
453 }
454 #endif /* HAVE_VFS_RW_ITERATE */
455 
456 #if defined(HAVE_VFS_RW_ITERATE)
457 static ssize_t
458 zpl_direct_IO_impl(int rw, struct kiocb *kiocb, struct iov_iter *iter)
459 {
460 	if (rw == WRITE)
461 		return (zpl_iter_write(kiocb, iter));
462 	else
463 		return (zpl_iter_read(kiocb, iter));
464 }
465 #if defined(HAVE_VFS_DIRECT_IO_ITER)
466 static ssize_t
467 zpl_direct_IO(struct kiocb *kiocb, struct iov_iter *iter)
468 {
469 	return (zpl_direct_IO_impl(iov_iter_rw(iter), kiocb, iter));
470 }
471 #elif defined(HAVE_VFS_DIRECT_IO_ITER_OFFSET)
472 static ssize_t
473 zpl_direct_IO(struct kiocb *kiocb, struct iov_iter *iter, loff_t pos)
474 {
475 	ASSERT3S(pos, ==, kiocb->ki_pos);
476 	return (zpl_direct_IO_impl(iov_iter_rw(iter), kiocb, iter));
477 }
478 #elif defined(HAVE_VFS_DIRECT_IO_ITER_RW_OFFSET)
479 static ssize_t
480 zpl_direct_IO(int rw, struct kiocb *kiocb, struct iov_iter *iter, loff_t pos)
481 {
482 	ASSERT3S(pos, ==, kiocb->ki_pos);
483 	return (zpl_direct_IO_impl(rw, kiocb, iter));
484 }
485 #else
486 #error "Unknown direct IO interface"
487 #endif
488 
489 #else
490 
491 #if defined(HAVE_VFS_DIRECT_IO_IOVEC)
492 static ssize_t
493 zpl_direct_IO(int rw, struct kiocb *kiocb, const struct iovec *iovp,
494     loff_t pos, unsigned long nr_segs)
495 {
496 	if (rw == WRITE)
497 		return (zpl_aio_write(kiocb, iovp, nr_segs, pos));
498 	else
499 		return (zpl_aio_read(kiocb, iovp, nr_segs, pos));
500 }
501 #else
502 #error "Unknown direct IO interface"
503 #endif
504 
505 #endif /* HAVE_VFS_RW_ITERATE */
506 
507 static loff_t
508 zpl_llseek(struct file *filp, loff_t offset, int whence)
509 {
510 #if defined(SEEK_HOLE) && defined(SEEK_DATA)
511 	fstrans_cookie_t cookie;
512 
513 	if (whence == SEEK_DATA || whence == SEEK_HOLE) {
514 		struct inode *ip = filp->f_mapping->host;
515 		loff_t maxbytes = ip->i_sb->s_maxbytes;
516 		loff_t error;
517 
518 		spl_inode_lock_shared(ip);
519 		cookie = spl_fstrans_mark();
520 		error = -zfs_holey(ip, whence, &offset);
521 		spl_fstrans_unmark(cookie);
522 		if (error == 0)
523 			error = lseek_execute(filp, ip, offset, maxbytes);
524 		spl_inode_unlock_shared(ip);
525 
526 		return (error);
527 	}
528 #endif /* SEEK_HOLE && SEEK_DATA */
529 
530 	return (generic_file_llseek(filp, offset, whence));
531 }
532 
533 /*
534  * It's worth taking a moment to describe how mmap is implemented
535  * for zfs because it differs considerably from other Linux filesystems.
536  * However, this issue is handled the same way under OpenSolaris.
537  *
538  * The issue is that by design zfs bypasses the Linux page cache and
539  * leaves all caching up to the ARC.  This has been shown to work
 * well for the common read(2)/write(2) case.  However, mmap(2)
 * is a problem because it relies on being tightly integrated with the
 * page cache.  To handle this we cache mmap'ed files twice, once in
543  * the ARC and a second time in the page cache.  The code is careful
544  * to keep both copies synchronized.
545  *
546  * When a file with an mmap'ed region is written to using write(2)
547  * both the data in the ARC and existing pages in the page cache
548  * are updated.  For a read(2) data will be read first from the page
 * cache then the ARC if needed.  Neither a write(2) nor a read(2)
 * will ever result in new pages being added to the page cache.
551  *
552  * New pages are added to the page cache only via .readpage() which
553  * is called when the vfs needs to read a page off disk to back the
554  * virtual memory region.  These pages may be modified without
555  * notifying the ARC and will be written out periodically via
556  * .writepage().  This will occur due to either a sync or the usual
557  * page aging behavior.  Note because a read(2) of a mmap'ed file
558  * will always check the page cache first even when the ARC is out
559  * of date correct data will still be returned.
560  *
 * While this implementation ensures correct behavior it does
 * have some drawbacks.  The most obvious of which is that it
 * increases the required memory footprint when accessing mmap'ed
 * files.  It also adds additional complexity to the code keeping
565  * both caches synchronized.
566  *
567  * Longer term it may be possible to cleanly resolve this wart by
568  * mapping page cache pages directly on to the ARC buffers.  The
569  * Linux address space operations are flexible enough to allow
570  * selection of which pages back a particular index.  The trick
571  * would be working out the details of which subsystem is in
 * charge, the ARC, the page cache, or both.  It may also prove
 * helpful to move the ARC buffers to scatter-gather lists
 * rather than a vmalloc'ed region.
575  */
576 static int
577 zpl_mmap(struct file *filp, struct vm_area_struct *vma)
578 {
579 	struct inode *ip = filp->f_mapping->host;
580 	znode_t *zp = ITOZ(ip);
581 	int error;
582 	fstrans_cookie_t cookie;
583 
584 	cookie = spl_fstrans_mark();
585 	error = -zfs_map(ip, vma->vm_pgoff, (caddr_t *)vma->vm_start,
586 	    (size_t)(vma->vm_end - vma->vm_start), vma->vm_flags);
587 	spl_fstrans_unmark(cookie);
588 	if (error)
589 		return (error);
590 
591 	error = generic_file_mmap(filp, vma);
592 	if (error)
593 		return (error);
594 
595 	mutex_enter(&zp->z_lock);
596 	zp->z_is_mapped = B_TRUE;
597 	mutex_exit(&zp->z_lock);
598 
599 	return (error);
600 }
601 
602 /*
603  * Populate a page with data for the Linux page cache.  This function is
604  * only used to support mmap(2).  There will be an identical copy of the
605  * data in the ARC which is kept up to date via .write() and .writepage().
606  *
607  * Current this function relies on zpl_read_common() and the O_DIRECT
608  * flag to read in a page.  This works but the more correct way is to
609  * update zfs_fillpage() to be Linux friendly and use that interface.
610  */
611 static int
612 zpl_readpage(struct file *filp, struct page *pp)
613 {
614 	struct inode *ip;
615 	struct page *pl[1];
616 	int error = 0;
617 	fstrans_cookie_t cookie;
618 
619 	ASSERT(PageLocked(pp));
620 	ip = pp->mapping->host;
621 	pl[0] = pp;
622 
623 	cookie = spl_fstrans_mark();
624 	error = -zfs_getpage(ip, pl, 1);
625 	spl_fstrans_unmark(cookie);
626 
627 	if (error) {
628 		SetPageError(pp);
629 		ClearPageUptodate(pp);
630 	} else {
631 		ClearPageError(pp);
632 		SetPageUptodate(pp);
633 		flush_dcache_page(pp);
634 	}
635 
636 	unlock_page(pp);
637 	return (error);
638 }
639 
640 /*
641  * Populate a set of pages with data for the Linux page cache.  This
642  * function will only be called for read ahead and never for demand
643  * paging.  For simplicity, the code relies on read_cache_pages() to
644  * correctly lock each page for IO and call zpl_readpage().
645  */
646 static int
647 zpl_readpages(struct file *filp, struct address_space *mapping,
648     struct list_head *pages, unsigned nr_pages)
649 {
650 	return (read_cache_pages(mapping, pages,
651 	    (filler_t *)zpl_readpage, filp));
652 }
653 
654 static int
655 zpl_putpage(struct page *pp, struct writeback_control *wbc, void *data)
656 {
657 	struct address_space *mapping = data;
658 	fstrans_cookie_t cookie;
659 
660 	ASSERT(PageLocked(pp));
661 	ASSERT(!PageWriteback(pp));
662 
663 	cookie = spl_fstrans_mark();
664 	(void) zfs_putpage(mapping->host, pp, wbc);
665 	spl_fstrans_unmark(cookie);
666 
667 	return (0);
668 }
669 
670 static int
671 zpl_writepages(struct address_space *mapping, struct writeback_control *wbc)
672 {
673 	znode_t		*zp = ITOZ(mapping->host);
674 	zfsvfs_t	*zfsvfs = ITOZSB(mapping->host);
675 	enum writeback_sync_modes sync_mode;
676 	int result;
677 
678 	ZFS_ENTER(zfsvfs);
679 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
680 		wbc->sync_mode = WB_SYNC_ALL;
681 	ZFS_EXIT(zfsvfs);
682 	sync_mode = wbc->sync_mode;
683 
684 	/*
685 	 * We don't want to run write_cache_pages() in SYNC mode here, because
686 	 * that would make putpage() wait for a single page to be committed to
687 	 * disk every single time, resulting in atrocious performance. Instead
688 	 * we run it once in non-SYNC mode so that the ZIL gets all the data,
689 	 * and then we commit it all in one go.
690 	 */
691 	wbc->sync_mode = WB_SYNC_NONE;
692 	result = write_cache_pages(mapping, wbc, zpl_putpage, mapping);
693 	if (sync_mode != wbc->sync_mode) {
694 		ZFS_ENTER(zfsvfs);
695 		ZFS_VERIFY_ZP(zp);
696 		if (zfsvfs->z_log != NULL)
697 			zil_commit(zfsvfs->z_log, zp->z_id);
698 		ZFS_EXIT(zfsvfs);
699 
700 		/*
701 		 * We need to call write_cache_pages() again (we can't just
702 		 * return after the commit) because the previous call in
703 		 * non-SYNC mode does not guarantee that we got all the dirty
704 		 * pages (see the implementation of write_cache_pages() for
705 		 * details). That being said, this is a no-op in most cases.
706 		 */
707 		wbc->sync_mode = sync_mode;
708 		result = write_cache_pages(mapping, wbc, zpl_putpage, mapping);
709 	}
710 	return (result);
711 }
712 
713 /*
714  * Write out dirty pages to the ARC, this function is only required to
715  * support mmap(2).  Mapped pages may be dirtied by memory operations
716  * which never call .write().  These dirty pages are kept in sync with
717  * the ARC buffers via this hook.
718  */
719 static int
720 zpl_writepage(struct page *pp, struct writeback_control *wbc)
721 {
722 	if (ITOZSB(pp->mapping->host)->z_os->os_sync == ZFS_SYNC_ALWAYS)
723 		wbc->sync_mode = WB_SYNC_ALL;
724 
725 	return (zpl_putpage(pp, wbc, pp->mapping));
726 }
727 
728 /*
729  * The flag combination which matches the behavior of zfs_space() is
730  * FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE.  The FALLOC_FL_PUNCH_HOLE
731  * flag was introduced in the 2.6.38 kernel.
732  *
733  * The original mode=0 (allocate space) behavior can be reasonably emulated
734  * by checking if enough space exists and creating a sparse file, as real
735  * persistent space reservation is not possible due to COW, snapshots, etc.
736  */
737 static long
738 zpl_fallocate_common(struct inode *ip, int mode, loff_t offset, loff_t len)
739 {
740 	cred_t *cr = CRED();
741 	loff_t olen;
742 	fstrans_cookie_t cookie;
743 	int error = 0;
744 
745 	if ((mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) != 0)
746 		return (-EOPNOTSUPP);
747 
748 	if (offset < 0 || len <= 0)
749 		return (-EINVAL);
750 
751 	spl_inode_lock(ip);
752 	olen = i_size_read(ip);
753 
754 	crhold(cr);
755 	cookie = spl_fstrans_mark();
756 	if (mode & FALLOC_FL_PUNCH_HOLE) {
757 		flock64_t bf;
758 
759 		if (offset > olen)
760 			goto out_unmark;
761 
762 		if (offset + len > olen)
763 			len = olen - offset;
764 		bf.l_type = F_WRLCK;
765 		bf.l_whence = SEEK_SET;
766 		bf.l_start = offset;
767 		bf.l_len = len;
768 		bf.l_pid = 0;
769 
770 		error = -zfs_space(ITOZ(ip), F_FREESP, &bf, O_RDWR, offset, cr);
771 	} else if ((mode & ~FALLOC_FL_KEEP_SIZE) == 0) {
772 		unsigned int percent = zfs_fallocate_reserve_percent;
773 		struct kstatfs statfs;
774 
775 		/* Legacy mode, disable fallocate compatibility. */
776 		if (percent == 0) {
777 			error = -EOPNOTSUPP;
778 			goto out_unmark;
779 		}
780 
781 		/*
782 		 * Use zfs_statvfs() instead of dmu_objset_space() since it
783 		 * also checks project quota limits, which are relevant here.
784 		 */
785 		error = zfs_statvfs(ip, &statfs);
786 		if (error)
787 			goto out_unmark;
788 
789 		/*
790 		 * Shrink available space a bit to account for overhead/races.
791 		 * We know the product previously fit into availbytes from
792 		 * dmu_objset_space(), so the smaller product will also fit.
793 		 */
794 		if (len > statfs.f_bavail * (statfs.f_bsize * 100 / percent)) {
795 			error = -ENOSPC;
796 			goto out_unmark;
797 		}
798 		if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > olen)
799 			error = zfs_freesp(ITOZ(ip), offset + len, 0, 0, FALSE);
800 	}
801 out_unmark:
802 	spl_fstrans_unmark(cookie);
803 	spl_inode_unlock(ip);
804 
805 	crfree(cr);
806 
807 	return (error);
808 }
809 
810 static long
811 zpl_fallocate(struct file *filp, int mode, loff_t offset, loff_t len)
812 {
813 	return zpl_fallocate_common(file_inode(filp),
814 	    mode, offset, len);
815 }
816 
/*
 * The kernel's user-visible / user-modifiable flag masks, each extended
 * with ZFS_PROJINHERIT_FL.
 */
#define	ZFS_FL_USER_VISIBLE	(FS_FL_USER_VISIBLE | ZFS_PROJINHERIT_FL)
#define	ZFS_FL_USER_MODIFIABLE	(FS_FL_USER_MODIFIABLE | ZFS_PROJINHERIT_FL)
819 
820 static uint32_t
821 __zpl_ioctl_getflags(struct inode *ip)
822 {
823 	uint64_t zfs_flags = ITOZ(ip)->z_pflags;
824 	uint32_t ioctl_flags = 0;
825 
826 	if (zfs_flags & ZFS_IMMUTABLE)
827 		ioctl_flags |= FS_IMMUTABLE_FL;
828 
829 	if (zfs_flags & ZFS_APPENDONLY)
830 		ioctl_flags |= FS_APPEND_FL;
831 
832 	if (zfs_flags & ZFS_NODUMP)
833 		ioctl_flags |= FS_NODUMP_FL;
834 
835 	if (zfs_flags & ZFS_PROJINHERIT)
836 		ioctl_flags |= ZFS_PROJINHERIT_FL;
837 
838 	return (ioctl_flags & ZFS_FL_USER_VISIBLE);
839 }
840 
841 /*
842  * Map zfs file z_pflags (xvattr_t) to linux file attributes. Only file
843  * attributes common to both Linux and Solaris are mapped.
844  */
845 static int
846 zpl_ioctl_getflags(struct file *filp, void __user *arg)
847 {
848 	uint32_t flags;
849 	int err;
850 
851 	flags = __zpl_ioctl_getflags(file_inode(filp));
852 	err = copy_to_user(arg, &flags, sizeof (flags));
853 
854 	return (err);
855 }
856 
/*
 * fchange() is a helper macro to detect if we have been asked to change a
 * flag. This is ugly, but the requirement that we do this is a consequence of
 * how the Linux file attribute interface was designed. Another consequence is
 * that concurrent modification of files suffers from a TOCTOU race. Neither
 * is something we can fix without modifying the kernel-userland interface,
 * which is outside of our jurisdiction.
 *
 * f0/b0 are a flag word and the bit to test in it, f1/b1 likewise for the
 * second word.  The macro evaluates to true when exactly one of the two
 * bits is set, i.e. when the two representations disagree.
 */

#define	fchange(f0, f1, b0, b1) (!((f0) & (b0)) != !((f1) & (b1)))
867 
868 static int
869 __zpl_ioctl_setflags(struct inode *ip, uint32_t ioctl_flags, xvattr_t *xva)
870 {
871 	uint64_t zfs_flags = ITOZ(ip)->z_pflags;
872 	xoptattr_t *xoap;
873 
874 	if (ioctl_flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NODUMP_FL |
875 	    ZFS_PROJINHERIT_FL))
876 		return (-EOPNOTSUPP);
877 
878 	if (ioctl_flags & ~ZFS_FL_USER_MODIFIABLE)
879 		return (-EACCES);
880 
881 	if ((fchange(ioctl_flags, zfs_flags, FS_IMMUTABLE_FL, ZFS_IMMUTABLE) ||
882 	    fchange(ioctl_flags, zfs_flags, FS_APPEND_FL, ZFS_APPENDONLY)) &&
883 	    !capable(CAP_LINUX_IMMUTABLE))
884 		return (-EACCES);
885 
886 	if (!inode_owner_or_capable(ip))
887 		return (-EACCES);
888 
889 	xva_init(xva);
890 	xoap = xva_getxoptattr(xva);
891 
892 	XVA_SET_REQ(xva, XAT_IMMUTABLE);
893 	if (ioctl_flags & FS_IMMUTABLE_FL)
894 		xoap->xoa_immutable = B_TRUE;
895 
896 	XVA_SET_REQ(xva, XAT_APPENDONLY);
897 	if (ioctl_flags & FS_APPEND_FL)
898 		xoap->xoa_appendonly = B_TRUE;
899 
900 	XVA_SET_REQ(xva, XAT_NODUMP);
901 	if (ioctl_flags & FS_NODUMP_FL)
902 		xoap->xoa_nodump = B_TRUE;
903 
904 	XVA_SET_REQ(xva, XAT_PROJINHERIT);
905 	if (ioctl_flags & ZFS_PROJINHERIT_FL)
906 		xoap->xoa_projinherit = B_TRUE;
907 
908 	return (0);
909 }
910 
911 static int
912 zpl_ioctl_setflags(struct file *filp, void __user *arg)
913 {
914 	struct inode *ip = file_inode(filp);
915 	uint32_t flags;
916 	cred_t *cr = CRED();
917 	xvattr_t xva;
918 	int err;
919 	fstrans_cookie_t cookie;
920 
921 	if (copy_from_user(&flags, arg, sizeof (flags)))
922 		return (-EFAULT);
923 
924 	err = __zpl_ioctl_setflags(ip, flags, &xva);
925 	if (err)
926 		return (err);
927 
928 	crhold(cr);
929 	cookie = spl_fstrans_mark();
930 	err = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr);
931 	spl_fstrans_unmark(cookie);
932 	crfree(cr);
933 
934 	return (err);
935 }
936 
937 static int
938 zpl_ioctl_getxattr(struct file *filp, void __user *arg)
939 {
940 	zfsxattr_t fsx = { 0 };
941 	struct inode *ip = file_inode(filp);
942 	int err;
943 
944 	fsx.fsx_xflags = __zpl_ioctl_getflags(ip);
945 	fsx.fsx_projid = ITOZ(ip)->z_projid;
946 	err = copy_to_user(arg, &fsx, sizeof (fsx));
947 
948 	return (err);
949 }
950 
951 static int
952 zpl_ioctl_setxattr(struct file *filp, void __user *arg)
953 {
954 	struct inode *ip = file_inode(filp);
955 	zfsxattr_t fsx;
956 	cred_t *cr = CRED();
957 	xvattr_t xva;
958 	xoptattr_t *xoap;
959 	int err;
960 	fstrans_cookie_t cookie;
961 
962 	if (copy_from_user(&fsx, arg, sizeof (fsx)))
963 		return (-EFAULT);
964 
965 	if (!zpl_is_valid_projid(fsx.fsx_projid))
966 		return (-EINVAL);
967 
968 	err = __zpl_ioctl_setflags(ip, fsx.fsx_xflags, &xva);
969 	if (err)
970 		return (err);
971 
972 	xoap = xva_getxoptattr(&xva);
973 	XVA_SET_REQ(&xva, XAT_PROJID);
974 	xoap->xoa_projid = fsx.fsx_projid;
975 
976 	crhold(cr);
977 	cookie = spl_fstrans_mark();
978 	err = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr);
979 	spl_fstrans_unmark(cookie);
980 	crfree(cr);
981 
982 	return (err);
983 }
984 
985 static long
986 zpl_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
987 {
988 	switch (cmd) {
989 	case FS_IOC_GETFLAGS:
990 		return (zpl_ioctl_getflags(filp, (void *)arg));
991 	case FS_IOC_SETFLAGS:
992 		return (zpl_ioctl_setflags(filp, (void *)arg));
993 	case ZFS_IOC_FSGETXATTR:
994 		return (zpl_ioctl_getxattr(filp, (void *)arg));
995 	case ZFS_IOC_FSSETXATTR:
996 		return (zpl_ioctl_setxattr(filp, (void *)arg));
997 	default:
998 		return (-ENOTTY);
999 	}
1000 }
1001 
1002 #ifdef CONFIG_COMPAT
1003 static long
1004 zpl_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
1005 {
1006 	switch (cmd) {
1007 	case FS_IOC32_GETFLAGS:
1008 		cmd = FS_IOC_GETFLAGS;
1009 		break;
1010 	case FS_IOC32_SETFLAGS:
1011 		cmd = FS_IOC_SETFLAGS;
1012 		break;
1013 	default:
1014 		return (-ENOTTY);
1015 	}
1016 	return (zpl_ioctl(filp, cmd, (unsigned long)compat_ptr(arg)));
1017 }
1018 #endif /* CONFIG_COMPAT */
1019 
1020 
/*
 * Address space operations table: wires the page-cache hooks defined in
 * this file (read, readahead, writeback and direct I/O) into the VFS.
 */
const struct address_space_operations zpl_address_space_operations = {
	.readpages	= zpl_readpages,
	.readpage	= zpl_readpage,
	.writepage	= zpl_writepage,
	.writepages	= zpl_writepages,
	.direct_IO	= zpl_direct_IO,
};
1028 
/*
 * File operations table.  The read/write entry points depend on which
 * interface the kernel provides: the iter hooks (optionally with
 * new_sync_read/new_sync_write) when HAVE_VFS_RW_ITERATE is defined,
 * otherwise the aio hooks with do_sync_read/do_sync_write.
 */
const struct file_operations zpl_file_operations = {
	.open		= zpl_open,
	.release	= zpl_release,
	.llseek		= zpl_llseek,
#ifdef HAVE_VFS_RW_ITERATE
#ifdef HAVE_NEW_SYNC_READ
	.read		= new_sync_read,
	.write		= new_sync_write,
#endif
	.read_iter	= zpl_iter_read,
	.write_iter	= zpl_iter_write,
#else
	.read		= do_sync_read,
	.write		= do_sync_write,
	.aio_read	= zpl_aio_read,
	.aio_write	= zpl_aio_write,
#endif
	.mmap		= zpl_mmap,
	.fsync		= zpl_fsync,
#ifdef HAVE_FILE_AIO_FSYNC
	.aio_fsync	= zpl_aio_fsync,
#endif
	.fallocate	= zpl_fallocate,
	.unlocked_ioctl	= zpl_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= zpl_compat_ioctl,
#endif
};
1057 
/*
 * Directory file operations table.  The iteration hook is registered
 * under whichever member the kernel supports: .iterate_shared, .iterate,
 * or the legacy .readdir.
 */
const struct file_operations zpl_dir_file_operations = {
	.llseek		= generic_file_llseek,
	.read		= generic_read_dir,
#if defined(HAVE_VFS_ITERATE_SHARED)
	.iterate_shared	= zpl_iterate,
#elif defined(HAVE_VFS_ITERATE)
	.iterate	= zpl_iterate,
#else
	.readdir	= zpl_readdir,
#endif
	.fsync		= zpl_fsync,
	.unlocked_ioctl = zpl_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl   = zpl_compat_ioctl,
#endif
};
1074 
/* BEGIN CSTYLED */
/* Expose zfs_fallocate_reserve_percent as a uint module parameter (0644). */
module_param(zfs_fallocate_reserve_percent, uint, 0644);
MODULE_PARM_DESC(zfs_fallocate_reserve_percent,
    "Percentage of length to use for the available capacity check");
/* END CSTYLED */
1080