1 /*-
2 * SPDX-License-Identifier: (BSD-2-Clause AND BSD-3-Clause)
3 *
4 * Copyright (c) 2002, 2003 Networks Associates Technology, Inc.
5 * All rights reserved.
6 *
7 * This software was developed for the FreeBSD Project by Marshall
8 * Kirk McKusick and Network Associates Laboratories, the Security
9 * Research Division of Network Associates, Inc. under DARPA/SPAWAR
10 * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
11 * research program
12 *
13 * Redistribution and use in source and binary forms, with or without
14 * modification, are permitted provided that the following conditions
15 * are met:
16 * 1. Redistributions of source code must retain the above copyright
17 * notice, this list of conditions and the following disclaimer.
18 * 2. Redistributions in binary form must reproduce the above copyright
19 * notice, this list of conditions and the following disclaimer in the
20 * documentation and/or other materials provided with the distribution.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 * Copyright (c) 1982, 1986, 1989, 1993
35 * The Regents of the University of California. All rights reserved.
36 *
37 * Redistribution and use in source and binary forms, with or without
38 * modification, are permitted provided that the following conditions
39 * are met:
40 * 1. Redistributions of source code must retain the above copyright
41 * notice, this list of conditions and the following disclaimer.
42 * 2. Redistributions in binary form must reproduce the above copyright
43 * notice, this list of conditions and the following disclaimer in the
44 * documentation and/or other materials provided with the distribution.
45 * 3. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 * from: $FreeBSD: .../ufs/ufs_readwrite.c,v 1.96 2002/08/12 09:22:11 phk ...
61 */
62
63 #include <sys/cdefs.h>
64 #include "opt_directio.h"
65 #include "opt_ffs.h"
66 #include "opt_ufs.h"
67
68 #include <sys/param.h>
69 #include <sys/bio.h>
70 #include <sys/systm.h>
71 #include <sys/buf.h>
72 #include <sys/conf.h>
73 #include <sys/extattr.h>
74 #include <sys/kernel.h>
75 #include <sys/limits.h>
76 #include <sys/malloc.h>
77 #include <sys/mount.h>
78 #include <sys/priv.h>
79 #include <sys/rwlock.h>
80 #include <sys/stat.h>
81 #include <sys/sysctl.h>
82 #include <sys/vmmeter.h>
83 #include <sys/vnode.h>
84
85 #include <vm/vm.h>
86 #include <vm/vm_param.h>
87 #include <vm/vm_extern.h>
88 #include <vm/vm_object.h>
89 #include <vm/vm_page.h>
90 #include <vm/vm_pager.h>
91 #include <vm/vnode_pager.h>
92
93 #include <ufs/ufs/extattr.h>
94 #include <ufs/ufs/quota.h>
95 #include <ufs/ufs/inode.h>
96 #include <ufs/ufs/ufs_extern.h>
97 #include <ufs/ufs/ufsmount.h>
98 #include <ufs/ufs/dir.h>
99 #ifdef UFS_DIRHASH
100 #include <ufs/ufs/dirhash.h>
101 #endif
102
103 #include <ufs/ffs/fs.h>
104 #include <ufs/ffs/ffs_extern.h>
105
106 #define ALIGNED_TO(ptr, s) \
107 (((uintptr_t)(ptr) & (_Alignof(s) - 1)) == 0)
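/*
 * Illustrative note: ALIGNED_TO() is used below to assert that an
 * extended attribute area may be walked as a sequence of struct extattr
 * records, e.g.:
 *
 *	KASSERT(ALIGNED_TO(ip->i_ea_area, struct extattr), ("unaligned"));
 */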
108
109 #ifdef DIRECTIO
110 extern int ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone);
111 #endif
112 static vop_fdatasync_t ffs_fdatasync;
113 static vop_fsync_t ffs_fsync;
114 static vop_getpages_t ffs_getpages;
115 static vop_getpages_async_t ffs_getpages_async;
116 static vop_lock1_t ffs_lock;
117 #ifdef INVARIANTS
118 static vop_unlock_t ffs_unlock_debug;
119 #endif
120 static vop_read_t ffs_read;
121 static vop_write_t ffs_write;
122 static int ffs_extread(struct vnode *vp, struct uio *uio, int ioflag);
123 static int ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag,
124 struct ucred *cred);
125 static vop_strategy_t ffsext_strategy;
126 static vop_closeextattr_t ffs_closeextattr;
127 static vop_deleteextattr_t ffs_deleteextattr;
128 static vop_getextattr_t ffs_getextattr;
129 static vop_listextattr_t ffs_listextattr;
130 static vop_openextattr_t ffs_openextattr;
131 static vop_setextattr_t ffs_setextattr;
132 static vop_vptofh_t ffs_vptofh;
133 static vop_vput_pair_t ffs_vput_pair;
134
135 vop_fplookup_vexec_t ufs_fplookup_vexec;
136
137 /* Global vfs data structures for ufs. */
138 struct vop_vector ffs_vnodeops1 = {
139 .vop_default = &ufs_vnodeops,
140 .vop_fsync = ffs_fsync,
141 .vop_fdatasync = ffs_fdatasync,
142 .vop_getpages = ffs_getpages,
143 .vop_getpages_async = ffs_getpages_async,
144 .vop_lock1 = ffs_lock,
145 #ifdef INVARIANTS
146 .vop_unlock = ffs_unlock_debug,
147 #endif
148 .vop_read = ffs_read,
149 .vop_reallocblks = ffs_reallocblks,
150 .vop_write = ffs_write,
151 .vop_vptofh = ffs_vptofh,
152 .vop_vput_pair = ffs_vput_pair,
153 .vop_fplookup_vexec = ufs_fplookup_vexec,
154 .vop_fplookup_symlink = VOP_EAGAIN,
155 };
156 VFS_VOP_VECTOR_REGISTER(ffs_vnodeops1);
157
158 struct vop_vector ffs_fifoops1 = {
159 .vop_default = &ufs_fifoops,
160 .vop_fsync = ffs_fsync,
161 .vop_fdatasync = ffs_fdatasync,
162 .vop_lock1 = ffs_lock,
163 #ifdef INVARIANTS
164 .vop_unlock = ffs_unlock_debug,
165 #endif
166 .vop_vptofh = ffs_vptofh,
167 .vop_fplookup_vexec = VOP_EAGAIN,
168 .vop_fplookup_symlink = VOP_EAGAIN,
169 };
170 VFS_VOP_VECTOR_REGISTER(ffs_fifoops1);
171
172 /* Global vfs data structures for ufs. */
173 struct vop_vector ffs_vnodeops2 = {
174 .vop_default = &ufs_vnodeops,
175 .vop_fsync = ffs_fsync,
176 .vop_fdatasync = ffs_fdatasync,
177 .vop_getpages = ffs_getpages,
178 .vop_getpages_async = ffs_getpages_async,
179 .vop_lock1 = ffs_lock,
180 #ifdef INVARIANTS
181 .vop_unlock = ffs_unlock_debug,
182 #endif
183 .vop_read = ffs_read,
184 .vop_reallocblks = ffs_reallocblks,
185 .vop_write = ffs_write,
186 .vop_closeextattr = ffs_closeextattr,
187 .vop_deleteextattr = ffs_deleteextattr,
188 .vop_getextattr = ffs_getextattr,
189 .vop_listextattr = ffs_listextattr,
190 .vop_openextattr = ffs_openextattr,
191 .vop_setextattr = ffs_setextattr,
192 .vop_vptofh = ffs_vptofh,
193 .vop_vput_pair = ffs_vput_pair,
194 .vop_fplookup_vexec = ufs_fplookup_vexec,
195 .vop_fplookup_symlink = VOP_EAGAIN,
196 };
197 VFS_VOP_VECTOR_REGISTER(ffs_vnodeops2);
198
199 struct vop_vector ffs_fifoops2 = {
200 .vop_default = &ufs_fifoops,
201 .vop_fsync = ffs_fsync,
202 .vop_fdatasync = ffs_fdatasync,
203 .vop_lock1 = ffs_lock,
204 #ifdef INVARIANTS
205 .vop_unlock = ffs_unlock_debug,
206 #endif
207 .vop_reallocblks = ffs_reallocblks,
208 .vop_strategy = ffsext_strategy,
209 .vop_closeextattr = ffs_closeextattr,
210 .vop_deleteextattr = ffs_deleteextattr,
211 .vop_getextattr = ffs_getextattr,
212 .vop_listextattr = ffs_listextattr,
213 .vop_openextattr = ffs_openextattr,
214 .vop_setextattr = ffs_setextattr,
215 .vop_vptofh = ffs_vptofh,
216 .vop_fplookup_vexec = VOP_EAGAIN,
217 .vop_fplookup_symlink = VOP_EAGAIN,
218 };
219 VFS_VOP_VECTOR_REGISTER(ffs_fifoops2);
220
221 /*
222 * Synch an open file.
223 */
224 /* ARGSUSED */
225 static int
226 ffs_fsync(struct vop_fsync_args *ap)
227 {
228 struct vnode *vp;
229 struct bufobj *bo;
230 int error;
231
232 vp = ap->a_vp;
233 bo = &vp->v_bufobj;
234 retry:
235 error = ffs_syncvnode(vp, ap->a_waitfor, 0);
236 if (error)
237 return (error);
238 if (ap->a_waitfor == MNT_WAIT && DOINGSOFTDEP(vp)) {
239 error = softdep_fsync(vp);
240 if (error)
241 return (error);
242
243 /*
244 * The softdep_fsync() function may drop vp lock,
245 * allowing for dirty buffers to reappear on the
246 * bo_dirty list. Recheck and resync as needed.
247 */
248 BO_LOCK(bo);
249 if ((vp->v_type == VREG || vp->v_type == VDIR) &&
250 (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)) {
251 BO_UNLOCK(bo);
252 goto retry;
253 }
254 BO_UNLOCK(bo);
255 }
256 if (ffs_fsfail_cleanup(VFSTOUFS(vp->v_mount), 0))
257 return (ENXIO);
258 return (0);
259 }
260
261 int
262 ffs_syncvnode(struct vnode *vp, int waitfor, int flags)
263 {
264 struct inode *ip;
265 struct bufobj *bo;
266 struct ufsmount *ump;
267 struct buf *bp, *nbp;
268 ufs_lbn_t lbn;
269 int error, passes, wflag;
270 bool still_dirty, unlocked, wait;
271
272 ip = VTOI(vp);
273 bo = &vp->v_bufobj;
274 ump = VFSTOUFS(vp->v_mount);
275 #ifdef WITNESS
276 wflag = IS_SNAPSHOT(ip) ? LK_NOWITNESS : 0;
277 #else
278 wflag = 0;
279 #endif
280
281 /*
282 * When doing MNT_WAIT we must first flush all dependencies
283 * on the inode.
284 */
285 if (DOINGSOFTDEP(vp) && waitfor == MNT_WAIT &&
286 (error = softdep_sync_metadata(vp)) != 0) {
287 if (ffs_fsfail_cleanup(ump, error))
288 error = 0;
289 return (error);
290 }
291
292 /*
293 * Flush all dirty buffers associated with a vnode.
294 */
295 error = 0;
296 passes = 0;
297 wait = false; /* Always do an async pass first. */
298 unlocked = false;
299 lbn = lblkno(ITOFS(ip), (ip->i_size + ITOFS(ip)->fs_bsize - 1));
300 BO_LOCK(bo);
301 loop:
302 TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs)
303 bp->b_vflags &= ~BV_SCANNED;
304 TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
305 /*
306 * Reasons to skip this buffer: it has already been considered
307 * on this pass, the buffer has dependencies that will cause
308 * it to be redirtied and it has not already been deferred,
309 * or it is already being written.
310 */
311 if ((bp->b_vflags & BV_SCANNED) != 0)
312 continue;
313 bp->b_vflags |= BV_SCANNED;
314 /*
315 * Flush indirects in order, if requested.
316 *
317 * Note that if only datasync is requested, we can
318 * skip indirect blocks when softupdates are not
319 * active. Otherwise we must flush them with data,
320 * since dependencies prevent data block writes.
321 */
322 if (waitfor == MNT_WAIT && bp->b_lblkno <= -UFS_NDADDR &&
323 (lbn_level(bp->b_lblkno) >= passes ||
324 ((flags & DATA_ONLY) != 0 && !DOINGSOFTDEP(vp))))
325 continue;
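/*
 * The check above relies on indirect blocks being addressed with
 * negative logical block numbers at or below -UFS_NDADDR; lbn_level()
 * is expected to map such an lbn to its depth of indirection, so, for
 * example, a single-indirect block is written on an earlier pass than
 * a double- or triple-indirect block.
 */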
326 if (bp->b_lblkno > lbn)
327 panic("ffs_syncvnode: syncing truncated data.");
328 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) == 0) {
329 BO_UNLOCK(bo);
330 } else if (wait) {
331 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL |
332 LK_INTERLOCK | wflag, BO_LOCKPTR(bo)) != 0) {
333 BO_LOCK(bo);
334 bp->b_vflags &= ~BV_SCANNED;
335 goto next_locked;
336 }
337 } else
338 continue;
339 if ((bp->b_flags & B_DELWRI) == 0)
340 panic("ffs_fsync: not dirty");
341 /*
342 * Check for dependencies and potentially complete them.
343 */
344 if (!LIST_EMPTY(&bp->b_dep) &&
345 (error = softdep_sync_buf(vp, bp,
346 wait ? MNT_WAIT : MNT_NOWAIT)) != 0) {
347 /*
348 * Lock order conflict, buffer was already unlocked,
349 * and vnode possibly unlocked.
350 */
351 if (error == ERELOOKUP) {
352 if (vp->v_data == NULL)
353 return (EBADF);
354 unlocked = true;
355 if (DOINGSOFTDEP(vp) && waitfor == MNT_WAIT &&
356 (error = softdep_sync_metadata(vp)) != 0) {
357 if (ffs_fsfail_cleanup(ump, error))
358 error = 0;
359 return (unlocked && error == 0 ?
360 ERELOOKUP : error);
361 }
362 /* Re-evaluate inode size */
363 lbn = lblkno(ITOFS(ip), (ip->i_size +
364 ITOFS(ip)->fs_bsize - 1));
365 goto next;
366 }
367 /* I/O error. */
368 if (error != EBUSY) {
369 BUF_UNLOCK(bp);
370 return (error);
371 }
372 /* If we deferred once, don't defer again. */
373 if ((bp->b_flags & B_DEFERRED) == 0) {
374 bp->b_flags |= B_DEFERRED;
375 BUF_UNLOCK(bp);
376 goto next;
377 }
378 }
379 if (wait) {
380 bremfree(bp);
381 error = bwrite(bp);
382 if (ffs_fsfail_cleanup(ump, error))
383 error = 0;
384 if (error != 0)
385 return (error);
386 } else if ((bp->b_flags & B_CLUSTEROK)) {
387 (void) vfs_bio_awrite(bp);
388 } else {
389 bremfree(bp);
390 (void) bawrite(bp);
391 }
392 next:
393 /*
394 * Since we may have slept during the I/O, we need
395 * to start from a known point.
396 */
397 BO_LOCK(bo);
398 next_locked:
399 nbp = TAILQ_FIRST(&bo->bo_dirty.bv_hd);
400 }
401 if (waitfor != MNT_WAIT) {
402 BO_UNLOCK(bo);
403 if ((flags & NO_INO_UPDT) != 0)
404 return (unlocked ? ERELOOKUP : 0);
405 error = ffs_update(vp, 0);
406 if (error == 0 && unlocked)
407 error = ERELOOKUP;
408 return (error);
409 }
410 /* Drain IO to see if we're done. */
411 bufobj_wwait(bo, 0, 0);
412 /*
413 * Block devices associated with filesystems may have new I/O
414 * requests posted for them even if the vnode is locked, so no
415 * amount of trying will get them clean. We make several passes
416 * as a best effort.
417 *
418 * Regular files may need multiple passes to flush all dependency
419 * work as it is possible that we must write once per indirect
420 * level, once for the leaf, and once for the inode and each of
421 * these will be done with one sync and one async pass.
422 */
423 if (bo->bo_dirty.bv_cnt > 0) {
424 if ((flags & DATA_ONLY) == 0) {
425 still_dirty = true;
426 } else {
427 /*
428 * For data-only sync, dirty indirect buffers
429 * are ignored.
430 */
431 still_dirty = false;
432 TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) {
433 if (bp->b_lblkno > -UFS_NDADDR) {
434 still_dirty = true;
435 break;
436 }
437 }
438 }
439
440 if (still_dirty) {
441 /* Write the inode after sync passes to flush deps. */
442 if (wait && DOINGSOFTDEP(vp) &&
443 (flags & NO_INO_UPDT) == 0) {
444 BO_UNLOCK(bo);
445 ffs_update(vp, 1);
446 BO_LOCK(bo);
447 }
448 /* switch between sync/async. */
449 wait = !wait;
450 if (wait || ++passes < UFS_NIADDR + 2)
451 goto loop;
452 }
453 }
454 BO_UNLOCK(bo);
455 error = 0;
456 if ((flags & DATA_ONLY) == 0) {
457 if ((flags & NO_INO_UPDT) == 0)
458 error = ffs_update(vp, 1);
459 if (DOINGSUJ(vp))
460 softdep_journal_fsync(VTOI(vp));
461 } else if ((ip->i_flags & (IN_SIZEMOD | IN_IBLKDATA)) != 0) {
462 error = ffs_update(vp, 1);
463 }
464 if (error == 0 && unlocked)
465 error = ERELOOKUP;
466 if (error == 0)
467 ip->i_flag &= ~IN_NEEDSYNC;
468 return (error);
469 }
470
471 static int
472 ffs_fdatasync(struct vop_fdatasync_args *ap)
473 {
474
475 return (ffs_syncvnode(ap->a_vp, MNT_WAIT, DATA_ONLY));
476 }
477
478 static int
479 ffs_lock(
480 struct vop_lock1_args /* {
481 struct vnode *a_vp;
482 int a_flags;
483 char *file;
484 int line;
485 } */ *ap)
486 {
487 #if !defined(NO_FFS_SNAPSHOT) || defined(DIAGNOSTIC)
488 struct vnode *vp = ap->a_vp;
489 #endif /* !NO_FFS_SNAPSHOT || DIAGNOSTIC */
490 #ifdef DIAGNOSTIC
491 struct inode *ip;
492 #endif /* DIAGNOSTIC */
493 int result;
494 #ifndef NO_FFS_SNAPSHOT
495 int flags;
496 struct lock *lkp;
497
498 /*
499 * Adaptive spinning mixed with SU leads to trouble. Use a giant hammer
500 * and only use it when LK_NODDLKTREAT is set. Currently this means it
501 * is only used during path lookup.
502 */
503 if ((ap->a_flags & LK_NODDLKTREAT) != 0)
504 ap->a_flags |= LK_ADAPTIVE;
505 switch (ap->a_flags & LK_TYPE_MASK) {
506 case LK_SHARED:
507 case LK_UPGRADE:
508 case LK_EXCLUSIVE:
509 flags = ap->a_flags;
510 for (;;) {
511 VNPASS(vp->v_holdcnt != 0, vp);
512 lkp = vp->v_vnlock;
513 result = lockmgr_lock_flags(lkp, flags,
514 &VI_MTX(vp)->lock_object, ap->a_file, ap->a_line);
515 if (lkp == vp->v_vnlock || result != 0)
516 break;
517 /*
518 * Apparent success, except that the vnode
519 * mutated between snapshot file vnode and
520 * regular file vnode while this process
521 * slept. The lock currently held is not the
522 * right lock. Release it, and try to get the
523 * new lock.
524 */
525 lockmgr_unlock(lkp);
526 if ((flags & (LK_INTERLOCK | LK_NOWAIT)) ==
527 (LK_INTERLOCK | LK_NOWAIT))
528 return (EBUSY);
529 if ((flags & LK_TYPE_MASK) == LK_UPGRADE)
530 flags = (flags & ~LK_TYPE_MASK) | LK_EXCLUSIVE;
531 flags &= ~LK_INTERLOCK;
532 }
533 #ifdef DIAGNOSTIC
534 switch (ap->a_flags & LK_TYPE_MASK) {
535 case LK_UPGRADE:
536 case LK_EXCLUSIVE:
537 if (result == 0 && vp->v_vnlock->lk_recurse == 0) {
538 ip = VTOI(vp);
539 if (ip != NULL)
540 ip->i_lock_gen++;
541 }
542 }
543 #endif /* DIAGNOSTIC */
544 break;
545 default:
546 #ifdef DIAGNOSTIC
547 if ((ap->a_flags & LK_TYPE_MASK) == LK_DOWNGRADE) {
548 ip = VTOI(vp);
549 if (ip != NULL)
550 ufs_unlock_tracker(ip);
551 }
552 #endif /* DIAGNOSTIC */
553 result = VOP_LOCK1_APV(&ufs_vnodeops, ap);
554 break;
555 }
556 #else /* NO_FFS_SNAPSHOT */
557 /*
558 * See above for an explanation.
559 */
560 if ((ap->a_flags & LK_NODDLKTREAT) != 0)
561 ap->a_flags |= LK_ADAPTIVE;
562 #ifdef DIAGNOSTIC
563 if ((ap->a_flags & LK_TYPE_MASK) == LK_DOWNGRADE) {
564 ip = VTOI(vp);
565 if (ip != NULL)
566 ufs_unlock_tracker(ip);
567 }
568 #endif /* DIAGNOSTIC */
569 result = VOP_LOCK1_APV(&ufs_vnodeops, ap);
570 #endif /* NO_FFS_SNAPSHOT */
571 #ifdef DIAGNOSTIC
572 switch (ap->a_flags & LK_TYPE_MASK) {
573 case LK_UPGRADE:
574 case LK_EXCLUSIVE:
575 if (result == 0 && vp->v_vnlock->lk_recurse == 0) {
576 ip = VTOI(vp);
577 if (ip != NULL)
578 ip->i_lock_gen++;
579 }
580 }
581 #endif /* DIAGNOSTIC */
582 return (result);
583 }
584
585 #ifdef INVARIANTS
586 static int
587 ffs_unlock_debug(struct vop_unlock_args *ap)
588 {
589 struct vnode *vp;
590 struct inode *ip;
591
592 vp = ap->a_vp;
593 ip = VTOI(vp);
594 if (ip->i_flag & UFS_INODE_FLAG_LAZY_MASK_ASSERTABLE) {
595 if ((vp->v_mflag & VMP_LAZYLIST) == 0) {
596 VI_LOCK(vp);
597 VNASSERT((vp->v_mflag & VMP_LAZYLIST), vp,
598 ("%s: modified vnode (%x) not on lazy list",
599 __func__, ip->i_flag));
600 VI_UNLOCK(vp);
601 }
602 }
603 KASSERT(vp->v_type != VDIR || vp->v_vnlock->lk_recurse != 0 ||
604 (ip->i_flag & IN_ENDOFF) == 0,
605 ("ufs dir vp %p ip %p flags %#x", vp, ip, ip->i_flag));
606 #ifdef DIAGNOSTIC
607 if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE && ip != NULL &&
608 vp->v_vnlock->lk_recurse == 0)
609 ufs_unlock_tracker(ip);
610 #endif
611 return (VOP_UNLOCK_APV(&ufs_vnodeops, ap));
612 }
613 #endif
614
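/*
 * Zero-fill a read that falls in a hole of a sparse file.  Instead of
 * reading an unallocated block from disk, copy out of the kernel's
 * pre-zeroed zero_region in chunks of at most ZERO_REGION_SIZE bytes
 * until xfersize is exhausted, updating the caller's residual size.
 */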
615 static int
616 ffs_read_hole(struct uio *uio, long xfersize, long *size)
617 {
618 ssize_t saved_resid, tlen;
619 int error;
620
621 while (xfersize > 0) {
622 tlen = min(xfersize, ZERO_REGION_SIZE);
623 saved_resid = uio->uio_resid;
624 error = vn_io_fault_uiomove(__DECONST(void *, zero_region),
625 tlen, uio);
626 if (error != 0)
627 return (error);
628 tlen = saved_resid - uio->uio_resid;
629 xfersize -= tlen;
630 *size -= tlen;
631 }
632 return (0);
633 }
634
635 /*
636 * Vnode op for reading.
637 */
638 static int
639 ffs_read(
640 struct vop_read_args /* {
641 struct vnode *a_vp;
642 struct uio *a_uio;
643 int a_ioflag;
644 struct ucred *a_cred;
645 } */ *ap)
646 {
647 struct vnode *vp;
648 struct inode *ip;
649 struct uio *uio;
650 struct fs *fs;
651 struct buf *bp;
652 ufs_lbn_t lbn, nextlbn;
653 off_t bytesinfile;
654 long size, xfersize, blkoffset;
655 ssize_t orig_resid;
656 int bflag, error, ioflag, seqcount;
657
658 vp = ap->a_vp;
659 uio = ap->a_uio;
660 ioflag = ap->a_ioflag;
661 if (ap->a_ioflag & IO_EXT)
662 #ifdef notyet
663 return (ffs_extread(vp, uio, ioflag));
664 #else
665 panic("ffs_read+IO_EXT");
666 #endif
667 #ifdef DIRECTIO
668 if ((ioflag & IO_DIRECT) != 0) {
669 int workdone;
670
671 error = ffs_rawread(vp, uio, &workdone);
672 if (error != 0 || workdone != 0)
673 return error;
674 }
675 #endif
676
677 seqcount = ap->a_ioflag >> IO_SEQSHIFT;
678 ip = VTOI(vp);
679
680 #ifdef INVARIANTS
681 if (uio->uio_rw != UIO_READ)
682 panic("ffs_read: mode");
683
684 if (vp->v_type == VLNK) {
685 if ((int)ip->i_size < VFSTOUFS(vp->v_mount)->um_maxsymlinklen)
686 panic("ffs_read: short symlink");
687 } else if (vp->v_type != VREG && vp->v_type != VDIR)
688 panic("ffs_read: type %d", vp->v_type);
689 #endif
690 orig_resid = uio->uio_resid;
691 KASSERT(orig_resid >= 0, ("ffs_read: uio->uio_resid < 0"));
692 if (orig_resid == 0)
693 return (0);
694 KASSERT(uio->uio_offset >= 0, ("ffs_read: uio->uio_offset < 0"));
695 fs = ITOFS(ip);
696 if (uio->uio_offset < ip->i_size &&
697 uio->uio_offset >= fs->fs_maxfilesize)
698 return (EOVERFLOW);
699
700 bflag = GB_UNMAPPED | (uio->uio_segflg == UIO_NOCOPY ? 0 : GB_NOSPARSE);
701 #ifdef WITNESS
702 bflag |= IS_SNAPSHOT(ip) ? GB_NOWITNESS : 0;
703 #endif
704 for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
705 if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0)
706 break;
707 lbn = lblkno(fs, uio->uio_offset);
708 nextlbn = lbn + 1;
709
710 /*
711 * Size of the buffer. The buffer representing the
712 * end of the file is rounded up to the size of
713 * the block type (fragment or full block,
714 * depending).
715 */
716 size = blksize(fs, ip, lbn);
717 blkoffset = blkoff(fs, uio->uio_offset);
718
719 /*
720 * The amount we want to transfer in this iteration is
721 * one FS block less the amount of the data before
722 * our startpoint (duh!)
723 */
724 xfersize = fs->fs_bsize - blkoffset;
725
726 /*
727 * But if we actually want less than the block,
728 * or the file doesn't have a whole block more of data,
729 * then use the lesser number.
730 */
731 if (uio->uio_resid < xfersize)
732 xfersize = uio->uio_resid;
733 if (bytesinfile < xfersize)
734 xfersize = bytesinfile;
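/*
 * Worked example (illustrative values): with an fs_bsize of 32768 and
 * a uio_offset of 40000, lblkno() yields lbn 1 and blkoff() yields a
 * blkoffset of 7232, so xfersize starts at 32768 - 7232 = 25536 and is
 * then clipped to the request size and to the bytes left in the file.
 */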
735
736 if (lblktosize(fs, nextlbn) >= ip->i_size) {
737 /*
738 * Don't do readahead if this is the end of the file.
739 */
740 error = bread_gb(vp, lbn, size, NOCRED, bflag, &bp);
741 } else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
742 /*
743 * Otherwise if we are allowed to cluster,
744 * grab as much as we can.
745 *
746 * XXX This may not be a win if we are not
747 * doing sequential access.
748 */
749 error = cluster_read(vp, ip->i_size, lbn,
750 size, NOCRED, blkoffset + uio->uio_resid,
751 seqcount, bflag, &bp);
752 } else if (seqcount > 1) {
753 /*
754 * If we are NOT allowed to cluster, then
755 * if we appear to be acting sequentially,
756 * fire off a request for a readahead
757 * as well as a read. Note that the 4th and 5th
758 * arguments point to arrays of the size specified in
759 * the 6th argument.
760 */
761 int nextsize = blksize(fs, ip, nextlbn);
762 error = breadn_flags(vp, lbn, lbn, size, &nextlbn,
763 &nextsize, 1, NOCRED, bflag, NULL, &bp);
764 } else {
765 /*
766 * Failing all of the above, just read what the
767 * user asked for. Interestingly, the same as
768 * the first option above.
769 */
770 error = bread_gb(vp, lbn, size, NOCRED, bflag, &bp);
771 }
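/*
 * The bread variants above were issued with GB_NOSPARSE, so the buffer
 * layer is expected to return EJUSTRETURN rather than instantiate a
 * buffer for an unallocated (sparse) block; such holes are satisfied
 * from the zero region by ffs_read_hole() below.
 */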
772 if (error == EJUSTRETURN) {
773 error = ffs_read_hole(uio, xfersize, &size);
774 if (error == 0)
775 continue;
776 }
777 if (error != 0) {
778 brelse(bp);
779 bp = NULL;
780 break;
781 }
782
783 /*
784 * We should only get non-zero b_resid when an I/O error
785 * has occurred, which should cause us to break above.
786 * However, if the short read did not cause an error,
787 * then we want to ensure that we do not uiomove bad
788 * or uninitialized data.
789 */
790 size -= bp->b_resid;
791 if (size < xfersize) {
792 if (size == 0)
793 break;
794 xfersize = size;
795 }
796
797 if (buf_mapped(bp)) {
798 error = vn_io_fault_uiomove((char *)bp->b_data +
799 blkoffset, (int)xfersize, uio);
800 } else {
801 error = vn_io_fault_pgmove(bp->b_pages,
802 blkoffset + (bp->b_offset & PAGE_MASK),
803 (int)xfersize, uio);
804 }
805 if (error)
806 break;
807
808 vfs_bio_brelse(bp, ioflag);
809 }
810
811 /*
812 * This can only happen in the case of an error
813 * because the loop above resets bp to NULL on each iteration
814 * and on normal completion has not set a new value into it,
815 * so it must have come from a 'break' statement.
816 */
817 if (bp != NULL)
818 vfs_bio_brelse(bp, ioflag);
819
820 if ((error == 0 || uio->uio_resid != orig_resid) &&
821 (vp->v_mount->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0)
822 UFS_INODE_SET_FLAG_SHARED(ip, IN_ACCESS);
823 return (error);
824 }
825
826 /*
827 * Vnode op for writing.
828 */
829 static int
830 ffs_write(
831 struct vop_write_args /* {
832 struct vnode *a_vp;
833 struct uio *a_uio;
834 int a_ioflag;
835 struct ucred *a_cred;
836 } */ *ap)
837 {
838 struct vnode *vp;
839 struct uio *uio;
840 struct inode *ip;
841 struct fs *fs;
842 struct buf *bp;
843 ufs_lbn_t lbn;
844 off_t osize;
845 ssize_t resid, r;
846 int seqcount;
847 int blkoffset, error, flags, ioflag, size, xfersize;
848
849 vp = ap->a_vp;
850 if (DOINGSUJ(vp))
851 softdep_prealloc(vp, MNT_WAIT);
852 if (vp->v_data == NULL)
853 return (EBADF);
854
855 uio = ap->a_uio;
856 ioflag = ap->a_ioflag;
857 if (ap->a_ioflag & IO_EXT)
858 #ifdef notyet
859 return (ffs_extwrite(vp, uio, ioflag, ap->a_cred));
860 #else
861 panic("ffs_write+IO_EXT");
862 #endif
863
864 seqcount = ap->a_ioflag >> IO_SEQSHIFT;
865 ip = VTOI(vp);
866
867 #ifdef INVARIANTS
868 if (uio->uio_rw != UIO_WRITE)
869 panic("ffs_write: mode");
870 #endif
871
872 switch (vp->v_type) {
873 case VREG:
874 if (ioflag & IO_APPEND)
875 uio->uio_offset = ip->i_size;
876 if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size)
877 return (EPERM);
878 /* FALLTHROUGH */
879 case VLNK:
880 break;
881 case VDIR:
882 panic("ffs_write: dir write");
883 break;
884 default:
885 panic("ffs_write: type %p %d (%d,%d)", vp, (int)vp->v_type,
886 (int)uio->uio_offset,
887 (int)uio->uio_resid
888 );
889 }
890
891 KASSERT(uio->uio_resid >= 0, ("ffs_write: uio->uio_resid < 0"));
892 KASSERT(uio->uio_offset >= 0, ("ffs_write: uio->uio_offset < 0"));
893 fs = ITOFS(ip);
894
895 /*
896 * Maybe this should be above the vnode op call, but so long as
897 * file servers have no limits, I don't think it matters.
898 */
899 error = vn_rlimit_fsizex(vp, uio, fs->fs_maxfilesize, &r,
900 uio->uio_td);
901 if (error != 0) {
902 vn_rlimit_fsizex_res(uio, r);
903 return (error);
904 }
905
906 resid = uio->uio_resid;
907 osize = ip->i_size;
908 if (seqcount > BA_SEQMAX)
909 flags = BA_SEQMAX << BA_SEQSHIFT;
910 else
911 flags = seqcount << BA_SEQSHIFT;
912 if (ioflag & IO_SYNC)
913 flags |= IO_SYNC;
914 flags |= BA_UNMAPPED;
915
916 for (error = 0; uio->uio_resid > 0;) {
917 lbn = lblkno(fs, uio->uio_offset);
918 blkoffset = blkoff(fs, uio->uio_offset);
919 xfersize = fs->fs_bsize - blkoffset;
920 if (uio->uio_resid < xfersize)
921 xfersize = uio->uio_resid;
922 if (uio->uio_offset + xfersize > ip->i_size)
923 vnode_pager_setsize(vp, uio->uio_offset + xfersize);
924
925 /*
926 * We must perform a read-before-write if the transfer size
927 * does not cover the entire buffer.
928 */
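/*
 * For instance, overwriting 512 bytes in the middle of a 32768-byte
 * block leaves the rest of the block's contents live, so BA_CLRBUF
 * asks UFS_BALLOC() to bring the existing data into the buffer (or
 * zero a newly allocated block) before the partial copy; a full-block
 * overwrite can skip that work.
 */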
929 if (fs->fs_bsize > xfersize)
930 flags |= BA_CLRBUF;
931 else
932 flags &= ~BA_CLRBUF;
933 /* XXX is uio->uio_offset the right thing here? */
934 error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
935 ap->a_cred, flags, &bp);
936 if (error != 0) {
937 vnode_pager_setsize(vp, ip->i_size);
938 break;
939 }
940 if ((ioflag & (IO_SYNC|IO_INVAL)) == (IO_SYNC|IO_INVAL))
941 bp->b_flags |= B_NOCACHE;
942
943 if (uio->uio_offset + xfersize > ip->i_size) {
944 ip->i_size = uio->uio_offset + xfersize;
945 DIP_SET(ip, i_size, ip->i_size);
946 UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE);
947 }
948
949 size = blksize(fs, ip, lbn) - bp->b_resid;
950 if (size < xfersize)
951 xfersize = size;
952
953 if (buf_mapped(bp)) {
954 error = vn_io_fault_uiomove((char *)bp->b_data +
955 blkoffset, (int)xfersize, uio);
956 } else {
957 error = vn_io_fault_pgmove(bp->b_pages,
958 blkoffset + (bp->b_offset & PAGE_MASK),
959 (int)xfersize, uio);
960 }
961 /*
962 * If the buffer is not already filled and we encounter an
963 * error while trying to fill it, we have to clear out any
964 * garbage data from the pages instantiated for the buffer.
965 * If we do not, a failed uiomove() during a write can leave
966 * the prior contents of the pages exposed to a userland mmap.
967 *
968 * Note that we need only clear buffers with a transfer size
969 * equal to the block size because buffers with a shorter
970 * transfer size were cleared above by the call to UFS_BALLOC()
971 * with the BA_CLRBUF flag set.
972 *
973 * If the source region for uiomove identically mmaps the
974 * buffer, uiomove() performed the NOP copy, and the buffer
975 * content remains valid because the page fault handler
976 * validated the pages.
977 */
978 if (error != 0 && (bp->b_flags & B_CACHE) == 0 &&
979 fs->fs_bsize == xfersize) {
980 if (error == EFAULT && LIST_EMPTY(&bp->b_dep)) {
981 bp->b_flags |= B_INVAL | B_RELBUF | B_NOCACHE;
982 brelse(bp);
983 break;
984 } else {
985 vfs_bio_clrbuf(bp);
986 }
987 }
988
989 vfs_bio_set_flags(bp, ioflag);
990
991 /*
992 * If IO_SYNC each buffer is written synchronously. Otherwise
993 * if we have a severe page deficiency write the buffer
994 * asynchronously. Otherwise try to cluster, and if that
995 * doesn't do it then either do an async write (if O_DIRECT),
996 * or a delayed write (if not).
997 */
998 if (ioflag & IO_SYNC) {
999 (void)bwrite(bp);
1000 } else if (vm_page_count_severe() ||
1001 buf_dirty_count_severe() ||
1002 (ioflag & IO_ASYNC)) {
1003 bp->b_flags |= B_CLUSTEROK;
1004 bawrite(bp);
1005 } else if (xfersize + blkoffset == fs->fs_bsize) {
1006 if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
1007 bp->b_flags |= B_CLUSTEROK;
1008 cluster_write(vp, &ip->i_clusterw, bp,
1009 ip->i_size, seqcount, GB_UNMAPPED);
1010 } else {
1011 bawrite(bp);
1012 }
1013 } else if (ioflag & IO_DIRECT) {
1014 bp->b_flags |= B_CLUSTEROK;
1015 bawrite(bp);
1016 } else {
1017 bp->b_flags |= B_CLUSTEROK;
1018 bdwrite(bp);
1019 }
1020 if (error || xfersize == 0)
1021 break;
1022 UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE);
1023 }
1024 /*
1025 * If we successfully wrote any data, and we are not the superuser
1026 * we clear the setuid and setgid bits as a precaution against
1027 * tampering.
1028 */
1029 if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid &&
1030 ap->a_cred) {
1031 if (priv_check_cred(ap->a_cred, PRIV_VFS_RETAINSUGID)) {
1032 vn_seqc_write_begin(vp);
1033 UFS_INODE_SET_MODE(ip, ip->i_mode & ~(ISUID | ISGID));
1034 DIP_SET(ip, i_mode, ip->i_mode);
1035 vn_seqc_write_end(vp);
1036 }
1037 }
1038 if (error) {
1039 if (ioflag & IO_UNIT) {
1040 (void)ffs_truncate(vp, osize,
1041 IO_NORMAL | (ioflag & IO_SYNC), ap->a_cred);
1042 uio->uio_offset -= resid - uio->uio_resid;
1043 uio->uio_resid = resid;
1044 }
1045 } else if (resid > uio->uio_resid && (ioflag & IO_SYNC)) {
1046 if (!(ioflag & IO_DATASYNC) ||
1047 (ip->i_flags & (IN_SIZEMOD | IN_IBLKDATA)))
1048 error = ffs_update(vp, 1);
1049 if (ffs_fsfail_cleanup(VFSTOUFS(vp->v_mount), error))
1050 error = ENXIO;
1051 }
1052 vn_rlimit_fsizex_res(uio, r);
1053 return (error);
1054 }
1055
1056 /*
1057 * Extended attribute area reading.
1058 */
1059 static int
1060 ffs_extread(struct vnode *vp, struct uio *uio, int ioflag)
1061 {
1062 struct inode *ip;
1063 struct ufs2_dinode *dp;
1064 struct fs *fs;
1065 struct buf *bp;
1066 ufs_lbn_t lbn, nextlbn;
1067 off_t bytesinfile;
1068 long size, xfersize, blkoffset;
1069 ssize_t orig_resid;
1070 int error;
1071
1072 ip = VTOI(vp);
1073 fs = ITOFS(ip);
1074 dp = ip->i_din2;
1075
1076 #ifdef INVARIANTS
1077 if (uio->uio_rw != UIO_READ || fs->fs_magic != FS_UFS2_MAGIC)
1078 panic("ffs_extread: mode");
1079
1080 #endif
1081 orig_resid = uio->uio_resid;
1082 KASSERT(orig_resid >= 0, ("ffs_extread: uio->uio_resid < 0"));
1083 if (orig_resid == 0)
1084 return (0);
1085 KASSERT(uio->uio_offset >= 0, ("ffs_extread: uio->uio_offset < 0"));
1086
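/*
 * The external attribute area is addressed with negative logical block
 * numbers: block N of the area is read as logical block -1 - N, which
 * the UFS bmap/strategy code is expected to resolve through the
 * inode's external block pointers rather than the regular data blocks.
 */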
1087 for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
1088 if ((bytesinfile = dp->di_extsize - uio->uio_offset) <= 0)
1089 break;
1090 lbn = lblkno(fs, uio->uio_offset);
1091 nextlbn = lbn + 1;
1092
1093 /*
1094 * Size of the buffer. The buffer representing the
1095 * end of the file is rounded up to the size of
1096 * the block type (fragment or full block,
1097 * depending).
1098 */
1099 size = sblksize(fs, dp->di_extsize, lbn);
1100 blkoffset = blkoff(fs, uio->uio_offset);
1101
1102 /*
1103 * The amount we want to transfer in this iteration is
1104 * one FS block less the amount of the data before
1105 * our startpoint (duh!)
1106 */
1107 xfersize = fs->fs_bsize - blkoffset;
1108
1109 /*
1110 * But if we actually want less than the block,
1111 * or the file doesn't have a whole block more of data,
1112 * then use the lesser number.
1113 */
1114 if (uio->uio_resid < xfersize)
1115 xfersize = uio->uio_resid;
1116 if (bytesinfile < xfersize)
1117 xfersize = bytesinfile;
1118
1119 if (lblktosize(fs, nextlbn) >= dp->di_extsize) {
1120 /*
1121 * Don't do readahead if this is the end of the info.
1122 */
1123 error = bread(vp, -1 - lbn, size, NOCRED, &bp);
1124 } else {
1125 /*
1126 * If we have a second block, then
1127 * fire off a request for a readahead
1128 * as well as a read. Note that the 4th and 5th
1129 * arguments point to arrays of the size specified in
1130 * the 6th argument.
1131 */
1132 int nextsize = sblksize(fs, dp->di_extsize, nextlbn);
1133 nextlbn = -1 - nextlbn;
1134 error = breadn(vp, -1 - lbn,
1135 size, &nextlbn, &nextsize, 1, NOCRED, &bp);
1136 }
1137 if (error) {
1138 brelse(bp);
1139 bp = NULL;
1140 break;
1141 }
1142
1143 /*
1144 * We should only get non-zero b_resid when an I/O error
1145 * has occurred, which should cause us to break above.
1146 * However, if the short read did not cause an error,
1147 * then we want to ensure that we do not uiomove bad
1148 * or uninitialized data.
1149 */
1150 size -= bp->b_resid;
1151 if (size < xfersize) {
1152 if (size == 0)
1153 break;
1154 xfersize = size;
1155 }
1156
1157 error = uiomove((char *)bp->b_data + blkoffset,
1158 (int)xfersize, uio);
1159 if (error)
1160 break;
1161 vfs_bio_brelse(bp, ioflag);
1162 }
1163
1164 /*
1165 * This can only happen in the case of an error
1166 * because the loop above resets bp to NULL on each iteration
1167 * and on normal completion has not set a new value into it,
1168 * so it must have come from a 'break' statement.
1169 */
1170 if (bp != NULL)
1171 vfs_bio_brelse(bp, ioflag);
1172 return (error);
1173 }
1174
1175 /*
1176 * Extended attribute area writing.
1177 */
1178 static int
1179 ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *ucred)
1180 {
1181 struct inode *ip;
1182 struct ufs2_dinode *dp;
1183 struct fs *fs;
1184 struct buf *bp;
1185 ufs_lbn_t lbn;
1186 off_t osize;
1187 ssize_t resid;
1188 int blkoffset, error, flags, size, xfersize;
1189
1190 ip = VTOI(vp);
1191 fs = ITOFS(ip);
1192 dp = ip->i_din2;
1193
1194 #ifdef INVARIANTS
1195 if (uio->uio_rw != UIO_WRITE || fs->fs_magic != FS_UFS2_MAGIC)
1196 panic("ffs_extwrite: mode");
1197 #endif
1198
1199 if (ioflag & IO_APPEND)
1200 uio->uio_offset = dp->di_extsize;
1201 KASSERT(uio->uio_offset >= 0, ("ffs_extwrite: uio->uio_offset < 0"));
1202 KASSERT(uio->uio_resid >= 0, ("ffs_extwrite: uio->uio_resid < 0"));
1203 if ((uoff_t)uio->uio_offset + uio->uio_resid >
1204 UFS_NXADDR * fs->fs_bsize)
1205 return (EFBIG);
1206
1207 resid = uio->uio_resid;
1208 osize = dp->di_extsize;
1209 flags = IO_EXT;
1210 if (ioflag & IO_SYNC)
1211 flags |= IO_SYNC;
1212
1213 for (error = 0; uio->uio_resid > 0;) {
1214 lbn = lblkno(fs, uio->uio_offset);
1215 blkoffset = blkoff(fs, uio->uio_offset);
1216 xfersize = fs->fs_bsize - blkoffset;
1217 if (uio->uio_resid < xfersize)
1218 xfersize = uio->uio_resid;
1219
1220 /*
1221 * We must perform a read-before-write if the transfer size
1222 * does not cover the entire buffer.
1223 */
1224 if (fs->fs_bsize > xfersize)
1225 flags |= BA_CLRBUF;
1226 else
1227 flags &= ~BA_CLRBUF;
1228 error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
1229 ucred, flags, &bp);
1230 if (error != 0)
1231 break;
1232 /*
1233 * If the buffer is not valid we have to clear out any
1234 * garbage data from the pages instantiated for the buffer.
1235 * If we do not, a failed uiomove() during a write can leave
1236 * the prior contents of the pages exposed to a userland
1237 * mmap(). XXX deal with uiomove() errors a better way.
1238 */
1239 if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
1240 vfs_bio_clrbuf(bp);
1241
1242 if (uio->uio_offset + xfersize > dp->di_extsize) {
1243 dp->di_extsize = uio->uio_offset + xfersize;
1244 UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE);
1245 }
1246
1247 size = sblksize(fs, dp->di_extsize, lbn) - bp->b_resid;
1248 if (size < xfersize)
1249 xfersize = size;
1250
1251 error =
1252 uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
1253
1254 vfs_bio_set_flags(bp, ioflag);
1255
1256 /*
1257 * If IO_SYNC each buffer is written synchronously. Otherwise
1258 * if we have a severe page deficiency write the buffer
1259 * asynchronously. Otherwise try to cluster, and if that
1260 * doesn't do it then either do an async write (if O_DIRECT),
1261 * or a delayed write (if not).
1262 */
1263 if (ioflag & IO_SYNC) {
1264 (void)bwrite(bp);
1265 } else if (vm_page_count_severe() ||
1266 buf_dirty_count_severe() ||
1267 xfersize + blkoffset == fs->fs_bsize ||
1268 (ioflag & (IO_ASYNC | IO_DIRECT)))
1269 bawrite(bp);
1270 else
1271 bdwrite(bp);
1272 if (error || xfersize == 0)
1273 break;
1274 UFS_INODE_SET_FLAG(ip, IN_CHANGE);
1275 }
1276 /*
1277 * If we successfully wrote any data, and we are not the superuser
1278 * we clear the setuid and setgid bits as a precaution against
1279 * tampering.
1280 */
1281 if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid && ucred) {
1282 if (priv_check_cred(ucred, PRIV_VFS_RETAINSUGID)) {
1283 vn_seqc_write_begin(vp);
1284 UFS_INODE_SET_MODE(ip, ip->i_mode & ~(ISUID | ISGID));
1285 dp->di_mode = ip->i_mode;
1286 vn_seqc_write_end(vp);
1287 }
1288 }
1289 if (error) {
1290 if (ioflag & IO_UNIT) {
1291 (void)ffs_truncate(vp, osize,
1292 IO_EXT | (ioflag&IO_SYNC), ucred);
1293 uio->uio_offset -= resid - uio->uio_resid;
1294 uio->uio_resid = resid;
1295 }
1296 } else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
1297 error = ffs_update(vp, 1);
1298 return (error);
1299 }
1300
1301 /*
1302 * Helper to locate a named extended attribute.
1303 *
1304 * Locate a particular EA (nspace:name) in the area (ptr:length), and return
1305 * the length of the EA, and possibly the pointer to the entry and to the data.
1306 */
1307 static int
1308 ffs_findextattr(uint8_t *ptr, uint64_t length, int nspace, const char *name,
1309 struct extattr **eapp, uint8_t **eac)
1310 {
1311 struct extattr *eap, *eaend;
1312 size_t nlen;
1313
1314 nlen = strlen(name);
1315 KASSERT(ALIGNED_TO(ptr, struct extattr), ("unaligned"));
1316 eap = (struct extattr *)ptr;
1317 eaend = (struct extattr *)(ptr + length);
1318 for (; eap < eaend; eap = EXTATTR_NEXT(eap)) {
1319 KASSERT(EXTATTR_NEXT(eap) <= eaend,
1320 ("extattr next %p beyond %p", EXTATTR_NEXT(eap), eaend));
1321 if (eap->ea_namespace != nspace || eap->ea_namelength != nlen
1322 || memcmp(eap->ea_name, name, nlen) != 0)
1323 continue;
1324 if (eapp != NULL)
1325 *eapp = eap;
1326 if (eac != NULL)
1327 *eac = EXTATTR_CONTENT(eap);
1328 return (EXTATTR_CONTENT_SIZE(eap));
1329 }
1330 return (-1);
1331 }
1332
1333 static int
1334 ffs_rdextattr(uint8_t **p, struct vnode *vp, struct thread *td)
1335 {
1336 const struct extattr *eap, *eaend, *eapnext;
1337 struct inode *ip;
1338 struct ufs2_dinode *dp;
1339 struct fs *fs;
1340 struct uio luio;
1341 struct iovec liovec;
1342 uint64_t easize;
1343 int error;
1344 uint8_t *eae;
1345
1346 ip = VTOI(vp);
1347 fs = ITOFS(ip);
1348 dp = ip->i_din2;
1349 easize = dp->di_extsize;
1350 if ((uoff_t)easize > UFS_NXADDR * fs->fs_bsize)
1351 return (EFBIG);
1352
1353 eae = malloc(easize, M_TEMP, M_WAITOK);
1354
1355 liovec.iov_base = eae;
1356 liovec.iov_len = easize;
1357 luio.uio_iov = &liovec;
1358 luio.uio_iovcnt = 1;
1359 luio.uio_offset = 0;
1360 luio.uio_resid = easize;
1361 luio.uio_segflg = UIO_SYSSPACE;
1362 luio.uio_rw = UIO_READ;
1363 luio.uio_td = td;
1364
1365 error = ffs_extread(vp, &luio, IO_EXT | IO_SYNC);
1366 if (error) {
1367 free(eae, M_TEMP);
1368 return (error);
1369 }
1370 /* Validate disk xattrfile contents. */
1371 for (eap = (void *)eae, eaend = (void *)(eae + easize); eap < eaend;
1372 eap = eapnext) {
1373 /* Detect zeroed out tail */
1374 if (eap->ea_length < sizeof(*eap) || eap->ea_length == 0) {
1375 easize = (const uint8_t *)eap - eae;
1376 break;
1377 }
1378
1379 eapnext = EXTATTR_NEXT(eap);
1380 /* Bogusly long entry. */
1381 if (eapnext > eaend) {
1382 free(eae, M_TEMP);
1383 return (EINTEGRITY);
1384 }
1385 }
1386 ip->i_ea_len = easize;
1387 *p = eae;
1388 return (0);
1389 }
1390
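/*
 * Serialize access to the in-memory extended attribute area.  This is
 * a simple flag-based sleep lock: IN_EA_LOCKED marks the area busy,
 * IN_EA_LOCKWAIT records that a waiter is sleeping on i_ea_refs, and
 * ffs_unlock_ea() clears both and wakes any waiters.
 */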
1391 static void
1392 ffs_lock_ea(struct vnode *vp)
1393 {
1394 struct inode *ip;
1395
1396 ip = VTOI(vp);
1397 VI_LOCK(vp);
1398 while (ip->i_flag & IN_EA_LOCKED) {
1399 UFS_INODE_SET_FLAG(ip, IN_EA_LOCKWAIT);
1400 msleep(&ip->i_ea_refs, &vp->v_interlock, PINOD, "ufs_ea", 0);
1401 }
1402 UFS_INODE_SET_FLAG(ip, IN_EA_LOCKED);
1403 VI_UNLOCK(vp);
1404 }
1405
1406 static void
1407 ffs_unlock_ea(struct vnode *vp)
1408 {
1409 struct inode *ip;
1410
1411 ip = VTOI(vp);
1412 VI_LOCK(vp);
1413 if (ip->i_flag & IN_EA_LOCKWAIT)
1414 wakeup(&ip->i_ea_refs);
1415 ip->i_flag &= ~(IN_EA_LOCKED | IN_EA_LOCKWAIT);
1416 VI_UNLOCK(vp);
1417 }
1418
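/*
 * Begin an extended attribute transaction: read the on-disk EA area
 * into ip->i_ea_area (unless a copy is already cached) and take a
 * reference on it.  The matching ffs_close_ea() drops the reference
 * and, on the last close with commit requested, writes the area back.
 */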
1419 static int
1420 ffs_open_ea(struct vnode *vp, struct ucred *cred, struct thread *td)
1421 {
1422 struct inode *ip;
1423 int error;
1424
1425 ip = VTOI(vp);
1426
1427 ffs_lock_ea(vp);
1428 if (ip->i_ea_area != NULL) {
1429 ip->i_ea_refs++;
1430 ffs_unlock_ea(vp);
1431 return (0);
1432 }
1433 error = ffs_rdextattr(&ip->i_ea_area, vp, td);
1434 if (error) {
1435 ffs_unlock_ea(vp);
1436 return (error);
1437 }
1438 ip->i_ea_error = 0;
1439 ip->i_ea_refs++;
1440 ffs_unlock_ea(vp);
1441 return (0);
1442 }
1443
1444 /*
1445 * Vnode extattr transaction commit/abort
1446 */
1447 static int
1448 ffs_close_ea(struct vnode *vp, int commit, struct ucred *cred, struct thread *td)
1449 {
1450 struct inode *ip;
1451 struct uio luio;
1452 struct iovec *liovec;
1453 struct ufs2_dinode *dp;
1454 size_t ea_len, tlen;
1455 int error, i, lcnt;
1456 bool truncate;
1457
1458 ip = VTOI(vp);
1459
1460 ffs_lock_ea(vp);
1461 if (ip->i_ea_area == NULL) {
1462 ffs_unlock_ea(vp);
1463 return (EINVAL);
1464 }
1465 dp = ip->i_din2;
1466 error = ip->i_ea_error;
1467 truncate = false;
1468 if (commit && error == 0) {
1469 ASSERT_VOP_ELOCKED(vp, "ffs_close_ea commit");
1470 if (cred == NOCRED)
1471 cred = vp->v_mount->mnt_cred;
1472
1473 ea_len = MAX(ip->i_ea_len, dp->di_extsize);
1474 for (lcnt = 1, tlen = ea_len - ip->i_ea_len; tlen > 0;) {
1475 tlen -= MIN(ZERO_REGION_SIZE, tlen);
1476 lcnt++;
1477 }
1478
1479 liovec = __builtin_alloca(lcnt * sizeof(struct iovec));
1480 luio.uio_iovcnt = lcnt;
1481
1482 liovec[0].iov_base = ip->i_ea_area;
1483 liovec[0].iov_len = ip->i_ea_len;
1484 for (i = 1, tlen = ea_len - ip->i_ea_len; i < lcnt; i++) {
1485 liovec[i].iov_base = __DECONST(void *, zero_region);
1486 liovec[i].iov_len = MIN(ZERO_REGION_SIZE, tlen);
1487 tlen -= liovec[i].iov_len;
1488 }
1489 MPASS(tlen == 0);
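/*
 * Illustrative point of the iovec setup above: the commit write always
 * covers max(new length, old di_extsize), so a shrinking EA area has
 * its stale tail overwritten with zeros drawn from zero_region in
 * chunks of at most ZERO_REGION_SIZE bytes per iovec.
 */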
1490
1491 luio.uio_iov = liovec;
1492 luio.uio_offset = 0;
1493 luio.uio_resid = ea_len;
1494 luio.uio_segflg = UIO_SYSSPACE;
1495 luio.uio_rw = UIO_WRITE;
1496 luio.uio_td = td;
1497 error = ffs_extwrite(vp, &luio, IO_EXT | IO_SYNC, cred);
1498 if (error == 0 && ip->i_ea_len == 0)
1499 truncate = true;
1500 }
1501 if (--ip->i_ea_refs == 0) {
1502 free(ip->i_ea_area, M_TEMP);
1503 ip->i_ea_area = NULL;
1504 ip->i_ea_len = 0;
1505 ip->i_ea_error = 0;
1506 }
1507 ffs_unlock_ea(vp);
1508
1509 if (truncate)
1510 ffs_truncate(vp, 0, IO_EXT, cred);
1511 return (error);
1512 }
1513
1514 /*
1515 * Vnode extattr strategy routine for fifos.
1516 *
1517 * We need to check for a read or write of the external attributes.
1518 * Otherwise we just fall through and do the usual thing.
1519 */
1520 static int
1521 ffsext_strategy(
1522 struct vop_strategy_args /* {
1523 struct vnodeop_desc *a_desc;
1524 struct vnode *a_vp;
1525 struct buf *a_bp;
1526 } */ *ap)
1527 {
1528 struct vnode *vp;
1529 daddr_t lbn;
1530
1531 vp = ap->a_vp;
1532 lbn = ap->a_bp->b_lblkno;
1533 if (I_IS_UFS2(VTOI(vp)) && lbn < 0 && lbn >= -UFS_NXADDR)
1534 return (VOP_STRATEGY_APV(&ufs_vnodeops, ap));
1535 if (vp->v_type == VFIFO)
1536 return (VOP_STRATEGY_APV(&ufs_fifoops, ap));
1537 panic("spec nodes went here");
1538 }
1539
1540 /*
1541 * Vnode operation to open an extended attribute transaction.
1542 */
1543 static int
1544 ffs_openextattr(
1545 struct vop_openextattr_args /* {
1546 struct vnodeop_desc *a_desc;
1547 struct vnode *a_vp;
1548 IN struct ucred *a_cred;
1549 IN struct thread *a_td;
1550 } */ *ap)
1551 {
1552
1553 if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
1554 return (EOPNOTSUPP);
1555
1556 return (ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td));
1557 }
1558
1559 /*
1560 * Vnode extattr transaction commit/abort
1561 */
1562 static int
1563 ffs_closeextattr(
1564 struct vop_closeextattr_args /* {
1565 struct vnodeop_desc *a_desc;
1566 struct vnode *a_vp;
1567 int a_commit;
1568 IN struct ucred *a_cred;
1569 IN struct thread *a_td;
1570 } */ *ap)
1571 {
1572 struct vnode *vp;
1573
1574 vp = ap->a_vp;
1575 if (vp->v_type == VCHR || vp->v_type == VBLK)
1576 return (EOPNOTSUPP);
1577 if (ap->a_commit && (vp->v_mount->mnt_flag & MNT_RDONLY) != 0)
1578 return (EROFS);
1579
1580 if (ap->a_commit && DOINGSUJ(vp)) {
1581 ASSERT_VOP_ELOCKED(vp, "ffs_closeextattr commit");
1582 softdep_prealloc(vp, MNT_WAIT);
1583 if (vp->v_data == NULL)
1584 return (EBADF);
1585 }
1586 return (ffs_close_ea(vp, ap->a_commit, ap->a_cred, ap->a_td));
1587 }
1588
1589 /*
1590 * Vnode operation to remove a named attribute.
1591 */
1592 static int
1593 ffs_deleteextattr(
1594 struct vop_deleteextattr_args /* {
1595 IN struct vnode *a_vp;
1596 IN int a_attrnamespace;
1597 IN const char *a_name;
1598 IN struct ucred *a_cred;
1599 IN struct thread *a_td;
1600 } */ *ap)
1601 {
1602 struct vnode *vp;
1603 struct inode *ip;
1604 struct extattr *eap;
1605 uint32_t ul;
1606 int olen, error, i, easize;
1607 uint8_t *eae;
1608 void *tmp;
1609
1610 vp = ap->a_vp;
1611 ip = VTOI(vp);
1612
1613 if (vp->v_type == VCHR || vp->v_type == VBLK)
1614 return (EOPNOTSUPP);
1615 if (strlen(ap->a_name) == 0)
1616 return (EINVAL);
1617 if (vp->v_mount->mnt_flag & MNT_RDONLY)
1618 return (EROFS);
1619
1620 error = extattr_check_cred(vp, ap->a_attrnamespace,
1621 ap->a_cred, ap->a_td, VWRITE);
1622 if (error) {
1623 /*
1624 * ffs_lock_ea is not needed there, because the vnode
1625 * must be exclusively locked.
1626 */
1627 if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
1628 ip->i_ea_error = error;
1629 return (error);
1630 }
1631
1632 if (DOINGSUJ(vp)) {
1633 ASSERT_VOP_ELOCKED(vp, "ffs_deleteextattr");
1634 softdep_prealloc(vp, MNT_WAIT);
1635 if (vp->v_data == NULL)
1636 return (EBADF);
1637 }
1638
1639 error = ffs_open_ea(vp, ap->a_cred, ap->a_td);
1640 if (error)
1641 return (error);
1642
1643 /* CEM: delete could be done in-place instead */
1644 eae = malloc(ip->i_ea_len, M_TEMP, M_WAITOK);
1645 bcopy(ip->i_ea_area, eae, ip->i_ea_len);
1646 easize = ip->i_ea_len;
1647
1648 olen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
1649 &eap, NULL);
1650 if (olen == -1) {
1651 /* delete but nonexistent */
1652 free(eae, M_TEMP);
1653 ffs_close_ea(vp, 0, ap->a_cred, ap->a_td);
1654 return (ENOATTR);
1655 }
1656 ul = eap->ea_length;
1657 i = (uint8_t *)EXTATTR_NEXT(eap) - eae;
1658 bcopy(EXTATTR_NEXT(eap), eap, easize - i);
1659 easize -= ul;
1660
1661 tmp = ip->i_ea_area;
1662 ip->i_ea_area = eae;
1663 ip->i_ea_len = easize;
1664 free(tmp, M_TEMP);
1665 error = ffs_close_ea(vp, 1, ap->a_cred, ap->a_td);
1666 return (error);
1667 }
1668
1669 /*
1670 * Vnode operation to retrieve a named extended attribute.
1671 */
1672 static int
1673 ffs_getextattr(
1674 struct vop_getextattr_args /* {
1675 IN struct vnode *a_vp;
1676 IN int a_attrnamespace;
1677 IN const char *a_name;
1678 INOUT struct uio *a_uio;
1679 OUT size_t *a_size;
1680 IN struct ucred *a_cred;
1681 IN struct thread *a_td;
1682 } */ *ap)
1683 {
1684 struct inode *ip;
1685 uint8_t *eae, *p;
1686 unsigned easize;
1687 int error, ealen;
1688
1689 ip = VTOI(ap->a_vp);
1690
1691 if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
1692 return (EOPNOTSUPP);
1693
1694 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
1695 ap->a_cred, ap->a_td, VREAD);
1696 if (error)
1697 return (error);
1698
1699 error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
1700 if (error)
1701 return (error);
1702
1703 eae = ip->i_ea_area;
1704 easize = ip->i_ea_len;
1705
1706 ealen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
1707 NULL, &p);
1708 if (ealen >= 0) {
1709 error = 0;
1710 if (ap->a_size != NULL)
1711 *ap->a_size = ealen;
1712 else if (ap->a_uio != NULL)
1713 error = uiomove(p, ealen, ap->a_uio);
1714 } else
1715 error = ENOATTR;
1716
1717 ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1718 return (error);
1719 }
1720
1721 /*
1722 * Vnode operation to retrieve extended attributes on a vnode.
1723 */
1724 static int
1725 ffs_listextattr(
1726 struct vop_listextattr_args /* {
1727 IN struct vnode *a_vp;
1728 IN int a_attrnamespace;
1729 INOUT struct uio *a_uio;
1730 OUT size_t *a_size;
1731 IN struct ucred *a_cred;
1732 IN struct thread *a_td;
1733 } */ *ap)
1734 {
1735 struct inode *ip;
1736 struct extattr *eap, *eaend;
1737 int error, ealen;
1738
1739 ip = VTOI(ap->a_vp);
1740
1741 if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
1742 return (EOPNOTSUPP);
1743
1744 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
1745 ap->a_cred, ap->a_td, VREAD);
1746 if (error)
1747 return (error);
1748
1749 error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
1750 if (error)
1751 return (error);
1752
1753 error = 0;
1754 if (ap->a_size != NULL)
1755 *ap->a_size = 0;
1756
1757 KASSERT(ALIGNED_TO(ip->i_ea_area, struct extattr), ("unaligned"));
1758 eap = (struct extattr *)ip->i_ea_area;
1759 eaend = (struct extattr *)(ip->i_ea_area + ip->i_ea_len);
1760 for (; error == 0 && eap < eaend; eap = EXTATTR_NEXT(eap)) {
1761 KASSERT(EXTATTR_NEXT(eap) <= eaend,
1762 ("extattr next %p beyond %p", EXTATTR_NEXT(eap), eaend));
1763 if (eap->ea_namespace != ap->a_attrnamespace)
1764 continue;
1765
1766 ealen = eap->ea_namelength;
1767 if (ap->a_size != NULL)
1768 *ap->a_size += ealen + 1;
1769 else if (ap->a_uio != NULL)
1770 error = uiomove(&eap->ea_namelength, ealen + 1,
1771 ap->a_uio);
1772 }
1773
1774 ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1775 return (error);
1776 }
1777
1778 /*
1779 * Vnode operation to set a named attribute.
1780 */
1781 static int
1782 ffs_setextattr(
1783 struct vop_setextattr_args /* {
1784 IN struct vnode *a_vp;
1785 IN int a_attrnamespace;
1786 IN const char *a_name;
1787 INOUT struct uio *a_uio;
1788 IN struct ucred *a_cred;
1789 IN struct thread *a_td;
1790 } */ *ap)
1791 {
1792 struct vnode *vp;
1793 struct inode *ip;
1794 struct fs *fs;
1795 struct extattr *eap;
1796 uint32_t ealength, ul;
1797 ssize_t ealen;
1798 int olen, eapad1, eapad2, error, i, easize;
1799 uint8_t *eae;
1800 void *tmp;
1801
1802 vp = ap->a_vp;
1803 ip = VTOI(vp);
1804 fs = ITOFS(ip);
1805
1806 if (vp->v_type == VCHR || vp->v_type == VBLK)
1807 return (EOPNOTSUPP);
1808 if (strlen(ap->a_name) == 0)
1809 return (EINVAL);
1810
1811 /* XXX Now unsupported API to delete EAs using NULL uio. */
1812 if (ap->a_uio == NULL)
1813 return (EOPNOTSUPP);
1814
1815 if (vp->v_mount->mnt_flag & MNT_RDONLY)
1816 return (EROFS);
1817
1818 ealen = ap->a_uio->uio_resid;
1819 if (ealen < 0 || ealen > lblktosize(fs, UFS_NXADDR))
1820 return (EINVAL);
1821
1822 error = extattr_check_cred(vp, ap->a_attrnamespace,
1823 ap->a_cred, ap->a_td, VWRITE);
1824 if (error) {
1825 /*
1826 * ffs_lock_ea is not needed there, because the vnode
1827 * must be exclusively locked.
1828 */
1829 if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
1830 ip->i_ea_error = error;
1831 return (error);
1832 }
1833
1834 if (DOINGSUJ(vp)) {
1835 ASSERT_VOP_ELOCKED(vp, "ffs_setextattr");
1836 softdep_prealloc(vp, MNT_WAIT);
1837 if (vp->v_data == NULL)
1838 return (EBADF);
1839 }
1840
1841 error = ffs_open_ea(vp, ap->a_cred, ap->a_td);
1842 if (error)
1843 return (error);
1844
1845 ealength = sizeof(uint32_t) + 3 + strlen(ap->a_name);
1846 eapad1 = roundup2(ealength, 8) - ealength;
1847 eapad2 = roundup2(ealen, 8) - ealen;
1848 ealength += eapad1 + ealen + eapad2;
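/*
 * Worked example (illustrative sizes): for a 3-character name and a
 * 16-byte value, the header is sizeof(uint32_t) + 3 + 3 = 10 bytes and
 * eapad1 = 6 pads it to an 8-byte boundary; the value needs no padding
 * (eapad2 = 0), so ea_length for the record becomes 10 + 6 + 16 + 0 =
 * 32 bytes.
 */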
1849
1850 /*
1851 * CEM: rewrites of the same size or smaller could be done in-place
1852 * instead. (We don't acquire any fine-grained locks in here either,
1853 * so we could also do bigger writes in-place.)
1854 */
1855 eae = malloc(ip->i_ea_len + ealength, M_TEMP, M_WAITOK);
1856 bcopy(ip->i_ea_area, eae, ip->i_ea_len);
1857 easize = ip->i_ea_len;
1858
1859 olen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
1860 &eap, NULL);
1861 if (olen == -1) {
1862 /* new, append at end */
1863 KASSERT(ALIGNED_TO(eae + easize, struct extattr),
1864 ("unaligned"));
1865 eap = (struct extattr *)(eae + easize);
1866 easize += ealength;
1867 } else {
1868 ul = eap->ea_length;
1869 i = (uint8_t *)EXTATTR_NEXT(eap) - eae;
1870 if (ul != ealength) {
1871 bcopy(EXTATTR_NEXT(eap), (uint8_t *)eap + ealength,
1872 easize - i);
1873 easize += (ealength - ul);
1874 }
1875 }
1876 if (easize > lblktosize(fs, UFS_NXADDR)) {
1877 free(eae, M_TEMP);
1878 ffs_close_ea(vp, 0, ap->a_cred, ap->a_td);
1879 if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
1880 ip->i_ea_error = ENOSPC;
1881 return (ENOSPC);
1882 }
1883 eap->ea_length = ealength;
1884 eap->ea_namespace = ap->a_attrnamespace;
1885 eap->ea_contentpadlen = eapad2;
1886 eap->ea_namelength = strlen(ap->a_name);
1887 memcpy(eap->ea_name, ap->a_name, strlen(ap->a_name));
1888 bzero(&eap->ea_name[strlen(ap->a_name)], eapad1);
1889 error = uiomove(EXTATTR_CONTENT(eap), ealen, ap->a_uio);
1890 if (error) {
1891 free(eae, M_TEMP);
1892 ffs_close_ea(vp, 0, ap->a_cred, ap->a_td);
1893 if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
1894 ip->i_ea_error = error;
1895 return (error);
1896 }
1897 bzero((uint8_t *)EXTATTR_CONTENT(eap) + ealen, eapad2);
1898
1899 tmp = ip->i_ea_area;
1900 ip->i_ea_area = eae;
1901 ip->i_ea_len = easize;
1902 free(tmp, M_TEMP);
1903 error = ffs_close_ea(vp, 1, ap->a_cred, ap->a_td);
1904 return (error);
1905 }
1906
1907 /*
1908 * Vnode pointer to File handle
1909 */
1910 static int
1911 ffs_vptofh(
1912 struct vop_vptofh_args /* {
1913 IN struct vnode *a_vp;
1914 IN struct fid *a_fhp;
1915 } */ *ap)
1916 {
1917 struct inode *ip;
1918 struct ufid *ufhp;
1919 _Static_assert(sizeof(struct ufid) <= sizeof(struct fid),
1920 "struct ufid cannot be larger than struct fid");
1921
1922 ip = VTOI(ap->a_vp);
1923 ufhp = (struct ufid *)ap->a_fhp;
1924 ufhp->ufid_len = sizeof(struct ufid);
1925 ufhp->ufid_ino = ip->i_number;
1926 ufhp->ufid_gen = ip->i_gen;
1927 return (0);
1928 }
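/*
 * The resulting ufid carries only the inode number and generation; the
 * generation is what lets a later VFS_FHTOVP() (e.g. on behalf of an
 * NFS server) reject a stale handle whose inode has been freed and
 * reused since the handle was issued.
 */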
1929
1930 SYSCTL_DECL(_vfs_ffs);
1931 static int use_buf_pager = 1;
1932 SYSCTL_INT(_vfs_ffs, OID_AUTO, use_buf_pager, CTLFLAG_RWTUN, &use_buf_pager, 0,
1933 "Always use buffer pager instead of bmap");
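/*
 * A minimal usage sketch (assuming the standard sysctl(8) interface):
 * the knob appears as vfs.ffs.use_buf_pager and, being CTLFLAG_RWTUN,
 * may be set as a loader tunable or at runtime, e.g.
 * "sysctl vfs.ffs.use_buf_pager=0" to fall back to the generic vnode
 * pager when the device block size permits.
 */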
1934
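/*
 * Callbacks handed to vfs_bio_getpages(): translate a byte offset into
 * its logical block number and report the size of a given logical
 * block (possibly a fragment at end of file) so that the buffer pager
 * can read pages through the buffer cache.
 */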
1935 static daddr_t
1936 ffs_gbp_getblkno(struct vnode *vp, vm_ooffset_t off)
1937 {
1938
1939 return (lblkno(VFSTOUFS(vp->v_mount)->um_fs, off));
1940 }
1941
1942 static int
1943 ffs_gbp_getblksz(struct vnode *vp, daddr_t lbn, long *sz)
1944 {
1945
1946 *sz = blksize(VFSTOUFS(vp->v_mount)->um_fs, VTOI(vp), lbn);
1947 return (0);
1948 }
1949
1950 static int
1951 ffs_getpages(struct vop_getpages_args *ap)
1952 {
1953 struct vnode *vp;
1954 struct ufsmount *um;
1955
1956 vp = ap->a_vp;
1957 um = VFSTOUFS(vp->v_mount);
1958
1959 if (!use_buf_pager && um->um_devvp->v_bufobj.bo_bsize <= PAGE_SIZE)
1960 return (vnode_pager_generic_getpages(vp, ap->a_m, ap->a_count,
1961 ap->a_rbehind, ap->a_rahead, NULL, NULL));
1962 return (vfs_bio_getpages(vp, ap->a_m, ap->a_count, ap->a_rbehind,
1963 ap->a_rahead, ffs_gbp_getblkno, ffs_gbp_getblksz));
1964 }
1965
1966 static int
1967 ffs_getpages_async(struct vop_getpages_async_args *ap)
1968 {
1969 struct vnode *vp;
1970 struct ufsmount *um;
1971 bool do_iodone;
1972 int error;
1973
1974 vp = ap->a_vp;
1975 um = VFSTOUFS(vp->v_mount);
1976 do_iodone = true;
1977
1978 if (um->um_devvp->v_bufobj.bo_bsize <= PAGE_SIZE) {
1979 error = vnode_pager_generic_getpages(vp, ap->a_m, ap->a_count,
1980 ap->a_rbehind, ap->a_rahead, ap->a_iodone, ap->a_arg);
1981 if (error == 0)
1982 do_iodone = false;
1983 } else {
1984 error = vfs_bio_getpages(vp, ap->a_m, ap->a_count,
1985 ap->a_rbehind, ap->a_rahead, ffs_gbp_getblkno,
1986 ffs_gbp_getblksz);
1987 }
1988 if (do_iodone && ap->a_iodone != NULL)
1989 ap->a_iodone(ap->a_arg, ap->a_m, ap->a_count, error);
1990
1991 return (error);
1992 }
1993
1994 static int
1995 ffs_vput_pair(struct vop_vput_pair_args *ap)
1996 {
1997 struct mount *mp;
1998 struct vnode *dvp, *vp, *vp1, **vpp;
1999 struct inode *dp, *ip;
2000 ino_t ip_ino;
2001 uint64_t ip_gen;
2002 int error, vp_locked;
2003
2004 dvp = ap->a_dvp;
2005 dp = VTOI(dvp);
2006 vpp = ap->a_vpp;
2007 vp = vpp != NULL ? *vpp : NULL;
2008
2009 if ((dp->i_flag & (IN_NEEDSYNC | IN_ENDOFF)) == 0) {
2010 vput(dvp);
2011 if (vp != NULL && ap->a_unlock_vp)
2012 vput(vp);
2013 return (0);
2014 }
2015
2016 mp = dvp->v_mount;
2017 if (vp != NULL) {
2018 if (ap->a_unlock_vp) {
2019 vput(vp);
2020 } else {
2021 MPASS(vp->v_type != VNON);
2022 vp_locked = VOP_ISLOCKED(vp);
2023 ip = VTOI(vp);
2024 ip_ino = ip->i_number;
2025 ip_gen = ip->i_gen;
2026 VOP_UNLOCK(vp);
2027 }
2028 }
2029
2030 /*
2031 * If compaction or fsync was requested do it in ffs_vput_pair()
2032 * now that other locks are no longer held.
2033 */
2034 if ((dp->i_flag & IN_ENDOFF) != 0) {
2035 VNASSERT(I_ENDOFF(dp) != 0 && I_ENDOFF(dp) < dp->i_size, dvp,
2036 ("IN_ENDOFF set but I_ENDOFF() is not"));
2037 dp->i_flag &= ~IN_ENDOFF;
2038 error = UFS_TRUNCATE(dvp, (off_t)I_ENDOFF(dp), IO_NORMAL |
2039 (DOINGASYNC(dvp) ? 0 : IO_SYNC), curthread->td_ucred);
2040 if (error != 0 && error != ERELOOKUP) {
2041 if (!ffs_fsfail_cleanup(VFSTOUFS(mp), error)) {
2042 vn_printf(dvp,
2043 "IN_ENDOFF: failed to truncate, "
2044 "error %d\n", error);
2045 }
2046 #ifdef UFS_DIRHASH
2047 ufsdirhash_free(dp);
2048 #endif
2049 }
2050 SET_I_ENDOFF(dp, 0);
2051 }
2052 if ((dp->i_flag & IN_NEEDSYNC) != 0) {
2053 do {
2054 error = ffs_syncvnode(dvp, MNT_WAIT, 0);
2055 } while (error == ERELOOKUP);
2056 }
2057
2058 vput(dvp);
2059
2060 if (vp == NULL || ap->a_unlock_vp)
2061 return (0);
2062 MPASS(mp != NULL);
2063
2064 /*
2065 * It is possible that vp is reclaimed at this point. Only
2066 * routines that call us with a_unlock_vp == false can find
2067 * that their vp has been reclaimed. There are three areas
2068 * that are affected:
2069 * 1) vn_open_cred() - later VOPs could fail, but
2070 * dead_open() returns 0 to simulate successful open.
2071 * 2) ffs_snapshot() - creation of snapshot fails with EBADF.
2072 * 3) NFS server (several places) - code is prepared to detect
2073 * and respond to dead vnodes by returning ESTALE.
2074 */
2075 VOP_LOCK(vp, vp_locked | LK_RETRY);
2076 if (IS_UFS(vp))
2077 return (0);
2078
2079 /*
2080 * Try harder to recover from reclaimed vp if reclaim was not
2081 * because underlying inode was cleared. We saved inode
2082 * number and inode generation, so we can try to reinstantiate
2083 * exactly same version of inode. If this fails, return
2084 * original doomed vnode and let caller to handle
2085 * consequences.
2086 *
2087 * Note that callers must keep write started around
2088 * VOP_VPUT_PAIR() calls, so it is safe to use mp without
2089 * busying it.
2090 */
2091 VOP_UNLOCK(vp);
2092 error = ffs_inotovp(mp, ip_ino, ip_gen, LK_EXCLUSIVE, &vp1,
2093 FFSV_REPLACE_DOOMED);
2094 if (error != 0) {
2095 VOP_LOCK(vp, vp_locked | LK_RETRY);
2096 } else {
2097 vrele(vp);
2098 *vpp = vp1;
2099 }
2100 return (error);
2101 }
2102