/*
 * Copyright (c) 2002 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Marshall
 * Kirk McKusick and Network Associates Laboratories, the Security
 * Research Division of Network Associates, Inc. under DARPA/SPAWAR
 * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
 * research program
 *
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)ffs_vnops.c	8.15 (Berkeley) 5/14/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/bio.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/extattr.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/stat.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>

#include <ufs/ufs/extattr.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ufs/ufsmount.h>

#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>
#include "opt_directio.h"

#ifdef DIRECTIO
extern int	ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone);
#endif
static int	ffs_fsync(struct vop_fsync_args *);
static int	ffs_getpages(struct vop_getpages_args *);
static int	ffs_read(struct vop_read_args *);
static int	ffs_write(struct vop_write_args *);
static int	ffs_extread(struct vnode *vp, struct uio *uio, int ioflag);
static int	ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag,
		    struct ucred *cred);
static int	ffsext_strategy(struct vop_strategy_args *);
static int	ffs_closeextattr(struct vop_closeextattr_args *);
static int	ffs_getextattr(struct vop_getextattr_args *);
static int	ffs_listextattr(struct vop_listextattr_args *);
static int	ffs_openextattr(struct vop_openextattr_args *);
static int	ffs_setextattr(struct vop_setextattr_args *);


/* Global vfs data structures for ufs. */
vop_t **ffs_vnodeop_p;
static struct vnodeopv_entry_desc ffs_vnodeop_entries[] = {
	{ &vop_default_desc,		(vop_t *) ufs_vnoperate },
	{ &vop_fsync_desc,		(vop_t *) ffs_fsync },
	{ &vop_getpages_desc,		(vop_t *) ffs_getpages },
	{ &vop_read_desc,		(vop_t *) ffs_read },
	{ &vop_reallocblks_desc,	(vop_t *) ffs_reallocblks },
	{ &vop_write_desc,		(vop_t *) ffs_write },
	{ &vop_closeextattr_desc,	(vop_t *) ffs_closeextattr },
	{ &vop_getextattr_desc,		(vop_t *) ffs_getextattr },
	{ &vop_listextattr_desc,	(vop_t *) ffs_listextattr },
	{ &vop_openextattr_desc,	(vop_t *) ffs_openextattr },
	{ &vop_setextattr_desc,		(vop_t *) ffs_setextattr },
	{ NULL, NULL }
};
static struct vnodeopv_desc ffs_vnodeop_opv_desc =
	{ &ffs_vnodeop_p, ffs_vnodeop_entries };

vop_t **ffs_specop_p;
static struct vnodeopv_entry_desc ffs_specop_entries[] = {
	{ &vop_default_desc,		(vop_t *) ufs_vnoperatespec },
	{ &vop_fsync_desc,		(vop_t *) ffs_fsync },
	{ &vop_reallocblks_desc,	(vop_t *) ffs_reallocblks },
	{ &vop_strategy_desc,		(vop_t *) ffsext_strategy },
	{ &vop_closeextattr_desc,	(vop_t *) ffs_closeextattr },
	{ &vop_getextattr_desc,		(vop_t *) ffs_getextattr },
	{ &vop_listextattr_desc,	(vop_t *) ffs_listextattr },
	{ &vop_openextattr_desc,	(vop_t *) ffs_openextattr },
	{ &vop_setextattr_desc,		(vop_t *) ffs_setextattr },
	{ NULL, NULL }
};
static struct vnodeopv_desc ffs_specop_opv_desc =
	{ &ffs_specop_p, ffs_specop_entries };

vop_t **ffs_fifoop_p;
static struct vnodeopv_entry_desc ffs_fifoop_entries[] = {
	{ &vop_default_desc,		(vop_t *) ufs_vnoperatefifo },
	{ &vop_fsync_desc,		(vop_t *) ffs_fsync },
	{ &vop_reallocblks_desc,	(vop_t *) ffs_reallocblks },
	{ &vop_strategy_desc,		(vop_t *) ffsext_strategy },
	{ &vop_closeextattr_desc,	(vop_t *) ffs_closeextattr },
	{ &vop_getextattr_desc,		(vop_t *) ffs_getextattr },
	{ &vop_listextattr_desc,	(vop_t *) ffs_listextattr },
	{ &vop_openextattr_desc,	(vop_t *) ffs_openextattr },
	{ &vop_setextattr_desc,		(vop_t *) ffs_setextattr },
	{ NULL, NULL }
};
static struct vnodeopv_desc ffs_fifoop_opv_desc =
	{ &ffs_fifoop_p, ffs_fifoop_entries };

VNODEOP_SET(ffs_vnodeop_opv_desc);
VNODEOP_SET(ffs_specop_opv_desc);
VNODEOP_SET(ffs_fifoop_opv_desc);

/*
 * Synch an open file.
 */
/* ARGSUSED */
static int
ffs_fsync(ap)
	struct vop_fsync_args /* {
		struct vnode *a_vp;
		struct ucred *a_cred;
		int a_waitfor;
		struct thread *a_td;
	} */ *ap;
{
	struct vnode *vp = ap->a_vp;
	struct inode *ip = VTOI(vp);
	struct buf *bp;
	struct buf *nbp;
	int s, error, wait, passes, skipmeta;
	ufs_lbn_t lbn;

	wait = (ap->a_waitfor == MNT_WAIT);
	if (vn_isdisk(vp, NULL)) {
		lbn = INT_MAX;
		if (vp->v_rdev->si_mountpoint != NULL &&
		    (vp->v_rdev->si_mountpoint->mnt_flag & MNT_SOFTDEP))
			softdep_fsync_mountdev(vp);
	} else {
		lbn = lblkno(ip->i_fs, (ip->i_size + ip->i_fs->fs_bsize - 1));
	}

	/*
	 * Flush all dirty buffers associated with a vnode.
	 */
	passes = NIADDR + 1;
	skipmeta = 0;
	if (wait)
		skipmeta = 1;
	s = splbio();
	VI_LOCK(vp);
loop:
	TAILQ_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs)
		bp->b_vflags &= ~BV_SCANNED;
	for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
		nbp = TAILQ_NEXT(bp, b_vnbufs);
		/*
		 * Reasons to skip this buffer: it has already been considered
		 * on this pass, this pass is the first time through on a
		 * synchronous flush request and the buffer being considered
		 * is metadata, the buffer has dependencies that will cause
		 * it to be redirtied and it has not already been deferred,
		 * or it is already being written.
		 */
		if ((bp->b_vflags & BV_SCANNED) != 0)
			continue;
		bp->b_vflags |= BV_SCANNED;
		if ((skipmeta == 1 && bp->b_lblkno < 0))
			continue;
		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL))
			continue;
		if (!wait && LIST_FIRST(&bp->b_dep) != NULL &&
		    (bp->b_flags & B_DEFERRED) == 0 &&
		    buf_countdeps(bp, 0)) {
			bp->b_flags |= B_DEFERRED;
			BUF_UNLOCK(bp);
			continue;
		}
		VI_UNLOCK(vp);
		if ((bp->b_flags & B_DELWRI) == 0)
			panic("ffs_fsync: not dirty");
		if (vp != bp->b_vp)
			panic("ffs_fsync: vp != vp->b_vp");
		/*
		 * If this is a synchronous flush request, or it is not a
		 * file or device, start the write on this buffer immediately.
		 */
		if (wait || (vp->v_type != VREG && vp->v_type != VBLK)) {

			/*
			 * On our final pass through, do all I/O synchronously
			 * so that we can find out if our flush is failing
			 * because of write errors.
			 */
			if (passes > 0 || !wait) {
				if ((bp->b_flags & B_CLUSTEROK) && !wait) {
					(void) vfs_bio_awrite(bp);
				} else {
					bremfree(bp);
					splx(s);
					(void) bawrite(bp);
					s = splbio();
				}
			} else {
				bremfree(bp);
				splx(s);
				if ((error = bwrite(bp)) != 0)
					return (error);
				s = splbio();
			}
		} else if ((vp->v_type == VREG) && (bp->b_lblkno >= lbn)) {
			/*
			 * If the buffer is for data that has been truncated
			 * off the file, then throw it away.
			 */
			bremfree(bp);
			bp->b_flags |= B_INVAL | B_NOCACHE;
			splx(s);
			brelse(bp);
			s = splbio();
		} else
			vfs_bio_awrite(bp);

		/*
		 * Since we may have slept during the I/O, we need
		 * to start from a known point.
		 */
		VI_LOCK(vp);
		nbp = TAILQ_FIRST(&vp->v_dirtyblkhd);
	}
	/*
	 * If we were asked to do this synchronously, then go back for
	 * another pass, this time doing the metadata.
	 */
	if (skipmeta) {
		skipmeta = 0;
		goto loop;
	}

	if (wait) {
		while (vp->v_numoutput) {
			vp->v_iflag |= VI_BWAIT;
			msleep((caddr_t)&vp->v_numoutput, VI_MTX(vp),
			    PRIBIO + 4, "ffsfsn", 0);
		}
		VI_UNLOCK(vp);

		/*
		 * Ensure that any filesystem metadata associated
		 * with the vnode has been written.
		 */
		splx(s);
		if ((error = softdep_sync_metadata(ap)) != 0)
			return (error);
		s = splbio();

		VI_LOCK(vp);
		if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
			/*
			 * Block devices associated with filesystems may
			 * have new I/O requests posted for them even if
			 * the vnode is locked, so no amount of trying will
			 * get them clean.  Thus we give block devices a
			 * good effort, then just give up.  For all other file
			 * types, go around and try again until it is clean.
			 */
			if (passes > 0) {
				passes -= 1;
				goto loop;
			}
#ifdef DIAGNOSTIC
			if (!vn_isdisk(vp, NULL))
				vprint("ffs_fsync: dirty", vp);
#endif
		}
	}
	VI_UNLOCK(vp);
	splx(s);
	return (UFS_UPDATE(vp, wait));
}


/*
 * Vnode op for reading.
 */
/* ARGSUSED */
static int
ffs_read(ap)
	struct vop_read_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	struct vnode *vp;
	struct inode *ip;
	struct uio *uio;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn, nextlbn;
	off_t bytesinfile;
	long size, xfersize, blkoffset;
	int error, orig_resid;
	int seqcount;
	int ioflag;
	vm_object_t object;

	vp = ap->a_vp;
	uio = ap->a_uio;
	ioflag = ap->a_ioflag;
	if (ap->a_ioflag & IO_EXT)
#ifdef notyet
		return (ffs_extread(vp, uio, ioflag));
#else
		panic("ffs_read+IO_EXT");
#endif
#ifdef DIRECTIO
	if ((ioflag & IO_DIRECT) != 0) {
		int workdone;

		error = ffs_rawread(vp, uio, &workdone);
		if (error != 0 || workdone != 0)
			return error;
	}
#endif

	GIANT_REQUIRED;

	seqcount = ap->a_ioflag >> 16;
	ip = VTOI(vp);

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_READ)
		panic("ffs_read: mode");

	if (vp->v_type == VLNK) {
		if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen)
			panic("ffs_read: short symlink");
	} else if (vp->v_type != VREG && vp->v_type != VDIR)
		panic("ffs_read: type %d", vp->v_type);
#endif
	fs = ip->i_fs;
	if ((u_int64_t)uio->uio_offset > fs->fs_maxfilesize)
		return (EFBIG);

	orig_resid = uio->uio_resid;
	if (orig_resid <= 0)
		return (0);

	object = vp->v_object;

	bytesinfile = ip->i_size - uio->uio_offset;
	if (bytesinfile <= 0) {
		if ((vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
			ip->i_flag |= IN_ACCESS;
		return 0;
	}

	if (object) {
		vm_object_reference(object);
	}

	/*
	 * Ok so we couldn't do it all in one vm trick...
	 * so cycle around trying smaller bites..
	 */
	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
		if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0)
			break;

		lbn = lblkno(fs, uio->uio_offset);
		nextlbn = lbn + 1;

		/*
		 * size of buffer.  The buffer representing the
		 * end of the file is rounded up to the size of
		 * the block type (fragment or full block,
		 * depending).
		 */
		size = blksize(fs, ip, lbn);
		blkoffset = blkoff(fs, uio->uio_offset);

		/*
		 * The amount we want to transfer in this iteration is
		 * one FS block less the amount of the data before
		 * our startpoint (duh!)
		 */
		xfersize = fs->fs_bsize - blkoffset;

		/*
		 * But if we actually want less than the block,
		 * or the file doesn't have a whole block more of data,
		 * then use the lesser number.
		 */
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;
		if (bytesinfile < xfersize)
			xfersize = bytesinfile;

		if (lblktosize(fs, nextlbn) >= ip->i_size) {
			/*
			 * Don't do readahead if this is the end of the file.
			 */
			error = bread(vp, lbn, size, NOCRED, &bp);
		} else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
			/*
			 * Otherwise if we are allowed to cluster,
			 * grab as much as we can.
			 *
			 * XXX  This may not be a win if we are not
			 * doing sequential access.
			 */
			error = cluster_read(vp, ip->i_size, lbn,
			    size, NOCRED, uio->uio_resid, seqcount, &bp);
		} else if (seqcount > 1) {
			/*
			 * If we are NOT allowed to cluster, then
			 * if we appear to be acting sequentially,
			 * fire off a request for a readahead
			 * as well as a read.  Note that the 4th and 5th
			 * arguments point to arrays of the size specified in
			 * the 6th argument.
			 */
			int nextsize = blksize(fs, ip, nextlbn);
			error = breadn(vp, lbn,
			    size, &nextlbn, &nextsize, 1, NOCRED, &bp);
		} else {
			/*
			 * Failing all of the above, just read what the
			 * user asked for.  Interestingly, the same as
			 * the first option above.
			 */
			error = bread(vp, lbn, size, NOCRED, &bp);
		}
		if (error) {
			brelse(bp);
			bp = NULL;
			break;
		}

		/*
		 * If IO_DIRECT then set B_DIRECT for the buffer.  This
		 * will cause us to attempt to release the buffer later on
		 * and will cause the buffer cache to attempt to free the
		 * underlying pages.
		 */
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;

		/*
		 * We should only get non-zero b_resid when an I/O error
		 * has occurred, which should cause us to break above.
		 * However, if the short read did not cause an error,
		 * then we want to ensure that we do not uiomove bad
		 * or uninitialized data.
		 */
		size -= bp->b_resid;
		if (size < xfersize) {
			if (size == 0)
				break;
			xfersize = size;
		}

		{
			/*
			 * otherwise use the general form
			 */
			error =
			    uiomove((char *)bp->b_data + blkoffset,
			    (int)xfersize, uio);
		}

		if (error)
			break;

		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_FIRST(&bp->b_dep) == NULL)) {
			/*
			 * If there are no dependencies, and it's VMIO,
			 * then we don't need the buf, mark it available
			 * for freeing.  The VM has the data.
			 */
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			/*
			 * Otherwise let whoever
			 * made the request take care of
			 * freeing it.  We just queue
			 * it onto another list.
			 */
			bqrelse(bp);
		}
	}

	/*
	 * This can only happen in the case of an error, because the loop
	 * above resets bp to NULL on each iteration and on normal
	 * completion has not set a new value into it, so it must have come
	 * from a 'break' statement.
	 */
	if (bp != NULL) {
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_FIRST(&bp->b_dep) == NULL)) {
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			bqrelse(bp);
		}
	}

	if (object) {
		VM_OBJECT_LOCK(object);
		vm_object_vndeallocate(object);
	}
	if ((error == 0 || uio->uio_resid != orig_resid) &&
	    (vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
		ip->i_flag |= IN_ACCESS;
	return (error);
}

/*
 * Vnode op for writing.
 */
static int
ffs_write(ap)
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	struct vnode *vp;
	struct uio *uio;
	struct inode *ip;
	struct fs *fs;
	struct buf *bp;
	struct thread *td;
	ufs_lbn_t lbn;
	off_t osize;
	int seqcount;
	int blkoffset, error, extended, flags, ioflag, resid, size, xfersize;
	vm_object_t object;

	vp = ap->a_vp;
	uio = ap->a_uio;
	ioflag = ap->a_ioflag;
	if (ap->a_ioflag & IO_EXT)
#ifdef notyet
		return (ffs_extwrite(vp, uio, ioflag, ap->a_cred));
#else
		panic("ffs_write+IO_EXT");
#endif

	GIANT_REQUIRED;

	extended = 0;
	seqcount = ap->a_ioflag >> 16;
	ip = VTOI(vp);

	object = vp->v_object;
	if (object) {
		vm_object_reference(object);
	}

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_WRITE)
		panic("ffswrite: mode");
#endif

	switch (vp->v_type) {
	case VREG:
		if (ioflag & IO_APPEND)
			uio->uio_offset = ip->i_size;
		if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size) {
			if (object) {
				VM_OBJECT_LOCK(object);
				vm_object_vndeallocate(object);
			}
			return (EPERM);
		}
		/* FALLTHROUGH */
	case VLNK:
		break;
	case VDIR:
		panic("ffswrite: dir write");
		break;
	default:
		panic("ffswrite: type %p %d (%d,%d)", vp, (int)vp->v_type,
		    (int)uio->uio_offset,
		    (int)uio->uio_resid
		);
	}

	fs = ip->i_fs;
	if (uio->uio_offset < 0 ||
	    (u_int64_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize) {
		if (object) {
			VM_OBJECT_LOCK(object);
			vm_object_vndeallocate(object);
		}
		return (EFBIG);
	}
	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, I don't think it matters.
	 */
	td = uio->uio_td;
	if (vp->v_type == VREG && td &&
	    uio->uio_offset + uio->uio_resid >
	    td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
		PROC_LOCK(td->td_proc);
		psignal(td->td_proc, SIGXFSZ);
		PROC_UNLOCK(td->td_proc);
		if (object) {
			VM_OBJECT_LOCK(object);
			vm_object_vndeallocate(object);
		}
		return (EFBIG);
	}

	resid = uio->uio_resid;
	osize = ip->i_size;
	if (seqcount > BA_SEQMAX)
		flags = BA_SEQMAX << BA_SEQSHIFT;
	else
		flags = seqcount << BA_SEQSHIFT;
	if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
		flags |= IO_SYNC;

	for (error = 0; uio->uio_resid > 0;) {
		lbn = lblkno(fs, uio->uio_offset);
		blkoffset = blkoff(fs, uio->uio_offset);
		xfersize = fs->fs_bsize - blkoffset;
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;

		if (uio->uio_offset + xfersize > ip->i_size)
			vnode_pager_setsize(vp, uio->uio_offset + xfersize);

		/*
		 * We must perform a read-before-write if the transfer size
		 * does not cover the entire buffer.
		 */
		if (fs->fs_bsize > xfersize)
			flags |= BA_CLRBUF;
		else
			flags &= ~BA_CLRBUF;
		/* XXX is uio->uio_offset the right thing here? */
		error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
		    ap->a_cred, flags, &bp);
		if (error != 0)
			break;
		/*
		 * If the buffer is not valid we have to clear out any
		 * garbage data from the pages instantiated for the buffer.
		 * If we do not, a failed uiomove() during a write can leave
		 * the prior contents of the pages exposed to a userland
		 * mmap().  XXX deal with uiomove() errors a better way.
		 */
		if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
			vfs_bio_clrbuf(bp);
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;

		if (uio->uio_offset + xfersize > ip->i_size) {
			ip->i_size = uio->uio_offset + xfersize;
			DIP(ip, i_size) = ip->i_size;
			extended = 1;
		}

		size = blksize(fs, ip, lbn) - bp->b_resid;
		if (size < xfersize)
			xfersize = size;

		error =
		    uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_FIRST(&bp->b_dep) == NULL)) {
			bp->b_flags |= B_RELBUF;
		}

		/*
		 * If IO_SYNC each buffer is written synchronously.  Otherwise
		 * if we have a severe page deficiency write the buffer
		 * asynchronously.  Otherwise try to cluster, and if that
		 * doesn't do it then either do an async write (if O_DIRECT),
		 * or a delayed write (if not).
		 */
		if (ioflag & IO_SYNC) {
			(void)bwrite(bp);
		} else if (vm_page_count_severe() ||
		    buf_dirty_count_severe() ||
		    (ioflag & IO_ASYNC)) {
			bp->b_flags |= B_CLUSTEROK;
			bawrite(bp);
		} else if (xfersize + blkoffset == fs->fs_bsize) {
			if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
				bp->b_flags |= B_CLUSTEROK;
				cluster_write(bp, ip->i_size, seqcount);
			} else {
				bawrite(bp);
			}
		} else if (ioflag & IO_DIRECT) {
			bp->b_flags |= B_CLUSTEROK;
			bawrite(bp);
		} else {
			bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		}
		if (error || xfersize == 0)
			break;
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	}
	/*
	 * If we successfully wrote any data, and we are not the superuser,
	 * we clear the setuid and setgid bits as a precaution against
	 * tampering.
	 */
	if (resid > uio->uio_resid && ap->a_cred &&
	    suser_cred(ap->a_cred, PRISON_ROOT)) {
		ip->i_mode &= ~(ISUID | ISGID);
		DIP(ip, i_mode) = ip->i_mode;
	}
	if (resid > uio->uio_resid)
		VN_KNOTE(vp, NOTE_WRITE | (extended ? NOTE_EXTEND : 0));
	if (error) {
		if (ioflag & IO_UNIT) {
			(void)UFS_TRUNCATE(vp, osize,
			    IO_NORMAL | (ioflag & IO_SYNC),
			    ap->a_cred, uio->uio_td);
			uio->uio_offset -= resid - uio->uio_resid;
			uio->uio_resid = resid;
		}
	} else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
		error = UFS_UPDATE(vp, 1);

	if (object) {
		VM_OBJECT_LOCK(object);
		vm_object_vndeallocate(object);
	}

	return (error);
}

/*
 * get page routine
 */
static int
ffs_getpages(ap)
	struct vop_getpages_args *ap;
{
	off_t foff, physoffset;
	int i, size, bsize;
	struct vnode *dp, *vp;
	vm_object_t obj;
	vm_pindex_t pindex;
	vm_page_t mreq;
	int bbackwards, bforwards;
	int pbackwards, pforwards;
	int firstpage;
	ufs2_daddr_t reqblkno, reqlblkno;
	int poff;
	int pcount;
	int rtval;
	int pagesperblock;

	GIANT_REQUIRED;

	pcount = round_page(ap->a_count) / PAGE_SIZE;
	mreq = ap->a_m[ap->a_reqpage];

	/*
	 * if ANY DEV_BSIZE blocks are valid on a large filesystem block,
	 * then the entire page is valid.  Since the page may be mapped,
	 * user programs might reference data beyond the actual end of file
	 * occurring within the page.  We have to zero that data.
	 */
	if (mreq->valid) {
		if (mreq->valid != VM_PAGE_BITS_ALL)
			vm_page_zero_invalid(mreq, TRUE);
		VM_OBJECT_LOCK(mreq->object);
		vm_page_lock_queues();
		for (i = 0; i < pcount; i++) {
			if (i != ap->a_reqpage) {
				vm_page_free(ap->a_m[i]);
			}
		}
		vm_page_unlock_queues();
		VM_OBJECT_UNLOCK(mreq->object);
		return VM_PAGER_OK;
	}

	vp = ap->a_vp;
	obj = vp->v_object;
	bsize = vp->v_mount->mnt_stat.f_iosize;
	pindex = mreq->pindex;
	foff = IDX_TO_OFF(pindex) /* + ap->a_offset should be zero */;

	if (bsize < PAGE_SIZE)
		return vnode_pager_generic_getpages(ap->a_vp, ap->a_m,
		    ap->a_count, ap->a_reqpage);

	/*
	 * foff is the file offset of the required page
	 * reqlblkno is the logical block that contains the page
	 * poff is the index of the page into the logical block
	 */
	reqlblkno = foff / bsize;
	poff = (foff % bsize) / PAGE_SIZE;

	dp = VTOI(vp)->i_devvp;
	if (ufs_bmaparray(vp, reqlblkno, &reqblkno, 0, &bforwards, &bbackwards)
	    || (reqblkno == -1)) {
		VM_OBJECT_LOCK(obj);
		vm_page_lock_queues();
		for (i = 0; i < pcount; i++) {
			if (i != ap->a_reqpage)
				vm_page_free(ap->a_m[i]);
		}
		vm_page_unlock_queues();
		VM_OBJECT_UNLOCK(obj);
		if (reqblkno == -1) {
			if ((mreq->flags & PG_ZERO) == 0)
				pmap_zero_page(mreq);
			vm_page_undirty(mreq);
			mreq->valid = VM_PAGE_BITS_ALL;
			return VM_PAGER_OK;
		} else {
			return VM_PAGER_ERROR;
		}
	}

	physoffset = (off_t)reqblkno * DEV_BSIZE + poff * PAGE_SIZE;
	pagesperblock = bsize / PAGE_SIZE;
	/*
	 * find the first page that is contiguous...
	 * note that pbackwards is the number of pages that are contiguous
	 * backwards.
	 */
	firstpage = 0;
	if (ap->a_count) {
		pbackwards = poff + bbackwards * pagesperblock;
		if (ap->a_reqpage > pbackwards) {
			firstpage = ap->a_reqpage - pbackwards;
			VM_OBJECT_LOCK(obj);
			vm_page_lock_queues();
			for (i = 0; i < firstpage; i++)
				vm_page_free(ap->a_m[i]);
			vm_page_unlock_queues();
			VM_OBJECT_UNLOCK(obj);
		}

		/*
		 * pforwards is the number of pages that are contiguous
		 * after the current page.
		 */
		pforwards = (pagesperblock - (poff + 1)) +
		    bforwards * pagesperblock;
		if (pforwards < (pcount - (ap->a_reqpage + 1))) {
			VM_OBJECT_LOCK(obj);
			vm_page_lock_queues();
			for (i = ap->a_reqpage + pforwards + 1; i < pcount; i++)
				vm_page_free(ap->a_m[i]);
			vm_page_unlock_queues();
			VM_OBJECT_UNLOCK(obj);
			pcount = ap->a_reqpage + pforwards + 1;
		}

		/*
		 * number of pages for I/O corrected for the non-contig pages
		 * at the beginning of the array.
		 */
		pcount -= firstpage;
	}

	/*
	 * calculate the size of the transfer
	 */

	size = pcount * PAGE_SIZE;

	if ((IDX_TO_OFF(ap->a_m[firstpage]->pindex) + size) >
	    obj->un_pager.vnp.vnp_size)
		size = obj->un_pager.vnp.vnp_size -
		    IDX_TO_OFF(ap->a_m[firstpage]->pindex);

	physoffset -= foff;
	rtval = VOP_GETPAGES(dp, &ap->a_m[firstpage], size,
	    (ap->a_reqpage - firstpage), physoffset);

	return (rtval);
}
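
/*
 * The routines that follow operate on the UFS2 extended attribute area,
 * a small per-inode region of at most NXADDR blocks that is addressed
 * with negative logical block numbers (-1 - lbn) to keep it distinct
 * from the regular file data.
 */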

/*
 * Extended attribute area reading.
 */
static int
ffs_extread(struct vnode *vp, struct uio *uio, int ioflag)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn, nextlbn;
	off_t bytesinfile;
	long size, xfersize, blkoffset;
	int error, orig_resid;

	GIANT_REQUIRED;

	ip = VTOI(vp);
	fs = ip->i_fs;
	dp = ip->i_din2;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_READ || fs->fs_magic != FS_UFS2_MAGIC)
		panic("ffs_extread: mode");

#endif
	orig_resid = uio->uio_resid;
	if (orig_resid <= 0)
		return (0);

	bytesinfile = dp->di_extsize - uio->uio_offset;
	if (bytesinfile <= 0) {
		if ((vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
			ip->i_flag |= IN_ACCESS;
		return 0;
	}

	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
		if ((bytesinfile = dp->di_extsize - uio->uio_offset) <= 0)
			break;

		lbn = lblkno(fs, uio->uio_offset);
		nextlbn = lbn + 1;

		/*
		 * size of buffer.  The buffer representing the
		 * end of the file is rounded up to the size of
		 * the block type (fragment or full block,
		 * depending).
		 */
		size = sblksize(fs, dp->di_extsize, lbn);
		blkoffset = blkoff(fs, uio->uio_offset);

		/*
		 * The amount we want to transfer in this iteration is
		 * one FS block less the amount of the data before
		 * our startpoint (duh!)
		 */
		xfersize = fs->fs_bsize - blkoffset;

		/*
		 * But if we actually want less than the block,
		 * or the file doesn't have a whole block more of data,
		 * then use the lesser number.
		 */
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;
		if (bytesinfile < xfersize)
			xfersize = bytesinfile;

		if (lblktosize(fs, nextlbn) >= dp->di_extsize) {
			/*
			 * Don't do readahead if this is the end of the info.
			 */
			error = bread(vp, -1 - lbn, size, NOCRED, &bp);
		} else {
			/*
			 * If we have a second block, then
			 * fire off a request for a readahead
			 * as well as a read.  Note that the 4th and 5th
			 * arguments point to arrays of the size specified in
			 * the 6th argument.
			 */
			int nextsize = sblksize(fs, dp->di_extsize, nextlbn);

			nextlbn = -1 - nextlbn;
			error = breadn(vp, -1 - lbn,
			    size, &nextlbn, &nextsize, 1, NOCRED, &bp);
		}
		if (error) {
			brelse(bp);
			bp = NULL;
			break;
		}

		/*
		 * If IO_DIRECT then set B_DIRECT for the buffer.  This
		 * will cause us to attempt to release the buffer later on
		 * and will cause the buffer cache to attempt to free the
		 * underlying pages.
		 */
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;

		/*
		 * We should only get non-zero b_resid when an I/O error
		 * has occurred, which should cause us to break above.
		 * However, if the short read did not cause an error,
		 * then we want to ensure that we do not uiomove bad
		 * or uninitialized data.
		 */
		size -= bp->b_resid;
		if (size < xfersize) {
			if (size == 0)
				break;
			xfersize = size;
		}

		error = uiomove((char *)bp->b_data + blkoffset,
		    (int)xfersize, uio);
		if (error)
			break;

		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_FIRST(&bp->b_dep) == NULL)) {
			/*
			 * If there are no dependencies, and it's VMIO,
			 * then we don't need the buf, mark it available
			 * for freeing.  The VM has the data.
			 */
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			/*
			 * Otherwise let whoever
			 * made the request take care of
			 * freeing it.  We just queue
			 * it onto another list.
			 */
			bqrelse(bp);
		}
	}

	/*
	 * This can only happen in the case of an error, because the loop
	 * above resets bp to NULL on each iteration and on normal
	 * completion has not set a new value into it, so it must have come
	 * from a 'break' statement.
	 */
	if (bp != NULL) {
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_FIRST(&bp->b_dep) == NULL)) {
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			bqrelse(bp);
		}
	}

	if ((error == 0 || uio->uio_resid != orig_resid) &&
	    (vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
		ip->i_flag |= IN_ACCESS;
	return (error);
}

/*
 * Extended attribute area writing.
 */
static int
ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *ucred)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn;
	off_t osize;
	int blkoffset, error, flags, resid, size, xfersize;

	GIANT_REQUIRED;

	ip = VTOI(vp);
	fs = ip->i_fs;
	dp = ip->i_din2;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_WRITE || fs->fs_magic != FS_UFS2_MAGIC)
		panic("ext_write: mode");
#endif

	if (ioflag & IO_APPEND)
		uio->uio_offset = dp->di_extsize;

	if (uio->uio_offset < 0 ||
	    (u_int64_t)uio->uio_offset + uio->uio_resid > NXADDR * fs->fs_bsize)
		return (EFBIG);

	resid = uio->uio_resid;
	osize = dp->di_extsize;
	flags = IO_EXT;
	if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
		flags |= IO_SYNC;

	for (error = 0; uio->uio_resid > 0;) {
		lbn = lblkno(fs, uio->uio_offset);
		blkoffset = blkoff(fs, uio->uio_offset);
		xfersize = fs->fs_bsize - blkoffset;
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;

		/*
		 * We must perform a read-before-write if the transfer size
		 * does not cover the entire buffer.
		 */
		if (fs->fs_bsize > xfersize)
			flags |= BA_CLRBUF;
		else
			flags &= ~BA_CLRBUF;
		error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
		    ucred, flags, &bp);
		if (error != 0)
			break;
		/*
		 * If the buffer is not valid we have to clear out any
		 * garbage data from the pages instantiated for the buffer.
		 * If we do not, a failed uiomove() during a write can leave
		 * the prior contents of the pages exposed to a userland
		 * mmap().  XXX deal with uiomove() errors a better way.
		 */
		if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
			vfs_bio_clrbuf(bp);
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;

		if (uio->uio_offset + xfersize > dp->di_extsize)
			dp->di_extsize = uio->uio_offset + xfersize;

		size = sblksize(fs, dp->di_extsize, lbn) - bp->b_resid;
		if (size < xfersize)
			xfersize = size;

		error =
		    uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_FIRST(&bp->b_dep) == NULL)) {
			bp->b_flags |= B_RELBUF;
		}

		/*
		 * If IO_SYNC each buffer is written synchronously.  Otherwise
		 * if we have a severe page deficiency write the buffer
		 * asynchronously.  Otherwise try to cluster, and if that
		 * doesn't do it then either do an async write (if O_DIRECT),
		 * or a delayed write (if not).
		 */
		if (ioflag & IO_SYNC) {
			(void)bwrite(bp);
		} else if (vm_page_count_severe() ||
		    buf_dirty_count_severe() ||
		    xfersize + blkoffset == fs->fs_bsize ||
		    (ioflag & (IO_ASYNC | IO_DIRECT)))
			bawrite(bp);
		else
			bdwrite(bp);
		if (error || xfersize == 0)
			break;
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	}
	/*
	 * If we successfully wrote any data, and we are not the superuser,
	 * we clear the setuid and setgid bits as a precaution against
	 * tampering.
	 */
	if (resid > uio->uio_resid && ucred &&
	    suser_cred(ucred, PRISON_ROOT)) {
		ip->i_mode &= ~(ISUID | ISGID);
		dp->di_mode = ip->i_mode;
	}
	if (error) {
		if (ioflag & IO_UNIT) {
			(void)UFS_TRUNCATE(vp, osize,
			    IO_EXT | (ioflag & IO_SYNC), ucred, uio->uio_td);
			uio->uio_offset -= resid - uio->uio_resid;
			uio->uio_resid = resid;
		}
	} else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
		error = UFS_UPDATE(vp, 1);
	return (error);
}


/*
 * Locate a particular EA (nspace:name) in the area (ptr:length), and return
 * the length of the EA, and possibly the pointer to the entry and to the data.
 *
 * Each record in the EA area is laid out as: a 32-bit length covering the
 * whole record, a namespace byte, a byte giving the padding after the data
 * (eapad2), a name-length byte, the name itself padded to an 8-byte
 * boundary (eapad1), the attribute data, and finally eapad2 bytes of padding.
 */
static int
ffs_findextattr(u_char *ptr, uint length, int nspace, const char *name,
    u_char **eap, u_char **eac)
{
	u_char *p, *pe, *pn, *p0;
	int eapad1, eapad2, ealength, ealen, nlen;
	uint32_t ul;

	pe = ptr + length;
	nlen = strlen(name);

	for (p = ptr; p < pe; p = pn) {
		p0 = p;
		bcopy(p, &ul, sizeof(ul));
		pn = p + ul;
		/* make sure this entry is complete */
		if (pn > pe)
			break;
		p += sizeof(uint32_t);
		if (*p != nspace)
			continue;
		p++;
		eapad2 = *p++;
		if (*p != nlen)
			continue;
		p++;
		if (bcmp(p, name, nlen))
			continue;
		ealength = sizeof(uint32_t) + 3 + nlen;
		eapad1 = 8 - (ealength % 8);
		if (eapad1 == 8)
			eapad1 = 0;
		ealength += eapad1;
		ealen = ul - ealength - eapad2;
		p += nlen + eapad1;
		if (eap != NULL)
			*eap = p0;
		if (eac != NULL)
			*eac = p;
		return (ealen);
	}
	return (-1);
}

static int
ffs_rdextattr(u_char **p, struct vnode *vp, struct thread *td, int extra)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	struct uio luio;
	struct iovec liovec;
	int easize, error;
	u_char *eae;

	ip = VTOI(vp);
	dp = ip->i_din2;
	easize = dp->di_extsize;

	eae = malloc(easize + extra, M_TEMP, M_WAITOK);

	liovec.iov_base = eae;
	liovec.iov_len = easize;
	luio.uio_iov = &liovec;
	luio.uio_iovcnt = 1;
	luio.uio_offset = 0;
	luio.uio_resid = easize;
	luio.uio_segflg = UIO_SYSSPACE;
	luio.uio_rw = UIO_READ;
	luio.uio_td = td;

	error = ffs_extread(vp, &luio, IO_EXT | IO_SYNC);
	if (error) {
		free(eae, M_TEMP);
		return (error);
	}
	*p = eae;
	return (0);
}
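
/*
 * Open an extended attribute transaction: read the entire extended
 * attribute area into a kernel buffer (ip->i_ea_area) so that the
 * operations below can examine and modify a cached copy.
 */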
static int
ffs_open_ea(struct vnode *vp, struct ucred *cred, struct thread *td)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	int error;

	ip = VTOI(vp);

	if (ip->i_ea_area != NULL)
		return (EBUSY);
	dp = ip->i_din2;
	error = ffs_rdextattr(&ip->i_ea_area, vp, td, 0);
	if (error)
		return (error);
	ip->i_ea_len = dp->di_extsize;
	ip->i_ea_error = 0;
	return (0);
}

/*
 * Vnode extattr transaction commit/abort
 */
static int
ffs_close_ea(struct vnode *vp, int commit, struct ucred *cred, struct thread *td)
{
	struct inode *ip;
	struct uio luio;
	struct iovec liovec;
	int error;
	struct ufs2_dinode *dp;

	ip = VTOI(vp);
	if (ip->i_ea_area == NULL)
		return (EINVAL);
	dp = ip->i_din2;
	error = ip->i_ea_error;
	if (commit && error == 0) {
		if (cred == NOCRED)
			cred = vp->v_mount->mnt_cred;
		liovec.iov_base = ip->i_ea_area;
		liovec.iov_len = ip->i_ea_len;
		luio.uio_iov = &liovec;
		luio.uio_iovcnt = 1;
		luio.uio_offset = 0;
		luio.uio_resid = ip->i_ea_len;
		luio.uio_segflg = UIO_SYSSPACE;
		luio.uio_rw = UIO_WRITE;
		luio.uio_td = td;
		/* XXX: I'm not happy about truncating to zero size */
		if (ip->i_ea_len < dp->di_extsize)
			error = ffs_truncate(vp, 0, IO_EXT, cred, td);
		error = ffs_extwrite(vp, &luio, IO_EXT | IO_SYNC, cred);
	}
	free(ip->i_ea_area, M_TEMP);
	ip->i_ea_area = NULL;
	ip->i_ea_len = 0;
	ip->i_ea_error = 0;
	return (error);
}

/*
 * Vnode extattr strategy routine for special devices and fifos.
 *
 * We need to check for a read or write of the external attributes.
 * Otherwise we just fall through and do the usual thing.
 */
static int
ffsext_strategy(struct vop_strategy_args *ap)
/*
struct vop_strategy_args {
	struct vnodeop_desc *a_desc;
	struct vnode *a_vp;
	struct buf *a_bp;
};
*/
{
	struct vnode *vp;
	daddr_t lbn;

	KASSERT(ap->a_vp == ap->a_bp->b_vp, ("%s(%p != %p)",
	    __func__, ap->a_vp, ap->a_bp->b_vp));
	vp = ap->a_vp;
	lbn = ap->a_bp->b_lblkno;
	if (VTOI(vp)->i_fs->fs_magic == FS_UFS2_MAGIC &&
	    lbn < 0 && lbn >= -NXADDR)
		return (ufs_vnoperate((struct vop_generic_args *)ap));
	if (vp->v_type == VFIFO)
		return (ufs_vnoperatefifo((struct vop_generic_args *)ap));
	return (ufs_vnoperatespec((struct vop_generic_args *)ap));
}

/*
 * Vnode extattr transaction begin.
 */
static int
ffs_openextattr(struct vop_openextattr_args *ap)
/*
struct vop_openextattr_args {
	struct vnodeop_desc *a_desc;
	struct vnode *a_vp;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;
	if (fs->fs_magic == FS_UFS1_MAGIC)
		return (ufs_vnoperate((struct vop_generic_args *)ap));

	if (ap->a_vp->v_type == VCHR)
		return (EOPNOTSUPP);

	return (ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td));
}


/*
 * Vnode extattr transaction commit/abort
 */
static int
ffs_closeextattr(struct vop_closeextattr_args *ap)
/*
struct vop_closeextattr_args {
	struct vnodeop_desc *a_desc;
	struct vnode *a_vp;
	int a_commit;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;
	if (fs->fs_magic == FS_UFS1_MAGIC)
		return (ufs_vnoperate((struct vop_generic_args *)ap));

	if (ap->a_vp->v_type == VCHR)
		return (EOPNOTSUPP);

	return (ffs_close_ea(ap->a_vp, ap->a_commit, ap->a_cred, ap->a_td));
}
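
/*
 * The get/list/set routines below can be called either inside an open
 * EA transaction or on their own; in the latter case (stand_alone) they
 * open and close the transaction themselves.
 */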

/*
 * Vnode operation to retrieve a named extended attribute.
 */
static int
ffs_getextattr(struct vop_getextattr_args *ap)
/*
vop_getextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	INOUT struct uio *a_uio;
	OUT size_t *a_size;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;
	u_char *eae, *p;
	unsigned easize;
	int error, ealen, stand_alone;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (fs->fs_magic == FS_UFS1_MAGIC)
		return (ufs_vnoperate((struct vop_generic_args *)ap));

	if (ap->a_vp->v_type == VCHR)
		return (EOPNOTSUPP);

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, IREAD);
	if (error)
		return (error);

	if (ip->i_ea_area == NULL) {
		error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
		if (error)
			return (error);
		stand_alone = 1;
	} else {
		stand_alone = 0;
	}
	eae = ip->i_ea_area;
	easize = ip->i_ea_len;

	ealen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
	    NULL, &p);
	if (ealen >= 0) {
		error = 0;
		if (ap->a_size != NULL)
			*ap->a_size = ealen;
		else if (ap->a_uio != NULL)
			error = uiomove(p, ealen, ap->a_uio);
	} else
		error = ENOATTR;
	if (stand_alone)
		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
	return (error);
}

/*
 * Vnode operation to retrieve extended attributes on a vnode.
 */
static int
ffs_listextattr(struct vop_listextattr_args *ap)
/*
vop_listextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	INOUT struct uio *a_uio;
	OUT size_t *a_size;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;
	u_char *eae, *p, *pe, *pn;
	unsigned easize;
	uint32_t ul;
	int error, ealen, stand_alone;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (fs->fs_magic == FS_UFS1_MAGIC)
		return (ufs_vnoperate((struct vop_generic_args *)ap));

	if (ap->a_vp->v_type == VCHR)
		return (EOPNOTSUPP);

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, IREAD);
	if (error)
		return (error);

	if (ip->i_ea_area == NULL) {
		error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
		if (error)
			return (error);
		stand_alone = 1;
	} else {
		stand_alone = 0;
	}
	eae = ip->i_ea_area;
	easize = ip->i_ea_len;

	error = 0;
	if (ap->a_size != NULL)
		*ap->a_size = 0;
	pe = eae + easize;
	for (p = eae; error == 0 && p < pe; p = pn) {
		bcopy(p, &ul, sizeof(ul));
		pn = p + ul;
		if (pn > pe)
			break;
		p += sizeof(ul);
		if (*p++ != ap->a_attrnamespace)
			continue;
		p++;	/* pad2 */
		ealen = *p;
		if (ap->a_size != NULL) {
			*ap->a_size += ealen + 1;
		} else if (ap->a_uio != NULL) {
			error = uiomove(p, ealen + 1, ap->a_uio);
		}
	}
	if (stand_alone)
		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
	return (error);
}

/*
 * Vnode operation to set a named attribute.
 */
static int
ffs_setextattr(struct vop_setextattr_args *ap)
/*
vop_setextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	INOUT struct uio *a_uio;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;
	uint32_t ealength, ul;
	int ealen, olen, eapad1, eapad2, error, i, easize;
	u_char *eae, *p;
	int stand_alone;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (fs->fs_magic == FS_UFS1_MAGIC)
		return (ufs_vnoperate((struct vop_generic_args *)ap));

	if (ap->a_vp->v_type == VCHR)
		return (EOPNOTSUPP);

	if (strlen(ap->a_name) == 0)
		return (EINVAL);

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, IWRITE);
	if (error) {
		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
			ip->i_ea_error = error;
		return (error);
	}

	if (ip->i_ea_area == NULL) {
		error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
		if (error)
			return (error);
		stand_alone = 1;
	} else {
		stand_alone = 0;
	}

	/* Calculate the length of the EA entry */
	if (ap->a_uio == NULL) {
		/* delete */
		ealength = eapad1 = ealen = eapad2 = 0;
	} else {
		ealen = ap->a_uio->uio_resid;
		ealength = sizeof(uint32_t) + 3 + strlen(ap->a_name);
		eapad1 = 8 - (ealength % 8);
		if (eapad1 == 8)
			eapad1 = 0;
		eapad2 = 8 - (ealen % 8);
		if (eapad2 == 8)
			eapad2 = 0;
		ealength += eapad1 + ealen + eapad2;
	}

	eae = malloc(ip->i_ea_len + ealength, M_TEMP, M_WAITOK);
	bcopy(ip->i_ea_area, eae, ip->i_ea_len);
	easize = ip->i_ea_len;

	olen = ffs_findextattr(eae, easize,
	    ap->a_attrnamespace, ap->a_name, &p, NULL);
	if (olen == -1 && ealength == 0) {
		/* delete but nonexistent */
		free(eae, M_TEMP);
		if (stand_alone)
			ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
		return (ENOATTR);
	}
	if (olen == -1) {
		/* new, append at end */
		p = eae + easize;
		easize += ealength;
	} else {
		bcopy(p, &ul, sizeof ul);
		i = p - eae + ul;
		if (ul != ealength) {
			bcopy(p + ul, p + ealength, easize - i);
			easize += (ealength - ul);
		}
	}
	if (easize > NXADDR * fs->fs_bsize) {
		free(eae, M_TEMP);
		if (stand_alone)
			ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
		else if (ip->i_ea_error == 0)
			ip->i_ea_error = ENOSPC;
		return (ENOSPC);
	}
	if (ealength != 0) {
		bcopy(&ealength, p, sizeof(ealength));
		p += sizeof(ealength);
		*p++ = ap->a_attrnamespace;
		*p++ = eapad2;
		*p++ = strlen(ap->a_name);
		strcpy(p, ap->a_name);
		p += strlen(ap->a_name);
		bzero(p, eapad1);
		p += eapad1;
		error = uiomove(p, ealen, ap->a_uio);
		if (error) {
			free(eae, M_TEMP);
			if (stand_alone)
				ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
			else if (ip->i_ea_error == 0)
				ip->i_ea_error = error;
			return (error);
		}
		p += ealen;
		bzero(p, eapad2);
	}
	p = ip->i_ea_area;
	ip->i_ea_area = eae;
	ip->i_ea_len = easize;
	free(p, M_TEMP);
	if (stand_alone)
		error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td);
	return (error);
}