/*
 * Copyright (c) 2002, 2003 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Marshall
 * Kirk McKusick and Network Associates Laboratories, the Security
 * Research Division of Network Associates, Inc. under DARPA/SPAWAR
 * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
 * research program
 *
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: @(#)ufs_readwrite.c	8.11 (Berkeley) 5/8/95
 * from: $FreeBSD: .../ufs/ufs_readwrite.c,v 1.96 2002/08/12 09:22:11 phk ...
 * @(#)ffs_vnops.c	8.15 (Berkeley) 5/14/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/bio.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/extattr.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/stat.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>

#include <ufs/ufs/extattr.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ufs/ufsmount.h>

#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>
#include "opt_directio.h"

#ifdef DIRECTIO
extern int	ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone);
#endif
static int	ffs_fsync(struct vop_fsync_args *);
static int	ffs_getpages(struct vop_getpages_args *);
static int	ffs_read(struct vop_read_args *);
static int	ffs_write(struct vop_write_args *);
static int	ffs_extread(struct vnode *vp, struct uio *uio, int ioflag);
static int	ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag,
		    struct ucred *cred);
static int	ffsext_strategy(struct vop_strategy_args *);
static int	ffs_closeextattr(struct vop_closeextattr_args *);
static int	ffs_deleteextattr(struct vop_deleteextattr_args *);
static int	ffs_getextattr(struct vop_getextattr_args *);
static int	ffs_listextattr(struct vop_listextattr_args *);
static int	ffs_openextattr(struct vop_openextattr_args *);
static int	ffs_setextattr(struct vop_setextattr_args *);

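/*
 * Three operation vectors are defined below: one for regular vnodes, one
 * for special devices, and one for fifos. All three share the fsync,
 * reallocblks and extended attribute entry points implemented in this
 * file; everything else falls through to the generic ufs_vnoperate*()
 * handlers. The VNODEOP_SET() invocations further down register these
 * tables with the VFS when the module is initialized.
 */
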
/* Global vfs data structures for ufs. */
vop_t **ffs_vnodeop_p;
static struct vnodeopv_entry_desc ffs_vnodeop_entries[] = {
	{ &vop_default_desc,		(vop_t *) ufs_vnoperate },
	{ &vop_fsync_desc,		(vop_t *) ffs_fsync },
	{ &vop_getpages_desc,		(vop_t *) ffs_getpages },
	{ &vop_read_desc,		(vop_t *) ffs_read },
	{ &vop_reallocblks_desc,	(vop_t *) ffs_reallocblks },
	{ &vop_write_desc,		(vop_t *) ffs_write },
	{ &vop_closeextattr_desc,	(vop_t *) ffs_closeextattr },
	{ &vop_deleteextattr_desc,	(vop_t *) ffs_deleteextattr },
	{ &vop_getextattr_desc,		(vop_t *) ffs_getextattr },
	{ &vop_listextattr_desc,	(vop_t *) ffs_listextattr },
	{ &vop_openextattr_desc,	(vop_t *) ffs_openextattr },
	{ &vop_setextattr_desc,		(vop_t *) ffs_setextattr },
	{ NULL, NULL }
};
static struct vnodeopv_desc ffs_vnodeop_opv_desc =
	{ &ffs_vnodeop_p, ffs_vnodeop_entries };

vop_t **ffs_specop_p;
static struct vnodeopv_entry_desc ffs_specop_entries[] = {
	{ &vop_default_desc,		(vop_t *) ufs_vnoperatespec },
	{ &vop_fsync_desc,		(vop_t *) ffs_fsync },
	{ &vop_reallocblks_desc,	(vop_t *) ffs_reallocblks },
	{ &vop_strategy_desc,		(vop_t *) ffsext_strategy },
	{ &vop_closeextattr_desc,	(vop_t *) ffs_closeextattr },
	{ &vop_deleteextattr_desc,	(vop_t *) ffs_deleteextattr },
	{ &vop_getextattr_desc,		(vop_t *) ffs_getextattr },
	{ &vop_listextattr_desc,	(vop_t *) ffs_listextattr },
	{ &vop_openextattr_desc,	(vop_t *) ffs_openextattr },
	{ &vop_setextattr_desc,		(vop_t *) ffs_setextattr },
	{ NULL, NULL }
};
static struct vnodeopv_desc ffs_specop_opv_desc =
	{ &ffs_specop_p, ffs_specop_entries };

vop_t **ffs_fifoop_p;
static struct vnodeopv_entry_desc ffs_fifoop_entries[] = {
	{ &vop_default_desc,		(vop_t *) ufs_vnoperatefifo },
	{ &vop_fsync_desc,		(vop_t *) ffs_fsync },
	{ &vop_reallocblks_desc,	(vop_t *) ffs_reallocblks },
	{ &vop_strategy_desc,		(vop_t *) ffsext_strategy },
	{ &vop_closeextattr_desc,	(vop_t *) ffs_closeextattr },
	{ &vop_deleteextattr_desc,	(vop_t *) ffs_deleteextattr },
	{ &vop_getextattr_desc,		(vop_t *) ffs_getextattr },
	{ &vop_listextattr_desc,	(vop_t *) ffs_listextattr },
	{ &vop_openextattr_desc,	(vop_t *) ffs_openextattr },
	{ &vop_setextattr_desc,		(vop_t *) ffs_setextattr },
	{ NULL, NULL }
};
static struct vnodeopv_desc ffs_fifoop_opv_desc =
	{ &ffs_fifoop_p, ffs_fifoop_entries };

VNODEOP_SET(ffs_vnodeop_opv_desc);
VNODEOP_SET(ffs_specop_opv_desc);
VNODEOP_SET(ffs_fifoop_opv_desc);

/*
 * Synch an open file.
 */
/* ARGSUSED */
static int
ffs_fsync(ap)
	struct vop_fsync_args /* {
		struct vnode *a_vp;
		struct ucred *a_cred;
		int a_waitfor;
		struct thread *a_td;
	} */ *ap;
{
	struct vnode *vp = ap->a_vp;
	struct inode *ip = VTOI(vp);
	struct buf *bp;
	struct buf *nbp;
	int s, error, wait, passes, skipmeta;
	ufs_lbn_t lbn;

	wait = (ap->a_waitfor == MNT_WAIT);
	if (vn_isdisk(vp, NULL)) {
		lbn = INT_MAX;
		if (vp->v_rdev->si_mountpoint != NULL &&
		    (vp->v_rdev->si_mountpoint->mnt_flag & MNT_SOFTDEP))
			softdep_fsync_mountdev(vp);
	} else {
		lbn = lblkno(ip->i_fs, (ip->i_size + ip->i_fs->fs_bsize - 1));
	}

	/*
	 * Flush all dirty buffers associated with a vnode.
	 */
189 */ 190 passes = NIADDR + 1; 191 skipmeta = 0; 192 if (wait) 193 skipmeta = 1; 194 s = splbio(); 195 VI_LOCK(vp); 196 loop: 197 TAILQ_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs) 198 bp->b_vflags &= ~BV_SCANNED; 199 for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { 200 nbp = TAILQ_NEXT(bp, b_vnbufs); 201 /* 202 * Reasons to skip this buffer: it has already been considered 203 * on this pass, this pass is the first time through on a 204 * synchronous flush request and the buffer being considered 205 * is metadata, the buffer has dependencies that will cause 206 * it to be redirtied and it has not already been deferred, 207 * or it is already being written. 208 */ 209 if ((bp->b_vflags & BV_SCANNED) != 0) 210 continue; 211 bp->b_vflags |= BV_SCANNED; 212 if ((skipmeta == 1 && bp->b_lblkno < 0)) 213 continue; 214 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) 215 continue; 216 if (!wait && LIST_FIRST(&bp->b_dep) != NULL && 217 (bp->b_flags & B_DEFERRED) == 0 && 218 buf_countdeps(bp, 0)) { 219 bp->b_flags |= B_DEFERRED; 220 BUF_UNLOCK(bp); 221 continue; 222 } 223 VI_UNLOCK(vp); 224 if ((bp->b_flags & B_DELWRI) == 0) 225 panic("ffs_fsync: not dirty"); 226 if (vp != bp->b_vp) 227 panic("ffs_fsync: vp != vp->b_vp"); 228 /* 229 * If this is a synchronous flush request, or it is not a 230 * file or device, start the write on this buffer immediatly. 231 */ 232 if (wait || (vp->v_type != VREG && vp->v_type != VBLK)) { 233 234 /* 235 * On our final pass through, do all I/O synchronously 236 * so that we can find out if our flush is failing 237 * because of write errors. 238 */ 239 if (passes > 0 || !wait) { 240 if ((bp->b_flags & B_CLUSTEROK) && !wait) { 241 (void) vfs_bio_awrite(bp); 242 } else { 243 bremfree(bp); 244 splx(s); 245 (void) bawrite(bp); 246 s = splbio(); 247 } 248 } else { 249 bremfree(bp); 250 splx(s); 251 if ((error = bwrite(bp)) != 0) 252 return (error); 253 s = splbio(); 254 } 255 } else if ((vp->v_type == VREG) && (bp->b_lblkno >= lbn)) { 256 /* 257 * If the buffer is for data that has been truncated 258 * off the file, then throw it away. 259 */ 260 bremfree(bp); 261 bp->b_flags |= B_INVAL | B_NOCACHE; 262 splx(s); 263 brelse(bp); 264 s = splbio(); 265 } else 266 vfs_bio_awrite(bp); 267 268 /* 269 * Since we may have slept during the I/O, we need 270 * to start from a known point. 271 */ 272 VI_LOCK(vp); 273 nbp = TAILQ_FIRST(&vp->v_dirtyblkhd); 274 } 275 /* 276 * If we were asked to do this synchronously, then go back for 277 * another pass, this time doing the metadata. 278 */ 279 if (skipmeta) { 280 skipmeta = 0; 281 goto loop; 282 } 283 284 if (wait) { 285 while (vp->v_numoutput) { 286 vp->v_iflag |= VI_BWAIT; 287 msleep((caddr_t)&vp->v_numoutput, VI_MTX(vp), 288 PRIBIO + 4, "ffsfsn", 0); 289 } 290 VI_UNLOCK(vp); 291 292 /* 293 * Ensure that any filesystem metatdata associated 294 * with the vnode has been written. 295 */ 296 splx(s); 297 if ((error = softdep_sync_metadata(ap)) != 0) 298 return (error); 299 s = splbio(); 300 301 VI_LOCK(vp); 302 if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) { 303 /* 304 * Block devices associated with filesystems may 305 * have new I/O requests posted for them even if 306 * the vnode is locked, so no amount of trying will 307 * get them clean. Thus we give block devices a 308 * good effort, then just give up. For all other file 309 * types, go around and try again until it is clean. 
310 */ 311 if (passes > 0) { 312 passes -= 1; 313 goto loop; 314 } 315 #ifdef DIAGNOSTIC 316 if (!vn_isdisk(vp, NULL)) 317 vprint("ffs_fsync: dirty", vp); 318 #endif 319 } 320 } 321 VI_UNLOCK(vp); 322 splx(s); 323 return (UFS_UPDATE(vp, wait)); 324 } 325 326 327 /* 328 * Vnode op for reading. 329 */ 330 /* ARGSUSED */ 331 static int 332 ffs_read(ap) 333 struct vop_read_args /* { 334 struct vnode *a_vp; 335 struct uio *a_uio; 336 int a_ioflag; 337 struct ucred *a_cred; 338 } */ *ap; 339 { 340 struct vnode *vp; 341 struct inode *ip; 342 struct uio *uio; 343 struct fs *fs; 344 struct buf *bp; 345 ufs_lbn_t lbn, nextlbn; 346 off_t bytesinfile; 347 long size, xfersize, blkoffset; 348 int error, orig_resid; 349 int seqcount; 350 int ioflag; 351 352 vp = ap->a_vp; 353 uio = ap->a_uio; 354 ioflag = ap->a_ioflag; 355 if (ap->a_ioflag & IO_EXT) 356 #ifdef notyet 357 return (ffs_extread(vp, uio, ioflag)); 358 #else 359 panic("ffs_read+IO_EXT"); 360 #endif 361 #ifdef DIRECTIO 362 if ((ioflag & IO_DIRECT) != 0) { 363 int workdone; 364 365 error = ffs_rawread(vp, uio, &workdone); 366 if (error != 0 || workdone != 0) 367 return error; 368 } 369 #endif 370 371 GIANT_REQUIRED; 372 373 seqcount = ap->a_ioflag >> IO_SEQSHIFT; 374 ip = VTOI(vp); 375 376 #ifdef DIAGNOSTIC 377 if (uio->uio_rw != UIO_READ) 378 panic("ffs_read: mode"); 379 380 if (vp->v_type == VLNK) { 381 if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen) 382 panic("ffs_read: short symlink"); 383 } else if (vp->v_type != VREG && vp->v_type != VDIR) 384 panic("ffs_read: type %d", vp->v_type); 385 #endif 386 orig_resid = uio->uio_resid; 387 KASSERT(orig_resid >= 0, ("ffs_read: uio->uio_resid < 0")); 388 if (orig_resid == 0) 389 return (0); 390 KASSERT(uio->uio_offset >= 0, ("ffs_read: uio->uio_offset < 0")); 391 fs = ip->i_fs; 392 if (uio->uio_offset < ip->i_size && 393 uio->uio_offset >= fs->fs_maxfilesize) 394 return (EOVERFLOW); 395 396 for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) { 397 if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0) 398 break; 399 lbn = lblkno(fs, uio->uio_offset); 400 nextlbn = lbn + 1; 401 402 /* 403 * size of buffer. The buffer representing the 404 * end of the file is rounded up to the size of 405 * the block type ( fragment or full block, 406 * depending ). 407 */ 408 size = blksize(fs, ip, lbn); 409 blkoffset = blkoff(fs, uio->uio_offset); 410 411 /* 412 * The amount we want to transfer in this iteration is 413 * one FS block less the amount of the data before 414 * our startpoint (duh!) 415 */ 416 xfersize = fs->fs_bsize - blkoffset; 417 418 /* 419 * But if we actually want less than the block, 420 * or the file doesn't have a whole block more of data, 421 * then use the lesser number. 422 */ 423 if (uio->uio_resid < xfersize) 424 xfersize = uio->uio_resid; 425 if (bytesinfile < xfersize) 426 xfersize = bytesinfile; 427 428 if (lblktosize(fs, nextlbn) >= ip->i_size) { 429 /* 430 * Don't do readahead if this is the end of the file. 431 */ 432 error = bread(vp, lbn, size, NOCRED, &bp); 433 } else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) { 434 /* 435 * Otherwise if we are allowed to cluster, 436 * grab as much as we can. 437 * 438 * XXX This may not be a win if we are not 439 * doing sequential access. 
440 */ 441 error = cluster_read(vp, ip->i_size, lbn, 442 size, NOCRED, uio->uio_resid, seqcount, &bp); 443 } else if (seqcount > 1) { 444 /* 445 * If we are NOT allowed to cluster, then 446 * if we appear to be acting sequentially, 447 * fire off a request for a readahead 448 * as well as a read. Note that the 4th and 5th 449 * arguments point to arrays of the size specified in 450 * the 6th argument. 451 */ 452 int nextsize = blksize(fs, ip, nextlbn); 453 error = breadn(vp, lbn, 454 size, &nextlbn, &nextsize, 1, NOCRED, &bp); 455 } else { 456 /* 457 * Failing all of the above, just read what the 458 * user asked for. Interestingly, the same as 459 * the first option above. 460 */ 461 error = bread(vp, lbn, size, NOCRED, &bp); 462 } 463 if (error) { 464 brelse(bp); 465 bp = NULL; 466 break; 467 } 468 469 /* 470 * If IO_DIRECT then set B_DIRECT for the buffer. This 471 * will cause us to attempt to release the buffer later on 472 * and will cause the buffer cache to attempt to free the 473 * underlying pages. 474 */ 475 if (ioflag & IO_DIRECT) 476 bp->b_flags |= B_DIRECT; 477 478 /* 479 * We should only get non-zero b_resid when an I/O error 480 * has occurred, which should cause us to break above. 481 * However, if the short read did not cause an error, 482 * then we want to ensure that we do not uiomove bad 483 * or uninitialized data. 484 */ 485 size -= bp->b_resid; 486 if (size < xfersize) { 487 if (size == 0) 488 break; 489 xfersize = size; 490 } 491 492 error = uiomove((char *)bp->b_data + blkoffset, 493 (int)xfersize, uio); 494 if (error) 495 break; 496 497 if ((ioflag & (IO_VMIO|IO_DIRECT)) && 498 (LIST_FIRST(&bp->b_dep) == NULL)) { 499 /* 500 * If there are no dependencies, and it's VMIO, 501 * then we don't need the buf, mark it available 502 * for freeing. The VM has the data. 503 */ 504 bp->b_flags |= B_RELBUF; 505 brelse(bp); 506 } else { 507 /* 508 * Otherwise let whoever 509 * made the request take care of 510 * freeing it. We just queue 511 * it onto another list. 512 */ 513 bqrelse(bp); 514 } 515 } 516 517 /* 518 * This can only happen in the case of an error 519 * because the loop above resets bp to NULL on each iteration 520 * and on normal completion has not set a new value into it. 521 * so it must have come from a 'break' statement 522 */ 523 if (bp != NULL) { 524 if ((ioflag & (IO_VMIO|IO_DIRECT)) && 525 (LIST_FIRST(&bp->b_dep) == NULL)) { 526 bp->b_flags |= B_RELBUF; 527 brelse(bp); 528 } else { 529 bqrelse(bp); 530 } 531 } 532 533 if ((error == 0 || uio->uio_resid != orig_resid) && 534 (vp->v_mount->mnt_flag & MNT_NOATIME) == 0) 535 ip->i_flag |= IN_ACCESS; 536 return (error); 537 } 538 539 /* 540 * Vnode op for writing. 
541 */ 542 static int 543 ffs_write(ap) 544 struct vop_write_args /* { 545 struct vnode *a_vp; 546 struct uio *a_uio; 547 int a_ioflag; 548 struct ucred *a_cred; 549 } */ *ap; 550 { 551 struct vnode *vp; 552 struct uio *uio; 553 struct inode *ip; 554 struct fs *fs; 555 struct buf *bp; 556 struct thread *td; 557 ufs_lbn_t lbn; 558 off_t osize; 559 int seqcount; 560 int blkoffset, error, extended, flags, ioflag, resid, size, xfersize; 561 562 vp = ap->a_vp; 563 uio = ap->a_uio; 564 ioflag = ap->a_ioflag; 565 if (ap->a_ioflag & IO_EXT) 566 #ifdef notyet 567 return (ffs_extwrite(vp, uio, ioflag, ap->a_cred)); 568 #else 569 panic("ffs_write+IO_EXT"); 570 #endif 571 572 GIANT_REQUIRED; 573 574 extended = 0; 575 seqcount = ap->a_ioflag >> IO_SEQSHIFT; 576 ip = VTOI(vp); 577 578 #ifdef DIAGNOSTIC 579 if (uio->uio_rw != UIO_WRITE) 580 panic("ffs_write: mode"); 581 #endif 582 583 switch (vp->v_type) { 584 case VREG: 585 if (ioflag & IO_APPEND) 586 uio->uio_offset = ip->i_size; 587 if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size) 588 return (EPERM); 589 /* FALLTHROUGH */ 590 case VLNK: 591 break; 592 case VDIR: 593 panic("ffs_write: dir write"); 594 break; 595 default: 596 panic("ffs_write: type %p %d (%d,%d)", vp, (int)vp->v_type, 597 (int)uio->uio_offset, 598 (int)uio->uio_resid 599 ); 600 } 601 602 KASSERT(uio->uio_resid >= 0, ("ffs_write: uio->uio_resid < 0")); 603 KASSERT(uio->uio_offset >= 0, ("ffs_write: uio->uio_offset < 0")); 604 fs = ip->i_fs; 605 if ((uoff_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize) 606 return (EFBIG); 607 /* 608 * Maybe this should be above the vnode op call, but so long as 609 * file servers have no limits, I don't think it matters. 610 */ 611 td = uio->uio_td; 612 if (vp->v_type == VREG && td != NULL) { 613 PROC_LOCK(td->td_proc); 614 if (uio->uio_offset + uio->uio_resid > 615 lim_cur(td->td_proc, RLIMIT_FSIZE)) { 616 psignal(td->td_proc, SIGXFSZ); 617 PROC_UNLOCK(td->td_proc); 618 return (EFBIG); 619 } 620 PROC_UNLOCK(td->td_proc); 621 } 622 623 resid = uio->uio_resid; 624 osize = ip->i_size; 625 if (seqcount > BA_SEQMAX) 626 flags = BA_SEQMAX << BA_SEQSHIFT; 627 else 628 flags = seqcount << BA_SEQSHIFT; 629 if ((ioflag & IO_SYNC) && !DOINGASYNC(vp)) 630 flags |= IO_SYNC; 631 632 for (error = 0; uio->uio_resid > 0;) { 633 lbn = lblkno(fs, uio->uio_offset); 634 blkoffset = blkoff(fs, uio->uio_offset); 635 xfersize = fs->fs_bsize - blkoffset; 636 if (uio->uio_resid < xfersize) 637 xfersize = uio->uio_resid; 638 if (uio->uio_offset + xfersize > ip->i_size) 639 vnode_pager_setsize(vp, uio->uio_offset + xfersize); 640 641 /* 642 * We must perform a read-before-write if the transfer size 643 * does not cover the entire buffer. 644 */ 645 if (fs->fs_bsize > xfersize) 646 flags |= BA_CLRBUF; 647 else 648 flags &= ~BA_CLRBUF; 649 /* XXX is uio->uio_offset the right thing here? */ 650 error = UFS_BALLOC(vp, uio->uio_offset, xfersize, 651 ap->a_cred, flags, &bp); 652 if (error != 0) 653 break; 654 /* 655 * If the buffer is not valid we have to clear out any 656 * garbage data from the pages instantiated for the buffer. 657 * If we do not, a failed uiomove() during a write can leave 658 * the prior contents of the pages exposed to a userland 659 * mmap(). XXX deal with uiomove() errors a better way. 
660 */ 661 if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize) 662 vfs_bio_clrbuf(bp); 663 if (ioflag & IO_DIRECT) 664 bp->b_flags |= B_DIRECT; 665 if ((ioflag & (IO_SYNC|IO_INVAL)) == (IO_SYNC|IO_INVAL)) 666 bp->b_flags |= B_NOCACHE; 667 668 if (uio->uio_offset + xfersize > ip->i_size) { 669 ip->i_size = uio->uio_offset + xfersize; 670 DIP_SET(ip, i_size, ip->i_size); 671 extended = 1; 672 } 673 674 size = blksize(fs, ip, lbn) - bp->b_resid; 675 if (size < xfersize) 676 xfersize = size; 677 678 error = 679 uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio); 680 if ((ioflag & (IO_VMIO|IO_DIRECT)) && 681 (LIST_FIRST(&bp->b_dep) == NULL)) { 682 bp->b_flags |= B_RELBUF; 683 } 684 685 /* 686 * If IO_SYNC each buffer is written synchronously. Otherwise 687 * if we have a severe page deficiency write the buffer 688 * asynchronously. Otherwise try to cluster, and if that 689 * doesn't do it then either do an async write (if O_DIRECT), 690 * or a delayed write (if not). 691 */ 692 if (ioflag & IO_SYNC) { 693 (void)bwrite(bp); 694 } else if (vm_page_count_severe() || 695 buf_dirty_count_severe() || 696 (ioflag & IO_ASYNC)) { 697 bp->b_flags |= B_CLUSTEROK; 698 bawrite(bp); 699 } else if (xfersize + blkoffset == fs->fs_bsize) { 700 if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) { 701 bp->b_flags |= B_CLUSTEROK; 702 cluster_write(bp, ip->i_size, seqcount); 703 } else { 704 bawrite(bp); 705 } 706 } else if (ioflag & IO_DIRECT) { 707 bp->b_flags |= B_CLUSTEROK; 708 bawrite(bp); 709 } else { 710 bp->b_flags |= B_CLUSTEROK; 711 bdwrite(bp); 712 } 713 if (error || xfersize == 0) 714 break; 715 ip->i_flag |= IN_CHANGE | IN_UPDATE; 716 } 717 /* 718 * If we successfully wrote any data, and we are not the superuser 719 * we clear the setuid and setgid bits as a precaution against 720 * tampering. 721 */ 722 if (resid > uio->uio_resid && ap->a_cred && 723 suser_cred(ap->a_cred, SUSER_ALLOWJAIL)) { 724 ip->i_mode &= ~(ISUID | ISGID); 725 DIP_SET(ip, i_mode, ip->i_mode); 726 } 727 if (resid > uio->uio_resid) 728 VN_KNOTE_UNLOCKED(vp, NOTE_WRITE | (extended ? NOTE_EXTEND : 0)); 729 if (error) { 730 if (ioflag & IO_UNIT) { 731 (void)UFS_TRUNCATE(vp, osize, 732 IO_NORMAL | (ioflag & IO_SYNC), 733 ap->a_cred, uio->uio_td); 734 uio->uio_offset -= resid - uio->uio_resid; 735 uio->uio_resid = resid; 736 } 737 } else if (resid > uio->uio_resid && (ioflag & IO_SYNC)) 738 error = UFS_UPDATE(vp, 1); 739 return (error); 740 } 741 742 /* 743 * get page routine 744 */ 745 static int 746 ffs_getpages(ap) 747 struct vop_getpages_args *ap; 748 { 749 off_t foff, physoffset; 750 int i, size, bsize; 751 struct vnode *dp, *vp; 752 vm_object_t obj; 753 vm_pindex_t pindex; 754 vm_page_t mreq; 755 int bbackwards, bforwards; 756 int pbackwards, pforwards; 757 int firstpage; 758 ufs2_daddr_t reqblkno, reqlblkno; 759 int poff; 760 int pcount; 761 int rtval; 762 int pagesperblock; 763 764 GIANT_REQUIRED; 765 766 pcount = round_page(ap->a_count) / PAGE_SIZE; 767 mreq = ap->a_m[ap->a_reqpage]; 768 769 /* 770 * if ANY DEV_BSIZE blocks are valid on a large filesystem block, 771 * then the entire page is valid. Since the page may be mapped, 772 * user programs might reference data beyond the actual end of file 773 * occuring within the page. We have to zero that data. 
774 */ 775 VM_OBJECT_LOCK(mreq->object); 776 if (mreq->valid) { 777 if (mreq->valid != VM_PAGE_BITS_ALL) 778 vm_page_zero_invalid(mreq, TRUE); 779 vm_page_lock_queues(); 780 for (i = 0; i < pcount; i++) { 781 if (i != ap->a_reqpage) { 782 vm_page_free(ap->a_m[i]); 783 } 784 } 785 vm_page_unlock_queues(); 786 VM_OBJECT_UNLOCK(mreq->object); 787 return VM_PAGER_OK; 788 } 789 VM_OBJECT_UNLOCK(mreq->object); 790 vp = ap->a_vp; 791 obj = vp->v_object; 792 bsize = vp->v_mount->mnt_stat.f_iosize; 793 pindex = mreq->pindex; 794 foff = IDX_TO_OFF(pindex) /* + ap->a_offset should be zero */; 795 796 if (bsize < PAGE_SIZE) 797 return vnode_pager_generic_getpages(ap->a_vp, ap->a_m, 798 ap->a_count, 799 ap->a_reqpage); 800 801 /* 802 * foff is the file offset of the required page 803 * reqlblkno is the logical block that contains the page 804 * poff is the index of the page into the logical block 805 */ 806 reqlblkno = foff / bsize; 807 poff = (foff % bsize) / PAGE_SIZE; 808 809 dp = VTOI(vp)->i_devvp; 810 if (ufs_bmaparray(vp, reqlblkno, &reqblkno, 0, &bforwards, &bbackwards) 811 || (reqblkno == -1)) { 812 VM_OBJECT_LOCK(obj); 813 vm_page_lock_queues(); 814 for(i = 0; i < pcount; i++) { 815 if (i != ap->a_reqpage) 816 vm_page_free(ap->a_m[i]); 817 } 818 vm_page_unlock_queues(); 819 if (reqblkno == -1) { 820 if ((mreq->flags & PG_ZERO) == 0) 821 pmap_zero_page(mreq); 822 vm_page_undirty(mreq); 823 mreq->valid = VM_PAGE_BITS_ALL; 824 VM_OBJECT_UNLOCK(obj); 825 return VM_PAGER_OK; 826 } else { 827 VM_OBJECT_UNLOCK(obj); 828 return VM_PAGER_ERROR; 829 } 830 } 831 832 physoffset = (off_t)reqblkno * DEV_BSIZE + poff * PAGE_SIZE; 833 pagesperblock = bsize / PAGE_SIZE; 834 /* 835 * find the first page that is contiguous... 836 * note that pbackwards is the number of pages that are contiguous 837 * backwards. 838 */ 839 firstpage = 0; 840 if (ap->a_count) { 841 pbackwards = poff + bbackwards * pagesperblock; 842 if (ap->a_reqpage > pbackwards) { 843 firstpage = ap->a_reqpage - pbackwards; 844 VM_OBJECT_LOCK(obj); 845 vm_page_lock_queues(); 846 for(i=0;i<firstpage;i++) 847 vm_page_free(ap->a_m[i]); 848 vm_page_unlock_queues(); 849 VM_OBJECT_UNLOCK(obj); 850 } 851 852 /* 853 * pforwards is the number of pages that are contiguous 854 * after the current page. 855 */ 856 pforwards = (pagesperblock - (poff + 1)) + 857 bforwards * pagesperblock; 858 if (pforwards < (pcount - (ap->a_reqpage + 1))) { 859 VM_OBJECT_LOCK(obj); 860 vm_page_lock_queues(); 861 for( i = ap->a_reqpage + pforwards + 1; i < pcount; i++) 862 vm_page_free(ap->a_m[i]); 863 vm_page_unlock_queues(); 864 VM_OBJECT_UNLOCK(obj); 865 pcount = ap->a_reqpage + pforwards + 1; 866 } 867 868 /* 869 * number of pages for I/O corrected for the non-contig pages at 870 * the beginning of the array. 871 */ 872 pcount -= firstpage; 873 } 874 875 /* 876 * calculate the size of the transfer 877 */ 878 879 size = pcount * PAGE_SIZE; 880 881 if ((IDX_TO_OFF(ap->a_m[firstpage]->pindex) + size) > 882 obj->un_pager.vnp.vnp_size) 883 size = obj->un_pager.vnp.vnp_size - 884 IDX_TO_OFF(ap->a_m[firstpage]->pindex); 885 886 physoffset -= foff; 887 rtval = VOP_GETPAGES(dp, &ap->a_m[firstpage], size, 888 (ap->a_reqpage - firstpage), physoffset); 889 890 return (rtval); 891 } 892 893 /* 894 * Extended attribute area reading. 
895 */ 896 static int 897 ffs_extread(struct vnode *vp, struct uio *uio, int ioflag) 898 { 899 struct inode *ip; 900 struct ufs2_dinode *dp; 901 struct fs *fs; 902 struct buf *bp; 903 ufs_lbn_t lbn, nextlbn; 904 off_t bytesinfile; 905 long size, xfersize, blkoffset; 906 int error, orig_resid; 907 908 GIANT_REQUIRED; 909 910 ip = VTOI(vp); 911 fs = ip->i_fs; 912 dp = ip->i_din2; 913 914 #ifdef DIAGNOSTIC 915 if (uio->uio_rw != UIO_READ || fs->fs_magic != FS_UFS2_MAGIC) 916 panic("ffs_extread: mode"); 917 918 #endif 919 orig_resid = uio->uio_resid; 920 KASSERT(orig_resid >= 0, ("ffs_extread: uio->uio_resid < 0")); 921 if (orig_resid == 0) 922 return (0); 923 KASSERT(uio->uio_offset >= 0, ("ffs_extread: uio->uio_offset < 0")); 924 925 for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) { 926 if ((bytesinfile = dp->di_extsize - uio->uio_offset) <= 0) 927 break; 928 lbn = lblkno(fs, uio->uio_offset); 929 nextlbn = lbn + 1; 930 931 /* 932 * size of buffer. The buffer representing the 933 * end of the file is rounded up to the size of 934 * the block type ( fragment or full block, 935 * depending ). 936 */ 937 size = sblksize(fs, dp->di_extsize, lbn); 938 blkoffset = blkoff(fs, uio->uio_offset); 939 940 /* 941 * The amount we want to transfer in this iteration is 942 * one FS block less the amount of the data before 943 * our startpoint (duh!) 944 */ 945 xfersize = fs->fs_bsize - blkoffset; 946 947 /* 948 * But if we actually want less than the block, 949 * or the file doesn't have a whole block more of data, 950 * then use the lesser number. 951 */ 952 if (uio->uio_resid < xfersize) 953 xfersize = uio->uio_resid; 954 if (bytesinfile < xfersize) 955 xfersize = bytesinfile; 956 957 if (lblktosize(fs, nextlbn) >= dp->di_extsize) { 958 /* 959 * Don't do readahead if this is the end of the info. 960 */ 961 error = bread(vp, -1 - lbn, size, NOCRED, &bp); 962 } else { 963 /* 964 * If we have a second block, then 965 * fire off a request for a readahead 966 * as well as a read. Note that the 4th and 5th 967 * arguments point to arrays of the size specified in 968 * the 6th argument. 969 */ 970 int nextsize = sblksize(fs, dp->di_extsize, nextlbn); 971 972 nextlbn = -1 - nextlbn; 973 error = breadn(vp, -1 - lbn, 974 size, &nextlbn, &nextsize, 1, NOCRED, &bp); 975 } 976 if (error) { 977 brelse(bp); 978 bp = NULL; 979 break; 980 } 981 982 /* 983 * If IO_DIRECT then set B_DIRECT for the buffer. This 984 * will cause us to attempt to release the buffer later on 985 * and will cause the buffer cache to attempt to free the 986 * underlying pages. 987 */ 988 if (ioflag & IO_DIRECT) 989 bp->b_flags |= B_DIRECT; 990 991 /* 992 * We should only get non-zero b_resid when an I/O error 993 * has occurred, which should cause us to break above. 994 * However, if the short read did not cause an error, 995 * then we want to ensure that we do not uiomove bad 996 * or uninitialized data. 997 */ 998 size -= bp->b_resid; 999 if (size < xfersize) { 1000 if (size == 0) 1001 break; 1002 xfersize = size; 1003 } 1004 1005 error = uiomove((char *)bp->b_data + blkoffset, 1006 (int)xfersize, uio); 1007 if (error) 1008 break; 1009 1010 if ((ioflag & (IO_VMIO|IO_DIRECT)) && 1011 (LIST_FIRST(&bp->b_dep) == NULL)) { 1012 /* 1013 * If there are no dependencies, and it's VMIO, 1014 * then we don't need the buf, mark it available 1015 * for freeing. The VM has the data. 1016 */ 1017 bp->b_flags |= B_RELBUF; 1018 brelse(bp); 1019 } else { 1020 /* 1021 * Otherwise let whoever 1022 * made the request take care of 1023 * freeing it. 
			bqrelse(bp);
		}
	}

	/*
	 * This can only happen in the case of an error,
	 * because the loop above resets bp to NULL on each iteration
	 * and on normal completion has not set a new value into it,
	 * so it must have come from a 'break' statement.
	 */
	if (bp != NULL) {
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_FIRST(&bp->b_dep) == NULL)) {
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			bqrelse(bp);
		}
	}

	if ((error == 0 || uio->uio_resid != orig_resid) &&
	    (vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
		ip->i_flag |= IN_ACCESS;
	return (error);
}

/*
 * Extended attribute area writing.
 */
static int
ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *ucred)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn;
	off_t osize;
	int blkoffset, error, flags, resid, size, xfersize;

	GIANT_REQUIRED;

	ip = VTOI(vp);
	fs = ip->i_fs;
	dp = ip->i_din2;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_WRITE || fs->fs_magic != FS_UFS2_MAGIC)
		panic("ffs_extwrite: mode");
#endif

	if (ioflag & IO_APPEND)
		uio->uio_offset = dp->di_extsize;
	KASSERT(uio->uio_offset >= 0, ("ffs_extwrite: uio->uio_offset < 0"));
	KASSERT(uio->uio_resid >= 0, ("ffs_extwrite: uio->uio_resid < 0"));
	if ((uoff_t)uio->uio_offset + uio->uio_resid > NXADDR * fs->fs_bsize)
		return (EFBIG);

	resid = uio->uio_resid;
	osize = dp->di_extsize;
	flags = IO_EXT;
	if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
		flags |= IO_SYNC;

	for (error = 0; uio->uio_resid > 0;) {
		lbn = lblkno(fs, uio->uio_offset);
		blkoffset = blkoff(fs, uio->uio_offset);
		xfersize = fs->fs_bsize - blkoffset;
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;

		/*
		 * We must perform a read-before-write if the transfer size
		 * does not cover the entire buffer.
		 */
		if (fs->fs_bsize > xfersize)
			flags |= BA_CLRBUF;
		else
			flags &= ~BA_CLRBUF;
		error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
		    ucred, flags, &bp);
		if (error != 0)
			break;
		/*
		 * If the buffer is not valid we have to clear out any
		 * garbage data from the pages instantiated for the buffer.
		 * If we do not, a failed uiomove() during a write can leave
		 * the prior contents of the pages exposed to a userland
		 * mmap(). XXX deal with uiomove() errors a better way.
		 */
		if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
			vfs_bio_clrbuf(bp);
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;

		if (uio->uio_offset + xfersize > dp->di_extsize)
			dp->di_extsize = uio->uio_offset + xfersize;

		size = sblksize(fs, dp->di_extsize, lbn) - bp->b_resid;
		if (size < xfersize)
			xfersize = size;

		error =
		    uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_FIRST(&bp->b_dep) == NULL)) {
			bp->b_flags |= B_RELBUF;
		}

		/*
		 * If IO_SYNC each buffer is written synchronously. Otherwise
		 * if we have a severe page deficiency write the buffer
		 * asynchronously. Otherwise try to cluster, and if that
		 * doesn't do it then either do an async write (if O_DIRECT),
		 * or a delayed write (if not).
		 */
		if (ioflag & IO_SYNC) {
			(void)bwrite(bp);
		} else if (vm_page_count_severe() ||
		    buf_dirty_count_severe() ||
		    xfersize + blkoffset == fs->fs_bsize ||
		    (ioflag & (IO_ASYNC | IO_DIRECT)))
			bawrite(bp);
		else
			bdwrite(bp);
		if (error || xfersize == 0)
			break;
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	}
	/*
	 * If we successfully wrote any data, and we are not the superuser
	 * we clear the setuid and setgid bits as a precaution against
	 * tampering.
	 */
	if (resid > uio->uio_resid && ucred &&
	    suser_cred(ucred, SUSER_ALLOWJAIL)) {
		ip->i_mode &= ~(ISUID | ISGID);
		dp->di_mode = ip->i_mode;
	}
	if (error) {
		if (ioflag & IO_UNIT) {
			(void)UFS_TRUNCATE(vp, osize,
			    IO_EXT | (ioflag&IO_SYNC), ucred, uio->uio_td);
			uio->uio_offset -= resid - uio->uio_resid;
			uio->uio_resid = resid;
		}
	} else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
		error = UFS_UPDATE(vp, 1);
	return (error);
}


/*
 * Locate a particular EA (nspace:name) in the area (ptr:length), and return
 * the length of the EA, and possibly the pointer to the entry and to the data.
 */
static int
ffs_findextattr(u_char *ptr, u_int length, int nspace, const char *name,
    u_char **eap, u_char **eac)
{
	u_char *p, *pe, *pn, *p0;
	int eapad1, eapad2, ealength, ealen, nlen;
	uint32_t ul;

	pe = ptr + length;
	nlen = strlen(name);

	for (p = ptr; p < pe; p = pn) {
		p0 = p;
		bcopy(p, &ul, sizeof(ul));
		pn = p + ul;
		/* make sure this entry is complete */
		if (pn > pe)
			break;
		p += sizeof(uint32_t);
		if (*p != nspace)
			continue;
		p++;
		eapad2 = *p++;
		if (*p != nlen)
			continue;
		p++;
		if (bcmp(p, name, nlen))
			continue;
		ealength = sizeof(uint32_t) + 3 + nlen;
		eapad1 = 8 - (ealength % 8);
		if (eapad1 == 8)
			eapad1 = 0;
		ealength += eapad1;
		ealen = ul - ealength - eapad2;
		p += nlen + eapad1;
		if (eap != NULL)
			*eap = p0;
		if (eac != NULL)
			*eac = p;
		return (ealen);
	}
	return(-1);
}

static int
ffs_rdextattr(u_char **p, struct vnode *vp, struct thread *td, int extra)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	struct uio luio;
	struct iovec liovec;
	int easize, error;
	u_char *eae;

	ip = VTOI(vp);
	dp = ip->i_din2;
	easize = dp->di_extsize;

	eae = malloc(easize + extra, M_TEMP, M_WAITOK);

	liovec.iov_base = eae;
	liovec.iov_len = easize;
	luio.uio_iov = &liovec;
	luio.uio_iovcnt = 1;
	luio.uio_offset = 0;
	luio.uio_resid = easize;
	luio.uio_segflg = UIO_SYSSPACE;
	luio.uio_rw = UIO_READ;
	luio.uio_td = td;

	error = ffs_extread(vp, &luio, IO_EXT | IO_SYNC);
	if (error) {
		free(eae, M_TEMP);
		return(error);
	}
	*p = eae;
	return (0);
}

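/*
 * The extended attribute operations below work on an in-core snapshot of
 * the attribute area: ffs_open_ea() reads the whole area into
 * ip->i_ea_area (ip->i_ea_len bytes), the individual operations edit a
 * malloc'd copy and swap it in, and ffs_close_ea() either discards the
 * snapshot or, on commit, writes it back with ffs_extwrite() (truncating
 * the on-disk area first if it shrank).
 */
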
static int
ffs_open_ea(struct vnode *vp, struct ucred *cred, struct thread *td)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	int error;

	ip = VTOI(vp);

	if (ip->i_ea_area != NULL)
		return (EBUSY);
	dp = ip->i_din2;
	error = ffs_rdextattr(&ip->i_ea_area, vp, td, 0);
	if (error)
		return (error);
	ip->i_ea_len = dp->di_extsize;
	ip->i_ea_error = 0;
	return (0);
}

/*
 * Vnode extattr transaction commit/abort
 */
static int
ffs_close_ea(struct vnode *vp, int commit, struct ucred *cred, struct thread *td)
{
	struct inode *ip;
	struct uio luio;
	struct iovec liovec;
	int error;
	struct ufs2_dinode *dp;

	ip = VTOI(vp);
	if (ip->i_ea_area == NULL)
		return (EINVAL);
	dp = ip->i_din2;
	error = ip->i_ea_error;
	if (commit && error == 0) {
		if (cred == NOCRED)
			cred = vp->v_mount->mnt_cred;
		liovec.iov_base = ip->i_ea_area;
		liovec.iov_len = ip->i_ea_len;
		luio.uio_iov = &liovec;
		luio.uio_iovcnt = 1;
		luio.uio_offset = 0;
		luio.uio_resid = ip->i_ea_len;
		luio.uio_segflg = UIO_SYSSPACE;
		luio.uio_rw = UIO_WRITE;
		luio.uio_td = td;
		/* XXX: I'm not happy about truncating to zero size */
		if (ip->i_ea_len < dp->di_extsize)
			error = ffs_truncate(vp, 0, IO_EXT, cred, td);
		error = ffs_extwrite(vp, &luio, IO_EXT | IO_SYNC, cred);
	}
	free(ip->i_ea_area, M_TEMP);
	ip->i_ea_area = NULL;
	ip->i_ea_len = 0;
	ip->i_ea_error = 0;
	return (error);
}

/*
 * Vnode extattr strategy routine for special devices and fifos.
 *
 * We need to check for a read or write of the external attributes.
 * Otherwise we just fall through and do the usual thing.
 */
static int
ffsext_strategy(struct vop_strategy_args *ap)
/*
struct vop_strategy_args {
	struct vnodeop_desc *a_desc;
	struct vnode *a_vp;
	struct buf *a_bp;
};
*/
{
	struct vnode *vp;
	daddr_t lbn;

	KASSERT(ap->a_vp == ap->a_bp->b_vp, ("%s(%p != %p)",
	    __func__, ap->a_vp, ap->a_bp->b_vp));
	vp = ap->a_vp;
	lbn = ap->a_bp->b_lblkno;
	if (VTOI(vp)->i_fs->fs_magic == FS_UFS2_MAGIC &&
	    lbn < 0 && lbn >= -NXADDR)
		return (ufs_vnoperate((struct vop_generic_args *)ap));
	if (vp->v_type == VFIFO)
		return (ufs_vnoperatefifo((struct vop_generic_args *)ap));
	return (ufs_vnoperatespec((struct vop_generic_args *)ap));
}

/*
 * Vnode extattr transaction start.
 */
static int
ffs_openextattr(struct vop_openextattr_args *ap)
/*
struct vop_openextattr_args {
	struct vnodeop_desc *a_desc;
	struct vnode *a_vp;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;
	if (fs->fs_magic == FS_UFS1_MAGIC)
		return (ufs_vnoperate((struct vop_generic_args *)ap));

	if (ap->a_vp->v_type == VCHR)
		return (EOPNOTSUPP);

	return (ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td));
}


/*
 * Vnode extattr transaction commit/abort
 */
static int
ffs_closeextattr(struct vop_closeextattr_args *ap)
/*
struct vop_closeextattr_args {
	struct vnodeop_desc *a_desc;
	struct vnode *a_vp;
	int a_commit;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;
	if (fs->fs_magic == FS_UFS1_MAGIC)
		return (ufs_vnoperate((struct vop_generic_args *)ap));

	if (ap->a_vp->v_type == VCHR)
		return (EOPNOTSUPP);

	return (ffs_close_ea(ap->a_vp, ap->a_commit, ap->a_cred, ap->a_td));
}

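/*
 * Each of the attribute operations below can run either inside an
 * open/close extattr transaction set up by the caller or on its own:
 * when no EA snapshot is open yet (ip->i_ea_area == NULL) the routine
 * opens one itself, notes that in its local "stand_alone" flag, and
 * commits or aborts it before returning; otherwise errors are only
 * recorded in ip->i_ea_error and the caller's close decides the outcome.
 */
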
/*
 * Vnode operation to remove a named attribute.
 */
static int
ffs_deleteextattr(struct vop_deleteextattr_args *ap)
/*
vop_deleteextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;
	uint32_t ealength, ul;
	int ealen, olen, eapad1, eapad2, error, i, easize;
	u_char *eae, *p;
	int stand_alone;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (fs->fs_magic == FS_UFS1_MAGIC)
		return (ufs_vnoperate((struct vop_generic_args *)ap));

	if (ap->a_vp->v_type == VCHR)
		return (EOPNOTSUPP);

	if (strlen(ap->a_name) == 0)
		return (EINVAL);

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, IWRITE);
	if (error) {
		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
			ip->i_ea_error = error;
		return (error);
	}

	if (ip->i_ea_area == NULL) {
		error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
		if (error)
			return (error);
		stand_alone = 1;
	} else {
		stand_alone = 0;
	}

	ealength = eapad1 = ealen = eapad2 = 0;

	eae = malloc(ip->i_ea_len, M_TEMP, M_WAITOK);
	bcopy(ip->i_ea_area, eae, ip->i_ea_len);
	easize = ip->i_ea_len;

	olen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
	    &p, NULL);
	if (olen == -1) {
		/* delete but nonexistent */
		free(eae, M_TEMP);
		if (stand_alone)
			ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
		return(ENOATTR);
	}
	bcopy(p, &ul, sizeof ul);
	i = p - eae + ul;
	if (ul != ealength) {
		bcopy(p + ul, p + ealength, easize - i);
		easize += (ealength - ul);
	}
	if (easize > NXADDR * fs->fs_bsize) {
		free(eae, M_TEMP);
		if (stand_alone)
			ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
		else if (ip->i_ea_error == 0)
			ip->i_ea_error = ENOSPC;
		return(ENOSPC);
	}
	p = ip->i_ea_area;
	ip->i_ea_area = eae;
	ip->i_ea_len = easize;
	free(p, M_TEMP);
	if (stand_alone)
		error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td);
	return(error);
}

/*
 * Vnode operation to retrieve a named extended attribute.
 */
static int
ffs_getextattr(struct vop_getextattr_args *ap)
/*
vop_getextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	INOUT struct uio *a_uio;
	OUT size_t *a_size;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;
	u_char *eae, *p;
	unsigned easize;
	int error, ealen, stand_alone;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (fs->fs_magic == FS_UFS1_MAGIC)
		return (ufs_vnoperate((struct vop_generic_args *)ap));

	if (ap->a_vp->v_type == VCHR)
		return (EOPNOTSUPP);

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, IREAD);
	if (error)
		return (error);

	if (ip->i_ea_area == NULL) {
		error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
		if (error)
			return (error);
		stand_alone = 1;
	} else {
		stand_alone = 0;
	}
	eae = ip->i_ea_area;
	easize = ip->i_ea_len;

	ealen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
	    NULL, &p);
	if (ealen >= 0) {
		error = 0;
		if (ap->a_size != NULL)
			*ap->a_size = ealen;
		else if (ap->a_uio != NULL)
			error = uiomove(p, ealen, ap->a_uio);
	} else
		error = ENOATTR;
	if (stand_alone)
		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
	return(error);
}

/*
 * Vnode operation to retrieve extended attributes on a vnode.
 */
static int
ffs_listextattr(struct vop_listextattr_args *ap)
/*
vop_listextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	INOUT struct uio *a_uio;
	OUT size_t *a_size;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;
	u_char *eae, *p, *pe, *pn;
	unsigned easize;
	uint32_t ul;
	int error, ealen, stand_alone;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (fs->fs_magic == FS_UFS1_MAGIC)
		return (ufs_vnoperate((struct vop_generic_args *)ap));

	if (ap->a_vp->v_type == VCHR)
		return (EOPNOTSUPP);

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, IREAD);
	if (error)
		return (error);

	if (ip->i_ea_area == NULL) {
		error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
		if (error)
			return (error);
		stand_alone = 1;
	} else {
		stand_alone = 0;
	}
	eae = ip->i_ea_area;
	easize = ip->i_ea_len;

	error = 0;
	if (ap->a_size != NULL)
		*ap->a_size = 0;
	pe = eae + easize;
	for (p = eae; error == 0 && p < pe; p = pn) {
		bcopy(p, &ul, sizeof(ul));
		pn = p + ul;
		if (pn > pe)
			break;
		p += sizeof(ul);
		if (*p++ != ap->a_attrnamespace)
			continue;
		p++;	/* pad2 */
		ealen = *p;
		if (ap->a_size != NULL) {
			*ap->a_size += ealen + 1;
		} else if (ap->a_uio != NULL) {
			error = uiomove(p, ealen + 1, ap->a_uio);
		}
	}
	if (stand_alone)
		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
	return(error);
}

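/*
 * Layout of a single extended attribute record, as assembled by
 * ffs_setextattr() below and parsed by ffs_findextattr() above:
 * a 32-bit total record length, a namespace byte, a byte giving the
 * amount of padding after the content (eapad2), a name-length byte,
 * the name itself, zero padding up to an 8-byte boundary (eapad1),
 * the attribute content, and finally the eapad2 padding bytes.
 */
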
/*
 * Vnode operation to set a named attribute.
 */
static int
ffs_setextattr(struct vop_setextattr_args *ap)
/*
vop_setextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	INOUT struct uio *a_uio;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;
	uint32_t ealength, ul;
	int ealen, olen, eapad1, eapad2, error, i, easize;
	u_char *eae, *p;
	int stand_alone;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (fs->fs_magic == FS_UFS1_MAGIC)
		return (ufs_vnoperate((struct vop_generic_args *)ap));

	if (ap->a_vp->v_type == VCHR)
		return (EOPNOTSUPP);

	if (strlen(ap->a_name) == 0)
		return (EINVAL);

	/* XXX Now unsupported API to delete EAs using NULL uio. */
	if (ap->a_uio == NULL)
		return (EOPNOTSUPP);

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, IWRITE);
	if (error) {
		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
			ip->i_ea_error = error;
		return (error);
	}

	if (ip->i_ea_area == NULL) {
		error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
		if (error)
			return (error);
		stand_alone = 1;
	} else {
		stand_alone = 0;
	}

	ealen = ap->a_uio->uio_resid;
	ealength = sizeof(uint32_t) + 3 + strlen(ap->a_name);
	eapad1 = 8 - (ealength % 8);
	if (eapad1 == 8)
		eapad1 = 0;
	eapad2 = 8 - (ealen % 8);
	if (eapad2 == 8)
		eapad2 = 0;
	ealength += eapad1 + ealen + eapad2;

	eae = malloc(ip->i_ea_len + ealength, M_TEMP, M_WAITOK);
	bcopy(ip->i_ea_area, eae, ip->i_ea_len);
	easize = ip->i_ea_len;

	olen = ffs_findextattr(eae, easize,
	    ap->a_attrnamespace, ap->a_name, &p, NULL);
	if (olen == -1) {
		/* new, append at end */
		p = eae + easize;
		easize += ealength;
	} else {
		bcopy(p, &ul, sizeof ul);
		i = p - eae + ul;
		if (ul != ealength) {
			bcopy(p + ul, p + ealength, easize - i);
			easize += (ealength - ul);
		}
	}
	if (easize > NXADDR * fs->fs_bsize) {
		free(eae, M_TEMP);
		if (stand_alone)
			ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
		else if (ip->i_ea_error == 0)
			ip->i_ea_error = ENOSPC;
		return(ENOSPC);
	}
	bcopy(&ealength, p, sizeof(ealength));
	p += sizeof(ealength);
	*p++ = ap->a_attrnamespace;
	*p++ = eapad2;
	*p++ = strlen(ap->a_name);
	strcpy(p, ap->a_name);
	p += strlen(ap->a_name);
	bzero(p, eapad1);
	p += eapad1;
	error = uiomove(p, ealen, ap->a_uio);
	if (error) {
		free(eae, M_TEMP);
		if (stand_alone)
			ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
		else if (ip->i_ea_error == 0)
			ip->i_ea_error = error;
		return(error);
	}
	p += ealen;
	bzero(p, eapad2);

	p = ip->i_ea_area;
	ip->i_ea_area = eae;
	ip->i_ea_len = easize;
	free(p, M_TEMP);
	if (stand_alone)
		error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td);
	return(error);
}