1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #include <sys/types.h> 28 #include <sys/t_lock.h> 29 #include <sys/param.h> 30 #include <sys/time.h> 31 #include <sys/systm.h> 32 #include <sys/sysmacros.h> 33 #include <sys/resource.h> 34 #include <sys/signal.h> 35 #include <sys/cred.h> 36 #include <sys/user.h> 37 #include <sys/buf.h> 38 #include <sys/vfs.h> 39 #include <sys/vfs_opreg.h> 40 #include <sys/stat.h> 41 #include <sys/vnode.h> 42 #include <sys/mode.h> 43 #include <sys/proc.h> 44 #include <sys/disp.h> 45 #include <sys/file.h> 46 #include <sys/fcntl.h> 47 #include <sys/flock.h> 48 #include <sys/kmem.h> 49 #include <sys/uio.h> 50 #include <sys/dnlc.h> 51 #include <sys/conf.h> 52 #include <sys/errno.h> 53 #include <sys/mman.h> 54 #include <sys/fbuf.h> 55 #include <sys/pathname.h> 56 #include <sys/debug.h> 57 #include <sys/vmsystm.h> 58 #include <sys/cmn_err.h> 59 #include <sys/dirent.h> 60 #include <sys/errno.h> 61 #include <sys/modctl.h> 62 #include <sys/statvfs.h> 63 #include <sys/mount.h> 64 #include <sys/sunddi.h> 65 #include <sys/bootconf.h> 66 #include <sys/policy.h> 67 68 #include <vm/hat.h> 69 #include <vm/page.h> 70 #include <vm/pvn.h> 71 #include <vm/as.h> 72 #include <vm/seg.h> 73 #include <vm/seg_map.h> 74 #include <vm/seg_kmem.h> 75 #include <vm/seg_vn.h> 76 #include <vm/rm.h> 77 #include <vm/page.h> 78 #include <sys/swap.h> 79 80 #include <fs/fs_subr.h> 81 82 #include <sys/fs/udf_volume.h> 83 #include <sys/fs/udf_inode.h> 84 85 static int32_t udf_open(struct vnode **, 86 int32_t, struct cred *, caller_context_t *); 87 static int32_t udf_close(struct vnode *, 88 int32_t, int32_t, offset_t, struct cred *, caller_context_t *); 89 static int32_t udf_read(struct vnode *, 90 struct uio *, int32_t, struct cred *, caller_context_t *); 91 static int32_t udf_write(struct vnode *, 92 struct uio *, int32_t, struct cred *, caller_context_t *); 93 static int32_t udf_ioctl(struct vnode *, 94 int32_t, intptr_t, int32_t, struct cred *, int32_t *, 95 caller_context_t *); 96 static int32_t udf_getattr(struct vnode *, 97 struct vattr *, int32_t, struct cred *, caller_context_t *); 98 static int32_t udf_setattr(struct vnode *, 99 struct vattr *, int32_t, struct cred *, caller_context_t *); 100 static int32_t udf_access(struct vnode *, 101 int32_t, int32_t, struct cred *, caller_context_t *); 102 static int32_t udf_lookup(struct vnode *, 103 char *, struct vnode **, struct pathname *, 104 int32_t, struct vnode *, struct cred *, 105 caller_context_t *, int *, pathname_t *); 106 static int32_t udf_create(struct vnode *, 107 char *, struct vattr *, enum vcexcl, 108 int32_t, struct vnode 
**, struct cred *, int32_t, 109 caller_context_t *, vsecattr_t *); 110 static int32_t udf_remove(struct vnode *, 111 char *, struct cred *, caller_context_t *, int); 112 static int32_t udf_link(struct vnode *, 113 struct vnode *, char *, struct cred *, caller_context_t *, int); 114 static int32_t udf_rename(struct vnode *, 115 char *, struct vnode *, char *, struct cred *, caller_context_t *, int); 116 static int32_t udf_mkdir(struct vnode *, 117 char *, struct vattr *, struct vnode **, struct cred *, 118 caller_context_t *, int, vsecattr_t *); 119 static int32_t udf_rmdir(struct vnode *, 120 char *, struct vnode *, struct cred *, caller_context_t *, int); 121 static int32_t udf_readdir(struct vnode *, 122 struct uio *, struct cred *, int32_t *, caller_context_t *, int); 123 static int32_t udf_symlink(struct vnode *, 124 char *, struct vattr *, char *, struct cred *, caller_context_t *, int); 125 static int32_t udf_readlink(struct vnode *, 126 struct uio *, struct cred *, caller_context_t *); 127 static int32_t udf_fsync(struct vnode *, 128 int32_t, struct cred *, caller_context_t *); 129 static void udf_inactive(struct vnode *, 130 struct cred *, caller_context_t *); 131 static int32_t udf_fid(struct vnode *, struct fid *, caller_context_t *); 132 static int udf_rwlock(struct vnode *, int32_t, caller_context_t *); 133 static void udf_rwunlock(struct vnode *, int32_t, caller_context_t *); 134 static int32_t udf_seek(struct vnode *, offset_t, offset_t *, 135 caller_context_t *); 136 static int32_t udf_frlock(struct vnode *, int32_t, 137 struct flock64 *, int32_t, offset_t, struct flk_callback *, cred_t *, 138 caller_context_t *); 139 static int32_t udf_space(struct vnode *, int32_t, 140 struct flock64 *, int32_t, offset_t, cred_t *, caller_context_t *); 141 static int32_t udf_getpage(struct vnode *, offset_t, 142 size_t, uint32_t *, struct page **, size_t, 143 struct seg *, caddr_t, enum seg_rw, struct cred *, caller_context_t *); 144 static int32_t udf_putpage(struct vnode *, offset_t, 145 size_t, int32_t, struct cred *, caller_context_t *); 146 static int32_t udf_map(struct vnode *, offset_t, struct as *, 147 caddr_t *, size_t, uint8_t, uint8_t, uint32_t, struct cred *, 148 caller_context_t *); 149 static int32_t udf_addmap(struct vnode *, offset_t, struct as *, 150 caddr_t, size_t, uint8_t, uint8_t, uint32_t, struct cred *, 151 caller_context_t *); 152 static int32_t udf_delmap(struct vnode *, offset_t, struct as *, 153 caddr_t, size_t, uint32_t, uint32_t, uint32_t, struct cred *, 154 caller_context_t *); 155 static int32_t udf_l_pathconf(struct vnode *, int32_t, 156 ulong_t *, struct cred *, caller_context_t *); 157 static int32_t udf_pageio(struct vnode *, struct page *, 158 u_offset_t, size_t, int32_t, struct cred *, caller_context_t *); 159 160 int32_t ud_getpage_miss(struct vnode *, u_offset_t, 161 size_t, struct seg *, caddr_t, page_t *pl[], 162 size_t, enum seg_rw, int32_t); 163 void ud_getpage_ra(struct vnode *, u_offset_t, struct seg *, caddr_t); 164 int32_t ud_putpages(struct vnode *, offset_t, size_t, int32_t, struct cred *); 165 int32_t ud_page_fill(struct ud_inode *, page_t *, 166 u_offset_t, uint32_t, u_offset_t *); 167 int32_t ud_iodone(struct buf *); 168 int32_t ud_rdip(struct ud_inode *, struct uio *, int32_t, cred_t *); 169 int32_t ud_wrip(struct ud_inode *, struct uio *, int32_t, cred_t *); 170 int32_t ud_multi_strat(struct ud_inode *, page_t *, struct buf *, u_offset_t); 171 int32_t ud_slave_done(struct buf *); 172 173 /* 174 * Structures to control multiple IO 
operations to get or put pages 175 * that are backed by discontiguous blocks. The master struct is 176 * a dummy that holds the original bp from pageio_setup. The 177 * slave struct holds the working bp's to do the actual IO. Once 178 * all the slave IOs complete. The master is processed as if a single 179 * IO op has completed. 180 */ 181 uint32_t master_index = 0; 182 typedef struct mio_master { 183 kmutex_t mm_mutex; /* protect the fields below */ 184 int32_t mm_size; 185 buf_t *mm_bp; /* original bp */ 186 int32_t mm_resid; /* bytes remaining to transfer */ 187 int32_t mm_error; /* accumulated error from slaves */ 188 int32_t mm_index; /* XXX debugging */ 189 } mio_master_t; 190 191 typedef struct mio_slave { 192 buf_t ms_buf; /* working buffer for this IO chunk */ 193 mio_master_t *ms_ptr; /* pointer to master */ 194 } mio_slave_t; 195 196 struct vnodeops *udf_vnodeops; 197 198 const fs_operation_def_t udf_vnodeops_template[] = { 199 VOPNAME_OPEN, { .vop_open = udf_open }, 200 VOPNAME_CLOSE, { .vop_close = udf_close }, 201 VOPNAME_READ, { .vop_read = udf_read }, 202 VOPNAME_WRITE, { .vop_write = udf_write }, 203 VOPNAME_IOCTL, { .vop_ioctl = udf_ioctl }, 204 VOPNAME_GETATTR, { .vop_getattr = udf_getattr }, 205 VOPNAME_SETATTR, { .vop_setattr = udf_setattr }, 206 VOPNAME_ACCESS, { .vop_access = udf_access }, 207 VOPNAME_LOOKUP, { .vop_lookup = udf_lookup }, 208 VOPNAME_CREATE, { .vop_create = udf_create }, 209 VOPNAME_REMOVE, { .vop_remove = udf_remove }, 210 VOPNAME_LINK, { .vop_link = udf_link }, 211 VOPNAME_RENAME, { .vop_rename = udf_rename }, 212 VOPNAME_MKDIR, { .vop_mkdir = udf_mkdir }, 213 VOPNAME_RMDIR, { .vop_rmdir = udf_rmdir }, 214 VOPNAME_READDIR, { .vop_readdir = udf_readdir }, 215 VOPNAME_SYMLINK, { .vop_symlink = udf_symlink }, 216 VOPNAME_READLINK, { .vop_readlink = udf_readlink }, 217 VOPNAME_FSYNC, { .vop_fsync = udf_fsync }, 218 VOPNAME_INACTIVE, { .vop_inactive = udf_inactive }, 219 VOPNAME_FID, { .vop_fid = udf_fid }, 220 VOPNAME_RWLOCK, { .vop_rwlock = udf_rwlock }, 221 VOPNAME_RWUNLOCK, { .vop_rwunlock = udf_rwunlock }, 222 VOPNAME_SEEK, { .vop_seek = udf_seek }, 223 VOPNAME_FRLOCK, { .vop_frlock = udf_frlock }, 224 VOPNAME_SPACE, { .vop_space = udf_space }, 225 VOPNAME_GETPAGE, { .vop_getpage = udf_getpage }, 226 VOPNAME_PUTPAGE, { .vop_putpage = udf_putpage }, 227 VOPNAME_MAP, { .vop_map = udf_map }, 228 VOPNAME_ADDMAP, { .vop_addmap = udf_addmap }, 229 VOPNAME_DELMAP, { .vop_delmap = udf_delmap }, 230 VOPNAME_PATHCONF, { .vop_pathconf = udf_l_pathconf }, 231 VOPNAME_PAGEIO, { .vop_pageio = udf_pageio }, 232 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, 233 NULL, NULL 234 }; 235 236 /* ARGSUSED */ 237 static int32_t 238 udf_open( 239 struct vnode **vpp, 240 int32_t flag, 241 struct cred *cr, 242 caller_context_t *ct) 243 { 244 ud_printf("udf_open\n"); 245 246 return (0); 247 } 248 249 /* ARGSUSED */ 250 static int32_t 251 udf_close( 252 struct vnode *vp, 253 int32_t flag, 254 int32_t count, 255 offset_t offset, 256 struct cred *cr, 257 caller_context_t *ct) 258 { 259 struct ud_inode *ip = VTOI(vp); 260 261 ud_printf("udf_close\n"); 262 263 ITIMES(ip); 264 265 cleanlocks(vp, ttoproc(curthread)->p_pid, 0); 266 cleanshares(vp, ttoproc(curthread)->p_pid); 267 268 /* 269 * Push partially filled cluster at last close. 270 * ``last close'' is approximated because the dnlc 271 * may have a hold on the vnode. 
272 */ 273 if (vp->v_count <= 2 && vp->v_type != VBAD) { 274 struct ud_inode *ip = VTOI(vp); 275 if (ip->i_delaylen) { 276 (void) ud_putpages(vp, ip->i_delayoff, ip->i_delaylen, 277 B_ASYNC | B_FREE, cr); 278 ip->i_delaylen = 0; 279 } 280 } 281 282 return (0); 283 } 284 285 /* ARGSUSED */ 286 static int32_t 287 udf_read( 288 struct vnode *vp, 289 struct uio *uiop, 290 int32_t ioflag, 291 struct cred *cr, 292 caller_context_t *ct) 293 { 294 struct ud_inode *ip = VTOI(vp); 295 int32_t error; 296 297 ud_printf("udf_read\n"); 298 299 #ifdef __lock_lint 300 rw_enter(&ip->i_rwlock, RW_READER); 301 #endif 302 303 ASSERT(RW_READ_HELD(&ip->i_rwlock)); 304 305 if (MANDLOCK(vp, ip->i_char)) { 306 /* 307 * udf_getattr ends up being called by chklock 308 */ 309 error = chklock(vp, FREAD, uiop->uio_loffset, 310 uiop->uio_resid, uiop->uio_fmode, ct); 311 if (error) { 312 goto end; 313 } 314 } 315 316 rw_enter(&ip->i_contents, RW_READER); 317 error = ud_rdip(ip, uiop, ioflag, cr); 318 rw_exit(&ip->i_contents); 319 320 end: 321 #ifdef __lock_lint 322 rw_exit(&ip->i_rwlock); 323 #endif 324 325 return (error); 326 } 327 328 329 int32_t ud_WRITES = 1; 330 int32_t ud_HW = 96 * 1024; 331 int32_t ud_LW = 64 * 1024; 332 int32_t ud_throttles = 0; 333 334 /* ARGSUSED */ 335 static int32_t 336 udf_write( 337 struct vnode *vp, 338 struct uio *uiop, 339 int32_t ioflag, 340 struct cred *cr, 341 caller_context_t *ct) 342 { 343 struct ud_inode *ip = VTOI(vp); 344 int32_t error = 0; 345 346 ud_printf("udf_write\n"); 347 348 #ifdef __lock_lint 349 rw_enter(&ip->i_rwlock, RW_WRITER); 350 #endif 351 352 ASSERT(RW_WRITE_HELD(&ip->i_rwlock)); 353 354 if (MANDLOCK(vp, ip->i_char)) { 355 /* 356 * ud_getattr ends up being called by chklock 357 */ 358 error = chklock(vp, FWRITE, uiop->uio_loffset, 359 uiop->uio_resid, uiop->uio_fmode, ct); 360 if (error) { 361 goto end; 362 } 363 } 364 /* 365 * Throttle writes. 366 */ 367 mutex_enter(&ip->i_tlock); 368 if (ud_WRITES && (ip->i_writes > ud_HW)) { 369 while (ip->i_writes > ud_HW) { 370 ud_throttles++; 371 cv_wait(&ip->i_wrcv, &ip->i_tlock); 372 } 373 } 374 mutex_exit(&ip->i_tlock); 375 376 /* 377 * Write to the file 378 */ 379 rw_enter(&ip->i_contents, RW_WRITER); 380 if ((ioflag & FAPPEND) != 0 && (ip->i_type == VREG)) { 381 /* 382 * In append mode start at end of file. 383 */ 384 uiop->uio_loffset = ip->i_size; 385 } 386 error = ud_wrip(ip, uiop, ioflag, cr); 387 rw_exit(&ip->i_contents); 388 389 end: 390 #ifdef __lock_lint 391 rw_exit(&ip->i_rwlock); 392 #endif 393 394 return (error); 395 } 396 397 /* ARGSUSED */ 398 static int32_t 399 udf_ioctl( 400 struct vnode *vp, 401 int32_t cmd, 402 intptr_t arg, 403 int32_t flag, 404 struct cred *cr, 405 int32_t *rvalp, 406 caller_context_t *ct) 407 { 408 return (ENOTTY); 409 } 410 411 /* ARGSUSED */ 412 static int32_t 413 udf_getattr( 414 struct vnode *vp, 415 struct vattr *vap, 416 int32_t flags, 417 struct cred *cr, 418 caller_context_t *ct) 419 { 420 struct ud_inode *ip = VTOI(vp); 421 422 ud_printf("udf_getattr\n"); 423 424 if (vap->va_mask == AT_SIZE) { 425 /* 426 * for performance, if only the size is requested don't bother 427 * with anything else. 
428 */ 429 vap->va_size = ip->i_size; 430 return (0); 431 } 432 433 rw_enter(&ip->i_contents, RW_READER); 434 435 vap->va_type = vp->v_type; 436 vap->va_mode = UD2VA_PERM(ip->i_perm) | ip->i_char; 437 438 vap->va_uid = ip->i_uid; 439 vap->va_gid = ip->i_gid; 440 vap->va_fsid = ip->i_dev; 441 vap->va_nodeid = ip->i_icb_lbano; 442 vap->va_nlink = ip->i_nlink; 443 vap->va_size = ip->i_size; 444 vap->va_seq = ip->i_seq; 445 if (vp->v_type == VCHR || vp->v_type == VBLK) { 446 vap->va_rdev = ip->i_rdev; 447 } else { 448 vap->va_rdev = 0; 449 } 450 451 mutex_enter(&ip->i_tlock); 452 ITIMES_NOLOCK(ip); /* mark correct time in inode */ 453 vap->va_atime.tv_sec = (time_t)ip->i_atime.tv_sec; 454 vap->va_atime.tv_nsec = ip->i_atime.tv_nsec; 455 vap->va_mtime.tv_sec = (time_t)ip->i_mtime.tv_sec; 456 vap->va_mtime.tv_nsec = ip->i_mtime.tv_nsec; 457 vap->va_ctime.tv_sec = (time_t)ip->i_ctime.tv_sec; 458 vap->va_ctime.tv_nsec = ip->i_ctime.tv_nsec; 459 mutex_exit(&ip->i_tlock); 460 461 switch (ip->i_type) { 462 case VBLK: 463 vap->va_blksize = MAXBSIZE; 464 break; 465 case VCHR: 466 vap->va_blksize = MAXBSIZE; 467 break; 468 default: 469 vap->va_blksize = ip->i_udf->udf_lbsize; 470 break; 471 } 472 vap->va_nblocks = ip->i_lbr << ip->i_udf->udf_l2d_shift; 473 474 rw_exit(&ip->i_contents); 475 476 return (0); 477 } 478 479 static int 480 ud_iaccess_vmode(void *ip, int mode, struct cred *cr) 481 { 482 return (ud_iaccess(ip, UD_UPERM2DPERM(mode), cr)); 483 } 484 485 /*ARGSUSED4*/ 486 static int32_t 487 udf_setattr( 488 struct vnode *vp, 489 struct vattr *vap, 490 int32_t flags, 491 struct cred *cr, 492 caller_context_t *ct) 493 { 494 int32_t error = 0; 495 uint32_t mask = vap->va_mask; 496 struct ud_inode *ip; 497 timestruc_t now; 498 struct vattr ovap; 499 500 ud_printf("udf_setattr\n"); 501 502 ip = VTOI(vp); 503 504 /* 505 * not updates allowed to 4096 files 506 */ 507 if (ip->i_astrat == STRAT_TYPE4096) { 508 return (EINVAL); 509 } 510 511 /* 512 * Cannot set these attributes 513 */ 514 if (mask & AT_NOSET) { 515 return (EINVAL); 516 } 517 518 rw_enter(&ip->i_rwlock, RW_WRITER); 519 rw_enter(&ip->i_contents, RW_WRITER); 520 521 ovap.va_uid = ip->i_uid; 522 ovap.va_mode = UD2VA_PERM(ip->i_perm) | ip->i_char; 523 error = secpolicy_vnode_setattr(cr, vp, vap, &ovap, flags, 524 ud_iaccess_vmode, ip); 525 if (error) 526 goto update_inode; 527 528 mask = vap->va_mask; 529 /* 530 * Change file access modes. 531 */ 532 if (mask & AT_MODE) { 533 ip->i_perm = VA2UD_PERM(vap->va_mode); 534 ip->i_char = vap->va_mode & (VSUID | VSGID | VSVTX); 535 mutex_enter(&ip->i_tlock); 536 ip->i_flag |= ICHG; 537 mutex_exit(&ip->i_tlock); 538 } 539 if (mask & (AT_UID|AT_GID)) { 540 if (mask & AT_UID) { 541 ip->i_uid = vap->va_uid; 542 } 543 if (mask & AT_GID) { 544 ip->i_gid = vap->va_gid; 545 } 546 mutex_enter(&ip->i_tlock); 547 ip->i_flag |= ICHG; 548 mutex_exit(&ip->i_tlock); 549 } 550 /* 551 * Truncate file. Must have write permission and not be a directory. 552 */ 553 if (mask & AT_SIZE) { 554 if (vp->v_type == VDIR) { 555 error = EISDIR; 556 goto update_inode; 557 } 558 if (error = ud_iaccess(ip, IWRITE, cr)) { 559 goto update_inode; 560 } 561 if (vap->va_size > MAXOFFSET_T) { 562 error = EFBIG; 563 goto update_inode; 564 } 565 if (error = ud_itrunc(ip, vap->va_size, 0, cr)) { 566 goto update_inode; 567 } 568 } 569 /* 570 * Change file access or modified times. 
571 */ 572 if (mask & (AT_ATIME|AT_MTIME)) { 573 mutex_enter(&ip->i_tlock); 574 if (mask & AT_ATIME) { 575 ip->i_atime.tv_sec = vap->va_atime.tv_sec; 576 ip->i_atime.tv_nsec = vap->va_atime.tv_nsec; 577 ip->i_flag &= ~IACC; 578 } 579 if (mask & AT_MTIME) { 580 ip->i_mtime.tv_sec = vap->va_mtime.tv_sec; 581 ip->i_mtime.tv_nsec = vap->va_mtime.tv_nsec; 582 gethrestime(&now); 583 ip->i_ctime.tv_sec = now.tv_sec; 584 ip->i_ctime.tv_nsec = now.tv_nsec; 585 ip->i_flag &= ~(IUPD|ICHG); 586 ip->i_flag |= IMODTIME; 587 } 588 ip->i_flag |= IMOD; 589 mutex_exit(&ip->i_tlock); 590 } 591 592 update_inode: 593 if (curthread->t_flag & T_DONTPEND) { 594 ud_iupdat(ip, 1); 595 } else { 596 ITIMES_NOLOCK(ip); 597 } 598 rw_exit(&ip->i_contents); 599 rw_exit(&ip->i_rwlock); 600 601 return (error); 602 } 603 604 /* ARGSUSED */ 605 static int32_t 606 udf_access( 607 struct vnode *vp, 608 int32_t mode, 609 int32_t flags, 610 struct cred *cr, 611 caller_context_t *ct) 612 { 613 struct ud_inode *ip = VTOI(vp); 614 int32_t error; 615 616 ud_printf("udf_access\n"); 617 618 if (ip->i_udf == NULL) { 619 return (EIO); 620 } 621 622 error = ud_iaccess(ip, UD_UPERM2DPERM(mode), cr); 623 624 return (error); 625 } 626 627 int32_t udfs_stickyhack = 1; 628 629 /* ARGSUSED */ 630 static int32_t 631 udf_lookup( 632 struct vnode *dvp, 633 char *nm, 634 struct vnode **vpp, 635 struct pathname *pnp, 636 int32_t flags, 637 struct vnode *rdir, 638 struct cred *cr, 639 caller_context_t *ct, 640 int *direntflags, 641 pathname_t *realpnp) 642 { 643 int32_t error; 644 struct vnode *vp; 645 struct ud_inode *ip, *xip; 646 647 ud_printf("udf_lookup\n"); 648 /* 649 * Null component name is a synonym for directory being searched. 650 */ 651 if (*nm == '\0') { 652 VN_HOLD(dvp); 653 *vpp = dvp; 654 error = 0; 655 goto out; 656 } 657 658 /* 659 * Fast path: Check the directory name lookup cache. 660 */ 661 ip = VTOI(dvp); 662 if (vp = dnlc_lookup(dvp, nm)) { 663 /* 664 * Check accessibility of directory. 665 */ 666 if ((error = ud_iaccess(ip, IEXEC, cr)) != 0) { 667 VN_RELE(vp); 668 } 669 xip = VTOI(vp); 670 } else { 671 error = ud_dirlook(ip, nm, &xip, cr, 1); 672 ITIMES(ip); 673 } 674 675 if (error == 0) { 676 ip = xip; 677 *vpp = ITOV(ip); 678 if ((ip->i_type != VDIR) && 679 (ip->i_char & ISVTX) && 680 ((ip->i_perm & IEXEC) == 0) && 681 udfs_stickyhack) { 682 mutex_enter(&(*vpp)->v_lock); 683 (*vpp)->v_flag |= VISSWAP; 684 mutex_exit(&(*vpp)->v_lock); 685 } 686 ITIMES(ip); 687 /* 688 * If vnode is a device return special vnode instead. 689 */ 690 if (IS_DEVVP(*vpp)) { 691 struct vnode *newvp; 692 newvp = specvp(*vpp, (*vpp)->v_rdev, 693 (*vpp)->v_type, cr); 694 VN_RELE(*vpp); 695 if (newvp == NULL) { 696 error = ENOSYS; 697 } else { 698 *vpp = newvp; 699 } 700 } 701 } 702 out: 703 return (error); 704 } 705 706 /* ARGSUSED */ 707 static int32_t 708 udf_create( 709 struct vnode *dvp, 710 char *name, 711 struct vattr *vap, 712 enum vcexcl excl, 713 int32_t mode, 714 struct vnode **vpp, 715 struct cred *cr, 716 int32_t flag, 717 caller_context_t *ct, 718 vsecattr_t *vsecp) 719 { 720 int32_t error; 721 struct ud_inode *ip = VTOI(dvp), *xip; 722 723 ud_printf("udf_create\n"); 724 725 if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr) != 0) 726 vap->va_mode &= ~VSVTX; 727 728 if (*name == '\0') { 729 /* 730 * Null component name refers to the directory itself. 
731 */ 732 VN_HOLD(dvp); 733 ITIMES(ip); 734 error = EEXIST; 735 } else { 736 xip = NULL; 737 rw_enter(&ip->i_rwlock, RW_WRITER); 738 error = ud_direnter(ip, name, DE_CREATE, 739 (struct ud_inode *)0, (struct ud_inode *)0, 740 vap, &xip, cr, ct); 741 rw_exit(&ip->i_rwlock); 742 ITIMES(ip); 743 ip = xip; 744 } 745 #ifdef __lock_lint 746 rw_enter(&ip->i_contents, RW_WRITER); 747 #else 748 if (ip != NULL) { 749 rw_enter(&ip->i_contents, RW_WRITER); 750 } 751 #endif 752 753 /* 754 * If the file already exists and this is a non-exclusive create, 755 * check permissions and allow access for non-directories. 756 * Read-only create of an existing directory is also allowed. 757 * We fail an exclusive create of anything which already exists. 758 */ 759 if (error == EEXIST) { 760 if (excl == NONEXCL) { 761 if ((ip->i_type == VDIR) && (mode & VWRITE)) { 762 error = EISDIR; 763 } else if (mode) { 764 error = ud_iaccess(ip, 765 UD_UPERM2DPERM(mode), cr); 766 } else { 767 error = 0; 768 } 769 } 770 if (error) { 771 rw_exit(&ip->i_contents); 772 VN_RELE(ITOV(ip)); 773 goto out; 774 } else if ((ip->i_type == VREG) && 775 (vap->va_mask & AT_SIZE) && vap->va_size == 0) { 776 /* 777 * Truncate regular files, if requested by caller. 778 * Grab i_rwlock to make sure no one else is 779 * currently writing to the file (we promised 780 * bmap we would do this). 781 * Must get the locks in the correct order. 782 */ 783 if (ip->i_size == 0) { 784 ip->i_flag |= ICHG | IUPD; 785 } else { 786 rw_exit(&ip->i_contents); 787 rw_enter(&ip->i_rwlock, RW_WRITER); 788 rw_enter(&ip->i_contents, RW_WRITER); 789 (void) ud_itrunc(ip, 0, 0, cr); 790 rw_exit(&ip->i_rwlock); 791 } 792 vnevent_create(ITOV(ip), ct); 793 } 794 } 795 796 if (error == 0) { 797 *vpp = ITOV(ip); 798 ITIMES(ip); 799 } 800 #ifdef __lock_lint 801 rw_exit(&ip->i_contents); 802 #else 803 if (ip != NULL) { 804 rw_exit(&ip->i_contents); 805 } 806 #endif 807 if (error) { 808 goto out; 809 } 810 811 /* 812 * If vnode is a device return special vnode instead. 
813 */ 814 if (!error && IS_DEVVP(*vpp)) { 815 struct vnode *newvp; 816 817 newvp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr); 818 VN_RELE(*vpp); 819 if (newvp == NULL) { 820 error = ENOSYS; 821 goto out; 822 } 823 *vpp = newvp; 824 } 825 out: 826 return (error); 827 } 828 829 /* ARGSUSED */ 830 static int32_t 831 udf_remove( 832 struct vnode *vp, 833 char *nm, 834 struct cred *cr, 835 caller_context_t *ct, 836 int flags) 837 { 838 int32_t error; 839 struct ud_inode *ip = VTOI(vp); 840 841 ud_printf("udf_remove\n"); 842 843 rw_enter(&ip->i_rwlock, RW_WRITER); 844 error = ud_dirremove(ip, nm, 845 (struct ud_inode *)0, (struct vnode *)0, DR_REMOVE, cr, ct); 846 rw_exit(&ip->i_rwlock); 847 ITIMES(ip); 848 849 return (error); 850 } 851 852 /* ARGSUSED */ 853 static int32_t 854 udf_link( 855 struct vnode *tdvp, 856 struct vnode *svp, 857 char *tnm, 858 struct cred *cr, 859 caller_context_t *ct, 860 int flags) 861 { 862 int32_t error; 863 struct vnode *realvp; 864 struct ud_inode *sip; 865 struct ud_inode *tdp; 866 867 ud_printf("udf_link\n"); 868 if (VOP_REALVP(svp, &realvp, ct) == 0) { 869 svp = realvp; 870 } 871 872 /* 873 * Do not allow links to directories 874 */ 875 if (svp->v_type == VDIR) { 876 return (EPERM); 877 } 878 879 sip = VTOI(svp); 880 881 if (sip->i_uid != crgetuid(cr) && secpolicy_basic_link(cr) != 0) 882 return (EPERM); 883 884 tdp = VTOI(tdvp); 885 886 rw_enter(&tdp->i_rwlock, RW_WRITER); 887 error = ud_direnter(tdp, tnm, DE_LINK, (struct ud_inode *)0, 888 sip, (struct vattr *)0, (struct ud_inode **)0, cr, ct); 889 rw_exit(&tdp->i_rwlock); 890 ITIMES(sip); 891 ITIMES(tdp); 892 893 if (error == 0) { 894 vnevent_link(svp, ct); 895 } 896 897 return (error); 898 } 899 900 /* ARGSUSED */ 901 static int32_t 902 udf_rename( 903 struct vnode *sdvp, 904 char *snm, 905 struct vnode *tdvp, 906 char *tnm, 907 struct cred *cr, 908 caller_context_t *ct, 909 int flags) 910 { 911 int32_t error = 0; 912 struct udf_vfs *udf_vfsp; 913 struct ud_inode *sip; /* source inode */ 914 struct ud_inode *sdp, *tdp; /* source and target parent inode */ 915 struct vnode *realvp; 916 917 ud_printf("udf_rename\n"); 918 919 if (VOP_REALVP(tdvp, &realvp, ct) == 0) { 920 tdvp = realvp; 921 } 922 923 sdp = VTOI(sdvp); 924 tdp = VTOI(tdvp); 925 926 udf_vfsp = sdp->i_udf; 927 928 mutex_enter(&udf_vfsp->udf_rename_lck); 929 /* 930 * Look up inode of file we're supposed to rename. 931 */ 932 if (error = ud_dirlook(sdp, snm, &sip, cr, 0)) { 933 mutex_exit(&udf_vfsp->udf_rename_lck); 934 return (error); 935 } 936 /* 937 * be sure this is not a directory with another file system mounted 938 * over it. If it is just give up the locks, and return with 939 * EBUSY 940 */ 941 if (vn_mountedvfs(ITOV(sip)) != NULL) { 942 error = EBUSY; 943 goto errout; 944 } 945 /* 946 * Make sure we can delete the source entry. This requires 947 * write permission on the containing directory. If that 948 * directory is "sticky" it further requires (except for 949 * privileged users) that the user own the directory or the 950 * source entry, or else have permission to write the source 951 * entry. 952 */ 953 rw_enter(&sdp->i_contents, RW_READER); 954 rw_enter(&sip->i_contents, RW_READER); 955 if ((error = ud_iaccess(sdp, IWRITE, cr)) != 0 || 956 (error = ud_sticky_remove_access(sdp, sip, cr)) != 0) { 957 rw_exit(&sip->i_contents); 958 rw_exit(&sdp->i_contents); 959 ITIMES(sip); 960 goto errout; 961 } 962 963 /* 964 * Check for renaming '.' or '..' or alias of '.' 
965 */ 966 if ((strcmp(snm, ".") == 0) || 967 (strcmp(snm, "..") == 0) || 968 (sdp == sip)) { 969 error = EINVAL; 970 rw_exit(&sip->i_contents); 971 rw_exit(&sdp->i_contents); 972 goto errout; 973 } 974 rw_exit(&sip->i_contents); 975 rw_exit(&sdp->i_contents); 976 977 978 /* 979 * Link source to the target. 980 */ 981 rw_enter(&tdp->i_rwlock, RW_WRITER); 982 if (error = ud_direnter(tdp, tnm, DE_RENAME, sdp, sip, 983 (struct vattr *)0, (struct ud_inode **)0, cr, ct)) { 984 /* 985 * ESAME isn't really an error; it indicates that the 986 * operation should not be done because the source and target 987 * are the same file, but that no error should be reported. 988 */ 989 if (error == ESAME) { 990 error = 0; 991 } 992 rw_exit(&tdp->i_rwlock); 993 goto errout; 994 } 995 vnevent_rename_src(ITOV(sip), sdvp, snm, ct); 996 rw_exit(&tdp->i_rwlock); 997 998 rw_enter(&sdp->i_rwlock, RW_WRITER); 999 /* 1000 * Unlink the source. 1001 * Remove the source entry. ud_dirremove() checks that the entry 1002 * still reflects sip, and returns an error if it doesn't. 1003 * If the entry has changed just forget about it. Release 1004 * the source inode. 1005 */ 1006 if ((error = ud_dirremove(sdp, snm, sip, (struct vnode *)0, 1007 DR_RENAME, cr, ct)) == ENOENT) { 1008 error = 0; 1009 } 1010 rw_exit(&sdp->i_rwlock); 1011 errout: 1012 ITIMES(sdp); 1013 ITIMES(tdp); 1014 VN_RELE(ITOV(sip)); 1015 mutex_exit(&udf_vfsp->udf_rename_lck); 1016 1017 return (error); 1018 } 1019 1020 /* ARGSUSED */ 1021 static int32_t 1022 udf_mkdir( 1023 struct vnode *dvp, 1024 char *dirname, 1025 struct vattr *vap, 1026 struct vnode **vpp, 1027 struct cred *cr, 1028 caller_context_t *ct, 1029 int flags, 1030 vsecattr_t *vsecp) 1031 { 1032 int32_t error; 1033 struct ud_inode *ip; 1034 struct ud_inode *xip; 1035 1036 ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE)); 1037 1038 ud_printf("udf_mkdir\n"); 1039 1040 ip = VTOI(dvp); 1041 rw_enter(&ip->i_rwlock, RW_WRITER); 1042 error = ud_direnter(ip, dirname, DE_MKDIR, 1043 (struct ud_inode *)0, (struct ud_inode *)0, vap, &xip, cr, ct); 1044 rw_exit(&ip->i_rwlock); 1045 ITIMES(ip); 1046 if (error == 0) { 1047 ip = xip; 1048 *vpp = ITOV(ip); 1049 ITIMES(ip); 1050 } else if (error == EEXIST) { 1051 ITIMES(xip); 1052 VN_RELE(ITOV(xip)); 1053 } 1054 1055 return (error); 1056 } 1057 1058 /* ARGSUSED */ 1059 static int32_t 1060 udf_rmdir( 1061 struct vnode *vp, 1062 char *nm, 1063 struct vnode *cdir, 1064 struct cred *cr, 1065 caller_context_t *ct, 1066 int flags) 1067 { 1068 int32_t error; 1069 struct ud_inode *ip = VTOI(vp); 1070 1071 ud_printf("udf_rmdir\n"); 1072 1073 rw_enter(&ip->i_rwlock, RW_WRITER); 1074 error = ud_dirremove(ip, nm, (struct ud_inode *)0, cdir, DR_RMDIR, 1075 cr, ct); 1076 rw_exit(&ip->i_rwlock); 1077 ITIMES(ip); 1078 1079 return (error); 1080 } 1081 1082 /* ARGSUSED */ 1083 static int32_t 1084 udf_readdir( 1085 struct vnode *vp, 1086 struct uio *uiop, 1087 struct cred *cr, 1088 int32_t *eofp, 1089 caller_context_t *ct, 1090 int flags) 1091 { 1092 struct ud_inode *ip; 1093 struct dirent64 *nd; 1094 struct udf_vfs *udf_vfsp; 1095 int32_t error = 0, len, outcount = 0; 1096 uint32_t dirsiz, offset; 1097 uint32_t bufsize, ndlen, dummy; 1098 caddr_t outbuf; 1099 caddr_t outb, end_outb; 1100 struct iovec *iovp; 1101 1102 uint8_t *dname; 1103 int32_t length; 1104 1105 uint8_t *buf = NULL; 1106 1107 struct fbuf *fbp = NULL; 1108 struct file_id *fid; 1109 uint8_t *name; 1110 1111 1112 ud_printf("udf_readdir\n"); 1113 1114 ip = VTOI(vp); 1115 udf_vfsp = ip->i_udf; 1116 1117 
dirsiz = ip->i_size; 1118 if ((uiop->uio_offset >= dirsiz) || 1119 (ip->i_nlink <= 0)) { 1120 if (eofp) { 1121 *eofp = 1; 1122 } 1123 return (0); 1124 } 1125 1126 offset = uiop->uio_offset; 1127 iovp = uiop->uio_iov; 1128 bufsize = iovp->iov_len; 1129 1130 outb = outbuf = (char *)kmem_alloc((uint32_t)bufsize, KM_SLEEP); 1131 end_outb = outb + bufsize; 1132 nd = (struct dirent64 *)outbuf; 1133 1134 dname = (uint8_t *)kmem_zalloc(1024, KM_SLEEP); 1135 buf = (uint8_t *)kmem_zalloc(udf_vfsp->udf_lbsize, KM_SLEEP); 1136 1137 if (offset == 0) { 1138 len = DIRENT64_RECLEN(1); 1139 if (((caddr_t)nd + len) >= end_outb) { 1140 error = EINVAL; 1141 goto end; 1142 } 1143 nd->d_ino = ip->i_icb_lbano; 1144 nd->d_reclen = (uint16_t)len; 1145 nd->d_off = 0x10; 1146 nd->d_name[0] = '.'; 1147 bzero(&nd->d_name[1], DIRENT64_NAMELEN(len) - 1); 1148 nd = (struct dirent64 *)((char *)nd + nd->d_reclen); 1149 outcount++; 1150 } else if (offset == 0x10) { 1151 offset = 0; 1152 } 1153 1154 while (offset < dirsiz) { 1155 error = ud_get_next_fid(ip, &fbp, 1156 offset, &fid, &name, buf); 1157 if (error != 0) { 1158 break; 1159 } 1160 1161 if ((fid->fid_flags & FID_DELETED) == 0) { 1162 if (fid->fid_flags & FID_PARENT) { 1163 1164 len = DIRENT64_RECLEN(2); 1165 if (((caddr_t)nd + len) >= end_outb) { 1166 error = EINVAL; 1167 break; 1168 } 1169 1170 nd->d_ino = ip->i_icb_lbano; 1171 nd->d_reclen = (uint16_t)len; 1172 nd->d_off = offset + FID_LEN(fid); 1173 nd->d_name[0] = '.'; 1174 nd->d_name[1] = '.'; 1175 bzero(&nd->d_name[2], 1176 DIRENT64_NAMELEN(len) - 2); 1177 nd = (struct dirent64 *) 1178 ((char *)nd + nd->d_reclen); 1179 } else { 1180 if ((error = ud_uncompress(fid->fid_idlen, 1181 &length, name, dname)) != 0) { 1182 break; 1183 } 1184 if (length == 0) { 1185 offset += FID_LEN(fid); 1186 continue; 1187 } 1188 len = DIRENT64_RECLEN(length); 1189 if (((caddr_t)nd + len) >= end_outb) { 1190 if (!outcount) { 1191 error = EINVAL; 1192 } 1193 break; 1194 } 1195 (void) strncpy(nd->d_name, 1196 (caddr_t)dname, length); 1197 bzero(&nd->d_name[length], 1198 DIRENT64_NAMELEN(len) - length); 1199 nd->d_ino = ud_xlate_to_daddr(udf_vfsp, 1200 SWAP_16(fid->fid_icb.lad_ext_prn), 1201 SWAP_32(fid->fid_icb.lad_ext_loc), 1, 1202 &dummy); 1203 nd->d_reclen = (uint16_t)len; 1204 nd->d_off = offset + FID_LEN(fid); 1205 nd = (struct dirent64 *) 1206 ((char *)nd + nd->d_reclen); 1207 } 1208 outcount++; 1209 } 1210 1211 offset += FID_LEN(fid); 1212 } 1213 1214 end: 1215 if (fbp != NULL) { 1216 fbrelse(fbp, S_OTHER); 1217 } 1218 ndlen = ((char *)nd - outbuf); 1219 /* 1220 * In case of error do not call uiomove. 1221 * Return the error to the caller. 
1222 */ 1223 if ((error == 0) && (ndlen != 0)) { 1224 error = uiomove(outbuf, (long)ndlen, UIO_READ, uiop); 1225 uiop->uio_offset = offset; 1226 } 1227 kmem_free((caddr_t)buf, udf_vfsp->udf_lbsize); 1228 kmem_free((caddr_t)dname, 1024); 1229 kmem_free(outbuf, (uint32_t)bufsize); 1230 if (eofp && error == 0) { 1231 *eofp = (uiop->uio_offset >= dirsiz); 1232 } 1233 return (error); 1234 } 1235 1236 /* ARGSUSED */ 1237 static int32_t 1238 udf_symlink( 1239 struct vnode *dvp, 1240 char *linkname, 1241 struct vattr *vap, 1242 char *target, 1243 struct cred *cr, 1244 caller_context_t *ct, 1245 int flags) 1246 { 1247 int32_t error = 0, outlen; 1248 uint32_t ioflag = 0; 1249 struct ud_inode *ip, *dip = VTOI(dvp); 1250 1251 struct path_comp *pc; 1252 int8_t *dname = NULL, *uname = NULL, *sp; 1253 1254 ud_printf("udf_symlink\n"); 1255 1256 ip = (struct ud_inode *)0; 1257 vap->va_type = VLNK; 1258 vap->va_rdev = 0; 1259 1260 rw_enter(&dip->i_rwlock, RW_WRITER); 1261 error = ud_direnter(dip, linkname, DE_CREATE, 1262 (struct ud_inode *)0, (struct ud_inode *)0, vap, &ip, cr, ct); 1263 rw_exit(&dip->i_rwlock); 1264 if (error == 0) { 1265 dname = kmem_zalloc(1024, KM_SLEEP); 1266 uname = kmem_zalloc(PAGESIZE, KM_SLEEP); 1267 1268 pc = (struct path_comp *)uname; 1269 /* 1270 * If the first character in target is "/" 1271 * then skip it and create entry for it 1272 */ 1273 if (*target == '/') { 1274 pc->pc_type = 2; 1275 pc->pc_len = 0; 1276 pc = (struct path_comp *)(((char *)pc) + 4); 1277 while (*target == '/') { 1278 target++; 1279 } 1280 } 1281 1282 while (*target != NULL) { 1283 sp = target; 1284 while ((*target != '/') && (*target != '\0')) { 1285 target ++; 1286 } 1287 /* 1288 * We got the next component of the 1289 * path name. Create path_comp of 1290 * appropriate type 1291 */ 1292 if (((target - sp) == 1) && (*sp == '.')) { 1293 /* 1294 * Dot entry. 1295 */ 1296 pc->pc_type = 4; 1297 pc = (struct path_comp *)(((char *)pc) + 4); 1298 } else if (((target - sp) == 2) && 1299 (*sp == '.') && ((*(sp + 1)) == '.')) { 1300 /* 1301 * DotDot entry. 
1302 */ 1303 pc->pc_type = 3; 1304 pc = (struct path_comp *)(((char *)pc) + 4); 1305 } else { 1306 /* 1307 * convert the user given name 1308 * into appropriate form to be put 1309 * on the media 1310 */ 1311 outlen = 1024; /* set to size of dname */ 1312 if (error = ud_compress(target - sp, &outlen, 1313 (uint8_t *)sp, (uint8_t *)dname)) { 1314 break; 1315 } 1316 pc->pc_type = 5; 1317 /* LINTED */ 1318 pc->pc_len = outlen; 1319 dname[outlen] = '\0'; 1320 (void) strcpy((char *)pc->pc_id, dname); 1321 pc = (struct path_comp *) 1322 (((char *)pc) + 4 + outlen); 1323 } 1324 while (*target == '/') { 1325 target++; 1326 } 1327 if (*target == NULL) { 1328 break; 1329 } 1330 } 1331 1332 rw_enter(&ip->i_contents, RW_WRITER); 1333 if (error == 0) { 1334 ioflag = FWRITE; 1335 if (curthread->t_flag & T_DONTPEND) { 1336 ioflag |= FDSYNC; 1337 } 1338 error = ud_rdwri(UIO_WRITE, ioflag, ip, 1339 uname, ((int8_t *)pc) - uname, 1340 (offset_t)0, UIO_SYSSPACE, (int32_t *)0, cr); 1341 } 1342 if (error) { 1343 ud_idrop(ip); 1344 rw_exit(&ip->i_contents); 1345 rw_enter(&dip->i_rwlock, RW_WRITER); 1346 (void) ud_dirremove(dip, linkname, (struct ud_inode *)0, 1347 (struct vnode *)0, DR_REMOVE, cr, ct); 1348 rw_exit(&dip->i_rwlock); 1349 goto update_inode; 1350 } 1351 rw_exit(&ip->i_contents); 1352 } 1353 1354 if ((error == 0) || (error == EEXIST)) { 1355 VN_RELE(ITOV(ip)); 1356 } 1357 1358 update_inode: 1359 ITIMES(VTOI(dvp)); 1360 if (uname != NULL) { 1361 kmem_free(uname, PAGESIZE); 1362 } 1363 if (dname != NULL) { 1364 kmem_free(dname, 1024); 1365 } 1366 1367 return (error); 1368 } 1369 1370 /* ARGSUSED */ 1371 static int32_t 1372 udf_readlink( 1373 struct vnode *vp, 1374 struct uio *uiop, 1375 struct cred *cr, 1376 caller_context_t *ct) 1377 { 1378 int32_t error = 0, off, id_len, size, len; 1379 int8_t *dname = NULL, *uname = NULL; 1380 struct ud_inode *ip; 1381 struct fbuf *fbp = NULL; 1382 struct path_comp *pc; 1383 1384 ud_printf("udf_readlink\n"); 1385 1386 if (vp->v_type != VLNK) { 1387 return (EINVAL); 1388 } 1389 1390 ip = VTOI(vp); 1391 size = ip->i_size; 1392 if (size > PAGESIZE) { 1393 return (EIO); 1394 } 1395 1396 if (size == 0) { 1397 return (0); 1398 } 1399 1400 dname = kmem_zalloc(1024, KM_SLEEP); 1401 uname = kmem_zalloc(PAGESIZE, KM_SLEEP); 1402 1403 rw_enter(&ip->i_contents, RW_READER); 1404 1405 if ((error = fbread(vp, 0, size, S_READ, &fbp)) != 0) { 1406 goto end; 1407 } 1408 1409 off = 0; 1410 1411 while (off < size) { 1412 pc = (struct path_comp *)(fbp->fb_addr + off); 1413 switch (pc->pc_type) { 1414 case 1 : 1415 (void) strcpy(uname, ip->i_udf->udf_fsmnt); 1416 (void) strcat(uname, "/"); 1417 break; 1418 case 2 : 1419 if (pc->pc_len != 0) { 1420 goto end; 1421 } 1422 uname[0] = '/'; 1423 uname[1] = '\0'; 1424 break; 1425 case 3 : 1426 (void) strcat(uname, "../"); 1427 break; 1428 case 4 : 1429 (void) strcat(uname, "./"); 1430 break; 1431 case 5 : 1432 if ((error = ud_uncompress(pc->pc_len, &id_len, 1433 pc->pc_id, (uint8_t *)dname)) != 0) { 1434 break; 1435 } 1436 dname[id_len] = '\0'; 1437 (void) strcat(uname, dname); 1438 (void) strcat(uname, "/"); 1439 break; 1440 default : 1441 error = EINVAL; 1442 goto end; 1443 } 1444 off += 4 + pc->pc_len; 1445 } 1446 len = strlen(uname) - 1; 1447 if (uname[len] == '/') { 1448 if (len == 0) { 1449 /* 1450 * special case link to / 1451 */ 1452 len = 1; 1453 } else { 1454 uname[len] = '\0'; 1455 } 1456 } 1457 1458 error = uiomove(uname, len, UIO_READ, uiop); 1459 1460 ITIMES(ip); 1461 1462 end: 1463 if (fbp != NULL) { 1464 fbrelse(fbp, S_OTHER); 
1465 } 1466 rw_exit(&ip->i_contents); 1467 if (uname != NULL) { 1468 kmem_free(uname, PAGESIZE); 1469 } 1470 if (dname != NULL) { 1471 kmem_free(dname, 1024); 1472 } 1473 return (error); 1474 } 1475 1476 /* ARGSUSED */ 1477 static int32_t 1478 udf_fsync( 1479 struct vnode *vp, 1480 int32_t syncflag, 1481 struct cred *cr, 1482 caller_context_t *ct) 1483 { 1484 int32_t error = 0; 1485 struct ud_inode *ip = VTOI(vp); 1486 1487 ud_printf("udf_fsync\n"); 1488 1489 rw_enter(&ip->i_contents, RW_WRITER); 1490 if (!(IS_SWAPVP(vp))) { 1491 error = ud_syncip(ip, 0, I_SYNC); /* Do synchronous writes */ 1492 } 1493 if (error == 0) { 1494 error = ud_sync_indir(ip); 1495 } 1496 ITIMES(ip); /* XXX: is this necessary ??? */ 1497 rw_exit(&ip->i_contents); 1498 1499 return (error); 1500 } 1501 1502 /* ARGSUSED */ 1503 static void 1504 udf_inactive(struct vnode *vp, struct cred *cr, caller_context_t *ct) 1505 { 1506 ud_printf("udf_iinactive\n"); 1507 1508 ud_iinactive(VTOI(vp), cr); 1509 } 1510 1511 /* ARGSUSED */ 1512 static int32_t 1513 udf_fid(struct vnode *vp, struct fid *fidp, caller_context_t *ct) 1514 { 1515 struct udf_fid *udfidp; 1516 struct ud_inode *ip = VTOI(vp); 1517 1518 ud_printf("udf_fid\n"); 1519 1520 if (fidp->fid_len < (sizeof (struct udf_fid) - sizeof (uint16_t))) { 1521 fidp->fid_len = sizeof (struct udf_fid) - sizeof (uint16_t); 1522 return (ENOSPC); 1523 } 1524 1525 udfidp = (struct udf_fid *)fidp; 1526 bzero((char *)udfidp, sizeof (struct udf_fid)); 1527 rw_enter(&ip->i_contents, RW_READER); 1528 udfidp->udfid_len = sizeof (struct udf_fid) - sizeof (uint16_t); 1529 udfidp->udfid_uinq_lo = ip->i_uniqid & 0xffffffff; 1530 udfidp->udfid_prn = ip->i_icb_prn; 1531 udfidp->udfid_icb_lbn = ip->i_icb_block; 1532 rw_exit(&ip->i_contents); 1533 1534 return (0); 1535 } 1536 1537 /* ARGSUSED2 */ 1538 static int 1539 udf_rwlock(struct vnode *vp, int32_t write_lock, caller_context_t *ctp) 1540 { 1541 struct ud_inode *ip = VTOI(vp); 1542 1543 ud_printf("udf_rwlock\n"); 1544 1545 if (write_lock) { 1546 rw_enter(&ip->i_rwlock, RW_WRITER); 1547 } else { 1548 rw_enter(&ip->i_rwlock, RW_READER); 1549 } 1550 #ifdef __lock_lint 1551 rw_exit(&ip->i_rwlock); 1552 #endif 1553 return (write_lock); 1554 } 1555 1556 /* ARGSUSED */ 1557 static void 1558 udf_rwunlock(struct vnode *vp, int32_t write_lock, caller_context_t *ctp) 1559 { 1560 struct ud_inode *ip = VTOI(vp); 1561 1562 ud_printf("udf_rwunlock\n"); 1563 1564 #ifdef __lock_lint 1565 rw_enter(&ip->i_rwlock, RW_WRITER); 1566 #endif 1567 1568 rw_exit(&ip->i_rwlock); 1569 1570 } 1571 1572 /* ARGSUSED */ 1573 static int32_t 1574 udf_seek(struct vnode *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct) 1575 { 1576 return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0); 1577 } 1578 1579 static int32_t 1580 udf_frlock( 1581 struct vnode *vp, 1582 int32_t cmd, 1583 struct flock64 *bfp, 1584 int32_t flag, 1585 offset_t offset, 1586 struct flk_callback *flk_cbp, 1587 cred_t *cr, 1588 caller_context_t *ct) 1589 { 1590 struct ud_inode *ip = VTOI(vp); 1591 1592 ud_printf("udf_frlock\n"); 1593 1594 /* 1595 * If file is being mapped, disallow frlock. 1596 * XXX I am not holding tlock while checking i_mapcnt because the 1597 * current locking strategy drops all locks before calling fs_frlock. 1598 * So, mapcnt could change before we enter fs_frlock making is 1599 * meaningless to have held tlock in the first place. 
1600 */ 1601 if ((ip->i_mapcnt > 0) && 1602 (MANDLOCK(vp, ip->i_char))) { 1603 return (EAGAIN); 1604 } 1605 1606 return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct)); 1607 } 1608 1609 /*ARGSUSED6*/ 1610 static int32_t 1611 udf_space( 1612 struct vnode *vp, 1613 int32_t cmd, 1614 struct flock64 *bfp, 1615 int32_t flag, 1616 offset_t offset, 1617 cred_t *cr, 1618 caller_context_t *ct) 1619 { 1620 int32_t error = 0; 1621 1622 ud_printf("udf_space\n"); 1623 1624 if (cmd != F_FREESP) { 1625 error = EINVAL; 1626 } else if ((error = convoff(vp, bfp, 0, offset)) == 0) { 1627 error = ud_freesp(vp, bfp, flag, cr); 1628 } 1629 1630 return (error); 1631 } 1632 1633 /* ARGSUSED */ 1634 static int32_t 1635 udf_getpage( 1636 struct vnode *vp, 1637 offset_t off, 1638 size_t len, 1639 uint32_t *protp, 1640 struct page **plarr, 1641 size_t plsz, 1642 struct seg *seg, 1643 caddr_t addr, 1644 enum seg_rw rw, 1645 struct cred *cr, 1646 caller_context_t *ct) 1647 { 1648 struct ud_inode *ip = VTOI(vp); 1649 int32_t error, has_holes, beyond_eof, seqmode, dolock; 1650 int32_t pgsize = PAGESIZE; 1651 struct udf_vfs *udf_vfsp = ip->i_udf; 1652 page_t **pl; 1653 u_offset_t pgoff, eoff, uoff; 1654 krw_t rwtype; 1655 caddr_t pgaddr; 1656 1657 ud_printf("udf_getpage\n"); 1658 1659 uoff = (u_offset_t)off; /* type conversion */ 1660 if (protp) { 1661 *protp = PROT_ALL; 1662 } 1663 if (vp->v_flag & VNOMAP) { 1664 return (ENOSYS); 1665 } 1666 seqmode = ip->i_nextr == uoff && rw != S_CREATE; 1667 1668 rwtype = RW_READER; 1669 dolock = (rw_owner(&ip->i_contents) != curthread); 1670 retrylock: 1671 #ifdef __lock_lint 1672 rw_enter(&ip->i_contents, rwtype); 1673 #else 1674 if (dolock) { 1675 rw_enter(&ip->i_contents, rwtype); 1676 } 1677 #endif 1678 1679 /* 1680 * We may be getting called as a side effect of a bmap using 1681 * fbread() when the blocks might be being allocated and the 1682 * size has not yet been up'ed. In this case we want to be 1683 * able to return zero pages if we get back UDF_HOLE from 1684 * calling bmap for a non write case here. We also might have 1685 * to read some frags from the disk into a page if we are 1686 * extending the number of frags for a given lbn in bmap(). 1687 */ 1688 beyond_eof = uoff + len > ip->i_size + PAGEOFFSET; 1689 if (beyond_eof && seg != segkmap) { 1690 #ifdef __lock_lint 1691 rw_exit(&ip->i_contents); 1692 #else 1693 if (dolock) { 1694 rw_exit(&ip->i_contents); 1695 } 1696 #endif 1697 return (EFAULT); 1698 } 1699 1700 /* 1701 * Must hold i_contents lock throughout the call to pvn_getpages 1702 * since locked pages are returned from each call to ud_getapage. 1703 * Must *not* return locked pages and then try for contents lock 1704 * due to lock ordering requirements (inode > page) 1705 */ 1706 1707 has_holes = ud_bmap_has_holes(ip); 1708 1709 if ((rw == S_WRITE || rw == S_CREATE) && (has_holes || beyond_eof)) { 1710 int32_t blk_size, count; 1711 u_offset_t offset; 1712 1713 /* 1714 * We must acquire the RW_WRITER lock in order to 1715 * call bmap_write(). 1716 */ 1717 if (dolock && rwtype == RW_READER) { 1718 rwtype = RW_WRITER; 1719 1720 if (!rw_tryupgrade(&ip->i_contents)) { 1721 1722 rw_exit(&ip->i_contents); 1723 1724 goto retrylock; 1725 } 1726 } 1727 1728 /* 1729 * May be allocating disk blocks for holes here as 1730 * a result of mmap faults. write(2) does the bmap_write 1731 * in rdip/wrip, not here. We are not dealing with frags 1732 * in this case. 
1733 */ 1734 offset = uoff; 1735 while ((offset < uoff + len) && 1736 (offset < ip->i_size)) { 1737 /* 1738 * the variable "bnp" is to simplify the expression for 1739 * the compiler; * just passing in &bn to bmap_write 1740 * causes a compiler "loop" 1741 */ 1742 1743 blk_size = udf_vfsp->udf_lbsize; 1744 if ((offset + blk_size) > ip->i_size) { 1745 count = ip->i_size - offset; 1746 } else { 1747 count = blk_size; 1748 } 1749 error = ud_bmap_write(ip, offset, count, 0, cr); 1750 if (error) { 1751 goto update_inode; 1752 } 1753 offset += count; /* XXX - make this contig */ 1754 } 1755 } 1756 1757 /* 1758 * Can be a reader from now on. 1759 */ 1760 #ifdef __lock_lint 1761 if (rwtype == RW_WRITER) { 1762 rw_downgrade(&ip->i_contents); 1763 } 1764 #else 1765 if (dolock && rwtype == RW_WRITER) { 1766 rw_downgrade(&ip->i_contents); 1767 } 1768 #endif 1769 1770 /* 1771 * We remove PROT_WRITE in cases when the file has UDF holes 1772 * because we don't want to call bmap_read() to check each 1773 * page if it is backed with a disk block. 1774 */ 1775 if (protp && has_holes && rw != S_WRITE && rw != S_CREATE) { 1776 *protp &= ~PROT_WRITE; 1777 } 1778 1779 error = 0; 1780 1781 /* 1782 * The loop looks up pages in the range <off, off + len). 1783 * For each page, we first check if we should initiate an asynchronous 1784 * read ahead before we call page_lookup (we may sleep in page_lookup 1785 * for a previously initiated disk read). 1786 */ 1787 eoff = (uoff + len); 1788 for (pgoff = uoff, pgaddr = addr, pl = plarr; 1789 pgoff < eoff; /* empty */) { 1790 page_t *pp; 1791 u_offset_t nextrio; 1792 se_t se; 1793 1794 se = ((rw == S_CREATE) ? SE_EXCL : SE_SHARED); 1795 1796 /* 1797 * Handle async getpage (faultahead) 1798 */ 1799 if (plarr == NULL) { 1800 ip->i_nextrio = pgoff; 1801 ud_getpage_ra(vp, pgoff, seg, pgaddr); 1802 pgoff += pgsize; 1803 pgaddr += pgsize; 1804 continue; 1805 } 1806 1807 /* 1808 * Check if we should initiate read ahead of next cluster. 1809 * We call page_exists only when we need to confirm that 1810 * we have the current page before we initiate the read ahead. 1811 */ 1812 nextrio = ip->i_nextrio; 1813 if (seqmode && 1814 pgoff + RD_CLUSTSZ(ip) >= nextrio && pgoff <= nextrio && 1815 nextrio < ip->i_size && page_exists(vp, pgoff)) 1816 ud_getpage_ra(vp, pgoff, seg, pgaddr); 1817 1818 if ((pp = page_lookup(vp, pgoff, se)) != NULL) { 1819 1820 /* 1821 * We found the page in the page cache. 1822 */ 1823 *pl++ = pp; 1824 pgoff += pgsize; 1825 pgaddr += pgsize; 1826 len -= pgsize; 1827 plsz -= pgsize; 1828 } else { 1829 1830 /* 1831 * We have to create the page, or read it from disk. 1832 */ 1833 if (error = ud_getpage_miss(vp, pgoff, len, 1834 seg, pgaddr, pl, plsz, rw, seqmode)) { 1835 goto error_out; 1836 } 1837 1838 while (*pl != NULL) { 1839 pl++; 1840 pgoff += pgsize; 1841 pgaddr += pgsize; 1842 len -= pgsize; 1843 plsz -= pgsize; 1844 } 1845 } 1846 } 1847 1848 /* 1849 * Return pages up to plsz if they are in the page cache. 1850 * We cannot return pages if there is a chance that they are 1851 * backed with a UDF hole and rw is S_WRITE or S_CREATE. 
1852 */ 1853 if (plarr && !(has_holes && (rw == S_WRITE || rw == S_CREATE))) { 1854 1855 ASSERT((protp == NULL) || 1856 !(has_holes && (*protp & PROT_WRITE))); 1857 1858 eoff = pgoff + plsz; 1859 while (pgoff < eoff) { 1860 page_t *pp; 1861 1862 if ((pp = page_lookup_nowait(vp, pgoff, 1863 SE_SHARED)) == NULL) 1864 break; 1865 1866 *pl++ = pp; 1867 pgoff += pgsize; 1868 plsz -= pgsize; 1869 } 1870 } 1871 1872 if (plarr) 1873 *pl = NULL; /* Terminate page list */ 1874 ip->i_nextr = pgoff; 1875 1876 error_out: 1877 if (error && plarr) { 1878 /* 1879 * Release any pages we have locked. 1880 */ 1881 while (pl > &plarr[0]) 1882 page_unlock(*--pl); 1883 1884 plarr[0] = NULL; 1885 } 1886 1887 update_inode: 1888 #ifdef __lock_lint 1889 rw_exit(&ip->i_contents); 1890 #else 1891 if (dolock) { 1892 rw_exit(&ip->i_contents); 1893 } 1894 #endif 1895 1896 /* 1897 * If the inode is not already marked for IACC (in rwip() for read) 1898 * and the inode is not marked for no access time update (in rwip() 1899 * for write) then update the inode access time and mod time now. 1900 */ 1901 mutex_enter(&ip->i_tlock); 1902 if ((ip->i_flag & (IACC | INOACC)) == 0) { 1903 if ((rw != S_OTHER) && (ip->i_type != VDIR)) { 1904 ip->i_flag |= IACC; 1905 } 1906 if (rw == S_WRITE) { 1907 ip->i_flag |= IUPD; 1908 } 1909 ITIMES_NOLOCK(ip); 1910 } 1911 mutex_exit(&ip->i_tlock); 1912 1913 return (error); 1914 } 1915 1916 int32_t ud_delay = 1; 1917 1918 /* ARGSUSED */ 1919 static int32_t 1920 udf_putpage( 1921 struct vnode *vp, 1922 offset_t off, 1923 size_t len, 1924 int32_t flags, 1925 struct cred *cr, 1926 caller_context_t *ct) 1927 { 1928 struct ud_inode *ip; 1929 int32_t error = 0; 1930 1931 ud_printf("udf_putpage\n"); 1932 1933 ip = VTOI(vp); 1934 #ifdef __lock_lint 1935 rw_enter(&ip->i_contents, RW_WRITER); 1936 #endif 1937 1938 if (vp->v_count == 0) { 1939 cmn_err(CE_WARN, "ud_putpage : bad v_count"); 1940 error = EINVAL; 1941 goto out; 1942 } 1943 1944 if (vp->v_flag & VNOMAP) { 1945 error = ENOSYS; 1946 goto out; 1947 } 1948 1949 if (flags & B_ASYNC) { 1950 if (ud_delay && len && 1951 (flags & ~(B_ASYNC|B_DONTNEED|B_FREE)) == 0) { 1952 mutex_enter(&ip->i_tlock); 1953 1954 /* 1955 * If nobody stalled, start a new cluster. 1956 */ 1957 if (ip->i_delaylen == 0) { 1958 ip->i_delayoff = off; 1959 ip->i_delaylen = len; 1960 mutex_exit(&ip->i_tlock); 1961 goto out; 1962 } 1963 1964 /* 1965 * If we have a full cluster or they are not contig, 1966 * then push last cluster and start over. 1967 */ 1968 if (ip->i_delaylen >= WR_CLUSTSZ(ip) || 1969 ip->i_delayoff + ip->i_delaylen != off) { 1970 u_offset_t doff; 1971 size_t dlen; 1972 1973 doff = ip->i_delayoff; 1974 dlen = ip->i_delaylen; 1975 ip->i_delayoff = off; 1976 ip->i_delaylen = len; 1977 mutex_exit(&ip->i_tlock); 1978 error = ud_putpages(vp, doff, dlen, flags, cr); 1979 /* LMXXX - flags are new val, not old */ 1980 goto out; 1981 } 1982 1983 /* 1984 * There is something there, it's not full, and 1985 * it is contig. 1986 */ 1987 ip->i_delaylen += len; 1988 mutex_exit(&ip->i_tlock); 1989 goto out; 1990 } 1991 1992 /* 1993 * Must have weird flags or we are not clustering. 
1994 */ 1995 } 1996 1997 error = ud_putpages(vp, off, len, flags, cr); 1998 1999 out: 2000 #ifdef __lock_lint 2001 rw_exit(&ip->i_contents); 2002 #endif 2003 return (error); 2004 } 2005 2006 /* ARGSUSED */ 2007 static int32_t 2008 udf_map( 2009 struct vnode *vp, 2010 offset_t off, 2011 struct as *as, 2012 caddr_t *addrp, 2013 size_t len, 2014 uint8_t prot, 2015 uint8_t maxprot, 2016 uint32_t flags, 2017 struct cred *cr, 2018 caller_context_t *ct) 2019 { 2020 struct segvn_crargs vn_a; 2021 int32_t error = 0; 2022 2023 ud_printf("udf_map\n"); 2024 2025 if (vp->v_flag & VNOMAP) { 2026 error = ENOSYS; 2027 goto end; 2028 } 2029 2030 if ((off < (offset_t)0) || 2031 ((off + len) < (offset_t)0)) { 2032 error = EINVAL; 2033 goto end; 2034 } 2035 2036 if (vp->v_type != VREG) { 2037 error = ENODEV; 2038 goto end; 2039 } 2040 2041 /* 2042 * If file is being locked, disallow mapping. 2043 */ 2044 if (vn_has_mandatory_locks(vp, VTOI(vp)->i_char)) { 2045 error = EAGAIN; 2046 goto end; 2047 } 2048 2049 as_rangelock(as); 2050 error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags); 2051 if (error != 0) { 2052 as_rangeunlock(as); 2053 goto end; 2054 } 2055 2056 vn_a.vp = vp; 2057 vn_a.offset = off; 2058 vn_a.type = flags & MAP_TYPE; 2059 vn_a.prot = prot; 2060 vn_a.maxprot = maxprot; 2061 vn_a.cred = cr; 2062 vn_a.amp = NULL; 2063 vn_a.flags = flags & ~MAP_TYPE; 2064 vn_a.szc = 0; 2065 vn_a.lgrp_mem_policy_flags = 0; 2066 2067 error = as_map(as, *addrp, len, segvn_create, (caddr_t)&vn_a); 2068 as_rangeunlock(as); 2069 2070 end: 2071 return (error); 2072 } 2073 2074 /* ARGSUSED */ 2075 static int32_t 2076 udf_addmap(struct vnode *vp, 2077 offset_t off, 2078 struct as *as, 2079 caddr_t addr, 2080 size_t len, 2081 uint8_t prot, 2082 uint8_t maxprot, 2083 uint32_t flags, 2084 struct cred *cr, 2085 caller_context_t *ct) 2086 { 2087 struct ud_inode *ip = VTOI(vp); 2088 2089 ud_printf("udf_addmap\n"); 2090 2091 if (vp->v_flag & VNOMAP) { 2092 return (ENOSYS); 2093 } 2094 2095 mutex_enter(&ip->i_tlock); 2096 ip->i_mapcnt += btopr(len); 2097 mutex_exit(&ip->i_tlock); 2098 2099 return (0); 2100 } 2101 2102 /* ARGSUSED */ 2103 static int32_t 2104 udf_delmap( 2105 struct vnode *vp, offset_t off, 2106 struct as *as, 2107 caddr_t addr, 2108 size_t len, 2109 uint32_t prot, 2110 uint32_t maxprot, 2111 uint32_t flags, 2112 struct cred *cr, 2113 caller_context_t *ct) 2114 { 2115 struct ud_inode *ip = VTOI(vp); 2116 2117 ud_printf("udf_delmap\n"); 2118 2119 if (vp->v_flag & VNOMAP) { 2120 return (ENOSYS); 2121 } 2122 2123 mutex_enter(&ip->i_tlock); 2124 ip->i_mapcnt -= btopr(len); /* Count released mappings */ 2125 ASSERT(ip->i_mapcnt >= 0); 2126 mutex_exit(&ip->i_tlock); 2127 2128 return (0); 2129 } 2130 2131 /* ARGSUSED */ 2132 static int32_t 2133 udf_l_pathconf( 2134 struct vnode *vp, 2135 int32_t cmd, 2136 ulong_t *valp, 2137 struct cred *cr, 2138 caller_context_t *ct) 2139 { 2140 int32_t error = 0; 2141 2142 ud_printf("udf_l_pathconf\n"); 2143 2144 if (cmd == _PC_FILESIZEBITS) { 2145 /* 2146 * udf supports 64 bits as file size 2147 * but there are several other restrictions 2148 * it only supports 32-bit block numbers and 2149 * daddr32_t is only and int32_t so taking these 2150 * into account we can stay just as where ufs is 2151 */ 2152 *valp = 41; 2153 } else if (cmd == _PC_TIMESTAMP_RESOLUTION) { 2154 /* nanosecond timestamp resolution */ 2155 *valp = 1L; 2156 } else { 2157 error = fs_pathconf(vp, cmd, valp, cr, ct); 2158 } 2159 2160 return (error); 2161 } 2162 2163 uint32_t ud_pageio_reads = 0, 
ud_pageio_writes = 0; 2164 #ifndef __lint 2165 _NOTE(SCHEME_PROTECTS_DATA("safe sharing", ud_pageio_reads)) 2166 _NOTE(SCHEME_PROTECTS_DATA("safe sharing", ud_pageio_writes)) 2167 #endif 2168 /* 2169 * Assumption is that there will not be a pageio request 2170 * to a enbedded file 2171 */ 2172 /* ARGSUSED */ 2173 static int32_t 2174 udf_pageio( 2175 struct vnode *vp, 2176 struct page *pp, 2177 u_offset_t io_off, 2178 size_t io_len, 2179 int32_t flags, 2180 struct cred *cr, 2181 caller_context_t *ct) 2182 { 2183 daddr_t bn; 2184 struct buf *bp; 2185 struct ud_inode *ip = VTOI(vp); 2186 int32_t dolock, error = 0, contig, multi_io; 2187 size_t done_len = 0, cur_len = 0; 2188 page_t *npp = NULL, *opp = NULL, *cpp = pp; 2189 2190 if (pp == NULL) { 2191 return (EINVAL); 2192 } 2193 2194 dolock = (rw_owner(&ip->i_contents) != curthread); 2195 2196 /* 2197 * We need a better check. Ideally, we would use another 2198 * vnodeops so that hlocked and forcibly unmounted file 2199 * systems would return EIO where appropriate and w/o the 2200 * need for these checks. 2201 */ 2202 if (ip->i_udf == NULL) { 2203 return (EIO); 2204 } 2205 2206 #ifdef __lock_lint 2207 rw_enter(&ip->i_contents, RW_READER); 2208 #else 2209 if (dolock) { 2210 rw_enter(&ip->i_contents, RW_READER); 2211 } 2212 #endif 2213 2214 /* 2215 * Break the io request into chunks, one for each contiguous 2216 * stretch of disk blocks in the target file. 2217 */ 2218 while (done_len < io_len) { 2219 ASSERT(cpp); 2220 bp = NULL; 2221 contig = 0; 2222 if (error = ud_bmap_read(ip, (u_offset_t)(io_off + done_len), 2223 &bn, &contig)) { 2224 break; 2225 } 2226 2227 if (bn == UDF_HOLE) { /* No holey swapfiles */ 2228 cmn_err(CE_WARN, "SWAP file has HOLES"); 2229 error = EINVAL; 2230 break; 2231 } 2232 2233 cur_len = MIN(io_len - done_len, contig); 2234 2235 /* 2236 * Check if more than one I/O is 2237 * required to complete the given 2238 * I/O operation 2239 */ 2240 if (ip->i_udf->udf_lbsize < PAGESIZE) { 2241 if (cur_len >= PAGESIZE) { 2242 multi_io = 0; 2243 cur_len &= PAGEMASK; 2244 } else { 2245 multi_io = 1; 2246 cur_len = MIN(io_len - done_len, PAGESIZE); 2247 } 2248 } 2249 page_list_break(&cpp, &npp, btop(cur_len)); 2250 2251 bp = pageio_setup(cpp, cur_len, ip->i_devvp, flags); 2252 ASSERT(bp != NULL); 2253 2254 bp->b_edev = ip->i_dev; 2255 bp->b_dev = cmpdev(ip->i_dev); 2256 bp->b_blkno = bn; 2257 bp->b_un.b_addr = (caddr_t)0; 2258 bp->b_file = vp; 2259 bp->b_offset = (offset_t)(io_off + done_len); 2260 2261 /* 2262 * ub.ub_pageios.value.ul++; 2263 */ 2264 if (multi_io == 0) { 2265 (void) bdev_strategy(bp); 2266 } else { 2267 error = ud_multi_strat(ip, cpp, bp, 2268 (u_offset_t)(io_off + done_len)); 2269 if (error != 0) { 2270 pageio_done(bp); 2271 break; 2272 } 2273 } 2274 if (flags & B_READ) { 2275 ud_pageio_reads++; 2276 } else { 2277 ud_pageio_writes++; 2278 } 2279 2280 /* 2281 * If the request is not B_ASYNC, wait for i/o to complete 2282 * and re-assemble the page list to return to the caller. 2283 * If it is B_ASYNC we leave the page list in pieces and 2284 * cleanup() will dispose of them. 
2285 */ 2286 if ((flags & B_ASYNC) == 0) { 2287 error = biowait(bp); 2288 pageio_done(bp); 2289 if (error) { 2290 break; 2291 } 2292 page_list_concat(&opp, &cpp); 2293 } 2294 cpp = npp; 2295 npp = NULL; 2296 done_len += cur_len; 2297 } 2298 2299 ASSERT(error || (cpp == NULL && npp == NULL && done_len == io_len)); 2300 if (error) { 2301 if (flags & B_ASYNC) { 2302 /* Cleanup unprocessed parts of list */ 2303 page_list_concat(&cpp, &npp); 2304 if (flags & B_READ) { 2305 pvn_read_done(cpp, B_ERROR); 2306 } else { 2307 pvn_write_done(cpp, B_ERROR); 2308 } 2309 } else { 2310 /* Re-assemble list and let caller clean up */ 2311 page_list_concat(&opp, &cpp); 2312 page_list_concat(&opp, &npp); 2313 } 2314 } 2315 2316 #ifdef __lock_lint 2317 rw_exit(&ip->i_contents); 2318 #else 2319 if (dolock) { 2320 rw_exit(&ip->i_contents); 2321 } 2322 #endif 2323 return (error); 2324 } 2325 2326 2327 2328 2329 /* -------------------- local functions --------------------------- */ 2330 2331 2332 2333 int32_t 2334 ud_rdwri(enum uio_rw rw, int32_t ioflag, 2335 struct ud_inode *ip, caddr_t base, int32_t len, 2336 offset_t offset, enum uio_seg seg, int32_t *aresid, struct cred *cr) 2337 { 2338 int32_t error; 2339 struct uio auio; 2340 struct iovec aiov; 2341 2342 ud_printf("ud_rdwri\n"); 2343 2344 bzero((caddr_t)&auio, sizeof (uio_t)); 2345 bzero((caddr_t)&aiov, sizeof (iovec_t)); 2346 2347 aiov.iov_base = base; 2348 aiov.iov_len = len; 2349 auio.uio_iov = &aiov; 2350 auio.uio_iovcnt = 1; 2351 auio.uio_loffset = offset; 2352 auio.uio_segflg = (int16_t)seg; 2353 auio.uio_resid = len; 2354 2355 if (rw == UIO_WRITE) { 2356 auio.uio_fmode = FWRITE; 2357 auio.uio_extflg = UIO_COPY_DEFAULT; 2358 auio.uio_llimit = curproc->p_fsz_ctl; 2359 error = ud_wrip(ip, &auio, ioflag, cr); 2360 } else { 2361 auio.uio_fmode = FREAD; 2362 auio.uio_extflg = UIO_COPY_CACHED; 2363 auio.uio_llimit = MAXOFFSET_T; 2364 error = ud_rdip(ip, &auio, ioflag, cr); 2365 } 2366 2367 if (aresid) { 2368 *aresid = auio.uio_resid; 2369 } else if (auio.uio_resid) { 2370 error = EIO; 2371 } 2372 return (error); 2373 } 2374 2375 /* 2376 * Free behind hacks. The pager is busted. 2377 * XXX - need to pass the information down to writedone() in a flag like B_SEQ 2378 * or B_FREE_IF_TIGHT_ON_MEMORY. 2379 */ 2380 int32_t ud_freebehind = 1; 2381 int32_t ud_smallfile = 32 * 1024; 2382 2383 /* ARGSUSED */ 2384 int32_t 2385 ud_getpage_miss(struct vnode *vp, u_offset_t off, 2386 size_t len, struct seg *seg, caddr_t addr, page_t *pl[], 2387 size_t plsz, enum seg_rw rw, int32_t seq) 2388 { 2389 struct ud_inode *ip = VTOI(vp); 2390 int32_t err = 0; 2391 size_t io_len; 2392 u_offset_t io_off; 2393 u_offset_t pgoff; 2394 page_t *pp; 2395 2396 pl[0] = NULL; 2397 2398 /* 2399 * Figure out whether the page can be created, or must be 2400 * read from the disk 2401 */ 2402 if (rw == S_CREATE) { 2403 if ((pp = page_create_va(vp, off, 2404 PAGESIZE, PG_WAIT, seg, addr)) == NULL) { 2405 cmn_err(CE_WARN, "ud_getpage_miss: page_create"); 2406 return (EINVAL); 2407 } 2408 io_len = PAGESIZE; 2409 } else { 2410 pp = pvn_read_kluster(vp, off, seg, addr, &io_off, 2411 &io_len, off, PAGESIZE, 0); 2412 2413 /* 2414 * Some other thread has entered the page. 2415 * ud_getpage will retry page_lookup. 2416 */ 2417 if (pp == NULL) { 2418 return (0); 2419 } 2420 2421 /* 2422 * Fill the page with as much data as we can from the file. 2423 */ 2424 err = ud_page_fill(ip, pp, off, B_READ, &pgoff); 2425 if (err) { 2426 pvn_read_done(pp, B_ERROR); 2427 return (err); 2428 } 2429 2430 /* 2431 * XXX ??? 
 * ufs has io_len instead of pgoff below
 */
		ip->i_nextrio = off + ((pgoff + PAGESIZE - 1) & PAGEMASK);

		/*
		 * If the file access is sequential, initiate read ahead
		 * of the next cluster.
		 */
		if (seq && ip->i_nextrio < ip->i_size) {
			ud_getpage_ra(vp, off, seg, addr);
		}
	}

outmiss:
	pvn_plist_init(pp, pl, plsz, (offset_t)off, io_len, rw);
	return (err);
}

/* ARGSUSED */
void
ud_getpage_ra(struct vnode *vp,
	u_offset_t off, struct seg *seg, caddr_t addr)
{
	page_t *pp;
	size_t io_len;
	struct ud_inode *ip = VTOI(vp);
	u_offset_t io_off = ip->i_nextrio, pgoff;
	caddr_t addr2 = addr + (io_off - off);
	daddr_t bn;
	int32_t contig = 0;

	/*
	 * Is this test needed?
	 */
	if (addr2 >= seg->s_base + seg->s_size) {
		return;
	}

	contig = 0;
	if (ud_bmap_read(ip, io_off, &bn, &contig) != 0 || bn == UDF_HOLE) {
		return;
	}

	pp = pvn_read_kluster(vp, io_off, seg, addr2,
	    &io_off, &io_len, io_off, PAGESIZE, 1);

	/*
	 * Some other thread has entered the page, so no read ahead
	 * is done here (i.e. we will have to wait for the read when
	 * it is actually needed).
	 */
	if (pp == NULL) {
		return;
	}

	(void) ud_page_fill(ip, pp, io_off, (B_READ|B_ASYNC), &pgoff);
	ip->i_nextrio = io_off + ((pgoff + PAGESIZE - 1) & PAGEMASK);
}

int
ud_page_fill(struct ud_inode *ip, page_t *pp, u_offset_t off,
	uint32_t bflgs, u_offset_t *pg_off)
{
	daddr_t bn;
	struct buf *bp;
	caddr_t kaddr, caddr;
	int32_t error = 0, contig = 0, multi_io = 0;
	int32_t lbsize = ip->i_udf->udf_lbsize;
	int32_t lbmask = ip->i_udf->udf_lbmask;
	uint64_t isize;

	isize = (ip->i_size + lbmask) & (~lbmask);
	if (ip->i_desc_type == ICB_FLAG_ONE_AD) {

		/*
		 * Embedded file: read the file_entry from the
		 * buffer cache and copy the required portion.
		 */
		bp = ud_bread(ip->i_dev,
		    ip->i_icb_lbano << ip->i_udf->udf_l2d_shift, lbsize);
		if ((bp->b_error == 0) &&
		    (bp->b_resid == 0)) {

			caddr = bp->b_un.b_addr + ip->i_data_off;

			/*
			 * mapin to kvm
			 */
			kaddr = (caddr_t)ppmapin(pp,
			    PROT_READ | PROT_WRITE, (caddr_t)-1);
			(void) kcopy(caddr, kaddr, ip->i_size);

			/*
			 * mapout of kvm
			 */
			ppmapout(kaddr);
		}
		brelse(bp);
		contig = ip->i_size;
	} else {

		/*
		 * Get the contiguous size and block number
		 * at offset "off"
		 */
		if (error = ud_bmap_read(ip, off, &bn, &contig))
			goto out;
		contig = MIN(contig, PAGESIZE);
		contig = (contig + lbmask) & (~lbmask);

		/*
		 * Zero the part of the page which we are not
		 * going to read from the disk.
		 */
		if (bn == UDF_HOLE) {

			/*
			 * This is a HOLE.
Just zero out 2553 * the page 2554 */ 2555 if (((off + contig) == isize) || 2556 (contig == PAGESIZE)) { 2557 pagezero(pp->p_prev, 0, PAGESIZE); 2558 goto out; 2559 } 2560 } 2561 2562 if (contig < PAGESIZE) { 2563 uint64_t count; 2564 2565 count = isize - off; 2566 if (contig != count) { 2567 multi_io = 1; 2568 contig = (int32_t)(MIN(count, PAGESIZE)); 2569 } else { 2570 pagezero(pp->p_prev, contig, PAGESIZE - contig); 2571 } 2572 } 2573 2574 /* 2575 * Get a bp and initialize it 2576 */ 2577 bp = pageio_setup(pp, contig, ip->i_devvp, bflgs); 2578 ASSERT(bp != NULL); 2579 2580 bp->b_edev = ip->i_dev; 2581 bp->b_dev = cmpdev(ip->i_dev); 2582 bp->b_blkno = bn; 2583 bp->b_un.b_addr = 0; 2584 bp->b_file = ip->i_vnode; 2585 2586 /* 2587 * Start I/O 2588 */ 2589 if (multi_io == 0) { 2590 2591 /* 2592 * Single I/O is sufficient for this page 2593 */ 2594 (void) bdev_strategy(bp); 2595 } else { 2596 2597 /* 2598 * We need to do the I/O in 2599 * piece's 2600 */ 2601 error = ud_multi_strat(ip, pp, bp, off); 2602 if (error != 0) { 2603 goto out; 2604 } 2605 } 2606 if ((bflgs & B_ASYNC) == 0) { 2607 2608 /* 2609 * Wait for i/o to complete. 2610 */ 2611 2612 error = biowait(bp); 2613 pageio_done(bp); 2614 if (error) { 2615 goto out; 2616 } 2617 } 2618 } 2619 if ((off + contig) >= ip->i_size) { 2620 contig = ip->i_size - off; 2621 } 2622 2623 out: 2624 *pg_off = contig; 2625 return (error); 2626 } 2627 2628 int32_t 2629 ud_putpages(struct vnode *vp, offset_t off, 2630 size_t len, int32_t flags, struct cred *cr) 2631 { 2632 struct ud_inode *ip; 2633 page_t *pp; 2634 u_offset_t io_off; 2635 size_t io_len; 2636 u_offset_t eoff; 2637 int32_t err = 0; 2638 int32_t dolock; 2639 2640 ud_printf("ud_putpages\n"); 2641 2642 if (vp->v_count == 0) { 2643 cmn_err(CE_WARN, "ud_putpages: bad v_count"); 2644 return (EINVAL); 2645 } 2646 2647 ip = VTOI(vp); 2648 2649 /* 2650 * Acquire the readers/write inode lock before locking 2651 * any pages in this inode. 2652 * The inode lock is held during i/o. 2653 */ 2654 if (len == 0) { 2655 mutex_enter(&ip->i_tlock); 2656 ip->i_delayoff = ip->i_delaylen = 0; 2657 mutex_exit(&ip->i_tlock); 2658 } 2659 #ifdef __lock_lint 2660 rw_enter(&ip->i_contents, RW_READER); 2661 #else 2662 dolock = (rw_owner(&ip->i_contents) != curthread); 2663 if (dolock) { 2664 rw_enter(&ip->i_contents, RW_READER); 2665 } 2666 #endif 2667 2668 if (!vn_has_cached_data(vp)) { 2669 #ifdef __lock_lint 2670 rw_exit(&ip->i_contents); 2671 #else 2672 if (dolock) { 2673 rw_exit(&ip->i_contents); 2674 } 2675 #endif 2676 return (0); 2677 } 2678 2679 if (len == 0) { 2680 /* 2681 * Search the entire vp list for pages >= off. 2682 */ 2683 err = pvn_vplist_dirty(vp, (u_offset_t)off, ud_putapage, 2684 flags, cr); 2685 } else { 2686 /* 2687 * Loop over all offsets in the range looking for 2688 * pages to deal with. 2689 */ 2690 if ((eoff = blkroundup(ip->i_udf, ip->i_size)) != 0) { 2691 eoff = MIN(off + len, eoff); 2692 } else { 2693 eoff = off + len; 2694 } 2695 2696 for (io_off = off; io_off < eoff; io_off += io_len) { 2697 /* 2698 * If we are not invalidating, synchronously 2699 * freeing or writing pages, use the routine 2700 * page_lookup_nowait() to prevent reclaiming 2701 * them from the free list. 2702 */ 2703 if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) { 2704 pp = page_lookup(vp, io_off, 2705 (flags & (B_INVAL | B_FREE)) ? 2706 SE_EXCL : SE_SHARED); 2707 } else { 2708 pp = page_lookup_nowait(vp, io_off, 2709 (flags & B_FREE) ? 
				    SE_EXCL : SE_SHARED);
			}

			if (pp == NULL || pvn_getdirty(pp, flags) == 0) {
				io_len = PAGESIZE;
			} else {

				err = ud_putapage(vp, pp,
				    &io_off, &io_len, flags, cr);
				if (err != 0) {
					break;
				}
				/*
				 * "io_off" and "io_len" are returned as
				 * the range of pages we actually wrote.
				 * This allows us to skip ahead more quickly
				 * since several pages may've been dealt
				 * with by this iteration of the loop.
				 */
			}
		}
	}
	if (err == 0 && off == 0 && (len == 0 || len >= ip->i_size)) {
		/*
		 * We have just sync'ed back all the pages on
		 * the inode, turn off the IMODTIME flag.
		 */
		mutex_enter(&ip->i_tlock);
		ip->i_flag &= ~IMODTIME;
		mutex_exit(&ip->i_tlock);
	}
#ifdef __lock_lint
	rw_exit(&ip->i_contents);
#else
	if (dolock) {
		rw_exit(&ip->i_contents);
	}
#endif
	return (err);
}

/* ARGSUSED */
int32_t
ud_putapage(struct vnode *vp,
	page_t *pp, u_offset_t *offp,
	size_t *lenp, int32_t flags, struct cred *cr)
{
	daddr_t bn;
	size_t io_len;
	struct ud_inode *ip;
	int32_t error = 0, contig, multi_io = 0;
	struct udf_vfs *udf_vfsp;
	u_offset_t off, io_off;
	caddr_t kaddr, caddr;
	struct buf *bp = NULL;
	int32_t lbmask;
	uint64_t isize;
	int32_t crc_len;
	struct file_entry *fe;

	ud_printf("ud_putapage\n");

	ip = VTOI(vp);
	ASSERT(ip);
	ASSERT(RW_LOCK_HELD(&ip->i_contents));
	lbmask = ip->i_udf->udf_lbmask;
	isize = (ip->i_size + lbmask) & (~lbmask);

	udf_vfsp = ip->i_udf;
	ASSERT(udf_vfsp->udf_flags & UDF_FL_RW);

	/*
	 * If the modified time on the inode has not already been
	 * set elsewhere (e.g. for write/setattr) we set the time now.
	 * This gives us approximate modified times for mmap'ed files
	 * which are modified via stores in the user address space.
	 */
	if (((ip->i_flag & IMODTIME) == 0) || (flags & B_FORCE)) {
		mutex_enter(&ip->i_tlock);
		ip->i_flag |= IUPD;
		ITIMES_NOLOCK(ip);
		mutex_exit(&ip->i_tlock);
	}

	/*
	 * Align the request to a block boundary (for old file systems),
	 * and go ask bmap() how contiguous things are for this file.
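	 *
	 * The alignment below simply masks off the low udf_lbmask bits of
	 * the page offset.  For embedded files (ICB_FLAG_ONE_AD) there is
	 * no separate data extent; the page is instead copied back into
	 * the in-ICB data area of the cached file_entry block.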
2797 */ 2798 off = pp->p_offset & ~(offset_t)lbmask; 2799 /* block align it */ 2800 2801 2802 if (ip->i_desc_type == ICB_FLAG_ONE_AD) { 2803 ASSERT(ip->i_size <= ip->i_max_emb); 2804 2805 pp = pvn_write_kluster(vp, pp, &io_off, 2806 &io_len, off, PAGESIZE, flags); 2807 if (io_len == 0) { 2808 io_len = PAGESIZE; 2809 } 2810 2811 bp = ud_bread(ip->i_dev, 2812 ip->i_icb_lbano << udf_vfsp->udf_l2d_shift, 2813 udf_vfsp->udf_lbsize); 2814 fe = (struct file_entry *)bp->b_un.b_addr; 2815 if ((bp->b_flags & B_ERROR) || 2816 (ud_verify_tag_and_desc(&fe->fe_tag, UD_FILE_ENTRY, 2817 ip->i_icb_block, 2818 1, udf_vfsp->udf_lbsize) != 0)) { 2819 if (pp != NULL) 2820 pvn_write_done(pp, B_ERROR | B_WRITE | flags); 2821 if (bp->b_flags & B_ERROR) { 2822 error = EIO; 2823 } else { 2824 error = EINVAL; 2825 } 2826 brelse(bp); 2827 return (error); 2828 } 2829 if ((bp->b_error == 0) && 2830 (bp->b_resid == 0)) { 2831 2832 caddr = bp->b_un.b_addr + ip->i_data_off; 2833 kaddr = (caddr_t)ppmapin(pp, 2834 PROT_READ | PROT_WRITE, (caddr_t)-1); 2835 (void) kcopy(kaddr, caddr, ip->i_size); 2836 ppmapout(kaddr); 2837 } 2838 crc_len = ((uint32_t)&((struct file_entry *)0)->fe_spec) + 2839 SWAP_32(fe->fe_len_ear); 2840 crc_len += ip->i_size; 2841 ud_make_tag(ip->i_udf, &fe->fe_tag, 2842 UD_FILE_ENTRY, ip->i_icb_block, crc_len); 2843 2844 bwrite(bp); 2845 2846 if (flags & B_ASYNC) { 2847 pvn_write_done(pp, flags); 2848 } 2849 contig = ip->i_size; 2850 } else { 2851 2852 if (error = ud_bmap_read(ip, off, &bn, &contig)) { 2853 goto out; 2854 } 2855 contig = MIN(contig, PAGESIZE); 2856 contig = (contig + lbmask) & (~lbmask); 2857 2858 if (contig < PAGESIZE) { 2859 uint64_t count; 2860 2861 count = isize - off; 2862 if (contig != count) { 2863 multi_io = 1; 2864 contig = (int32_t)(MIN(count, PAGESIZE)); 2865 } 2866 } 2867 2868 if ((off + contig) > isize) { 2869 contig = isize - off; 2870 } 2871 2872 if (contig > PAGESIZE) { 2873 if (contig & PAGEOFFSET) { 2874 contig &= PAGEMASK; 2875 } 2876 } 2877 2878 pp = pvn_write_kluster(vp, pp, &io_off, 2879 &io_len, off, contig, flags); 2880 if (io_len == 0) { 2881 io_len = PAGESIZE; 2882 } 2883 2884 bp = pageio_setup(pp, contig, ip->i_devvp, B_WRITE | flags); 2885 ASSERT(bp != NULL); 2886 2887 bp->b_edev = ip->i_dev; 2888 bp->b_dev = cmpdev(ip->i_dev); 2889 bp->b_blkno = bn; 2890 bp->b_un.b_addr = 0; 2891 bp->b_file = vp; 2892 bp->b_offset = (offset_t)off; 2893 2894 2895 /* 2896 * write throttle 2897 */ 2898 ASSERT(bp->b_iodone == NULL); 2899 bp->b_iodone = ud_iodone; 2900 mutex_enter(&ip->i_tlock); 2901 ip->i_writes += bp->b_bcount; 2902 mutex_exit(&ip->i_tlock); 2903 2904 if (multi_io == 0) { 2905 2906 (void) bdev_strategy(bp); 2907 } else { 2908 error = ud_multi_strat(ip, pp, bp, off); 2909 if (error != 0) { 2910 goto out; 2911 } 2912 } 2913 2914 if ((flags & B_ASYNC) == 0) { 2915 /* 2916 * Wait for i/o to complete. 2917 */ 2918 error = biowait(bp); 2919 pageio_done(bp); 2920 } 2921 } 2922 2923 if ((flags & B_ASYNC) == 0) { 2924 pvn_write_done(pp, ((error) ? 
B_ERROR : 0) | B_WRITE | flags); 2925 } 2926 2927 pp = NULL; 2928 2929 out: 2930 if (error != 0 && pp != NULL) { 2931 pvn_write_done(pp, B_ERROR | B_WRITE | flags); 2932 } 2933 2934 if (offp) { 2935 *offp = io_off; 2936 } 2937 if (lenp) { 2938 *lenp = io_len; 2939 } 2940 2941 return (error); 2942 } 2943 2944 2945 int32_t 2946 ud_iodone(struct buf *bp) 2947 { 2948 struct ud_inode *ip; 2949 2950 ASSERT((bp->b_pages->p_vnode != NULL) && !(bp->b_flags & B_READ)); 2951 2952 bp->b_iodone = NULL; 2953 2954 ip = VTOI(bp->b_pages->p_vnode); 2955 2956 mutex_enter(&ip->i_tlock); 2957 if (ip->i_writes >= ud_LW) { 2958 if ((ip->i_writes -= bp->b_bcount) <= ud_LW) { 2959 if (ud_WRITES) { 2960 cv_broadcast(&ip->i_wrcv); /* wake all up */ 2961 } 2962 } 2963 } else { 2964 ip->i_writes -= bp->b_bcount; 2965 } 2966 mutex_exit(&ip->i_tlock); 2967 iodone(bp); 2968 return (0); 2969 } 2970 2971 /* ARGSUSED3 */ 2972 int32_t 2973 ud_rdip(struct ud_inode *ip, struct uio *uio, int32_t ioflag, cred_t *cr) 2974 { 2975 struct vnode *vp; 2976 struct udf_vfs *udf_vfsp; 2977 krw_t rwtype; 2978 caddr_t base; 2979 uint32_t flags; 2980 int32_t error, n, on, mapon, dofree; 2981 u_offset_t off; 2982 long oresid = uio->uio_resid; 2983 2984 ASSERT(RW_LOCK_HELD(&ip->i_contents)); 2985 if ((ip->i_type != VREG) && 2986 (ip->i_type != VDIR) && 2987 (ip->i_type != VLNK)) { 2988 return (EIO); 2989 } 2990 2991 if (uio->uio_loffset > MAXOFFSET_T) { 2992 return (0); 2993 } 2994 2995 if ((uio->uio_loffset < (offset_t)0) || 2996 ((uio->uio_loffset + uio->uio_resid) < 0)) { 2997 return (EINVAL); 2998 } 2999 if (uio->uio_resid == 0) { 3000 return (0); 3001 } 3002 3003 vp = ITOV(ip); 3004 udf_vfsp = ip->i_udf; 3005 mutex_enter(&ip->i_tlock); 3006 ip->i_flag |= IACC; 3007 mutex_exit(&ip->i_tlock); 3008 3009 rwtype = (rw_write_held(&ip->i_contents)?RW_WRITER:RW_READER); 3010 3011 do { 3012 offset_t diff; 3013 u_offset_t uoff = uio->uio_loffset; 3014 off = uoff & (offset_t)MAXBMASK; 3015 mapon = (int)(uoff & (offset_t)MAXBOFFSET); 3016 on = (int)blkoff(udf_vfsp, uoff); 3017 n = (int)MIN(udf_vfsp->udf_lbsize - on, uio->uio_resid); 3018 3019 diff = ip->i_size - uoff; 3020 3021 if (diff <= (offset_t)0) { 3022 error = 0; 3023 goto out; 3024 } 3025 if (diff < (offset_t)n) { 3026 n = (int)diff; 3027 } 3028 dofree = ud_freebehind && 3029 ip->i_nextr == (off & PAGEMASK) && 3030 off > ud_smallfile; 3031 3032 #ifndef __lock_lint 3033 if (rwtype == RW_READER) { 3034 rw_exit(&ip->i_contents); 3035 } 3036 #endif 3037 3038 base = segmap_getmapflt(segkmap, vp, (off + mapon), 3039 (uint32_t)n, 1, S_READ); 3040 error = uiomove(base + mapon, (long)n, UIO_READ, uio); 3041 3042 flags = 0; 3043 if (!error) { 3044 /* 3045 * If read a whole block, or read to eof, 3046 * won't need this buffer again soon. 3047 */ 3048 if (n + on == MAXBSIZE && ud_freebehind && dofree && 3049 freemem < lotsfree + pages_before_pager) { 3050 flags = SM_FREE | SM_DONTNEED |SM_ASYNC; 3051 } 3052 /* 3053 * In POSIX SYNC (FSYNC and FDSYNC) read mode, 3054 * we want to make sure that the page which has 3055 * been read, is written on disk if it is dirty. 3056 * And corresponding indirect blocks should also 3057 * be flushed out. 
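			 *
			 * This is why SM_ASYNC is cleared and SM_WRITE is
			 * set on the segmap_release() flags below.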
3058 */ 3059 if ((ioflag & FRSYNC) && (ioflag & (FSYNC|FDSYNC))) { 3060 flags &= ~SM_ASYNC; 3061 flags |= SM_WRITE; 3062 } 3063 error = segmap_release(segkmap, base, flags); 3064 } else { 3065 (void) segmap_release(segkmap, base, flags); 3066 } 3067 3068 #ifndef __lock_lint 3069 if (rwtype == RW_READER) { 3070 rw_enter(&ip->i_contents, rwtype); 3071 } 3072 #endif 3073 } while (error == 0 && uio->uio_resid > 0 && n != 0); 3074 out: 3075 /* 3076 * Inode is updated according to this table if FRSYNC is set. 3077 * 3078 * FSYNC FDSYNC(posix.4) 3079 * -------------------------- 3080 * always IATTCHG|IBDWRITE 3081 */ 3082 if (ioflag & FRSYNC) { 3083 if ((ioflag & FSYNC) || 3084 ((ioflag & FDSYNC) && 3085 (ip->i_flag & (IATTCHG|IBDWRITE)))) { 3086 rw_exit(&ip->i_contents); 3087 rw_enter(&ip->i_contents, RW_WRITER); 3088 ud_iupdat(ip, 1); 3089 } 3090 } 3091 /* 3092 * If we've already done a partial read, terminate 3093 * the read but return no error. 3094 */ 3095 if (oresid != uio->uio_resid) { 3096 error = 0; 3097 } 3098 ITIMES(ip); 3099 3100 return (error); 3101 } 3102 3103 int32_t 3104 ud_wrip(struct ud_inode *ip, struct uio *uio, int ioflag, struct cred *cr) 3105 { 3106 caddr_t base; 3107 struct vnode *vp; 3108 struct udf_vfs *udf_vfsp; 3109 uint32_t flags; 3110 int32_t error = 0, iupdat_flag, n, on, mapon, i_size_changed = 0; 3111 int32_t pagecreate, newpage; 3112 uint64_t old_i_size; 3113 u_offset_t off; 3114 long start_resid = uio->uio_resid, premove_resid; 3115 rlim64_t limit = uio->uio_limit; 3116 3117 3118 ASSERT(RW_WRITE_HELD(&ip->i_contents)); 3119 if ((ip->i_type != VREG) && 3120 (ip->i_type != VDIR) && 3121 (ip->i_type != VLNK)) { 3122 return (EIO); 3123 } 3124 3125 if (uio->uio_loffset >= MAXOFFSET_T) { 3126 return (EFBIG); 3127 } 3128 /* 3129 * see udf_l_pathconf 3130 */ 3131 if (limit > (((uint64_t)1 << 40) - 1)) { 3132 limit = ((uint64_t)1 << 40) - 1; 3133 } 3134 if (uio->uio_loffset >= limit) { 3135 proc_t *p = ttoproc(curthread); 3136 3137 mutex_enter(&p->p_lock); 3138 (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE], p->p_rctls, 3139 p, RCA_UNSAFE_SIGINFO); 3140 mutex_exit(&p->p_lock); 3141 return (EFBIG); 3142 } 3143 if ((uio->uio_loffset < (offset_t)0) || 3144 ((uio->uio_loffset + uio->uio_resid) < 0)) { 3145 return (EINVAL); 3146 } 3147 if (uio->uio_resid == 0) { 3148 return (0); 3149 } 3150 3151 mutex_enter(&ip->i_tlock); 3152 ip->i_flag |= INOACC; 3153 3154 if (ioflag & (FSYNC | FDSYNC)) { 3155 ip->i_flag |= ISYNC; 3156 iupdat_flag = 1; 3157 } 3158 mutex_exit(&ip->i_tlock); 3159 3160 udf_vfsp = ip->i_udf; 3161 vp = ITOV(ip); 3162 3163 do { 3164 u_offset_t uoff = uio->uio_loffset; 3165 off = uoff & (offset_t)MAXBMASK; 3166 mapon = (int)(uoff & (offset_t)MAXBOFFSET); 3167 on = (int)blkoff(udf_vfsp, uoff); 3168 n = (int)MIN(udf_vfsp->udf_lbsize - on, uio->uio_resid); 3169 3170 if (ip->i_type == VREG && uoff + n >= limit) { 3171 if (uoff >= limit) { 3172 error = EFBIG; 3173 goto out; 3174 } 3175 n = (int)(limit - (rlim64_t)uoff); 3176 } 3177 if (uoff + n > ip->i_size) { 3178 /* 3179 * We are extending the length of the file. 3180 * bmap is used so that we are sure that 3181 * if we need to allocate new blocks, that it 3182 * is done here before we up the file size. 
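			 *
			 * The current size is saved in old_i_size so that
			 * ud_itrunc() can undo the extension if the copy
			 * below fails.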
3183 */ 3184 error = ud_bmap_write(ip, uoff, 3185 (int)(on + n), mapon == 0, cr); 3186 if (error) { 3187 break; 3188 } 3189 i_size_changed = 1; 3190 old_i_size = ip->i_size; 3191 ip->i_size = uoff + n; 3192 /* 3193 * If we are writing from the beginning of 3194 * the mapping, we can just create the 3195 * pages without having to read them. 3196 */ 3197 pagecreate = (mapon == 0); 3198 } else if (n == MAXBSIZE) { 3199 /* 3200 * Going to do a whole mappings worth, 3201 * so we can just create the pages w/o 3202 * having to read them in. But before 3203 * we do that, we need to make sure any 3204 * needed blocks are allocated first. 3205 */ 3206 error = ud_bmap_write(ip, uoff, 3207 (int)(on + n), 1, cr); 3208 if (error) { 3209 break; 3210 } 3211 pagecreate = 1; 3212 } else { 3213 pagecreate = 0; 3214 } 3215 3216 rw_exit(&ip->i_contents); 3217 3218 /* 3219 * Touch the page and fault it in if it is not in 3220 * core before segmap_getmapflt can lock it. This 3221 * is to avoid the deadlock if the buffer is mapped 3222 * to the same file through mmap which we want to 3223 * write to. 3224 */ 3225 uio_prefaultpages((long)n, uio); 3226 3227 base = segmap_getmapflt(segkmap, vp, (off + mapon), 3228 (uint32_t)n, !pagecreate, S_WRITE); 3229 3230 /* 3231 * segmap_pagecreate() returns 1 if it calls 3232 * page_create_va() to allocate any pages. 3233 */ 3234 newpage = 0; 3235 if (pagecreate) { 3236 newpage = segmap_pagecreate(segkmap, base, 3237 (size_t)n, 0); 3238 } 3239 3240 premove_resid = uio->uio_resid; 3241 error = uiomove(base + mapon, (long)n, UIO_WRITE, uio); 3242 3243 if (pagecreate && 3244 uio->uio_loffset < roundup(off + mapon + n, PAGESIZE)) { 3245 /* 3246 * We created pages w/o initializing them completely, 3247 * thus we need to zero the part that wasn't set up. 3248 * This happens on most EOF write cases and if 3249 * we had some sort of error during the uiomove. 3250 */ 3251 int nzero, nmoved; 3252 3253 nmoved = (int)(uio->uio_loffset - (off + mapon)); 3254 ASSERT(nmoved >= 0 && nmoved <= n); 3255 nzero = roundup(on + n, PAGESIZE) - nmoved; 3256 ASSERT(nzero > 0 && mapon + nmoved + nzero <= MAXBSIZE); 3257 (void) kzero(base + mapon + nmoved, (uint32_t)nzero); 3258 } 3259 3260 /* 3261 * Unlock the pages allocated by page_create_va() 3262 * in segmap_pagecreate() 3263 */ 3264 if (newpage) { 3265 segmap_pageunlock(segkmap, base, (size_t)n, S_WRITE); 3266 } 3267 3268 if (error) { 3269 /* 3270 * If we failed on a write, we may have already 3271 * allocated file blocks as well as pages. It's 3272 * hard to undo the block allocation, but we must 3273 * be sure to invalidate any pages that may have 3274 * been allocated. 3275 */ 3276 (void) segmap_release(segkmap, base, SM_INVAL); 3277 } else { 3278 flags = 0; 3279 /* 3280 * Force write back for synchronous write cases. 3281 */ 3282 if ((ioflag & (FSYNC|FDSYNC)) || ip->i_type == VDIR) { 3283 /* 3284 * If the sticky bit is set but the 3285 * execute bit is not set, we do a 3286 * synchronous write back and free 3287 * the page when done. We set up swap 3288 * files to be handled this way to 3289 * prevent servers from keeping around 3290 * the client's swap pages too long. 3291 * XXX - there ought to be a better way. 3292 */ 3293 if (IS_SWAPVP(vp)) { 3294 flags = SM_WRITE | SM_FREE | 3295 SM_DONTNEED; 3296 iupdat_flag = 0; 3297 } else { 3298 flags = SM_WRITE; 3299 } 3300 } else if (((mapon + n) == MAXBSIZE) || 3301 IS_SWAPVP(vp)) { 3302 /* 3303 * Have written a whole block. 
3304 * Start an asynchronous write and 3305 * mark the buffer to indicate that 3306 * it won't be needed again soon. 3307 */ 3308 flags = SM_WRITE |SM_ASYNC | SM_DONTNEED; 3309 } 3310 error = segmap_release(segkmap, base, flags); 3311 3312 /* 3313 * If the operation failed and is synchronous, 3314 * then we need to unwind what uiomove() last 3315 * did so we can potentially return an error to 3316 * the caller. If this write operation was 3317 * done in two pieces and the first succeeded, 3318 * then we won't return an error for the second 3319 * piece that failed. However, we only want to 3320 * return a resid value that reflects what was 3321 * really done. 3322 * 3323 * Failures for non-synchronous operations can 3324 * be ignored since the page subsystem will 3325 * retry the operation until it succeeds or the 3326 * file system is unmounted. 3327 */ 3328 if (error) { 3329 if ((ioflag & (FSYNC | FDSYNC)) || 3330 ip->i_type == VDIR) { 3331 uio->uio_resid = premove_resid; 3332 } else { 3333 error = 0; 3334 } 3335 } 3336 } 3337 3338 /* 3339 * Re-acquire contents lock. 3340 */ 3341 rw_enter(&ip->i_contents, RW_WRITER); 3342 /* 3343 * If the uiomove() failed or if a synchronous 3344 * page push failed, fix up i_size. 3345 */ 3346 if (error) { 3347 if (i_size_changed) { 3348 /* 3349 * The uiomove failed, and we 3350 * allocated blocks,so get rid 3351 * of them. 3352 */ 3353 (void) ud_itrunc(ip, old_i_size, 0, cr); 3354 } 3355 } else { 3356 /* 3357 * XXX - Can this be out of the loop? 3358 */ 3359 ip->i_flag |= IUPD | ICHG; 3360 if (i_size_changed) { 3361 ip->i_flag |= IATTCHG; 3362 } 3363 if ((ip->i_perm & (IEXEC | (IEXEC >> 5) | 3364 (IEXEC >> 10))) != 0 && 3365 (ip->i_char & (ISUID | ISGID)) != 0 && 3366 secpolicy_vnode_setid_retain(cr, 3367 (ip->i_char & ISUID) != 0 && ip->i_uid == 0) != 0) { 3368 /* 3369 * Clear Set-UID & Set-GID bits on 3370 * successful write if not privileged 3371 * and at least one of the execute bits 3372 * is set. If we always clear Set-GID, 3373 * mandatory file and record locking is 3374 * unuseable. 3375 */ 3376 ip->i_char &= ~(ISUID | ISGID); 3377 } 3378 } 3379 } while (error == 0 && uio->uio_resid > 0 && n != 0); 3380 3381 out: 3382 /* 3383 * Inode is updated according to this table - 3384 * 3385 * FSYNC FDSYNC(posix.4) 3386 * -------------------------- 3387 * always@ IATTCHG|IBDWRITE 3388 * 3389 * @ - If we are doing synchronous write the only time we should 3390 * not be sync'ing the ip here is if we have the stickyhack 3391 * activated, the file is marked with the sticky bit and 3392 * no exec bit, the file length has not been changed and 3393 * no new blocks have been allocated during this write. 3394 */ 3395 if ((ip->i_flag & ISYNC) != 0) { 3396 /* 3397 * we have eliminated nosync 3398 */ 3399 if ((ip->i_flag & (IATTCHG|IBDWRITE)) || 3400 ((ioflag & FSYNC) && iupdat_flag)) { 3401 ud_iupdat(ip, 1); 3402 } 3403 } 3404 3405 /* 3406 * If we've already done a partial-write, terminate 3407 * the write but return no error. 
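	 *
	 * uio_resid still reflects the bytes that were not transferred,
	 * so the caller sees a short write rather than a failure.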
3408 */ 3409 if (start_resid != uio->uio_resid) { 3410 error = 0; 3411 } 3412 ip->i_flag &= ~(INOACC | ISYNC); 3413 ITIMES_NOLOCK(ip); 3414 3415 return (error); 3416 } 3417 3418 int32_t 3419 ud_multi_strat(struct ud_inode *ip, 3420 page_t *pp, struct buf *bp, u_offset_t start) 3421 { 3422 daddr_t bn; 3423 int32_t error = 0, io_count, contig, alloc_sz, i; 3424 uint32_t io_off; 3425 mio_master_t *mm = NULL; 3426 mio_slave_t *ms = NULL; 3427 struct buf *rbp; 3428 3429 ASSERT(!(start & PAGEOFFSET)); 3430 3431 /* 3432 * Figure out how many buffers to allocate 3433 */ 3434 io_count = 0; 3435 for (io_off = 0; io_off < bp->b_bcount; io_off += contig) { 3436 contig = 0; 3437 if (error = ud_bmap_read(ip, (u_offset_t)(start + io_off), 3438 &bn, &contig)) { 3439 goto end; 3440 } 3441 if (contig == 0) { 3442 goto end; 3443 } 3444 contig = MIN(contig, PAGESIZE - io_off); 3445 if (bn != UDF_HOLE) { 3446 io_count ++; 3447 } else { 3448 /* 3449 * HOLE 3450 */ 3451 if (bp->b_flags & B_READ) { 3452 3453 /* 3454 * This is a hole and is read 3455 * it should be filled with 0's 3456 */ 3457 pagezero(pp, io_off, contig); 3458 } 3459 } 3460 } 3461 3462 3463 if (io_count != 0) { 3464 3465 /* 3466 * Allocate memory for all the 3467 * required number of buffers 3468 */ 3469 alloc_sz = sizeof (mio_master_t) + 3470 (sizeof (mio_slave_t) * io_count); 3471 mm = (mio_master_t *)kmem_zalloc(alloc_sz, KM_SLEEP); 3472 if (mm == NULL) { 3473 error = ENOMEM; 3474 goto end; 3475 } 3476 3477 /* 3478 * initialize master 3479 */ 3480 mutex_init(&mm->mm_mutex, NULL, MUTEX_DEFAULT, NULL); 3481 mm->mm_size = alloc_sz; 3482 mm->mm_bp = bp; 3483 mm->mm_resid = 0; 3484 mm->mm_error = 0; 3485 mm->mm_index = master_index++; 3486 3487 ms = (mio_slave_t *)(((caddr_t)mm) + sizeof (mio_master_t)); 3488 3489 /* 3490 * Initialize buffers 3491 */ 3492 io_count = 0; 3493 for (io_off = 0; io_off < bp->b_bcount; io_off += contig) { 3494 contig = 0; 3495 if (error = ud_bmap_read(ip, 3496 (u_offset_t)(start + io_off), 3497 &bn, &contig)) { 3498 goto end; 3499 } 3500 ASSERT(contig); 3501 if ((io_off + contig) > bp->b_bcount) { 3502 contig = bp->b_bcount - io_off; 3503 } 3504 if (bn != UDF_HOLE) { 3505 /* 3506 * Clone the buffer 3507 * and prepare to start I/O 3508 */ 3509 ms->ms_ptr = mm; 3510 bioinit(&ms->ms_buf); 3511 rbp = bioclone(bp, io_off, (size_t)contig, 3512 bp->b_edev, bn, ud_slave_done, 3513 &ms->ms_buf, KM_NOSLEEP); 3514 ASSERT(rbp == &ms->ms_buf); 3515 mm->mm_resid += contig; 3516 io_count++; 3517 ms ++; 3518 } 3519 } 3520 3521 /* 3522 * Start I/O's 3523 */ 3524 ms = (mio_slave_t *)(((caddr_t)mm) + sizeof (mio_master_t)); 3525 for (i = 0; i < io_count; i++) { 3526 (void) bdev_strategy(&ms->ms_buf); 3527 ms ++; 3528 } 3529 } 3530 3531 end: 3532 if (error != 0) { 3533 bp->b_flags |= B_ERROR; 3534 bp->b_error = error; 3535 if (mm != NULL) { 3536 mutex_destroy(&mm->mm_mutex); 3537 kmem_free(mm, mm->mm_size); 3538 } 3539 } 3540 return (error); 3541 } 3542 3543 int32_t 3544 ud_slave_done(struct buf *bp) 3545 { 3546 mio_master_t *mm; 3547 int32_t resid; 3548 3549 ASSERT(SEMA_HELD(&bp->b_sem)); 3550 ASSERT((bp->b_flags & B_DONE) == 0); 3551 3552 mm = ((mio_slave_t *)bp)->ms_ptr; 3553 3554 /* 3555 * Propagate error and byte count info from slave struct to 3556 * the master struct 3557 */ 3558 mutex_enter(&mm->mm_mutex); 3559 if (bp->b_flags & B_ERROR) { 3560 3561 /* 3562 * If multiple slave buffers get 3563 * error we forget the old errors 3564 * this is ok because we any way 3565 * cannot return multiple errors 3566 */ 3567 mm->mm_error = 
		    bp->b_error;
	}
	mm->mm_resid -= bp->b_bcount;
	resid = mm->mm_resid;
	mutex_exit(&mm->mm_mutex);

	/*
	 * Free up the resources allocated to the cloned buffers.
	 */
	bp_mapout(bp);
	biofini(bp);

	if (resid == 0) {

		/*
		 * This is the last I/O operation;
		 * clean up and return the original buffer.
		 */
		if (mm->mm_error) {
			mm->mm_bp->b_flags |= B_ERROR;
			mm->mm_bp->b_error = mm->mm_error;
		}
		biodone(mm->mm_bp);
		mutex_destroy(&mm->mm_mutex);
		kmem_free(mm, mm->mm_size);
	}
	return (0);
}
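
/*
 * Illustrative sketch (not part of the original driver): a typical
 * in-kernel caller drives ud_rdwri() roughly as follows, with
 * ip->i_contents already held and "buf", "len" and "off" being
 * hypothetical caller-supplied values:
 *
 *	int32_t resid;
 *	int32_t err;
 *
 *	err = ud_rdwri(UIO_READ, 0, ip, buf, len, off,
 *	    UIO_SYSSPACE, &resid, cr);
 *
 * ud_rdwri() builds the uio/iovec pair and dispatches to ud_rdip()
 * or ud_wrip() above; a non-NULL "aresid" returns the residual byte
 * count, otherwise any short transfer is reported as EIO.
 */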