xref: /illumos-gate/usr/src/uts/common/fs/udfs/udf_vnops.c (revision b31ca922c7346747131aed07c0c171ec2f573aac)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 /*
27  * Copyright 2015, Joyent, Inc.
28  */
29 
30 #include <sys/types.h>
31 #include <sys/t_lock.h>
32 #include <sys/param.h>
33 #include <sys/time.h>
34 #include <sys/systm.h>
35 #include <sys/sysmacros.h>
36 #include <sys/resource.h>
37 #include <sys/signal.h>
38 #include <sys/cred.h>
39 #include <sys/user.h>
40 #include <sys/buf.h>
41 #include <sys/vfs.h>
42 #include <sys/vfs_opreg.h>
43 #include <sys/stat.h>
44 #include <sys/vnode.h>
45 #include <sys/mode.h>
46 #include <sys/proc.h>
47 #include <sys/disp.h>
48 #include <sys/file.h>
49 #include <sys/fcntl.h>
50 #include <sys/flock.h>
51 #include <sys/kmem.h>
52 #include <sys/uio.h>
53 #include <sys/dnlc.h>
54 #include <sys/conf.h>
55 #include <sys/errno.h>
56 #include <sys/mman.h>
57 #include <sys/fbuf.h>
58 #include <sys/pathname.h>
59 #include <sys/debug.h>
60 #include <sys/vmsystm.h>
61 #include <sys/cmn_err.h>
62 #include <sys/dirent.h>
63 #include <sys/errno.h>
64 #include <sys/modctl.h>
65 #include <sys/statvfs.h>
66 #include <sys/mount.h>
67 #include <sys/sunddi.h>
68 #include <sys/bootconf.h>
69 #include <sys/policy.h>
70 
71 #include <vm/hat.h>
72 #include <vm/page.h>
73 #include <vm/pvn.h>
74 #include <vm/as.h>
75 #include <vm/seg.h>
76 #include <vm/seg_map.h>
77 #include <vm/seg_kmem.h>
78 #include <vm/seg_vn.h>
79 #include <vm/rm.h>
80 #include <vm/page.h>
81 #include <sys/swap.h>
82 
83 #include <fs/fs_subr.h>
84 
85 #include <sys/fs/udf_volume.h>
86 #include <sys/fs/udf_inode.h>
87 
/*
 * Forward declarations for the vnode operation entry points implemented
 * in this file.  Each of these is referenced from udf_vnodeops_template,
 * which maps the VOPNAME_* operation names onto these handlers.
 */
static int32_t udf_open(struct vnode **,
	int32_t, struct cred *, caller_context_t *);
static int32_t udf_close(struct vnode *,
	int32_t, int32_t, offset_t, struct cred *, caller_context_t *);
static int32_t udf_read(struct vnode *,
	struct uio *, int32_t, struct cred *, caller_context_t *);
static int32_t udf_write(struct vnode *,
	struct uio *, int32_t, struct cred *, caller_context_t *);
static int32_t udf_ioctl(struct vnode *,
	int32_t, intptr_t, int32_t, struct cred *, int32_t *,
	caller_context_t *);
static int32_t udf_getattr(struct vnode *,
	struct vattr *, int32_t, struct cred *, caller_context_t *);
static int32_t udf_setattr(struct vnode *,
	struct vattr *, int32_t, struct cred *, caller_context_t *);
static int32_t udf_access(struct vnode *,
	int32_t, int32_t, struct cred *, caller_context_t *);
static int32_t udf_lookup(struct vnode *,
	char *, struct vnode **, struct pathname *,
	int32_t, struct vnode *, struct cred *,
	caller_context_t *, int *, pathname_t *);
static int32_t udf_create(struct vnode *,
	char *, struct vattr *, enum vcexcl,
	int32_t, struct vnode **, struct cred *, int32_t,
	caller_context_t *, vsecattr_t *);
static int32_t udf_remove(struct vnode *,
	char *, struct cred *, caller_context_t *, int);
static int32_t udf_link(struct vnode *,
	struct vnode *, char *, struct cred *, caller_context_t *, int);
static int32_t udf_rename(struct vnode *,
	char *, struct vnode *, char *, struct cred *, caller_context_t *, int);
static int32_t udf_mkdir(struct vnode *,
	char *, struct vattr *, struct vnode **, struct cred *,
	caller_context_t *, int, vsecattr_t *);
static int32_t udf_rmdir(struct vnode *,
	char *, struct vnode *, struct cred *, caller_context_t *, int);
static int32_t udf_readdir(struct vnode *,
	struct uio *, struct cred *, int32_t *, caller_context_t *, int);
static int32_t udf_symlink(struct vnode *,
	char *, struct vattr *, char *, struct cred *, caller_context_t *, int);
static int32_t udf_readlink(struct vnode *,
	struct uio *, struct cred *, caller_context_t *);
static int32_t udf_fsync(struct vnode *,
	int32_t, struct cred *, caller_context_t *);
static void udf_inactive(struct vnode *,
	struct cred *, caller_context_t *);
static int32_t udf_fid(struct vnode *, struct fid *, caller_context_t *);
static int udf_rwlock(struct vnode *, int32_t, caller_context_t *);
static void udf_rwunlock(struct vnode *, int32_t, caller_context_t *);
static int32_t udf_seek(struct vnode *, offset_t, offset_t *,
	caller_context_t *);
static int32_t udf_frlock(struct vnode *, int32_t,
	struct flock64 *, int32_t, offset_t, struct flk_callback *, cred_t *,
	caller_context_t *);
static int32_t udf_space(struct vnode *, int32_t,
	struct flock64 *, int32_t, offset_t, cred_t *, caller_context_t *);
static int32_t udf_getpage(struct vnode *, offset_t,
	size_t, uint32_t *, struct page **, size_t,
	struct seg *, caddr_t, enum seg_rw, struct cred *, caller_context_t *);
static int32_t udf_putpage(struct vnode *, offset_t,
	size_t, int32_t, struct cred *, caller_context_t *);
static int32_t udf_map(struct vnode *, offset_t, struct as *,
	caddr_t *, size_t, uint8_t, uint8_t, uint32_t, struct cred *,
	caller_context_t *);
static int32_t udf_addmap(struct vnode *, offset_t, struct as *,
	caddr_t, size_t, uint8_t, uint8_t, uint32_t, struct cred *,
	caller_context_t *);
static int32_t udf_delmap(struct vnode *, offset_t, struct as *,
	caddr_t, size_t, uint32_t, uint32_t, uint32_t, struct cred *,
	caller_context_t *);
static int32_t udf_l_pathconf(struct vnode *, int32_t,
	ulong_t *, struct cred *, caller_context_t *);
static int32_t udf_pageio(struct vnode *, struct page *,
	u_offset_t, size_t, int32_t, struct cred *, caller_context_t *);
162 
/*
 * Prototypes for helper routines with external linkage.  Of these,
 * ud_putpages() is called from udf_close(), ud_rdip() from udf_read()
 * and ud_wrip() from udf_write(); the remaining names suggest page and
 * buffer I/O support, but their definitions should be consulted for
 * the exact contracts.
 */
int32_t ud_getpage_miss(struct vnode *, u_offset_t,
	size_t, struct seg *, caddr_t, page_t *pl[],
	size_t, enum seg_rw, int32_t);
void ud_getpage_ra(struct vnode *, u_offset_t, struct seg *, caddr_t);
int32_t ud_putpages(struct vnode *, offset_t, size_t, int32_t, struct cred *);
int32_t ud_page_fill(struct ud_inode *, page_t *,
	u_offset_t, uint32_t, u_offset_t *);
int32_t ud_iodone(struct buf *);
int32_t ud_rdip(struct ud_inode *, struct uio *, int32_t, cred_t *);
int32_t ud_wrip(struct ud_inode *, struct uio *, int32_t, cred_t *);
int32_t ud_multi_strat(struct ud_inode *, page_t *, struct buf *, u_offset_t);
int32_t ud_slave_done(struct buf *);
175 
/*
 * Structures to control multiple IO operations to get or put pages
 * that are backed by discontiguous blocks. The master struct is
 * a dummy that holds the original bp from pageio_setup. The
 * slave struct holds the working bp's to do the actual IO. Once
 * all the slave IOs complete, the master is processed as if a single
 * IO op has completed.
 */
/*
 * NOTE(review): master_index is presumably the source of the per-master
 * mm_index debugging value below -- confirm against the code that
 * allocates a mio_master_t.
 */
uint32_t master_index = 0;
typedef struct mio_master {
	kmutex_t	mm_mutex;	/* protect the fields below */
	/* NOTE(review): mm_size looks like the allocation size -- confirm */
	int32_t		mm_size;
	buf_t		*mm_bp;		/* original bp */
	int32_t		mm_resid;	/* bytes remaining to transfer */
	int32_t		mm_error;	/* accumulated error from slaves */
	int32_t		mm_index;	/* XXX debugging */
} mio_master_t;
193 
/*
 * Per-chunk state for one piece of a multi-part IO.  The buf is embedded
 * (not a pointer) and ms_ptr links the slave back to its controlling
 * mio_master_t.
 */
typedef struct mio_slave {
	buf_t		ms_buf;		/* working buffer for this IO chunk */
	mio_master_t	*ms_ptr;	/* pointer to master */
} mio_slave_t;
198 
/*
 * Vnode operations vector for UDFS.
 * NOTE(review): presumably built from udf_vnodeops_template during file
 * system initialization; the code that does so is not in this part of
 * the file.
 */
struct vnodeops *udf_vnodeops;

/*
 * Template pairing each VOPNAME_* operation name with the UDFS routine
 * that implements it.  Vnode events are handled by the generic
 * fs_vnevent_support routine rather than a UDFS-specific one.  The
 * table is terminated by a NULL name / NULL handler pair.
 */
const fs_operation_def_t udf_vnodeops_template[] = {
	VOPNAME_OPEN,		{ .vop_open = udf_open },
	VOPNAME_CLOSE,		{ .vop_close = udf_close },
	VOPNAME_READ,		{ .vop_read = udf_read },
	VOPNAME_WRITE,		{ .vop_write = udf_write },
	VOPNAME_IOCTL,		{ .vop_ioctl = udf_ioctl },
	VOPNAME_GETATTR,	{ .vop_getattr = udf_getattr },
	VOPNAME_SETATTR,	{ .vop_setattr = udf_setattr },
	VOPNAME_ACCESS,		{ .vop_access = udf_access },
	VOPNAME_LOOKUP,		{ .vop_lookup = udf_lookup },
	VOPNAME_CREATE,		{ .vop_create = udf_create },
	VOPNAME_REMOVE,		{ .vop_remove = udf_remove },
	VOPNAME_LINK,		{ .vop_link = udf_link },
	VOPNAME_RENAME,		{ .vop_rename = udf_rename },
	VOPNAME_MKDIR,		{ .vop_mkdir = udf_mkdir },
	VOPNAME_RMDIR,		{ .vop_rmdir = udf_rmdir },
	VOPNAME_READDIR,	{ .vop_readdir = udf_readdir },
	VOPNAME_SYMLINK,	{ .vop_symlink = udf_symlink },
	VOPNAME_READLINK,	{ .vop_readlink = udf_readlink },
	VOPNAME_FSYNC,		{ .vop_fsync = udf_fsync },
	VOPNAME_INACTIVE,	{ .vop_inactive = udf_inactive },
	VOPNAME_FID,		{ .vop_fid = udf_fid },
	VOPNAME_RWLOCK,		{ .vop_rwlock = udf_rwlock },
	VOPNAME_RWUNLOCK,	{ .vop_rwunlock = udf_rwunlock },
	VOPNAME_SEEK,		{ .vop_seek = udf_seek },
	VOPNAME_FRLOCK,		{ .vop_frlock = udf_frlock },
	VOPNAME_SPACE,		{ .vop_space = udf_space },
	VOPNAME_GETPAGE,	{ .vop_getpage = udf_getpage },
	VOPNAME_PUTPAGE,	{ .vop_putpage = udf_putpage },
	VOPNAME_MAP,		{ .vop_map = udf_map },
	VOPNAME_ADDMAP,		{ .vop_addmap = udf_addmap },
	VOPNAME_DELMAP,		{ .vop_delmap = udf_delmap },
	VOPNAME_PATHCONF,	{ .vop_pathconf = udf_l_pathconf },
	VOPNAME_PAGEIO,		{ .vop_pageio = udf_pageio },
	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
	NULL,			NULL
};
238 
239 /* ARGSUSED */
240 static int32_t
241 udf_open(
242 	struct vnode **vpp,
243 	int32_t flag,
244 	struct cred *cr,
245 	caller_context_t *ct)
246 {
247 	ud_printf("udf_open\n");
248 
249 	return (0);
250 }
251 
252 /* ARGSUSED */
253 static int32_t
254 udf_close(
255 	struct vnode *vp,
256 	int32_t flag,
257 	int32_t count,
258 	offset_t offset,
259 	struct cred *cr,
260 	caller_context_t *ct)
261 {
262 	struct ud_inode *ip = VTOI(vp);
263 
264 	ud_printf("udf_close\n");
265 
266 	ITIMES(ip);
267 
268 	cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
269 	cleanshares(vp, ttoproc(curthread)->p_pid);
270 
271 	/*
272 	 * Push partially filled cluster at last close.
273 	 * ``last close'' is approximated because the dnlc
274 	 * may have a hold on the vnode.
275 	 */
276 	if (vp->v_count <= 2 && vp->v_type != VBAD) {
277 		struct ud_inode *ip = VTOI(vp);
278 		if (ip->i_delaylen) {
279 			(void) ud_putpages(vp, ip->i_delayoff, ip->i_delaylen,
280 			    B_ASYNC | B_FREE, cr);
281 			ip->i_delaylen = 0;
282 		}
283 	}
284 
285 	return (0);
286 }
287 
288 /* ARGSUSED */
289 static int32_t
290 udf_read(
291 	struct vnode *vp,
292 	struct uio *uiop,
293 	int32_t ioflag,
294 	struct cred *cr,
295 	caller_context_t *ct)
296 {
297 	struct ud_inode *ip = VTOI(vp);
298 	int32_t error;
299 
300 	ud_printf("udf_read\n");
301 
302 #ifdef	__lock_lint
303 	rw_enter(&ip->i_rwlock, RW_READER);
304 #endif
305 
306 	ASSERT(RW_READ_HELD(&ip->i_rwlock));
307 
308 	if (MANDLOCK(vp, ip->i_char)) {
309 		/*
310 		 * udf_getattr ends up being called by chklock
311 		 */
312 		error = chklock(vp, FREAD, uiop->uio_loffset,
313 		    uiop->uio_resid, uiop->uio_fmode, ct);
314 		if (error) {
315 			goto end;
316 		}
317 	}
318 
319 	rw_enter(&ip->i_contents, RW_READER);
320 	error = ud_rdip(ip, uiop, ioflag, cr);
321 	rw_exit(&ip->i_contents);
322 
323 end:
324 #ifdef	__lock_lint
325 	rw_exit(&ip->i_rwlock);
326 #endif
327 
328 	return (error);
329 }
330 
331 
/*
 * Write-throttling variables used by udf_write().
 */
/* Nonzero enables the throttle check in udf_write(). */
int32_t ud_WRITES = 1;
/* udf_write() sleeps on i_wrcv while the inode's i_writes exceeds this. */
int32_t ud_HW = 96 * 1024;
/*
 * NOTE(review): presumably the low-water mark at which throttled writers
 * are woken; it is not referenced by udf_write() itself -- confirm.
 */
int32_t ud_LW = 64 * 1024;
/* Incremented each time udf_write() goes to sleep in the throttle loop. */
int32_t ud_throttles = 0;
336 
337 /* ARGSUSED */
338 static int32_t
339 udf_write(
340 	struct vnode *vp,
341 	struct uio *uiop,
342 	int32_t ioflag,
343 	struct cred *cr,
344 	caller_context_t *ct)
345 {
346 	struct ud_inode *ip = VTOI(vp);
347 	int32_t error = 0;
348 
349 	ud_printf("udf_write\n");
350 
351 #ifdef	__lock_lint
352 	rw_enter(&ip->i_rwlock, RW_WRITER);
353 #endif
354 
355 	ASSERT(RW_WRITE_HELD(&ip->i_rwlock));
356 
357 	if (MANDLOCK(vp, ip->i_char)) {
358 		/*
359 		 * ud_getattr ends up being called by chklock
360 		 */
361 		error = chklock(vp, FWRITE, uiop->uio_loffset,
362 		    uiop->uio_resid, uiop->uio_fmode, ct);
363 		if (error) {
364 			goto end;
365 		}
366 	}
367 	/*
368 	 * Throttle writes.
369 	 */
370 	mutex_enter(&ip->i_tlock);
371 	if (ud_WRITES && (ip->i_writes > ud_HW)) {
372 		while (ip->i_writes > ud_HW) {
373 			ud_throttles++;
374 			cv_wait(&ip->i_wrcv, &ip->i_tlock);
375 		}
376 	}
377 	mutex_exit(&ip->i_tlock);
378 
379 	/*
380 	 * Write to the file
381 	 */
382 	rw_enter(&ip->i_contents, RW_WRITER);
383 	if ((ioflag & FAPPEND) != 0 && (ip->i_type == VREG)) {
384 		/*
385 		 * In append mode start at end of file.
386 		 */
387 		uiop->uio_loffset = ip->i_size;
388 	}
389 	error = ud_wrip(ip, uiop, ioflag, cr);
390 	rw_exit(&ip->i_contents);
391 
392 end:
393 #ifdef	__lock_lint
394 	rw_exit(&ip->i_rwlock);
395 #endif
396 
397 	return (error);
398 }
399 
400 /* ARGSUSED */
401 static int32_t
402 udf_ioctl(
403 	struct vnode *vp,
404 	int32_t cmd,
405 	intptr_t arg,
406 	int32_t flag,
407 	struct cred *cr,
408 	int32_t *rvalp,
409 	caller_context_t *ct)
410 {
411 	return (ENOTTY);
412 }
413 
414 /* ARGSUSED */
415 static int32_t
416 udf_getattr(
417 	struct vnode *vp,
418 	struct vattr *vap,
419 	int32_t flags,
420 	struct cred *cr,
421 	caller_context_t *ct)
422 {
423 	struct ud_inode *ip = VTOI(vp);
424 
425 	ud_printf("udf_getattr\n");
426 
427 	if (vap->va_mask == AT_SIZE) {
428 		/*
429 		 * for performance, if only the size is requested don't bother
430 		 * with anything else.
431 		 */
432 		vap->va_size = ip->i_size;
433 		return (0);
434 	}
435 
436 	rw_enter(&ip->i_contents, RW_READER);
437 
438 	vap->va_type = vp->v_type;
439 	vap->va_mode = UD2VA_PERM(ip->i_perm) | ip->i_char;
440 
441 	vap->va_uid = ip->i_uid;
442 	vap->va_gid = ip->i_gid;
443 	vap->va_fsid = ip->i_dev;
444 	vap->va_nodeid = ip->i_icb_lbano;
445 	vap->va_nlink = ip->i_nlink;
446 	vap->va_size = ip->i_size;
447 	vap->va_seq = ip->i_seq;
448 	if (vp->v_type == VCHR || vp->v_type == VBLK) {
449 		vap->va_rdev = ip->i_rdev;
450 	} else {
451 		vap->va_rdev = 0;
452 	}
453 
454 	mutex_enter(&ip->i_tlock);
455 	ITIMES_NOLOCK(ip);	/* mark correct time in inode */
456 	vap->va_atime.tv_sec = (time_t)ip->i_atime.tv_sec;
457 	vap->va_atime.tv_nsec = ip->i_atime.tv_nsec;
458 	vap->va_mtime.tv_sec = (time_t)ip->i_mtime.tv_sec;
459 	vap->va_mtime.tv_nsec = ip->i_mtime.tv_nsec;
460 	vap->va_ctime.tv_sec = (time_t)ip->i_ctime.tv_sec;
461 	vap->va_ctime.tv_nsec = ip->i_ctime.tv_nsec;
462 	mutex_exit(&ip->i_tlock);
463 
464 	switch (ip->i_type) {
465 		case VBLK:
466 			vap->va_blksize = MAXBSIZE;
467 			break;
468 		case VCHR:
469 			vap->va_blksize = MAXBSIZE;
470 			break;
471 		default:
472 			vap->va_blksize = ip->i_udf->udf_lbsize;
473 			break;
474 	}
475 	vap->va_nblocks = ip->i_lbr << ip->i_udf->udf_l2d_shift;
476 
477 	rw_exit(&ip->i_contents);
478 
479 	return (0);
480 }
481 
/*
 * ud_iaccess_vmode: access-check callback handed to
 * secpolicy_vnode_setattr() by udf_setattr().
 *
 *	ip	the inode, passed as an opaque pointer
 *	mode	requested access, converted with UD_UPERM2DPERM() before
 *		being given to ud_iaccess()
 *	cr	credentials to check
 *
 * Returns whatever ud_iaccess() returns; its final argument is passed
 * as 0.
 */
static int
ud_iaccess_vmode(void *ip, int mode, struct cred *cr)
{
	int32_t dperm = UD_UPERM2DPERM(mode);

	return (ud_iaccess(ip, dperm, cr, 0));
}
487 
488 /*ARGSUSED4*/
489 static int32_t
490 udf_setattr(
491 	struct vnode *vp,
492 	struct vattr *vap,
493 	int32_t flags,
494 	struct cred *cr,
495 	caller_context_t *ct)
496 {
497 	int32_t error = 0;
498 	uint32_t mask = vap->va_mask;
499 	struct ud_inode *ip;
500 	timestruc_t now;
501 	struct vattr ovap;
502 
503 	ud_printf("udf_setattr\n");
504 
505 	ip = VTOI(vp);
506 
507 	/*
508 	 * not updates allowed to 4096 files
509 	 */
510 	if (ip->i_astrat == STRAT_TYPE4096) {
511 		return (EINVAL);
512 	}
513 
514 	/*
515 	 * Cannot set these attributes
516 	 */
517 	if (mask & AT_NOSET) {
518 		return (EINVAL);
519 	}
520 
521 	rw_enter(&ip->i_rwlock, RW_WRITER);
522 	rw_enter(&ip->i_contents, RW_WRITER);
523 
524 	ovap.va_uid = ip->i_uid;
525 	ovap.va_mode = UD2VA_PERM(ip->i_perm) | ip->i_char;
526 	error = secpolicy_vnode_setattr(cr, vp, vap, &ovap, flags,
527 	    ud_iaccess_vmode, ip);
528 	if (error)
529 		goto update_inode;
530 
531 	mask = vap->va_mask;
532 	/*
533 	 * Change file access modes.
534 	 */
535 	if (mask & AT_MODE) {
536 		ip->i_perm = VA2UD_PERM(vap->va_mode);
537 		ip->i_char = vap->va_mode & (VSUID | VSGID | VSVTX);
538 		mutex_enter(&ip->i_tlock);
539 		ip->i_flag |= ICHG;
540 		mutex_exit(&ip->i_tlock);
541 	}
542 	if (mask & (AT_UID|AT_GID)) {
543 		if (mask & AT_UID) {
544 			ip->i_uid = vap->va_uid;
545 		}
546 		if (mask & AT_GID) {
547 			ip->i_gid = vap->va_gid;
548 		}
549 		mutex_enter(&ip->i_tlock);
550 		ip->i_flag |= ICHG;
551 		mutex_exit(&ip->i_tlock);
552 	}
553 	/*
554 	 * Truncate file.  Must have write permission and not be a directory.
555 	 */
556 	if (mask & AT_SIZE) {
557 		if (vp->v_type == VDIR) {
558 			error = EISDIR;
559 			goto update_inode;
560 		}
561 		if (error = ud_iaccess(ip, IWRITE, cr, 0)) {
562 			goto update_inode;
563 		}
564 		if (vap->va_size > MAXOFFSET_T) {
565 			error = EFBIG;
566 			goto update_inode;
567 		}
568 		if (error = ud_itrunc(ip, vap->va_size, 0, cr)) {
569 			goto update_inode;
570 		}
571 
572 		if (vap->va_size == 0)
573 			vnevent_truncate(vp, ct);
574 	}
575 	/*
576 	 * Change file access or modified times.
577 	 */
578 	if (mask & (AT_ATIME|AT_MTIME)) {
579 		mutex_enter(&ip->i_tlock);
580 		if (mask & AT_ATIME) {
581 			ip->i_atime.tv_sec = vap->va_atime.tv_sec;
582 			ip->i_atime.tv_nsec = vap->va_atime.tv_nsec;
583 			ip->i_flag &= ~IACC;
584 		}
585 		if (mask & AT_MTIME) {
586 			ip->i_mtime.tv_sec = vap->va_mtime.tv_sec;
587 			ip->i_mtime.tv_nsec = vap->va_mtime.tv_nsec;
588 			gethrestime(&now);
589 			ip->i_ctime.tv_sec = now.tv_sec;
590 			ip->i_ctime.tv_nsec = now.tv_nsec;
591 			ip->i_flag &= ~(IUPD|ICHG);
592 			ip->i_flag |= IMODTIME;
593 		}
594 		ip->i_flag |= IMOD;
595 		mutex_exit(&ip->i_tlock);
596 	}
597 
598 update_inode:
599 	if (curthread->t_flag & T_DONTPEND) {
600 		ud_iupdat(ip, 1);
601 	} else {
602 		ITIMES_NOLOCK(ip);
603 	}
604 	rw_exit(&ip->i_contents);
605 	rw_exit(&ip->i_rwlock);
606 
607 	return (error);
608 }
609 
610 /* ARGSUSED */
611 static int32_t
612 udf_access(
613 	struct vnode *vp,
614 	int32_t mode,
615 	int32_t flags,
616 	struct cred *cr,
617 	caller_context_t *ct)
618 {
619 	struct ud_inode *ip = VTOI(vp);
620 
621 	ud_printf("udf_access\n");
622 
623 	if (ip->i_udf == NULL) {
624 		return (EIO);
625 	}
626 
627 	return (ud_iaccess(ip, UD_UPERM2DPERM(mode), cr, 1));
628 }
629 
/*
 * When nonzero, udf_lookup() sets VISSWAP on the vnode of any
 * non-directory it returns that has ISVTX set and IEXEC clear.
 */
int32_t udfs_stickyhack = 1;
631 
632 /* ARGSUSED */
633 static int32_t
634 udf_lookup(
635 	struct vnode *dvp,
636 	char *nm,
637 	struct vnode **vpp,
638 	struct pathname *pnp,
639 	int32_t flags,
640 	struct vnode *rdir,
641 	struct cred *cr,
642 	caller_context_t *ct,
643 	int *direntflags,
644 	pathname_t *realpnp)
645 {
646 	int32_t error;
647 	struct vnode *vp;
648 	struct ud_inode *ip, *xip;
649 
650 	ud_printf("udf_lookup\n");
651 	/*
652 	 * Null component name is a synonym for directory being searched.
653 	 */
654 	if (*nm == '\0') {
655 		VN_HOLD(dvp);
656 		*vpp = dvp;
657 		error = 0;
658 		goto out;
659 	}
660 
661 	/*
662 	 * Fast path: Check the directory name lookup cache.
663 	 */
664 	ip = VTOI(dvp);
665 	if (vp = dnlc_lookup(dvp, nm)) {
666 		/*
667 		 * Check accessibility of directory.
668 		 */
669 		if ((error = ud_iaccess(ip, IEXEC, cr, 1)) != 0) {
670 			VN_RELE(vp);
671 		}
672 		xip = VTOI(vp);
673 	} else {
674 		error = ud_dirlook(ip, nm, &xip, cr, 1);
675 		ITIMES(ip);
676 	}
677 
678 	if (error == 0) {
679 		ip = xip;
680 		*vpp = ITOV(ip);
681 		if ((ip->i_type != VDIR) &&
682 		    (ip->i_char & ISVTX) &&
683 		    ((ip->i_perm & IEXEC) == 0) &&
684 		    udfs_stickyhack) {
685 			mutex_enter(&(*vpp)->v_lock);
686 			(*vpp)->v_flag |= VISSWAP;
687 			mutex_exit(&(*vpp)->v_lock);
688 		}
689 		ITIMES(ip);
690 		/*
691 		 * If vnode is a device return special vnode instead.
692 		 */
693 		if (IS_DEVVP(*vpp)) {
694 			struct vnode *newvp;
695 			newvp = specvp(*vpp, (*vpp)->v_rdev,
696 			    (*vpp)->v_type, cr);
697 			VN_RELE(*vpp);
698 			if (newvp == NULL) {
699 				error = ENOSYS;
700 			} else {
701 				*vpp = newvp;
702 			}
703 		}
704 	}
705 out:
706 	return (error);
707 }
708 
/*
 * udf_create: create (or open an existing) file named name in dvp.
 *
 *	dvp	directory in which to create
 *	name	component name; an empty string refers to dvp itself
 *	vap	attributes for the new file; VSVTX is stripped from
 *		va_mode when secpolicy_vnode_stky_modify() refuses it
 *	excl	NONEXCL allows an existing file to be returned
 *	mode	access mode to check against an existing file
 *	vpp	on success, receives the resulting vnode
 *	cr	caller's credentials
 *	ct	caller context, passed to ud_direnter()/vnevent_create()
 *	flag, vsecp are unused.
 *
 * Returns 0 on success; EEXIST for an exclusive create of something
 * that exists; EISDIR for a write-mode create over a directory; ENOSYS
 * if specvp() fails; or an error from ud_direnter() / ud_iaccess().
 */
/* ARGSUSED */
static int32_t
udf_create(
	struct vnode *dvp,
	char *name,
	struct vattr *vap,
	enum vcexcl excl,
	int32_t mode,
	struct vnode **vpp,
	struct cred *cr,
	int32_t flag,
	caller_context_t *ct,
	vsecattr_t *vsecp)
{
	int32_t error;
	struct ud_inode *ip = VTOI(dvp), *xip;

	ud_printf("udf_create\n");

	/* Silently drop the sticky bit if policy does not allow it. */
	if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr) != 0)
		vap->va_mode &= ~VSVTX;

	if (*name == '\0') {
		/*
		 * Null component name refers to the directory itself.
		 * Take a hold so it can be treated like any other
		 * already-existing target below.
		 */
		VN_HOLD(dvp);
		ITIMES(ip);
		error = EEXIST;
	} else {
		xip = NULL;
		rw_enter(&ip->i_rwlock, RW_WRITER);
		error = ud_direnter(ip, name, DE_CREATE,
		    (struct ud_inode *)0, (struct ud_inode *)0,
		    vap, &xip, cr, ct);
		rw_exit(&ip->i_rwlock);
		ITIMES(ip);
		/* From here on ip is the target inode (possibly NULL). */
		ip = xip;
	}
	/*
	 * Outside of lock_lint builds, i_contents is only taken when
	 * there is a target inode to lock.
	 */
#ifdef	__lock_lint
	rw_enter(&ip->i_contents, RW_WRITER);
#else
	if (ip != NULL) {
		rw_enter(&ip->i_contents, RW_WRITER);
	}
#endif

	/*
	 * If the file already exists and this is a non-exclusive create,
	 * check permissions and allow access for non-directories.
	 * Read-only create of an existing directory is also allowed.
	 * We fail an exclusive create of anything which already exists.
	 */
	if (error == EEXIST) {
		if (excl == NONEXCL) {
			if ((ip->i_type == VDIR) && (mode & VWRITE)) {
				error = EISDIR;
			} else if (mode) {
				error = ud_iaccess(ip,
				    UD_UPERM2DPERM(mode), cr, 0);
			} else {
				error = 0;
			}
		}
		if (error) {
			/* Drop the lock and the hold on the existing file. */
			rw_exit(&ip->i_contents);
			VN_RELE(ITOV(ip));
			goto out;
		} else if ((ip->i_type == VREG) &&
		    (vap->va_mask & AT_SIZE) && vap->va_size == 0) {
			/*
			 * Truncate regular files, if requested by caller.
			 * Grab i_rwlock to make sure no one else is
			 * currently writing to the file (we promised
			 * bmap we would do this).
			 * Must get the locks in the correct order.
			 */
			if (ip->i_size == 0) {
				/* Already empty: just mark it changed. */
				ip->i_flag |= ICHG | IUPD;
			} else {
				/*
				 * i_contents is dropped and re-taken so
				 * that i_rwlock is acquired first.
				 */
				rw_exit(&ip->i_contents);
				rw_enter(&ip->i_rwlock, RW_WRITER);
				rw_enter(&ip->i_contents, RW_WRITER);
				(void) ud_itrunc(ip, 0, 0, cr);
				rw_exit(&ip->i_rwlock);
			}
			vnevent_create(ITOV(ip), ct);
		}
	}

	if (error == 0) {
		*vpp = ITOV(ip);
		ITIMES(ip);
	}
#ifdef	__lock_lint
	rw_exit(&ip->i_contents);
#else
	if (ip != NULL) {
		rw_exit(&ip->i_contents);
	}
#endif
	if (error) {
		goto out;
	}

	/*
	 * If vnode is a device return special vnode instead.
	 */
	if (!error && IS_DEVVP(*vpp)) {
		struct vnode *newvp;

		newvp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
		VN_RELE(*vpp);
		if (newvp == NULL) {
			error = ENOSYS;
			goto out;
		}
		*vpp = newvp;
	}
out:
	return (error);
}
831 
832 /* ARGSUSED */
833 static int32_t
834 udf_remove(
835 	struct vnode *vp,
836 	char *nm,
837 	struct cred *cr,
838 	caller_context_t *ct,
839 	int flags)
840 {
841 	int32_t error;
842 	struct ud_inode *ip = VTOI(vp);
843 
844 	ud_printf("udf_remove\n");
845 
846 	rw_enter(&ip->i_rwlock, RW_WRITER);
847 	error = ud_dirremove(ip, nm,
848 	    (struct ud_inode *)0, (struct vnode *)0, DR_REMOVE, cr, ct);
849 	rw_exit(&ip->i_rwlock);
850 	ITIMES(ip);
851 
852 	return (error);
853 }
854 
855 /* ARGSUSED */
856 static int32_t
857 udf_link(
858 	struct vnode *tdvp,
859 	struct vnode *svp,
860 	char *tnm,
861 	struct cred *cr,
862 	caller_context_t *ct,
863 	int flags)
864 {
865 	int32_t error;
866 	struct vnode *realvp;
867 	struct ud_inode *sip;
868 	struct ud_inode *tdp;
869 
870 	ud_printf("udf_link\n");
871 	if (VOP_REALVP(svp, &realvp, ct) == 0) {
872 		svp = realvp;
873 	}
874 
875 	/*
876 	 * Do not allow links to directories
877 	 */
878 	if (svp->v_type == VDIR) {
879 		return (EPERM);
880 	}
881 
882 	sip = VTOI(svp);
883 
884 	if (sip->i_uid != crgetuid(cr) && secpolicy_basic_link(cr) != 0)
885 		return (EPERM);
886 
887 	tdp = VTOI(tdvp);
888 
889 	rw_enter(&tdp->i_rwlock, RW_WRITER);
890 	error = ud_direnter(tdp, tnm, DE_LINK, (struct ud_inode *)0,
891 	    sip, (struct vattr *)0, (struct ud_inode **)0, cr, ct);
892 	rw_exit(&tdp->i_rwlock);
893 	ITIMES(sip);
894 	ITIMES(tdp);
895 
896 	if (error == 0) {
897 		vnevent_link(svp, ct);
898 	}
899 
900 	return (error);
901 }
902 
/*
 * udf_rename: rename snm in directory sdvp to tnm in directory tdvp.
 *
 *	sdvp	source directory
 *	snm	source entry name
 *	tdvp	target directory; replaced by its real vnode when
 *		VOP_REALVP() supplies one
 *	tnm	target entry name
 *	cr	caller's credentials
 *	ct	caller context, passed to the vnevent and directory routines
 *	flags	unused
 *
 * The whole operation runs under the file system's udf_rename_lck
 * mutex.  The source is first linked into the target directory
 * (ud_direnter, DE_RENAME) and then removed from the source directory
 * (ud_dirremove, DR_RENAME).
 *
 * Returns 0 on success (including the "source and target are the same
 * file" case), EBUSY if something is mounted on the source, EINVAL for
 * attempts to rename "." or "..", or an error from the lookup, access
 * or directory routines.
 */
/* ARGSUSED */
static int32_t
udf_rename(
	struct vnode *sdvp,
	char *snm,
	struct vnode *tdvp,
	char *tnm,
	struct cred *cr,
	caller_context_t *ct,
	int flags)
{
	int32_t error = 0;
	struct udf_vfs *udf_vfsp;
	struct ud_inode *sip;		/* source inode */
	struct ud_inode *tip;		/* target inode */
	struct ud_inode *sdp, *tdp;	/* source and target parent inode */
	struct vnode *realvp;

	ud_printf("udf_rename\n");

	if (VOP_REALVP(tdvp, &realvp, ct) == 0) {
		tdvp = realvp;
	}

	sdp = VTOI(sdvp);
	tdp = VTOI(tdvp);

	udf_vfsp = sdp->i_udf;

	mutex_enter(&udf_vfsp->udf_rename_lck);
	/*
	 * Look up inode of file we're supposed to rename.
	 * On failure there is no sip to release, so return directly
	 * rather than going through errout.
	 */
	if (error = ud_dirlook(sdp, snm, &sip, cr, 0)) {
		mutex_exit(&udf_vfsp->udf_rename_lck);
		return (error);
	}
	/*
	 * be sure this is not a directory with another file system mounted
	 * over it.  If it is just give up the locks, and return with
	 * EBUSY
	 */
	if (vn_mountedvfs(ITOV(sip)) != NULL) {
		error = EBUSY;
		goto errout;
	}
	/*
	 * Make sure we can delete the source entry.  This requires
	 * write permission on the containing directory.  If that
	 * directory is "sticky" it further requires (except for
	 * privileged users) that the user own the directory or the
	 * source entry, or else have permission to write the source
	 * entry.
	 */
	rw_enter(&sdp->i_contents, RW_READER);
	rw_enter(&sip->i_contents, RW_READER);
	if ((error = ud_iaccess(sdp, IWRITE, cr, 0)) != 0 ||
	    (error = ud_sticky_remove_access(sdp, sip, cr)) != 0) {
		rw_exit(&sip->i_contents);
		rw_exit(&sdp->i_contents);
		ITIMES(sip);
		goto errout;
	}

	/*
	 * Check for renaming '.' or '..' or alias of '.'
	 */
	if ((strcmp(snm, ".") == 0) ||
	    (strcmp(snm, "..") == 0) ||
	    (sdp == sip)) {
		error = EINVAL;
		rw_exit(&sip->i_contents);
		rw_exit(&sdp->i_contents);
		goto errout;
	}

	rw_exit(&sip->i_contents);
	rw_exit(&sdp->i_contents);

	/*
	 * If the target name already exists, post the pre-rename event
	 * on it.  The hold from the lookup is dropped immediately; a
	 * lookup failure here is not an error.
	 */
	if (ud_dirlook(tdp, tnm, &tip, cr, 0) == 0) {
		vnevent_pre_rename_dest(ITOV(tip), tdvp, tnm, ct);
		VN_RELE(ITOV(tip));
	}

	/* Notify the target dir. if not the same as the source dir. */
	if (sdvp != tdvp)
		vnevent_pre_rename_dest_dir(tdvp, ITOV(sip), tnm, ct);

	vnevent_pre_rename_src(ITOV(sip), sdvp, snm, ct);

	/*
	 * Link source to the target.
	 */
	rw_enter(&tdp->i_rwlock, RW_WRITER);
	if (error = ud_direnter(tdp, tnm, DE_RENAME, sdp, sip,
	    (struct vattr *)0, (struct ud_inode **)0, cr, ct)) {
		/*
		 * ESAME isn't really an error; it indicates that the
		 * operation should not be done because the source and target
		 * are the same file, but that no error should be reported.
		 */
		if (error == ESAME) {
			error = 0;
		}
		rw_exit(&tdp->i_rwlock);
		goto errout;
	}
	rw_exit(&tdp->i_rwlock);

	rw_enter(&sdp->i_rwlock, RW_WRITER);
	/*
	 * Unlink the source.
	 * Remove the source entry.  ud_dirremove() checks that the entry
	 * still reflects sip, and returns an error if it doesn't.
	 * If the entry has changed just forget about it.  Release
	 * the source inode.
	 */
	if ((error = ud_dirremove(sdp, snm, sip, (struct vnode *)0,
	    DR_RENAME, cr, ct)) == ENOENT) {
		error = 0;
	}
	rw_exit(&sdp->i_rwlock);

	if (error == 0) {
		vnevent_rename_src(ITOV(sip), sdvp, snm, ct);
		/*
		 * vnevent_rename_dest and vnevent_rename_dest_dir are called
		 * in ud_direnter().
		 */
	}

errout:
	/*
	 * Common exit: mark times on both directories, drop the hold on
	 * the source inode obtained by the initial lookup, and release
	 * the rename mutex.
	 */
	ITIMES(sdp);
	ITIMES(tdp);
	VN_RELE(ITOV(sip));
	mutex_exit(&udf_vfsp->udf_rename_lck);

	return (error);
}
1042 
1043 /* ARGSUSED */
1044 static int32_t
1045 udf_mkdir(
1046 	struct vnode *dvp,
1047 	char *dirname,
1048 	struct vattr *vap,
1049 	struct vnode **vpp,
1050 	struct cred *cr,
1051 	caller_context_t *ct,
1052 	int flags,
1053 	vsecattr_t *vsecp)
1054 {
1055 	int32_t error;
1056 	struct ud_inode *ip;
1057 	struct ud_inode *xip;
1058 
1059 	ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
1060 
1061 	ud_printf("udf_mkdir\n");
1062 
1063 	ip = VTOI(dvp);
1064 	rw_enter(&ip->i_rwlock, RW_WRITER);
1065 	error = ud_direnter(ip, dirname, DE_MKDIR,
1066 	    (struct ud_inode *)0, (struct ud_inode *)0, vap, &xip, cr, ct);
1067 	rw_exit(&ip->i_rwlock);
1068 	ITIMES(ip);
1069 	if (error == 0) {
1070 		ip = xip;
1071 		*vpp = ITOV(ip);
1072 		ITIMES(ip);
1073 	} else if (error == EEXIST) {
1074 		ITIMES(xip);
1075 		VN_RELE(ITOV(xip));
1076 	}
1077 
1078 	return (error);
1079 }
1080 
1081 /* ARGSUSED */
1082 static int32_t
1083 udf_rmdir(
1084 	struct vnode *vp,
1085 	char *nm,
1086 	struct vnode *cdir,
1087 	struct cred *cr,
1088 	caller_context_t *ct,
1089 	int flags)
1090 {
1091 	int32_t error;
1092 	struct ud_inode *ip = VTOI(vp);
1093 
1094 	ud_printf("udf_rmdir\n");
1095 
1096 	rw_enter(&ip->i_rwlock, RW_WRITER);
1097 	error = ud_dirremove(ip, nm, (struct ud_inode *)0, cdir, DR_RMDIR,
1098 	    cr, ct);
1099 	rw_exit(&ip->i_rwlock);
1100 	ITIMES(ip);
1101 
1102 	return (error);
1103 }
1104 
/*
 * Read directory entries from the directory vnode "vp" into the caller's
 * uio as a sequence of struct dirent64 records.
 *
 * Entries are first assembled in a kernel buffer sized from the first
 * iovec of the uio and then copied out with a single uiomove().  On
 * success uio_offset is advanced to the directory offset of the next
 * unread file identifier, and *eofp (if supplied) reports whether that
 * offset has reached the end of the directory.
 *
 * The "." entry is not read from the directory; it is synthesized when the
 * incoming offset is 0 and is given the cookie 0x10.  A later call that
 * arrives with offset 0x10 restarts the scan of the file identifiers at
 * directory offset 0.
 *
 * Returns 0 on success, EINVAL when the "." or ".." record does not fit
 * or when no entry at all fits in the buffer, or an error from
 * ud_get_next_fid(), ud_uncompress() or uiomove().
 */
/* ARGSUSED */
static int32_t
udf_readdir(
	struct vnode *vp,
	struct uio *uiop,
	struct cred *cr,
	int32_t *eofp,
	caller_context_t *ct,
	int flags)
{
	struct ud_inode *ip;
	struct dirent64 *nd;
	struct udf_vfs *udf_vfsp;
	int32_t error = 0, len, outcount = 0;
	uint32_t dirsiz, offset;
	uint32_t bufsize, ndlen, dummy;
	caddr_t outbuf;
	caddr_t outb, end_outb;
	struct iovec *iovp;

	uint8_t *dname;
	int32_t length;

	uint8_t *buf = NULL;

	struct fbuf *fbp = NULL;
	struct file_id *fid;
	uint8_t *name;


	ud_printf("udf_readdir\n");

	ip = VTOI(vp);
	udf_vfsp = ip->i_udf;

	/*
	 * Nothing to return when the caller is already at or past the end
	 * of the directory, or when the directory has no links left.
	 */
	dirsiz = ip->i_size;
	if ((uiop->uio_offset >= dirsiz) ||
	    (ip->i_nlink <= 0)) {
		if (eofp) {
			*eofp = 1;
		}
		return (0);
	}

	/* Only the first iovec is used to size the staging buffer. */
	offset = uiop->uio_offset;
	iovp = uiop->uio_iov;
	bufsize = iovp->iov_len;

	outb = outbuf = (char *)kmem_alloc((uint32_t)bufsize, KM_SLEEP);
	end_outb = outb + bufsize;
	nd = (struct dirent64 *)outbuf;

	/*
	 * dname receives the uncompressed entry name; buf is a scratch
	 * buffer of one logical block handed to ud_get_next_fid().
	 */
	dname = (uint8_t *)kmem_zalloc(1024, KM_SLEEP);
	buf = (uint8_t *)kmem_zalloc(udf_vfsp->udf_lbsize, KM_SLEEP);

	if (offset == 0) {
		/* Synthesize the "." entry; its cookie is the sentinel 0x10. */
		len = DIRENT64_RECLEN(1);
		if (((caddr_t)nd + len) >= end_outb) {
			error = EINVAL;
			goto end;
		}
		nd->d_ino = ip->i_icb_lbano;
		nd->d_reclen = (uint16_t)len;
		nd->d_off = 0x10;
		nd->d_name[0] = '.';
		bzero(&nd->d_name[1], DIRENT64_NAMELEN(len) - 1);
		nd = (struct dirent64 *)((char *)nd + nd->d_reclen);
		outcount++;
	} else if (offset == 0x10) {
		/* Caller resumed right after "."; scan from the first FID. */
		offset = 0;
	}

	while (offset < dirsiz) {
		error = ud_get_next_fid(ip, &fbp,
		    offset, &fid, &name, buf);
		if (error != 0) {
			break;
		}

		/* Deleted file identifiers are skipped without output. */
		if ((fid->fid_flags & FID_DELETED) == 0) {
			if (fid->fid_flags & FID_PARENT) {

				/*
				 * Parent entry, reported as "..".
				 * NOTE(review): unlike the named-entry case
				 * below, a full buffer here yields EINVAL even
				 * if "." was already emitted, and d_ino is set
				 * from this directory's own i_icb_lbano rather
				 * than from fid->fid_icb -- confirm intended.
				 */
				len = DIRENT64_RECLEN(2);
				if (((caddr_t)nd + len) >= end_outb) {
					error = EINVAL;
					break;
				}

				nd->d_ino = ip->i_icb_lbano;
				nd->d_reclen = (uint16_t)len;
				nd->d_off = offset + FID_LEN(fid);
				nd->d_name[0] = '.';
				nd->d_name[1] = '.';
				bzero(&nd->d_name[2],
				    DIRENT64_NAMELEN(len) - 2);
				nd = (struct dirent64 *)
				    ((char *)nd + nd->d_reclen);
			} else {
				if ((error = ud_uncompress(fid->fid_idlen,
				    &length, name, dname)) != 0) {
					break;
				}
				/* Entries with an empty name are skipped. */
				if (length == 0) {
					offset += FID_LEN(fid);
					continue;
				}
				len = DIRENT64_RECLEN(length);
				if (((caddr_t)nd + len) >= end_outb) {
					/*
					 * Buffer is full.  This is only an
					 * error if nothing has been produced;
					 * otherwise return what we have and
					 * leave offset at this entry.
					 */
					if (!outcount) {
						error = EINVAL;
					}
					break;
				}
				/* Copy the name and zero the record padding. */
				(void) strncpy(nd->d_name,
				    (caddr_t)dname, length);
				bzero(&nd->d_name[length],
				    DIRENT64_NAMELEN(len) - length);
				nd->d_ino = ud_xlate_to_daddr(udf_vfsp,
				    SWAP_16(fid->fid_icb.lad_ext_prn),
				    SWAP_32(fid->fid_icb.lad_ext_loc), 1,
				    &dummy);
				nd->d_reclen = (uint16_t)len;
				nd->d_off = offset + FID_LEN(fid);
				nd = (struct dirent64 *)
				    ((char *)nd + nd->d_reclen);
			}
			outcount++;
		}

		offset += FID_LEN(fid);
	}

end:
	if (fbp != NULL) {
		fbrelse(fbp, S_OTHER);
	}
	/* Number of bytes of dirent64 records assembled in outbuf. */
	ndlen = ((char *)nd - outbuf);
	/*
	 * In case of error do not call uiomove.
	 * Return the error to the caller.
	 */
	if ((error == 0) && (ndlen != 0)) {
		error = uiomove(outbuf, (long)ndlen, UIO_READ, uiop);
		/* Replace the byte count uiomove added with the dir cookie. */
		uiop->uio_offset = offset;
	}
	kmem_free((caddr_t)buf, udf_vfsp->udf_lbsize);
	kmem_free((caddr_t)dname, 1024);
	kmem_free(outbuf, (uint32_t)bufsize);
	if (eofp && error == 0) {
		*eofp = (uiop->uio_offset >= dirsiz);
	}
	return (error);
}
1258 
1259 /* ARGSUSED */
1260 static int32_t
1261 udf_symlink(
1262 	struct vnode *dvp,
1263 	char *linkname,
1264 	struct vattr *vap,
1265 	char *target,
1266 	struct cred *cr,
1267 	caller_context_t *ct,
1268 	int flags)
1269 {
1270 	int32_t error = 0, outlen;
1271 	uint32_t ioflag = 0;
1272 	struct ud_inode *ip, *dip = VTOI(dvp);
1273 
1274 	struct path_comp *pc;
1275 	int8_t *dname = NULL, *uname = NULL, *sp;
1276 
1277 	ud_printf("udf_symlink\n");
1278 
1279 	ip = (struct ud_inode *)0;
1280 	vap->va_type = VLNK;
1281 	vap->va_rdev = 0;
1282 
1283 	rw_enter(&dip->i_rwlock, RW_WRITER);
1284 	error = ud_direnter(dip, linkname, DE_CREATE,
1285 	    (struct ud_inode *)0, (struct ud_inode *)0, vap, &ip, cr, ct);
1286 	rw_exit(&dip->i_rwlock);
1287 	if (error == 0) {
1288 		dname = kmem_zalloc(1024, KM_SLEEP);
1289 		uname = kmem_zalloc(PAGESIZE, KM_SLEEP);
1290 
1291 		pc = (struct path_comp *)uname;
1292 		/*
1293 		 * If the first character in target is "/"
1294 		 * then skip it and create entry for it
1295 		 */
1296 		if (*target == '/') {
1297 			pc->pc_type = 2;
1298 			pc->pc_len = 0;
1299 			pc = (struct path_comp *)(((char *)pc) + 4);
1300 			while (*target == '/') {
1301 				target++;
1302 			}
1303 		}
1304 
1305 		while (*target != NULL) {
1306 			sp = target;
1307 			while ((*target != '/') && (*target != '\0')) {
1308 				target ++;
1309 			}
1310 			/*
1311 			 * We got the next component of the
1312 			 * path name. Create path_comp of
1313 			 * appropriate type
1314 			 */
1315 			if (((target - sp) == 1) && (*sp == '.')) {
1316 				/*
1317 				 * Dot entry.
1318 				 */
1319 				pc->pc_type = 4;
1320 				pc = (struct path_comp *)(((char *)pc) + 4);
1321 			} else if (((target - sp) == 2) &&
1322 			    (*sp == '.') && ((*(sp + 1)) == '.')) {
1323 				/*
1324 				 * DotDot entry.
1325 				 */
1326 				pc->pc_type = 3;
1327 				pc = (struct path_comp *)(((char *)pc) + 4);
1328 			} else {
1329 				/*
1330 				 * convert the user given name
1331 				 * into appropriate form to be put
1332 				 * on the media
1333 				 */
1334 				outlen = 1024;	/* set to size of dname */
1335 				if (error = ud_compress(target - sp, &outlen,
1336 				    (uint8_t *)sp, (uint8_t *)dname)) {
1337 					break;
1338 				}
1339 				pc->pc_type = 5;
1340 				/* LINTED */
1341 				pc->pc_len = outlen;
1342 				dname[outlen] = '\0';
1343 				(void) strcpy((char *)pc->pc_id, dname);
1344 				pc = (struct path_comp *)
1345 				    (((char *)pc) + 4 + outlen);
1346 			}
1347 			while (*target == '/') {
1348 				target++;
1349 			}
1350 			if (*target == NULL) {
1351 				break;
1352 			}
1353 		}
1354 
1355 		rw_enter(&ip->i_contents, RW_WRITER);
1356 		if (error == 0) {
1357 			ioflag = FWRITE;
1358 			if (curthread->t_flag & T_DONTPEND) {
1359 				ioflag |= FDSYNC;
1360 			}
1361 			error = ud_rdwri(UIO_WRITE, ioflag, ip,
1362 			    uname, ((int8_t *)pc) - uname,
1363 			    (offset_t)0, UIO_SYSSPACE, (int32_t *)0, cr);
1364 		}
1365 		if (error) {
1366 			ud_idrop(ip);
1367 			rw_exit(&ip->i_contents);
1368 			rw_enter(&dip->i_rwlock, RW_WRITER);
1369 			(void) ud_dirremove(dip, linkname, (struct ud_inode *)0,
1370 			    (struct vnode *)0, DR_REMOVE, cr, ct);
1371 			rw_exit(&dip->i_rwlock);
1372 			goto update_inode;
1373 		}
1374 		rw_exit(&ip->i_contents);
1375 	}
1376 
1377 	if ((error == 0) || (error == EEXIST)) {
1378 		VN_RELE(ITOV(ip));
1379 	}
1380 
1381 update_inode:
1382 	ITIMES(VTOI(dvp));
1383 	if (uname != NULL) {
1384 		kmem_free(uname, PAGESIZE);
1385 	}
1386 	if (dname != NULL) {
1387 		kmem_free(dname, 1024);
1388 	}
1389 
1390 	return (error);
1391 }
1392 
1393 /* ARGSUSED */
1394 static int32_t
1395 udf_readlink(
1396 	struct vnode *vp,
1397 	struct uio *uiop,
1398 	struct cred *cr,
1399 	caller_context_t *ct)
1400 {
1401 	int32_t error = 0, off, id_len, size, len;
1402 	int8_t *dname = NULL, *uname = NULL;
1403 	struct ud_inode *ip;
1404 	struct fbuf *fbp = NULL;
1405 	struct path_comp *pc;
1406 
1407 	ud_printf("udf_readlink\n");
1408 
1409 	if (vp->v_type != VLNK) {
1410 		return (EINVAL);
1411 	}
1412 
1413 	ip = VTOI(vp);
1414 	size = ip->i_size;
1415 	if (size > PAGESIZE) {
1416 		return (EIO);
1417 	}
1418 
1419 	if (size == 0) {
1420 		return (0);
1421 	}
1422 
1423 	dname = kmem_zalloc(1024, KM_SLEEP);
1424 	uname = kmem_zalloc(PAGESIZE, KM_SLEEP);
1425 
1426 	rw_enter(&ip->i_contents, RW_READER);
1427 
1428 	if ((error = fbread(vp, 0, size, S_READ, &fbp)) != 0) {
1429 		goto end;
1430 	}
1431 
1432 	off = 0;
1433 
1434 	while (off < size) {
1435 		pc = (struct path_comp *)(fbp->fb_addr + off);
1436 		switch (pc->pc_type) {
1437 			case 1 :
1438 				(void) strcpy(uname, ip->i_udf->udf_fsmnt);
1439 				(void) strcat(uname, "/");
1440 				break;
1441 			case 2 :
1442 				if (pc->pc_len != 0) {
1443 					goto end;
1444 				}
1445 				uname[0] = '/';
1446 				uname[1] = '\0';
1447 				break;
1448 			case 3 :
1449 				(void) strcat(uname, "../");
1450 				break;
1451 			case 4 :
1452 				(void) strcat(uname, "./");
1453 				break;
1454 			case 5 :
1455 				if ((error = ud_uncompress(pc->pc_len, &id_len,
1456 				    pc->pc_id, (uint8_t *)dname)) != 0) {
1457 					break;
1458 				}
1459 				dname[id_len] = '\0';
1460 				(void) strcat(uname, dname);
1461 				(void) strcat(uname, "/");
1462 				break;
1463 			default :
1464 				error = EINVAL;
1465 				goto end;
1466 		}
1467 		off += 4 + pc->pc_len;
1468 	}
1469 	len = strlen(uname) - 1;
1470 	if (uname[len] == '/') {
1471 		if (len == 0) {
1472 			/*
1473 			 * special case link to /
1474 			 */
1475 			len = 1;
1476 		} else {
1477 			uname[len] = '\0';
1478 		}
1479 	}
1480 
1481 	error = uiomove(uname, len, UIO_READ, uiop);
1482 
1483 	ITIMES(ip);
1484 
1485 end:
1486 	if (fbp != NULL) {
1487 		fbrelse(fbp, S_OTHER);
1488 	}
1489 	rw_exit(&ip->i_contents);
1490 	if (uname != NULL) {
1491 		kmem_free(uname, PAGESIZE);
1492 	}
1493 	if (dname != NULL) {
1494 		kmem_free(dname, 1024);
1495 	}
1496 	return (error);
1497 }
1498 
1499 /* ARGSUSED */
1500 static int32_t
1501 udf_fsync(
1502 	struct vnode *vp,
1503 	int32_t syncflag,
1504 	struct cred *cr,
1505 	caller_context_t *ct)
1506 {
1507 	int32_t error = 0;
1508 	struct ud_inode *ip = VTOI(vp);
1509 
1510 	ud_printf("udf_fsync\n");
1511 
1512 	rw_enter(&ip->i_contents, RW_WRITER);
1513 	if (!(IS_SWAPVP(vp))) {
1514 		error = ud_syncip(ip, 0, I_SYNC); /* Do synchronous writes */
1515 	}
1516 	if (error == 0) {
1517 		error = ud_sync_indir(ip);
1518 	}
1519 	ITIMES(ip);		/* XXX: is this necessary ??? */
1520 	rw_exit(&ip->i_contents);
1521 
1522 	return (error);
1523 }
1524 
1525 /* ARGSUSED */
1526 static void
1527 udf_inactive(struct vnode *vp, struct cred *cr, caller_context_t *ct)
1528 {
1529 	ud_printf("udf_iinactive\n");
1530 
1531 	ud_iinactive(VTOI(vp), cr);
1532 }
1533 
1534 /* ARGSUSED */
1535 static int32_t
1536 udf_fid(struct vnode *vp, struct fid *fidp, caller_context_t *ct)
1537 {
1538 	struct udf_fid *udfidp;
1539 	struct ud_inode *ip = VTOI(vp);
1540 
1541 	ud_printf("udf_fid\n");
1542 
1543 	if (fidp->fid_len < (sizeof (struct udf_fid) - sizeof (uint16_t))) {
1544 		fidp->fid_len = sizeof (struct udf_fid) - sizeof (uint16_t);
1545 		return (ENOSPC);
1546 	}
1547 
1548 	udfidp = (struct udf_fid *)fidp;
1549 	bzero((char *)udfidp, sizeof (struct udf_fid));
1550 	rw_enter(&ip->i_contents, RW_READER);
1551 	udfidp->udfid_len = sizeof (struct udf_fid) - sizeof (uint16_t);
1552 	udfidp->udfid_uinq_lo = ip->i_uniqid & 0xffffffff;
1553 	udfidp->udfid_prn = ip->i_icb_prn;
1554 	udfidp->udfid_icb_lbn = ip->i_icb_block;
1555 	rw_exit(&ip->i_contents);
1556 
1557 	return (0);
1558 }
1559 
1560 /* ARGSUSED2 */
1561 static int
1562 udf_rwlock(struct vnode *vp, int32_t write_lock, caller_context_t *ctp)
1563 {
1564 	struct ud_inode *ip = VTOI(vp);
1565 
1566 	ud_printf("udf_rwlock\n");
1567 
1568 	if (write_lock) {
1569 		rw_enter(&ip->i_rwlock, RW_WRITER);
1570 	} else {
1571 		rw_enter(&ip->i_rwlock, RW_READER);
1572 	}
1573 #ifdef	__lock_lint
1574 	rw_exit(&ip->i_rwlock);
1575 #endif
1576 	return (write_lock);
1577 }
1578 
1579 /* ARGSUSED */
1580 static void
1581 udf_rwunlock(struct vnode *vp, int32_t write_lock, caller_context_t *ctp)
1582 {
1583 	struct ud_inode *ip = VTOI(vp);
1584 
1585 	ud_printf("udf_rwunlock\n");
1586 
1587 #ifdef	__lock_lint
1588 	rw_enter(&ip->i_rwlock, RW_WRITER);
1589 #endif
1590 
1591 	rw_exit(&ip->i_rwlock);
1592 
1593 }
1594 
1595 /* ARGSUSED */
1596 static int32_t
1597 udf_seek(struct vnode *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct)
1598 {
1599 	return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
1600 }
1601 
1602 static int32_t
1603 udf_frlock(
1604 	struct vnode *vp,
1605 	int32_t cmd,
1606 	struct flock64 *bfp,
1607 	int32_t flag,
1608 	offset_t offset,
1609 	struct flk_callback *flk_cbp,
1610 	cred_t *cr,
1611 	caller_context_t *ct)
1612 {
1613 	struct ud_inode *ip = VTOI(vp);
1614 
1615 	ud_printf("udf_frlock\n");
1616 
1617 	/*
1618 	 * If file is being mapped, disallow frlock.
1619 	 * XXX I am not holding tlock while checking i_mapcnt because the
1620 	 * current locking strategy drops all locks before calling fs_frlock.
1621 	 * So, mapcnt could change before we enter fs_frlock making is
1622 	 * meaningless to have held tlock in the first place.
1623 	 */
1624 	if ((ip->i_mapcnt > 0) &&
1625 	    (MANDLOCK(vp, ip->i_char))) {
1626 		return (EAGAIN);
1627 	}
1628 
1629 	return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
1630 }
1631 
1632 /*ARGSUSED6*/
1633 static int32_t
1634 udf_space(
1635 	struct vnode *vp,
1636 	int32_t cmd,
1637 	struct flock64 *bfp,
1638 	int32_t flag,
1639 	offset_t offset,
1640 	cred_t *cr,
1641 	caller_context_t *ct)
1642 {
1643 	int32_t error = 0;
1644 
1645 	ud_printf("udf_space\n");
1646 
1647 	if (cmd != F_FREESP) {
1648 		error =  EINVAL;
1649 	} else if ((error = convoff(vp, bfp, 0, offset)) == 0) {
1650 		error = ud_freesp(vp, bfp, flag, cr);
1651 
1652 		if (error == 0 && bfp->l_start == 0)
1653 			vnevent_truncate(vp, ct);
1654 	}
1655 
1656 	return (error);
1657 }
1658 
/*
 * Return the pages covering [off, off + len) of the file "vp" in the
 * NULL-terminated array "plarr" (room for plsz bytes worth of pages).
 * When plarr is NULL the call is an asynchronous fault-ahead request and
 * only read-ahead is started.
 *
 * *protp, if supplied, is set to the protections the caller may use:
 * PROT_ALL, minus PROT_WRITE when the file has holes and the access is
 * not a write or create.
 *
 * i_contents is taken here unless the calling thread already owns it
 * (dolock == 0).  It starts out as a reader lock and is upgraded to a
 * writer lock only while blocks are being allocated for a write fault
 * on a file with holes or beyond end of file.
 *
 * Returns 0 on success, ENOSYS if the vnode cannot be mapped, EFAULT for
 * a request beyond end of file that does not come from segkmap, or an
 * error from ud_bmap_write() / ud_getpage_miss().  On error no pages are
 * left locked and plarr[0] is NULL.
 */
/* ARGSUSED */
static int32_t
udf_getpage(
	struct vnode *vp,
	offset_t off,
	size_t len,
	uint32_t *protp,
	struct page **plarr,
	size_t plsz,
	struct seg *seg,
	caddr_t addr,
	enum seg_rw rw,
	struct cred *cr,
	caller_context_t *ct)
{
	struct ud_inode *ip = VTOI(vp);
	int32_t error, has_holes, beyond_eof, seqmode, dolock;
	int32_t pgsize = PAGESIZE;
	struct udf_vfs *udf_vfsp = ip->i_udf;
	page_t **pl;
	u_offset_t pgoff, eoff, uoff;
	krw_t rwtype;
	caddr_t pgaddr;

	ud_printf("udf_getpage\n");

	uoff = (u_offset_t)off; /* type conversion */
	if (protp) {
		*protp = PROT_ALL;
	}
	if (vp->v_flag & VNOMAP) {
		return (ENOSYS);
	}
	/*
	 * The access is treated as sequential when it starts where the
	 * previous getpage left off (i_nextr) and is not a create.
	 */
	seqmode = ip->i_nextr == uoff && rw != S_CREATE;

	rwtype = RW_READER;
	/* Only take i_contents if this thread does not already own it. */
	dolock = (rw_owner(&ip->i_contents) != curthread);
retrylock:
#ifdef	__lock_lint
	rw_enter(&ip->i_contents, rwtype);
#else
	if (dolock) {
		rw_enter(&ip->i_contents, rwtype);
	}
#endif

	/*
	 * We may be getting called as a side effect of a bmap using
	 * fbread() when the blocks might be being allocated and the
	 * size has not yet been up'ed.  In this case we want to be
	 * able to return zero pages if we get back UDF_HOLE from
	 * calling bmap for a non write case here.  We also might have
	 * to read some frags from the disk into a page if we are
	 * extending the number of frags for a given lbn in bmap().
	 */
	beyond_eof = uoff + len > ip->i_size + PAGEOFFSET;
	if (beyond_eof && seg != segkmap) {
#ifdef	__lock_lint
		rw_exit(&ip->i_contents);
#else
		if (dolock) {
			rw_exit(&ip->i_contents);
		}
#endif
		return (EFAULT);
	}

	/*
	 * Must hold i_contents lock throughout the call to pvn_getpages
	 * since locked pages are returned from each call to ud_getapage.
	 * Must *not* return locked pages and then try for contents lock
	 * due to lock ordering requirements (inode > page)
	 */

	has_holes = ud_bmap_has_holes(ip);

	if ((rw == S_WRITE || rw == S_CREATE) && (has_holes || beyond_eof)) {
		int32_t	blk_size, count;
		u_offset_t offset;

		/*
		 * We must acquire the RW_WRITER lock in order to
		 * call bmap_write().
		 */
		if (dolock && rwtype == RW_READER) {
			rwtype = RW_WRITER;

			/*
			 * If the in-place upgrade fails, drop the lock and
			 * start over; retrylock re-acquires it as writer
			 * and the EOF/hole state is re-evaluated.
			 */
			if (!rw_tryupgrade(&ip->i_contents)) {

				rw_exit(&ip->i_contents);

				goto retrylock;
			}
		}

		/*
		 * May be allocating disk blocks for holes here as
		 * a result of mmap faults. write(2) does the bmap_write
		 * in rdip/wrip, not here. We are not dealing with frags
		 * in this case.
		 */
		offset = uoff;
		while ((offset < uoff + len) &&
		    (offset < ip->i_size)) {
			/*
			 * NOTE(review): the comment that used to sit here
			 * described a "bnp" variable that does not exist in
			 * this function.  The loop allocates one logical
			 * block at a time, clipping the last request at
			 * the current end of file.
			 */

			blk_size = udf_vfsp->udf_lbsize;
			if ((offset + blk_size) > ip->i_size) {
				count = ip->i_size - offset;
			} else {
				count = blk_size;
			}
			error = ud_bmap_write(ip, offset, count, 0, cr);
			if (error) {
				goto update_inode;
			}
			offset += count; /* XXX - make this contig */
		}
	}

	/*
	 * Can be a reader from now on.
	 */
#ifdef	__lock_lint
	if (rwtype == RW_WRITER) {
		rw_downgrade(&ip->i_contents);
	}
#else
	if (dolock && rwtype == RW_WRITER) {
		rw_downgrade(&ip->i_contents);
	}
#endif

	/*
	 * We remove PROT_WRITE in cases when the file has UDF holes
	 * because we don't  want to call bmap_read() to check each
	 * page if it is backed with a disk block.
	 */
	if (protp && has_holes && rw != S_WRITE && rw != S_CREATE) {
		*protp &= ~PROT_WRITE;
	}

	error = 0;

	/*
	 * The loop looks up pages in the range <off, off + len).
	 * For each page, we first check if we should initiate an asynchronous
	 * read ahead before we call page_lookup (we may sleep in page_lookup
	 * for a previously initiated disk read).
	 */
	eoff = (uoff + len);
	for (pgoff = uoff, pgaddr = addr, pl = plarr;
	    pgoff < eoff; /* empty */) {
		page_t	*pp;
		u_offset_t	nextrio;
		se_t	se;

		/* A page being created must be held exclusively. */
		se = ((rw == S_CREATE) ? SE_EXCL : SE_SHARED);

		/*
		 * Handle async getpage (faultahead)
		 */
		if (plarr == NULL) {
			ip->i_nextrio = pgoff;
			ud_getpage_ra(vp, pgoff, seg, pgaddr);
			pgoff += pgsize;
			pgaddr += pgsize;
			continue;
		}

		/*
		 * Check if we should initiate read ahead of next cluster.
		 * We call page_exists only when we need to confirm that
		 * we have the current page before we initiate the read ahead.
		 */
		nextrio = ip->i_nextrio;
		if (seqmode &&
		    pgoff + RD_CLUSTSZ(ip) >= nextrio && pgoff <= nextrio &&
		    nextrio < ip->i_size && page_exists(vp, pgoff))
			ud_getpage_ra(vp, pgoff, seg, pgaddr);

		if ((pp = page_lookup(vp, pgoff, se)) != NULL) {

			/*
			 * We found the page in the page cache.
			 */
			*pl++ = pp;
			pgoff += pgsize;
			pgaddr += pgsize;
			len -= pgsize;
			plsz -= pgsize;
		} else  {

			/*
			 * We have to create the page, or read it from disk.
			 */
			if (error = ud_getpage_miss(vp, pgoff, len,
			    seg, pgaddr, pl, plsz, rw, seqmode)) {
				goto error_out;
			}

			/*
			 * Step over whatever pages ud_getpage_miss() put
			 * in the list.  If it returned none, pgoff is
			 * unchanged and the lookup is simply retried.
			 */
			while (*pl != NULL) {
				pl++;
				pgoff += pgsize;
				pgaddr += pgsize;
				len -= pgsize;
				plsz -= pgsize;
			}
		}
	}

	/*
	 * Return pages up to plsz if they are in the page cache.
	 * We cannot return pages if there is a chance that they are
	 * backed with a UDF hole and rw is S_WRITE or S_CREATE.
	 */
	if (plarr && !(has_holes && (rw == S_WRITE || rw == S_CREATE))) {

		ASSERT((protp == NULL) ||
		    !(has_holes && (*protp & PROT_WRITE)));

		eoff = pgoff + plsz;
		while (pgoff < eoff) {
			page_t		*pp;

			/* Never block for these extra pages. */
			if ((pp = page_lookup_nowait(vp, pgoff,
			    SE_SHARED)) == NULL)
				break;

			*pl++ = pp;
			pgoff += pgsize;
			plsz -= pgsize;
		}
	}

	if (plarr)
		*pl = NULL;			/* Terminate page list */
	/* Remember where this request ended for sequential detection. */
	ip->i_nextr = pgoff;

error_out:
	if (error && plarr) {
		/*
		 * Release any pages we have locked.
		 */
		while (pl > &plarr[0])
			page_unlock(*--pl);

		plarr[0] = NULL;
	}

update_inode:
#ifdef	__lock_lint
	rw_exit(&ip->i_contents);
#else
	if (dolock) {
		rw_exit(&ip->i_contents);
	}
#endif

	/*
	 * If the inode is not already marked for IACC (in rwip() for read)
	 * and the inode is not marked for no access time update (in rwip()
	 * for write) then update the inode access time and mod time now.
	 */
	mutex_enter(&ip->i_tlock);
	if ((ip->i_flag & (IACC | INOACC)) == 0) {
		if ((rw != S_OTHER) && (ip->i_type != VDIR)) {
			ip->i_flag |= IACC;
		}
		if (rw == S_WRITE) {
			ip->i_flag |= IUPD;
		}
		ITIMES_NOLOCK(ip);
	}
	mutex_exit(&ip->i_tlock);

	return (error);
}
1941 
1942 int32_t ud_delay = 1;
1943 
1944 /* ARGSUSED */
1945 static int32_t
1946 udf_putpage(
1947 	struct vnode *vp,
1948 	offset_t off,
1949 	size_t len,
1950 	int32_t flags,
1951 	struct cred *cr,
1952 	caller_context_t *ct)
1953 {
1954 	struct ud_inode *ip;
1955 	int32_t error = 0;
1956 
1957 	ud_printf("udf_putpage\n");
1958 
1959 	ip = VTOI(vp);
1960 #ifdef	__lock_lint
1961 	rw_enter(&ip->i_contents, RW_WRITER);
1962 #endif
1963 
1964 	if (vp->v_count == 0) {
1965 		cmn_err(CE_WARN, "ud_putpage : bad v_count");
1966 		error = EINVAL;
1967 		goto out;
1968 	}
1969 
1970 	if (vp->v_flag & VNOMAP) {
1971 		error = ENOSYS;
1972 		goto out;
1973 	}
1974 
1975 	if (flags & B_ASYNC) {
1976 		if (ud_delay && len &&
1977 		    (flags & ~(B_ASYNC|B_DONTNEED|B_FREE)) == 0) {
1978 			mutex_enter(&ip->i_tlock);
1979 
1980 			/*
1981 			 * If nobody stalled, start a new cluster.
1982 			 */
1983 			if (ip->i_delaylen == 0) {
1984 				ip->i_delayoff = off;
1985 				ip->i_delaylen = len;
1986 				mutex_exit(&ip->i_tlock);
1987 				goto out;
1988 			}
1989 
1990 			/*
1991 			 * If we have a full cluster or they are not contig,
1992 			 * then push last cluster and start over.
1993 			 */
1994 			if (ip->i_delaylen >= WR_CLUSTSZ(ip) ||
1995 			    ip->i_delayoff + ip->i_delaylen != off) {
1996 				u_offset_t doff;
1997 				size_t dlen;
1998 
1999 				doff = ip->i_delayoff;
2000 				dlen = ip->i_delaylen;
2001 				ip->i_delayoff = off;
2002 				ip->i_delaylen = len;
2003 				mutex_exit(&ip->i_tlock);
2004 				error = ud_putpages(vp, doff, dlen, flags, cr);
2005 				/* LMXXX - flags are new val, not old */
2006 				goto out;
2007 			}
2008 
2009 			/*
2010 			 * There is something there, it's not full, and
2011 			 * it is contig.
2012 			 */
2013 			ip->i_delaylen += len;
2014 			mutex_exit(&ip->i_tlock);
2015 			goto out;
2016 		}
2017 
2018 		/*
2019 		 * Must have weird flags or we are not clustering.
2020 		 */
2021 	}
2022 
2023 	error = ud_putpages(vp, off, len, flags, cr);
2024 
2025 out:
2026 #ifdef	__lock_lint
2027 	rw_exit(&ip->i_contents);
2028 #endif
2029 	return (error);
2030 }
2031 
2032 /* ARGSUSED */
2033 static int32_t
2034 udf_map(
2035 	struct vnode *vp,
2036 	offset_t off,
2037 	struct as *as,
2038 	caddr_t *addrp,
2039 	size_t len,
2040 	uint8_t prot,
2041 	uint8_t maxprot,
2042 	uint32_t flags,
2043 	struct cred *cr,
2044 	caller_context_t *ct)
2045 {
2046 	struct segvn_crargs vn_a;
2047 	int32_t error = 0;
2048 
2049 	ud_printf("udf_map\n");
2050 
2051 	if (vp->v_flag & VNOMAP) {
2052 		error = ENOSYS;
2053 		goto end;
2054 	}
2055 
2056 	if ((off < (offset_t)0) ||
2057 	    ((off + len) < (offset_t)0)) {
2058 		error = EINVAL;
2059 		goto end;
2060 	}
2061 
2062 	if (vp->v_type != VREG) {
2063 		error = ENODEV;
2064 		goto end;
2065 	}
2066 
2067 	/*
2068 	 * If file is being locked, disallow mapping.
2069 	 */
2070 	if (vn_has_mandatory_locks(vp, VTOI(vp)->i_char)) {
2071 		error = EAGAIN;
2072 		goto end;
2073 	}
2074 
2075 	as_rangelock(as);
2076 	error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
2077 	if (error != 0) {
2078 		as_rangeunlock(as);
2079 		goto end;
2080 	}
2081 
2082 	vn_a.vp = vp;
2083 	vn_a.offset = off;
2084 	vn_a.type = flags & MAP_TYPE;
2085 	vn_a.prot = prot;
2086 	vn_a.maxprot = maxprot;
2087 	vn_a.cred = cr;
2088 	vn_a.amp = NULL;
2089 	vn_a.flags = flags & ~MAP_TYPE;
2090 	vn_a.szc = 0;
2091 	vn_a.lgrp_mem_policy_flags = 0;
2092 
2093 	error = as_map(as, *addrp, len, segvn_create, (caddr_t)&vn_a);
2094 	as_rangeunlock(as);
2095 
2096 end:
2097 	return (error);
2098 }
2099 
2100 /* ARGSUSED */
2101 static int32_t
2102 udf_addmap(struct vnode *vp,
2103 	offset_t off,
2104 	struct as *as,
2105 	caddr_t addr,
2106 	size_t len,
2107 	uint8_t prot,
2108 	uint8_t maxprot,
2109 	uint32_t flags,
2110 	struct cred *cr,
2111 	caller_context_t *ct)
2112 {
2113 	struct ud_inode *ip = VTOI(vp);
2114 
2115 	ud_printf("udf_addmap\n");
2116 
2117 	if (vp->v_flag & VNOMAP) {
2118 		return (ENOSYS);
2119 	}
2120 
2121 	mutex_enter(&ip->i_tlock);
2122 	ip->i_mapcnt += btopr(len);
2123 	mutex_exit(&ip->i_tlock);
2124 
2125 	return (0);
2126 }
2127 
2128 /* ARGSUSED */
2129 static int32_t
2130 udf_delmap(
2131 	struct vnode *vp, offset_t off,
2132 	struct as *as,
2133 	caddr_t addr,
2134 	size_t len,
2135 	uint32_t prot,
2136 	uint32_t maxprot,
2137 	uint32_t flags,
2138 	struct cred *cr,
2139 	caller_context_t *ct)
2140 {
2141 	struct ud_inode *ip = VTOI(vp);
2142 
2143 	ud_printf("udf_delmap\n");
2144 
2145 	if (vp->v_flag & VNOMAP) {
2146 		return (ENOSYS);
2147 	}
2148 
2149 	mutex_enter(&ip->i_tlock);
2150 	ip->i_mapcnt -= btopr(len); 	/* Count released mappings */
2151 	ASSERT(ip->i_mapcnt >= 0);
2152 	mutex_exit(&ip->i_tlock);
2153 
2154 	return (0);
2155 }
2156 
2157 /* ARGSUSED */
2158 static int32_t
2159 udf_l_pathconf(
2160 	struct vnode *vp,
2161 	int32_t cmd,
2162 	ulong_t *valp,
2163 	struct cred *cr,
2164 	caller_context_t *ct)
2165 {
2166 	int32_t error = 0;
2167 
2168 	ud_printf("udf_l_pathconf\n");
2169 
2170 	if (cmd == _PC_FILESIZEBITS) {
2171 		/*
2172 		 * udf supports 64 bits as file size
2173 		 * but there are several other restrictions
2174 		 * it only supports 32-bit block numbers and
2175 		 * daddr32_t is only and int32_t so taking these
2176 		 * into account we can stay just as where ufs is
2177 		 */
2178 		*valp = 41;
2179 	} else if (cmd == _PC_TIMESTAMP_RESOLUTION) {
2180 		/* nanosecond timestamp resolution */
2181 		*valp = 1L;
2182 	} else {
2183 		error = fs_pathconf(vp, cmd, valp, cr, ct);
2184 	}
2185 
2186 	return (error);
2187 }
2188 
2189 uint32_t ud_pageio_reads = 0, ud_pageio_writes = 0;
2190 #ifndef	__lint
2191 _NOTE(SCHEME_PROTECTS_DATA("safe sharing", ud_pageio_reads))
2192 _NOTE(SCHEME_PROTECTS_DATA("safe sharing", ud_pageio_writes))
2193 #endif
2194 /*
2195  * Assumption is that there will not be a pageio request
2196  * to a enbedded file
2197  */
2198 /* ARGSUSED */
2199 static int32_t
2200 udf_pageio(
2201 	struct vnode *vp,
2202 	struct page *pp,
2203 	u_offset_t io_off,
2204 	size_t io_len,
2205 	int32_t flags,
2206 	struct cred *cr,
2207 	caller_context_t *ct)
2208 {
2209 	daddr_t bn;
2210 	struct buf *bp;
2211 	struct ud_inode *ip = VTOI(vp);
2212 	int32_t dolock, error = 0, contig, multi_io;
2213 	size_t done_len = 0, cur_len = 0;
2214 	page_t *npp = NULL, *opp = NULL, *cpp = pp;
2215 
2216 	if (pp == NULL) {
2217 		return (EINVAL);
2218 	}
2219 
2220 	dolock = (rw_owner(&ip->i_contents) != curthread);
2221 
2222 	/*
2223 	 * We need a better check.  Ideally, we would use another
2224 	 * vnodeops so that hlocked and forcibly unmounted file
2225 	 * systems would return EIO where appropriate and w/o the
2226 	 * need for these checks.
2227 	 */
2228 	if (ip->i_udf == NULL) {
2229 		return (EIO);
2230 	}
2231 
2232 #ifdef	__lock_lint
2233 	rw_enter(&ip->i_contents, RW_READER);
2234 #else
2235 	if (dolock) {
2236 		rw_enter(&ip->i_contents, RW_READER);
2237 	}
2238 #endif
2239 
2240 	/*
2241 	 * Break the io request into chunks, one for each contiguous
2242 	 * stretch of disk blocks in the target file.
2243 	 */
2244 	while (done_len < io_len) {
2245 		ASSERT(cpp);
2246 		bp = NULL;
2247 		contig = 0;
2248 		if (error = ud_bmap_read(ip, (u_offset_t)(io_off + done_len),
2249 		    &bn, &contig)) {
2250 			break;
2251 		}
2252 
2253 		if (bn == UDF_HOLE) {   /* No holey swapfiles */
2254 			cmn_err(CE_WARN, "SWAP file has HOLES");
2255 			error = EINVAL;
2256 			break;
2257 		}
2258 
2259 		cur_len = MIN(io_len - done_len, contig);
2260 
2261 		/*
2262 		 * Check if more than one I/O is
2263 		 * required to complete the given
2264 		 * I/O operation
2265 		 */
2266 		if (ip->i_udf->udf_lbsize < PAGESIZE) {
2267 			if (cur_len >= PAGESIZE) {
2268 				multi_io = 0;
2269 				cur_len &= PAGEMASK;
2270 			} else {
2271 				multi_io = 1;
2272 				cur_len = MIN(io_len - done_len, PAGESIZE);
2273 			}
2274 		}
2275 		page_list_break(&cpp, &npp, btop(cur_len));
2276 
2277 		bp = pageio_setup(cpp, cur_len, ip->i_devvp, flags);
2278 		ASSERT(bp != NULL);
2279 
2280 		bp->b_edev = ip->i_dev;
2281 		bp->b_dev = cmpdev(ip->i_dev);
2282 		bp->b_blkno = bn;
2283 		bp->b_un.b_addr = (caddr_t)0;
2284 		bp->b_file = vp;
2285 		bp->b_offset = (offset_t)(io_off + done_len);
2286 
2287 /*
2288  *		ub.ub_pageios.value.ul++;
2289  */
2290 		if (multi_io == 0) {
2291 			(void) bdev_strategy(bp);
2292 		} else {
2293 			error = ud_multi_strat(ip, cpp, bp,
2294 			    (u_offset_t)(io_off + done_len));
2295 			if (error != 0) {
2296 				pageio_done(bp);
2297 				break;
2298 			}
2299 		}
2300 		if (flags & B_READ) {
2301 			ud_pageio_reads++;
2302 		} else {
2303 			ud_pageio_writes++;
2304 		}
2305 
2306 		/*
2307 		 * If the request is not B_ASYNC, wait for i/o to complete
2308 		 * and re-assemble the page list to return to the caller.
2309 		 * If it is B_ASYNC we leave the page list in pieces and
2310 		 * cleanup() will dispose of them.
2311 		 */
2312 		if ((flags & B_ASYNC) == 0) {
2313 			error = biowait(bp);
2314 			pageio_done(bp);
2315 			if (error) {
2316 				break;
2317 			}
2318 			page_list_concat(&opp, &cpp);
2319 		}
2320 		cpp = npp;
2321 		npp = NULL;
2322 		done_len += cur_len;
2323 	}
2324 
2325 	ASSERT(error || (cpp == NULL && npp == NULL && done_len == io_len));
2326 	if (error) {
2327 		if (flags & B_ASYNC) {
2328 			/* Cleanup unprocessed parts of list */
2329 			page_list_concat(&cpp, &npp);
2330 			if (flags & B_READ) {
2331 				pvn_read_done(cpp, B_ERROR);
2332 			} else {
2333 				pvn_write_done(cpp, B_ERROR);
2334 			}
2335 		} else {
2336 			/* Re-assemble list and let caller clean up */
2337 			page_list_concat(&opp, &cpp);
2338 			page_list_concat(&opp, &npp);
2339 		}
2340 	}
2341 
2342 #ifdef	__lock_lint
2343 	rw_exit(&ip->i_contents);
2344 #else
2345 	if (dolock) {
2346 		rw_exit(&ip->i_contents);
2347 	}
2348 #endif
2349 	return (error);
2350 }
2351 
2352 
2353 
2354 
2355 /* -------------------- local functions --------------------------- */
2356 
2357 
2358 
2359 int32_t
2360 ud_rdwri(enum uio_rw rw, int32_t ioflag,
2361 	struct ud_inode *ip, caddr_t base, int32_t len,
2362 	offset_t offset, enum uio_seg seg, int32_t *aresid, struct cred *cr)
2363 {
2364 	int32_t error;
2365 	struct uio auio;
2366 	struct iovec aiov;
2367 
2368 	ud_printf("ud_rdwri\n");
2369 
2370 	bzero((caddr_t)&auio, sizeof (uio_t));
2371 	bzero((caddr_t)&aiov, sizeof (iovec_t));
2372 
2373 	aiov.iov_base = base;
2374 	aiov.iov_len = len;
2375 	auio.uio_iov = &aiov;
2376 	auio.uio_iovcnt = 1;
2377 	auio.uio_loffset = offset;
2378 	auio.uio_segflg = (int16_t)seg;
2379 	auio.uio_resid = len;
2380 
2381 	if (rw == UIO_WRITE) {
2382 		auio.uio_fmode = FWRITE;
2383 		auio.uio_extflg = UIO_COPY_DEFAULT;
2384 		auio.uio_llimit = curproc->p_fsz_ctl;
2385 		error = ud_wrip(ip, &auio, ioflag, cr);
2386 	} else {
2387 		auio.uio_fmode = FREAD;
2388 		auio.uio_extflg = UIO_COPY_CACHED;
2389 		auio.uio_llimit = MAXOFFSET_T;
2390 		error = ud_rdip(ip, &auio, ioflag, cr);
2391 	}
2392 
2393 	if (aresid) {
2394 		*aresid = auio.uio_resid;
2395 	} else if (auio.uio_resid) {
2396 		error = EIO;
2397 	}
2398 	return (error);
2399 }
2400 
/*
 * Free behind hacks.  The pager is busted.
 * XXX - need to pass the information down to writedone() in a flag like B_SEQ
 * or B_FREE_IF_TIGHT_ON_MEMORY.
 *
 * Both variables are consulted by ud_rdip() when it decides whether the
 * segmap slot it has just read from should be released with
 * SM_FREE | SM_DONTNEED | SM_ASYNC.
 */
/* Non-zero enables the free-behind check in ud_rdip(). */
int32_t ud_freebehind = 1;
/* Free-behind is only considered for mapping offsets beyond this value. */
int32_t ud_smallfile = 32 * 1024;
2408 
2409 /* ARGSUSED */
2410 int32_t
2411 ud_getpage_miss(struct vnode *vp, u_offset_t off,
2412 	size_t len, struct seg *seg, caddr_t addr, page_t *pl[],
2413 	size_t plsz, enum seg_rw rw, int32_t seq)
2414 {
2415 	struct ud_inode *ip = VTOI(vp);
2416 	int32_t err = 0;
2417 	size_t io_len;
2418 	u_offset_t io_off;
2419 	u_offset_t pgoff;
2420 	page_t *pp;
2421 
2422 	pl[0] = NULL;
2423 
2424 	/*
2425 	 * Figure out whether the page can be created, or must be
2426 	 * read from the disk
2427 	 */
2428 	if (rw == S_CREATE) {
2429 		if ((pp = page_create_va(vp, off,
2430 		    PAGESIZE, PG_WAIT, seg, addr)) == NULL) {
2431 			cmn_err(CE_WARN, "ud_getpage_miss: page_create");
2432 			return (EINVAL);
2433 		}
2434 		io_len = PAGESIZE;
2435 	} else {
2436 		pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
2437 		    &io_len, off, PAGESIZE, 0);
2438 
2439 		/*
2440 		 * Some other thread has entered the page.
2441 		 * ud_getpage will retry page_lookup.
2442 		 */
2443 		if (pp == NULL) {
2444 			return (0);
2445 		}
2446 
2447 		/*
2448 		 * Fill the page with as much data as we can from the file.
2449 		 */
2450 		err = ud_page_fill(ip, pp, off, B_READ, &pgoff);
2451 		if (err) {
2452 			pvn_read_done(pp, B_ERROR);
2453 			return (err);
2454 		}
2455 
2456 		/*
2457 		 * XXX ??? ufs has io_len instead of pgoff below
2458 		 */
2459 		ip->i_nextrio = off + ((pgoff + PAGESIZE - 1) & PAGEMASK);
2460 
2461 		/*
2462 		 * If the file access is sequential, initiate read ahead
2463 		 * of the next cluster.
2464 		 */
2465 		if (seq && ip->i_nextrio < ip->i_size) {
2466 			ud_getpage_ra(vp, off, seg, addr);
2467 		}
2468 	}
2469 
2470 outmiss:
2471 	pvn_plist_init(pp, pl, plsz, (offset_t)off, io_len, rw);
2472 	return (err);
2473 }
2474 
2475 /* ARGSUSED */
2476 void
2477 ud_getpage_ra(struct vnode *vp,
2478 	u_offset_t off, struct seg *seg, caddr_t addr)
2479 {
2480 	page_t *pp;
2481 	size_t io_len;
2482 	struct ud_inode *ip = VTOI(vp);
2483 	u_offset_t io_off = ip->i_nextrio, pgoff;
2484 	caddr_t addr2 = addr + (io_off - off);
2485 	daddr_t bn;
2486 	int32_t contig = 0;
2487 
2488 	/*
2489 	 * Is this test needed?
2490 	 */
2491 
2492 	if (addr2 >= seg->s_base + seg->s_size) {
2493 		return;
2494 	}
2495 
2496 	contig = 0;
2497 	if (ud_bmap_read(ip, io_off, &bn, &contig) != 0 || bn == UDF_HOLE) {
2498 		return;
2499 	}
2500 
2501 	pp = pvn_read_kluster(vp, io_off, seg, addr2,
2502 	    &io_off, &io_len, io_off, PAGESIZE, 1);
2503 
2504 	/*
2505 	 * Some other thread has entered the page.
2506 	 * So no read head done here (ie we will have to and wait
2507 	 * for the read when needed).
2508 	 */
2509 
2510 	if (pp == NULL) {
2511 		return;
2512 	}
2513 
2514 	(void) ud_page_fill(ip, pp, io_off, (B_READ|B_ASYNC), &pgoff);
2515 	ip->i_nextrio =  io_off + ((pgoff + PAGESIZE - 1) & PAGEMASK);
2516 }
2517 
2518 int
2519 ud_page_fill(struct ud_inode *ip, page_t *pp, u_offset_t off,
2520 	uint32_t bflgs, u_offset_t *pg_off)
2521 {
2522 	daddr_t bn;
2523 	struct buf *bp;
2524 	caddr_t kaddr, caddr;
2525 	int32_t error = 0, contig = 0, multi_io = 0;
2526 	int32_t lbsize = ip->i_udf->udf_lbsize;
2527 	int32_t lbmask = ip->i_udf->udf_lbmask;
2528 	uint64_t isize;
2529 
2530 	isize = (ip->i_size + lbmask) & (~lbmask);
2531 	if (ip->i_desc_type == ICB_FLAG_ONE_AD) {
2532 
2533 		/*
2534 		 * Embedded file read file_entry
2535 		 * from buffer cache and copy the required
2536 		 * portions
2537 		 */
2538 		bp = ud_bread(ip->i_dev,
2539 		    ip->i_icb_lbano << ip->i_udf->udf_l2d_shift, lbsize);
2540 		if ((bp->b_error == 0) &&
2541 		    (bp->b_resid == 0)) {
2542 
2543 			caddr = bp->b_un.b_addr + ip->i_data_off;
2544 
2545 			/*
2546 			 * mapin to kvm
2547 			 */
2548 			kaddr = (caddr_t)ppmapin(pp,
2549 			    PROT_READ | PROT_WRITE, (caddr_t)-1);
2550 			(void) kcopy(caddr, kaddr, ip->i_size);
2551 
2552 			/*
2553 			 * mapout of kvm
2554 			 */
2555 			ppmapout(kaddr);
2556 		}
2557 		brelse(bp);
2558 		contig = ip->i_size;
2559 	} else {
2560 
2561 		/*
2562 		 * Get the continuous size and block number
2563 		 * at offset "off"
2564 		 */
2565 		if (error = ud_bmap_read(ip, off, &bn, &contig))
2566 			goto out;
2567 		contig = MIN(contig, PAGESIZE);
2568 		contig = (contig + lbmask) & (~lbmask);
2569 
2570 		/*
2571 		 * Zero part of the page which we are not
2572 		 * going to read from the disk.
2573 		 */
2574 
2575 		if (bn == UDF_HOLE) {
2576 
2577 			/*
2578 			 * This is a HOLE. Just zero out
2579 			 * the page
2580 			 */
2581 			if (((off + contig) == isize) ||
2582 			    (contig == PAGESIZE)) {
2583 				pagezero(pp->p_prev, 0, PAGESIZE);
2584 				goto out;
2585 			}
2586 		}
2587 
2588 		if (contig < PAGESIZE) {
2589 			uint64_t count;
2590 
2591 			count = isize - off;
2592 			if (contig != count) {
2593 				multi_io = 1;
2594 				contig = (int32_t)(MIN(count, PAGESIZE));
2595 			} else {
2596 				pagezero(pp->p_prev, contig, PAGESIZE - contig);
2597 			}
2598 		}
2599 
2600 		/*
2601 		 * Get a bp and initialize it
2602 		 */
2603 		bp = pageio_setup(pp, contig, ip->i_devvp, bflgs);
2604 		ASSERT(bp != NULL);
2605 
2606 		bp->b_edev = ip->i_dev;
2607 		bp->b_dev = cmpdev(ip->i_dev);
2608 		bp->b_blkno = bn;
2609 		bp->b_un.b_addr = 0;
2610 		bp->b_file = ip->i_vnode;
2611 
2612 		/*
2613 		 * Start I/O
2614 		 */
2615 		if (multi_io == 0) {
2616 
2617 			/*
2618 			 * Single I/O is sufficient for this page
2619 			 */
2620 			(void) bdev_strategy(bp);
2621 		} else {
2622 
2623 			/*
2624 			 * We need to do the I/O in
2625 			 * piece's
2626 			 */
2627 			error = ud_multi_strat(ip, pp, bp, off);
2628 			if (error != 0) {
2629 				goto out;
2630 			}
2631 		}
2632 		if ((bflgs & B_ASYNC) == 0) {
2633 
2634 			/*
2635 			 * Wait for i/o to complete.
2636 			 */
2637 
2638 			error = biowait(bp);
2639 			pageio_done(bp);
2640 			if (error) {
2641 				goto out;
2642 			}
2643 		}
2644 	}
2645 	if ((off + contig) >= ip->i_size) {
2646 		contig = ip->i_size - off;
2647 	}
2648 
2649 out:
2650 	*pg_off = contig;
2651 	return (error);
2652 }
2653 
2654 int32_t
2655 ud_putpages(struct vnode *vp, offset_t off,
2656 	size_t len, int32_t flags, struct cred *cr)
2657 {
2658 	struct ud_inode *ip;
2659 	page_t *pp;
2660 	u_offset_t io_off;
2661 	size_t io_len;
2662 	u_offset_t eoff;
2663 	int32_t err = 0;
2664 	int32_t dolock;
2665 
2666 	ud_printf("ud_putpages\n");
2667 
2668 	if (vp->v_count == 0) {
2669 		cmn_err(CE_WARN, "ud_putpages: bad v_count");
2670 		return (EINVAL);
2671 	}
2672 
2673 	ip = VTOI(vp);
2674 
2675 	/*
2676 	 * Acquire the readers/write inode lock before locking
2677 	 * any pages in this inode.
2678 	 * The inode lock is held during i/o.
2679 	 */
2680 	if (len == 0) {
2681 		mutex_enter(&ip->i_tlock);
2682 		ip->i_delayoff = ip->i_delaylen = 0;
2683 		mutex_exit(&ip->i_tlock);
2684 	}
2685 #ifdef	__lock_lint
2686 	rw_enter(&ip->i_contents, RW_READER);
2687 #else
2688 	dolock = (rw_owner(&ip->i_contents) != curthread);
2689 	if (dolock) {
2690 		rw_enter(&ip->i_contents, RW_READER);
2691 	}
2692 #endif
2693 
2694 	if (!vn_has_cached_data(vp)) {
2695 #ifdef	__lock_lint
2696 		rw_exit(&ip->i_contents);
2697 #else
2698 		if (dolock) {
2699 			rw_exit(&ip->i_contents);
2700 		}
2701 #endif
2702 		return (0);
2703 	}
2704 
2705 	if (len == 0) {
2706 		/*
2707 		 * Search the entire vp list for pages >= off.
2708 		 */
2709 		err = pvn_vplist_dirty(vp, (u_offset_t)off, ud_putapage,
2710 		    flags, cr);
2711 	} else {
2712 		/*
2713 		 * Loop over all offsets in the range looking for
2714 		 * pages to deal with.
2715 		 */
2716 		if ((eoff = blkroundup(ip->i_udf, ip->i_size)) != 0) {
2717 			eoff = MIN(off + len, eoff);
2718 		} else {
2719 			eoff = off + len;
2720 		}
2721 
2722 		for (io_off = off; io_off < eoff; io_off += io_len) {
2723 			/*
2724 			 * If we are not invalidating, synchronously
2725 			 * freeing or writing pages, use the routine
2726 			 * page_lookup_nowait() to prevent reclaiming
2727 			 * them from the free list.
2728 			 */
2729 			if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
2730 				pp = page_lookup(vp, io_off,
2731 				    (flags & (B_INVAL | B_FREE)) ?
2732 				    SE_EXCL : SE_SHARED);
2733 			} else {
2734 				pp = page_lookup_nowait(vp, io_off,
2735 				    (flags & B_FREE) ? SE_EXCL : SE_SHARED);
2736 			}
2737 
2738 			if (pp == NULL || pvn_getdirty(pp, flags) == 0) {
2739 				io_len = PAGESIZE;
2740 			} else {
2741 
2742 				err = ud_putapage(vp, pp,
2743 				    &io_off, &io_len, flags, cr);
2744 				if (err != 0) {
2745 					break;
2746 				}
2747 				/*
2748 				 * "io_off" and "io_len" are returned as
2749 				 * the range of pages we actually wrote.
2750 				 * This allows us to skip ahead more quickly
2751 				 * since several pages may've been dealt
2752 				 * with by this iteration of the loop.
2753 				 */
2754 			}
2755 		}
2756 	}
2757 	if (err == 0 && off == 0 && (len == 0 || len >= ip->i_size)) {
2758 		/*
2759 		 * We have just sync'ed back all the pages on
2760 		 * the inode, turn off the IMODTIME flag.
2761 		 */
2762 		mutex_enter(&ip->i_tlock);
2763 		ip->i_flag &= ~IMODTIME;
2764 		mutex_exit(&ip->i_tlock);
2765 	}
2766 #ifdef	__lock_lint
2767 	rw_exit(&ip->i_contents);
2768 #else
2769 	if (dolock) {
2770 		rw_exit(&ip->i_contents);
2771 	}
2772 #endif
2773 	return (err);
2774 }
2775 
2776 /* ARGSUSED */
2777 int32_t
2778 ud_putapage(struct vnode *vp,
2779 	page_t *pp, u_offset_t *offp,
2780 	size_t *lenp, int32_t flags, struct cred *cr)
2781 {
2782 	daddr_t bn;
2783 	size_t io_len;
2784 	struct ud_inode *ip;
2785 	int32_t error = 0, contig, multi_io = 0;
2786 	struct udf_vfs *udf_vfsp;
2787 	u_offset_t off, io_off;
2788 	caddr_t kaddr, caddr;
2789 	struct buf *bp = NULL;
2790 	int32_t lbmask;
2791 	uint64_t isize;
2792 	uint16_t crc_len;
2793 	struct file_entry *fe;
2794 
2795 	ud_printf("ud_putapage\n");
2796 
2797 	ip = VTOI(vp);
2798 	ASSERT(ip);
2799 	ASSERT(RW_LOCK_HELD(&ip->i_contents));
2800 	lbmask = ip->i_udf->udf_lbmask;
2801 	isize = (ip->i_size + lbmask) & (~lbmask);
2802 
2803 	udf_vfsp = ip->i_udf;
2804 	ASSERT(udf_vfsp->udf_flags & UDF_FL_RW);
2805 
2806 	/*
2807 	 * If the modified time on the inode has not already been
2808 	 * set elsewhere (e.g. for write/setattr) we set the time now.
2809 	 * This gives us approximate modified times for mmap'ed files
2810 	 * which are modified via stores in the user address space.
2811 	 */
2812 	if (((ip->i_flag & IMODTIME) == 0) || (flags & B_FORCE)) {
2813 		mutex_enter(&ip->i_tlock);
2814 		ip->i_flag |= IUPD;
2815 		ITIMES_NOLOCK(ip);
2816 		mutex_exit(&ip->i_tlock);
2817 	}
2818 
2819 
2820 	/*
2821 	 * Align the request to a block boundry (for old file systems),
2822 	 * and go ask bmap() how contiguous things are for this file.
2823 	 */
2824 	off = pp->p_offset & ~(offset_t)lbmask;
2825 				/* block align it */
2826 
2827 
2828 	if (ip->i_desc_type == ICB_FLAG_ONE_AD) {
2829 		ASSERT(ip->i_size <= ip->i_max_emb);
2830 
2831 		pp = pvn_write_kluster(vp, pp, &io_off,
2832 		    &io_len, off, PAGESIZE, flags);
2833 		if (io_len == 0) {
2834 			io_len = PAGESIZE;
2835 		}
2836 
2837 		bp = ud_bread(ip->i_dev,
2838 		    ip->i_icb_lbano << udf_vfsp->udf_l2d_shift,
2839 		    udf_vfsp->udf_lbsize);
2840 		fe = (struct file_entry *)bp->b_un.b_addr;
2841 		if ((bp->b_flags & B_ERROR) ||
2842 		    (ud_verify_tag_and_desc(&fe->fe_tag, UD_FILE_ENTRY,
2843 		    ip->i_icb_block,
2844 		    1, udf_vfsp->udf_lbsize) != 0)) {
2845 			if (pp != NULL)
2846 				pvn_write_done(pp, B_ERROR | B_WRITE | flags);
2847 			if (bp->b_flags & B_ERROR) {
2848 				error = EIO;
2849 			} else {
2850 				error = EINVAL;
2851 			}
2852 			brelse(bp);
2853 			return (error);
2854 		}
2855 		if ((bp->b_error == 0) &&
2856 		    (bp->b_resid == 0)) {
2857 
2858 			caddr = bp->b_un.b_addr + ip->i_data_off;
2859 			kaddr = (caddr_t)ppmapin(pp,
2860 			    PROT_READ | PROT_WRITE, (caddr_t)-1);
2861 			(void) kcopy(kaddr, caddr, ip->i_size);
2862 			ppmapout(kaddr);
2863 		}
2864 		crc_len = offsetof(struct file_entry, fe_spec) +
2865 		    SWAP_32(fe->fe_len_ear);
2866 		crc_len += ip->i_size;
2867 		ud_make_tag(ip->i_udf, &fe->fe_tag,
2868 		    UD_FILE_ENTRY, ip->i_icb_block, crc_len);
2869 
2870 		bwrite(bp);
2871 
2872 		if (flags & B_ASYNC) {
2873 			pvn_write_done(pp, flags);
2874 		}
2875 		contig = ip->i_size;
2876 	} else {
2877 
2878 		if (error = ud_bmap_read(ip, off, &bn, &contig)) {
2879 			goto out;
2880 		}
2881 		contig = MIN(contig, PAGESIZE);
2882 		contig = (contig + lbmask) & (~lbmask);
2883 
2884 		if (contig < PAGESIZE) {
2885 			uint64_t count;
2886 
2887 			count = isize - off;
2888 			if (contig != count) {
2889 				multi_io = 1;
2890 				contig = (int32_t)(MIN(count, PAGESIZE));
2891 			}
2892 		}
2893 
2894 		if ((off + contig) > isize) {
2895 			contig = isize - off;
2896 		}
2897 
2898 		if (contig > PAGESIZE) {
2899 			if (contig & PAGEOFFSET) {
2900 				contig &= PAGEMASK;
2901 			}
2902 		}
2903 
2904 		pp = pvn_write_kluster(vp, pp, &io_off,
2905 		    &io_len, off, contig, flags);
2906 		if (io_len == 0) {
2907 			io_len = PAGESIZE;
2908 		}
2909 
2910 		bp = pageio_setup(pp, contig, ip->i_devvp, B_WRITE | flags);
2911 		ASSERT(bp != NULL);
2912 
2913 		bp->b_edev = ip->i_dev;
2914 		bp->b_dev = cmpdev(ip->i_dev);
2915 		bp->b_blkno = bn;
2916 		bp->b_un.b_addr = 0;
2917 		bp->b_file = vp;
2918 		bp->b_offset = (offset_t)off;
2919 
2920 
2921 		/*
2922 		 * write throttle
2923 		 */
2924 		ASSERT(bp->b_iodone == NULL);
2925 		bp->b_iodone = ud_iodone;
2926 		mutex_enter(&ip->i_tlock);
2927 		ip->i_writes += bp->b_bcount;
2928 		mutex_exit(&ip->i_tlock);
2929 
2930 		if (multi_io == 0) {
2931 
2932 			(void) bdev_strategy(bp);
2933 		} else {
2934 			error = ud_multi_strat(ip, pp, bp, off);
2935 			if (error != 0) {
2936 				goto out;
2937 			}
2938 		}
2939 
2940 		if ((flags & B_ASYNC) == 0) {
2941 			/*
2942 			 * Wait for i/o to complete.
2943 			 */
2944 			error = biowait(bp);
2945 			pageio_done(bp);
2946 		}
2947 	}
2948 
2949 	if ((flags & B_ASYNC) == 0) {
2950 		pvn_write_done(pp, ((error) ? B_ERROR : 0) | B_WRITE | flags);
2951 	}
2952 
2953 	pp = NULL;
2954 
2955 out:
2956 	if (error != 0 && pp != NULL) {
2957 		pvn_write_done(pp, B_ERROR | B_WRITE | flags);
2958 	}
2959 
2960 	if (offp) {
2961 		*offp = io_off;
2962 	}
2963 	if (lenp) {
2964 		*lenp = io_len;
2965 	}
2966 
2967 	return (error);
2968 }
2969 
2970 
2971 int32_t
2972 ud_iodone(struct buf *bp)
2973 {
2974 	struct ud_inode *ip;
2975 
2976 	ASSERT((bp->b_pages->p_vnode != NULL) && !(bp->b_flags & B_READ));
2977 
2978 	bp->b_iodone = NULL;
2979 
2980 	ip = VTOI(bp->b_pages->p_vnode);
2981 
2982 	mutex_enter(&ip->i_tlock);
2983 	if (ip->i_writes >= ud_LW) {
2984 		if ((ip->i_writes -= bp->b_bcount) <= ud_LW) {
2985 			if (ud_WRITES) {
2986 				cv_broadcast(&ip->i_wrcv); /* wake all up */
2987 			}
2988 		}
2989 	} else {
2990 		ip->i_writes -= bp->b_bcount;
2991 	}
2992 	mutex_exit(&ip->i_tlock);
2993 	iodone(bp);
2994 	return (0);
2995 }
2996 
2997 /* ARGSUSED3 */
2998 int32_t
2999 ud_rdip(struct ud_inode *ip, struct uio *uio, int32_t ioflag, cred_t *cr)
3000 {
3001 	struct vnode *vp;
3002 	struct udf_vfs *udf_vfsp;
3003 	krw_t rwtype;
3004 	caddr_t base;
3005 	uint32_t flags;
3006 	int32_t error, n, on, mapon, dofree;
3007 	u_offset_t off;
3008 	long oresid = uio->uio_resid;
3009 
3010 	ASSERT(RW_LOCK_HELD(&ip->i_contents));
3011 	if ((ip->i_type != VREG) &&
3012 	    (ip->i_type != VDIR) &&
3013 	    (ip->i_type != VLNK)) {
3014 		return (EIO);
3015 	}
3016 
3017 	if (uio->uio_loffset > MAXOFFSET_T) {
3018 		return (0);
3019 	}
3020 
3021 	if ((uio->uio_loffset < (offset_t)0) ||
3022 	    ((uio->uio_loffset + uio->uio_resid) < 0)) {
3023 		return (EINVAL);
3024 	}
3025 	if (uio->uio_resid == 0) {
3026 		return (0);
3027 	}
3028 
3029 	vp = ITOV(ip);
3030 	udf_vfsp = ip->i_udf;
3031 	mutex_enter(&ip->i_tlock);
3032 	ip->i_flag |= IACC;
3033 	mutex_exit(&ip->i_tlock);
3034 
3035 	rwtype = (rw_write_held(&ip->i_contents)?RW_WRITER:RW_READER);
3036 
3037 	do {
3038 		offset_t diff;
3039 		u_offset_t uoff = uio->uio_loffset;
3040 		off = uoff & (offset_t)MAXBMASK;
3041 		mapon = (int)(uoff & (offset_t)MAXBOFFSET);
3042 		on = (int)blkoff(udf_vfsp, uoff);
3043 		n = (int)MIN(udf_vfsp->udf_lbsize - on, uio->uio_resid);
3044 
3045 		diff = ip->i_size - uoff;
3046 
3047 		if (diff <= (offset_t)0) {
3048 			error = 0;
3049 			goto out;
3050 		}
3051 		if (diff < (offset_t)n) {
3052 			n = (int)diff;
3053 		}
3054 		dofree = ud_freebehind &&
3055 		    ip->i_nextr == (off & PAGEMASK) &&
3056 		    off > ud_smallfile;
3057 
3058 #ifndef	__lock_lint
3059 		if (rwtype == RW_READER) {
3060 			rw_exit(&ip->i_contents);
3061 		}
3062 #endif
3063 
3064 		base = segmap_getmapflt(segkmap, vp, (off + mapon),
3065 		    (uint32_t)n, 1, S_READ);
3066 		error = uiomove(base + mapon, (long)n, UIO_READ, uio);
3067 
3068 		flags = 0;
3069 		if (!error) {
3070 			/*
3071 			 * If read a whole block, or read to eof,
3072 			 * won't need this buffer again soon.
3073 			 */
3074 			if (n + on == MAXBSIZE && ud_freebehind && dofree &&
3075 			    freemem < lotsfree + pages_before_pager) {
3076 				flags = SM_FREE | SM_DONTNEED |SM_ASYNC;
3077 			}
3078 			/*
3079 			 * In POSIX SYNC (FSYNC and FDSYNC) read mode,
3080 			 * we want to make sure that the page which has
3081 			 * been read, is written on disk if it is dirty.
3082 			 * And corresponding indirect blocks should also
3083 			 * be flushed out.
3084 			 */
3085 			if ((ioflag & FRSYNC) && (ioflag & (FSYNC|FDSYNC))) {
3086 				flags &= ~SM_ASYNC;
3087 				flags |= SM_WRITE;
3088 			}
3089 			error = segmap_release(segkmap, base, flags);
3090 		} else    {
3091 			(void) segmap_release(segkmap, base, flags);
3092 		}
3093 
3094 #ifndef __lock_lint
3095 		if (rwtype == RW_READER) {
3096 			rw_enter(&ip->i_contents, rwtype);
3097 		}
3098 #endif
3099 	} while (error == 0 && uio->uio_resid > 0 && n != 0);
3100 out:
3101 	/*
3102 	 * Inode is updated according to this table if FRSYNC is set.
3103 	 *
3104 	 *	FSYNC	FDSYNC(posix.4)
3105 	 *	--------------------------
3106 	 *	always	IATTCHG|IBDWRITE
3107 	 */
3108 	if (ioflag & FRSYNC) {
3109 		if ((ioflag & FSYNC) ||
3110 		    ((ioflag & FDSYNC) &&
3111 		    (ip->i_flag & (IATTCHG|IBDWRITE)))) {
3112 		rw_exit(&ip->i_contents);
3113 		rw_enter(&ip->i_contents, RW_WRITER);
3114 		ud_iupdat(ip, 1);
3115 		}
3116 	}
3117 	/*
3118 	 * If we've already done a partial read, terminate
3119 	 * the read but return no error.
3120 	 */
3121 	if (oresid != uio->uio_resid) {
3122 		error = 0;
3123 	}
3124 	ITIMES(ip);
3125 
3126 	return (error);
3127 }
3128 
3129 int32_t
3130 ud_wrip(struct ud_inode *ip, struct uio *uio, int ioflag, struct cred *cr)
3131 {
3132 	caddr_t base;
3133 	struct vnode *vp;
3134 	struct udf_vfs *udf_vfsp;
3135 	uint32_t flags;
3136 	int32_t error = 0, iupdat_flag, n, on, mapon, i_size_changed = 0;
3137 	int32_t pagecreate, newpage;
3138 	uint64_t old_i_size;
3139 	u_offset_t off;
3140 	long start_resid = uio->uio_resid, premove_resid;
3141 	rlim64_t limit = uio->uio_limit;
3142 
3143 
3144 	ASSERT(RW_WRITE_HELD(&ip->i_contents));
3145 	if ((ip->i_type != VREG) &&
3146 	    (ip->i_type != VDIR) &&
3147 	    (ip->i_type != VLNK)) {
3148 		return (EIO);
3149 	}
3150 
3151 	if (uio->uio_loffset >= MAXOFFSET_T) {
3152 		return (EFBIG);
3153 	}
3154 	/*
3155 	 * see udf_l_pathconf
3156 	 */
3157 	if (limit > (((uint64_t)1 << 40) - 1)) {
3158 		limit = ((uint64_t)1 << 40) - 1;
3159 	}
3160 	if (uio->uio_loffset >= limit) {
3161 		proc_t *p = ttoproc(curthread);
3162 
3163 		mutex_enter(&p->p_lock);
3164 		(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE], p->p_rctls,
3165 		    p, RCA_UNSAFE_SIGINFO);
3166 		mutex_exit(&p->p_lock);
3167 		return (EFBIG);
3168 	}
3169 	if ((uio->uio_loffset < (offset_t)0) ||
3170 	    ((uio->uio_loffset + uio->uio_resid) < 0)) {
3171 		return (EINVAL);
3172 	}
3173 	if (uio->uio_resid == 0) {
3174 		return (0);
3175 	}
3176 
3177 	mutex_enter(&ip->i_tlock);
3178 	ip->i_flag |= INOACC;
3179 
3180 	if (ioflag & (FSYNC | FDSYNC)) {
3181 		ip->i_flag |= ISYNC;
3182 		iupdat_flag = 1;
3183 	}
3184 	mutex_exit(&ip->i_tlock);
3185 
3186 	udf_vfsp = ip->i_udf;
3187 	vp = ITOV(ip);
3188 
3189 	do {
3190 		u_offset_t uoff = uio->uio_loffset;
3191 		off = uoff & (offset_t)MAXBMASK;
3192 		mapon = (int)(uoff & (offset_t)MAXBOFFSET);
3193 		on = (int)blkoff(udf_vfsp, uoff);
3194 		n = (int)MIN(udf_vfsp->udf_lbsize - on, uio->uio_resid);
3195 
3196 		if (ip->i_type == VREG && uoff + n >= limit) {
3197 			if (uoff >= limit) {
3198 				error = EFBIG;
3199 				goto out;
3200 			}
3201 			n = (int)(limit - (rlim64_t)uoff);
3202 		}
3203 		if (uoff + n > ip->i_size) {
3204 			/*
3205 			 * We are extending the length of the file.
3206 			 * bmap is used so that we are sure that
3207 			 * if we need to allocate new blocks, that it
3208 			 * is done here before we up the file size.
3209 			 */
3210 			error = ud_bmap_write(ip, uoff,
3211 			    (int)(on + n), mapon == 0, cr);
3212 			if (error) {
3213 				break;
3214 			}
3215 			i_size_changed = 1;
3216 			old_i_size = ip->i_size;
3217 			ip->i_size = uoff + n;
3218 			/*
3219 			 * If we are writing from the beginning of
3220 			 * the mapping, we can just create the
3221 			 * pages without having to read them.
3222 			 */
3223 			pagecreate = (mapon == 0);
3224 		} else if (n == MAXBSIZE) {
3225 			/*
3226 			 * Going to do a whole mappings worth,
3227 			 * so we can just create the pages w/o
3228 			 * having to read them in.  But before
3229 			 * we do that, we need to make sure any
3230 			 * needed blocks are allocated first.
3231 			 */
3232 			error = ud_bmap_write(ip, uoff,
3233 			    (int)(on + n), 1, cr);
3234 			if (error) {
3235 				break;
3236 			}
3237 			pagecreate = 1;
3238 		} else {
3239 			pagecreate = 0;
3240 		}
3241 
3242 		rw_exit(&ip->i_contents);
3243 
3244 		/*
3245 		 * Touch the page and fault it in if it is not in
3246 		 * core before segmap_getmapflt can lock it. This
3247 		 * is to avoid the deadlock if the buffer is mapped
3248 		 * to the same file through mmap which we want to
3249 		 * write to.
3250 		 */
3251 		uio_prefaultpages((long)n, uio);
3252 
3253 		base = segmap_getmapflt(segkmap, vp, (off + mapon),
3254 		    (uint32_t)n, !pagecreate, S_WRITE);
3255 
3256 		/*
3257 		 * segmap_pagecreate() returns 1 if it calls
3258 		 * page_create_va() to allocate any pages.
3259 		 */
3260 		newpage = 0;
3261 		if (pagecreate) {
3262 			newpage = segmap_pagecreate(segkmap, base,
3263 			    (size_t)n, 0);
3264 		}
3265 
3266 		premove_resid = uio->uio_resid;
3267 		error = uiomove(base + mapon, (long)n, UIO_WRITE, uio);
3268 
3269 		if (pagecreate &&
3270 		    uio->uio_loffset < roundup(off + mapon + n, PAGESIZE)) {
3271 			/*
3272 			 * We created pages w/o initializing them completely,
3273 			 * thus we need to zero the part that wasn't set up.
3274 			 * This happens on most EOF write cases and if
3275 			 * we had some sort of error during the uiomove.
3276 			 */
3277 			int nzero, nmoved;
3278 
3279 			nmoved = (int)(uio->uio_loffset - (off + mapon));
3280 			ASSERT(nmoved >= 0 && nmoved <= n);
3281 			nzero = roundup(on + n, PAGESIZE) - nmoved;
3282 			ASSERT(nzero > 0 && mapon + nmoved + nzero <= MAXBSIZE);
3283 			(void) kzero(base + mapon + nmoved, (uint32_t)nzero);
3284 		}
3285 
3286 		/*
3287 		 * Unlock the pages allocated by page_create_va()
3288 		 * in segmap_pagecreate()
3289 		 */
3290 		if (newpage) {
3291 			segmap_pageunlock(segkmap, base, (size_t)n, S_WRITE);
3292 		}
3293 
3294 		if (error) {
3295 			/*
3296 			 * If we failed on a write, we may have already
3297 			 * allocated file blocks as well as pages.  It's
3298 			 * hard to undo the block allocation, but we must
3299 			 * be sure to invalidate any pages that may have
3300 			 * been allocated.
3301 			 */
3302 			(void) segmap_release(segkmap, base, SM_INVAL);
3303 		} else {
3304 			flags = 0;
3305 			/*
3306 			 * Force write back for synchronous write cases.
3307 			 */
3308 			if ((ioflag & (FSYNC|FDSYNC)) || ip->i_type == VDIR) {
3309 				/*
3310 				 * If the sticky bit is set but the
3311 				 * execute bit is not set, we do a
3312 				 * synchronous write back and free
3313 				 * the page when done.  We set up swap
3314 				 * files to be handled this way to
3315 				 * prevent servers from keeping around
3316 				 * the client's swap pages too long.
3317 				 * XXX - there ought to be a better way.
3318 				 */
3319 				if (IS_SWAPVP(vp)) {
3320 					flags = SM_WRITE | SM_FREE |
3321 					    SM_DONTNEED;
3322 					iupdat_flag = 0;
3323 				} else {
3324 					flags = SM_WRITE;
3325 				}
3326 			} else if (((mapon + n) == MAXBSIZE) ||
3327 			    IS_SWAPVP(vp)) {
3328 				/*
3329 				 * Have written a whole block.
3330 				 * Start an asynchronous write and
3331 				 * mark the buffer to indicate that
3332 				 * it won't be needed again soon.
3333 				 */
3334 				flags = SM_WRITE |SM_ASYNC | SM_DONTNEED;
3335 			}
3336 			error = segmap_release(segkmap, base, flags);
3337 
3338 			/*
3339 			 * If the operation failed and is synchronous,
3340 			 * then we need to unwind what uiomove() last
3341 			 * did so we can potentially return an error to
3342 			 * the caller.  If this write operation was
3343 			 * done in two pieces and the first succeeded,
3344 			 * then we won't return an error for the second
3345 			 * piece that failed.  However, we only want to
3346 			 * return a resid value that reflects what was
3347 			 * really done.
3348 			 *
3349 			 * Failures for non-synchronous operations can
3350 			 * be ignored since the page subsystem will
3351 			 * retry the operation until it succeeds or the
3352 			 * file system is unmounted.
3353 			 */
3354 			if (error) {
3355 				if ((ioflag & (FSYNC | FDSYNC)) ||
3356 				    ip->i_type == VDIR) {
3357 					uio->uio_resid = premove_resid;
3358 				} else {
3359 					error = 0;
3360 				}
3361 			}
3362 		}
3363 
3364 		/*
3365 		 * Re-acquire contents lock.
3366 		 */
3367 		rw_enter(&ip->i_contents, RW_WRITER);
3368 		/*
3369 		 * If the uiomove() failed or if a synchronous
3370 		 * page push failed, fix up i_size.
3371 		 */
3372 		if (error) {
3373 			if (i_size_changed) {
3374 				/*
3375 				 * The uiomove failed, and we
3376 				 * allocated blocks,so get rid
3377 				 * of them.
3378 				 */
3379 				(void) ud_itrunc(ip, old_i_size, 0, cr);
3380 			}
3381 		} else {
3382 			/*
3383 			 * XXX - Can this be out of the loop?
3384 			 */
3385 			ip->i_flag |= IUPD | ICHG;
3386 			if (i_size_changed) {
3387 				ip->i_flag |= IATTCHG;
3388 			}
3389 			if ((ip->i_perm & (IEXEC | (IEXEC >> 5) |
3390 			    (IEXEC >> 10))) != 0 &&
3391 			    (ip->i_char & (ISUID | ISGID)) != 0 &&
3392 			    secpolicy_vnode_setid_retain(cr,
3393 			    (ip->i_char & ISUID) != 0 && ip->i_uid == 0) != 0) {
3394 				/*
3395 				 * Clear Set-UID & Set-GID bits on
3396 				 * successful write if not privileged
3397 				 * and at least one of the execute bits
3398 				 * is set.  If we always clear Set-GID,
3399 				 * mandatory file and record locking is
3400 				 * unuseable.
3401 				 */
3402 				ip->i_char &= ~(ISUID | ISGID);
3403 			}
3404 		}
3405 	} while (error == 0 && uio->uio_resid > 0 && n != 0);
3406 
3407 out:
3408 	/*
3409 	 * Inode is updated according to this table -
3410 	 *
3411 	 *	FSYNC	FDSYNC(posix.4)
3412 	 *	--------------------------
3413 	 *	always@	IATTCHG|IBDWRITE
3414 	 *
3415 	 * @ -  If we are doing synchronous write the only time we should
3416 	 *	not be sync'ing the ip here is if we have the stickyhack
3417 	 *	activated, the file is marked with the sticky bit and
3418 	 *	no exec bit, the file length has not been changed and
3419 	 *	no new blocks have been allocated during this write.
3420 	 */
3421 	if ((ip->i_flag & ISYNC) != 0) {
3422 		/*
3423 		 * we have eliminated nosync
3424 		 */
3425 		if ((ip->i_flag & (IATTCHG|IBDWRITE)) ||
3426 		    ((ioflag & FSYNC) && iupdat_flag)) {
3427 			ud_iupdat(ip, 1);
3428 		}
3429 	}
3430 
3431 	/*
3432 	 * If we've already done a partial-write, terminate
3433 	 * the write but return no error.
3434 	 */
3435 	if (start_resid != uio->uio_resid) {
3436 		error = 0;
3437 	}
3438 	ip->i_flag &= ~(INOACC | ISYNC);
3439 	ITIMES_NOLOCK(ip);
3440 
3441 	return (error);
3442 }
3443 
3444 int32_t
3445 ud_multi_strat(struct ud_inode *ip,
3446 	page_t *pp, struct buf *bp, u_offset_t start)
3447 {
3448 	daddr_t bn;
3449 	int32_t error = 0, io_count, contig, alloc_sz, i;
3450 	uint32_t io_off;
3451 	mio_master_t *mm = NULL;
3452 	mio_slave_t *ms = NULL;
3453 	struct buf *rbp;
3454 
3455 	ASSERT(!(start & PAGEOFFSET));
3456 
3457 	/*
3458 	 * Figure out how many buffers to allocate
3459 	 */
3460 	io_count = 0;
3461 	for (io_off = 0; io_off < bp->b_bcount; io_off += contig) {
3462 		contig = 0;
3463 		if (error = ud_bmap_read(ip, (u_offset_t)(start + io_off),
3464 		    &bn, &contig)) {
3465 			goto end;
3466 		}
3467 		if (contig == 0) {
3468 			goto end;
3469 		}
3470 		contig = MIN(contig, PAGESIZE - io_off);
3471 		if (bn != UDF_HOLE) {
3472 			io_count ++;
3473 		} else {
3474 			/*
3475 			 * HOLE
3476 			 */
3477 			if (bp->b_flags & B_READ) {
3478 
3479 				/*
3480 				 * This is a hole and is read
3481 				 * it should be filled with 0's
3482 				 */
3483 				pagezero(pp, io_off, contig);
3484 			}
3485 		}
3486 	}
3487 
3488 
3489 	if (io_count != 0) {
3490 
3491 		/*
3492 		 * Allocate memory for all the
3493 		 * required number of buffers
3494 		 */
3495 		alloc_sz = sizeof (mio_master_t) +
3496 		    (sizeof (mio_slave_t) * io_count);
3497 		mm = (mio_master_t *)kmem_zalloc(alloc_sz, KM_SLEEP);
3498 		if (mm == NULL) {
3499 			error = ENOMEM;
3500 			goto end;
3501 		}
3502 
3503 		/*
3504 		 * initialize master
3505 		 */
3506 		mutex_init(&mm->mm_mutex, NULL, MUTEX_DEFAULT, NULL);
3507 		mm->mm_size = alloc_sz;
3508 		mm->mm_bp = bp;
3509 		mm->mm_resid = 0;
3510 		mm->mm_error = 0;
3511 		mm->mm_index = master_index++;
3512 
3513 		ms = (mio_slave_t *)(((caddr_t)mm) + sizeof (mio_master_t));
3514 
3515 		/*
3516 		 * Initialize buffers
3517 		 */
3518 		io_count = 0;
3519 		for (io_off = 0; io_off < bp->b_bcount; io_off += contig) {
3520 			contig = 0;
3521 			if (error = ud_bmap_read(ip,
3522 			    (u_offset_t)(start + io_off),
3523 			    &bn, &contig)) {
3524 				goto end;
3525 			}
3526 			ASSERT(contig);
3527 			if ((io_off + contig) > bp->b_bcount) {
3528 				contig = bp->b_bcount - io_off;
3529 			}
3530 			if (bn != UDF_HOLE) {
3531 				/*
3532 				 * Clone the buffer
3533 				 * and prepare to start I/O
3534 				 */
3535 				ms->ms_ptr = mm;
3536 				bioinit(&ms->ms_buf);
3537 				rbp = bioclone(bp, io_off, (size_t)contig,
3538 				    bp->b_edev, bn, ud_slave_done,
3539 				    &ms->ms_buf, KM_NOSLEEP);
3540 				ASSERT(rbp == &ms->ms_buf);
3541 				mm->mm_resid += contig;
3542 				io_count++;
3543 				ms ++;
3544 			}
3545 		}
3546 
3547 		/*
3548 		 * Start I/O's
3549 		 */
3550 		ms = (mio_slave_t *)(((caddr_t)mm) + sizeof (mio_master_t));
3551 		for (i = 0; i < io_count; i++) {
3552 			(void) bdev_strategy(&ms->ms_buf);
3553 			ms ++;
3554 		}
3555 	}
3556 
3557 end:
3558 	if (error != 0) {
3559 		bp->b_flags |= B_ERROR;
3560 		bp->b_error = error;
3561 		if (mm != NULL) {
3562 			mutex_destroy(&mm->mm_mutex);
3563 			kmem_free(mm, mm->mm_size);
3564 		}
3565 	}
3566 	return (error);
3567 }
3568 
3569 int32_t
3570 ud_slave_done(struct buf *bp)
3571 {
3572 	mio_master_t *mm;
3573 	int32_t resid;
3574 
3575 	ASSERT(SEMA_HELD(&bp->b_sem));
3576 	ASSERT((bp->b_flags & B_DONE) == 0);
3577 
3578 	mm = ((mio_slave_t *)bp)->ms_ptr;
3579 
3580 	/*
3581 	 * Propagate error and byte count info from slave struct to
3582 	 * the master struct
3583 	 */
3584 	mutex_enter(&mm->mm_mutex);
3585 	if (bp->b_flags & B_ERROR) {
3586 
3587 		/*
3588 		 * If multiple slave buffers get
3589 		 * error we forget the old errors
3590 		 * this is ok because we any way
3591 		 * cannot return multiple errors
3592 		 */
3593 		mm->mm_error = bp->b_error;
3594 	}
3595 	mm->mm_resid -= bp->b_bcount;
3596 	resid = mm->mm_resid;
3597 	mutex_exit(&mm->mm_mutex);
3598 
3599 	/*
3600 	 * free up the resources allocated to cloned buffers.
3601 	 */
3602 	bp_mapout(bp);
3603 	biofini(bp);
3604 
3605 	if (resid == 0) {
3606 
3607 		/*
3608 		 * This is the last I/O operation
3609 		 * clean up and return the original buffer
3610 		 */
3611 		if (mm->mm_error) {
3612 			mm->mm_bp->b_flags |= B_ERROR;
3613 			mm->mm_bp->b_error = mm->mm_error;
3614 		}
3615 		biodone(mm->mm_bp);
3616 		mutex_destroy(&mm->mm_mutex);
3617 		kmem_free(mm, mm->mm_size);
3618 	}
3619 	return (0);
3620 }
3621