xref: /titanic_41/usr/src/uts/common/fs/udfs/udf_vnops.c (revision d4660949aa62dd6a963f4913b7120b383cf473c4)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/types.h>
29 #include <sys/t_lock.h>
30 #include <sys/param.h>
31 #include <sys/time.h>
32 #include <sys/systm.h>
33 #include <sys/sysmacros.h>
34 #include <sys/resource.h>
35 #include <sys/signal.h>
36 #include <sys/cred.h>
37 #include <sys/user.h>
38 #include <sys/buf.h>
39 #include <sys/vfs.h>
40 #include <sys/vfs_opreg.h>
41 #include <sys/stat.h>
42 #include <sys/vnode.h>
43 #include <sys/mode.h>
44 #include <sys/proc.h>
45 #include <sys/disp.h>
46 #include <sys/file.h>
47 #include <sys/fcntl.h>
48 #include <sys/flock.h>
49 #include <sys/kmem.h>
50 #include <sys/uio.h>
51 #include <sys/dnlc.h>
52 #include <sys/conf.h>
53 #include <sys/errno.h>
54 #include <sys/mman.h>
55 #include <sys/fbuf.h>
56 #include <sys/pathname.h>
57 #include <sys/debug.h>
58 #include <sys/vmsystm.h>
59 #include <sys/cmn_err.h>
60 #include <sys/dirent.h>
61 #include <sys/errno.h>
62 #include <sys/modctl.h>
63 #include <sys/statvfs.h>
64 #include <sys/mount.h>
65 #include <sys/sunddi.h>
66 #include <sys/bootconf.h>
67 #include <sys/policy.h>
68 
69 #include <vm/hat.h>
70 #include <vm/page.h>
71 #include <vm/pvn.h>
72 #include <vm/as.h>
73 #include <vm/seg.h>
74 #include <vm/seg_map.h>
75 #include <vm/seg_kmem.h>
76 #include <vm/seg_vn.h>
77 #include <vm/rm.h>
78 #include <vm/page.h>
79 #include <sys/swap.h>
80 
81 #include <fs/fs_subr.h>
82 
83 #include <sys/fs/udf_volume.h>
84 #include <sys/fs/udf_inode.h>
85 
86 static int32_t udf_open(struct vnode **,
87 	int32_t, struct cred *, caller_context_t *);
88 static int32_t udf_close(struct vnode *,
89 	int32_t, int32_t, offset_t, struct cred *, caller_context_t *);
90 static int32_t udf_read(struct vnode *,
91 	struct uio *, int32_t, struct cred *, caller_context_t *);
92 static int32_t udf_write(struct vnode *,
93 	struct uio *, int32_t, struct cred *, caller_context_t *);
94 static int32_t udf_ioctl(struct vnode *,
95 	int32_t, intptr_t, int32_t, struct cred *, int32_t *,
96 	caller_context_t *);
97 static int32_t udf_getattr(struct vnode *,
98 	struct vattr *, int32_t, struct cred *, caller_context_t *);
99 static int32_t udf_setattr(struct vnode *,
100 	struct vattr *, int32_t, struct cred *, caller_context_t *);
101 static int32_t udf_access(struct vnode *,
102 	int32_t, int32_t, struct cred *, caller_context_t *);
103 static int32_t udf_lookup(struct vnode *,
104 	char *, struct vnode **, struct pathname *,
105 	int32_t, struct vnode *, struct cred *,
106 	caller_context_t *, int *, pathname_t *);
107 static int32_t udf_create(struct vnode *,
108 	char *, struct vattr *, enum vcexcl,
109 	int32_t, struct vnode **, struct cred *, int32_t,
110 	caller_context_t *, vsecattr_t *);
111 static int32_t udf_remove(struct vnode *,
112 	char *, struct cred *, caller_context_t *, int);
113 static int32_t udf_link(struct vnode *,
114 	struct vnode *, char *, struct cred *, caller_context_t *, int);
115 static int32_t udf_rename(struct vnode *,
116 	char *, struct vnode *, char *, struct cred *, caller_context_t *, int);
117 static int32_t udf_mkdir(struct vnode *,
118 	char *, struct vattr *, struct vnode **, struct cred *,
119 	caller_context_t *, int, vsecattr_t *);
120 static int32_t udf_rmdir(struct vnode *,
121 	char *, struct vnode *, struct cred *, caller_context_t *, int);
122 static int32_t udf_readdir(struct vnode *,
123 	struct uio *, struct cred *, int32_t *, caller_context_t *, int);
124 static int32_t udf_symlink(struct vnode *,
125 	char *, struct vattr *, char *, struct cred *, caller_context_t *, int);
126 static int32_t udf_readlink(struct vnode *,
127 	struct uio *, struct cred *, caller_context_t *);
128 static int32_t udf_fsync(struct vnode *,
129 	int32_t, struct cred *, caller_context_t *);
130 static void udf_inactive(struct vnode *,
131 	struct cred *, caller_context_t *);
132 static int32_t udf_fid(struct vnode *, struct fid *, caller_context_t *);
133 static int udf_rwlock(struct vnode *, int32_t, caller_context_t *);
134 static void udf_rwunlock(struct vnode *, int32_t, caller_context_t *);
135 static int32_t udf_seek(struct vnode *, offset_t, offset_t *,
136 	caller_context_t *);
137 static int32_t udf_frlock(struct vnode *, int32_t,
138 	struct flock64 *, int32_t, offset_t, struct flk_callback *, cred_t *,
139 	caller_context_t *);
140 static int32_t udf_space(struct vnode *, int32_t,
141 	struct flock64 *, int32_t, offset_t, cred_t *, caller_context_t *);
142 static int32_t udf_getpage(struct vnode *, offset_t,
143 	size_t, uint32_t *, struct page **, size_t,
144 	struct seg *, caddr_t, enum seg_rw, struct cred *, caller_context_t *);
145 static int32_t udf_putpage(struct vnode *, offset_t,
146 	size_t, int32_t, struct cred *, caller_context_t *);
147 static int32_t udf_map(struct vnode *, offset_t, struct as *,
148 	caddr_t *, size_t, uint8_t, uint8_t, uint32_t, struct cred *,
149 	caller_context_t *);
150 static int32_t udf_addmap(struct vnode *, offset_t, struct as *,
151 	caddr_t, size_t, uint8_t, uint8_t, uint32_t, struct cred *,
152 	caller_context_t *);
153 static int32_t udf_delmap(struct vnode *, offset_t, struct as *,
154 	caddr_t, size_t, uint32_t, uint32_t, uint32_t, struct cred *,
155 	caller_context_t *);
156 static int32_t udf_l_pathconf(struct vnode *, int32_t,
157 	ulong_t *, struct cred *, caller_context_t *);
158 static int32_t udf_pageio(struct vnode *, struct page *,
159 	u_offset_t, size_t, int32_t, struct cred *, caller_context_t *);
160 
161 int32_t ud_getpage_miss(struct vnode *, u_offset_t,
162 	size_t, struct seg *, caddr_t, page_t *pl[],
163 	size_t, enum seg_rw, int32_t);
164 void ud_getpage_ra(struct vnode *, u_offset_t, struct seg *, caddr_t);
165 int32_t ud_putpages(struct vnode *, offset_t, size_t, int32_t, struct cred *);
166 int32_t ud_page_fill(struct ud_inode *, page_t *,
167 	u_offset_t, uint32_t, u_offset_t *);
168 int32_t ud_iodone(struct buf *);
169 int32_t ud_rdip(struct ud_inode *, struct uio *, int32_t, cred_t *);
170 int32_t ud_wrip(struct ud_inode *, struct uio *, int32_t, cred_t *);
171 int32_t ud_multi_strat(struct ud_inode *, page_t *, struct buf *, u_offset_t);
172 int32_t ud_slave_done(struct buf *);
173 
174 /*
175  * Structures to control multiple IO operations to get or put pages
176  * that are backed by discontiguous blocks. The master struct is
177  * a dummy that holds the original bp from pageio_setup. The
178  * slave struct holds the working bp's to do the actual IO. Once
179  * all the slave IOs complete. The master is processed as if a single
180  * IO op has completed.
181  */
182 uint32_t master_index = 0;
183 typedef struct mio_master {
184 	kmutex_t	mm_mutex;	/* protect the fields below */
185 	int32_t		mm_size;
186 	buf_t		*mm_bp;		/* original bp */
187 	int32_t		mm_resid;	/* bytes remaining to transfer */
188 	int32_t		mm_error;	/* accumulated error from slaves */
189 	int32_t		mm_index;	/* XXX debugging */
190 } mio_master_t;
191 
192 typedef struct mio_slave {
193 	buf_t		ms_buf;		/* working buffer for this IO chunk */
194 	mio_master_t	*ms_ptr;	/* pointer to master */
195 } mio_slave_t;
196 
197 struct vnodeops *udf_vnodeops;
198 
199 const fs_operation_def_t udf_vnodeops_template[] = {
200 	VOPNAME_OPEN,		{ .vop_open = udf_open },
201 	VOPNAME_CLOSE,		{ .vop_close = udf_close },
202 	VOPNAME_READ,		{ .vop_read = udf_read },
203 	VOPNAME_WRITE,		{ .vop_write = udf_write },
204 	VOPNAME_IOCTL,		{ .vop_ioctl = udf_ioctl },
205 	VOPNAME_GETATTR,	{ .vop_getattr = udf_getattr },
206 	VOPNAME_SETATTR,	{ .vop_setattr = udf_setattr },
207 	VOPNAME_ACCESS,		{ .vop_access = udf_access },
208 	VOPNAME_LOOKUP,		{ .vop_lookup = udf_lookup },
209 	VOPNAME_CREATE,		{ .vop_create = udf_create },
210 	VOPNAME_REMOVE,		{ .vop_remove = udf_remove },
211 	VOPNAME_LINK,		{ .vop_link = udf_link },
212 	VOPNAME_RENAME,		{ .vop_rename = udf_rename },
213 	VOPNAME_MKDIR,		{ .vop_mkdir = udf_mkdir },
214 	VOPNAME_RMDIR,		{ .vop_rmdir = udf_rmdir },
215 	VOPNAME_READDIR,	{ .vop_readdir = udf_readdir },
216 	VOPNAME_SYMLINK,	{ .vop_symlink = udf_symlink },
217 	VOPNAME_READLINK,	{ .vop_readlink = udf_readlink },
218 	VOPNAME_FSYNC,		{ .vop_fsync = udf_fsync },
219 	VOPNAME_INACTIVE,	{ .vop_inactive = udf_inactive },
220 	VOPNAME_FID,		{ .vop_fid = udf_fid },
221 	VOPNAME_RWLOCK,		{ .vop_rwlock = udf_rwlock },
222 	VOPNAME_RWUNLOCK,	{ .vop_rwunlock = udf_rwunlock },
223 	VOPNAME_SEEK,		{ .vop_seek = udf_seek },
224 	VOPNAME_FRLOCK,		{ .vop_frlock = udf_frlock },
225 	VOPNAME_SPACE,		{ .vop_space = udf_space },
226 	VOPNAME_GETPAGE,	{ .vop_getpage = udf_getpage },
227 	VOPNAME_PUTPAGE,	{ .vop_putpage = udf_putpage },
228 	VOPNAME_MAP,		{ .vop_map = udf_map },
229 	VOPNAME_ADDMAP,		{ .vop_addmap = udf_addmap },
230 	VOPNAME_DELMAP,		{ .vop_delmap = udf_delmap },
231 	VOPNAME_PATHCONF,	{ .vop_pathconf = udf_l_pathconf },
232 	VOPNAME_PAGEIO,		{ .vop_pageio = udf_pageio },
233 	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
234 	NULL,			NULL
235 };
236 
237 /* ARGSUSED */
238 static int32_t
239 udf_open(
240 	struct vnode **vpp,
241 	int32_t flag,
242 	struct cred *cr,
243 	caller_context_t *ct)
244 {
245 	ud_printf("udf_open\n");
246 
247 	return (0);
248 }
249 
250 /* ARGSUSED */
251 static int32_t
252 udf_close(
253 	struct vnode *vp,
254 	int32_t flag,
255 	int32_t count,
256 	offset_t offset,
257 	struct cred *cr,
258 	caller_context_t *ct)
259 {
260 	struct ud_inode *ip = VTOI(vp);
261 
262 	ud_printf("udf_close\n");
263 
264 	ITIMES(ip);
265 
266 	cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
267 	cleanshares(vp, ttoproc(curthread)->p_pid);
268 
269 	/*
270 	 * Push partially filled cluster at last close.
271 	 * ``last close'' is approximated because the dnlc
272 	 * may have a hold on the vnode.
273 	 */
274 	if (vp->v_count <= 2 && vp->v_type != VBAD) {
275 		struct ud_inode *ip = VTOI(vp);
276 		if (ip->i_delaylen) {
277 			(void) ud_putpages(vp, ip->i_delayoff, ip->i_delaylen,
278 					B_ASYNC | B_FREE, cr);
279 			ip->i_delaylen = 0;
280 		}
281 	}
282 
283 	return (0);
284 }
285 
286 /* ARGSUSED */
287 static int32_t
288 udf_read(
289 	struct vnode *vp,
290 	struct uio *uiop,
291 	int32_t ioflag,
292 	struct cred *cr,
293 	caller_context_t *ct)
294 {
295 	struct ud_inode *ip = VTOI(vp);
296 	int32_t error;
297 
298 	ud_printf("udf_read\n");
299 
300 #ifdef	__lock_lint
301 	rw_enter(&ip->i_rwlock, RW_READER);
302 #endif
303 
304 	ASSERT(RW_READ_HELD(&ip->i_rwlock));
305 
306 	if (MANDLOCK(vp, ip->i_char)) {
307 		/*
308 		 * udf_getattr ends up being called by chklock
309 		 */
310 		error = chklock(vp, FREAD, uiop->uio_loffset,
311 			uiop->uio_resid, uiop->uio_fmode, ct);
312 		if (error) {
313 			goto end;
314 		}
315 	}
316 
317 	rw_enter(&ip->i_contents, RW_READER);
318 	error = ud_rdip(ip, uiop, ioflag, cr);
319 	rw_exit(&ip->i_contents);
320 
321 end:
322 #ifdef	__lock_lint
323 	rw_exit(&ip->i_rwlock);
324 #endif
325 
326 	return (error);
327 }
328 
329 
330 int32_t ud_WRITES = 1;
331 int32_t ud_HW = 96 * 1024;
332 int32_t ud_LW = 64 * 1024;
333 int32_t ud_throttles = 0;
334 
335 /* ARGSUSED */
336 static int32_t
337 udf_write(
338 	struct vnode *vp,
339 	struct uio *uiop,
340 	int32_t ioflag,
341 	struct cred *cr,
342 	caller_context_t *ct)
343 {
344 	struct ud_inode *ip = VTOI(vp);
345 	int32_t error = 0;
346 
347 	ud_printf("udf_write\n");
348 
349 #ifdef	__lock_lint
350 	rw_enter(&ip->i_rwlock, RW_WRITER);
351 #endif
352 
353 	ASSERT(RW_WRITE_HELD(&ip->i_rwlock));
354 
355 	if (MANDLOCK(vp, ip->i_char)) {
356 		/*
357 		 * ud_getattr ends up being called by chklock
358 		 */
359 		error = chklock(vp, FWRITE, uiop->uio_loffset,
360 			uiop->uio_resid, uiop->uio_fmode, ct);
361 		if (error) {
362 			goto end;
363 		}
364 	}
365 	/*
366 	 * Throttle writes.
367 	 */
368 	mutex_enter(&ip->i_tlock);
369 	if (ud_WRITES && (ip->i_writes > ud_HW)) {
370 		while (ip->i_writes > ud_HW) {
371 			ud_throttles++;
372 			cv_wait(&ip->i_wrcv, &ip->i_tlock);
373 		}
374 	}
375 	mutex_exit(&ip->i_tlock);
376 
377 	/*
378 	 * Write to the file
379 	 */
380 	rw_enter(&ip->i_contents, RW_WRITER);
381 	if ((ioflag & FAPPEND) != 0 && (ip->i_type == VREG)) {
382 		/*
383 		 * In append mode start at end of file.
384 		 */
385 		uiop->uio_loffset = ip->i_size;
386 	}
387 	error = ud_wrip(ip, uiop, ioflag, cr);
388 	rw_exit(&ip->i_contents);
389 
390 end:
391 #ifdef	__lock_lint
392 	rw_exit(&ip->i_rwlock);
393 #endif
394 
395 	return (error);
396 }
397 
398 /* ARGSUSED */
399 static int32_t
400 udf_ioctl(
401 	struct vnode *vp,
402 	int32_t cmd,
403 	intptr_t arg,
404 	int32_t flag,
405 	struct cred *cr,
406 	int32_t *rvalp,
407 	caller_context_t *ct)
408 {
409 	return (ENOTTY);
410 }
411 
412 /* ARGSUSED */
413 static int32_t
414 udf_getattr(
415 	struct vnode *vp,
416 	struct vattr *vap,
417 	int32_t flags,
418 	struct cred *cr,
419 	caller_context_t *ct)
420 {
421 	struct ud_inode *ip = VTOI(vp);
422 
423 	ud_printf("udf_getattr\n");
424 
425 	if (vap->va_mask == AT_SIZE) {
426 		/*
427 		 * for performance, if only the size is requested don't bother
428 		 * with anything else.
429 		 */
430 		vap->va_size = ip->i_size;
431 		return (0);
432 	}
433 
434 	rw_enter(&ip->i_contents, RW_READER);
435 
436 	vap->va_type = vp->v_type;
437 	vap->va_mode = UD2VA_PERM(ip->i_perm) | ip->i_char;
438 
439 	vap->va_uid = ip->i_uid;
440 	vap->va_gid = ip->i_gid;
441 	vap->va_fsid = ip->i_dev;
442 	vap->va_nodeid = ip->i_icb_lbano;
443 	vap->va_nlink = ip->i_nlink;
444 	vap->va_size = ip->i_size;
445 	vap->va_seq = ip->i_seq;
446 	if (vp->v_type == VCHR || vp->v_type == VBLK) {
447 		vap->va_rdev = ip->i_rdev;
448 	} else {
449 		vap->va_rdev = 0;
450 	}
451 
452 	mutex_enter(&ip->i_tlock);
453 	ITIMES_NOLOCK(ip);	/* mark correct time in inode */
454 	vap->va_atime.tv_sec = (time_t)ip->i_atime.tv_sec;
455 	vap->va_atime.tv_nsec = ip->i_atime.tv_nsec;
456 	vap->va_mtime.tv_sec = (time_t)ip->i_mtime.tv_sec;
457 	vap->va_mtime.tv_nsec = ip->i_mtime.tv_nsec;
458 	vap->va_ctime.tv_sec = (time_t)ip->i_ctime.tv_sec;
459 	vap->va_ctime.tv_nsec = ip->i_ctime.tv_nsec;
460 	mutex_exit(&ip->i_tlock);
461 
462 	switch (ip->i_type) {
463 		case VBLK:
464 			vap->va_blksize = MAXBSIZE;
465 			break;
466 		case VCHR:
467 			vap->va_blksize = MAXBSIZE;
468 			break;
469 		default:
470 			vap->va_blksize = ip->i_udf->udf_lbsize;
471 			break;
472 	}
473 	vap->va_nblocks = ip->i_lbr << ip->i_udf->udf_l2d_shift;
474 
475 	rw_exit(&ip->i_contents);
476 
477 	return (0);
478 }
479 
/*
 * Access-check callback handed to secpolicy_vnode_setattr() by
 * udf_setattr(): converts the supplied mode with UD_UPERM2DPERM() and
 * forwards the check to ud_iaccess().
 */
static int
ud_iaccess_vmode(void *ip, int mode, struct cred *cr)
{
	int dperm = UD_UPERM2DPERM(mode);

	return (ud_iaccess(ip, dperm, cr));
}
485 
486 /*ARGSUSED4*/
487 static int32_t
488 udf_setattr(
489 	struct vnode *vp,
490 	struct vattr *vap,
491 	int32_t flags,
492 	struct cred *cr,
493 	caller_context_t *ct)
494 {
495 	int32_t error = 0;
496 	uint32_t mask = vap->va_mask;
497 	struct ud_inode *ip;
498 	timestruc_t now;
499 	struct vattr ovap;
500 
501 	ud_printf("udf_setattr\n");
502 
503 	ip = VTOI(vp);
504 
505 	/*
506 	 * not updates allowed to 4096 files
507 	 */
508 	if (ip->i_astrat == STRAT_TYPE4096) {
509 		return (EINVAL);
510 	}
511 
512 	/*
513 	 * Cannot set these attributes
514 	 */
515 	if (mask & AT_NOSET) {
516 		return (EINVAL);
517 	}
518 
519 	rw_enter(&ip->i_rwlock, RW_WRITER);
520 	rw_enter(&ip->i_contents, RW_WRITER);
521 
522 	ovap.va_uid = ip->i_uid;
523 	ovap.va_mode = UD2VA_PERM(ip->i_perm) | ip->i_char;
524 	error = secpolicy_vnode_setattr(cr, vp, vap, &ovap, flags,
525 					    ud_iaccess_vmode, ip);
526 	if (error)
527 		goto update_inode;
528 
529 	mask = vap->va_mask;
530 	/*
531 	 * Change file access modes.
532 	 */
533 	if (mask & AT_MODE) {
534 		ip->i_perm = VA2UD_PERM(vap->va_mode);
535 		ip->i_char = vap->va_mode & (VSUID | VSGID | VSVTX);
536 		mutex_enter(&ip->i_tlock);
537 		ip->i_flag |= ICHG;
538 		mutex_exit(&ip->i_tlock);
539 	}
540 	if (mask & (AT_UID|AT_GID)) {
541 		if (mask & AT_UID) {
542 			ip->i_uid = vap->va_uid;
543 		}
544 		if (mask & AT_GID) {
545 			ip->i_gid = vap->va_gid;
546 		}
547 		mutex_enter(&ip->i_tlock);
548 		ip->i_flag |= ICHG;
549 		mutex_exit(&ip->i_tlock);
550 	}
551 	/*
552 	 * Truncate file.  Must have write permission and not be a directory.
553 	 */
554 	if (mask & AT_SIZE) {
555 		if (vp->v_type == VDIR) {
556 			error = EISDIR;
557 			goto update_inode;
558 		}
559 		if (error = ud_iaccess(ip, IWRITE, cr)) {
560 			goto update_inode;
561 		}
562 		if (vap->va_size > MAXOFFSET_T) {
563 			error = EFBIG;
564 			goto update_inode;
565 		}
566 		if (error = ud_itrunc(ip, vap->va_size, 0, cr)) {
567 			goto update_inode;
568 		}
569 	}
570 	/*
571 	 * Change file access or modified times.
572 	 */
573 	if (mask & (AT_ATIME|AT_MTIME)) {
574 		mutex_enter(&ip->i_tlock);
575 		if (mask & AT_ATIME) {
576 			ip->i_atime.tv_sec = vap->va_atime.tv_sec;
577 			ip->i_atime.tv_nsec = vap->va_atime.tv_nsec;
578 			ip->i_flag &= ~IACC;
579 		}
580 		if (mask & AT_MTIME) {
581 			ip->i_mtime.tv_sec = vap->va_mtime.tv_sec;
582 			ip->i_mtime.tv_nsec = vap->va_mtime.tv_nsec;
583 			gethrestime(&now);
584 			ip->i_ctime.tv_sec = now.tv_sec;
585 			ip->i_ctime.tv_nsec = now.tv_nsec;
586 			ip->i_flag &= ~(IUPD|ICHG);
587 			ip->i_flag |= IMODTIME;
588 		}
589 		ip->i_flag |= IMOD;
590 		mutex_exit(&ip->i_tlock);
591 	}
592 
593 update_inode:
594 	if (curthread->t_flag & T_DONTPEND) {
595 		ud_iupdat(ip, 1);
596 	} else {
597 		ITIMES_NOLOCK(ip);
598 	}
599 	rw_exit(&ip->i_contents);
600 	rw_exit(&ip->i_rwlock);
601 
602 	return (error);
603 }
604 
605 /* ARGSUSED */
606 static int32_t
607 udf_access(
608 	struct vnode *vp,
609 	int32_t mode,
610 	int32_t flags,
611 	struct cred *cr,
612 	caller_context_t *ct)
613 {
614 	struct ud_inode *ip = VTOI(vp);
615 	int32_t error;
616 
617 	ud_printf("udf_access\n");
618 
619 	if (ip->i_udf == NULL) {
620 		return (EIO);
621 	}
622 
623 	error = ud_iaccess(ip, UD_UPERM2DPERM(mode), cr);
624 
625 	return (error);
626 }
627 
628 int32_t udfs_stickyhack = 1;
629 
630 /* ARGSUSED */
631 static int32_t
632 udf_lookup(
633 	struct vnode *dvp,
634 	char *nm,
635 	struct vnode **vpp,
636 	struct pathname *pnp,
637 	int32_t flags,
638 	struct vnode *rdir,
639 	struct cred *cr,
640 	caller_context_t *ct,
641 	int *direntflags,
642 	pathname_t *realpnp)
643 {
644 	int32_t error;
645 	struct vnode *vp;
646 	struct ud_inode *ip, *xip;
647 
648 	ud_printf("udf_lookup\n");
649 	/*
650 	 * Null component name is a synonym for directory being searched.
651 	 */
652 	if (*nm == '\0') {
653 		VN_HOLD(dvp);
654 		*vpp = dvp;
655 		error = 0;
656 		goto out;
657 	}
658 
659 	/*
660 	 * Fast path: Check the directory name lookup cache.
661 	 */
662 	ip = VTOI(dvp);
663 	if (vp = dnlc_lookup(dvp, nm)) {
664 		/*
665 		 * Check accessibility of directory.
666 		 */
667 		if ((error = ud_iaccess(ip, IEXEC, cr)) != 0) {
668 			VN_RELE(vp);
669 		}
670 		xip = VTOI(vp);
671 	} else {
672 		error = ud_dirlook(ip, nm, &xip, cr, 1);
673 		ITIMES(ip);
674 	}
675 
676 	if (error == 0) {
677 		ip = xip;
678 		*vpp = ITOV(ip);
679 		if ((ip->i_type != VDIR) &&
680 			(ip->i_char & ISVTX) &&
681 			((ip->i_perm & IEXEC) == 0) &&
682 			udfs_stickyhack) {
683 			mutex_enter(&(*vpp)->v_lock);
684 			(*vpp)->v_flag |= VISSWAP;
685 			mutex_exit(&(*vpp)->v_lock);
686 		}
687 		ITIMES(ip);
688 		/*
689 		 * If vnode is a device return special vnode instead.
690 		 */
691 		if (IS_DEVVP(*vpp)) {
692 			struct vnode *newvp;
693 			newvp = specvp(*vpp, (*vpp)->v_rdev,
694 					(*vpp)->v_type, cr);
695 			VN_RELE(*vpp);
696 			if (newvp == NULL) {
697 				error = ENOSYS;
698 			} else {
699 				*vpp = newvp;
700 			}
701 		}
702 	}
703 out:
704 	return (error);
705 }
706 
707 /* ARGSUSED */
708 static int32_t
709 udf_create(
710 	struct vnode *dvp,
711 	char *name,
712 	struct vattr *vap,
713 	enum vcexcl excl,
714 	int32_t mode,
715 	struct vnode **vpp,
716 	struct cred *cr,
717 	int32_t flag,
718 	caller_context_t *ct,
719 	vsecattr_t *vsecp)
720 {
721 	int32_t error;
722 	struct ud_inode *ip = VTOI(dvp), *xip;
723 
724 	ud_printf("udf_create\n");
725 
726 	if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr) != 0)
727 		vap->va_mode &= ~VSVTX;
728 
729 	if (*name == '\0') {
730 		/*
731 		 * Null component name refers to the directory itself.
732 		 */
733 		VN_HOLD(dvp);
734 		ITIMES(ip);
735 		error = EEXIST;
736 	} else {
737 		xip = NULL;
738 		rw_enter(&ip->i_rwlock, RW_WRITER);
739 		error = ud_direnter(ip, name, DE_CREATE,
740 			(struct ud_inode *)0, (struct ud_inode *)0,
741 			vap, &xip, cr, ct);
742 		rw_exit(&ip->i_rwlock);
743 		ITIMES(ip);
744 		ip = xip;
745 	}
746 #ifdef	__lock_lint
747 	rw_enter(&ip->i_contents, RW_WRITER);
748 #else
749 	if (ip != NULL) {
750 		rw_enter(&ip->i_contents, RW_WRITER);
751 	}
752 #endif
753 
754 	/*
755 	 * If the file already exists and this is a non-exclusive create,
756 	 * check permissions and allow access for non-directories.
757 	 * Read-only create of an existing directory is also allowed.
758 	 * We fail an exclusive create of anything which already exists.
759 	 */
760 	if (error == EEXIST) {
761 		if (excl == NONEXCL) {
762 			if ((ip->i_type == VDIR) && (mode & VWRITE)) {
763 				error = EISDIR;
764 			} else if (mode) {
765 				error = ud_iaccess(ip,
766 					UD_UPERM2DPERM(mode), cr);
767 			} else {
768 				error = 0;
769 			}
770 		}
771 		if (error) {
772 			rw_exit(&ip->i_contents);
773 			VN_RELE(ITOV(ip));
774 			goto out;
775 		} else if ((ip->i_type == VREG) &&
776 			(vap->va_mask & AT_SIZE) && vap->va_size == 0) {
777 			/*
778 			 * Truncate regular files, if requested by caller.
779 			 * Grab i_rwlock to make sure no one else is
780 			 * currently writing to the file (we promised
781 			 * bmap we would do this).
782 			 * Must get the locks in the correct order.
783 			 */
784 			if (ip->i_size == 0) {
785 				ip->i_flag |= ICHG | IUPD;
786 			} else {
787 				rw_exit(&ip->i_contents);
788 				rw_enter(&ip->i_rwlock, RW_WRITER);
789 				rw_enter(&ip->i_contents, RW_WRITER);
790 				(void) ud_itrunc(ip, 0, 0, cr);
791 				rw_exit(&ip->i_rwlock);
792 			}
793 			vnevent_create(ITOV(ip), ct);
794 		}
795 	}
796 
797 	if (error == 0) {
798 		*vpp = ITOV(ip);
799 		ITIMES(ip);
800 	}
801 #ifdef	__lock_lint
802 	rw_exit(&ip->i_contents);
803 #else
804 	if (ip != NULL) {
805 		rw_exit(&ip->i_contents);
806 	}
807 #endif
808 	if (error) {
809 		goto out;
810 	}
811 
812 	/*
813 	 * If vnode is a device return special vnode instead.
814 	 */
815 	if (!error && IS_DEVVP(*vpp)) {
816 		struct vnode *newvp;
817 
818 		newvp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
819 		VN_RELE(*vpp);
820 		if (newvp == NULL) {
821 			error = ENOSYS;
822 			goto out;
823 		}
824 		*vpp = newvp;
825 	}
826 out:
827 	return (error);
828 }
829 
830 /* ARGSUSED */
831 static int32_t
832 udf_remove(
833 	struct vnode *vp,
834 	char *nm,
835 	struct cred *cr,
836 	caller_context_t *ct,
837 	int flags)
838 {
839 	int32_t error;
840 	struct ud_inode *ip = VTOI(vp);
841 
842 	ud_printf("udf_remove\n");
843 
844 	rw_enter(&ip->i_rwlock, RW_WRITER);
845 	error = ud_dirremove(ip, nm,
846 		(struct ud_inode *)0, (struct vnode *)0, DR_REMOVE, cr, ct);
847 	rw_exit(&ip->i_rwlock);
848 	ITIMES(ip);
849 
850 	return (error);
851 }
852 
853 /* ARGSUSED */
854 static int32_t
855 udf_link(
856 	struct vnode *tdvp,
857 	struct vnode *svp,
858 	char *tnm,
859 	struct cred *cr,
860 	caller_context_t *ct,
861 	int flags)
862 {
863 	int32_t error;
864 	struct vnode *realvp;
865 	struct ud_inode *sip;
866 	struct ud_inode *tdp;
867 
868 	ud_printf("udf_link\n");
869 	if (VOP_REALVP(svp, &realvp, ct) == 0) {
870 		svp = realvp;
871 	}
872 
873 	/*
874 	 * Do not allow links to directories
875 	 */
876 	if (svp->v_type == VDIR) {
877 		return (EPERM);
878 	}
879 
880 	sip = VTOI(svp);
881 
882 	if (sip->i_uid != crgetuid(cr) && secpolicy_basic_link(cr) != 0)
883 		return (EPERM);
884 
885 	tdp = VTOI(tdvp);
886 
887 	rw_enter(&tdp->i_rwlock, RW_WRITER);
888 	error = ud_direnter(tdp, tnm, DE_LINK, (struct ud_inode *)0,
889 		sip, (struct vattr *)0, (struct ud_inode **)0, cr, ct);
890 	rw_exit(&tdp->i_rwlock);
891 	ITIMES(sip);
892 	ITIMES(tdp);
893 
894 	if (error == 0) {
895 		vnevent_link(svp, ct);
896 	}
897 
898 	return (error);
899 }
900 
901 /* ARGSUSED */
902 static int32_t
903 udf_rename(
904 	struct vnode *sdvp,
905 	char *snm,
906 	struct vnode *tdvp,
907 	char *tnm,
908 	struct cred *cr,
909 	caller_context_t *ct,
910 	int flags)
911 {
912 	int32_t error = 0;
913 	struct udf_vfs *udf_vfsp;
914 	struct ud_inode *sip;		/* source inode */
915 	struct ud_inode *sdp, *tdp;	/* source and target parent inode */
916 	struct vnode *realvp;
917 
918 	ud_printf("udf_rename\n");
919 
920 	if (VOP_REALVP(tdvp, &realvp, ct) == 0) {
921 		tdvp = realvp;
922 	}
923 
924 	sdp = VTOI(sdvp);
925 	tdp = VTOI(tdvp);
926 
927 	udf_vfsp = sdp->i_udf;
928 
929 	mutex_enter(&udf_vfsp->udf_rename_lck);
930 	/*
931 	 * Look up inode of file we're supposed to rename.
932 	 */
933 	if (error = ud_dirlook(sdp, snm, &sip, cr, 0)) {
934 		mutex_exit(&udf_vfsp->udf_rename_lck);
935 		return (error);
936 	}
937 	/*
938 	 * be sure this is not a directory with another file system mounted
939 	 * over it.  If it is just give up the locks, and return with
940 	 * EBUSY
941 	 */
942 	if (vn_mountedvfs(ITOV(sip)) != NULL) {
943 		error = EBUSY;
944 		goto errout;
945 	}
946 	/*
947 	 * Make sure we can delete the source entry.  This requires
948 	 * write permission on the containing directory.  If that
949 	 * directory is "sticky" it further requires (except for
950 	 * privileged users) that the user own the directory or the
951 	 * source entry, or else have permission to write the source
952 	 * entry.
953 	 */
954 	rw_enter(&sdp->i_contents, RW_READER);
955 	rw_enter(&sip->i_contents, RW_READER);
956 	if ((error = ud_iaccess(sdp, IWRITE, cr)) != 0 ||
957 	    (error = ud_sticky_remove_access(sdp, sip, cr)) != 0) {
958 		rw_exit(&sip->i_contents);
959 		rw_exit(&sdp->i_contents);
960 		ITIMES(sip);
961 		goto errout;
962 	}
963 
964 	/*
965 	 * Check for renaming '.' or '..' or alias of '.'
966 	 */
967 	if ((strcmp(snm, ".") == 0) ||
968 			(strcmp(snm, "..") == 0) ||
969 			(sdp == sip)) {
970 		error = EINVAL;
971 		rw_exit(&sip->i_contents);
972 		rw_exit(&sdp->i_contents);
973 		goto errout;
974 	}
975 	rw_exit(&sip->i_contents);
976 	rw_exit(&sdp->i_contents);
977 
978 
979 	/*
980 	 * Link source to the target.
981 	 */
982 	rw_enter(&tdp->i_rwlock, RW_WRITER);
983 	if (error = ud_direnter(tdp, tnm, DE_RENAME, sdp, sip,
984 	    (struct vattr *)0, (struct ud_inode **)0, cr, ct)) {
985 		/*
986 		 * ESAME isn't really an error; it indicates that the
987 		 * operation should not be done because the source and target
988 		 * are the same file, but that no error should be reported.
989 		 */
990 		if (error == ESAME) {
991 			error = 0;
992 		}
993 		rw_exit(&tdp->i_rwlock);
994 		goto errout;
995 	}
996 	vnevent_rename_src(ITOV(sip), sdvp, snm, ct);
997 	rw_exit(&tdp->i_rwlock);
998 
999 	rw_enter(&sdp->i_rwlock, RW_WRITER);
1000 	/*
1001 	 * Unlink the source.
1002 	 * Remove the source entry.  ud_dirremove() checks that the entry
1003 	 * still reflects sip, and returns an error if it doesn't.
1004 	 * If the entry has changed just forget about it.  Release
1005 	 * the source inode.
1006 	 */
1007 	if ((error = ud_dirremove(sdp, snm, sip, (struct vnode *)0,
1008 	    DR_RENAME, cr, ct)) == ENOENT) {
1009 		error = 0;
1010 	}
1011 	rw_exit(&sdp->i_rwlock);
1012 errout:
1013 	ITIMES(sdp);
1014 	ITIMES(tdp);
1015 	VN_RELE(ITOV(sip));
1016 	mutex_exit(&udf_vfsp->udf_rename_lck);
1017 
1018 	return (error);
1019 }
1020 
1021 /* ARGSUSED */
1022 static int32_t
1023 udf_mkdir(
1024 	struct vnode *dvp,
1025 	char *dirname,
1026 	struct vattr *vap,
1027 	struct vnode **vpp,
1028 	struct cred *cr,
1029 	caller_context_t *ct,
1030 	int flags,
1031 	vsecattr_t *vsecp)
1032 {
1033 	int32_t error;
1034 	struct ud_inode *ip;
1035 	struct ud_inode *xip;
1036 
1037 	ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
1038 
1039 	ud_printf("udf_mkdir\n");
1040 
1041 	ip = VTOI(dvp);
1042 	rw_enter(&ip->i_rwlock, RW_WRITER);
1043 	error = ud_direnter(ip, dirname, DE_MKDIR,
1044 		(struct ud_inode *)0, (struct ud_inode *)0, vap, &xip, cr, ct);
1045 	rw_exit(&ip->i_rwlock);
1046 	ITIMES(ip);
1047 	if (error == 0) {
1048 		ip = xip;
1049 		*vpp = ITOV(ip);
1050 		ITIMES(ip);
1051 	} else if (error == EEXIST) {
1052 		ITIMES(xip);
1053 		VN_RELE(ITOV(xip));
1054 	}
1055 
1056 	return (error);
1057 }
1058 
1059 /* ARGSUSED */
1060 static int32_t
1061 udf_rmdir(
1062 	struct vnode *vp,
1063 	char *nm,
1064 	struct vnode *cdir,
1065 	struct cred *cr,
1066 	caller_context_t *ct,
1067 	int flags)
1068 {
1069 	int32_t error;
1070 	struct ud_inode *ip = VTOI(vp);
1071 
1072 	ud_printf("udf_rmdir\n");
1073 
1074 	rw_enter(&ip->i_rwlock, RW_WRITER);
1075 	error = ud_dirremove(ip, nm, (struct ud_inode *)0, cdir, DR_RMDIR,
1076 		cr, ct);
1077 	rw_exit(&ip->i_rwlock);
1078 	ITIMES(ip);
1079 
1080 	return (error);
1081 }
1082 
/*
 * udf_readdir - return directory entries of vp as dirent64 records.
 *
 * Records are assembled in a temporary kernel buffer whose size is taken
 * from the first iovec of uiop and are copied out with one uiomove().
 * A "." record is synthesized when the request starts at offset 0; it is
 * given d_off 0x10, and a request that resumes at 0x10 is mapped back to
 * byte 0 of the directory so the scan starts at the first file identifier.
 * Identifiers flagged FID_DELETED are skipped and the FID_PARENT
 * identifier is reported as "..".
 *
 * If the starting offset is already at or past i_size, or the directory
 * has no links, *eofp (when non-NULL) is set to 1 and 0 is returned.
 * When records were produced, uiop->uio_offset is set to the directory
 * offset of the next identifier to read.  Returns 0 or an errno value;
 * on error nothing is copied out.
 */
1083 /* ARGSUSED */
1084 static int32_t
1085 udf_readdir(
1086 	struct vnode *vp,
1087 	struct uio *uiop,
1088 	struct cred *cr,
1089 	int32_t *eofp,
1090 	caller_context_t *ct,
1091 	int flags)
1092 {
1093 	struct ud_inode *ip;
1094 	struct dirent64 *nd;
1095 	struct udf_vfs *udf_vfsp;
1096 	int32_t error = 0, len, outcount = 0;
1097 	uint32_t dirsiz, offset;
1098 	uint32_t bufsize, ndlen, dummy;
1099 	caddr_t outbuf;
1100 	caddr_t outb, end_outb;
1101 	struct iovec *iovp;
1102 
1103 	uint8_t *dname;
1104 	int32_t length;
1105 
1106 	uint8_t *buf = NULL;
1107 
1108 	struct fbuf *fbp = NULL;
1109 	struct file_id *fid;
1110 	uint8_t *name;
1111 
1112 
1113 	ud_printf("udf_readdir\n");
1114 
1115 	ip = VTOI(vp);
1116 	udf_vfsp = ip->i_udf;
1117 
	/* Nothing to return past the end or for an unlinked directory. */
1118 	dirsiz = ip->i_size;
1119 	if ((uiop->uio_offset >= dirsiz) ||
1120 			(ip->i_nlink <= 0)) {
1121 		if (eofp) {
1122 			*eofp = 1;
1123 		}
1124 		return (0);
1125 	}
1126 
1127 	offset = uiop->uio_offset;
1128 	iovp = uiop->uio_iov;
1129 	bufsize = iovp->iov_len;
1130 
	/* outbuf: staging area for dirent64 records; nd: next free record. */
1131 	outb = outbuf = (char *)kmem_alloc((uint32_t)bufsize, KM_SLEEP);
1132 	end_outb = outb + bufsize;
1133 	nd = (struct dirent64 *)outbuf;
1134 
	/* dname: uncompressed name; buf: scratch for ud_get_next_fid(). */
1135 	dname = (uint8_t *)kmem_zalloc(1024, KM_SLEEP);
1136 	buf = (uint8_t *)kmem_zalloc(udf_vfsp->udf_lbsize, KM_SLEEP);
1137 
1138 	if (offset == 0) {
		/* Synthesize "." ; its d_off (0x10) is the resume cookie. */
1139 		len = DIRENT64_RECLEN(1);
1140 		if (((caddr_t)nd + len) >= end_outb) {
1141 			error = EINVAL;
1142 			goto end;
1143 		}
1144 		nd->d_ino = ip->i_icb_lbano;
1145 		nd->d_reclen = (uint16_t)len;
1146 		nd->d_off = 0x10;
1147 		nd->d_name[0] = '.';
1148 		bzero(&nd->d_name[1], DIRENT64_NAMELEN(len) - 1);
1149 		nd = (struct dirent64 *)((char *)nd + nd->d_reclen);
1150 		outcount++;
1151 	} else if (offset == 0x10) {
		/* Resuming right after "." : start at the first identifier. */
1152 		offset = 0;
1153 	}
1154 
1155 	while (offset < dirsiz) {
1156 		error = ud_get_next_fid(ip, &fbp,
1157 				offset, &fid, &name, buf);
1158 		if (error != 0) {
1159 			break;
1160 		}
1161 
1162 		if ((fid->fid_flags & FID_DELETED) == 0) {
1163 			if (fid->fid_flags & FID_PARENT) {
1164 
1165 				len = DIRENT64_RECLEN(2);
1166 				if (((caddr_t)nd + len) >= end_outb) {
1167 					error = EINVAL;
1168 					break;
1169 				}
1170 
				/*
				 * NOTE(review): ".." is reported with this
				 * directory's own i_icb_lbano rather than a
				 * value derived from fid->fid_icb -- confirm
				 * this is intended.
				 */
1171 				nd->d_ino = ip->i_icb_lbano;
1172 				nd->d_reclen = (uint16_t)len;
1173 				nd->d_off = offset + FID_LEN(fid);
1174 				nd->d_name[0] = '.';
1175 				nd->d_name[1] = '.';
1176 				bzero(&nd->d_name[2],
1177 				    DIRENT64_NAMELEN(len) - 2);
1178 				nd = (struct dirent64 *)
1179 					((char *)nd + nd->d_reclen);
1180 			} else {
1181 				if ((error = ud_uncompress(fid->fid_idlen,
1182 						&length, name, dname)) != 0) {
1183 					break;
1184 				}
				/* Skip identifiers with an empty name. */
1185 				if (length == 0) {
1186 					offset += FID_LEN(fid);
1187 					continue;
1188 				}
1189 				len = DIRENT64_RECLEN(length);
				/*
				 * Out of room: fail only if nothing has been
				 * produced yet, otherwise return what we have.
				 */
1190 				if (((caddr_t)nd + len) >= end_outb) {
1191 					if (!outcount) {
1192 						error = EINVAL;
1193 					}
1194 					break;
1195 				}
1196 				(void) strncpy(nd->d_name,
1197 				    (caddr_t)dname, length);
1198 				bzero(&nd->d_name[length],
1199 				    DIRENT64_NAMELEN(len) - length);
1200 				nd->d_ino = ud_xlate_to_daddr(udf_vfsp,
1201 					SWAP_16(fid->fid_icb.lad_ext_prn),
1202 					SWAP_32(fid->fid_icb.lad_ext_loc), 1,
1203 					&dummy);
1204 				nd->d_reclen = (uint16_t)len;
1205 				nd->d_off = offset + FID_LEN(fid);
1206 				nd = (struct dirent64 *)
1207 					((char *)nd + nd->d_reclen);
1208 			}
1209 			outcount++;
1210 		}
1211 
1212 		offset += FID_LEN(fid);
1213 	}
1214 
1215 end:
1216 	if (fbp != NULL) {
1217 		fbrelse(fbp, S_OTHER);
1218 	}
	/* Number of bytes of dirent64 records staged in outbuf. */
1219 	ndlen = ((char *)nd - outbuf);
1220 	/*
1221 	 * In case of error do not call uiomove.
1222 	 * Return the error to the caller.
1223 	 */
	/*
	 * NOTE(review): if only the "." record fit in the buffer, offset is
	 * still 0 here, so uio_offset is set to 0 rather than 0x10 and the
	 * next call would produce "." again -- confirm callers never pass a
	 * buffer that small.
	 */
1224 	if ((error == 0) && (ndlen != 0)) {
1225 		error = uiomove(outbuf, (long)ndlen, UIO_READ, uiop);
1226 		uiop->uio_offset = offset;
1227 	}
1228 	kmem_free((caddr_t)buf, udf_vfsp->udf_lbsize);
1229 	kmem_free((caddr_t)dname, 1024);
1230 	kmem_free(outbuf, (uint32_t)bufsize);
1231 	if (eofp && error == 0) {
1232 		*eofp = (uiop->uio_offset >= dirsiz);
1233 	}
1234 	return (error);
1235 }
1236 
/*
 * udf_symlink - create symbolic link "linkname" in directory dvp that
 * points at "target".
 *
 * The directory entry and inode are created with ud_direnter(DE_CREATE)
 * under the directory's i_rwlock.  The target path is then translated,
 * component by component, into a sequence of path_comp records built in a
 * zeroed PAGESIZE buffer and written to the new inode with ud_rdwri():
 *	type 2	leading "/" (root)
 *	type 3	".."
 *	type 4	"."
 *	type 5	a name, stored as the output of ud_compress()
 * Each record occupies 4 header bytes plus pc_len identifier bytes.
 *
 * If the translation or the write fails, the new directory entry is
 * removed again.  Returns 0 or an errno value.
 */
1237 /* ARGSUSED */
1238 static int32_t
1239 udf_symlink(
1240 	struct vnode *dvp,
1241 	char *linkname,
1242 	struct vattr *vap,
1243 	char *target,
1244 	struct cred *cr,
1245 	caller_context_t *ct,
1246 	int flags)
1247 {
1248 	int32_t error = 0, outlen;
1249 	uint32_t ioflag = 0;
1250 	struct ud_inode *ip, *dip = VTOI(dvp);
1251 
1252 	struct path_comp *pc;
1253 	int8_t *dname = NULL, *uname = NULL, *sp;
1254 
1255 	ud_printf("udf_symlink\n");
1256 
1257 	ip = (struct ud_inode *)0;
1258 	vap->va_type = VLNK;
1259 	vap->va_rdev = 0;
1260 
1261 	rw_enter(&dip->i_rwlock, RW_WRITER);
1262 	error = ud_direnter(dip, linkname, DE_CREATE,
1263 		(struct ud_inode *)0, (struct ud_inode *)0, vap, &ip, cr, ct);
1264 	rw_exit(&dip->i_rwlock);
1265 	if (error == 0) {
		/* dname: one compressed component; uname: on-media link data */
1266 		dname = kmem_zalloc(1024, KM_SLEEP);
1267 		uname = kmem_zalloc(PAGESIZE, KM_SLEEP);
1268 
1269 		pc = (struct path_comp *)uname;
1270 		/*
1271 		 * If the first character in target is "/"
1272 		 * then skip it and create entry for it
1273 		 */
1274 		if (*target == '/') {
1275 			pc->pc_type = 2;
1276 			pc->pc_len = 0;
1277 			pc = (struct path_comp *)(((char *)pc) + 4);
1278 			while (*target == '/') {
1279 				target++;
1280 			}
1281 		}
1282 
1283 		while (*target != '\0') {
1284 			sp = target;
1285 			while ((*target != '/') && (*target != '\0')) {
1286 				target ++;
1287 			}
1288 			/*
1289 			 * We got the next component of the
1290 			 * path name. Create path_comp of
1291 			 * appropriate type
1292 			 */
1293 			if (((target - sp) == 1) && (*sp == '.')) {
1294 				/*
1295 				 * Dot entry.
1296 				 */
1297 				pc->pc_type = 4;
1298 				pc = (struct path_comp *)(((char *)pc) + 4);
1299 			} else if (((target - sp) == 2) &&
1300 				(*sp == '.') && ((*(sp + 1)) == '.')) {
1301 				/*
1302 				 * DotDot entry.
1303 				 */
1304 				pc->pc_type = 3;
1305 				pc = (struct path_comp *)(((char *)pc) + 4);
1306 			} else {
1307 				/*
1308 				 * convert the user given name
1309 				 * into appropriate form to be put
1310 				 * on the media
1311 				 */
1312 				outlen = 1024;	/* set to size of dname */
1313 				if (error = ud_compress(target - sp, &outlen,
1314 					(uint8_t *)sp, (uint8_t *)dname)) {
1315 					break;
1316 				}
1317 				pc->pc_type = 5;
1318 				/* LINTED */
1319 				pc->pc_len = outlen;
1320 				dname[outlen] = '\0';
				/*
				 * The compressed identifier is counted binary
				 * data (outlen bytes) and may contain zero
				 * bytes, so it must not be copied as a C
				 * string.
				 */
1321 				bcopy(dname, pc->pc_id, outlen);
1322 				pc = (struct path_comp *)
1323 					(((char *)pc) + 4 + outlen);
1324 			}
1325 			while (*target == '/') {
1326 				target++;
1327 			}
1328 			if (*target == '\0') {
1329 				break;
1330 			}
1331 		}
1332 
		/* Write the path_comp records built so far into the link. */
1333 		rw_enter(&ip->i_contents, RW_WRITER);
1334 		if (error == 0) {
1335 			ioflag = FWRITE;
1336 			if (curthread->t_flag & T_DONTPEND) {
1337 				ioflag |= FDSYNC;
1338 			}
1339 			error = ud_rdwri(UIO_WRITE, ioflag, ip,
1340 				uname, ((int8_t *)pc) - uname,
1341 				(offset_t)0, UIO_SYSSPACE, (int32_t *)0, cr);
1342 		}
1343 		if (error) {
			/* Undo: drop the inode and remove the new entry. */
1344 			ud_idrop(ip);
1345 			rw_exit(&ip->i_contents);
1346 			rw_enter(&dip->i_rwlock, RW_WRITER);
1347 			(void) ud_dirremove(dip, linkname, (struct ud_inode *)0,
1348 				(struct vnode *)0, DR_REMOVE, cr, ct);
1349 			rw_exit(&dip->i_rwlock);
1350 			goto update_inode;
1351 		}
1352 		rw_exit(&ip->i_contents);
1353 	}
1354 
1355 	if ((error == 0) || (error == EEXIST)) {
1356 		VN_RELE(ITOV(ip));
1357 	}
1358 
1359 update_inode:
1360 	ITIMES(VTOI(dvp));
1361 	if (uname != NULL) {
1362 		kmem_free(uname, PAGESIZE);
1363 	}
1364 	if (dname != NULL) {
1365 		kmem_free(dname, 1024);
1366 	}
1367 
1368 	return (error);
1369 }
1370 
/*
 * udf_readlink - read the target of symbolic link vp into uiop.
 *
 * The link body (at most PAGESIZE bytes) is read with fbread() and parsed
 * as a sequence of path_comp records, each 4 header bytes followed by
 * pc_len identifier bytes.  The records are expanded into a '/'-separated
 * path in a zeroed PAGESIZE buffer:
 *	type 1	the file system mount point (udf_fsmnt)
 *	type 2	"/" (root); pc_len must be 0
 *	type 3	".."
 *	type 4	"."
 *	type 5	a name, expanded with ud_uncompress()
 * A trailing '/' is stripped (except for a link to "/" itself) before the
 * path is copied out with uiomove().
 *
 * Returns 0 or an errno value: EINVAL if vp is not a VLNK or a record has
 * an unknown type, EIO if the link is larger than PAGESIZE, or the error
 * from fbread(), ud_uncompress() or uiomove().
 */
1371 /* ARGSUSED */
1372 static int32_t
1373 udf_readlink(
1374 	struct vnode *vp,
1375 	struct uio *uiop,
1376 	struct cred *cr,
1377 	caller_context_t *ct)
1378 {
1379 	int32_t error = 0, off, id_len, size, len;
1380 	int8_t *dname = NULL, *uname = NULL;
1381 	struct ud_inode *ip;
1382 	struct fbuf *fbp = NULL;
1383 	struct path_comp *pc;
1384 
1385 	ud_printf("udf_readlink\n");
1386 
1387 	if (vp->v_type != VLNK) {
1388 		return (EINVAL);
1389 	}
1390 
1391 	ip = VTOI(vp);
1392 	size = ip->i_size;
1393 	if (size > PAGESIZE) {
1394 		return (EIO);
1395 	}
1396 
1397 	if (size == 0) {
1398 		return (0);
1399 	}
1400 
	/* dname: one expanded component; uname: the assembled path. */
1401 	dname = kmem_zalloc(1024, KM_SLEEP);
1402 	uname = kmem_zalloc(PAGESIZE, KM_SLEEP);
1403 
1404 	rw_enter(&ip->i_contents, RW_READER);
1405 
1406 	if ((error = fbread(vp, 0, size, S_READ, &fbp)) != 0) {
1407 		goto end;
1408 	}
1409 
1410 	off = 0;
1411 
	/*
	 * NOTE(review): the strcat() calls below do not check the space left
	 * in uname (PAGESIZE bytes) -- confirm that on-media data cannot
	 * expand beyond that.
	 */
1412 	while (off < size) {
1413 		pc = (struct path_comp *)(fbp->fb_addr + off);
1414 		switch (pc->pc_type) {
1415 			case 1 :
1416 				(void) strcpy(uname, ip->i_udf->udf_fsmnt);
1417 				(void) strcat(uname, "/");
1418 				break;
1419 			case 2 :
				/*
				 * NOTE(review): a root component with a
				 * non-zero length ends the read with error
				 * still 0 and nothing copied out -- confirm
				 * that is intended.
				 */
1420 				if (pc->pc_len != 0) {
1421 					goto end;
1422 				}
1423 				uname[0] = '/';
1424 				uname[1] = '\0';
1425 				break;
1426 			case 3 :
1427 				(void) strcat(uname, "../");
1428 				break;
1429 			case 4 :
1430 				(void) strcat(uname, "./");
1431 				break;
1432 			case 5 :
				/*
				 * Bail out on failure.  A plain "break" would
				 * only leave the switch, and the error would
				 * then be overwritten by a later component or
				 * by uiomove().
				 */
1433 				if ((error = ud_uncompress(pc->pc_len, &id_len,
1434 					pc->pc_id, (uint8_t *)dname)) != 0) {
1435 					goto end;
1436 				}
1437 				dname[id_len] = '\0';
1438 				(void) strcat(uname, dname);
1439 				(void) strcat(uname, "/");
1440 				break;
1441 			default :
1442 				error = EINVAL;
1443 				goto end;
1444 		}
1445 		off += 4 + pc->pc_len;
1446 	}
	/* Index of the last character; every component appended a '/'. */
1447 	len = strlen(uname) - 1;
1448 	if (uname[len] == '/') {
1449 		if (len == 0) {
1450 			/*
1451 			 * special case link to /
1452 			 */
1453 			len = 1;
1454 		} else {
1455 			uname[len] = '\0';
1456 		}
1457 	}
1458 
1459 	error = uiomove(uname, len, UIO_READ, uiop);
1460 
1461 	ITIMES(ip);
1462 
1463 end:
1464 	if (fbp != NULL) {
1465 		fbrelse(fbp, S_OTHER);
1466 	}
1467 	rw_exit(&ip->i_contents);
1468 	if (uname != NULL) {
1469 		kmem_free(uname, PAGESIZE);
1470 	}
1471 	if (dname != NULL) {
1472 		kmem_free(dname, 1024);
1473 	}
1474 	return (error);
1475 }
1476 
/*
 * udf_fsync - flush the file behind vp.
 *
 * With i_contents held as writer, the inode is synced with ud_syncip()
 * (skipped for swap vnodes) and then ud_sync_indir() is called if that
 * succeeded.  syncflag is not examined.  Returns 0 or the first error.
 */
1477 /* ARGSUSED */
1478 static int32_t
1479 udf_fsync(
1480 	struct vnode *vp,
1481 	int32_t syncflag,
1482 	struct cred *cr,
1483 	caller_context_t *ct)
1484 {
1485 	int32_t error = 0;
1486 	struct ud_inode *ip = VTOI(vp);
1487 
1488 	ud_printf("udf_fsync\n");
1489 
1490 	rw_enter(&ip->i_contents, RW_WRITER);
1491 	if (!(IS_SWAPVP(vp))) {
1492 		error = ud_syncip(ip, 0, I_SYNC); /* Do synchronous writes */
1493 	}
1494 	if (error == 0) {
1495 		error = ud_sync_indir(ip);
1496 	}
1497 	ITIMES(ip);		/* XXX: is this necessary ??? */
1498 	rw_exit(&ip->i_contents);
1499 
1500 	return (error);
1501 }
1502 
/*
 * udf_inactive - VOP inactive entry; hands the inode behind vp to
 * ud_iinactive() together with the caller's credentials.
 */
1503 /* ARGSUSED */
1504 static void
1505 udf_inactive(struct vnode *vp, struct cred *cr, caller_context_t *ct)
1506 {
1507 	ud_printf("udf_iinactive\n");
1508 
1509 	ud_iinactive(VTOI(vp), cr);
1510 }
1511 
/*
 * udf_fid - fill in a file identifier for vp.
 *
 * If the caller's fid is too small, fid_len is set to the required length
 * (sizeof (struct udf_fid) minus the length field) and ENOSPC is returned.
 * Otherwise the fid is zeroed and, under i_contents as reader, filled with
 * the low 32 bits of i_uniqid, i_icb_prn and i_icb_block.  Returns 0.
 */
1512 /* ARGSUSED */
1513 static int32_t
1514 udf_fid(struct vnode *vp, struct fid *fidp, caller_context_t *ct)
1515 {
1516 	struct udf_fid *udfidp;
1517 	struct ud_inode *ip = VTOI(vp);
1518 
1519 	ud_printf("udf_fid\n");
1520 
1521 	if (fidp->fid_len < (sizeof (struct udf_fid) - sizeof (uint16_t))) {
1522 		fidp->fid_len = sizeof (struct udf_fid) - sizeof (uint16_t);
1523 		return (ENOSPC);
1524 	}
1525 
1526 	udfidp = (struct udf_fid *)fidp;
1527 	bzero((char *)udfidp, sizeof (struct udf_fid));
1528 	rw_enter(&ip->i_contents, RW_READER);
1529 	udfidp->udfid_len = sizeof (struct udf_fid) - sizeof (uint16_t);
1530 	udfidp->udfid_uinq_lo = ip->i_uniqid & 0xffffffff;
1531 	udfidp->udfid_prn = ip->i_icb_prn;
1532 	udfidp->udfid_icb_lbn = ip->i_icb_block;
1533 	rw_exit(&ip->i_contents);
1534 
1535 	return (0);
1536 }
1537 
/*
 * udf_rwlock - acquire the inode's i_rwlock as writer when write_lock is
 * non-zero, otherwise as reader.  Returns write_lock unchanged.  The lock
 * is released by udf_rwunlock(); the rw_exit() below is compiled only
 * under __lock_lint.
 */
1538 /* ARGSUSED2 */
1539 static int
1540 udf_rwlock(struct vnode *vp, int32_t write_lock, caller_context_t *ctp)
1541 {
1542 	struct ud_inode *ip = VTOI(vp);
1543 
1544 	ud_printf("udf_rwlock\n");
1545 
1546 	if (write_lock) {
1547 		rw_enter(&ip->i_rwlock, RW_WRITER);
1548 	} else {
1549 		rw_enter(&ip->i_rwlock, RW_READER);
1550 	}
1551 #ifdef	__lock_lint
1552 	rw_exit(&ip->i_rwlock);
1553 #endif
1554 	return (write_lock);
1555 }
1556 
/*
 * udf_rwunlock - release the inode's i_rwlock taken by udf_rwlock().
 * write_lock is not examined.  The rw_enter() below is compiled only
 * under __lock_lint.
 */
1557 /* ARGSUSED */
1558 static void
1559 udf_rwunlock(struct vnode *vp, int32_t write_lock, caller_context_t *ctp)
1560 {
1561 	struct ud_inode *ip = VTOI(vp);
1562 
1563 	ud_printf("udf_rwunlock\n");
1564 
1565 #ifdef	__lock_lint
1566 	rw_enter(&ip->i_rwlock, RW_WRITER);
1567 #endif
1568 
1569 	rw_exit(&ip->i_rwlock);
1570 
1571 }
1572 
/*
 * udf_seek - validate a new file offset.  Returns EINVAL if *noffp is
 * negative or greater than MAXOFFSET_T, otherwise 0.  Neither vp nor ooff
 * is examined and *noffp is not modified.
 */
1573 /* ARGSUSED */
1574 static int32_t
1575 udf_seek(struct vnode *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct)
1576 {
1577 	return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
1578 }
1579 
/*
 * udf_frlock - file/record locking entry point.
 *
 * Returns EAGAIN if the file is currently mapped (i_mapcnt > 0) and
 * MANDLOCK() is true for it; otherwise the request is passed unchanged
 * to fs_frlock() and its result is returned.
 */
1580 static int32_t
1581 udf_frlock(
1582 	struct vnode *vp,
1583 	int32_t cmd,
1584 	struct flock64 *bfp,
1585 	int32_t flag,
1586 	offset_t offset,
1587 	struct flk_callback *flk_cbp,
1588 	cred_t *cr,
1589 	caller_context_t *ct)
1590 {
1591 	struct ud_inode *ip = VTOI(vp);
1592 
1593 	ud_printf("udf_frlock\n");
1594 
1595 	/*
1596 	 * If file is being mapped, disallow frlock.
1597 	 * XXX I am not holding tlock while checking i_mapcnt because the
1598 	 * current locking strategy drops all locks before calling fs_frlock.
1599 	 * So, mapcnt could change before we enter fs_frlock making it
1600 	 * meaningless to have held tlock in the first place.
1601 	 */
1602 	if ((ip->i_mapcnt > 0) &&
1603 		(MANDLOCK(vp, ip->i_char))) {
1604 		return (EAGAIN);
1605 	}
1606 
1607 	return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
1608 }
1609 
/*
 * udf_space - VOP space entry.  Only F_FREESP is supported; any other cmd
 * yields EINVAL.  For F_FREESP the flock64 is first converted with
 * convoff(vp, bfp, 0, offset) and, if that succeeds, the space is freed
 * with ud_freesp().  Returns 0 or an errno value.
 */
1610 /*ARGSUSED6*/
1611 static int32_t
1612 udf_space(
1613 	struct vnode *vp,
1614 	int32_t cmd,
1615 	struct flock64 *bfp,
1616 	int32_t flag,
1617 	offset_t offset,
1618 	cred_t *cr,
1619 	caller_context_t *ct)
1620 {
1621 	int32_t error = 0;
1622 
1623 	ud_printf("udf_space\n");
1624 
1625 	if (cmd != F_FREESP) {
1626 		error =  EINVAL;
1627 	} else if ((error = convoff(vp, bfp, 0, offset)) == 0) {
1628 		error = ud_freesp(vp, bfp, flag, cr);
1629 	}
1630 
1631 	return (error);
1632 }
1633 
/*
 * udf_getpage - VOP getpage entry: return the pages covering
 * [off, off + len) of vp in plarr.
 *
 * i_contents is taken here unless the calling thread already owns it
 * (dolock).  For S_WRITE/S_CREATE requests on a file that has holes or
 * that reach beyond EOF, the lock is upgraded to writer and
 * ud_bmap_write() is called for each logical block in the range, up to
 * i_size, before any page is looked up.  Pages are then obtained with
 * page_lookup() or, on a miss, through ud_getpage_miss().  When plarr is
 * NULL the call only starts read ahead (ud_getpage_ra()) for each page.
 *
 * *protp, when supplied, is set to PROT_ALL, minus PROT_WRITE for
 * non-write requests on a file with holes.
 *
 * Returns 0 or an errno value: ENOSYS for VNOMAP vnodes, EFAULT for a
 * request beyond EOF that does not come from segkmap, or the error from
 * ud_bmap_write()/ud_getpage_miss().  On a page lookup error all pages
 * already locked are released and plarr[0] is set to NULL.
 */
1634 /* ARGSUSED */
1635 static int32_t
1636 udf_getpage(
1637 	struct vnode *vp,
1638 	offset_t off,
1639 	size_t len,
1640 	uint32_t *protp,
1641 	struct page **plarr,
1642 	size_t plsz,
1643 	struct seg *seg,
1644 	caddr_t addr,
1645 	enum seg_rw rw,
1646 	struct cred *cr,
1647 	caller_context_t *ct)
1648 {
1649 	struct ud_inode *ip = VTOI(vp);
1650 	int32_t error, has_holes, beyond_eof, seqmode, dolock;
1651 	int32_t pgsize = PAGESIZE;
1652 	struct udf_vfs *udf_vfsp = ip->i_udf;
1653 	page_t **pl;
1654 	u_offset_t pgoff, eoff, uoff;
1655 	krw_t rwtype;
1656 	caddr_t pgaddr;
1657 
1658 	ud_printf("udf_getpage\n");
1659 
1660 	uoff = (u_offset_t)off; /* type conversion */
1661 	if (protp) {
1662 		*protp = PROT_ALL;
1663 	}
1664 	if (vp->v_flag & VNOMAP) {
1665 		return (ENOSYS);
1666 	}
	/* Sequential if this request starts where the previous one ended. */
1667 	seqmode = ip->i_nextr == uoff && rw != S_CREATE;
1668 
1669 	rwtype = RW_READER;
	/* Do not take i_contents again if this thread already owns it. */
1670 	dolock = (rw_owner(&ip->i_contents) != curthread);
1671 retrylock:
1672 #ifdef	__lock_lint
1673 	rw_enter(&ip->i_contents, rwtype);
1674 #else
1675 	if (dolock) {
1676 		rw_enter(&ip->i_contents, rwtype);
1677 	}
1678 #endif
1679 
1680 	/*
1681 	 * We may be getting called as a side effect of a bmap using
1682 	 * fbread() when the blocks might be being allocated and the
1683 	 * size has not yet been up'ed.  In this case we want to be
1684 	 * able to return zero pages if we get back UDF_HOLE from
1685 	 * calling bmap for a non write case here.  We also might have
1686 	 * to read some frags from the disk into a page if we are
1687 	 * extending the number of frags for a given lbn in bmap().
1688 	 */
1689 	beyond_eof = uoff + len > ip->i_size + PAGEOFFSET;
1690 	if (beyond_eof && seg != segkmap) {
1691 #ifdef	__lock_lint
1692 		rw_exit(&ip->i_contents);
1693 #else
1694 		if (dolock) {
1695 			rw_exit(&ip->i_contents);
1696 		}
1697 #endif
1698 		return (EFAULT);
1699 	}
1700 
1701 	/*
1702 	 * Must hold i_contents lock throughout the call to pvn_getpages
1703 	 * since locked pages are returned from each call to ud_getapage.
1704 	 * Must *not* return locked pages and then try for contents lock
1705 	 * due to lock ordering requirements (inode > page)
1706 	 */
1707 
1708 	has_holes = ud_bmap_has_holes(ip);
1709 
1710 	if ((rw == S_WRITE || rw == S_CREATE) && (has_holes || beyond_eof)) {
1711 		int32_t	blk_size, count;
1712 		u_offset_t offset;
1713 
1714 		/*
1715 		 * We must acquire the RW_WRITER lock in order to
1716 		 * call bmap_write().
1717 		 */
1718 		if (dolock && rwtype == RW_READER) {
1719 			rwtype = RW_WRITER;
1720 
			/*
			 * If the upgrade fails, drop the lock and start over
			 * as a writer; beyond_eof and has_holes are then
			 * recomputed under the new lock.
			 */
1721 			if (!rw_tryupgrade(&ip->i_contents)) {
1722 
1723 				rw_exit(&ip->i_contents);
1724 
1725 				goto retrylock;
1726 			}
1727 		}
1728 
1729 		/*
1730 		 * May be allocating disk blocks for holes here as
1731 		 * a result of mmap faults. write(2) does the bmap_write
1732 		 * in rdip/wrip, not here. We are not dealing with frags
1733 		 * in this case.
1734 		 */
1735 		offset = uoff;
1736 		while ((offset < uoff + len) &&
1737 			(offset < ip->i_size)) {
1738 			/*
1739 			 * the variable "bnp" is to simplify the expression for
1740 			 * the compiler; * just passing in &bn to bmap_write
1741 			 * causes a compiler "loop"
1742 			 */
1743 
			/* One logical block at a time, clipped at i_size. */
1744 			blk_size = udf_vfsp->udf_lbsize;
1745 			if ((offset + blk_size) > ip->i_size) {
1746 				count = ip->i_size - offset;
1747 			} else {
1748 				count = blk_size;
1749 			}
1750 			error = ud_bmap_write(ip, offset, count, 0, cr);
1751 			if (error) {
1752 				goto update_inode;
1753 			}
1754 			offset += count; /* XXX - make this contig */
1755 		}
1756 	}
1757 
1758 	/*
1759 	 * Can be a reader from now on.
1760 	 */
1761 #ifdef	__lock_lint
1762 	if (rwtype == RW_WRITER) {
1763 		rw_downgrade(&ip->i_contents);
1764 	}
1765 #else
1766 	if (dolock && rwtype == RW_WRITER) {
1767 		rw_downgrade(&ip->i_contents);
1768 	}
1769 #endif
1770 
1771 	/*
1772 	 * We remove PROT_WRITE in cases when the file has UDF holes
1773 	 * because we don't  want to call bmap_read() to check each
1774 	 * page if it is backed with a disk block.
1775 	 */
1776 	if (protp && has_holes && rw != S_WRITE && rw != S_CREATE) {
1777 		*protp &= ~PROT_WRITE;
1778 	}
1779 
1780 	error = 0;
1781 
1782 	/*
1783 	 * The loop looks up pages in the range <off, off + len).
1784 	 * For each page, we first check if we should initiate an asynchronous
1785 	 * read ahead before we call page_lookup (we may sleep in page_lookup
1786 	 * for a previously initiated disk read).
1787 	 */
1788 	eoff = (uoff + len);
1789 	for (pgoff = uoff, pgaddr = addr, pl = plarr;
1790 			pgoff < eoff; /* empty */) {
1791 		page_t	*pp;
1792 		u_offset_t	nextrio;
1793 		se_t	se;
1794 
1795 		se = ((rw == S_CREATE) ? SE_EXCL : SE_SHARED);
1796 
1797 		/*
1798 		 * Handle async getpage (faultahead)
1799 		 */
1800 		if (plarr == NULL) {
1801 			ip->i_nextrio = pgoff;
1802 			ud_getpage_ra(vp, pgoff, seg, pgaddr);
1803 			pgoff += pgsize;
1804 			pgaddr += pgsize;
1805 			continue;
1806 		}
1807 
1808 		/*
1809 		 * Check if we should initiate read ahead of next cluster.
1810 		 * We call page_exists only when we need to confirm that
1811 		 * we have the current page before we initiate the read ahead.
1812 		 */
1813 		nextrio = ip->i_nextrio;
1814 		if (seqmode &&
1815 		    pgoff + RD_CLUSTSZ(ip) >= nextrio && pgoff <= nextrio &&
1816 		    nextrio < ip->i_size && page_exists(vp, pgoff))
1817 			ud_getpage_ra(vp, pgoff, seg, pgaddr);
1818 
1819 		if ((pp = page_lookup(vp, pgoff, se)) != NULL) {
1820 
1821 			/*
1822 			 * We found the page in the page cache.
1823 			 */
1824 			*pl++ = pp;
1825 			pgoff += pgsize;
1826 			pgaddr += pgsize;
1827 			len -= pgsize;
1828 			plsz -= pgsize;
1829 		} else  {
1830 
1831 			/*
1832 			 * We have to create the page, or read it from disk.
1833 			 */
1834 			if (error = ud_getpage_miss(vp, pgoff, len,
1835 				seg, pgaddr, pl, plsz, rw, seqmode)) {
1836 				goto error_out;
1837 			}
1838 
			/*
			 * Step over every page ud_getpage_miss() put in the
			 * list.  If it returned none, nothing advances and
			 * the same offset is looked up again.
			 */
1839 			while (*pl != NULL) {
1840 				pl++;
1841 				pgoff += pgsize;
1842 				pgaddr += pgsize;
1843 				len -= pgsize;
1844 				plsz -= pgsize;
1845 			}
1846 		}
1847 	}
1848 
1849 	/*
1850 	 * Return pages up to plsz if they are in the page cache.
1851 	 * We cannot return pages if there is a chance that they are
1852 	 * backed with a UDF hole and rw is S_WRITE or S_CREATE.
1853 	 */
1854 	if (plarr && !(has_holes && (rw == S_WRITE || rw == S_CREATE))) {
1855 
1856 		ASSERT((protp == NULL) ||
1857 			!(has_holes && (*protp & PROT_WRITE)));
1858 
1859 		eoff = pgoff + plsz;
1860 		while (pgoff < eoff) {
1861 			page_t		*pp;
1862 
1863 			if ((pp = page_lookup_nowait(vp, pgoff,
1864 			    SE_SHARED)) == NULL)
1865 				break;
1866 
1867 			*pl++ = pp;
1868 			pgoff += pgsize;
1869 			plsz -= pgsize;
1870 		}
1871 	}
1872 
1873 	if (plarr)
1874 		*pl = NULL;			/* Terminate page list */
	/* Remember where this request ended for the next seqmode test. */
1875 	ip->i_nextr = pgoff;
1876 
1877 error_out:
1878 	if (error && plarr) {
1879 		/*
1880 		 * Release any pages we have locked.
1881 		 */
1882 		while (pl > &plarr[0])
1883 			page_unlock(*--pl);
1884 
1885 		plarr[0] = NULL;
1886 	}
1887 
1888 update_inode:
1889 #ifdef	__lock_lint
1890 	rw_exit(&ip->i_contents);
1891 #else
1892 	if (dolock) {
1893 		rw_exit(&ip->i_contents);
1894 	}
1895 #endif
1896 
1897 	/*
1898 	 * If the inode is not already marked for IACC (in rwip() for read)
1899 	 * and the inode is not marked for no access time update (in rwip()
1900 	 * for write) then update the inode access time and mod time now.
1901 	 */
1902 	mutex_enter(&ip->i_tlock);
1903 	if ((ip->i_flag & (IACC | INOACC)) == 0) {
1904 		if ((rw != S_OTHER) && (ip->i_type != VDIR)) {
1905 			ip->i_flag |= IACC;
1906 		}
1907 		if (rw == S_WRITE) {
1908 			ip->i_flag |= IUPD;
1909 		}
1910 		ITIMES_NOLOCK(ip);
1911 	}
1912 	mutex_exit(&ip->i_tlock);
1913 
1914 	return (error);
1915 }
1916 
/*
 * Non-zero enables the delayed-write clustering of B_ASYNC requests in
 * udf_putpage().
 */
1917 int32_t ud_delay = 1;
1918 
/*
 * udf_putpage - VOP putpage entry: write out pages of vp in the range
 * [off, off + len).
 *
 * Requests are normally passed straight to ud_putpages().  When ud_delay
 * is set, len is non-zero and flags contain B_ASYNC and nothing beyond
 * B_ASYNC, B_DONTNEED and B_FREE, the request is instead remembered in
 * i_delayoff/i_delaylen (under i_tlock):
 *  - if no range is pending, this one becomes the pending range;
 *  - if the pending range has reached WR_CLUSTSZ(ip) or is not contiguous
 *    with this request, the pending range is pushed with ud_putpages()
 *    and this request becomes the new pending range;
 *  - otherwise the pending range is extended by len.
 *
 * Returns 0 or an errno value: EINVAL if v_count is 0, ENOSYS for VNOMAP
 * vnodes, or the result of ud_putpages().
 */
1919 /* ARGSUSED */
1920 static int32_t
1921 udf_putpage(
1922 	struct vnode *vp,
1923 	offset_t off,
1924 	size_t len,
1925 	int32_t flags,
1926 	struct cred *cr,
1927 	caller_context_t *ct)
1928 {
1929 	struct ud_inode *ip;
1930 	int32_t error = 0;
1931 
1932 	ud_printf("udf_putpage\n");
1933 
1934 	ip = VTOI(vp);
1935 #ifdef	__lock_lint
1936 	rw_enter(&ip->i_contents, RW_WRITER);
1937 #endif
1938 
1939 	if (vp->v_count == 0) {
1940 		cmn_err(CE_WARN, "ud_putpage : bad v_count");
1941 		error = EINVAL;
1942 		goto out;
1943 	}
1944 
1945 	if (vp->v_flag & VNOMAP) {
1946 		error = ENOSYS;
1947 		goto out;
1948 	}
1949 
1950 	if (flags & B_ASYNC) {
1951 		if (ud_delay && len &&
1952 		    (flags & ~(B_ASYNC|B_DONTNEED|B_FREE)) == 0) {
1953 			mutex_enter(&ip->i_tlock);
1954 
1955 			/*
1956 			 * If nobody stalled, start a new cluster.
1957 			 */
1958 			if (ip->i_delaylen == 0) {
1959 				ip->i_delayoff = off;
1960 				ip->i_delaylen = len;
1961 				mutex_exit(&ip->i_tlock);
1962 				goto out;
1963 			}
1964 
1965 			/*
1966 			 * If we have a full cluster or they are not contig,
1967 			 * then push last cluster and start over.
1968 			 */
1969 			if (ip->i_delaylen >= WR_CLUSTSZ(ip) ||
1970 			    ip->i_delayoff + ip->i_delaylen != off) {
1971 				u_offset_t doff;
1972 				size_t dlen;
1973 
				/*
				 * Swap the pending range for the new one while
				 * holding i_tlock; the old range is written
				 * after the mutex has been dropped.
				 */
1974 				doff = ip->i_delayoff;
1975 				dlen = ip->i_delaylen;
1976 				ip->i_delayoff = off;
1977 				ip->i_delaylen = len;
1978 				mutex_exit(&ip->i_tlock);
1979 				error = ud_putpages(vp, doff, dlen, flags, cr);
1980 				/* LMXXX - flags are new val, not old */
1981 				goto out;
1982 			}
1983 
1984 			/*
1985 			 * There is something there, it's not full, and
1986 			 * it is contig.
1987 			 */
1988 			ip->i_delaylen += len;
1989 			mutex_exit(&ip->i_tlock);
1990 			goto out;
1991 		}
1992 
1993 		/*
1994 		 * Must have weird flags or we are not clustering.
1995 		 */
1996 	}
1997 
1998 	error = ud_putpages(vp, off, len, flags, cr);
1999 
2000 out:
2001 #ifdef	__lock_lint
2002 	rw_exit(&ip->i_contents);
2003 #endif
2004 	return (error);
2005 }
2006 
/*
 * udf_map - VOP map entry: map [off, off + len) of vp into address space
 * "as".
 *
 * After validating the request, an address is chosen with choose_addr()
 * and a segvn segment is created with as_map(), both under the address
 * space range lock.  *addrp receives the chosen address.
 *
 * Returns 0 or an errno value: ENOSYS for VNOMAP vnodes, EINVAL if off is
 * negative or off + len wraps negative, ENODEV if vp is not a regular
 * file, EAGAIN if the file has mandatory locks, or the error from
 * choose_addr()/as_map().
 */
2007 /* ARGSUSED */
2008 static int32_t
2009 udf_map(
2010 	struct vnode *vp,
2011 	offset_t off,
2012 	struct as *as,
2013 	caddr_t *addrp,
2014 	size_t len,
2015 	uint8_t prot,
2016 	uint8_t maxprot,
2017 	uint32_t flags,
2018 	struct cred *cr,
2019 	caller_context_t *ct)
2020 {
2021 	struct segvn_crargs vn_a;
2022 	int32_t error = 0;
2023 
2024 	ud_printf("udf_map\n");
2025 
2026 	if (vp->v_flag & VNOMAP) {
2027 		error = ENOSYS;
2028 		goto end;
2029 	}
2030 
	/*
	 * The sum must be converted back to offset_t before the sign test:
	 * where size_t is as wide as offset_t, "off + len" is computed as an
	 * unsigned value and could never compare less than zero.
	 */
2031 	if ((off < (offset_t)0) ||
2032 		((offset_t)(off + len) < (offset_t)0)) {
2033 		error = EINVAL;
2034 		goto end;
2035 	}
2036 
2037 	if (vp->v_type != VREG) {
2038 		error = ENODEV;
2039 		goto end;
2040 	}
2041 
2042 	/*
2043 	 * If file is being locked, disallow mapping.
2044 	 */
2045 	if (vn_has_mandatory_locks(vp, VTOI(vp)->i_char)) {
2046 		error = EAGAIN;
2047 		goto end;
2048 	}
2049 
2050 	as_rangelock(as);
2051 	error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
2052 	if (error != 0) {
2053 		as_rangeunlock(as);
2054 		goto end;
2055 	}
2056 
	/* Arguments for segvn_create(): a plain vnode-backed segment. */
2057 	vn_a.vp = vp;
2058 	vn_a.offset = off;
2059 	vn_a.type = flags & MAP_TYPE;
2060 	vn_a.prot = prot;
2061 	vn_a.maxprot = maxprot;
2062 	vn_a.cred = cr;
2063 	vn_a.amp = NULL;
2064 	vn_a.flags = flags & ~MAP_TYPE;
2065 	vn_a.szc = 0;
2066 	vn_a.lgrp_mem_policy_flags = 0;
2067 
2068 	error = as_map(as, *addrp, len, segvn_create, (caddr_t)&vn_a);
2069 	as_rangeunlock(as);
2070 
2071 end:
2072 	return (error);
2073 }
2074 
/*
 * udf_addmap - account for a new mapping of vp: adds btopr(len) to the
 * inode's i_mapcnt under i_tlock.  Returns ENOSYS for VNOMAP vnodes,
 * otherwise 0.
 */
2075 /* ARGSUSED */
2076 static int32_t
2077 udf_addmap(struct vnode *vp,
2078 	offset_t off,
2079 	struct as *as,
2080 	caddr_t addr,
2081 	size_t len,
2082 	uint8_t prot,
2083 	uint8_t maxprot,
2084 	uint32_t flags,
2085 	struct cred *cr,
2086 	caller_context_t *ct)
2087 {
2088 	struct ud_inode *ip = VTOI(vp);
2089 
2090 	ud_printf("udf_addmap\n");
2091 
2092 	if (vp->v_flag & VNOMAP) {
2093 		return (ENOSYS);
2094 	}
2095 
2096 	mutex_enter(&ip->i_tlock);
2097 	ip->i_mapcnt += btopr(len);
2098 	mutex_exit(&ip->i_tlock);
2099 
2100 	return (0);
2101 }
2102 
/*
 * udf_delmap - account for a removed mapping of vp: subtracts btopr(len)
 * from the inode's i_mapcnt under i_tlock (the counterpart of
 * udf_addmap()).  Returns ENOSYS for VNOMAP vnodes, otherwise 0.
 */
2103 /* ARGSUSED */
2104 static int32_t
2105 udf_delmap(
2106 	struct vnode *vp, offset_t off,
2107 	struct as *as,
2108 	caddr_t addr,
2109 	size_t len,
2110 	uint32_t prot,
2111 	uint32_t maxprot,
2112 	uint32_t flags,
2113 	struct cred *cr,
2114 	caller_context_t *ct)
2115 {
2116 	struct ud_inode *ip = VTOI(vp);
2117 
2118 	ud_printf("udf_delmap\n");
2119 
2120 	if (vp->v_flag & VNOMAP) {
2121 		return (ENOSYS);
2122 	}
2123 
2124 	mutex_enter(&ip->i_tlock);
2125 	ip->i_mapcnt -= btopr(len); 	/* Count released mappings */
2126 	ASSERT(ip->i_mapcnt >= 0);
2127 	mutex_exit(&ip->i_tlock);
2128 
2129 	return (0);
2130 }
2131 
/*
 * udf_l_pathconf - VOP pathconf entry.  Answers _PC_FILESIZEBITS itself
 * (41) and hands every other cmd to fs_pathconf().  Returns 0 or the
 * error from fs_pathconf().
 */
2132 /* ARGSUSED */
2133 static int32_t
2134 udf_l_pathconf(
2135 	struct vnode *vp,
2136 	int32_t cmd,
2137 	ulong_t *valp,
2138 	struct cred *cr,
2139 	caller_context_t *ct)
2140 {
2141 	int32_t error = 0;
2142 
2143 	ud_printf("udf_l_pathconf\n");
2144 
2145 	if (cmd == _PC_FILESIZEBITS) {
2146 		/*
2147 		 * udf supports 64 bits as file size
2148 		 * but there are several other restrictions
2149 		 * it only supports 32-bit block numbers and
2150 		 * daddr32_t is only an int32_t so taking these
2151 		 * into account we can stay just as where ufs is
2152 		 */
2153 		*valp = 41;
2154 	} else {
2155 		error = fs_pathconf(vp, cmd, valp, cr, ct);
2156 	}
2157 
2158 	return (error);
2159 }
2160 
/*
 * Counts of the read and write chunks issued by udf_pageio().  They are
 * updated without any lock.
 */
2161 uint32_t ud_pageio_reads = 0, ud_pageio_writes = 0;
2162 #ifndef	__lint
2163 _NOTE(SCHEME_PROTECTS_DATA("safe sharing", ud_pageio_reads))
2164 _NOTE(SCHEME_PROTECTS_DATA("safe sharing", ud_pageio_writes))
2165 #endif
2166 /*
2167  * Assumption is that there will not be a pageio request
2168  * to an embedded file
2169  */
/*
 * udf_pageio - VOP pageio entry: perform I/O on the page list pp for
 * [io_off, io_off + io_len) of vp directly against the device.
 *
 * The request is split into chunks, one per contiguous run of disk blocks
 * reported by ud_bmap_read().  Each chunk is issued with bdev_strategy(),
 * or with ud_multi_strat() when the logical block size is smaller than a
 * page and the contiguous run is shorter than a page.  For synchronous
 * requests each chunk is waited for and the page list is re-assembled for
 * the caller; for B_ASYNC requests the pieces are left split.
 *
 * i_contents is taken as reader unless the calling thread already owns it.
 *
 * Returns 0 or an errno value: EINVAL if pp is NULL or the file has a
 * hole in the range, EIO if ip->i_udf is NULL, or the error from
 * ud_bmap_read(), ud_multi_strat() or biowait().
 */
2170 /* ARGSUSED */
2171 static int32_t
2172 udf_pageio(
2173 	struct vnode *vp,
2174 	struct page *pp,
2175 	u_offset_t io_off,
2176 	size_t io_len,
2177 	int32_t flags,
2178 	struct cred *cr,
2179 	caller_context_t *ct)
2180 {
2181 	daddr_t bn;
2182 	struct buf *bp;
2183 	struct ud_inode *ip = VTOI(vp);
	/*
	 * multi_io must start out as 0: it is only assigned below when the
	 * logical block size is smaller than PAGESIZE, but it is tested for
	 * every chunk.
	 */
2184 	int32_t dolock, error = 0, contig, multi_io = 0;
2185 	size_t done_len = 0, cur_len = 0;
	/* cpp: current chunk; npp: rest of the list; opp: completed part. */
2186 	page_t *npp = NULL, *opp = NULL, *cpp = pp;
2187 
2188 	if (pp == NULL) {
2189 		return (EINVAL);
2190 	}
2191 
2192 	dolock = (rw_owner(&ip->i_contents) != curthread);
2193 
2194 	/*
2195 	 * We need a better check.  Ideally, we would use another
2196 	 * vnodeops so that hlocked and forcibly unmounted file
2197 	 * systems would return EIO where appropriate and w/o the
2198 	 * need for these checks.
2199 	 */
2200 	if (ip->i_udf == NULL) {
2201 		return (EIO);
2202 	}
2203 
2204 #ifdef	__lock_lint
2205 	rw_enter(&ip->i_contents, RW_READER);
2206 #else
2207 	if (dolock) {
2208 		rw_enter(&ip->i_contents, RW_READER);
2209 	}
2210 #endif
2211 
2212 	/*
2213 	 * Break the io request into chunks, one for each contiguous
2214 	 * stretch of disk blocks in the target file.
2215 	 */
2216 	while (done_len < io_len) {
2217 		ASSERT(cpp);
2218 		bp = NULL;
2219 		contig = 0;
2220 		if (error = ud_bmap_read(ip, (u_offset_t)(io_off + done_len),
2221 					&bn, &contig)) {
2222 			break;
2223 		}
2224 
2225 		if (bn == UDF_HOLE) {   /* No holey swapfiles */
2226 			cmn_err(CE_WARN, "SWAP file has HOLES");
2227 			error = EINVAL;
2228 			break;
2229 		}
2230 
2231 		cur_len = MIN(io_len - done_len, contig);
2232 
2233 		/*
2234 		 * Check if more than one I/O is
2235 		 * required to complete the given
2236 		 * I/O operation
2237 		 */
2238 		if (ip->i_udf->udf_lbsize < PAGESIZE) {
2239 			if (cur_len >= PAGESIZE) {
2240 				multi_io = 0;
2241 				cur_len &= PAGEMASK;
2242 			} else {
2243 				multi_io = 1;
2244 				cur_len = MIN(io_len - done_len, PAGESIZE);
2245 			}
2246 		}
		/* Split off the pages of this chunk; the rest stays in npp. */
2247 		page_list_break(&cpp, &npp, btop(cur_len));
2248 
2249 		bp = pageio_setup(cpp, cur_len, ip->i_devvp, flags);
2250 		ASSERT(bp != NULL);
2251 
2252 		bp->b_edev = ip->i_dev;
2253 		bp->b_dev = cmpdev(ip->i_dev);
2254 		bp->b_blkno = bn;
2255 		bp->b_un.b_addr = (caddr_t)0;
2256 		bp->b_file = vp;
2257 		bp->b_offset = (offset_t)(io_off + done_len);
2258 
2259 /*
2260  *		ub.ub_pageios.value.ul++;
2261  */
2262 		if (multi_io == 0) {
2263 			(void) bdev_strategy(bp);
2264 		} else {
2265 			error = ud_multi_strat(ip, cpp, bp,
2266 				(u_offset_t)(io_off + done_len));
2267 			if (error != 0) {
2268 				pageio_done(bp);
2269 				break;
2270 			}
2271 		}
2272 		if (flags & B_READ) {
2273 			ud_pageio_reads++;
2274 		} else {
2275 			ud_pageio_writes++;
2276 		}
2277 
2278 		/*
2279 		 * If the request is not B_ASYNC, wait for i/o to complete
2280 		 * and re-assemble the page list to return to the caller.
2281 		 * If it is B_ASYNC we leave the page list in pieces and
2282 		 * cleanup() will dispose of them.
2283 		 */
2284 		if ((flags & B_ASYNC) == 0) {
2285 			error = biowait(bp);
2286 			pageio_done(bp);
2287 			if (error) {
2288 				break;
2289 			}
2290 			page_list_concat(&opp, &cpp);
2291 		}
2292 		cpp = npp;
2293 		npp = NULL;
2294 		done_len += cur_len;
2295 	}
2296 
2297 	ASSERT(error || (cpp == NULL && npp == NULL && done_len == io_len));
2298 	if (error) {
2299 		if (flags & B_ASYNC) {
2300 			/* Cleanup unprocessed parts of list */
2301 			page_list_concat(&cpp, &npp);
2302 			if (flags & B_READ) {
2303 				pvn_read_done(cpp, B_ERROR);
2304 			} else {
2305 				pvn_write_done(cpp, B_ERROR);
2306 			}
2307 		} else {
2308 			/* Re-assemble list and let caller clean up */
2309 			page_list_concat(&opp, &cpp);
2310 			page_list_concat(&opp, &npp);
2311 		}
2312 	}
2313 
2314 #ifdef	__lock_lint
2315 	rw_exit(&ip->i_contents);
2316 #else
2317 	if (dolock) {
2318 		rw_exit(&ip->i_contents);
2319 	}
2320 #endif
2321 	return (error);
2322 }
2323 
2324 
2325 
2326 
2327 /* -------------------- local functions --------------------------- */
2328 
2329 
2330 
/*
 * ud_rdwri - read or write "len" bytes at "offset" of inode ip from/to
 * the single buffer "base", which lives in address space "seg".
 *
 * A one-element uio is built around the buffer and passed to ud_wrip()
 * (rw == UIO_WRITE, limited by the current process's p_fsz_ctl) or to
 * ud_rdip() (otherwise, limited by MAXOFFSET_T).
 *
 * If aresid is non-NULL the residual count is stored there; if it is NULL
 * any non-zero residual is reported as EIO, replacing whatever error the
 * transfer itself returned.  Returns 0 or an errno value.
 */
2331 int32_t
2332 ud_rdwri(enum uio_rw rw, int32_t ioflag,
2333 	struct ud_inode *ip, caddr_t base, int32_t len,
2334 	offset_t offset, enum uio_seg seg, int32_t *aresid, struct cred *cr)
2335 {
2336 	int32_t error;
2337 	struct uio auio;
2338 	struct iovec aiov;
2339 
2340 	ud_printf("ud_rdwri\n");
2341 
2342 	bzero((caddr_t)&auio, sizeof (uio_t));
2343 	bzero((caddr_t)&aiov, sizeof (iovec_t));
2344 
2345 	aiov.iov_base = base;
2346 	aiov.iov_len = len;
2347 	auio.uio_iov = &aiov;
2348 	auio.uio_iovcnt = 1;
2349 	auio.uio_loffset = offset;
2350 	auio.uio_segflg = (int16_t)seg;
2351 	auio.uio_resid = len;
2352 
2353 	if (rw == UIO_WRITE) {
2354 		auio.uio_fmode = FWRITE;
2355 		auio.uio_extflg = UIO_COPY_DEFAULT;
2356 		auio.uio_llimit = curproc->p_fsz_ctl;
2357 		error = ud_wrip(ip, &auio, ioflag, cr);
2358 	} else {
2359 		auio.uio_fmode = FREAD;
2360 		auio.uio_extflg = UIO_COPY_CACHED;
2361 		auio.uio_llimit = MAXOFFSET_T;
2362 		error = ud_rdip(ip, &auio, ioflag, cr);
2363 	}
2364 
2365 	if (aresid) {
2366 		*aresid = auio.uio_resid;
2367 	} else if (auio.uio_resid) {
		/* Caller did not ask for the residual: short I/O is an error. */
2368 		error = EIO;
2369 	}
2370 	return (error);
2371 }
2372 
2373 /*
2374  * Free behind hacks.  The pager is busted.
2375  * XXX - need to pass the information down to writedone() in a flag like B_SEQ
2376  * or B_FREE_IF_TIGHT_ON_MEMORY.
2377  *
 * ud_freebehind defaults to 1 (on) and ud_smallfile to 32 KB.
 * NOTE(review): presumably free-behind is meant to be skipped for files
 * no larger than ud_smallfile -- confirm against the code that reads
 * these tunables.
 */
2378 int32_t ud_freebehind = 1;
2379 int32_t ud_smallfile = 32 * 1024;
2380 
2380 
/*
 * ud_getpage_miss - obtain the page of vp at "off" when it was not found
 * in the page cache, and put it in the page list pl.
 *
 * For S_CREATE the page is simply created with page_create_va().
 * Otherwise pvn_read_kluster() is used to set up the page, it is filled
 * with ud_page_fill(), i_nextrio is advanced, and read ahead is started
 * with ud_getpage_ra() when "seq" is set and i_nextrio is still inside
 * the file.
 *
 * pl[0] is set to NULL on entry, so a return of 0 with pl[0] still NULL
 * means another thread got the page first and the caller should look it
 * up again.  Returns 0, EINVAL if the page could not be created, or the
 * error from ud_page_fill().
 */
2381 /* ARGSUSED */
2382 int32_t
2383 ud_getpage_miss(struct vnode *vp, u_offset_t off,
2384 	size_t len, struct seg *seg, caddr_t addr, page_t *pl[],
2385 	size_t plsz, enum seg_rw rw, int32_t seq)
2386 {
2387 	struct ud_inode *ip = VTOI(vp);
2388 	int32_t err = 0;
2389 	size_t io_len;
2390 	u_offset_t io_off;
2391 	u_offset_t pgoff;
2392 	page_t *pp;
2393 
2394 	pl[0] = NULL;
2395 
2396 	/*
2397 	 * Figure out whether the page can be created, or must be
2398 	 * read from the disk
2399 	 */
2400 	if (rw == S_CREATE) {
2401 		if ((pp = page_create_va(vp, off,
2402 				PAGESIZE, PG_WAIT, seg, addr)) == NULL) {
2403 			cmn_err(CE_WARN, "ud_getpage_miss: page_create");
2404 			return (EINVAL);
2405 		}
2406 		io_len = PAGESIZE;
2407 	} else {
2408 		pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
2409 				&io_len, off, PAGESIZE, 0);
2410 
2411 		/*
2412 		 * Some other thread has entered the page.
2413 		 * ud_getpage will retry page_lookup.
2414 		 */
2415 		if (pp == NULL) {
2416 			return (0);
2417 		}
2418 
2419 		/*
2420 		 * Fill the page with as much data as we can from the file.
2421 		 */
2422 		err = ud_page_fill(ip, pp, off, B_READ, &pgoff);
2423 		if (err) {
2424 			pvn_read_done(pp, B_ERROR);
2425 			return (err);
2426 		}
2427 
2428 		/*
2429 		 * XXX ??? ufs has io_len instead of pgoff below
2430 		 */
		/* pgoff (from ud_page_fill) rounded up to a page multiple. */
2431 		ip->i_nextrio = off + ((pgoff + PAGESIZE - 1) & PAGEMASK);
2432 
2433 		/*
2434 		 * If the file access is sequential, initiate read ahead
2435 		 * of the next cluster.
2436 		 */
2437 		if (seq && ip->i_nextrio < ip->i_size) {
2438 			ud_getpage_ra(vp, off, seg, addr);
2439 		}
2440 	}
2441 
	/* No goto in this function targets the label below. */
2442 outmiss:
2443 	pvn_plist_init(pp, pl, plsz, (offset_t)off, io_len, rw);
2444 	return (err);
2445 }
2446 
2447 /* ARGSUSED */
2448 void
2449 ud_getpage_ra(struct vnode *vp,
2450 	u_offset_t off, struct seg *seg, caddr_t addr)
2451 {
2452 	page_t *pp;
2453 	size_t io_len;
2454 	struct ud_inode *ip = VTOI(vp);
2455 	u_offset_t io_off = ip->i_nextrio, pgoff;
2456 	caddr_t addr2 = addr + (io_off - off);
2457 	daddr_t bn;
2458 	int32_t contig = 0;
2459 
2460 	/*
2461 	 * Is this test needed?
2462 	 */
2463 
2464 	if (addr2 >= seg->s_base + seg->s_size) {
2465 		return;
2466 	}
2467 
2468 	contig = 0;
2469 	if (ud_bmap_read(ip, io_off, &bn, &contig) != 0 || bn == UDF_HOLE) {
2470 		return;
2471 	}
2472 
2473 	pp = pvn_read_kluster(vp, io_off, seg, addr2,
2474 		&io_off, &io_len, io_off, PAGESIZE, 1);
2475 
2476 	/*
2477 	 * Some other thread has entered the page.
2478 	 * So no read head done here (ie we will have to and wait
2479 	 * for the read when needed).
2480 	 */
2481 
2482 	if (pp == NULL) {
2483 		return;
2484 	}
2485 
2486 	(void) ud_page_fill(ip, pp, io_off, (B_READ|B_ASYNC), &pgoff);
2487 	ip->i_nextrio =  io_off + ((pgoff + PAGESIZE - 1) & PAGEMASK);
2488 }
2489 
2490 int
2491 ud_page_fill(struct ud_inode *ip, page_t *pp, u_offset_t off,
2492 	uint32_t bflgs, u_offset_t *pg_off)
2493 {
2494 	daddr_t bn;
2495 	struct buf *bp;
2496 	caddr_t kaddr, caddr;
2497 	int32_t error = 0, contig = 0, multi_io = 0;
2498 	int32_t lbsize = ip->i_udf->udf_lbsize;
2499 	int32_t lbmask = ip->i_udf->udf_lbmask;
2500 	uint64_t isize;
2501 
2502 	isize = (ip->i_size + lbmask) & (~lbmask);
2503 	if (ip->i_desc_type == ICB_FLAG_ONE_AD) {
2504 
2505 		/*
2506 		 * Embedded file read file_entry
2507 		 * from buffer cache and copy the required
2508 		 * portions
2509 		 */
2510 		bp = ud_bread(ip->i_dev,
2511 		ip->i_icb_lbano << ip->i_udf->udf_l2d_shift, lbsize);
2512 		if ((bp->b_error == 0) &&
2513 			(bp->b_resid == 0)) {
2514 
2515 			caddr = bp->b_un.b_addr + ip->i_data_off;
2516 
2517 			/*
2518 			 * mapin to kvm
2519 			 */
2520 			kaddr = (caddr_t)ppmapin(pp,
2521 				PROT_READ | PROT_WRITE, (caddr_t)-1);
2522 			(void) kcopy(caddr, kaddr, ip->i_size);
2523 
2524 			/*
2525 			 * mapout of kvm
2526 			 */
2527 			ppmapout(kaddr);
2528 		}
2529 		brelse(bp);
2530 		contig = ip->i_size;
2531 	} else {
2532 
2533 		/*
2534 		 * Get the continuous size and block number
2535 		 * at offset "off"
2536 		 */
2537 		if (error = ud_bmap_read(ip, off, &bn, &contig))
2538 			goto out;
2539 		contig = MIN(contig, PAGESIZE);
2540 		contig = (contig + lbmask) & (~lbmask);
2541 
2542 		/*
2543 		 * Zero part of the page which we are not
2544 		 * going to read from the disk.
2545 		 */
2546 
2547 		if (bn == UDF_HOLE) {
2548 
2549 			/*
2550 			 * This is a HOLE. Just zero out
2551 			 * the page
2552 			 */
2553 			if (((off + contig) == isize) ||
2554 				(contig == PAGESIZE)) {
2555 				pagezero(pp->p_prev, 0, PAGESIZE);
2556 				goto out;
2557 			}
2558 		}
2559 
2560 		if (contig < PAGESIZE) {
2561 			uint64_t count;
2562 
2563 			count = isize - off;
2564 			if (contig != count) {
2565 				multi_io = 1;
2566 				contig = (int32_t)(MIN(count, PAGESIZE));
2567 			} else {
2568 				pagezero(pp->p_prev, contig, PAGESIZE - contig);
2569 			}
2570 		}
2571 
2572 		/*
2573 		 * Get a bp and initialize it
2574 		 */
2575 		bp = pageio_setup(pp, contig, ip->i_devvp, bflgs);
2576 		ASSERT(bp != NULL);
2577 
2578 		bp->b_edev = ip->i_dev;
2579 		bp->b_dev = cmpdev(ip->i_dev);
2580 		bp->b_blkno = bn;
2581 		bp->b_un.b_addr = 0;
2582 		bp->b_file = ip->i_vnode;
2583 
2584 		/*
2585 		 * Start I/O
2586 		 */
2587 		if (multi_io == 0) {
2588 
2589 			/*
2590 			 * Single I/O is sufficient for this page
2591 			 */
2592 			(void) bdev_strategy(bp);
2593 		} else {
2594 
2595 			/*
2596 			 * We need to do the I/O in
2597 			 * piece's
2598 			 */
2599 			error = ud_multi_strat(ip, pp, bp, off);
2600 			if (error != 0) {
2601 				goto out;
2602 			}
2603 		}
2604 		if ((bflgs & B_ASYNC) == 0) {
2605 
2606 			/*
2607 			 * Wait for i/o to complete.
2608 			 */
2609 
2610 			error = biowait(bp);
2611 			pageio_done(bp);
2612 			if (error) {
2613 				goto out;
2614 			}
2615 		}
2616 	}
2617 	if ((off + contig) >= ip->i_size) {
2618 		contig = ip->i_size - off;
2619 	}
2620 
2621 out:
2622 	*pg_off = contig;
2623 	return (error);
2624 }
2625 
2626 int32_t
2627 ud_putpages(struct vnode *vp, offset_t off,
2628 	size_t len, int32_t flags, struct cred *cr)
2629 {
2630 	struct ud_inode *ip;
2631 	page_t *pp;
2632 	u_offset_t io_off;
2633 	size_t io_len;
2634 	u_offset_t eoff;
2635 	int32_t err = 0;
2636 	int32_t dolock;
2637 
2638 	ud_printf("ud_putpages\n");
2639 
2640 	if (vp->v_count == 0) {
2641 		cmn_err(CE_WARN, "ud_putpages: bad v_count");
2642 		return (EINVAL);
2643 	}
2644 
2645 	ip = VTOI(vp);
2646 
2647 	/*
2648 	 * Acquire the readers/write inode lock before locking
2649 	 * any pages in this inode.
2650 	 * The inode lock is held during i/o.
2651 	 */
2652 	if (len == 0) {
2653 		mutex_enter(&ip->i_tlock);
2654 		ip->i_delayoff = ip->i_delaylen = 0;
2655 		mutex_exit(&ip->i_tlock);
2656 	}
2657 #ifdef	__lock_lint
2658 	rw_enter(&ip->i_contents, RW_READER);
2659 #else
2660 	dolock = (rw_owner(&ip->i_contents) != curthread);
2661 	if (dolock) {
2662 		rw_enter(&ip->i_contents, RW_READER);
2663 	}
2664 #endif
2665 
2666 	if (!vn_has_cached_data(vp)) {
2667 #ifdef	__lock_lint
2668 		rw_exit(&ip->i_contents);
2669 #else
2670 		if (dolock) {
2671 			rw_exit(&ip->i_contents);
2672 		}
2673 #endif
2674 		return (0);
2675 	}
2676 
2677 	if (len == 0) {
2678 		/*
2679 		 * Search the entire vp list for pages >= off.
2680 		 */
2681 		err = pvn_vplist_dirty(vp, (u_offset_t)off, ud_putapage,
2682 					flags, cr);
2683 	} else {
2684 		/*
2685 		 * Loop over all offsets in the range looking for
2686 		 * pages to deal with.
2687 		 */
2688 		if ((eoff = blkroundup(ip->i_udf, ip->i_size)) != 0) {
2689 			eoff = MIN(off + len, eoff);
2690 		} else {
2691 			eoff = off + len;
2692 		}
2693 
2694 		for (io_off = off; io_off < eoff; io_off += io_len) {
2695 			/*
2696 			 * If we are not invalidating, synchronously
2697 			 * freeing or writing pages, use the routine
2698 			 * page_lookup_nowait() to prevent reclaiming
2699 			 * them from the free list.
2700 			 */
2701 			if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
2702 				pp = page_lookup(vp, io_off,
2703 					(flags & (B_INVAL | B_FREE)) ?
2704 					    SE_EXCL : SE_SHARED);
2705 			} else {
2706 				pp = page_lookup_nowait(vp, io_off,
2707 					(flags & B_FREE) ? SE_EXCL : SE_SHARED);
2708 			}
2709 
2710 			if (pp == NULL || pvn_getdirty(pp, flags) == 0) {
2711 				io_len = PAGESIZE;
2712 			} else {
2713 
2714 				err = ud_putapage(vp, pp,
2715 					&io_off, &io_len, flags, cr);
2716 				if (err != 0) {
2717 					break;
2718 				}
2719 				/*
2720 				 * "io_off" and "io_len" are returned as
2721 				 * the range of pages we actually wrote.
2722 				 * This allows us to skip ahead more quickly
2723 				 * since several pages may've been dealt
2724 				 * with by this iteration of the loop.
2725 				 */
2726 			}
2727 		}
2728 	}
2729 	if (err == 0 && off == 0 && (len == 0 || len >= ip->i_size)) {
2730 		/*
2731 		 * We have just sync'ed back all the pages on
2732 		 * the inode, turn off the IMODTIME flag.
2733 		 */
2734 		mutex_enter(&ip->i_tlock);
2735 		ip->i_flag &= ~IMODTIME;
2736 		mutex_exit(&ip->i_tlock);
2737 	}
2738 #ifdef	__lock_lint
2739 	rw_exit(&ip->i_contents);
2740 #else
2741 	if (dolock) {
2742 		rw_exit(&ip->i_contents);
2743 	}
2744 #endif
2745 	return (err);
2746 }
2747 
2748 /* ARGSUSED */
2749 int32_t
2750 ud_putapage(struct vnode *vp,
2751 	page_t *pp, u_offset_t *offp,
2752 	size_t *lenp, int32_t flags, struct cred *cr)
2753 {
2754 	daddr_t bn;
2755 	size_t io_len;
2756 	struct ud_inode *ip;
2757 	int32_t error = 0, contig, multi_io = 0;
2758 	struct udf_vfs *udf_vfsp;
2759 	u_offset_t off, io_off;
2760 	caddr_t kaddr, caddr;
2761 	struct buf *bp = NULL;
2762 	int32_t lbmask;
2763 	uint64_t isize;
2764 	int32_t crc_len;
2765 	struct file_entry *fe;
2766 
2767 	ud_printf("ud_putapage\n");
2768 
2769 	ip = VTOI(vp);
2770 	ASSERT(ip);
2771 	ASSERT(RW_LOCK_HELD(&ip->i_contents));
2772 	lbmask = ip->i_udf->udf_lbmask;
2773 	isize = (ip->i_size + lbmask) & (~lbmask);
2774 
2775 	udf_vfsp = ip->i_udf;
2776 	ASSERT(udf_vfsp->udf_flags & UDF_FL_RW);
2777 
2778 	/*
2779 	 * If the modified time on the inode has not already been
2780 	 * set elsewhere (e.g. for write/setattr) we set the time now.
2781 	 * This gives us approximate modified times for mmap'ed files
2782 	 * which are modified via stores in the user address space.
2783 	 */
2784 	if (((ip->i_flag & IMODTIME) == 0) || (flags & B_FORCE)) {
2785 		mutex_enter(&ip->i_tlock);
2786 		ip->i_flag |= IUPD;
2787 		ITIMES_NOLOCK(ip);
2788 		mutex_exit(&ip->i_tlock);
2789 	}
2790 
2791 
2792 	/*
2793 	 * Align the request to a block boundry (for old file systems),
2794 	 * and go ask bmap() how contiguous things are for this file.
2795 	 */
2796 	off = pp->p_offset & ~(offset_t)lbmask;
2797 				/* block align it */
2798 
2799 
2800 	if (ip->i_desc_type == ICB_FLAG_ONE_AD) {
2801 		ASSERT(ip->i_size <= ip->i_max_emb);
2802 
2803 		pp = pvn_write_kluster(vp, pp, &io_off,
2804 			&io_len, off, PAGESIZE, flags);
2805 		if (io_len == 0) {
2806 			io_len = PAGESIZE;
2807 		}
2808 
2809 		bp = ud_bread(ip->i_dev,
2810 			ip->i_icb_lbano << udf_vfsp->udf_l2d_shift,
2811 			udf_vfsp->udf_lbsize);
2812 		fe = (struct file_entry *)bp->b_un.b_addr;
2813 		if ((bp->b_flags & B_ERROR) ||
2814 		    (ud_verify_tag_and_desc(&fe->fe_tag, UD_FILE_ENTRY,
2815 		    ip->i_icb_block,
2816 		    1, udf_vfsp->udf_lbsize) != 0)) {
2817 			if (pp != NULL)
2818 				pvn_write_done(pp, B_ERROR | B_WRITE | flags);
2819 			if (bp->b_flags & B_ERROR) {
2820 				error = EIO;
2821 			} else {
2822 				error = EINVAL;
2823 			}
2824 			brelse(bp);
2825 			return (error);
2826 		}
2827 		if ((bp->b_error == 0) &&
2828 			(bp->b_resid == 0)) {
2829 
2830 			caddr = bp->b_un.b_addr + ip->i_data_off;
2831 			kaddr = (caddr_t)ppmapin(pp,
2832 				PROT_READ | PROT_WRITE, (caddr_t)-1);
2833 			(void) kcopy(kaddr, caddr, ip->i_size);
2834 			ppmapout(kaddr);
2835 		}
2836 		crc_len = ((uint32_t)&((struct file_entry *)0)->fe_spec) +
2837 				SWAP_32(fe->fe_len_ear);
2838 		crc_len += ip->i_size;
2839 		ud_make_tag(ip->i_udf, &fe->fe_tag,
2840 			UD_FILE_ENTRY, ip->i_icb_block, crc_len);
2841 
2842 		bwrite(bp);
2843 
2844 		if (flags & B_ASYNC) {
2845 			pvn_write_done(pp, flags);
2846 		}
2847 		contig = ip->i_size;
2848 	} else {
2849 
2850 		if (error = ud_bmap_read(ip, off, &bn, &contig)) {
2851 			goto out;
2852 		}
2853 		contig = MIN(contig, PAGESIZE);
2854 		contig = (contig + lbmask) & (~lbmask);
2855 
2856 		if (contig < PAGESIZE) {
2857 			uint64_t count;
2858 
2859 			count = isize - off;
2860 			if (contig != count) {
2861 				multi_io = 1;
2862 				contig = (int32_t)(MIN(count, PAGESIZE));
2863 			}
2864 		}
2865 
2866 		if ((off + contig) > isize) {
2867 			contig = isize - off;
2868 		}
2869 
2870 		if (contig > PAGESIZE) {
2871 			if (contig & PAGEOFFSET) {
2872 				contig &= PAGEMASK;
2873 			}
2874 		}
2875 
2876 		pp = pvn_write_kluster(vp, pp, &io_off,
2877 			&io_len, off, contig, flags);
2878 		if (io_len == 0) {
2879 			io_len = PAGESIZE;
2880 		}
2881 
2882 		bp = pageio_setup(pp, contig, ip->i_devvp, B_WRITE | flags);
2883 		ASSERT(bp != NULL);
2884 
2885 		bp->b_edev = ip->i_dev;
2886 		bp->b_dev = cmpdev(ip->i_dev);
2887 		bp->b_blkno = bn;
2888 		bp->b_un.b_addr = 0;
2889 		bp->b_file = vp;
2890 		bp->b_offset = (offset_t)off;
2891 
2892 
2893 		/*
2894 		 * write throttle
2895 		 */
2896 		ASSERT(bp->b_iodone == NULL);
2897 		bp->b_iodone = ud_iodone;
2898 		mutex_enter(&ip->i_tlock);
2899 		ip->i_writes += bp->b_bcount;
2900 		mutex_exit(&ip->i_tlock);
2901 
2902 		if (multi_io == 0) {
2903 
2904 			(void) bdev_strategy(bp);
2905 		} else {
2906 			error = ud_multi_strat(ip, pp, bp, off);
2907 			if (error != 0) {
2908 				goto out;
2909 			}
2910 		}
2911 
2912 		if ((flags & B_ASYNC) == 0) {
2913 			/*
2914 			 * Wait for i/o to complete.
2915 			 */
2916 			error = biowait(bp);
2917 			pageio_done(bp);
2918 		}
2919 	}
2920 
2921 	if ((flags & B_ASYNC) == 0) {
2922 		pvn_write_done(pp, ((error) ? B_ERROR : 0) | B_WRITE | flags);
2923 	}
2924 
2925 	pp = NULL;
2926 
2927 out:
2928 	if (error != 0 && pp != NULL) {
2929 		pvn_write_done(pp, B_ERROR | B_WRITE | flags);
2930 	}
2931 
2932 	if (offp) {
2933 		*offp = io_off;
2934 	}
2935 	if (lenp) {
2936 		*lenp = io_len;
2937 	}
2938 
2939 	return (error);
2940 }
2941 
2942 
2943 int32_t
2944 ud_iodone(struct buf *bp)
2945 {
2946 	struct ud_inode *ip;
2947 
2948 	ASSERT((bp->b_pages->p_vnode != NULL) && !(bp->b_flags & B_READ));
2949 
2950 	bp->b_iodone = NULL;
2951 
2952 	ip = VTOI(bp->b_pages->p_vnode);
2953 
2954 	mutex_enter(&ip->i_tlock);
2955 	if (ip->i_writes >= ud_LW) {
2956 		if ((ip->i_writes -= bp->b_bcount) <= ud_LW) {
2957 			if (ud_WRITES) {
2958 				cv_broadcast(&ip->i_wrcv); /* wake all up */
2959 			}
2960 		}
2961 	} else {
2962 		ip->i_writes -= bp->b_bcount;
2963 	}
2964 	mutex_exit(&ip->i_tlock);
2965 	iodone(bp);
2966 	return (0);
2967 }
2968 
2969 /* ARGSUSED3 */
2970 int32_t
2971 ud_rdip(struct ud_inode *ip, struct uio *uio, int32_t ioflag, cred_t *cr)
2972 {
2973 	struct vnode *vp;
2974 	struct udf_vfs *udf_vfsp;
2975 	krw_t rwtype;
2976 	caddr_t base;
2977 	uint32_t flags;
2978 	int32_t error, n, on, mapon, dofree;
2979 	u_offset_t off;
2980 	long oresid = uio->uio_resid;
2981 
2982 	ASSERT(RW_LOCK_HELD(&ip->i_contents));
2983 	if ((ip->i_type != VREG) &&
2984 		(ip->i_type != VDIR) &&
2985 		(ip->i_type != VLNK)) {
2986 		return (EIO);
2987 	}
2988 
2989 	if (uio->uio_loffset > MAXOFFSET_T) {
2990 		return (0);
2991 	}
2992 
2993 	if ((uio->uio_loffset < (offset_t)0) ||
2994 		((uio->uio_loffset + uio->uio_resid) < 0)) {
2995 		return (EINVAL);
2996 	}
2997 	if (uio->uio_resid == 0) {
2998 		return (0);
2999 	}
3000 
3001 	vp = ITOV(ip);
3002 	udf_vfsp = ip->i_udf;
3003 	mutex_enter(&ip->i_tlock);
3004 	ip->i_flag |= IACC;
3005 	mutex_exit(&ip->i_tlock);
3006 
3007 	rwtype = (rw_write_held(&ip->i_contents)?RW_WRITER:RW_READER);
3008 
3009 	do {
3010 		offset_t diff;
3011 		u_offset_t uoff = uio->uio_loffset;
3012 		off = uoff & (offset_t)MAXBMASK;
3013 		mapon = (int)(uoff & (offset_t)MAXBOFFSET);
3014 		on = (int)blkoff(udf_vfsp, uoff);
3015 		n = (int)MIN(udf_vfsp->udf_lbsize - on, uio->uio_resid);
3016 
3017 		diff = ip->i_size - uoff;
3018 
3019 		if (diff <= (offset_t)0) {
3020 			error = 0;
3021 			goto out;
3022 		}
3023 		if (diff < (offset_t)n) {
3024 			n = (int)diff;
3025 		}
3026 		dofree = ud_freebehind &&
3027 			ip->i_nextr == (off & PAGEMASK) &&
3028 				off > ud_smallfile;
3029 
3030 #ifndef	__lock_lint
3031 		if (rwtype == RW_READER) {
3032 			rw_exit(&ip->i_contents);
3033 		}
3034 #endif
3035 
3036 		base = segmap_getmapflt(segkmap, vp, (off + mapon),
3037 					(uint32_t)n, 1, S_READ);
3038 		error = uiomove(base + mapon, (long)n, UIO_READ, uio);
3039 
3040 		flags = 0;
3041 		if (!error) {
3042 			/*
3043 			 * If read a whole block, or read to eof,
3044 			 * won't need this buffer again soon.
3045 			 */
3046 			if (n + on == MAXBSIZE && ud_freebehind && dofree &&
3047 				freemem < lotsfree + pages_before_pager) {
3048 				flags = SM_FREE | SM_DONTNEED |SM_ASYNC;
3049 			}
3050 			/*
3051 			 * In POSIX SYNC (FSYNC and FDSYNC) read mode,
3052 			 * we want to make sure that the page which has
3053 			 * been read, is written on disk if it is dirty.
3054 			 * And corresponding indirect blocks should also
3055 			 * be flushed out.
3056 			 */
3057 			if ((ioflag & FRSYNC) && (ioflag & (FSYNC|FDSYNC))) {
3058 				flags &= ~SM_ASYNC;
3059 				flags |= SM_WRITE;
3060 			}
3061 			error = segmap_release(segkmap, base, flags);
3062 		} else    {
3063 			(void) segmap_release(segkmap, base, flags);
3064 		}
3065 
3066 #ifndef __lock_lint
3067 		if (rwtype == RW_READER) {
3068 			rw_enter(&ip->i_contents, rwtype);
3069 		}
3070 #endif
3071 	} while (error == 0 && uio->uio_resid > 0 && n != 0);
3072 out:
3073 	/*
3074 	 * Inode is updated according to this table if FRSYNC is set.
3075 	 *
3076 	 *	FSYNC	FDSYNC(posix.4)
3077 	 *	--------------------------
3078 	 *	always	IATTCHG|IBDWRITE
3079 	 */
3080 	if (ioflag & FRSYNC) {
3081 		if ((ioflag & FSYNC) ||
3082 		((ioflag & FDSYNC) && (ip->i_flag & (IATTCHG|IBDWRITE)))) {
3083 		rw_exit(&ip->i_contents);
3084 		rw_enter(&ip->i_contents, RW_WRITER);
3085 		ud_iupdat(ip, 1);
3086 		}
3087 	}
3088 	/*
3089 	 * If we've already done a partial read, terminate
3090 	 * the read but return no error.
3091 	 */
3092 	if (oresid != uio->uio_resid) {
3093 		error = 0;
3094 	}
3095 	ITIMES(ip);
3096 
3097 	return (error);
3098 }
3099 
3100 int32_t
3101 ud_wrip(struct ud_inode *ip, struct uio *uio, int ioflag, struct cred *cr)
3102 {
3103 	caddr_t base;
3104 	struct vnode *vp;
3105 	struct udf_vfs *udf_vfsp;
3106 	uint32_t flags;
3107 	int32_t error = 0, iupdat_flag, n, on, mapon, i_size_changed = 0;
3108 	int32_t pagecreate, newpage;
3109 	uint64_t old_i_size;
3110 	u_offset_t off;
3111 	long start_resid = uio->uio_resid, premove_resid;
3112 	rlim64_t limit = uio->uio_limit;
3113 
3114 
3115 	ASSERT(RW_WRITE_HELD(&ip->i_contents));
3116 	if ((ip->i_type != VREG) &&
3117 		(ip->i_type != VDIR) &&
3118 		(ip->i_type != VLNK)) {
3119 		return (EIO);
3120 	}
3121 
3122 	if (uio->uio_loffset >= MAXOFFSET_T) {
3123 		return (EFBIG);
3124 	}
3125 	/*
3126 	 * see udf_l_pathconf
3127 	 */
3128 	if (limit > (((uint64_t)1 << 40) - 1)) {
3129 		limit = ((uint64_t)1 << 40) - 1;
3130 	}
3131 	if (uio->uio_loffset >= limit) {
3132 		proc_t *p = ttoproc(curthread);
3133 
3134 		mutex_enter(&p->p_lock);
3135 		(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE], p->p_rctls,
3136 		    p, RCA_UNSAFE_SIGINFO);
3137 		mutex_exit(&p->p_lock);
3138 		return (EFBIG);
3139 	}
3140 	if ((uio->uio_loffset < (offset_t)0) ||
3141 		((uio->uio_loffset + uio->uio_resid) < 0)) {
3142 		return (EINVAL);
3143 	}
3144 	if (uio->uio_resid == 0) {
3145 		return (0);
3146 	}
3147 
3148 	mutex_enter(&ip->i_tlock);
3149 	ip->i_flag |= INOACC;
3150 
3151 	if (ioflag & (FSYNC | FDSYNC)) {
3152 		ip->i_flag |= ISYNC;
3153 		iupdat_flag = 1;
3154 	}
3155 	mutex_exit(&ip->i_tlock);
3156 
3157 	udf_vfsp = ip->i_udf;
3158 	vp = ITOV(ip);
3159 
3160 	do {
3161 		u_offset_t uoff = uio->uio_loffset;
3162 		off = uoff & (offset_t)MAXBMASK;
3163 		mapon = (int)(uoff & (offset_t)MAXBOFFSET);
3164 		on = (int)blkoff(udf_vfsp, uoff);
3165 		n = (int)MIN(udf_vfsp->udf_lbsize - on, uio->uio_resid);
3166 
3167 		if (ip->i_type == VREG && uoff + n >= limit) {
3168 			if (uoff >= limit) {
3169 				error = EFBIG;
3170 				goto out;
3171 			}
3172 			n = (int)(limit - (rlim64_t)uoff);
3173 		}
3174 		if (uoff + n > ip->i_size) {
3175 			/*
3176 			 * We are extending the length of the file.
3177 			 * bmap is used so that we are sure that
3178 			 * if we need to allocate new blocks, that it
3179 			 * is done here before we up the file size.
3180 			 */
3181 			error = ud_bmap_write(ip, uoff,
3182 					(int)(on + n), mapon == 0, cr);
3183 			if (error) {
3184 				break;
3185 			}
3186 			i_size_changed = 1;
3187 			old_i_size = ip->i_size;
3188 			ip->i_size = uoff + n;
3189 			/*
3190 			 * If we are writing from the beginning of
3191 			 * the mapping, we can just create the
3192 			 * pages without having to read them.
3193 			 */
3194 			pagecreate = (mapon == 0);
3195 		} else if (n == MAXBSIZE) {
3196 			/*
3197 			 * Going to do a whole mappings worth,
3198 			 * so we can just create the pages w/o
3199 			 * having to read them in.  But before
3200 			 * we do that, we need to make sure any
3201 			 * needed blocks are allocated first.
3202 			 */
3203 			error = ud_bmap_write(ip, uoff,
3204 					(int)(on + n), 1, cr);
3205 			if (error) {
3206 				break;
3207 			}
3208 			pagecreate = 1;
3209 		} else {
3210 			pagecreate = 0;
3211 		}
3212 
3213 		rw_exit(&ip->i_contents);
3214 
3215 		base = segmap_getmapflt(segkmap, vp, (off + mapon),
3216 				(uint32_t)n, !pagecreate, S_WRITE);
3217 
3218 		/*
3219 		 * segmap_pagecreate() returns 1 if it calls
3220 		 * page_create_va() to allocate any pages.
3221 		 */
3222 		newpage = 0;
3223 		if (pagecreate) {
3224 			newpage = segmap_pagecreate(segkmap, base,
3225 					(size_t)n, 0);
3226 		}
3227 
3228 		premove_resid = uio->uio_resid;
3229 		error = uiomove(base + mapon, (long)n, UIO_WRITE, uio);
3230 
3231 		if (pagecreate &&
3232 			uio->uio_loffset < roundup(off + mapon + n, PAGESIZE)) {
3233 			/*
3234 			 * We created pages w/o initializing them completely,
3235 			 * thus we need to zero the part that wasn't set up.
3236 			 * This happens on most EOF write cases and if
3237 			 * we had some sort of error during the uiomove.
3238 			 */
3239 			int nzero, nmoved;
3240 
3241 			nmoved = (int)(uio->uio_loffset - (off + mapon));
3242 			ASSERT(nmoved >= 0 && nmoved <= n);
3243 			nzero = roundup(on + n, PAGESIZE) - nmoved;
3244 			ASSERT(nzero > 0 && mapon + nmoved + nzero <= MAXBSIZE);
3245 			(void) kzero(base + mapon + nmoved, (uint32_t)nzero);
3246 		}
3247 
3248 		/*
3249 		 * Unlock the pages allocated by page_create_va()
3250 		 * in segmap_pagecreate()
3251 		 */
3252 		if (newpage) {
3253 			segmap_pageunlock(segkmap, base, (size_t)n, S_WRITE);
3254 		}
3255 
3256 		if (error) {
3257 			/*
3258 			 * If we failed on a write, we may have already
3259 			 * allocated file blocks as well as pages.  It's
3260 			 * hard to undo the block allocation, but we must
3261 			 * be sure to invalidate any pages that may have
3262 			 * been allocated.
3263 			 */
3264 			(void) segmap_release(segkmap, base, SM_INVAL);
3265 		} else {
3266 			flags = 0;
3267 			/*
3268 			 * Force write back for synchronous write cases.
3269 			 */
3270 			if ((ioflag & (FSYNC|FDSYNC)) || ip->i_type == VDIR) {
3271 				/*
3272 				 * If the sticky bit is set but the
3273 				 * execute bit is not set, we do a
3274 				 * synchronous write back and free
3275 				 * the page when done.  We set up swap
3276 				 * files to be handled this way to
3277 				 * prevent servers from keeping around
3278 				 * the client's swap pages too long.
3279 				 * XXX - there ought to be a better way.
3280 				 */
3281 				if (IS_SWAPVP(vp)) {
3282 					flags = SM_WRITE | SM_FREE |
3283 							SM_DONTNEED;
3284 					iupdat_flag = 0;
3285 				} else {
3286 					flags = SM_WRITE;
3287 				}
3288 			} else if (((mapon + n) == MAXBSIZE) ||
3289 					IS_SWAPVP(vp)) {
3290 				/*
3291 				 * Have written a whole block.
3292 				 * Start an asynchronous write and
3293 				 * mark the buffer to indicate that
3294 				 * it won't be needed again soon.
3295 				 */
3296 				flags = SM_WRITE |SM_ASYNC | SM_DONTNEED;
3297 			}
3298 			error = segmap_release(segkmap, base, flags);
3299 
3300 			/*
3301 			 * If the operation failed and is synchronous,
3302 			 * then we need to unwind what uiomove() last
3303 			 * did so we can potentially return an error to
3304 			 * the caller.  If this write operation was
3305 			 * done in two pieces and the first succeeded,
3306 			 * then we won't return an error for the second
3307 			 * piece that failed.  However, we only want to
3308 			 * return a resid value that reflects what was
3309 			 * really done.
3310 			 *
3311 			 * Failures for non-synchronous operations can
3312 			 * be ignored since the page subsystem will
3313 			 * retry the operation until it succeeds or the
3314 			 * file system is unmounted.
3315 			 */
3316 			if (error) {
3317 				if ((ioflag & (FSYNC | FDSYNC)) ||
3318 					ip->i_type == VDIR) {
3319 					uio->uio_resid = premove_resid;
3320 				} else {
3321 					error = 0;
3322 				}
3323 			}
3324 		}
3325 
3326 		/*
3327 		 * Re-acquire contents lock.
3328 		 */
3329 		rw_enter(&ip->i_contents, RW_WRITER);
3330 		/*
3331 		 * If the uiomove() failed or if a synchronous
3332 		 * page push failed, fix up i_size.
3333 		 */
3334 		if (error) {
3335 			if (i_size_changed) {
3336 				/*
3337 				 * The uiomove failed, and we
3338 				 * allocated blocks,so get rid
3339 				 * of them.
3340 				 */
3341 				(void) ud_itrunc(ip, old_i_size, 0, cr);
3342 			}
3343 		} else {
3344 			/*
3345 			 * XXX - Can this be out of the loop?
3346 			 */
3347 			ip->i_flag |= IUPD | ICHG;
3348 			if (i_size_changed) {
3349 				ip->i_flag |= IATTCHG;
3350 			}
3351 			if ((ip->i_perm & (IEXEC | (IEXEC >> 5) |
3352 				(IEXEC >> 10))) != 0 &&
3353 			    (ip->i_char & (ISUID | ISGID)) != 0 &&
3354 			    secpolicy_vnode_setid_retain(cr,
3355 			    (ip->i_char & ISUID) != 0 && ip->i_uid == 0) != 0) {
3356 				/*
3357 				 * Clear Set-UID & Set-GID bits on
3358 				 * successful write if not privileged
3359 				 * and at least one of the execute bits
3360 				 * is set.  If we always clear Set-GID,
3361 				 * mandatory file and record locking is
3362 				 * unuseable.
3363 				 */
3364 				ip->i_char &= ~(ISUID | ISGID);
3365 			}
3366 		}
3367 	} while (error == 0 && uio->uio_resid > 0 && n != 0);
3368 
3369 out:
3370 	/*
3371 	 * Inode is updated according to this table -
3372 	 *
3373 	 *	FSYNC	FDSYNC(posix.4)
3374 	 *	--------------------------
3375 	 *	always@	IATTCHG|IBDWRITE
3376 	 *
3377 	 * @ -  If we are doing synchronous write the only time we should
3378 	 *	not be sync'ing the ip here is if we have the stickyhack
3379 	 *	activated, the file is marked with the sticky bit and
3380 	 *	no exec bit, the file length has not been changed and
3381 	 *	no new blocks have been allocated during this write.
3382 	 */
3383 	if ((ip->i_flag & ISYNC) != 0) {
3384 		/*
3385 		 * we have eliminated nosync
3386 		 */
3387 		if ((ip->i_flag & (IATTCHG|IBDWRITE)) ||
3388 			((ioflag & FSYNC) && iupdat_flag)) {
3389 			ud_iupdat(ip, 1);
3390 		}
3391 	}
3392 
3393 	/*
3394 	 * If we've already done a partial-write, terminate
3395 	 * the write but return no error.
3396 	 */
3397 	if (start_resid != uio->uio_resid) {
3398 		error = 0;
3399 	}
3400 	ip->i_flag &= ~(INOACC | ISYNC);
3401 	ITIMES_NOLOCK(ip);
3402 
3403 	return (error);
3404 }
3405 
3406 int32_t
3407 ud_multi_strat(struct ud_inode *ip,
3408 	page_t *pp, struct buf *bp, u_offset_t start)
3409 {
3410 	daddr_t bn;
3411 	int32_t error = 0, io_count, contig, alloc_sz, i;
3412 	uint32_t io_off;
3413 	mio_master_t *mm = NULL;
3414 	mio_slave_t *ms = NULL;
3415 	struct buf *rbp;
3416 
3417 	ASSERT(!(start & PAGEOFFSET));
3418 
3419 	/*
3420 	 * Figure out how many buffers to allocate
3421 	 */
3422 	io_count = 0;
3423 	for (io_off = 0; io_off < bp->b_bcount; io_off += contig) {
3424 		contig = 0;
3425 		if (error = ud_bmap_read(ip, (u_offset_t)(start + io_off),
3426 					&bn, &contig)) {
3427 			goto end;
3428 		}
3429 		if (contig == 0) {
3430 			goto end;
3431 		}
3432 		contig = MIN(contig, PAGESIZE - io_off);
3433 		if (bn != UDF_HOLE) {
3434 			io_count ++;
3435 		} else {
3436 			/*
3437 			 * HOLE
3438 			 */
3439 			if (bp->b_flags & B_READ) {
3440 
3441 				/*
3442 				 * This is a hole and is read
3443 				 * it should be filled with 0's
3444 				 */
3445 				pagezero(pp, io_off, contig);
3446 			}
3447 		}
3448 	}
3449 
3450 
3451 	if (io_count != 0) {
3452 
3453 		/*
3454 		 * Allocate memory for all the
3455 		 * required number of buffers
3456 		 */
3457 		alloc_sz = sizeof (mio_master_t) +
3458 			(sizeof (mio_slave_t) * io_count);
3459 		mm = (mio_master_t *)kmem_zalloc(alloc_sz, KM_SLEEP);
3460 		if (mm == NULL) {
3461 			error = ENOMEM;
3462 			goto end;
3463 		}
3464 
3465 		/*
3466 		 * initialize master
3467 		 */
3468 		mutex_init(&mm->mm_mutex, NULL, MUTEX_DEFAULT, NULL);
3469 		mm->mm_size = alloc_sz;
3470 		mm->mm_bp = bp;
3471 		mm->mm_resid = 0;
3472 		mm->mm_error = 0;
3473 		mm->mm_index = master_index++;
3474 
3475 		ms = (mio_slave_t *)(((caddr_t)mm) + sizeof (mio_master_t));
3476 
3477 		/*
3478 		 * Initialize buffers
3479 		 */
3480 		io_count = 0;
3481 		for (io_off = 0; io_off < bp->b_bcount; io_off += contig) {
3482 			contig = 0;
3483 			if (error = ud_bmap_read(ip,
3484 					(u_offset_t)(start + io_off),
3485 					&bn, &contig)) {
3486 				goto end;
3487 			}
3488 			ASSERT(contig);
3489 			if ((io_off + contig) > bp->b_bcount) {
3490 				contig = bp->b_bcount - io_off;
3491 			}
3492 			if (bn != UDF_HOLE) {
3493 				/*
3494 				 * Clone the buffer
3495 				 * and prepare to start I/O
3496 				 */
3497 				ms->ms_ptr = mm;
3498 				bioinit(&ms->ms_buf);
3499 				rbp = bioclone(bp, io_off, (size_t)contig,
3500 					bp->b_edev, bn, ud_slave_done,
3501 					&ms->ms_buf, KM_NOSLEEP);
3502 				ASSERT(rbp == &ms->ms_buf);
3503 				mm->mm_resid += contig;
3504 				io_count++;
3505 				ms ++;
3506 			}
3507 		}
3508 
3509 		/*
3510 		 * Start I/O's
3511 		 */
3512 		ms = (mio_slave_t *)(((caddr_t)mm) + sizeof (mio_master_t));
3513 		for (i = 0; i < io_count; i++) {
3514 			(void) bdev_strategy(&ms->ms_buf);
3515 			ms ++;
3516 		}
3517 	}
3518 
3519 end:
3520 	if (error != 0) {
3521 		bp->b_flags |= B_ERROR;
3522 		bp->b_error = error;
3523 		if (mm != NULL) {
3524 			mutex_destroy(&mm->mm_mutex);
3525 			kmem_free(mm, mm->mm_size);
3526 		}
3527 	}
3528 	return (error);
3529 }
3530 
3531 int32_t
3532 ud_slave_done(struct buf *bp)
3533 {
3534 	mio_master_t *mm;
3535 	int32_t resid;
3536 
3537 	ASSERT(SEMA_HELD(&bp->b_sem));
3538 	ASSERT((bp->b_flags & B_DONE) == 0);
3539 
3540 	mm = ((mio_slave_t *)bp)->ms_ptr;
3541 
3542 	/*
3543 	 * Propagate error and byte count info from slave struct to
3544 	 * the master struct
3545 	 */
3546 	mutex_enter(&mm->mm_mutex);
3547 	if (bp->b_flags & B_ERROR) {
3548 
3549 		/*
3550 		 * If multiple slave buffers get
3551 		 * error we forget the old errors
3552 		 * this is ok because we any way
3553 		 * cannot return multiple errors
3554 		 */
3555 		mm->mm_error = bp->b_error;
3556 	}
3557 	mm->mm_resid -= bp->b_bcount;
3558 	resid = mm->mm_resid;
3559 	mutex_exit(&mm->mm_mutex);
3560 
3561 	/*
3562 	 * free up the resources allocated to cloned buffers.
3563 	 */
3564 	bp_mapout(bp);
3565 	biofini(bp);
3566 
3567 	if (resid == 0) {
3568 
3569 		/*
3570 		 * This is the last I/O operation
3571 		 * clean up and return the original buffer
3572 		 */
3573 		if (mm->mm_error) {
3574 			mm->mm_bp->b_flags |= B_ERROR;
3575 			mm->mm_bp->b_error = mm->mm_error;
3576 		}
3577 		biodone(mm->mm_bp);
3578 		mutex_destroy(&mm->mm_mutex);
3579 		kmem_free(mm, mm->mm_size);
3580 	}
3581 	return (0);
3582 }
3583