xref: /titanic_52/usr/src/uts/common/fs/udfs/udf_vnops.c (revision a93a1f58a8763fa69172980b98e3d24720c1136e)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/types.h>
29 #include <sys/t_lock.h>
30 #include <sys/param.h>
31 #include <sys/time.h>
32 #include <sys/systm.h>
33 #include <sys/sysmacros.h>
34 #include <sys/resource.h>
35 #include <sys/signal.h>
36 #include <sys/cred.h>
37 #include <sys/user.h>
38 #include <sys/buf.h>
39 #include <sys/vfs.h>
40 #include <sys/vfs_opreg.h>
41 #include <sys/stat.h>
42 #include <sys/vnode.h>
43 #include <sys/mode.h>
44 #include <sys/proc.h>
45 #include <sys/disp.h>
46 #include <sys/file.h>
47 #include <sys/fcntl.h>
48 #include <sys/flock.h>
49 #include <sys/kmem.h>
50 #include <sys/uio.h>
51 #include <sys/dnlc.h>
52 #include <sys/conf.h>
53 #include <sys/errno.h>
54 #include <sys/mman.h>
55 #include <sys/fbuf.h>
56 #include <sys/pathname.h>
57 #include <sys/debug.h>
58 #include <sys/vmsystm.h>
59 #include <sys/cmn_err.h>
60 #include <sys/dirent.h>
61 #include <sys/errno.h>
62 #include <sys/modctl.h>
63 #include <sys/statvfs.h>
64 #include <sys/mount.h>
65 #include <sys/sunddi.h>
66 #include <sys/bootconf.h>
67 #include <sys/policy.h>
68 
69 #include <vm/hat.h>
70 #include <vm/page.h>
71 #include <vm/pvn.h>
72 #include <vm/as.h>
73 #include <vm/seg.h>
74 #include <vm/seg_map.h>
75 #include <vm/seg_kmem.h>
76 #include <vm/seg_vn.h>
77 #include <vm/rm.h>
78 #include <vm/page.h>
79 #include <sys/swap.h>
80 
81 #include <fs/fs_subr.h>
82 
83 #include <sys/fs/udf_volume.h>
84 #include <sys/fs/udf_inode.h>
85 
86 static int32_t udf_open(struct vnode **,
87 	int32_t, struct cred *);
88 static int32_t udf_close(struct vnode *,
89 	int32_t, int32_t, offset_t, struct cred *);
90 static int32_t udf_read(struct vnode *,
91 	struct uio *, int32_t, struct cred *, struct caller_context *);
92 static int32_t udf_write(struct vnode *,
93 	struct uio *, int32_t, struct cred *, struct caller_context *);
94 static int32_t udf_ioctl(struct vnode *,
95 	int32_t, intptr_t, int32_t, struct cred *, int32_t *);
96 static int32_t udf_getattr(struct vnode *,
97 	struct vattr *, int32_t, struct cred *);
98 static int32_t udf_setattr(struct vnode *,
99 	struct vattr *, int32_t, struct cred *, caller_context_t *);
100 static int32_t udf_access(struct vnode *,
101 	int32_t, int32_t, struct cred *);
102 static int32_t udf_lookup(struct vnode *,
103 	char *, struct vnode **, struct pathname *,
104 	int32_t, struct vnode *, struct cred *);
105 static int32_t udf_create(struct vnode *,
106 	char *, struct vattr *, enum vcexcl,
107 	int32_t, struct vnode **, struct cred *, int32_t);
108 static int32_t udf_remove(struct vnode *,
109 	char *, struct cred *);
110 static int32_t udf_link(struct vnode *,
111 	struct vnode *, char *, struct cred *);
112 static int32_t udf_rename(struct vnode *,
113 	char *, struct vnode *, char *, struct cred *);
114 static int32_t udf_mkdir(struct vnode *,
115 	char *, struct vattr *, struct vnode **, struct cred *);
116 static int32_t udf_rmdir(struct vnode *,
117 	char *, struct vnode *, struct cred *);
118 static int32_t udf_readdir(struct vnode *,
119 	struct uio *, struct cred *, int32_t *);
120 static int32_t udf_symlink(struct vnode *,
121 	char *, struct vattr *, char *, struct cred *);
122 static int32_t udf_readlink(struct vnode *,
123 	struct uio *, struct cred *);
124 static int32_t udf_fsync(struct vnode *,
125 	int32_t, struct cred *);
126 static void udf_inactive(struct vnode *,
127 	struct cred *);
128 static int32_t udf_fid(struct vnode *, struct fid *);
129 static int udf_rwlock(struct vnode *, int32_t, caller_context_t *);
130 static void udf_rwunlock(struct vnode *, int32_t, caller_context_t *);
131 static int32_t udf_seek(struct vnode *, offset_t, offset_t *);
132 static int32_t udf_frlock(struct vnode *, int32_t,
133 	struct flock64 *, int32_t, offset_t, struct flk_callback *, cred_t *);
134 static int32_t udf_space(struct vnode *, int32_t,
135 	struct flock64 *, int32_t, offset_t, cred_t *, caller_context_t *);
136 static int32_t udf_getpage(struct vnode *, offset_t,
137 	size_t, uint32_t *, struct page **, size_t,
138 	struct seg *, caddr_t, enum seg_rw, struct cred *);
139 static int32_t udf_putpage(struct vnode *, offset_t,
140 	size_t, int32_t, struct cred *);
141 static int32_t udf_map(struct vnode *, offset_t, struct as *,
142 	caddr_t *, size_t, uint8_t, uint8_t, uint32_t, struct cred *);
143 static int32_t udf_addmap(struct vnode *, offset_t, struct as *,
144 	caddr_t, size_t, uint8_t, uint8_t, uint32_t, struct cred *);
145 static int32_t udf_delmap(struct vnode *, offset_t, struct as *,
146 	caddr_t, size_t, uint32_t, uint32_t, uint32_t, struct cred *);
147 static int32_t udf_l_pathconf(struct vnode *, int32_t,
148 	ulong_t *, struct cred *);
149 static int32_t udf_pageio(struct vnode *, struct page *,
150 	u_offset_t, size_t, int32_t, struct cred *);
151 
152 int32_t ud_getpage_miss(struct vnode *, u_offset_t,
153 	size_t, struct seg *, caddr_t, page_t *pl[],
154 	size_t, enum seg_rw, int32_t);
155 void ud_getpage_ra(struct vnode *, u_offset_t, struct seg *, caddr_t);
156 int32_t ud_putpages(struct vnode *, offset_t, size_t, int32_t, struct cred *);
157 int32_t ud_page_fill(struct ud_inode *, page_t *,
158 	u_offset_t, uint32_t, u_offset_t *);
159 int32_t ud_iodone(struct buf *);
160 int32_t ud_rdip(struct ud_inode *, struct uio *, int32_t, cred_t *);
161 int32_t ud_wrip(struct ud_inode *, struct uio *, int32_t, cred_t *);
162 int32_t ud_multi_strat(struct ud_inode *, page_t *, struct buf *, u_offset_t);
163 int32_t ud_slave_done(struct buf *);
164 
165 /*
166  * Structures to control multiple IO operations to get or put pages
167  * that are backed by discontiguous blocks. The master struct is
168  * a dummy that holds the original bp from pageio_setup. The
169  * slave struct holds the working bp's to do the actual IO. Once
170  * all the slave IOs complete. The master is processed as if a single
171  * IO op has completed.
172  */
173 uint32_t master_index = 0;
174 typedef struct mio_master {
175 	kmutex_t	mm_mutex;	/* protect the fields below */
176 	int32_t		mm_size;
177 	buf_t		*mm_bp;		/* original bp */
178 	int32_t		mm_resid;	/* bytes remaining to transfer */
179 	int32_t		mm_error;	/* accumulated error from slaves */
180 	int32_t		mm_index;	/* XXX debugging */
181 } mio_master_t;
182 
183 typedef struct mio_slave {
184 	buf_t		ms_buf;		/* working buffer for this IO chunk */
185 	mio_master_t	*ms_ptr;	/* pointer to master */
186 } mio_slave_t;
187 
188 struct vnodeops *udf_vnodeops;
189 
190 const fs_operation_def_t udf_vnodeops_template[] = {
191 	VOPNAME_OPEN,		{ .vop_open = udf_open },
192 	VOPNAME_CLOSE,		{ .vop_close = udf_close },
193 	VOPNAME_READ,		{ .vop_read = udf_read },
194 	VOPNAME_WRITE,		{ .vop_write = udf_write },
195 	VOPNAME_IOCTL,		{ .vop_ioctl = udf_ioctl },
196 	VOPNAME_GETATTR,	{ .vop_getattr = udf_getattr },
197 	VOPNAME_SETATTR,	{ .vop_setattr = udf_setattr },
198 	VOPNAME_ACCESS,		{ .vop_access = udf_access },
199 	VOPNAME_LOOKUP,		{ .vop_lookup = udf_lookup },
200 	VOPNAME_CREATE,		{ .vop_create = udf_create },
201 	VOPNAME_REMOVE,		{ .vop_remove = udf_remove },
202 	VOPNAME_LINK,		{ .vop_link = udf_link },
203 	VOPNAME_RENAME,		{ .vop_rename = udf_rename },
204 	VOPNAME_MKDIR,		{ .vop_mkdir = udf_mkdir },
205 	VOPNAME_RMDIR,		{ .vop_rmdir = udf_rmdir },
206 	VOPNAME_READDIR,	{ .vop_readdir = udf_readdir },
207 	VOPNAME_SYMLINK,	{ .vop_symlink = udf_symlink },
208 	VOPNAME_READLINK,	{ .vop_readlink = udf_readlink },
209 	VOPNAME_FSYNC,		{ .vop_fsync = udf_fsync },
210 	VOPNAME_INACTIVE,	{ .vop_inactive = udf_inactive },
211 	VOPNAME_FID,		{ .vop_fid = udf_fid },
212 	VOPNAME_RWLOCK,		{ .vop_rwlock = udf_rwlock },
213 	VOPNAME_RWUNLOCK,	{ .vop_rwunlock = udf_rwunlock },
214 	VOPNAME_SEEK,		{ .vop_seek = udf_seek },
215 	VOPNAME_FRLOCK,		{ .vop_frlock = udf_frlock },
216 	VOPNAME_SPACE,		{ .vop_space = udf_space },
217 	VOPNAME_GETPAGE,	{ .vop_getpage = udf_getpage },
218 	VOPNAME_PUTPAGE,	{ .vop_putpage = udf_putpage },
219 	VOPNAME_MAP,		{ .vop_map = udf_map },
220 	VOPNAME_ADDMAP,		{ .vop_addmap = udf_addmap },
221 	VOPNAME_DELMAP,		{ .vop_delmap = udf_delmap },
222 	VOPNAME_PATHCONF,	{ .vop_pathconf = udf_l_pathconf },
223 	VOPNAME_PAGEIO,		{ .vop_pageio = udf_pageio },
224 	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
225 	NULL,			NULL
226 };
227 
228 /* ARGSUSED */
229 static int32_t
230 udf_open(struct vnode **vpp, int32_t flag, struct cred *cr)
231 {
232 	ud_printf("udf_open\n");
233 
234 	return (0);
235 }
236 
237 /* ARGSUSED */
238 static int32_t
239 udf_close(struct vnode *vp, int32_t flag,
240 	int32_t count, offset_t offset, struct cred *cr)
241 {
242 	struct ud_inode *ip = VTOI(vp);
243 
244 	ud_printf("udf_close\n");
245 
246 	ITIMES(ip);
247 
248 	cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
249 	cleanshares(vp, ttoproc(curthread)->p_pid);
250 
251 	/*
252 	 * Push partially filled cluster at last close.
253 	 * ``last close'' is approximated because the dnlc
254 	 * may have a hold on the vnode.
255 	 */
256 	if (vp->v_count <= 2 && vp->v_type != VBAD) {
257 		struct ud_inode *ip = VTOI(vp);
258 		if (ip->i_delaylen) {
259 			(void) ud_putpages(vp, ip->i_delayoff, ip->i_delaylen,
260 					B_ASYNC | B_FREE, cr);
261 			ip->i_delaylen = 0;
262 		}
263 	}
264 
265 	return (0);
266 }
267 
268 static int32_t
269 udf_read(struct vnode *vp, struct uio *uiop,
270 	int32_t ioflag, struct cred *cr, struct caller_context *ct)
271 {
272 	struct ud_inode *ip = VTOI(vp);
273 	int32_t error;
274 
275 	ud_printf("udf_read\n");
276 
277 #ifdef	__lock_lint
278 	rw_enter(&ip->i_rwlock, RW_READER);
279 #endif
280 
281 	ASSERT(RW_READ_HELD(&ip->i_rwlock));
282 
283 	if (MANDLOCK(vp, ip->i_char)) {
284 		/*
285 		 * udf_getattr ends up being called by chklock
286 		 */
287 		error = chklock(vp, FREAD, uiop->uio_loffset,
288 			uiop->uio_resid, uiop->uio_fmode, ct);
289 		if (error) {
290 			goto end;
291 		}
292 	}
293 
294 	rw_enter(&ip->i_contents, RW_READER);
295 	error = ud_rdip(ip, uiop, ioflag, cr);
296 	rw_exit(&ip->i_contents);
297 
298 end:
299 #ifdef	__lock_lint
300 	rw_exit(&ip->i_rwlock);
301 #endif
302 
303 	return (error);
304 }
305 
306 
307 int32_t ud_WRITES = 1;
308 int32_t ud_HW = 96 * 1024;
309 int32_t ud_LW = 64 * 1024;
310 int32_t ud_throttles = 0;
311 
312 static int32_t
313 udf_write(struct vnode *vp, struct uio *uiop,
314 	int32_t ioflag, struct cred *cr, struct caller_context *ct)
315 {
316 	struct ud_inode *ip = VTOI(vp);
317 	int32_t error = 0;
318 
319 	ud_printf("udf_write\n");
320 
321 #ifdef	__lock_lint
322 	rw_enter(&ip->i_rwlock, RW_WRITER);
323 #endif
324 
325 	ASSERT(RW_WRITE_HELD(&ip->i_rwlock));
326 
327 	if (MANDLOCK(vp, ip->i_char)) {
328 		/*
329 		 * ud_getattr ends up being called by chklock
330 		 */
331 		error = chklock(vp, FWRITE, uiop->uio_loffset,
332 			uiop->uio_resid, uiop->uio_fmode, ct);
333 		if (error) {
334 			goto end;
335 		}
336 	}
337 	/*
338 	 * Throttle writes.
339 	 */
340 	mutex_enter(&ip->i_tlock);
341 	if (ud_WRITES && (ip->i_writes > ud_HW)) {
342 		while (ip->i_writes > ud_HW) {
343 			ud_throttles++;
344 			cv_wait(&ip->i_wrcv, &ip->i_tlock);
345 		}
346 	}
347 	mutex_exit(&ip->i_tlock);
348 
349 	/*
350 	 * Write to the file
351 	 */
352 	rw_enter(&ip->i_contents, RW_WRITER);
353 	if ((ioflag & FAPPEND) != 0 && (ip->i_type == VREG)) {
354 		/*
355 		 * In append mode start at end of file.
356 		 */
357 		uiop->uio_loffset = ip->i_size;
358 	}
359 	error = ud_wrip(ip, uiop, ioflag, cr);
360 	rw_exit(&ip->i_contents);
361 
362 end:
363 #ifdef	__lock_lint
364 	rw_exit(&ip->i_rwlock);
365 #endif
366 
367 	return (error);
368 }
369 
370 /* ARGSUSED */
371 static int32_t
372 udf_ioctl(struct vnode *vp, int32_t cmd, intptr_t arg,
373 	int32_t flag, struct cred *cr, int32_t *rvalp)
374 {
375 	return (ENOTTY);
376 }
377 
378 /* ARGSUSED */
379 static int32_t
380 udf_getattr(struct vnode *vp,
381 	struct vattr *vap, int32_t flags, struct cred *cr)
382 {
383 	struct ud_inode *ip = VTOI(vp);
384 
385 	ud_printf("udf_getattr\n");
386 
387 	if (vap->va_mask == AT_SIZE) {
388 		/*
389 		 * for performance, if only the size is requested don't bother
390 		 * with anything else.
391 		 */
392 		vap->va_size = ip->i_size;
393 		return (0);
394 	}
395 
396 	rw_enter(&ip->i_contents, RW_READER);
397 
398 	vap->va_type = vp->v_type;
399 	vap->va_mode = UD2VA_PERM(ip->i_perm) | ip->i_char;
400 
401 	vap->va_uid = ip->i_uid;
402 	vap->va_gid = ip->i_gid;
403 	vap->va_fsid = ip->i_dev;
404 	vap->va_nodeid = ip->i_icb_lbano;
405 	vap->va_nlink = ip->i_nlink;
406 	vap->va_size = ip->i_size;
407 	vap->va_seq = ip->i_seq;
408 	if (vp->v_type == VCHR || vp->v_type == VBLK) {
409 		vap->va_rdev = ip->i_rdev;
410 	} else {
411 		vap->va_rdev = 0;
412 	}
413 
414 	mutex_enter(&ip->i_tlock);
415 	ITIMES_NOLOCK(ip);	/* mark correct time in inode */
416 	vap->va_atime.tv_sec = (time_t)ip->i_atime.tv_sec;
417 	vap->va_atime.tv_nsec = ip->i_atime.tv_nsec;
418 	vap->va_mtime.tv_sec = (time_t)ip->i_mtime.tv_sec;
419 	vap->va_mtime.tv_nsec = ip->i_mtime.tv_nsec;
420 	vap->va_ctime.tv_sec = (time_t)ip->i_ctime.tv_sec;
421 	vap->va_ctime.tv_nsec = ip->i_ctime.tv_nsec;
422 	mutex_exit(&ip->i_tlock);
423 
424 	switch (ip->i_type) {
425 		case VBLK:
426 			vap->va_blksize = MAXBSIZE;
427 			break;
428 		case VCHR:
429 			vap->va_blksize = MAXBSIZE;
430 			break;
431 		default:
432 			vap->va_blksize = ip->i_udf->udf_lbsize;
433 			break;
434 	}
435 	vap->va_nblocks = ip->i_lbr << ip->i_udf->udf_l2d_shift;
436 
437 	rw_exit(&ip->i_contents);
438 
439 	return (0);
440 }
441 
442 static int
443 ud_iaccess_vmode(void *ip, int mode, struct cred *cr)
444 {
445 	return (ud_iaccess(ip, UD_UPERM2DPERM(mode), cr));
446 }
447 
448 /*ARGSUSED4*/
449 static int32_t
450 udf_setattr(
451 	struct vnode *vp,
452 	struct vattr *vap,
453 	int32_t flags,
454 	struct cred *cr,
455 	caller_context_t *ct)
456 {
457 	int32_t error = 0;
458 	uint32_t mask = vap->va_mask;
459 	struct ud_inode *ip;
460 	timestruc_t now;
461 	struct vattr ovap;
462 
463 	ud_printf("udf_setattr\n");
464 
465 	ip = VTOI(vp);
466 
467 	/*
468 	 * not updates allowed to 4096 files
469 	 */
470 	if (ip->i_astrat == STRAT_TYPE4096) {
471 		return (EINVAL);
472 	}
473 
474 	/*
475 	 * Cannot set these attributes
476 	 */
477 	if (mask & AT_NOSET) {
478 		return (EINVAL);
479 	}
480 
481 	rw_enter(&ip->i_rwlock, RW_WRITER);
482 	rw_enter(&ip->i_contents, RW_WRITER);
483 
484 	ovap.va_uid = ip->i_uid;
485 	ovap.va_mode = UD2VA_PERM(ip->i_perm) | ip->i_char;
486 	error = secpolicy_vnode_setattr(cr, vp, vap, &ovap, flags,
487 					    ud_iaccess_vmode, ip);
488 	if (error)
489 		goto update_inode;
490 
491 	mask = vap->va_mask;
492 	/*
493 	 * Change file access modes.
494 	 */
495 	if (mask & AT_MODE) {
496 		ip->i_perm = VA2UD_PERM(vap->va_mode);
497 		ip->i_char = vap->va_mode & (VSUID | VSGID | VSVTX);
498 		mutex_enter(&ip->i_tlock);
499 		ip->i_flag |= ICHG;
500 		mutex_exit(&ip->i_tlock);
501 	}
502 	if (mask & (AT_UID|AT_GID)) {
503 		if (mask & AT_UID) {
504 			ip->i_uid = vap->va_uid;
505 		}
506 		if (mask & AT_GID) {
507 			ip->i_gid = vap->va_gid;
508 		}
509 		mutex_enter(&ip->i_tlock);
510 		ip->i_flag |= ICHG;
511 		mutex_exit(&ip->i_tlock);
512 	}
513 	/*
514 	 * Truncate file.  Must have write permission and not be a directory.
515 	 */
516 	if (mask & AT_SIZE) {
517 		if (vp->v_type == VDIR) {
518 			error = EISDIR;
519 			goto update_inode;
520 		}
521 		if (error = ud_iaccess(ip, IWRITE, cr)) {
522 			goto update_inode;
523 		}
524 		if (vap->va_size > MAXOFFSET_T) {
525 			error = EFBIG;
526 			goto update_inode;
527 		}
528 		if (error = ud_itrunc(ip, vap->va_size, 0, cr)) {
529 			goto update_inode;
530 		}
531 	}
532 	/*
533 	 * Change file access or modified times.
534 	 */
535 	if (mask & (AT_ATIME|AT_MTIME)) {
536 		mutex_enter(&ip->i_tlock);
537 		if (mask & AT_ATIME) {
538 			ip->i_atime.tv_sec = vap->va_atime.tv_sec;
539 			ip->i_atime.tv_nsec = vap->va_atime.tv_nsec;
540 			ip->i_flag &= ~IACC;
541 		}
542 		if (mask & AT_MTIME) {
543 			ip->i_mtime.tv_sec = vap->va_mtime.tv_sec;
544 			ip->i_mtime.tv_nsec = vap->va_mtime.tv_nsec;
545 			gethrestime(&now);
546 			ip->i_ctime.tv_sec = now.tv_sec;
547 			ip->i_ctime.tv_nsec = now.tv_nsec;
548 			ip->i_flag &= ~(IUPD|ICHG);
549 			ip->i_flag |= IMODTIME;
550 		}
551 		ip->i_flag |= IMOD;
552 		mutex_exit(&ip->i_tlock);
553 	}
554 
555 update_inode:
556 	if (curthread->t_flag & T_DONTPEND) {
557 		ud_iupdat(ip, 1);
558 	} else {
559 		ITIMES_NOLOCK(ip);
560 	}
561 	rw_exit(&ip->i_contents);
562 	rw_exit(&ip->i_rwlock);
563 
564 	return (error);
565 }
566 
567 /* ARGSUSED */
568 static int32_t
569 udf_access(struct vnode *vp,
570 	int32_t mode, int32_t flags, struct cred *cr)
571 {
572 	struct ud_inode *ip = VTOI(vp);
573 	int32_t error;
574 
575 	ud_printf("udf_access\n");
576 
577 	if (ip->i_udf == NULL) {
578 		return (EIO);
579 	}
580 
581 	error = ud_iaccess(ip, UD_UPERM2DPERM(mode), cr);
582 
583 	return (error);
584 }
585 
586 int32_t udfs_stickyhack = 1;
587 
588 /* ARGSUSED */
589 static int32_t
590 udf_lookup(struct vnode *dvp,
591 	char *nm, struct vnode **vpp, struct pathname *pnp,
592 	int32_t flags, struct vnode *rdir, struct cred *cr)
593 {
594 	int32_t error;
595 	struct vnode *vp;
596 	struct ud_inode *ip, *xip;
597 
598 	ud_printf("udf_lookup\n");
599 	/*
600 	 * Null component name is a synonym for directory being searched.
601 	 */
602 	if (*nm == '\0') {
603 		VN_HOLD(dvp);
604 		*vpp = dvp;
605 		error = 0;
606 		goto out;
607 	}
608 
609 	/*
610 	 * Fast path: Check the directory name lookup cache.
611 	 */
612 	ip = VTOI(dvp);
613 	if (vp = dnlc_lookup(dvp, nm)) {
614 		/*
615 		 * Check accessibility of directory.
616 		 */
617 		if ((error = ud_iaccess(ip, IEXEC, cr)) != 0) {
618 			VN_RELE(vp);
619 		}
620 		xip = VTOI(vp);
621 	} else {
622 		error = ud_dirlook(ip, nm, &xip, cr, 1);
623 		ITIMES(ip);
624 	}
625 
626 	if (error == 0) {
627 		ip = xip;
628 		*vpp = ITOV(ip);
629 		if ((ip->i_type != VDIR) &&
630 			(ip->i_char & ISVTX) &&
631 			((ip->i_perm & IEXEC) == 0) &&
632 			udfs_stickyhack) {
633 			mutex_enter(&(*vpp)->v_lock);
634 			(*vpp)->v_flag |= VISSWAP;
635 			mutex_exit(&(*vpp)->v_lock);
636 		}
637 		ITIMES(ip);
638 		/*
639 		 * If vnode is a device return special vnode instead.
640 		 */
641 		if (IS_DEVVP(*vpp)) {
642 			struct vnode *newvp;
643 			newvp = specvp(*vpp, (*vpp)->v_rdev,
644 					(*vpp)->v_type, cr);
645 			VN_RELE(*vpp);
646 			if (newvp == NULL) {
647 				error = ENOSYS;
648 			} else {
649 				*vpp = newvp;
650 			}
651 		}
652 	}
653 out:
654 	return (error);
655 }
656 
657 /* ARGSUSED */
658 static int32_t
659 udf_create(struct vnode *dvp,
660 	char *name, struct vattr *vap, enum vcexcl excl,
661 	int32_t mode, struct vnode **vpp, struct cred *cr, int32_t flag)
662 {
663 	int32_t error;
664 	struct ud_inode *ip = VTOI(dvp), *xip;
665 
666 	ud_printf("udf_create\n");
667 
668 	if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr) != 0)
669 		vap->va_mode &= ~VSVTX;
670 
671 	if (*name == '\0') {
672 		/*
673 		 * Null component name refers to the directory itself.
674 		 */
675 		VN_HOLD(dvp);
676 		ITIMES(ip);
677 		error = EEXIST;
678 	} else {
679 		xip = NULL;
680 		rw_enter(&ip->i_rwlock, RW_WRITER);
681 		error = ud_direnter(ip, name, DE_CREATE,
682 				(struct ud_inode *)0, (struct ud_inode *)0,
683 				vap, &xip, cr);
684 		rw_exit(&ip->i_rwlock);
685 		ITIMES(ip);
686 		ip = xip;
687 	}
688 #ifdef	__lock_lint
689 	rw_enter(&ip->i_contents, RW_WRITER);
690 #else
691 	if (ip != NULL) {
692 		rw_enter(&ip->i_contents, RW_WRITER);
693 	}
694 #endif
695 
696 	/*
697 	 * If the file already exists and this is a non-exclusive create,
698 	 * check permissions and allow access for non-directories.
699 	 * Read-only create of an existing directory is also allowed.
700 	 * We fail an exclusive create of anything which already exists.
701 	 */
702 	if (error == EEXIST) {
703 		if (excl == NONEXCL) {
704 			if ((ip->i_type == VDIR) && (mode & VWRITE)) {
705 				error = EISDIR;
706 			} else if (mode) {
707 				error = ud_iaccess(ip,
708 					UD_UPERM2DPERM(mode), cr);
709 			} else {
710 				error = 0;
711 			}
712 		}
713 		if (error) {
714 			rw_exit(&ip->i_contents);
715 			VN_RELE(ITOV(ip));
716 			goto out;
717 		} else if ((ip->i_type == VREG) &&
718 			(vap->va_mask & AT_SIZE) && vap->va_size == 0) {
719 			/*
720 			 * Truncate regular files, if requested by caller.
721 			 * Grab i_rwlock to make sure no one else is
722 			 * currently writing to the file (we promised
723 			 * bmap we would do this).
724 			 * Must get the locks in the correct order.
725 			 */
726 			if (ip->i_size == 0) {
727 				ip->i_flag |= ICHG | IUPD;
728 			} else {
729 				rw_exit(&ip->i_contents);
730 				rw_enter(&ip->i_rwlock, RW_WRITER);
731 				rw_enter(&ip->i_contents, RW_WRITER);
732 				(void) ud_itrunc(ip, 0, 0, cr);
733 				rw_exit(&ip->i_rwlock);
734 			}
735 			vnevent_create(ITOV(ip));
736 		}
737 	}
738 
739 	if (error == 0) {
740 		*vpp = ITOV(ip);
741 		ITIMES(ip);
742 	}
743 #ifdef	__lock_lint
744 	rw_exit(&ip->i_contents);
745 #else
746 	if (ip != NULL) {
747 		rw_exit(&ip->i_contents);
748 	}
749 #endif
750 	if (error) {
751 		goto out;
752 	}
753 
754 	/*
755 	 * If vnode is a device return special vnode instead.
756 	 */
757 	if (!error && IS_DEVVP(*vpp)) {
758 		struct vnode *newvp;
759 
760 		newvp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
761 		VN_RELE(*vpp);
762 		if (newvp == NULL) {
763 			error = ENOSYS;
764 			goto out;
765 		}
766 		*vpp = newvp;
767 	}
768 out:
769 	return (error);
770 }
771 
772 static int32_t
773 udf_remove(struct vnode *vp, char *nm, struct cred *cr)
774 {
775 	int32_t error;
776 	struct ud_inode *ip = VTOI(vp);
777 
778 	ud_printf("udf_remove\n");
779 
780 	rw_enter(&ip->i_rwlock, RW_WRITER);
781 	error = ud_dirremove(ip, nm,
782 		(struct ud_inode *)0, (struct vnode *)0, DR_REMOVE, cr);
783 	rw_exit(&ip->i_rwlock);
784 	ITIMES(ip);
785 
786 	return (error);
787 }
788 
789 static int32_t
790 udf_link(struct vnode *tdvp,
791 	struct vnode *svp, char *tnm, struct cred *cr)
792 {
793 	int32_t error;
794 	struct vnode *realvp;
795 	struct ud_inode *sip;
796 	struct ud_inode *tdp;
797 
798 	ud_printf("udf_link\n");
799 	if (VOP_REALVP(svp, &realvp) == 0) {
800 		svp = realvp;
801 	}
802 
803 	/*
804 	 * Do not allow links to directories
805 	 */
806 	if (svp->v_type == VDIR) {
807 		return (EPERM);
808 	}
809 
810 	sip = VTOI(svp);
811 
812 	if (sip->i_uid != crgetuid(cr) && secpolicy_basic_link(cr) != 0)
813 		return (EPERM);
814 
815 	tdp = VTOI(tdvp);
816 
817 	rw_enter(&tdp->i_rwlock, RW_WRITER);
818 	error = ud_direnter(tdp, tnm, DE_LINK, (struct ud_inode *)0,
819 		sip, (struct vattr *)0, (struct ud_inode **)0, cr);
820 	rw_exit(&tdp->i_rwlock);
821 	ITIMES(sip);
822 	ITIMES(tdp);
823 
824 	if (error == 0) {
825 		vnevent_link(svp);
826 	}
827 
828 	return (error);
829 }
830 
831 /* ARGSUSED */
832 static int32_t
833 udf_rename(struct vnode *sdvp,
834 	char *snm, struct vnode *tdvp,
835 	char *tnm, struct cred *cr)
836 {
837 	int32_t error = 0;
838 	struct udf_vfs *udf_vfsp;
839 	struct ud_inode *sip;		/* source inode */
840 	struct ud_inode *sdp, *tdp;	/* source and target parent inode */
841 	struct vnode *realvp;
842 
843 	ud_printf("udf_rename\n");
844 
845 	if (VOP_REALVP(tdvp, &realvp) == 0) {
846 		tdvp = realvp;
847 	}
848 
849 	sdp = VTOI(sdvp);
850 	tdp = VTOI(tdvp);
851 
852 	udf_vfsp = sdp->i_udf;
853 
854 	mutex_enter(&udf_vfsp->udf_rename_lck);
855 	/*
856 	 * Look up inode of file we're supposed to rename.
857 	 */
858 	if (error = ud_dirlook(sdp, snm, &sip, cr, 0)) {
859 		mutex_exit(&udf_vfsp->udf_rename_lck);
860 		return (error);
861 	}
862 	/*
863 	 * be sure this is not a directory with another file system mounted
864 	 * over it.  If it is just give up the locks, and return with
865 	 * EBUSY
866 	 */
867 	if (vn_mountedvfs(ITOV(sip)) != NULL) {
868 		error = EBUSY;
869 		goto errout;
870 	}
871 	/*
872 	 * Make sure we can delete the source entry.  This requires
873 	 * write permission on the containing directory.  If that
874 	 * directory is "sticky" it further requires (except for
875 	 * privileged users) that the user own the directory or the
876 	 * source entry, or else have permission to write the source
877 	 * entry.
878 	 */
879 	rw_enter(&sdp->i_contents, RW_READER);
880 	rw_enter(&sip->i_contents, RW_READER);
881 	if ((error = ud_iaccess(sdp, IWRITE, cr)) != 0 ||
882 	    (error = ud_sticky_remove_access(sdp, sip, cr)) != 0) {
883 		rw_exit(&sip->i_contents);
884 		rw_exit(&sdp->i_contents);
885 		ITIMES(sip);
886 		goto errout;
887 	}
888 
889 	/*
890 	 * Check for renaming '.' or '..' or alias of '.'
891 	 */
892 	if ((strcmp(snm, ".") == 0) ||
893 			(strcmp(snm, "..") == 0) ||
894 			(sdp == sip)) {
895 		error = EINVAL;
896 		rw_exit(&sip->i_contents);
897 		rw_exit(&sdp->i_contents);
898 		goto errout;
899 	}
900 	rw_exit(&sip->i_contents);
901 	rw_exit(&sdp->i_contents);
902 
903 
904 	/*
905 	 * Link source to the target.
906 	 */
907 	rw_enter(&tdp->i_rwlock, RW_WRITER);
908 	if (error = ud_direnter(tdp, tnm, DE_RENAME, sdp, sip,
909 			(struct vattr *)0, (struct ud_inode **)0, cr)) {
910 		/*
911 		 * ESAME isn't really an error; it indicates that the
912 		 * operation should not be done because the source and target
913 		 * are the same file, but that no error should be reported.
914 		 */
915 		if (error == ESAME) {
916 			error = 0;
917 		}
918 		rw_exit(&tdp->i_rwlock);
919 		goto errout;
920 	}
921 	vnevent_rename_src(ITOV(sip), sdvp, snm);
922 	rw_exit(&tdp->i_rwlock);
923 
924 	rw_enter(&sdp->i_rwlock, RW_WRITER);
925 	/*
926 	 * Unlink the source.
927 	 * Remove the source entry.  ud_dirremove() checks that the entry
928 	 * still reflects sip, and returns an error if it doesn't.
929 	 * If the entry has changed just forget about it.  Release
930 	 * the source inode.
931 	 */
932 	if ((error = ud_dirremove(sdp, snm, sip, (struct vnode *)0,
933 			DR_RENAME, cr)) == ENOENT) {
934 		error = 0;
935 	}
936 	rw_exit(&sdp->i_rwlock);
937 errout:
938 	ITIMES(sdp);
939 	ITIMES(tdp);
940 	VN_RELE(ITOV(sip));
941 	mutex_exit(&udf_vfsp->udf_rename_lck);
942 
943 	return (error);
944 }
945 
946 static int32_t
947 udf_mkdir(struct vnode *dvp,
948 	char *dirname, struct vattr *vap,
949 	struct vnode **vpp, struct cred *cr)
950 {
951 	int32_t error;
952 	struct ud_inode *ip;
953 	struct ud_inode *xip;
954 
955 	ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
956 
957 	ud_printf("udf_mkdir\n");
958 
959 	ip = VTOI(dvp);
960 	rw_enter(&ip->i_rwlock, RW_WRITER);
961 	error = ud_direnter(ip, dirname, DE_MKDIR,
962 		(struct ud_inode *)0, (struct ud_inode *)0, vap, &xip, cr);
963 	rw_exit(&ip->i_rwlock);
964 	ITIMES(ip);
965 	if (error == 0) {
966 		ip = xip;
967 		*vpp = ITOV(ip);
968 		ITIMES(ip);
969 	} else if (error == EEXIST) {
970 		ITIMES(xip);
971 		VN_RELE(ITOV(xip));
972 	}
973 
974 	return (error);
975 }
976 
977 static int32_t
978 udf_rmdir(struct vnode *vp,
979 	char *nm, struct vnode *cdir, struct cred *cr)
980 {
981 	int32_t error;
982 	struct ud_inode *ip = VTOI(vp);
983 
984 	ud_printf("udf_rmdir\n");
985 
986 	rw_enter(&ip->i_rwlock, RW_WRITER);
987 	error = ud_dirremove(ip, nm, (struct ud_inode *)0, cdir, DR_RMDIR, cr);
988 	rw_exit(&ip->i_rwlock);
989 	ITIMES(ip);
990 
991 	return (error);
992 }
993 
994 /* ARGSUSED */
995 static int32_t
996 udf_readdir(struct vnode *vp,
997 	struct uio *uiop, struct cred *cr, int32_t *eofp)
998 {
999 	struct ud_inode *ip;
1000 	struct dirent64 *nd;
1001 	struct udf_vfs *udf_vfsp;
1002 	int32_t error = 0, len, outcount = 0;
1003 	uint32_t dirsiz, offset;
1004 	uint32_t bufsize, ndlen, dummy;
1005 	caddr_t outbuf;
1006 	caddr_t outb, end_outb;
1007 	struct iovec *iovp;
1008 
1009 	uint8_t *dname;
1010 	int32_t length;
1011 
1012 	uint8_t *buf = NULL;
1013 
1014 	struct fbuf *fbp = NULL;
1015 	struct file_id *fid;
1016 	uint8_t *name;
1017 
1018 
1019 	ud_printf("udf_readdir\n");
1020 
1021 	ip = VTOI(vp);
1022 	udf_vfsp = ip->i_udf;
1023 
1024 	dirsiz = ip->i_size;
1025 	if ((uiop->uio_offset >= dirsiz) ||
1026 			(ip->i_nlink <= 0)) {
1027 		if (eofp) {
1028 			*eofp = 1;
1029 		}
1030 		return (0);
1031 	}
1032 
1033 	offset = uiop->uio_offset;
1034 	iovp = uiop->uio_iov;
1035 	bufsize = iovp->iov_len;
1036 
1037 	outb = outbuf = (char *)kmem_alloc((uint32_t)bufsize, KM_SLEEP);
1038 	end_outb = outb + bufsize;
1039 	nd = (struct dirent64 *)outbuf;
1040 
1041 	dname = (uint8_t *)kmem_zalloc(1024, KM_SLEEP);
1042 	buf = (uint8_t *)kmem_zalloc(udf_vfsp->udf_lbsize, KM_SLEEP);
1043 
1044 	if (offset == 0) {
1045 		len = DIRENT64_RECLEN(1);
1046 		if (((caddr_t)nd + len) >= end_outb) {
1047 			error = EINVAL;
1048 			goto end;
1049 		}
1050 		nd->d_ino = ip->i_icb_lbano;
1051 		nd->d_reclen = (uint16_t)len;
1052 		nd->d_off = 0x10;
1053 		nd->d_name[0] = '.';
1054 		bzero(&nd->d_name[1], DIRENT64_NAMELEN(len) - 1);
1055 		nd = (struct dirent64 *)((char *)nd + nd->d_reclen);
1056 		outcount++;
1057 	} else if (offset == 0x10) {
1058 		offset = 0;
1059 	}
1060 
1061 	while (offset < dirsiz) {
1062 		error = ud_get_next_fid(ip, &fbp,
1063 				offset, &fid, &name, buf);
1064 		if (error != 0) {
1065 			break;
1066 		}
1067 
1068 		if ((fid->fid_flags & FID_DELETED) == 0) {
1069 			if (fid->fid_flags & FID_PARENT) {
1070 
1071 				len = DIRENT64_RECLEN(2);
1072 				if (((caddr_t)nd + len) >= end_outb) {
1073 					error = EINVAL;
1074 					break;
1075 				}
1076 
1077 				nd->d_ino = ip->i_icb_lbano;
1078 				nd->d_reclen = (uint16_t)len;
1079 				nd->d_off = offset + FID_LEN(fid);
1080 				nd->d_name[0] = '.';
1081 				nd->d_name[1] = '.';
1082 				bzero(&nd->d_name[2],
1083 				    DIRENT64_NAMELEN(len) - 2);
1084 				nd = (struct dirent64 *)
1085 					((char *)nd + nd->d_reclen);
1086 			} else {
1087 				if ((error = ud_uncompress(fid->fid_idlen,
1088 						&length, name, dname)) != 0) {
1089 					break;
1090 				}
1091 				if (length == 0) {
1092 					offset += FID_LEN(fid);
1093 					continue;
1094 				}
1095 				len = DIRENT64_RECLEN(length);
1096 				if (((caddr_t)nd + len) >= end_outb) {
1097 					if (!outcount) {
1098 						error = EINVAL;
1099 					}
1100 					break;
1101 				}
1102 				(void) strncpy(nd->d_name,
1103 				    (caddr_t)dname, length);
1104 				bzero(&nd->d_name[length],
1105 				    DIRENT64_NAMELEN(len) - length);
1106 				nd->d_ino = ud_xlate_to_daddr(udf_vfsp,
1107 					SWAP_16(fid->fid_icb.lad_ext_prn),
1108 					SWAP_32(fid->fid_icb.lad_ext_loc), 1,
1109 					&dummy);
1110 				nd->d_reclen = (uint16_t)len;
1111 				nd->d_off = offset + FID_LEN(fid);
1112 				nd = (struct dirent64 *)
1113 					((char *)nd + nd->d_reclen);
1114 			}
1115 			outcount++;
1116 		}
1117 
1118 		offset += FID_LEN(fid);
1119 	}
1120 
1121 end:
1122 	if (fbp != NULL) {
1123 		fbrelse(fbp, S_OTHER);
1124 	}
1125 	ndlen = ((char *)nd - outbuf);
1126 	/*
1127 	 * In case of error do not call uiomove.
1128 	 * Return the error to the caller.
1129 	 */
1130 	if ((error == 0) && (ndlen != 0)) {
1131 		error = uiomove(outbuf, (long)ndlen, UIO_READ, uiop);
1132 		uiop->uio_offset = offset;
1133 	}
1134 	kmem_free((caddr_t)buf, udf_vfsp->udf_lbsize);
1135 	kmem_free((caddr_t)dname, 1024);
1136 	kmem_free(outbuf, (uint32_t)bufsize);
1137 	if (eofp && error == 0) {
1138 		*eofp = (uiop->uio_offset >= dirsiz);
1139 	}
1140 	return (error);
1141 }
1142 
1143 /* ARGSUSED */
1144 static int32_t
1145 udf_symlink(struct vnode *dvp,
1146 	char *linkname, struct vattr *vap,
1147 	char *target, struct cred *cr)
1148 {
1149 	int32_t error = 0, outlen;
1150 	uint32_t ioflag = 0;
1151 	struct ud_inode *ip, *dip = VTOI(dvp);
1152 
1153 	struct path_comp *pc;
1154 	int8_t *dname = NULL, *uname = NULL, *sp;
1155 
1156 	ud_printf("udf_symlink\n");
1157 
1158 	ip = (struct ud_inode *)0;
1159 	vap->va_type = VLNK;
1160 	vap->va_rdev = 0;
1161 
1162 	rw_enter(&dip->i_rwlock, RW_WRITER);
1163 	error = ud_direnter(dip, linkname, DE_CREATE,
1164 		(struct ud_inode *)0, (struct ud_inode *)0, vap, &ip, cr);
1165 	rw_exit(&dip->i_rwlock);
1166 	if (error == 0) {
1167 		dname = kmem_zalloc(1024, KM_SLEEP);
1168 		uname = kmem_zalloc(PAGESIZE, KM_SLEEP);
1169 
1170 		pc = (struct path_comp *)uname;
1171 		/*
1172 		 * If the first character in target is "/"
1173 		 * then skip it and create entry for it
1174 		 */
1175 		if (*target == '/') {
1176 			pc->pc_type = 2;
1177 			pc->pc_len = 0;
1178 			pc = (struct path_comp *)(((char *)pc) + 4);
1179 			while (*target == '/') {
1180 				target++;
1181 			}
1182 		}
1183 
1184 		while (*target != NULL) {
1185 			sp = target;
1186 			while ((*target != '/') && (*target != '\0')) {
1187 				target ++;
1188 			}
1189 			/*
1190 			 * We got the next component of the
1191 			 * path name. Create path_comp of
1192 			 * appropriate type
1193 			 */
1194 			if (((target - sp) == 1) && (*sp == '.')) {
1195 				/*
1196 				 * Dot entry.
1197 				 */
1198 				pc->pc_type = 4;
1199 				pc = (struct path_comp *)(((char *)pc) + 4);
1200 			} else if (((target - sp) == 2) &&
1201 				(*sp == '.') && ((*(sp + 1)) == '.')) {
1202 				/*
1203 				 * DotDot entry.
1204 				 */
1205 				pc->pc_type = 3;
1206 				pc = (struct path_comp *)(((char *)pc) + 4);
1207 			} else {
1208 				/*
1209 				 * convert the user given name
1210 				 * into appropriate form to be put
1211 				 * on the media
1212 				 */
1213 				outlen = 1024;	/* set to size of dname */
1214 				if (error = ud_compress(target - sp, &outlen,
1215 					(uint8_t *)sp, (uint8_t *)dname)) {
1216 					break;
1217 				}
1218 				pc->pc_type = 5;
1219 				/* LINTED */
1220 				pc->pc_len = outlen;
1221 				dname[outlen] = '\0';
1222 				(void) strcpy((char *)pc->pc_id, dname);
1223 				pc = (struct path_comp *)
1224 					(((char *)pc) + 4 + outlen);
1225 			}
1226 			while (*target == '/') {
1227 				target++;
1228 			}
1229 			if (*target == NULL) {
1230 				break;
1231 			}
1232 		}
1233 
1234 		rw_enter(&ip->i_contents, RW_WRITER);
1235 		if (error == 0) {
1236 			ioflag = FWRITE;
1237 			if (curthread->t_flag & T_DONTPEND) {
1238 				ioflag |= FDSYNC;
1239 			}
1240 			error = ud_rdwri(UIO_WRITE, ioflag, ip,
1241 				uname, ((int8_t *)pc) - uname,
1242 				(offset_t)0, UIO_SYSSPACE, (int32_t *)0, cr);
1243 		}
1244 		if (error) {
1245 			ud_idrop(ip);
1246 			rw_exit(&ip->i_contents);
1247 			rw_enter(&dip->i_rwlock, RW_WRITER);
1248 			(void) ud_dirremove(dip, linkname, (struct ud_inode *)0,
1249 					(struct vnode *)0, DR_REMOVE, cr);
1250 			rw_exit(&dip->i_rwlock);
1251 			goto update_inode;
1252 		}
1253 		rw_exit(&ip->i_contents);
1254 	}
1255 
1256 	if ((error == 0) || (error == EEXIST)) {
1257 		VN_RELE(ITOV(ip));
1258 	}
1259 
1260 update_inode:
1261 	ITIMES(VTOI(dvp));
1262 	if (uname != NULL) {
1263 		kmem_free(uname, PAGESIZE);
1264 	}
1265 	if (dname != NULL) {
1266 		kmem_free(dname, 1024);
1267 	}
1268 
1269 	return (error);
1270 }
1271 
1272 /* ARGSUSED */
1273 static int32_t
1274 udf_readlink(struct vnode *vp,
1275 	struct uio *uiop, struct cred *cr)
1276 {
1277 	int32_t error = 0, off, id_len, size, len;
1278 	int8_t *dname = NULL, *uname = NULL;
1279 	struct ud_inode *ip;
1280 	struct fbuf *fbp = NULL;
1281 	struct path_comp *pc;
1282 
1283 	ud_printf("udf_readlink\n");
1284 
1285 	if (vp->v_type != VLNK) {
1286 		return (EINVAL);
1287 	}
1288 
1289 	ip = VTOI(vp);
1290 	size = ip->i_size;
1291 	if (size > PAGESIZE) {
1292 		return (EIO);
1293 	}
1294 
1295 	if (size == 0) {
1296 		return (0);
1297 	}
1298 
1299 	dname = kmem_zalloc(1024, KM_SLEEP);
1300 	uname = kmem_zalloc(PAGESIZE, KM_SLEEP);
1301 
1302 	rw_enter(&ip->i_contents, RW_READER);
1303 
1304 	if ((error = fbread(vp, 0, size, S_READ, &fbp)) != 0) {
1305 		goto end;
1306 	}
1307 
1308 	off = 0;
1309 
1310 	while (off < size) {
1311 		pc = (struct path_comp *)(fbp->fb_addr + off);
1312 		switch (pc->pc_type) {
1313 			case 1 :
1314 				(void) strcpy(uname, ip->i_udf->udf_fsmnt);
1315 				(void) strcat(uname, "/");
1316 				break;
1317 			case 2 :
1318 				if (pc->pc_len != 0) {
1319 					goto end;
1320 				}
1321 				uname[0] = '/';
1322 				uname[1] = '\0';
1323 				break;
1324 			case 3 :
1325 				(void) strcat(uname, "../");
1326 				break;
1327 			case 4 :
1328 				(void) strcat(uname, "./");
1329 				break;
1330 			case 5 :
1331 				if ((error = ud_uncompress(pc->pc_len, &id_len,
1332 					pc->pc_id, (uint8_t *)dname)) != 0) {
1333 					break;
1334 				}
1335 				dname[id_len] = '\0';
1336 				(void) strcat(uname, dname);
1337 				(void) strcat(uname, "/");
1338 				break;
1339 			default :
1340 				error = EINVAL;
1341 				goto end;
1342 		}
1343 		off += 4 + pc->pc_len;
1344 	}
1345 	len = strlen(uname) - 1;
1346 	if (uname[len] == '/') {
1347 		if (len == 0) {
1348 			/*
1349 			 * special case link to /
1350 			 */
1351 			len = 1;
1352 		} else {
1353 			uname[len] = '\0';
1354 		}
1355 	}
1356 
1357 	error = uiomove(uname, len, UIO_READ, uiop);
1358 
1359 	ITIMES(ip);
1360 
1361 end:
1362 	if (fbp != NULL) {
1363 		fbrelse(fbp, S_OTHER);
1364 	}
1365 	rw_exit(&ip->i_contents);
1366 	if (uname != NULL) {
1367 		kmem_free(uname, PAGESIZE);
1368 	}
1369 	if (dname != NULL) {
1370 		kmem_free(dname, 1024);
1371 	}
1372 	return (error);
1373 }
1374 
1375 /* ARGSUSED */
1376 static int32_t
1377 udf_fsync(struct vnode *vp,
1378 	int32_t syncflag, struct cred *cr)
1379 {
1380 	int32_t error = 0;
1381 	struct ud_inode *ip = VTOI(vp);
1382 
1383 	ud_printf("udf_fsync\n");
1384 
1385 	rw_enter(&ip->i_contents, RW_WRITER);
1386 	if (!(IS_SWAPVP(vp))) {
1387 		error = ud_syncip(ip, 0, I_SYNC); /* Do synchronous writes */
1388 	}
1389 	if (error == 0) {
1390 		error = ud_sync_indir(ip);
1391 	}
1392 	ITIMES(ip);		/* XXX: is this necessary ??? */
1393 	rw_exit(&ip->i_contents);
1394 
1395 	return (error);
1396 }
1397 
1398 /* ARGSUSED */
1399 static void
1400 udf_inactive(struct vnode *vp, struct cred *cr)
1401 {
1402 	ud_printf("udf_iinactive\n");
1403 
1404 	ud_iinactive(VTOI(vp), cr);
1405 }
1406 
1407 static int32_t
1408 udf_fid(struct vnode *vp, struct fid *fidp)
1409 {
1410 	struct udf_fid *udfidp;
1411 	struct ud_inode *ip = VTOI(vp);
1412 
1413 	ud_printf("udf_fid\n");
1414 
1415 	if (fidp->fid_len < (sizeof (struct udf_fid) - sizeof (uint16_t))) {
1416 		fidp->fid_len = sizeof (struct udf_fid) - sizeof (uint16_t);
1417 		return (ENOSPC);
1418 	}
1419 
1420 	udfidp = (struct udf_fid *)fidp;
1421 	bzero((char *)udfidp, sizeof (struct udf_fid));
1422 	rw_enter(&ip->i_contents, RW_READER);
1423 	udfidp->udfid_len = sizeof (struct udf_fid) - sizeof (uint16_t);
1424 	udfidp->udfid_uinq_lo = ip->i_uniqid & 0xffffffff;
1425 	udfidp->udfid_prn = ip->i_icb_prn;
1426 	udfidp->udfid_icb_lbn = ip->i_icb_block;
1427 	rw_exit(&ip->i_contents);
1428 
1429 	return (0);
1430 }
1431 
1432 /* ARGSUSED2 */
1433 static int
1434 udf_rwlock(struct vnode *vp, int32_t write_lock, caller_context_t *ctp)
1435 {
1436 	struct ud_inode *ip = VTOI(vp);
1437 
1438 	ud_printf("udf_rwlock\n");
1439 
1440 	if (write_lock) {
1441 		rw_enter(&ip->i_rwlock, RW_WRITER);
1442 	} else {
1443 		rw_enter(&ip->i_rwlock, RW_READER);
1444 	}
1445 #ifdef	__lock_lint
1446 	rw_exit(&ip->i_rwlock);
1447 #endif
1448 	return (write_lock);
1449 }
1450 
1451 /* ARGSUSED */
1452 static void
1453 udf_rwunlock(struct vnode *vp, int32_t write_lock, caller_context_t *ctp)
1454 {
1455 	struct ud_inode *ip = VTOI(vp);
1456 
1457 	ud_printf("udf_rwunlock\n");
1458 
1459 #ifdef	__lock_lint
1460 	rw_enter(&ip->i_rwlock, RW_WRITER);
1461 #endif
1462 
1463 	rw_exit(&ip->i_rwlock);
1464 
1465 }
1466 
1467 /* ARGSUSED */
1468 static int32_t
1469 udf_seek(struct vnode *vp, offset_t ooff, offset_t *noffp)
1470 {
1471 	return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
1472 }
1473 
1474 static int32_t
1475 udf_frlock(struct vnode *vp, int32_t cmd, struct flock64 *bfp,
1476 	int32_t flag, offset_t offset, struct flk_callback *flk_cbp,
1477 	cred_t *cr)
1478 {
1479 	struct ud_inode *ip = VTOI(vp);
1480 
1481 	ud_printf("udf_frlock\n");
1482 
1483 	/*
1484 	 * If file is being mapped, disallow frlock.
1485 	 * XXX I am not holding tlock while checking i_mapcnt because the
1486 	 * current locking strategy drops all locks before calling fs_frlock.
1487 	 * So, mapcnt could change before we enter fs_frlock making is
1488 	 * meaningless to have held tlock in the first place.
1489 	 */
1490 	if ((ip->i_mapcnt > 0) &&
1491 		(MANDLOCK(vp, ip->i_char))) {
1492 		return (EAGAIN);
1493 	}
1494 
1495 	return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr));
1496 }
1497 
1498 /*ARGSUSED6*/
1499 static int32_t
1500 udf_space(
1501 	struct vnode *vp,
1502 	int32_t cmd,
1503 	struct flock64 *bfp,
1504 	int32_t flag,
1505 	offset_t offset,
1506 	cred_t *cr,
1507 	caller_context_t *ct)
1508 {
1509 	int32_t error = 0;
1510 
1511 	ud_printf("udf_space\n");
1512 
1513 	if (cmd != F_FREESP) {
1514 		error =  EINVAL;
1515 	} else if ((error = convoff(vp, bfp, 0, offset)) == 0) {
1516 		error = ud_freesp(vp, bfp, flag, cr);
1517 	}
1518 
1519 	return (error);
1520 }
1521 
1522 /* ARGSUSED */
1523 static int32_t
1524 udf_getpage(struct vnode *vp, offset_t off,
1525 	size_t len, uint32_t *protp, struct page  **plarr,
1526 	size_t plsz, struct seg *seg, caddr_t addr,
1527 	enum seg_rw rw, struct cred *cr)
1528 {
1529 	struct ud_inode *ip = VTOI(vp);
1530 	int32_t error, has_holes, beyond_eof, seqmode, dolock;
1531 	int32_t pgsize = PAGESIZE;
1532 	struct udf_vfs *udf_vfsp = ip->i_udf;
1533 	page_t **pl;
1534 	u_offset_t pgoff, eoff, uoff;
1535 	krw_t rwtype;
1536 	caddr_t pgaddr;
1537 
1538 	ud_printf("udf_getpage\n");
1539 
1540 	uoff = (u_offset_t)off; /* type conversion */
1541 	if (protp) {
1542 		*protp = PROT_ALL;
1543 	}
1544 	if (vp->v_flag & VNOMAP) {
1545 		return (ENOSYS);
1546 	}
1547 	seqmode = ip->i_nextr == uoff && rw != S_CREATE;
1548 
1549 	rwtype = RW_READER;
1550 	dolock = (rw_owner(&ip->i_contents) != curthread);
1551 retrylock:
1552 #ifdef	__lock_lint
1553 	rw_enter(&ip->i_contents, rwtype);
1554 #else
1555 	if (dolock) {
1556 		rw_enter(&ip->i_contents, rwtype);
1557 	}
1558 #endif
1559 
1560 	/*
1561 	 * We may be getting called as a side effect of a bmap using
1562 	 * fbread() when the blocks might be being allocated and the
1563 	 * size has not yet been up'ed.  In this case we want to be
1564 	 * able to return zero pages if we get back UDF_HOLE from
1565 	 * calling bmap for a non write case here.  We also might have
1566 	 * to read some frags from the disk into a page if we are
1567 	 * extending the number of frags for a given lbn in bmap().
1568 	 */
1569 	beyond_eof = uoff + len > ip->i_size + PAGEOFFSET;
1570 	if (beyond_eof && seg != segkmap) {
1571 #ifdef	__lock_lint
1572 		rw_exit(&ip->i_contents);
1573 #else
1574 		if (dolock) {
1575 			rw_exit(&ip->i_contents);
1576 		}
1577 #endif
1578 		return (EFAULT);
1579 	}
1580 
1581 	/*
1582 	 * Must hold i_contents lock throughout the call to pvn_getpages
1583 	 * since locked pages are returned from each call to ud_getapage.
1584 	 * Must *not* return locked pages and then try for contents lock
1585 	 * due to lock ordering requirements (inode > page)
1586 	 */
1587 
1588 	has_holes = ud_bmap_has_holes(ip);
1589 
1590 	if ((rw == S_WRITE || rw == S_CREATE) && (has_holes || beyond_eof)) {
1591 		int32_t	blk_size, count;
1592 		u_offset_t offset;
1593 
1594 		/*
1595 		 * We must acquire the RW_WRITER lock in order to
1596 		 * call bmap_write().
1597 		 */
1598 		if (dolock && rwtype == RW_READER) {
1599 			rwtype = RW_WRITER;
1600 
1601 			if (!rw_tryupgrade(&ip->i_contents)) {
1602 
1603 				rw_exit(&ip->i_contents);
1604 
1605 				goto retrylock;
1606 			}
1607 		}
1608 
1609 		/*
1610 		 * May be allocating disk blocks for holes here as
1611 		 * a result of mmap faults. write(2) does the bmap_write
1612 		 * in rdip/wrip, not here. We are not dealing with frags
1613 		 * in this case.
1614 		 */
1615 		offset = uoff;
1616 		while ((offset < uoff + len) &&
1617 			(offset < ip->i_size)) {
1618 			/*
1619 			 * the variable "bnp" is to simplify the expression for
1620 			 * the compiler; * just passing in &bn to bmap_write
1621 			 * causes a compiler "loop"
1622 			 */
1623 
1624 			blk_size = udf_vfsp->udf_lbsize;
1625 			if ((offset + blk_size) > ip->i_size) {
1626 				count = ip->i_size - offset;
1627 			} else {
1628 				count = blk_size;
1629 			}
1630 			error = ud_bmap_write(ip, offset, count, 0, cr);
1631 			if (error) {
1632 				goto update_inode;
1633 			}
1634 			offset += count; /* XXX - make this contig */
1635 		}
1636 	}
1637 
1638 	/*
1639 	 * Can be a reader from now on.
1640 	 */
1641 #ifdef	__lock_lint
1642 	if (rwtype == RW_WRITER) {
1643 		rw_downgrade(&ip->i_contents);
1644 	}
1645 #else
1646 	if (dolock && rwtype == RW_WRITER) {
1647 		rw_downgrade(&ip->i_contents);
1648 	}
1649 #endif
1650 
1651 	/*
1652 	 * We remove PROT_WRITE in cases when the file has UDF holes
1653 	 * because we don't  want to call bmap_read() to check each
1654 	 * page if it is backed with a disk block.
1655 	 */
1656 	if (protp && has_holes && rw != S_WRITE && rw != S_CREATE) {
1657 		*protp &= ~PROT_WRITE;
1658 	}
1659 
1660 	error = 0;
1661 
1662 	/*
1663 	 * The loop looks up pages in the range <off, off + len).
1664 	 * For each page, we first check if we should initiate an asynchronous
1665 	 * read ahead before we call page_lookup (we may sleep in page_lookup
1666 	 * for a previously initiated disk read).
1667 	 */
1668 	eoff = (uoff + len);
1669 	for (pgoff = uoff, pgaddr = addr, pl = plarr;
1670 			pgoff < eoff; /* empty */) {
1671 		page_t	*pp;
1672 		u_offset_t	nextrio;
1673 		se_t	se;
1674 
1675 		se = ((rw == S_CREATE) ? SE_EXCL : SE_SHARED);
1676 
1677 		/*
1678 		 * Handle async getpage (faultahead)
1679 		 */
1680 		if (plarr == NULL) {
1681 			ip->i_nextrio = pgoff;
1682 			ud_getpage_ra(vp, pgoff, seg, pgaddr);
1683 			pgoff += pgsize;
1684 			pgaddr += pgsize;
1685 			continue;
1686 		}
1687 
1688 		/*
1689 		 * Check if we should initiate read ahead of next cluster.
1690 		 * We call page_exists only when we need to confirm that
1691 		 * we have the current page before we initiate the read ahead.
1692 		 */
1693 		nextrio = ip->i_nextrio;
1694 		if (seqmode &&
1695 		    pgoff + RD_CLUSTSZ(ip) >= nextrio && pgoff <= nextrio &&
1696 		    nextrio < ip->i_size && page_exists(vp, pgoff))
1697 			ud_getpage_ra(vp, pgoff, seg, pgaddr);
1698 
1699 		if ((pp = page_lookup(vp, pgoff, se)) != NULL) {
1700 
1701 			/*
1702 			 * We found the page in the page cache.
1703 			 */
1704 			*pl++ = pp;
1705 			pgoff += pgsize;
1706 			pgaddr += pgsize;
1707 			len -= pgsize;
1708 			plsz -= pgsize;
1709 		} else  {
1710 
1711 			/*
1712 			 * We have to create the page, or read it from disk.
1713 			 */
1714 			if (error = ud_getpage_miss(vp, pgoff, len,
1715 				seg, pgaddr, pl, plsz, rw, seqmode)) {
1716 				goto error_out;
1717 			}
1718 
1719 			while (*pl != NULL) {
1720 				pl++;
1721 				pgoff += pgsize;
1722 				pgaddr += pgsize;
1723 				len -= pgsize;
1724 				plsz -= pgsize;
1725 			}
1726 		}
1727 	}
1728 
1729 	/*
1730 	 * Return pages up to plsz if they are in the page cache.
1731 	 * We cannot return pages if there is a chance that they are
1732 	 * backed with a UDF hole and rw is S_WRITE or S_CREATE.
1733 	 */
1734 	if (plarr && !(has_holes && (rw == S_WRITE || rw == S_CREATE))) {
1735 
1736 		ASSERT((protp == NULL) ||
1737 			!(has_holes && (*protp & PROT_WRITE)));
1738 
1739 		eoff = pgoff + plsz;
1740 		while (pgoff < eoff) {
1741 			page_t		*pp;
1742 
1743 			if ((pp = page_lookup_nowait(vp, pgoff,
1744 			    SE_SHARED)) == NULL)
1745 				break;
1746 
1747 			*pl++ = pp;
1748 			pgoff += pgsize;
1749 			plsz -= pgsize;
1750 		}
1751 	}
1752 
1753 	if (plarr)
1754 		*pl = NULL;			/* Terminate page list */
1755 	ip->i_nextr = pgoff;
1756 
1757 error_out:
1758 	if (error && plarr) {
1759 		/*
1760 		 * Release any pages we have locked.
1761 		 */
1762 		while (pl > &plarr[0])
1763 			page_unlock(*--pl);
1764 
1765 		plarr[0] = NULL;
1766 	}
1767 
1768 update_inode:
1769 #ifdef	__lock_lint
1770 	rw_exit(&ip->i_contents);
1771 #else
1772 	if (dolock) {
1773 		rw_exit(&ip->i_contents);
1774 	}
1775 #endif
1776 
1777 	/*
1778 	 * If the inode is not already marked for IACC (in rwip() for read)
1779 	 * and the inode is not marked for no access time update (in rwip()
1780 	 * for write) then update the inode access time and mod time now.
1781 	 */
1782 	mutex_enter(&ip->i_tlock);
1783 	if ((ip->i_flag & (IACC | INOACC)) == 0) {
1784 		if ((rw != S_OTHER) && (ip->i_type != VDIR)) {
1785 			ip->i_flag |= IACC;
1786 		}
1787 		if (rw == S_WRITE) {
1788 			ip->i_flag |= IUPD;
1789 		}
1790 		ITIMES_NOLOCK(ip);
1791 	}
1792 	mutex_exit(&ip->i_tlock);
1793 
1794 	return (error);
1795 }
1796 
1797 int32_t ud_delay = 1;
1798 
1799 /* ARGSUSED */
1800 static int32_t
1801 udf_putpage(struct vnode *vp, offset_t off,
1802 	size_t len, int32_t flags, struct cred *cr)
1803 {
1804 	struct ud_inode *ip;
1805 	int32_t error = 0;
1806 
1807 	ud_printf("udf_putpage\n");
1808 
1809 	ip = VTOI(vp);
1810 #ifdef	__lock_lint
1811 	rw_enter(&ip->i_contents, RW_WRITER);
1812 #endif
1813 
1814 	if (vp->v_count == 0) {
1815 		cmn_err(CE_WARN, "ud_putpage : bad v_count");
1816 		error = EINVAL;
1817 		goto out;
1818 	}
1819 
1820 	if (vp->v_flag & VNOMAP) {
1821 		error = ENOSYS;
1822 		goto out;
1823 	}
1824 
1825 	if (flags & B_ASYNC) {
1826 		if (ud_delay && len &&
1827 		    (flags & ~(B_ASYNC|B_DONTNEED|B_FREE)) == 0) {
1828 			mutex_enter(&ip->i_tlock);
1829 
1830 			/*
1831 			 * If nobody stalled, start a new cluster.
1832 			 */
1833 			if (ip->i_delaylen == 0) {
1834 				ip->i_delayoff = off;
1835 				ip->i_delaylen = len;
1836 				mutex_exit(&ip->i_tlock);
1837 				goto out;
1838 			}
1839 
1840 			/*
1841 			 * If we have a full cluster or they are not contig,
1842 			 * then push last cluster and start over.
1843 			 */
1844 			if (ip->i_delaylen >= WR_CLUSTSZ(ip) ||
1845 			    ip->i_delayoff + ip->i_delaylen != off) {
1846 				u_offset_t doff;
1847 				size_t dlen;
1848 
1849 				doff = ip->i_delayoff;
1850 				dlen = ip->i_delaylen;
1851 				ip->i_delayoff = off;
1852 				ip->i_delaylen = len;
1853 				mutex_exit(&ip->i_tlock);
1854 				error = ud_putpages(vp, doff, dlen, flags, cr);
1855 				/* LMXXX - flags are new val, not old */
1856 				goto out;
1857 			}
1858 
1859 			/*
1860 			 * There is something there, it's not full, and
1861 			 * it is contig.
1862 			 */
1863 			ip->i_delaylen += len;
1864 			mutex_exit(&ip->i_tlock);
1865 			goto out;
1866 		}
1867 
1868 		/*
1869 		 * Must have weird flags or we are not clustering.
1870 		 */
1871 	}
1872 
1873 	error = ud_putpages(vp, off, len, flags, cr);
1874 
1875 out:
1876 #ifdef	__lock_lint
1877 	rw_exit(&ip->i_contents);
1878 #endif
1879 	return (error);
1880 }
1881 
1882 static int32_t
1883 udf_map(struct vnode *vp, offset_t off,
1884 	struct as *as, caddr_t *addrp, size_t len,
1885 	uint8_t prot, uint8_t maxprot, uint32_t flags,
1886 	struct cred *cr)
1887 {
1888 	struct segvn_crargs vn_a;
1889 	int32_t error = 0;
1890 
1891 	ud_printf("udf_map\n");
1892 
1893 	if (vp->v_flag & VNOMAP) {
1894 		error = ENOSYS;
1895 		goto end;
1896 	}
1897 
1898 	if ((off < (offset_t)0) ||
1899 		((off + len) < (offset_t)0)) {
1900 		error = EINVAL;
1901 		goto end;
1902 	}
1903 
1904 	if (vp->v_type != VREG) {
1905 		error = ENODEV;
1906 		goto end;
1907 	}
1908 
1909 	/*
1910 	 * If file is being locked, disallow mapping.
1911 	 */
1912 	if (vn_has_mandatory_locks(vp, VTOI(vp)->i_char)) {
1913 		error = EAGAIN;
1914 		goto end;
1915 	}
1916 
1917 	as_rangelock(as);
1918 	if ((flags & MAP_FIXED) == 0) {
1919 		map_addr(addrp, len, off, 1, flags);
1920 		if (*addrp == NULL) {
1921 			as_rangeunlock(as);
1922 			error = ENOMEM;
1923 			goto end;
1924 		}
1925 	} else {
1926 		/*
1927 		 * User specified address - blow away any previous mappings
1928 		 */
1929 		(void) as_unmap(as, *addrp, len);
1930 	}
1931 
1932 	vn_a.vp = vp;
1933 	vn_a.offset = off;
1934 	vn_a.type = flags & MAP_TYPE;
1935 	vn_a.prot = prot;
1936 	vn_a.maxprot = maxprot;
1937 	vn_a.cred = cr;
1938 	vn_a.amp = NULL;
1939 	vn_a.flags = flags & ~MAP_TYPE;
1940 	vn_a.szc = 0;
1941 	vn_a.lgrp_mem_policy_flags = 0;
1942 
1943 	error = as_map(as, *addrp, len, segvn_create, (caddr_t)&vn_a);
1944 	as_rangeunlock(as);
1945 
1946 end:
1947 	return (error);
1948 }
1949 
1950 /* ARGSUSED */
1951 static int32_t
1952 udf_addmap(struct vnode *vp, offset_t off,
1953 	struct as *as, caddr_t addr, size_t len,
1954 	uint8_t prot, uint8_t maxprot, uint32_t flags,
1955 	struct cred *cr)
1956 {
1957 	struct ud_inode *ip = VTOI(vp);
1958 
1959 	ud_printf("udf_addmap\n");
1960 
1961 	if (vp->v_flag & VNOMAP) {
1962 		return (ENOSYS);
1963 	}
1964 
1965 	mutex_enter(&ip->i_tlock);
1966 	ip->i_mapcnt += btopr(len);
1967 	mutex_exit(&ip->i_tlock);
1968 
1969 	return (0);
1970 }
1971 
1972 /* ARGSUSED */
1973 static int32_t
1974 udf_delmap(struct vnode *vp, offset_t off,
1975 	struct as *as, caddr_t addr, size_t len,
1976 	uint32_t prot, uint32_t maxprot, uint32_t flags,
1977 	struct cred *cr)
1978 {
1979 	struct ud_inode *ip = VTOI(vp);
1980 
1981 	ud_printf("udf_delmap\n");
1982 
1983 	if (vp->v_flag & VNOMAP) {
1984 		return (ENOSYS);
1985 	}
1986 
1987 	mutex_enter(&ip->i_tlock);
1988 	ip->i_mapcnt -= btopr(len); 	/* Count released mappings */
1989 	ASSERT(ip->i_mapcnt >= 0);
1990 	mutex_exit(&ip->i_tlock);
1991 
1992 	return (0);
1993 }
1994 
1995 static int32_t
1996 udf_l_pathconf(struct vnode *vp, int32_t cmd,
1997 	ulong_t *valp, struct cred *cr)
1998 {
1999 	int32_t error = 0;
2000 
2001 	ud_printf("udf_l_pathconf\n");
2002 
2003 	if (cmd == _PC_FILESIZEBITS) {
2004 		/*
2005 		 * udf supports 64 bits as file size
2006 		 * but there are several other restrictions
2007 		 * it only supports 32-bit block numbers and
2008 		 * daddr32_t is only and int32_t so taking these
2009 		 * into account we can stay just as where ufs is
2010 		 */
2011 		*valp = 41;
2012 	} else {
2013 		error = fs_pathconf(vp, cmd, valp, cr);
2014 	}
2015 
2016 	return (error);
2017 }
2018 
2019 uint32_t ud_pageio_reads = 0, ud_pageio_writes = 0;
2020 #ifndef	__lint
2021 _NOTE(SCHEME_PROTECTS_DATA("safe sharing", ud_pageio_reads))
2022 _NOTE(SCHEME_PROTECTS_DATA("safe sharing", ud_pageio_writes))
2023 #endif
2024 /*
2025  * Assumption is that there will not be a pageio request
2026  * to a enbedded file
2027  */
2028 /* ARGSUSED */
2029 static int32_t
2030 udf_pageio(struct vnode *vp, struct page *pp,
2031 	u_offset_t io_off, size_t io_len,
2032 	int32_t flags, struct cred *cr)
2033 {
2034 	daddr_t bn;
2035 	struct buf *bp;
2036 	struct ud_inode *ip = VTOI(vp);
2037 	int32_t dolock, error = 0, contig, multi_io;
2038 	size_t done_len = 0, cur_len = 0;
2039 	page_t *npp = NULL, *opp = NULL, *cpp = pp;
2040 
2041 	if (pp == NULL) {
2042 		return (EINVAL);
2043 	}
2044 
2045 	dolock = (rw_owner(&ip->i_contents) != curthread);
2046 
2047 	/*
2048 	 * We need a better check.  Ideally, we would use another
2049 	 * vnodeops so that hlocked and forcibly unmounted file
2050 	 * systems would return EIO where appropriate and w/o the
2051 	 * need for these checks.
2052 	 */
2053 	if (ip->i_udf == NULL) {
2054 		return (EIO);
2055 	}
2056 
2057 #ifdef	__lock_lint
2058 	rw_enter(&ip->i_contents, RW_READER);
2059 #else
2060 	if (dolock) {
2061 		rw_enter(&ip->i_contents, RW_READER);
2062 	}
2063 #endif
2064 
2065 	/*
2066 	 * Break the io request into chunks, one for each contiguous
2067 	 * stretch of disk blocks in the target file.
2068 	 */
2069 	while (done_len < io_len) {
2070 		ASSERT(cpp);
2071 		bp = NULL;
2072 		contig = 0;
2073 		if (error = ud_bmap_read(ip, (u_offset_t)(io_off + done_len),
2074 					&bn, &contig)) {
2075 			break;
2076 		}
2077 
2078 		if (bn == UDF_HOLE) {   /* No holey swapfiles */
2079 			cmn_err(CE_WARN, "SWAP file has HOLES");
2080 			error = EINVAL;
2081 			break;
2082 		}
2083 
2084 		cur_len = MIN(io_len - done_len, contig);
2085 
2086 		/*
2087 		 * Check if more than one I/O is
2088 		 * required to complete the given
2089 		 * I/O operation
2090 		 */
2091 		if (ip->i_udf->udf_lbsize < PAGESIZE) {
2092 			if (cur_len >= PAGESIZE) {
2093 				multi_io = 0;
2094 				cur_len &= PAGEMASK;
2095 			} else {
2096 				multi_io = 1;
2097 				cur_len = MIN(io_len - done_len, PAGESIZE);
2098 			}
2099 		}
2100 		page_list_break(&cpp, &npp, btop(cur_len));
2101 
2102 		bp = pageio_setup(cpp, cur_len, ip->i_devvp, flags);
2103 		ASSERT(bp != NULL);
2104 
2105 		bp->b_edev = ip->i_dev;
2106 		bp->b_dev = cmpdev(ip->i_dev);
2107 		bp->b_blkno = bn;
2108 		bp->b_un.b_addr = (caddr_t)0;
2109 		bp->b_file = vp;
2110 		bp->b_offset = (offset_t)(io_off + done_len);
2111 
2112 /*
2113  *		ub.ub_pageios.value.ul++;
2114  */
2115 		if (multi_io == 0) {
2116 			(void) bdev_strategy(bp);
2117 		} else {
2118 			error = ud_multi_strat(ip, cpp, bp,
2119 				(u_offset_t)(io_off + done_len));
2120 			if (error != 0) {
2121 				pageio_done(bp);
2122 				break;
2123 			}
2124 		}
2125 		if (flags & B_READ) {
2126 			ud_pageio_reads++;
2127 		} else {
2128 			ud_pageio_writes++;
2129 		}
2130 
2131 		/*
2132 		 * If the request is not B_ASYNC, wait for i/o to complete
2133 		 * and re-assemble the page list to return to the caller.
2134 		 * If it is B_ASYNC we leave the page list in pieces and
2135 		 * cleanup() will dispose of them.
2136 		 */
2137 		if ((flags & B_ASYNC) == 0) {
2138 			error = biowait(bp);
2139 			pageio_done(bp);
2140 			if (error) {
2141 				break;
2142 			}
2143 			page_list_concat(&opp, &cpp);
2144 		}
2145 		cpp = npp;
2146 		npp = NULL;
2147 		done_len += cur_len;
2148 	}
2149 
2150 	ASSERT(error || (cpp == NULL && npp == NULL && done_len == io_len));
2151 	if (error) {
2152 		if (flags & B_ASYNC) {
2153 			/* Cleanup unprocessed parts of list */
2154 			page_list_concat(&cpp, &npp);
2155 			if (flags & B_READ) {
2156 				pvn_read_done(cpp, B_ERROR);
2157 			} else {
2158 				pvn_write_done(cpp, B_ERROR);
2159 			}
2160 		} else {
2161 			/* Re-assemble list and let caller clean up */
2162 			page_list_concat(&opp, &cpp);
2163 			page_list_concat(&opp, &npp);
2164 		}
2165 	}
2166 
2167 #ifdef	__lock_lint
2168 	rw_exit(&ip->i_contents);
2169 #else
2170 	if (dolock) {
2171 		rw_exit(&ip->i_contents);
2172 	}
2173 #endif
2174 	return (error);
2175 }
2176 
2177 
2178 
2179 
2180 /* -------------------- local functions --------------------------- */
2181 
2182 
2183 
2184 int32_t
2185 ud_rdwri(enum uio_rw rw, int32_t ioflag,
2186 	struct ud_inode *ip, caddr_t base, int32_t len,
2187 	offset_t offset, enum uio_seg seg, int32_t *aresid, struct cred *cr)
2188 {
2189 	int32_t error;
2190 	struct uio auio;
2191 	struct iovec aiov;
2192 
2193 	ud_printf("ud_rdwri\n");
2194 
2195 	bzero((caddr_t)&auio, sizeof (uio_t));
2196 	bzero((caddr_t)&aiov, sizeof (iovec_t));
2197 
2198 	aiov.iov_base = base;
2199 	aiov.iov_len = len;
2200 	auio.uio_iov = &aiov;
2201 	auio.uio_iovcnt = 1;
2202 	auio.uio_loffset = offset;
2203 	auio.uio_segflg = (int16_t)seg;
2204 	auio.uio_resid = len;
2205 
2206 	if (rw == UIO_WRITE) {
2207 		auio.uio_fmode = FWRITE;
2208 		auio.uio_extflg = UIO_COPY_DEFAULT;
2209 		auio.uio_llimit = curproc->p_fsz_ctl;
2210 		error = ud_wrip(ip, &auio, ioflag, cr);
2211 	} else {
2212 		auio.uio_fmode = FREAD;
2213 		auio.uio_extflg = UIO_COPY_CACHED;
2214 		auio.uio_llimit = MAXOFFSET_T;
2215 		error = ud_rdip(ip, &auio, ioflag, cr);
2216 	}
2217 
2218 	if (aresid) {
2219 		*aresid = auio.uio_resid;
2220 	} else if (auio.uio_resid) {
2221 		error = EIO;
2222 	}
2223 	return (error);
2224 }
2225 
2226 /*
2227  * Free behind hacks.  The pager is busted.
2228  * XXX - need to pass the information down to writedone() in a flag like B_SEQ
2229  * or B_FREE_IF_TIGHT_ON_MEMORY.
2230  */
2231 int32_t ud_freebehind = 1;
2232 int32_t ud_smallfile = 32 * 1024;
2233 
2234 /* ARGSUSED */
2235 int32_t
2236 ud_getpage_miss(struct vnode *vp, u_offset_t off,
2237 	size_t len, struct seg *seg, caddr_t addr, page_t *pl[],
2238 	size_t plsz, enum seg_rw rw, int32_t seq)
2239 {
2240 	struct ud_inode *ip = VTOI(vp);
2241 	int32_t err = 0;
2242 	size_t io_len;
2243 	u_offset_t io_off;
2244 	u_offset_t pgoff;
2245 	page_t *pp;
2246 
2247 	pl[0] = NULL;
2248 
2249 	/*
2250 	 * Figure out whether the page can be created, or must be
2251 	 * read from the disk
2252 	 */
2253 	if (rw == S_CREATE) {
2254 		if ((pp = page_create_va(vp, off,
2255 				PAGESIZE, PG_WAIT, seg, addr)) == NULL) {
2256 			cmn_err(CE_WARN, "ud_getpage_miss: page_create");
2257 			return (EINVAL);
2258 		}
2259 		io_len = PAGESIZE;
2260 	} else {
2261 		pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
2262 				&io_len, off, PAGESIZE, 0);
2263 
2264 		/*
2265 		 * Some other thread has entered the page.
2266 		 * ud_getpage will retry page_lookup.
2267 		 */
2268 		if (pp == NULL) {
2269 			return (0);
2270 		}
2271 
2272 		/*
2273 		 * Fill the page with as much data as we can from the file.
2274 		 */
2275 		err = ud_page_fill(ip, pp, off, B_READ, &pgoff);
2276 		if (err) {
2277 			pvn_read_done(pp, B_ERROR);
2278 			return (err);
2279 		}
2280 
2281 		/*
2282 		 * XXX ??? ufs has io_len instead of pgoff below
2283 		 */
2284 		ip->i_nextrio = off + ((pgoff + PAGESIZE - 1) & PAGEMASK);
2285 
2286 		/*
2287 		 * If the file access is sequential, initiate read ahead
2288 		 * of the next cluster.
2289 		 */
2290 		if (seq && ip->i_nextrio < ip->i_size) {
2291 			ud_getpage_ra(vp, off, seg, addr);
2292 		}
2293 	}
2294 
2295 outmiss:
2296 	pvn_plist_init(pp, pl, plsz, (offset_t)off, io_len, rw);
2297 	return (err);
2298 }
2299 
2300 /* ARGSUSED */
2301 void
2302 ud_getpage_ra(struct vnode *vp,
2303 	u_offset_t off, struct seg *seg, caddr_t addr)
2304 {
2305 	page_t *pp;
2306 	size_t io_len;
2307 	struct ud_inode *ip = VTOI(vp);
2308 	u_offset_t io_off = ip->i_nextrio, pgoff;
2309 	caddr_t addr2 = addr + (io_off - off);
2310 	daddr_t bn;
2311 	int32_t contig = 0;
2312 
2313 	/*
2314 	 * Is this test needed?
2315 	 */
2316 
2317 	if (addr2 >= seg->s_base + seg->s_size) {
2318 		return;
2319 	}
2320 
2321 	contig = 0;
2322 	if (ud_bmap_read(ip, io_off, &bn, &contig) != 0 || bn == UDF_HOLE) {
2323 		return;
2324 	}
2325 
2326 	pp = pvn_read_kluster(vp, io_off, seg, addr2,
2327 		&io_off, &io_len, io_off, PAGESIZE, 1);
2328 
2329 	/*
2330 	 * Some other thread has entered the page.
2331 	 * So no read head done here (ie we will have to and wait
2332 	 * for the read when needed).
2333 	 */
2334 
2335 	if (pp == NULL) {
2336 		return;
2337 	}
2338 
2339 	(void) ud_page_fill(ip, pp, io_off, (B_READ|B_ASYNC), &pgoff);
2340 	ip->i_nextrio =  io_off + ((pgoff + PAGESIZE - 1) & PAGEMASK);
2341 }
2342 
2343 int
2344 ud_page_fill(struct ud_inode *ip, page_t *pp, u_offset_t off,
2345 	uint32_t bflgs, u_offset_t *pg_off)
2346 {
2347 	daddr_t bn;
2348 	struct buf *bp;
2349 	caddr_t kaddr, caddr;
2350 	int32_t error = 0, contig = 0, multi_io = 0;
2351 	int32_t lbsize = ip->i_udf->udf_lbsize;
2352 	int32_t lbmask = ip->i_udf->udf_lbmask;
2353 	uint64_t isize;
2354 
2355 	isize = (ip->i_size + lbmask) & (~lbmask);
2356 	if (ip->i_desc_type == ICB_FLAG_ONE_AD) {
2357 
2358 		/*
2359 		 * Embedded file read file_entry
2360 		 * from buffer cache and copy the required
2361 		 * portions
2362 		 */
2363 		bp = ud_bread(ip->i_dev,
2364 		ip->i_icb_lbano << ip->i_udf->udf_l2d_shift, lbsize);
2365 		if ((bp->b_error == 0) &&
2366 			(bp->b_resid == 0)) {
2367 
2368 			caddr = bp->b_un.b_addr + ip->i_data_off;
2369 
2370 			/*
2371 			 * mapin to kvm
2372 			 */
2373 			kaddr = (caddr_t)ppmapin(pp,
2374 				PROT_READ | PROT_WRITE, (caddr_t)-1);
2375 			(void) kcopy(caddr, kaddr, ip->i_size);
2376 
2377 			/*
2378 			 * mapout of kvm
2379 			 */
2380 			ppmapout(kaddr);
2381 		}
2382 		brelse(bp);
2383 		contig = ip->i_size;
2384 	} else {
2385 
2386 		/*
2387 		 * Get the continuous size and block number
2388 		 * at offset "off"
2389 		 */
2390 		if (error = ud_bmap_read(ip, off, &bn, &contig))
2391 			goto out;
2392 		contig = MIN(contig, PAGESIZE);
2393 		contig = (contig + lbmask) & (~lbmask);
2394 
2395 		/*
2396 		 * Zero part of the page which we are not
2397 		 * going to read from the disk.
2398 		 */
2399 
2400 		if (bn == UDF_HOLE) {
2401 
2402 			/*
2403 			 * This is a HOLE. Just zero out
2404 			 * the page
2405 			 */
2406 			if (((off + contig) == isize) ||
2407 				(contig == PAGESIZE)) {
2408 				pagezero(pp->p_prev, 0, PAGESIZE);
2409 				goto out;
2410 			}
2411 		}
2412 
2413 		if (contig < PAGESIZE) {
2414 			uint64_t count;
2415 
2416 			count = isize - off;
2417 			if (contig != count) {
2418 				multi_io = 1;
2419 				contig = (int32_t)(MIN(count, PAGESIZE));
2420 			} else {
2421 				pagezero(pp->p_prev, contig, PAGESIZE - contig);
2422 			}
2423 		}
2424 
2425 		/*
2426 		 * Get a bp and initialize it
2427 		 */
2428 		bp = pageio_setup(pp, contig, ip->i_devvp, bflgs);
2429 		ASSERT(bp != NULL);
2430 
2431 		bp->b_edev = ip->i_dev;
2432 		bp->b_dev = cmpdev(ip->i_dev);
2433 		bp->b_blkno = bn;
2434 		bp->b_un.b_addr = 0;
2435 		bp->b_file = ip->i_vnode;
2436 
2437 		/*
2438 		 * Start I/O
2439 		 */
2440 		if (multi_io == 0) {
2441 
2442 			/*
2443 			 * Single I/O is sufficient for this page
2444 			 */
2445 			(void) bdev_strategy(bp);
2446 		} else {
2447 
2448 			/*
2449 			 * We need to do the I/O in
2450 			 * piece's
2451 			 */
2452 			error = ud_multi_strat(ip, pp, bp, off);
2453 			if (error != 0) {
2454 				goto out;
2455 			}
2456 		}
2457 		if ((bflgs & B_ASYNC) == 0) {
2458 
2459 			/*
2460 			 * Wait for i/o to complete.
2461 			 */
2462 
2463 			error = biowait(bp);
2464 			pageio_done(bp);
2465 			if (error) {
2466 				goto out;
2467 			}
2468 		}
2469 	}
2470 	if ((off + contig) >= ip->i_size) {
2471 		contig = ip->i_size - off;
2472 	}
2473 
2474 out:
2475 	*pg_off = contig;
2476 	return (error);
2477 }
2478 
2479 int32_t
2480 ud_putpages(struct vnode *vp, offset_t off,
2481 	size_t len, int32_t flags, struct cred *cr)
2482 {
2483 	struct ud_inode *ip;
2484 	page_t *pp;
2485 	u_offset_t io_off;
2486 	size_t io_len;
2487 	u_offset_t eoff;
2488 	int32_t err = 0;
2489 	int32_t dolock;
2490 
2491 	ud_printf("ud_putpages\n");
2492 
2493 	if (vp->v_count == 0) {
2494 		cmn_err(CE_WARN, "ud_putpages: bad v_count");
2495 		return (EINVAL);
2496 	}
2497 
2498 	ip = VTOI(vp);
2499 
2500 	/*
2501 	 * Acquire the readers/write inode lock before locking
2502 	 * any pages in this inode.
2503 	 * The inode lock is held during i/o.
2504 	 */
2505 	if (len == 0) {
2506 		mutex_enter(&ip->i_tlock);
2507 		ip->i_delayoff = ip->i_delaylen = 0;
2508 		mutex_exit(&ip->i_tlock);
2509 	}
2510 #ifdef	__lock_lint
2511 	rw_enter(&ip->i_contents, RW_READER);
2512 #else
2513 	dolock = (rw_owner(&ip->i_contents) != curthread);
2514 	if (dolock) {
2515 		rw_enter(&ip->i_contents, RW_READER);
2516 	}
2517 #endif
2518 
2519 	if (!vn_has_cached_data(vp)) {
2520 #ifdef	__lock_lint
2521 		rw_exit(&ip->i_contents);
2522 #else
2523 		if (dolock) {
2524 			rw_exit(&ip->i_contents);
2525 		}
2526 #endif
2527 		return (0);
2528 	}
2529 
2530 	if (len == 0) {
2531 		/*
2532 		 * Search the entire vp list for pages >= off.
2533 		 */
2534 		err = pvn_vplist_dirty(vp, (u_offset_t)off, ud_putapage,
2535 					flags, cr);
2536 	} else {
2537 		/*
2538 		 * Loop over all offsets in the range looking for
2539 		 * pages to deal with.
2540 		 */
2541 		if ((eoff = blkroundup(ip->i_udf, ip->i_size)) != 0) {
2542 			eoff = MIN(off + len, eoff);
2543 		} else {
2544 			eoff = off + len;
2545 		}
2546 
2547 		for (io_off = off; io_off < eoff; io_off += io_len) {
2548 			/*
2549 			 * If we are not invalidating, synchronously
2550 			 * freeing or writing pages, use the routine
2551 			 * page_lookup_nowait() to prevent reclaiming
2552 			 * them from the free list.
2553 			 */
2554 			if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
2555 				pp = page_lookup(vp, io_off,
2556 					(flags & (B_INVAL | B_FREE)) ?
2557 					    SE_EXCL : SE_SHARED);
2558 			} else {
2559 				pp = page_lookup_nowait(vp, io_off,
2560 					(flags & B_FREE) ? SE_EXCL : SE_SHARED);
2561 			}
2562 
2563 			if (pp == NULL || pvn_getdirty(pp, flags) == 0) {
2564 				io_len = PAGESIZE;
2565 			} else {
2566 
2567 				err = ud_putapage(vp, pp,
2568 					&io_off, &io_len, flags, cr);
2569 				if (err != 0) {
2570 					break;
2571 				}
2572 				/*
2573 				 * "io_off" and "io_len" are returned as
2574 				 * the range of pages we actually wrote.
2575 				 * This allows us to skip ahead more quickly
2576 				 * since several pages may've been dealt
2577 				 * with by this iteration of the loop.
2578 				 */
2579 			}
2580 		}
2581 	}
2582 	if (err == 0 && off == 0 && (len == 0 || len >= ip->i_size)) {
2583 		/*
2584 		 * We have just sync'ed back all the pages on
2585 		 * the inode, turn off the IMODTIME flag.
2586 		 */
2587 		mutex_enter(&ip->i_tlock);
2588 		ip->i_flag &= ~IMODTIME;
2589 		mutex_exit(&ip->i_tlock);
2590 	}
2591 #ifdef	__lock_lint
2592 	rw_exit(&ip->i_contents);
2593 #else
2594 	if (dolock) {
2595 		rw_exit(&ip->i_contents);
2596 	}
2597 #endif
2598 	return (err);
2599 }
2600 
2601 /* ARGSUSED */
2602 int32_t
2603 ud_putapage(struct vnode *vp,
2604 	page_t *pp, u_offset_t *offp,
2605 	size_t *lenp, int32_t flags, struct cred *cr)
2606 {
2607 	daddr_t bn;
2608 	size_t io_len;
2609 	struct ud_inode *ip;
2610 	int32_t error = 0, contig, multi_io = 0;
2611 	struct udf_vfs *udf_vfsp;
2612 	u_offset_t off, io_off;
2613 	caddr_t kaddr, caddr;
2614 	struct buf *bp = NULL;
2615 	int32_t lbmask;
2616 	uint64_t isize;
2617 	int32_t crc_len;
2618 	struct file_entry *fe;
2619 
2620 	ud_printf("ud_putapage\n");
2621 
2622 	ip = VTOI(vp);
2623 	ASSERT(ip);
2624 	ASSERT(RW_LOCK_HELD(&ip->i_contents));
2625 	lbmask = ip->i_udf->udf_lbmask;
2626 	isize = (ip->i_size + lbmask) & (~lbmask);
2627 
2628 	udf_vfsp = ip->i_udf;
2629 	ASSERT(udf_vfsp->udf_flags & UDF_FL_RW);
2630 
2631 	/*
2632 	 * If the modified time on the inode has not already been
2633 	 * set elsewhere (e.g. for write/setattr) we set the time now.
2634 	 * This gives us approximate modified times for mmap'ed files
2635 	 * which are modified via stores in the user address space.
2636 	 */
2637 	if (((ip->i_flag & IMODTIME) == 0) || (flags & B_FORCE)) {
2638 		mutex_enter(&ip->i_tlock);
2639 		ip->i_flag |= IUPD;
2640 		ITIMES_NOLOCK(ip);
2641 		mutex_exit(&ip->i_tlock);
2642 	}
2643 
2644 
2645 	/*
2646 	 * Align the request to a block boundry (for old file systems),
2647 	 * and go ask bmap() how contiguous things are for this file.
2648 	 */
2649 	off = pp->p_offset & ~(offset_t)lbmask;
2650 				/* block align it */
2651 
2652 
2653 	if (ip->i_desc_type == ICB_FLAG_ONE_AD) {
2654 		ASSERT(ip->i_size <= ip->i_max_emb);
2655 
2656 		pp = pvn_write_kluster(vp, pp, &io_off,
2657 			&io_len, off, PAGESIZE, flags);
2658 		if (io_len == 0) {
2659 			io_len = PAGESIZE;
2660 		}
2661 
2662 		bp = ud_bread(ip->i_dev,
2663 			ip->i_icb_lbano << udf_vfsp->udf_l2d_shift,
2664 			udf_vfsp->udf_lbsize);
2665 		fe = (struct file_entry *)bp->b_un.b_addr;
2666 		if ((bp->b_flags & B_ERROR) ||
2667 		    (ud_verify_tag_and_desc(&fe->fe_tag, UD_FILE_ENTRY,
2668 		    ip->i_icb_block,
2669 		    1, udf_vfsp->udf_lbsize) != 0)) {
2670 			if (pp != NULL)
2671 				pvn_write_done(pp, B_ERROR | B_WRITE | flags);
2672 			if (bp->b_flags & B_ERROR) {
2673 				error = EIO;
2674 			} else {
2675 				error = EINVAL;
2676 			}
2677 			brelse(bp);
2678 			return (error);
2679 		}
2680 		if ((bp->b_error == 0) &&
2681 			(bp->b_resid == 0)) {
2682 
2683 			caddr = bp->b_un.b_addr + ip->i_data_off;
2684 			kaddr = (caddr_t)ppmapin(pp,
2685 				PROT_READ | PROT_WRITE, (caddr_t)-1);
2686 			(void) kcopy(kaddr, caddr, ip->i_size);
2687 			ppmapout(kaddr);
2688 		}
2689 		crc_len = ((uint32_t)&((struct file_entry *)0)->fe_spec) +
2690 				SWAP_32(fe->fe_len_ear);
2691 		crc_len += ip->i_size;
2692 		ud_make_tag(ip->i_udf, &fe->fe_tag,
2693 			UD_FILE_ENTRY, ip->i_icb_block, crc_len);
2694 
2695 		bwrite(bp);
2696 
2697 		if (flags & B_ASYNC) {
2698 			pvn_write_done(pp, flags);
2699 		}
2700 		contig = ip->i_size;
2701 	} else {
2702 
2703 		if (error = ud_bmap_read(ip, off, &bn, &contig)) {
2704 			goto out;
2705 		}
2706 		contig = MIN(contig, PAGESIZE);
2707 		contig = (contig + lbmask) & (~lbmask);
2708 
2709 		if (contig < PAGESIZE) {
2710 			uint64_t count;
2711 
2712 			count = isize - off;
2713 			if (contig != count) {
2714 				multi_io = 1;
2715 				contig = (int32_t)(MIN(count, PAGESIZE));
2716 			}
2717 		}
2718 
2719 		if ((off + contig) > isize) {
2720 			contig = isize - off;
2721 		}
2722 
2723 		if (contig > PAGESIZE) {
2724 			if (contig & PAGEOFFSET) {
2725 				contig &= PAGEMASK;
2726 			}
2727 		}
2728 
2729 		pp = pvn_write_kluster(vp, pp, &io_off,
2730 			&io_len, off, contig, flags);
2731 		if (io_len == 0) {
2732 			io_len = PAGESIZE;
2733 		}
2734 
2735 		bp = pageio_setup(pp, contig, ip->i_devvp, B_WRITE | flags);
2736 		ASSERT(bp != NULL);
2737 
2738 		bp->b_edev = ip->i_dev;
2739 		bp->b_dev = cmpdev(ip->i_dev);
2740 		bp->b_blkno = bn;
2741 		bp->b_un.b_addr = 0;
2742 		bp->b_file = vp;
2743 		bp->b_offset = (offset_t)off;
2744 
2745 
2746 		/*
2747 		 * write throttle
2748 		 */
2749 		ASSERT(bp->b_iodone == NULL);
2750 		bp->b_iodone = ud_iodone;
2751 		mutex_enter(&ip->i_tlock);
2752 		ip->i_writes += bp->b_bcount;
2753 		mutex_exit(&ip->i_tlock);
2754 
2755 		if (multi_io == 0) {
2756 
2757 			(void) bdev_strategy(bp);
2758 		} else {
2759 			error = ud_multi_strat(ip, pp, bp, off);
2760 			if (error != 0) {
2761 				goto out;
2762 			}
2763 		}
2764 
2765 		if ((flags & B_ASYNC) == 0) {
2766 			/*
2767 			 * Wait for i/o to complete.
2768 			 */
2769 			error = biowait(bp);
2770 			pageio_done(bp);
2771 		}
2772 	}
2773 
2774 	if ((flags & B_ASYNC) == 0) {
2775 		pvn_write_done(pp, ((error) ? B_ERROR : 0) | B_WRITE | flags);
2776 	}
2777 
2778 	pp = NULL;
2779 
2780 out:
2781 	if (error != 0 && pp != NULL) {
2782 		pvn_write_done(pp, B_ERROR | B_WRITE | flags);
2783 	}
2784 
2785 	if (offp) {
2786 		*offp = io_off;
2787 	}
2788 	if (lenp) {
2789 		*lenp = io_len;
2790 	}
2791 
2792 	return (error);
2793 }
2794 
2795 
2796 int32_t
2797 ud_iodone(struct buf *bp)
2798 {
2799 	struct ud_inode *ip;
2800 
2801 	ASSERT((bp->b_pages->p_vnode != NULL) && !(bp->b_flags & B_READ));
2802 
2803 	bp->b_iodone = NULL;
2804 
2805 	ip = VTOI(bp->b_pages->p_vnode);
2806 
2807 	mutex_enter(&ip->i_tlock);
2808 	if (ip->i_writes >= ud_LW) {
2809 		if ((ip->i_writes -= bp->b_bcount) <= ud_LW) {
2810 			if (ud_WRITES) {
2811 				cv_broadcast(&ip->i_wrcv); /* wake all up */
2812 			}
2813 		}
2814 	} else {
2815 		ip->i_writes -= bp->b_bcount;
2816 	}
2817 	mutex_exit(&ip->i_tlock);
2818 	iodone(bp);
2819 	return (0);
2820 }
2821 
2822 /* ARGSUSED3 */
2823 int32_t
2824 ud_rdip(struct ud_inode *ip, struct uio *uio, int32_t ioflag, cred_t *cr)
2825 {
2826 	struct vnode *vp;
2827 	struct udf_vfs *udf_vfsp;
2828 	krw_t rwtype;
2829 	caddr_t base;
2830 	uint32_t flags;
2831 	int32_t error, n, on, mapon, dofree;
2832 	u_offset_t off;
2833 	long oresid = uio->uio_resid;
2834 
2835 	ASSERT(RW_LOCK_HELD(&ip->i_contents));
2836 	if ((ip->i_type != VREG) &&
2837 		(ip->i_type != VDIR) &&
2838 		(ip->i_type != VLNK)) {
2839 		return (EIO);
2840 	}
2841 
2842 	if (uio->uio_loffset > MAXOFFSET_T) {
2843 		return (0);
2844 	}
2845 
2846 	if ((uio->uio_loffset < (offset_t)0) ||
2847 		((uio->uio_loffset + uio->uio_resid) < 0)) {
2848 		return (EINVAL);
2849 	}
2850 	if (uio->uio_resid == 0) {
2851 		return (0);
2852 	}
2853 
2854 	vp = ITOV(ip);
2855 	udf_vfsp = ip->i_udf;
2856 	mutex_enter(&ip->i_tlock);
2857 	ip->i_flag |= IACC;
2858 	mutex_exit(&ip->i_tlock);
2859 
2860 	rwtype = (rw_write_held(&ip->i_contents)?RW_WRITER:RW_READER);
2861 
2862 	do {
2863 		offset_t diff;
2864 		u_offset_t uoff = uio->uio_loffset;
2865 		off = uoff & (offset_t)MAXBMASK;
2866 		mapon = (int)(uoff & (offset_t)MAXBOFFSET);
2867 		on = (int)blkoff(udf_vfsp, uoff);
2868 		n = (int)MIN(udf_vfsp->udf_lbsize - on, uio->uio_resid);
2869 
2870 		diff = ip->i_size - uoff;
2871 
2872 		if (diff <= (offset_t)0) {
2873 			error = 0;
2874 			goto out;
2875 		}
2876 		if (diff < (offset_t)n) {
2877 			n = (int)diff;
2878 		}
2879 		dofree = ud_freebehind &&
2880 			ip->i_nextr == (off & PAGEMASK) &&
2881 				off > ud_smallfile;
2882 
2883 #ifndef	__lock_lint
2884 		if (rwtype == RW_READER) {
2885 			rw_exit(&ip->i_contents);
2886 		}
2887 #endif
2888 
2889 		base = segmap_getmapflt(segkmap, vp, (off + mapon),
2890 					(uint32_t)n, 1, S_READ);
2891 		error = uiomove(base + mapon, (long)n, UIO_READ, uio);
2892 
2893 		flags = 0;
2894 		if (!error) {
2895 			/*
2896 			 * If read a whole block, or read to eof,
2897 			 * won't need this buffer again soon.
2898 			 */
2899 			if (n + on == MAXBSIZE && ud_freebehind && dofree &&
2900 				freemem < lotsfree + pages_before_pager) {
2901 				flags = SM_FREE | SM_DONTNEED |SM_ASYNC;
2902 			}
2903 			/*
2904 			 * In POSIX SYNC (FSYNC and FDSYNC) read mode,
2905 			 * we want to make sure that the page which has
2906 			 * been read, is written on disk if it is dirty.
2907 			 * And corresponding indirect blocks should also
2908 			 * be flushed out.
2909 			 */
2910 			if ((ioflag & FRSYNC) && (ioflag & (FSYNC|FDSYNC))) {
2911 				flags &= ~SM_ASYNC;
2912 				flags |= SM_WRITE;
2913 			}
2914 			error = segmap_release(segkmap, base, flags);
2915 		} else    {
2916 			(void) segmap_release(segkmap, base, flags);
2917 		}
2918 
2919 #ifndef __lock_lint
2920 		if (rwtype == RW_READER) {
2921 			rw_enter(&ip->i_contents, rwtype);
2922 		}
2923 #endif
2924 	} while (error == 0 && uio->uio_resid > 0 && n != 0);
2925 out:
2926 	/*
2927 	 * Inode is updated according to this table if FRSYNC is set.
2928 	 *
2929 	 *	FSYNC	FDSYNC(posix.4)
2930 	 *	--------------------------
2931 	 *	always	IATTCHG|IBDWRITE
2932 	 */
2933 	if (ioflag & FRSYNC) {
2934 		if ((ioflag & FSYNC) ||
2935 		((ioflag & FDSYNC) && (ip->i_flag & (IATTCHG|IBDWRITE)))) {
2936 		rw_exit(&ip->i_contents);
2937 		rw_enter(&ip->i_contents, RW_WRITER);
2938 		ud_iupdat(ip, 1);
2939 		}
2940 	}
2941 	/*
2942 	 * If we've already done a partial read, terminate
2943 	 * the read but return no error.
2944 	 */
2945 	if (oresid != uio->uio_resid) {
2946 		error = 0;
2947 	}
2948 	ITIMES(ip);
2949 
2950 	return (error);
2951 }
2952 
2953 int32_t
2954 ud_wrip(struct ud_inode *ip, struct uio *uio, int ioflag, struct cred *cr)
2955 {
2956 	caddr_t base;
2957 	struct vnode *vp;
2958 	struct udf_vfs *udf_vfsp;
2959 	uint32_t flags;
2960 	int32_t error = 0, iupdat_flag, n, on, mapon, i_size_changed = 0;
2961 	int32_t pagecreate, newpage;
2962 	uint64_t old_i_size;
2963 	u_offset_t off;
2964 	long start_resid = uio->uio_resid, premove_resid;
2965 	rlim64_t limit = uio->uio_limit;
2966 
2967 
2968 	ASSERT(RW_WRITE_HELD(&ip->i_contents));
2969 	if ((ip->i_type != VREG) &&
2970 		(ip->i_type != VDIR) &&
2971 		(ip->i_type != VLNK)) {
2972 		return (EIO);
2973 	}
2974 
2975 	if (uio->uio_loffset >= MAXOFFSET_T) {
2976 		return (EFBIG);
2977 	}
2978 	/*
2979 	 * see udf_l_pathconf
2980 	 */
2981 	if (limit > (((uint64_t)1 << 40) - 1)) {
2982 		limit = ((uint64_t)1 << 40) - 1;
2983 	}
2984 	if (uio->uio_loffset >= limit) {
2985 		proc_t *p = ttoproc(curthread);
2986 
2987 		mutex_enter(&p->p_lock);
2988 		(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE], p->p_rctls,
2989 		    p, RCA_UNSAFE_SIGINFO);
2990 		mutex_exit(&p->p_lock);
2991 		return (EFBIG);
2992 	}
2993 	if ((uio->uio_loffset < (offset_t)0) ||
2994 		((uio->uio_loffset + uio->uio_resid) < 0)) {
2995 		return (EINVAL);
2996 	}
2997 	if (uio->uio_resid == 0) {
2998 		return (0);
2999 	}
3000 
3001 	mutex_enter(&ip->i_tlock);
3002 	ip->i_flag |= INOACC;
3003 
3004 	if (ioflag & (FSYNC | FDSYNC)) {
3005 		ip->i_flag |= ISYNC;
3006 		iupdat_flag = 1;
3007 	}
3008 	mutex_exit(&ip->i_tlock);
3009 
3010 	udf_vfsp = ip->i_udf;
3011 	vp = ITOV(ip);
3012 
3013 	do {
3014 		u_offset_t uoff = uio->uio_loffset;
3015 		off = uoff & (offset_t)MAXBMASK;
3016 		mapon = (int)(uoff & (offset_t)MAXBOFFSET);
3017 		on = (int)blkoff(udf_vfsp, uoff);
3018 		n = (int)MIN(udf_vfsp->udf_lbsize - on, uio->uio_resid);
3019 
3020 		if (ip->i_type == VREG && uoff + n >= limit) {
3021 			if (uoff >= limit) {
3022 				error = EFBIG;
3023 				goto out;
3024 			}
3025 			n = (int)(limit - (rlim64_t)uoff);
3026 		}
3027 		if (uoff + n > ip->i_size) {
3028 			/*
3029 			 * We are extending the length of the file.
3030 			 * bmap is used so that we are sure that
3031 			 * if we need to allocate new blocks, that it
3032 			 * is done here before we up the file size.
3033 			 */
3034 			error = ud_bmap_write(ip, uoff,
3035 					(int)(on + n), mapon == 0, cr);
3036 			if (error) {
3037 				break;
3038 			}
3039 			i_size_changed = 1;
3040 			old_i_size = ip->i_size;
3041 			ip->i_size = uoff + n;
3042 			/*
3043 			 * If we are writing from the beginning of
3044 			 * the mapping, we can just create the
3045 			 * pages without having to read them.
3046 			 */
3047 			pagecreate = (mapon == 0);
3048 		} else if (n == MAXBSIZE) {
3049 			/*
3050 			 * Going to do a whole mappings worth,
3051 			 * so we can just create the pages w/o
3052 			 * having to read them in.  But before
3053 			 * we do that, we need to make sure any
3054 			 * needed blocks are allocated first.
3055 			 */
3056 			error = ud_bmap_write(ip, uoff,
3057 					(int)(on + n), 1, cr);
3058 			if (error) {
3059 				break;
3060 			}
3061 			pagecreate = 1;
3062 		} else {
3063 			pagecreate = 0;
3064 		}
3065 
3066 		rw_exit(&ip->i_contents);
3067 
3068 		base = segmap_getmapflt(segkmap, vp, (off + mapon),
3069 				(uint32_t)n, !pagecreate, S_WRITE);
3070 
3071 		/*
3072 		 * segmap_pagecreate() returns 1 if it calls
3073 		 * page_create_va() to allocate any pages.
3074 		 */
3075 		newpage = 0;
3076 		if (pagecreate) {
3077 			newpage = segmap_pagecreate(segkmap, base,
3078 					(size_t)n, 0);
3079 		}
3080 
3081 		premove_resid = uio->uio_resid;
3082 		error = uiomove(base + mapon, (long)n, UIO_WRITE, uio);
3083 
3084 		if (pagecreate &&
3085 			uio->uio_loffset < roundup(off + mapon + n, PAGESIZE)) {
3086 			/*
3087 			 * We created pages w/o initializing them completely,
3088 			 * thus we need to zero the part that wasn't set up.
3089 			 * This happens on most EOF write cases and if
3090 			 * we had some sort of error during the uiomove.
3091 			 */
3092 			int nzero, nmoved;
3093 
3094 			nmoved = (int)(uio->uio_loffset - (off + mapon));
3095 			ASSERT(nmoved >= 0 && nmoved <= n);
3096 			nzero = roundup(on + n, PAGESIZE) - nmoved;
3097 			ASSERT(nzero > 0 && mapon + nmoved + nzero <= MAXBSIZE);
3098 			(void) kzero(base + mapon + nmoved, (uint32_t)nzero);
3099 		}
3100 
3101 		/*
3102 		 * Unlock the pages allocated by page_create_va()
3103 		 * in segmap_pagecreate()
3104 		 */
3105 		if (newpage) {
3106 			segmap_pageunlock(segkmap, base, (size_t)n, S_WRITE);
3107 		}
3108 
3109 		if (error) {
3110 			/*
3111 			 * If we failed on a write, we may have already
3112 			 * allocated file blocks as well as pages.  It's
3113 			 * hard to undo the block allocation, but we must
3114 			 * be sure to invalidate any pages that may have
3115 			 * been allocated.
3116 			 */
3117 			(void) segmap_release(segkmap, base, SM_INVAL);
3118 		} else {
3119 			flags = 0;
3120 			/*
3121 			 * Force write back for synchronous write cases.
3122 			 */
3123 			if ((ioflag & (FSYNC|FDSYNC)) || ip->i_type == VDIR) {
3124 				/*
3125 				 * If the sticky bit is set but the
3126 				 * execute bit is not set, we do a
3127 				 * synchronous write back and free
3128 				 * the page when done.  We set up swap
3129 				 * files to be handled this way to
3130 				 * prevent servers from keeping around
3131 				 * the client's swap pages too long.
3132 				 * XXX - there ought to be a better way.
3133 				 */
3134 				if (IS_SWAPVP(vp)) {
3135 					flags = SM_WRITE | SM_FREE |
3136 							SM_DONTNEED;
3137 					iupdat_flag = 0;
3138 				} else {
3139 					flags = SM_WRITE;
3140 				}
3141 			} else if (((mapon + n) == MAXBSIZE) ||
3142 					IS_SWAPVP(vp)) {
3143 				/*
3144 				 * Have written a whole block.
3145 				 * Start an asynchronous write and
3146 				 * mark the buffer to indicate that
3147 				 * it won't be needed again soon.
3148 				 */
3149 				flags = SM_WRITE |SM_ASYNC | SM_DONTNEED;
3150 			}
3151 			error = segmap_release(segkmap, base, flags);
3152 
3153 			/*
3154 			 * If the operation failed and is synchronous,
3155 			 * then we need to unwind what uiomove() last
3156 			 * did so we can potentially return an error to
3157 			 * the caller.  If this write operation was
3158 			 * done in two pieces and the first succeeded,
3159 			 * then we won't return an error for the second
3160 			 * piece that failed.  However, we only want to
3161 			 * return a resid value that reflects what was
3162 			 * really done.
3163 			 *
3164 			 * Failures for non-synchronous operations can
3165 			 * be ignored since the page subsystem will
3166 			 * retry the operation until it succeeds or the
3167 			 * file system is unmounted.
3168 			 */
3169 			if (error) {
3170 				if ((ioflag & (FSYNC | FDSYNC)) ||
3171 					ip->i_type == VDIR) {
3172 					uio->uio_resid = premove_resid;
3173 				} else {
3174 					error = 0;
3175 				}
3176 			}
3177 		}
3178 
3179 		/*
3180 		 * Re-acquire contents lock.
3181 		 */
3182 		rw_enter(&ip->i_contents, RW_WRITER);
3183 		/*
3184 		 * If the uiomove() failed or if a synchronous
3185 		 * page push failed, fix up i_size.
3186 		 */
3187 		if (error) {
3188 			if (i_size_changed) {
3189 				/*
3190 				 * The uiomove failed, and we
3191 				 * allocated blocks,so get rid
3192 				 * of them.
3193 				 */
3194 				(void) ud_itrunc(ip, old_i_size, 0, cr);
3195 			}
3196 		} else {
3197 			/*
3198 			 * XXX - Can this be out of the loop?
3199 			 */
3200 			ip->i_flag |= IUPD | ICHG;
3201 			if (i_size_changed) {
3202 				ip->i_flag |= IATTCHG;
3203 			}
3204 			if ((ip->i_perm & (IEXEC | (IEXEC >> 5) |
3205 				(IEXEC >> 10))) != 0 &&
3206 			    (ip->i_char & (ISUID | ISGID)) != 0 &&
3207 			    secpolicy_vnode_setid_retain(cr,
3208 			    (ip->i_char & ISUID) != 0 && ip->i_uid == 0) != 0) {
3209 				/*
3210 				 * Clear Set-UID & Set-GID bits on
3211 				 * successful write if not privileged
3212 				 * and at least one of the execute bits
3213 				 * is set.  If we always clear Set-GID,
3214 				 * mandatory file and record locking is
3215 				 * unuseable.
3216 				 */
3217 				ip->i_char &= ~(ISUID | ISGID);
3218 			}
3219 		}
3220 	} while (error == 0 && uio->uio_resid > 0 && n != 0);
3221 
3222 out:
3223 	/*
3224 	 * Inode is updated according to this table -
3225 	 *
3226 	 *	FSYNC	FDSYNC(posix.4)
3227 	 *	--------------------------
3228 	 *	always@	IATTCHG|IBDWRITE
3229 	 *
3230 	 * @ -  If we are doing synchronous write the only time we should
3231 	 *	not be sync'ing the ip here is if we have the stickyhack
3232 	 *	activated, the file is marked with the sticky bit and
3233 	 *	no exec bit, the file length has not been changed and
3234 	 *	no new blocks have been allocated during this write.
3235 	 */
3236 	if ((ip->i_flag & ISYNC) != 0) {
3237 		/*
3238 		 * we have eliminated nosync
3239 		 */
3240 		if ((ip->i_flag & (IATTCHG|IBDWRITE)) ||
3241 			((ioflag & FSYNC) && iupdat_flag)) {
3242 			ud_iupdat(ip, 1);
3243 		}
3244 	}
3245 
3246 	/*
3247 	 * If we've already done a partial-write, terminate
3248 	 * the write but return no error.
3249 	 */
3250 	if (start_resid != uio->uio_resid) {
3251 		error = 0;
3252 	}
3253 	ip->i_flag &= ~(INOACC | ISYNC);
3254 	ITIMES_NOLOCK(ip);
3255 
3256 	return (error);
3257 }
3258 
3259 int32_t
3260 ud_multi_strat(struct ud_inode *ip,
3261 	page_t *pp, struct buf *bp, u_offset_t start)
3262 {
3263 	daddr_t bn;
3264 	int32_t error = 0, io_count, contig, alloc_sz, i;
3265 	uint32_t io_off;
3266 	mio_master_t *mm = NULL;
3267 	mio_slave_t *ms = NULL;
3268 	struct buf *rbp;
3269 
3270 	ASSERT(!(start & PAGEOFFSET));
3271 
3272 	/*
3273 	 * Figure out how many buffers to allocate
3274 	 */
3275 	io_count = 0;
3276 	for (io_off = 0; io_off < bp->b_bcount; io_off += contig) {
3277 		contig = 0;
3278 		if (error = ud_bmap_read(ip, (u_offset_t)(start + io_off),
3279 					&bn, &contig)) {
3280 			goto end;
3281 		}
3282 		if (contig == 0) {
3283 			goto end;
3284 		}
3285 		contig = MIN(contig, PAGESIZE - io_off);
3286 		if (bn != UDF_HOLE) {
3287 			io_count ++;
3288 		} else {
3289 			/*
3290 			 * HOLE
3291 			 */
3292 			if (bp->b_flags & B_READ) {
3293 
3294 				/*
3295 				 * This is a hole and is read
3296 				 * it should be filled with 0's
3297 				 */
3298 				pagezero(pp, io_off, contig);
3299 			}
3300 		}
3301 	}
3302 
3303 
3304 	if (io_count != 0) {
3305 
3306 		/*
3307 		 * Allocate memory for all the
3308 		 * required number of buffers
3309 		 */
3310 		alloc_sz = sizeof (mio_master_t) +
3311 			(sizeof (mio_slave_t) * io_count);
3312 		mm = (mio_master_t *)kmem_zalloc(alloc_sz, KM_SLEEP);
3313 		if (mm == NULL) {
3314 			error = ENOMEM;
3315 			goto end;
3316 		}
3317 
3318 		/*
3319 		 * initialize master
3320 		 */
3321 		mutex_init(&mm->mm_mutex, NULL, MUTEX_DEFAULT, NULL);
3322 		mm->mm_size = alloc_sz;
3323 		mm->mm_bp = bp;
3324 		mm->mm_resid = 0;
3325 		mm->mm_error = 0;
3326 		mm->mm_index = master_index++;
3327 
3328 		ms = (mio_slave_t *)(((caddr_t)mm) + sizeof (mio_master_t));
3329 
3330 		/*
3331 		 * Initialize buffers
3332 		 */
3333 		io_count = 0;
3334 		for (io_off = 0; io_off < bp->b_bcount; io_off += contig) {
3335 			contig = 0;
3336 			if (error = ud_bmap_read(ip,
3337 					(u_offset_t)(start + io_off),
3338 					&bn, &contig)) {
3339 				goto end;
3340 			}
3341 			ASSERT(contig);
3342 			if ((io_off + contig) > bp->b_bcount) {
3343 				contig = bp->b_bcount - io_off;
3344 			}
3345 			if (bn != UDF_HOLE) {
3346 				/*
3347 				 * Clone the buffer
3348 				 * and prepare to start I/O
3349 				 */
3350 				ms->ms_ptr = mm;
3351 				bioinit(&ms->ms_buf);
3352 				rbp = bioclone(bp, io_off, (size_t)contig,
3353 					bp->b_edev, bn, ud_slave_done,
3354 					&ms->ms_buf, KM_NOSLEEP);
3355 				ASSERT(rbp == &ms->ms_buf);
3356 				mm->mm_resid += contig;
3357 				io_count++;
3358 				ms ++;
3359 			}
3360 		}
3361 
3362 		/*
3363 		 * Start I/O's
3364 		 */
3365 		ms = (mio_slave_t *)(((caddr_t)mm) + sizeof (mio_master_t));
3366 		for (i = 0; i < io_count; i++) {
3367 			(void) bdev_strategy(&ms->ms_buf);
3368 			ms ++;
3369 		}
3370 	}
3371 
3372 end:
3373 	if (error != 0) {
3374 		bp->b_flags |= B_ERROR;
3375 		bp->b_error = error;
3376 		if (mm != NULL) {
3377 			mutex_destroy(&mm->mm_mutex);
3378 			kmem_free(mm, mm->mm_size);
3379 		}
3380 	}
3381 	return (error);
3382 }
3383 
3384 int32_t
3385 ud_slave_done(struct buf *bp)
3386 {
3387 	mio_master_t *mm;
3388 	int32_t resid;
3389 
3390 	ASSERT(SEMA_HELD(&bp->b_sem));
3391 	ASSERT((bp->b_flags & B_DONE) == 0);
3392 
3393 	mm = ((mio_slave_t *)bp)->ms_ptr;
3394 
3395 	/*
3396 	 * Propagate error and byte count info from slave struct to
3397 	 * the master struct
3398 	 */
3399 	mutex_enter(&mm->mm_mutex);
3400 	if (bp->b_flags & B_ERROR) {
3401 
3402 		/*
3403 		 * If multiple slave buffers get
3404 		 * error we forget the old errors
3405 		 * this is ok because we any way
3406 		 * cannot return multiple errors
3407 		 */
3408 		mm->mm_error = bp->b_error;
3409 	}
3410 	mm->mm_resid -= bp->b_bcount;
3411 	resid = mm->mm_resid;
3412 	mutex_exit(&mm->mm_mutex);
3413 
3414 	/*
3415 	 * free up the resources allocated to cloned buffers.
3416 	 */
3417 	bp_mapout(bp);
3418 	biofini(bp);
3419 
3420 	if (resid == 0) {
3421 
3422 		/*
3423 		 * This is the last I/O operation
3424 		 * clean up and return the original buffer
3425 		 */
3426 		if (mm->mm_error) {
3427 			mm->mm_bp->b_flags |= B_ERROR;
3428 			mm->mm_bp->b_error = mm->mm_error;
3429 		}
3430 		biodone(mm->mm_bp);
3431 		mutex_destroy(&mm->mm_mutex);
3432 		kmem_free(mm, mm->mm_size);
3433 	}
3434 	return (0);
3435 }
3436