xref: /titanic_44/usr/src/uts/common/fs/udfs/udf_vnops.c (revision 6185db853e024a486ff8837e6784dd290d866112)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/types.h>
30 #include <sys/t_lock.h>
31 #include <sys/param.h>
32 #include <sys/time.h>
33 #include <sys/systm.h>
34 #include <sys/sysmacros.h>
35 #include <sys/resource.h>
36 #include <sys/signal.h>
37 #include <sys/cred.h>
38 #include <sys/user.h>
39 #include <sys/buf.h>
40 #include <sys/vfs.h>
41 #include <sys/stat.h>
42 #include <sys/vnode.h>
43 #include <sys/mode.h>
44 #include <sys/proc.h>
45 #include <sys/disp.h>
46 #include <sys/file.h>
47 #include <sys/fcntl.h>
48 #include <sys/flock.h>
49 #include <sys/kmem.h>
50 #include <sys/uio.h>
51 #include <sys/dnlc.h>
52 #include <sys/conf.h>
53 #include <sys/errno.h>
54 #include <sys/mman.h>
55 #include <sys/fbuf.h>
56 #include <sys/pathname.h>
57 #include <sys/debug.h>
58 #include <sys/vmsystm.h>
59 #include <sys/cmn_err.h>
60 #include <sys/dirent.h>
61 #include <sys/errno.h>
62 #include <sys/modctl.h>
63 #include <sys/statvfs.h>
64 #include <sys/mount.h>
65 #include <sys/sunddi.h>
66 #include <sys/bootconf.h>
67 #include <sys/policy.h>
68 
69 #include <vm/hat.h>
70 #include <vm/page.h>
71 #include <vm/pvn.h>
72 #include <vm/as.h>
73 #include <vm/seg.h>
74 #include <vm/seg_map.h>
75 #include <vm/seg_kmem.h>
76 #include <vm/seg_vn.h>
77 #include <vm/rm.h>
78 #include <vm/page.h>
79 #include <sys/swap.h>
80 
81 #include <fs/fs_subr.h>
82 
83 #include <sys/fs/udf_volume.h>
84 #include <sys/fs/udf_inode.h>
85 
86 static int32_t udf_open(struct vnode **,
87 	int32_t, struct cred *);
88 static int32_t udf_close(struct vnode *,
89 	int32_t, int32_t, offset_t, struct cred *);
90 static int32_t udf_read(struct vnode *,
91 	struct uio *, int32_t, struct cred *, struct caller_context *);
92 static int32_t udf_write(struct vnode *,
93 	struct uio *, int32_t, struct cred *, struct caller_context *);
94 static int32_t udf_ioctl(struct vnode *,
95 	int32_t, intptr_t, int32_t, struct cred *, int32_t *);
96 static int32_t udf_getattr(struct vnode *,
97 	struct vattr *, int32_t, struct cred *);
98 static int32_t udf_setattr(struct vnode *,
99 	struct vattr *, int32_t, struct cred *, caller_context_t *);
100 static int32_t udf_access(struct vnode *,
101 	int32_t, int32_t, struct cred *);
102 static int32_t udf_lookup(struct vnode *,
103 	char *, struct vnode **, struct pathname *,
104 	int32_t, struct vnode *, struct cred *);
105 static int32_t udf_create(struct vnode *,
106 	char *, struct vattr *, enum vcexcl,
107 	int32_t, struct vnode **, struct cred *, int32_t);
108 static int32_t udf_remove(struct vnode *,
109 	char *, struct cred *);
110 static int32_t udf_link(struct vnode *,
111 	struct vnode *, char *, struct cred *);
112 static int32_t udf_rename(struct vnode *,
113 	char *, struct vnode *, char *, struct cred *);
114 static int32_t udf_mkdir(struct vnode *,
115 	char *, struct vattr *, struct vnode **, struct cred *);
116 static int32_t udf_rmdir(struct vnode *,
117 	char *, struct vnode *, struct cred *);
118 static int32_t udf_readdir(struct vnode *,
119 	struct uio *, struct cred *, int32_t *);
120 static int32_t udf_symlink(struct vnode *,
121 	char *, struct vattr *, char *, struct cred *);
122 static int32_t udf_readlink(struct vnode *,
123 	struct uio *, struct cred *);
124 static int32_t udf_fsync(struct vnode *,
125 	int32_t, struct cred *);
126 static void udf_inactive(struct vnode *,
127 	struct cred *);
128 static int32_t udf_fid(struct vnode *, struct fid *);
129 static int udf_rwlock(struct vnode *, int32_t, caller_context_t *);
130 static void udf_rwunlock(struct vnode *, int32_t, caller_context_t *);
131 static int32_t udf_seek(struct vnode *, offset_t, offset_t *);
132 static int32_t udf_frlock(struct vnode *, int32_t,
133 	struct flock64 *, int32_t, offset_t, struct flk_callback *, cred_t *);
134 static int32_t udf_space(struct vnode *, int32_t,
135 	struct flock64 *, int32_t, offset_t, cred_t *, caller_context_t *);
136 static int32_t udf_getpage(struct vnode *, offset_t,
137 	size_t, uint32_t *, struct page **, size_t,
138 	struct seg *, caddr_t, enum seg_rw, struct cred *);
139 static int32_t udf_putpage(struct vnode *, offset_t,
140 	size_t, int32_t, struct cred *);
141 static int32_t udf_map(struct vnode *, offset_t, struct as *,
142 	caddr_t *, size_t, uint8_t, uint8_t, uint32_t, struct cred *);
143 static int32_t udf_addmap(struct vnode *, offset_t, struct as *,
144 	caddr_t, size_t, uint8_t, uint8_t, uint32_t, struct cred *);
145 static int32_t udf_delmap(struct vnode *, offset_t, struct as *,
146 	caddr_t, size_t, uint32_t, uint32_t, uint32_t, struct cred *);
147 static int32_t udf_l_pathconf(struct vnode *, int32_t,
148 	ulong_t *, struct cred *);
149 static int32_t udf_pageio(struct vnode *, struct page *,
150 	u_offset_t, size_t, int32_t, struct cred *);
151 
152 int32_t ud_getpage_miss(struct vnode *, u_offset_t,
153 	size_t, struct seg *, caddr_t, page_t *pl[],
154 	size_t, enum seg_rw, int32_t);
155 void ud_getpage_ra(struct vnode *, u_offset_t, struct seg *, caddr_t);
156 int32_t ud_putpages(struct vnode *, offset_t, size_t, int32_t, struct cred *);
157 int32_t ud_page_fill(struct ud_inode *, page_t *,
158 	u_offset_t, uint32_t, u_offset_t *);
159 int32_t ud_iodone(struct buf *);
160 int32_t ud_rdip(struct ud_inode *, struct uio *, int32_t, cred_t *);
161 int32_t ud_wrip(struct ud_inode *, struct uio *, int32_t, cred_t *);
162 int32_t ud_multi_strat(struct ud_inode *, page_t *, struct buf *, u_offset_t);
163 int32_t ud_slave_done(struct buf *);
164 
165 /*
166  * Structures to control multiple IO operations to get or put pages
167  * that are backed by discontiguous blocks. The master struct is
168  * a dummy that holds the original bp from pageio_setup. The
169  * slave struct holds the working bp's to do the actual IO. Once
170  * all the slave IOs complete. The master is processed as if a single
171  * IO op has completed.
172  */
173 uint32_t master_index = 0;
174 typedef struct mio_master {
175 	kmutex_t	mm_mutex;	/* protect the fields below */
176 	int32_t		mm_size;
177 	buf_t		*mm_bp;		/* original bp */
178 	int32_t		mm_resid;	/* bytes remaining to transfer */
179 	int32_t		mm_error;	/* accumulated error from slaves */
180 	int32_t		mm_index;	/* XXX debugging */
181 } mio_master_t;
182 
183 typedef struct mio_slave {
184 	buf_t		ms_buf;		/* working buffer for this IO chunk */
185 	mio_master_t	*ms_ptr;	/* pointer to master */
186 } mio_slave_t;
187 
188 struct vnodeops *udf_vnodeops;
189 
190 const fs_operation_def_t udf_vnodeops_template[] = {
191 	VOPNAME_OPEN, udf_open,
192 	VOPNAME_CLOSE, udf_close,
193 	VOPNAME_READ, udf_read,
194 	VOPNAME_WRITE, udf_write,
195 	VOPNAME_IOCTL, udf_ioctl,
196 	VOPNAME_GETATTR, udf_getattr,
197 	VOPNAME_SETATTR, udf_setattr,
198 	VOPNAME_ACCESS, udf_access,
199 	VOPNAME_LOOKUP, udf_lookup,
200 	VOPNAME_CREATE, udf_create,
201 	VOPNAME_REMOVE, udf_remove,
202 	VOPNAME_LINK, udf_link,
203 	VOPNAME_RENAME, udf_rename,
204 	VOPNAME_MKDIR, udf_mkdir,
205 	VOPNAME_RMDIR, udf_rmdir,
206 	VOPNAME_READDIR, udf_readdir,
207 	VOPNAME_SYMLINK, udf_symlink,
208 	VOPNAME_READLINK, udf_readlink,
209 	VOPNAME_FSYNC, udf_fsync,
210 	VOPNAME_INACTIVE, (fs_generic_func_p) udf_inactive,
211 	VOPNAME_FID, udf_fid,
212 	VOPNAME_RWLOCK, udf_rwlock,
213 	VOPNAME_RWUNLOCK, (fs_generic_func_p) udf_rwunlock,
214 	VOPNAME_SEEK, udf_seek,
215 	VOPNAME_FRLOCK, udf_frlock,
216 	VOPNAME_SPACE, udf_space,
217 	VOPNAME_GETPAGE, udf_getpage,
218 	VOPNAME_PUTPAGE, udf_putpage,
219 	VOPNAME_MAP, (fs_generic_func_p) udf_map,
220 	VOPNAME_ADDMAP, (fs_generic_func_p) udf_addmap,
221 	VOPNAME_DELMAP, udf_delmap,
222 	VOPNAME_PATHCONF, udf_l_pathconf,
223 	VOPNAME_PAGEIO, udf_pageio,
224 	VOPNAME_VNEVENT, fs_vnevent_support,
225 	NULL, NULL
226 };
227 
/*
 * udf_open - open entry point.
 *
 * No work is done at open time: the routine emits its ud_printf()
 * message and reports success.  None of the arguments are examined.
 *
 * Returns 0 always.
 */
/* ARGSUSED */
static int32_t
udf_open(struct vnode **vpp, int32_t flag, struct cred *cr)
{
	ud_printf("udf_open\n");
	return (0);
}
236 
237 /* ARGSUSED */
238 static int32_t
239 udf_close(struct vnode *vp, int32_t flag,
240 	int32_t count, offset_t offset, struct cred *cr)
241 {
242 	struct ud_inode *ip = VTOI(vp);
243 
244 	ud_printf("udf_close\n");
245 
246 	ITIMES(ip);
247 
248 	cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
249 	cleanshares(vp, ttoproc(curthread)->p_pid);
250 
251 	/*
252 	 * Push partially filled cluster at last close.
253 	 * ``last close'' is approximated because the dnlc
254 	 * may have a hold on the vnode.
255 	 */
256 	if (vp->v_count <= 2 && vp->v_type != VBAD) {
257 		struct ud_inode *ip = VTOI(vp);
258 		if (ip->i_delaylen) {
259 			(void) ud_putpages(vp, ip->i_delayoff, ip->i_delaylen,
260 					B_ASYNC | B_FREE, cr);
261 			ip->i_delaylen = 0;
262 		}
263 	}
264 
265 	return (0);
266 }
267 
268 static int32_t
269 udf_read(struct vnode *vp, struct uio *uiop,
270 	int32_t ioflag, struct cred *cr, struct caller_context *ct)
271 {
272 	struct ud_inode *ip = VTOI(vp);
273 	int32_t error;
274 
275 	ud_printf("udf_read\n");
276 
277 #ifdef	__lock_lint
278 	rw_enter(&ip->i_rwlock, RW_READER);
279 #endif
280 
281 	ASSERT(RW_READ_HELD(&ip->i_rwlock));
282 
283 	if (MANDLOCK(vp, ip->i_char)) {
284 		/*
285 		 * udf_getattr ends up being called by chklock
286 		 */
287 		error = chklock(vp, FREAD, uiop->uio_loffset,
288 			uiop->uio_resid, uiop->uio_fmode, ct);
289 		if (error) {
290 			goto end;
291 		}
292 	}
293 
294 	rw_enter(&ip->i_contents, RW_READER);
295 	error = ud_rdip(ip, uiop, ioflag, cr);
296 	rw_exit(&ip->i_contents);
297 
298 end:
299 #ifdef	__lock_lint
300 	rw_exit(&ip->i_rwlock);
301 #endif
302 
303 	return (error);
304 }
305 
306 
307 int32_t ud_WRITES = 1;
308 int32_t ud_HW = 96 * 1024;
309 int32_t ud_LW = 64 * 1024;
310 int32_t ud_throttles = 0;
311 
312 static int32_t
313 udf_write(struct vnode *vp, struct uio *uiop,
314 	int32_t ioflag, struct cred *cr, struct caller_context *ct)
315 {
316 	struct ud_inode *ip = VTOI(vp);
317 	int32_t error = 0;
318 
319 	ud_printf("udf_write\n");
320 
321 #ifdef	__lock_lint
322 	rw_enter(&ip->i_rwlock, RW_WRITER);
323 #endif
324 
325 	ASSERT(RW_WRITE_HELD(&ip->i_rwlock));
326 
327 	if (MANDLOCK(vp, ip->i_char)) {
328 		/*
329 		 * ud_getattr ends up being called by chklock
330 		 */
331 		error = chklock(vp, FWRITE, uiop->uio_loffset,
332 			uiop->uio_resid, uiop->uio_fmode, ct);
333 		if (error) {
334 			goto end;
335 		}
336 	}
337 	/*
338 	 * Throttle writes.
339 	 */
340 	mutex_enter(&ip->i_tlock);
341 	if (ud_WRITES && (ip->i_writes > ud_HW)) {
342 		while (ip->i_writes > ud_HW) {
343 			ud_throttles++;
344 			cv_wait(&ip->i_wrcv, &ip->i_tlock);
345 		}
346 	}
347 	mutex_exit(&ip->i_tlock);
348 
349 	/*
350 	 * Write to the file
351 	 */
352 	rw_enter(&ip->i_contents, RW_WRITER);
353 	if ((ioflag & FAPPEND) != 0 && (ip->i_type == VREG)) {
354 		/*
355 		 * In append mode start at end of file.
356 		 */
357 		uiop->uio_loffset = ip->i_size;
358 	}
359 	error = ud_wrip(ip, uiop, ioflag, cr);
360 	rw_exit(&ip->i_contents);
361 
362 end:
363 #ifdef	__lock_lint
364 	rw_exit(&ip->i_rwlock);
365 #endif
366 
367 	return (error);
368 }
369 
/*
 * udf_ioctl - ioctl entry point.
 *
 * udfs implements no ioctls: every request, whatever its arguments,
 * is rejected.
 *
 * Returns ENOTTY always.
 */
/* ARGSUSED */
static int32_t
udf_ioctl(struct vnode *vp, int32_t cmd, intptr_t arg,
	int32_t flag, struct cred *cr, int32_t *rvalp)
{
	return (ENOTTY);
}
378 /* ARGSUSED */
379 static int32_t
380 udf_getattr(struct vnode *vp,
381 	struct vattr *vap, int32_t flags, struct cred *cr)
382 {
383 	struct ud_inode *ip = VTOI(vp);
384 
385 	ud_printf("udf_getattr\n");
386 
387 	if (vap->va_mask == AT_SIZE) {
388 		/*
389 		 * for performance, if only the size is requested don't bother
390 		 * with anything else.
391 		 */
392 		vap->va_size = ip->i_size;
393 		return (0);
394 	}
395 
396 	rw_enter(&ip->i_contents, RW_READER);
397 
398 	vap->va_type = vp->v_type;
399 	vap->va_mode = UD2VA_PERM(ip->i_perm) | ip->i_char;
400 
401 	vap->va_uid = ip->i_uid;
402 	vap->va_gid = ip->i_gid;
403 	vap->va_fsid = ip->i_dev;
404 	vap->va_nodeid = ip->i_icb_lbano;
405 	vap->va_nlink = ip->i_nlink;
406 	vap->va_size = ip->i_size;
407 	vap->va_seq = ip->i_seq;
408 	if (vp->v_type == VCHR || vp->v_type == VBLK) {
409 		vap->va_rdev = ip->i_rdev;
410 	} else {
411 		vap->va_rdev = 0;
412 	}
413 
414 	mutex_enter(&ip->i_tlock);
415 	ITIMES_NOLOCK(ip);	/* mark correct time in inode */
416 	vap->va_atime.tv_sec = (time_t)ip->i_atime.tv_sec;
417 	vap->va_atime.tv_nsec = ip->i_atime.tv_nsec;
418 	vap->va_mtime.tv_sec = (time_t)ip->i_mtime.tv_sec;
419 	vap->va_mtime.tv_nsec = ip->i_mtime.tv_nsec;
420 	vap->va_ctime.tv_sec = (time_t)ip->i_ctime.tv_sec;
421 	vap->va_ctime.tv_nsec = ip->i_ctime.tv_nsec;
422 	mutex_exit(&ip->i_tlock);
423 
424 	switch (ip->i_type) {
425 		case VBLK:
426 			vap->va_blksize = MAXBSIZE;
427 			break;
428 		case VCHR:
429 			vap->va_blksize = MAXBSIZE;
430 			break;
431 		default:
432 			vap->va_blksize = ip->i_udf->udf_lbsize;
433 			break;
434 	}
435 	vap->va_nblocks = ip->i_lbr << ip->i_udf->udf_l2d_shift;
436 
437 	rw_exit(&ip->i_contents);
438 
439 	return (0);
440 }
441 
/*
 * ud_iaccess_vmode - access check callback taking a vnode-style mode.
 *
 * Converts mode with UD_UPERM2DPERM() and hands the check to
 * ud_iaccess().  The inode is received as a void pointer so that the
 * routine can be passed to secpolicy_vnode_setattr() as its access
 * callback (see udf_setattr).
 *
 * Returns whatever ud_iaccess() returns.
 */
static int
ud_iaccess_vmode(void *ip, int mode, struct cred *cr)
{
	return (ud_iaccess(ip, UD_UPERM2DPERM(mode), cr));
}
447 
448 /*ARGSUSED4*/
449 static int32_t
450 udf_setattr(
451 	struct vnode *vp,
452 	struct vattr *vap,
453 	int32_t flags,
454 	struct cred *cr,
455 	caller_context_t *ct)
456 {
457 	int32_t error = 0;
458 	uint32_t mask = vap->va_mask;
459 	struct ud_inode *ip;
460 	timestruc_t now;
461 	struct vattr ovap;
462 
463 	ud_printf("udf_setattr\n");
464 
465 	ip = VTOI(vp);
466 
467 	/*
468 	 * not updates allowed to 4096 files
469 	 */
470 	if (ip->i_astrat == STRAT_TYPE4096) {
471 		return (EINVAL);
472 	}
473 
474 	/*
475 	 * Cannot set these attributes
476 	 */
477 	if (mask & AT_NOSET) {
478 		return (EINVAL);
479 	}
480 
481 	rw_enter(&ip->i_rwlock, RW_WRITER);
482 	rw_enter(&ip->i_contents, RW_WRITER);
483 
484 	ovap.va_uid = ip->i_uid;
485 	ovap.va_mode = UD2VA_PERM(ip->i_perm) | ip->i_char;
486 	error = secpolicy_vnode_setattr(cr, vp, vap, &ovap, flags,
487 					    ud_iaccess_vmode, ip);
488 	if (error)
489 		goto update_inode;
490 
491 	mask = vap->va_mask;
492 	/*
493 	 * Change file access modes.
494 	 */
495 	if (mask & AT_MODE) {
496 		ip->i_perm = VA2UD_PERM(vap->va_mode);
497 		ip->i_char = vap->va_mode & (VSUID | VSGID | VSVTX);
498 		mutex_enter(&ip->i_tlock);
499 		ip->i_flag |= ICHG;
500 		mutex_exit(&ip->i_tlock);
501 	}
502 	if (mask & (AT_UID|AT_GID)) {
503 		if (mask & AT_UID) {
504 			ip->i_uid = vap->va_uid;
505 		}
506 		if (mask & AT_GID) {
507 			ip->i_gid = vap->va_gid;
508 		}
509 		mutex_enter(&ip->i_tlock);
510 		ip->i_flag |= ICHG;
511 		mutex_exit(&ip->i_tlock);
512 	}
513 	/*
514 	 * Truncate file.  Must have write permission and not be a directory.
515 	 */
516 	if (mask & AT_SIZE) {
517 		if (vp->v_type == VDIR) {
518 			error = EISDIR;
519 			goto update_inode;
520 		}
521 		if (error = ud_iaccess(ip, IWRITE, cr)) {
522 			goto update_inode;
523 		}
524 		if (vap->va_size > MAXOFFSET_T) {
525 			error = EFBIG;
526 			goto update_inode;
527 		}
528 		if (error = ud_itrunc(ip, vap->va_size, 0, cr)) {
529 			goto update_inode;
530 		}
531 	}
532 	/*
533 	 * Change file access or modified times.
534 	 */
535 	if (mask & (AT_ATIME|AT_MTIME)) {
536 		mutex_enter(&ip->i_tlock);
537 		if (mask & AT_ATIME) {
538 			ip->i_atime.tv_sec = vap->va_atime.tv_sec;
539 			ip->i_atime.tv_nsec = vap->va_atime.tv_nsec;
540 			ip->i_flag &= ~IACC;
541 		}
542 		if (mask & AT_MTIME) {
543 			ip->i_mtime.tv_sec = vap->va_mtime.tv_sec;
544 			ip->i_mtime.tv_nsec = vap->va_mtime.tv_nsec;
545 			gethrestime(&now);
546 			ip->i_ctime.tv_sec = now.tv_sec;
547 			ip->i_ctime.tv_nsec = now.tv_nsec;
548 			ip->i_flag &= ~(IUPD|ICHG);
549 			ip->i_flag |= IMODTIME;
550 		}
551 		ip->i_flag |= IMOD;
552 		mutex_exit(&ip->i_tlock);
553 	}
554 
555 update_inode:
556 	if (curthread->t_flag & T_DONTPEND) {
557 		ud_iupdat(ip, 1);
558 	} else {
559 		ITIMES_NOLOCK(ip);
560 	}
561 	rw_exit(&ip->i_contents);
562 	rw_exit(&ip->i_rwlock);
563 
564 	return (error);
565 }
566 
567 /* ARGSUSED */
568 static int32_t
569 udf_access(struct vnode *vp,
570 	int32_t mode, int32_t flags, struct cred *cr)
571 {
572 	struct ud_inode *ip = VTOI(vp);
573 	int32_t error;
574 
575 	ud_printf("udf_access\n");
576 
577 	if (ip->i_udf == NULL) {
578 		return (EIO);
579 	}
580 
581 	error = ud_iaccess(ip, UD_UPERM2DPERM(mode), cr);
582 
583 	return (error);
584 }
585 
586 int32_t udfs_stickyhack = 1;
587 
588 /* ARGSUSED */
589 static int32_t
590 udf_lookup(struct vnode *dvp,
591 	char *nm, struct vnode **vpp, struct pathname *pnp,
592 	int32_t flags, struct vnode *rdir, struct cred *cr)
593 {
594 	int32_t error;
595 	struct vnode *vp;
596 	struct ud_inode *ip, *xip;
597 
598 	ud_printf("udf_lookup\n");
599 	/*
600 	 * Null component name is a synonym for directory being searched.
601 	 */
602 	if (*nm == '\0') {
603 		VN_HOLD(dvp);
604 		*vpp = dvp;
605 		error = 0;
606 		goto out;
607 	}
608 
609 	/*
610 	 * Fast path: Check the directory name lookup cache.
611 	 */
612 	ip = VTOI(dvp);
613 	if (vp = dnlc_lookup(dvp, nm)) {
614 		/*
615 		 * Check accessibility of directory.
616 		 */
617 		if ((error = ud_iaccess(ip, IEXEC, cr)) != 0) {
618 			VN_RELE(vp);
619 		}
620 		xip = VTOI(vp);
621 	} else {
622 		error = ud_dirlook(ip, nm, &xip, cr, 1);
623 		ITIMES(ip);
624 	}
625 
626 	if (error == 0) {
627 		ip = xip;
628 		*vpp = ITOV(ip);
629 		if ((ip->i_type != VDIR) &&
630 			(ip->i_char & ISVTX) &&
631 			((ip->i_perm & IEXEC) == 0) &&
632 			udfs_stickyhack) {
633 			mutex_enter(&(*vpp)->v_lock);
634 			(*vpp)->v_flag |= VISSWAP;
635 			mutex_exit(&(*vpp)->v_lock);
636 		}
637 		ITIMES(ip);
638 		/*
639 		 * If vnode is a device return special vnode instead.
640 		 */
641 		if (IS_DEVVP(*vpp)) {
642 			struct vnode *newvp;
643 			newvp = specvp(*vpp, (*vpp)->v_rdev,
644 					(*vpp)->v_type, cr);
645 			VN_RELE(*vpp);
646 			if (newvp == NULL) {
647 				error = ENOSYS;
648 			} else {
649 				*vpp = newvp;
650 			}
651 		}
652 	}
653 out:
654 	return (error);
655 }
656 
657 /* ARGSUSED */
658 static int32_t
659 udf_create(struct vnode *dvp,
660 	char *name, struct vattr *vap, enum vcexcl excl,
661 	int32_t mode, struct vnode **vpp, struct cred *cr, int32_t flag)
662 {
663 	int32_t error;
664 	struct ud_inode *ip = VTOI(dvp), *xip;
665 
666 	ud_printf("udf_create\n");
667 
668 	if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr) != 0)
669 		vap->va_mode &= ~VSVTX;
670 
671 	if (*name == '\0') {
672 		/*
673 		 * Null component name refers to the directory itself.
674 		 */
675 		VN_HOLD(dvp);
676 		ITIMES(ip);
677 		error = EEXIST;
678 	} else {
679 		xip = NULL;
680 		rw_enter(&ip->i_rwlock, RW_WRITER);
681 		error = ud_direnter(ip, name, DE_CREATE,
682 				(struct ud_inode *)0, (struct ud_inode *)0,
683 				vap, &xip, cr);
684 		rw_exit(&ip->i_rwlock);
685 		ITIMES(ip);
686 		ip = xip;
687 	}
688 #ifdef	__lock_lint
689 	rw_enter(&ip->i_contents, RW_WRITER);
690 #else
691 	if (ip != NULL) {
692 		rw_enter(&ip->i_contents, RW_WRITER);
693 	}
694 #endif
695 
696 	/*
697 	 * If the file already exists and this is a non-exclusive create,
698 	 * check permissions and allow access for non-directories.
699 	 * Read-only create of an existing directory is also allowed.
700 	 * We fail an exclusive create of anything which already exists.
701 	 */
702 	if (error == EEXIST) {
703 		if (excl == NONEXCL) {
704 			if ((ip->i_type == VDIR) && (mode & VWRITE)) {
705 				error = EISDIR;
706 			} else if (mode) {
707 				error = ud_iaccess(ip,
708 					UD_UPERM2DPERM(mode), cr);
709 			} else {
710 				error = 0;
711 			}
712 		}
713 		if (error) {
714 			rw_exit(&ip->i_contents);
715 			VN_RELE(ITOV(ip));
716 			goto out;
717 		} else if ((ip->i_type == VREG) &&
718 			(vap->va_mask & AT_SIZE) && vap->va_size == 0) {
719 			/*
720 			 * Truncate regular files, if requested by caller.
721 			 * Grab i_rwlock to make sure no one else is
722 			 * currently writing to the file (we promised
723 			 * bmap we would do this).
724 			 * Must get the locks in the correct order.
725 			 */
726 			if (ip->i_size == 0) {
727 				ip->i_flag |= ICHG | IUPD;
728 			} else {
729 				rw_exit(&ip->i_contents);
730 				rw_enter(&ip->i_rwlock, RW_WRITER);
731 				rw_enter(&ip->i_contents, RW_WRITER);
732 				(void) ud_itrunc(ip, 0, 0, cr);
733 				rw_exit(&ip->i_rwlock);
734 			}
735 		}
736 	}
737 
738 	if (error == 0) {
739 		*vpp = ITOV(ip);
740 		ITIMES(ip);
741 	}
742 #ifdef	__lock_lint
743 	rw_exit(&ip->i_contents);
744 #else
745 	if (ip != NULL) {
746 		rw_exit(&ip->i_contents);
747 	}
748 #endif
749 	if (error) {
750 		goto out;
751 	}
752 
753 	/*
754 	 * If vnode is a device return special vnode instead.
755 	 */
756 	if (!error && IS_DEVVP(*vpp)) {
757 		struct vnode *newvp;
758 
759 		newvp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
760 		VN_RELE(*vpp);
761 		if (newvp == NULL) {
762 			error = ENOSYS;
763 			goto out;
764 		}
765 		*vpp = newvp;
766 	}
767 out:
768 	return (error);
769 }
770 
771 static int32_t
772 udf_remove(struct vnode *vp, char *nm, struct cred *cr)
773 {
774 	int32_t error;
775 	struct ud_inode *ip = VTOI(vp);
776 
777 	ud_printf("udf_remove\n");
778 
779 	rw_enter(&ip->i_rwlock, RW_WRITER);
780 	error = ud_dirremove(ip, nm,
781 		(struct ud_inode *)0, (struct vnode *)0, DR_REMOVE, cr);
782 	rw_exit(&ip->i_rwlock);
783 	ITIMES(ip);
784 
785 	return (error);
786 }
787 
788 static int32_t
789 udf_link(struct vnode *tdvp,
790 	struct vnode *svp, char *tnm, struct cred *cr)
791 {
792 	int32_t error;
793 	struct vnode *realvp;
794 	struct ud_inode *sip;
795 	struct ud_inode *tdp;
796 
797 	ud_printf("udf_link\n");
798 	if (VOP_REALVP(svp, &realvp) == 0) {
799 		svp = realvp;
800 	}
801 
802 	/*
803 	 * Do not allow links to directories
804 	 */
805 	if (svp->v_type == VDIR) {
806 		return (EPERM);
807 	}
808 
809 	sip = VTOI(svp);
810 
811 	if (sip->i_uid != crgetuid(cr) && secpolicy_basic_link(cr) != 0)
812 		return (EPERM);
813 
814 	tdp = VTOI(tdvp);
815 
816 	rw_enter(&tdp->i_rwlock, RW_WRITER);
817 	error = ud_direnter(tdp, tnm, DE_LINK, (struct ud_inode *)0,
818 		sip, (struct vattr *)0, (struct ud_inode **)0, cr);
819 	rw_exit(&tdp->i_rwlock);
820 	ITIMES(sip);
821 	ITIMES(tdp);
822 
823 	return (error);
824 }
825 
826 /* ARGSUSED */
827 static int32_t
828 udf_rename(struct vnode *sdvp,
829 	char *snm, struct vnode *tdvp,
830 	char *tnm, struct cred *cr)
831 {
832 	int32_t error = 0;
833 	struct udf_vfs *udf_vfsp;
834 	struct ud_inode *sip;		/* source inode */
835 	struct ud_inode *sdp, *tdp;	/* source and target parent inode */
836 	struct vnode *realvp;
837 
838 	ud_printf("udf_rename\n");
839 
840 	if (VOP_REALVP(tdvp, &realvp) == 0) {
841 		tdvp = realvp;
842 	}
843 
844 	sdp = VTOI(sdvp);
845 	tdp = VTOI(tdvp);
846 
847 	udf_vfsp = sdp->i_udf;
848 
849 	mutex_enter(&udf_vfsp->udf_rename_lck);
850 	/*
851 	 * Look up inode of file we're supposed to rename.
852 	 */
853 	if (error = ud_dirlook(sdp, snm, &sip, cr, 0)) {
854 		mutex_exit(&udf_vfsp->udf_rename_lck);
855 		return (error);
856 	}
857 	/*
858 	 * be sure this is not a directory with another file system mounted
859 	 * over it.  If it is just give up the locks, and return with
860 	 * EBUSY
861 	 */
862 	if (vn_mountedvfs(ITOV(sip)) != NULL) {
863 		error = EBUSY;
864 		goto errout;
865 	}
866 	/*
867 	 * Make sure we can delete the source entry.  This requires
868 	 * write permission on the containing directory.  If that
869 	 * directory is "sticky" it further requires (except for
870 	 * privileged users) that the user own the directory or the
871 	 * source entry, or else have permission to write the source
872 	 * entry.
873 	 */
874 	rw_enter(&sdp->i_contents, RW_READER);
875 	rw_enter(&sip->i_contents, RW_READER);
876 	if ((error = ud_iaccess(sdp, IWRITE, cr)) != 0 ||
877 	    (error = ud_sticky_remove_access(sdp, sip, cr)) != 0) {
878 		rw_exit(&sip->i_contents);
879 		rw_exit(&sdp->i_contents);
880 		ITIMES(sip);
881 		goto errout;
882 	}
883 
884 	/*
885 	 * Check for renaming '.' or '..' or alias of '.'
886 	 */
887 	if ((strcmp(snm, ".") == 0) ||
888 			(strcmp(snm, "..") == 0) ||
889 			(sdp == sip)) {
890 		error = EINVAL;
891 		rw_exit(&sip->i_contents);
892 		rw_exit(&sdp->i_contents);
893 		goto errout;
894 	}
895 	rw_exit(&sip->i_contents);
896 	rw_exit(&sdp->i_contents);
897 
898 
899 	/*
900 	 * Link source to the target.
901 	 */
902 	rw_enter(&tdp->i_rwlock, RW_WRITER);
903 	if (error = ud_direnter(tdp, tnm, DE_RENAME, sdp, sip,
904 			(struct vattr *)0, (struct ud_inode **)0, cr)) {
905 		/*
906 		 * ESAME isn't really an error; it indicates that the
907 		 * operation should not be done because the source and target
908 		 * are the same file, but that no error should be reported.
909 		 */
910 		if (error == ESAME) {
911 			error = 0;
912 		}
913 		rw_exit(&tdp->i_rwlock);
914 		goto errout;
915 	}
916 	rw_exit(&tdp->i_rwlock);
917 
918 	rw_enter(&sdp->i_rwlock, RW_WRITER);
919 	/*
920 	 * Unlink the source.
921 	 * Remove the source entry.  ud_dirremove() checks that the entry
922 	 * still reflects sip, and returns an error if it doesn't.
923 	 * If the entry has changed just forget about it.  Release
924 	 * the source inode.
925 	 */
926 	if ((error = ud_dirremove(sdp, snm, sip, (struct vnode *)0,
927 			DR_RENAME, cr)) == ENOENT) {
928 		error = 0;
929 	}
930 	rw_exit(&sdp->i_rwlock);
931 errout:
932 	ITIMES(sdp);
933 	ITIMES(tdp);
934 	VN_RELE(ITOV(sip));
935 	mutex_exit(&udf_vfsp->udf_rename_lck);
936 
937 	return (error);
938 }
939 
940 static int32_t
941 udf_mkdir(struct vnode *dvp,
942 	char *dirname, struct vattr *vap,
943 	struct vnode **vpp, struct cred *cr)
944 {
945 	int32_t error;
946 	struct ud_inode *ip;
947 	struct ud_inode *xip;
948 
949 	ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
950 
951 	ud_printf("udf_mkdir\n");
952 
953 	ip = VTOI(dvp);
954 	rw_enter(&ip->i_rwlock, RW_WRITER);
955 	error = ud_direnter(ip, dirname, DE_MKDIR,
956 		(struct ud_inode *)0, (struct ud_inode *)0, vap, &xip, cr);
957 	rw_exit(&ip->i_rwlock);
958 	ITIMES(ip);
959 	if (error == 0) {
960 		ip = xip;
961 		*vpp = ITOV(ip);
962 		ITIMES(ip);
963 	} else if (error == EEXIST) {
964 		ITIMES(xip);
965 		VN_RELE(ITOV(xip));
966 	}
967 
968 	return (error);
969 }
970 
971 static int32_t
972 udf_rmdir(struct vnode *vp,
973 	char *nm, struct vnode *cdir, struct cred *cr)
974 {
975 	int32_t error;
976 	struct ud_inode *ip = VTOI(vp);
977 
978 	ud_printf("udf_rmdir\n");
979 
980 	rw_enter(&ip->i_rwlock, RW_WRITER);
981 	error = ud_dirremove(ip, nm, (struct ud_inode *)0, cdir, DR_RMDIR, cr);
982 	rw_exit(&ip->i_rwlock);
983 	ITIMES(ip);
984 
985 	return (error);
986 }
987 
988 /* ARGSUSED */
989 static int32_t
990 udf_readdir(struct vnode *vp,
991 	struct uio *uiop, struct cred *cr, int32_t *eofp)
992 {
993 	struct ud_inode *ip;
994 	struct dirent64 *nd;
995 	struct udf_vfs *udf_vfsp;
996 	int32_t error = 0, len, outcount = 0;
997 	uint32_t dirsiz, offset;
998 	uint32_t bufsize, ndlen, dummy;
999 	caddr_t outbuf;
1000 	caddr_t outb, end_outb;
1001 	struct iovec *iovp;
1002 
1003 	uint8_t *dname;
1004 	int32_t length;
1005 
1006 	uint8_t *buf = NULL;
1007 
1008 	struct fbuf *fbp = NULL;
1009 	struct file_id *fid;
1010 	uint8_t *name;
1011 
1012 
1013 	ud_printf("udf_readdir\n");
1014 
1015 	ip = VTOI(vp);
1016 	udf_vfsp = ip->i_udf;
1017 
1018 	dirsiz = ip->i_size;
1019 	if ((uiop->uio_offset >= dirsiz) ||
1020 			(ip->i_nlink <= 0)) {
1021 		if (eofp) {
1022 			*eofp = 1;
1023 		}
1024 		return (0);
1025 	}
1026 
1027 	offset = uiop->uio_offset;
1028 	iovp = uiop->uio_iov;
1029 	bufsize = iovp->iov_len;
1030 
1031 	outb = outbuf = (char *)kmem_alloc((uint32_t)bufsize, KM_SLEEP);
1032 	end_outb = outb + bufsize;
1033 	nd = (struct dirent64 *)outbuf;
1034 
1035 	dname = (uint8_t *)kmem_zalloc(1024, KM_SLEEP);
1036 	buf = (uint8_t *)kmem_zalloc(udf_vfsp->udf_lbsize, KM_SLEEP);
1037 
1038 	if (offset == 0) {
1039 		len = DIRENT64_RECLEN(1);
1040 		if (((caddr_t)nd + len) >= end_outb) {
1041 			error = EINVAL;
1042 			goto end;
1043 		}
1044 		nd->d_ino = ip->i_icb_lbano;
1045 		nd->d_reclen = (uint16_t)len;
1046 		nd->d_off = 0x10;
1047 		nd->d_name[0] = '.';
1048 		bzero(&nd->d_name[1], DIRENT64_NAMELEN(len) - 1);
1049 		nd = (struct dirent64 *)((char *)nd + nd->d_reclen);
1050 		outcount++;
1051 	} else if (offset == 0x10) {
1052 		offset = 0;
1053 	}
1054 
1055 	while (offset < dirsiz) {
1056 		error = ud_get_next_fid(ip, &fbp,
1057 				offset, &fid, &name, buf);
1058 		if (error != 0) {
1059 			break;
1060 		}
1061 
1062 		if ((fid->fid_flags & FID_DELETED) == 0) {
1063 			if (fid->fid_flags & FID_PARENT) {
1064 
1065 				len = DIRENT64_RECLEN(2);
1066 				if (((caddr_t)nd + len) >= end_outb) {
1067 					error = EINVAL;
1068 					break;
1069 				}
1070 
1071 				nd->d_ino = ip->i_icb_lbano;
1072 				nd->d_reclen = (uint16_t)len;
1073 				nd->d_off = offset + FID_LEN(fid);
1074 				nd->d_name[0] = '.';
1075 				nd->d_name[1] = '.';
1076 				bzero(&nd->d_name[2],
1077 				    DIRENT64_NAMELEN(len) - 2);
1078 				nd = (struct dirent64 *)
1079 					((char *)nd + nd->d_reclen);
1080 			} else {
1081 				if ((error = ud_uncompress(fid->fid_idlen,
1082 						&length, name, dname)) != 0) {
1083 					break;
1084 				}
1085 				if (length == 0) {
1086 					offset += FID_LEN(fid);
1087 					continue;
1088 				}
1089 				len = DIRENT64_RECLEN(length);
1090 				if (((caddr_t)nd + len) >= end_outb) {
1091 					if (!outcount) {
1092 						error = EINVAL;
1093 					}
1094 					break;
1095 				}
1096 				(void) strncpy(nd->d_name,
1097 				    (caddr_t)dname, length);
1098 				bzero(&nd->d_name[length],
1099 				    DIRENT64_NAMELEN(len) - length);
1100 				nd->d_ino = ud_xlate_to_daddr(udf_vfsp,
1101 					SWAP_16(fid->fid_icb.lad_ext_prn),
1102 					SWAP_32(fid->fid_icb.lad_ext_loc), 1,
1103 					&dummy);
1104 				nd->d_reclen = (uint16_t)len;
1105 				nd->d_off = offset + FID_LEN(fid);
1106 				nd = (struct dirent64 *)
1107 					((char *)nd + nd->d_reclen);
1108 			}
1109 			outcount++;
1110 		}
1111 
1112 		offset += FID_LEN(fid);
1113 	}
1114 
1115 end:
1116 	if (fbp != NULL) {
1117 		fbrelse(fbp, S_OTHER);
1118 	}
1119 	ndlen = ((char *)nd - outbuf);
1120 	/*
1121 	 * In case of error do not call uiomove.
1122 	 * Return the error to the caller.
1123 	 */
1124 	if ((error == 0) && (ndlen != 0)) {
1125 		error = uiomove(outbuf, (long)ndlen, UIO_READ, uiop);
1126 		uiop->uio_offset = offset;
1127 	}
1128 	kmem_free((caddr_t)buf, udf_vfsp->udf_lbsize);
1129 	kmem_free((caddr_t)dname, 1024);
1130 	kmem_free(outbuf, (uint32_t)bufsize);
1131 	if (eofp && error == 0) {
1132 		*eofp = (uiop->uio_offset >= dirsiz);
1133 	}
1134 	return (error);
1135 }
1136 
1137 /* ARGSUSED */
1138 static int32_t
1139 udf_symlink(struct vnode *dvp,
1140 	char *linkname, struct vattr *vap,
1141 	char *target, struct cred *cr)
1142 {
1143 	int32_t error = 0, outlen;
1144 	uint32_t ioflag = 0;
1145 	struct ud_inode *ip, *dip = VTOI(dvp);
1146 
1147 	struct path_comp *pc;
1148 	int8_t *dname = NULL, *uname = NULL, *sp;
1149 
1150 	ud_printf("udf_symlink\n");
1151 
1152 	ip = (struct ud_inode *)0;
1153 	vap->va_type = VLNK;
1154 	vap->va_rdev = 0;
1155 
1156 	rw_enter(&dip->i_rwlock, RW_WRITER);
1157 	error = ud_direnter(dip, linkname, DE_CREATE,
1158 		(struct ud_inode *)0, (struct ud_inode *)0, vap, &ip, cr);
1159 	rw_exit(&dip->i_rwlock);
1160 	if (error == 0) {
1161 		dname = kmem_zalloc(1024, KM_SLEEP);
1162 		uname = kmem_zalloc(PAGESIZE, KM_SLEEP);
1163 
1164 		pc = (struct path_comp *)uname;
1165 		/*
1166 		 * If the first character in target is "/"
1167 		 * then skip it and create entry for it
1168 		 */
1169 		if (*target == '/') {
1170 			pc->pc_type = 2;
1171 			pc->pc_len = 0;
1172 			pc = (struct path_comp *)(((char *)pc) + 4);
1173 			while (*target == '/') {
1174 				target++;
1175 			}
1176 		}
1177 
1178 		while (*target != NULL) {
1179 			sp = target;
1180 			while ((*target != '/') && (*target != '\0')) {
1181 				target ++;
1182 			}
1183 			/*
1184 			 * We got the next component of the
1185 			 * path name. Create path_comp of
1186 			 * appropriate type
1187 			 */
1188 			if (((target - sp) == 1) && (*sp == '.')) {
1189 				/*
1190 				 * Dot entry.
1191 				 */
1192 				pc->pc_type = 4;
1193 				pc = (struct path_comp *)(((char *)pc) + 4);
1194 			} else if (((target - sp) == 2) &&
1195 				(*sp == '.') && ((*(sp + 1)) == '.')) {
1196 				/*
1197 				 * DotDot entry.
1198 				 */
1199 				pc->pc_type = 3;
1200 				pc = (struct path_comp *)(((char *)pc) + 4);
1201 			} else {
1202 				/*
1203 				 * convert the user given name
1204 				 * into appropriate form to be put
1205 				 * on the media
1206 				 */
1207 				outlen = 1024;	/* set to size of dname */
1208 				if (error = ud_compress(target - sp, &outlen,
1209 					(uint8_t *)sp, (uint8_t *)dname)) {
1210 					break;
1211 				}
1212 				pc->pc_type = 5;
1213 				/* LINTED */
1214 				pc->pc_len = outlen;
1215 				dname[outlen] = '\0';
1216 				(void) strcpy((char *)pc->pc_id, dname);
1217 				pc = (struct path_comp *)
1218 					(((char *)pc) + 4 + outlen);
1219 			}
1220 			while (*target == '/') {
1221 				target++;
1222 			}
1223 			if (*target == NULL) {
1224 				break;
1225 			}
1226 		}
1227 
1228 		rw_enter(&ip->i_contents, RW_WRITER);
1229 		if (error == 0) {
1230 			ioflag = FWRITE;
1231 			if (curthread->t_flag & T_DONTPEND) {
1232 				ioflag |= FDSYNC;
1233 			}
1234 			error = ud_rdwri(UIO_WRITE, ioflag, ip,
1235 				uname, ((int8_t *)pc) - uname,
1236 				(offset_t)0, UIO_SYSSPACE, (int32_t *)0, cr);
1237 		}
1238 		if (error) {
1239 			ud_idrop(ip);
1240 			rw_exit(&ip->i_contents);
1241 			rw_enter(&dip->i_rwlock, RW_WRITER);
1242 			(void) ud_dirremove(dip, linkname, (struct ud_inode *)0,
1243 					(struct vnode *)0, DR_REMOVE, cr);
1244 			rw_exit(&dip->i_rwlock);
1245 			goto update_inode;
1246 		}
1247 		rw_exit(&ip->i_contents);
1248 	}
1249 
1250 	if ((error == 0) || (error == EEXIST)) {
1251 		VN_RELE(ITOV(ip));
1252 	}
1253 
1254 update_inode:
1255 	ITIMES(VTOI(dvp));
1256 	if (uname != NULL) {
1257 		kmem_free(uname, PAGESIZE);
1258 	}
1259 	if (dname != NULL) {
1260 		kmem_free(dname, 1024);
1261 	}
1262 
1263 	return (error);
1264 }
1265 
1266 /* ARGSUSED */
1267 static int32_t
1268 udf_readlink(struct vnode *vp,
1269 	struct uio *uiop, struct cred *cr)
1270 {
1271 	int32_t error = 0, off, id_len, size, len;
1272 	int8_t *dname = NULL, *uname = NULL;
1273 	struct ud_inode *ip;
1274 	struct fbuf *fbp = NULL;
1275 	struct path_comp *pc;
1276 
1277 	ud_printf("udf_readlink\n");
1278 
1279 	if (vp->v_type != VLNK) {
1280 		return (EINVAL);
1281 	}
1282 
1283 	ip = VTOI(vp);
1284 	size = ip->i_size;
1285 	if (size > PAGESIZE) {
1286 		return (EIO);
1287 	}
1288 
1289 	if (size == 0) {
1290 		return (0);
1291 	}
1292 
1293 	dname = kmem_zalloc(1024, KM_SLEEP);
1294 	uname = kmem_zalloc(PAGESIZE, KM_SLEEP);
1295 
1296 	rw_enter(&ip->i_contents, RW_READER);
1297 
1298 	if ((error = fbread(vp, 0, size, S_READ, &fbp)) != 0) {
1299 		goto end;
1300 	}
1301 
1302 	off = 0;
1303 
1304 	while (off < size) {
1305 		pc = (struct path_comp *)(fbp->fb_addr + off);
1306 		switch (pc->pc_type) {
1307 			case 1 :
1308 				(void) strcpy(uname, ip->i_udf->udf_fsmnt);
1309 				(void) strcat(uname, "/");
1310 				break;
1311 			case 2 :
1312 				if (pc->pc_len != 0) {
1313 					goto end;
1314 				}
1315 				uname[0] = '/';
1316 				uname[1] = '\0';
1317 				break;
1318 			case 3 :
1319 				(void) strcat(uname, "../");
1320 				break;
1321 			case 4 :
1322 				(void) strcat(uname, "./");
1323 				break;
1324 			case 5 :
1325 				if ((error = ud_uncompress(pc->pc_len, &id_len,
1326 					pc->pc_id, (uint8_t *)dname)) != 0) {
1327 					break;
1328 				}
1329 				dname[id_len] = '\0';
1330 				(void) strcat(uname, dname);
1331 				(void) strcat(uname, "/");
1332 				break;
1333 			default :
1334 				error = EINVAL;
1335 				goto end;
1336 		}
1337 		off += 4 + pc->pc_len;
1338 	}
1339 	len = strlen(uname) - 1;
1340 	if (uname[len] == '/') {
1341 		if (len == 0) {
1342 			/*
1343 			 * special case link to /
1344 			 */
1345 			len = 1;
1346 		} else {
1347 			uname[len] = '\0';
1348 		}
1349 	}
1350 
1351 	error = uiomove(uname, len, UIO_READ, uiop);
1352 
1353 	ITIMES(ip);
1354 
1355 end:
1356 	if (fbp != NULL) {
1357 		fbrelse(fbp, S_OTHER);
1358 	}
1359 	rw_exit(&ip->i_contents);
1360 	if (uname != NULL) {
1361 		kmem_free(uname, PAGESIZE);
1362 	}
1363 	if (dname != NULL) {
1364 		kmem_free(dname, 1024);
1365 	}
1366 	return (error);
1367 }
1368 
1369 /* ARGSUSED */
1370 static int32_t
1371 udf_fsync(struct vnode *vp,
1372 	int32_t syncflag, struct cred *cr)
1373 {
1374 	int32_t error = 0;
1375 	struct ud_inode *ip = VTOI(vp);
1376 
1377 	ud_printf("udf_fsync\n");
1378 
1379 	rw_enter(&ip->i_contents, RW_WRITER);
1380 	if (!(IS_SWAPVP(vp))) {
1381 		error = ud_syncip(ip, 0, I_SYNC); /* Do synchronous writes */
1382 	}
1383 	if (error == 0) {
1384 		error = ud_sync_indir(ip);
1385 	}
1386 	ITIMES(ip);		/* XXX: is this necessary ??? */
1387 	rw_exit(&ip->i_contents);
1388 
1389 	return (error);
1390 }
1391 
/*
 * Inactive entry point for a vnode: hands the inode behind vp to
 * ud_iinactive().
 */
/* ARGSUSED */
static void
udf_inactive(struct vnode *vp, struct cred *cr)
{
	struct ud_inode *ip = VTOI(vp);

	ud_printf("udf_iinactive\n");

	ud_iinactive(ip, cr);
}
1400 
1401 static int32_t
1402 udf_fid(struct vnode *vp, struct fid *fidp)
1403 {
1404 	struct udf_fid *udfidp;
1405 	struct ud_inode *ip = VTOI(vp);
1406 
1407 	ud_printf("udf_fid\n");
1408 
1409 	if (fidp->fid_len < (sizeof (struct udf_fid) - sizeof (uint16_t))) {
1410 		fidp->fid_len = sizeof (struct udf_fid) - sizeof (uint16_t);
1411 		return (ENOSPC);
1412 	}
1413 
1414 	udfidp = (struct udf_fid *)fidp;
1415 	bzero((char *)udfidp, sizeof (struct udf_fid));
1416 	rw_enter(&ip->i_contents, RW_READER);
1417 	udfidp->udfid_len = sizeof (struct udf_fid) - sizeof (uint16_t);
1418 	udfidp->udfid_uinq_lo = ip->i_uniqid & 0xffffffff;
1419 	udfidp->udfid_prn = ip->i_icb_prn;
1420 	udfidp->udfid_icb_lbn = ip->i_icb_block;
1421 	rw_exit(&ip->i_contents);
1422 
1423 	return (0);
1424 }
1425 
1426 /* ARGSUSED2 */
1427 static int
1428 udf_rwlock(struct vnode *vp, int32_t write_lock, caller_context_t *ctp)
1429 {
1430 	struct ud_inode *ip = VTOI(vp);
1431 
1432 	ud_printf("udf_rwlock\n");
1433 
1434 	if (write_lock) {
1435 		rw_enter(&ip->i_rwlock, RW_WRITER);
1436 	} else {
1437 		rw_enter(&ip->i_rwlock, RW_READER);
1438 	}
1439 #ifdef	__lock_lint
1440 	rw_exit(&ip->i_rwlock);
1441 #endif
1442 	return (write_lock);
1443 }
1444 
1445 /* ARGSUSED */
1446 static void
1447 udf_rwunlock(struct vnode *vp, int32_t write_lock, caller_context_t *ctp)
1448 {
1449 	struct ud_inode *ip = VTOI(vp);
1450 
1451 	ud_printf("udf_rwunlock\n");
1452 
1453 #ifdef	__lock_lint
1454 	rw_enter(&ip->i_rwlock, RW_WRITER);
1455 #endif
1456 
1457 	rw_exit(&ip->i_rwlock);
1458 
1459 }
1460 
1461 /* ARGSUSED */
1462 static int32_t
1463 udf_seek(struct vnode *vp, offset_t ooff, offset_t *noffp)
1464 {
1465 	return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
1466 }
1467 
1468 static int32_t
1469 udf_frlock(struct vnode *vp, int32_t cmd, struct flock64 *bfp,
1470 	int32_t flag, offset_t offset, struct flk_callback *flk_cbp,
1471 	cred_t *cr)
1472 {
1473 	struct ud_inode *ip = VTOI(vp);
1474 
1475 	ud_printf("udf_frlock\n");
1476 
1477 	/*
1478 	 * If file is being mapped, disallow frlock.
1479 	 * XXX I am not holding tlock while checking i_mapcnt because the
1480 	 * current locking strategy drops all locks before calling fs_frlock.
1481 	 * So, mapcnt could change before we enter fs_frlock making is
1482 	 * meaningless to have held tlock in the first place.
1483 	 */
1484 	if ((ip->i_mapcnt > 0) &&
1485 		(MANDLOCK(vp, ip->i_char))) {
1486 		return (EAGAIN);
1487 	}
1488 
1489 	return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr));
1490 }
1491 
1492 /*ARGSUSED6*/
1493 static int32_t
1494 udf_space(
1495 	struct vnode *vp,
1496 	int32_t cmd,
1497 	struct flock64 *bfp,
1498 	int32_t flag,
1499 	offset_t offset,
1500 	cred_t *cr,
1501 	caller_context_t *ct)
1502 {
1503 	int32_t error = 0;
1504 
1505 	ud_printf("udf_space\n");
1506 
1507 	if (cmd != F_FREESP) {
1508 		error =  EINVAL;
1509 	} else if ((error = convoff(vp, bfp, 0, offset)) == 0) {
1510 		error = ud_freesp(vp, bfp, flag, cr);
1511 	}
1512 
1513 	return (error);
1514 }
1515 
1516 /* ARGSUSED */
1517 static int32_t
1518 udf_getpage(struct vnode *vp, offset_t off,
1519 	size_t len, uint32_t *protp, struct page  **plarr,
1520 	size_t plsz, struct seg *seg, caddr_t addr,
1521 	enum seg_rw rw, struct cred *cr)
1522 {
1523 	struct ud_inode *ip = VTOI(vp);
1524 	int32_t error, has_holes, beyond_eof, seqmode, dolock;
1525 	int32_t pgsize = PAGESIZE;
1526 	struct udf_vfs *udf_vfsp = ip->i_udf;
1527 	page_t **pl;
1528 	u_offset_t pgoff, eoff, uoff;
1529 	krw_t rwtype;
1530 	caddr_t pgaddr;
1531 
1532 	ud_printf("udf_getpage\n");
1533 
1534 	uoff = (u_offset_t)off; /* type conversion */
1535 	if (protp) {
1536 		*protp = PROT_ALL;
1537 	}
1538 	if (vp->v_flag & VNOMAP) {
1539 		return (ENOSYS);
1540 	}
1541 	seqmode = ip->i_nextr == uoff && rw != S_CREATE;
1542 
1543 	rwtype = RW_READER;
1544 	dolock = (rw_owner(&ip->i_contents) != curthread);
1545 retrylock:
1546 #ifdef	__lock_lint
1547 	rw_enter(&ip->i_contents, rwtype);
1548 #else
1549 	if (dolock) {
1550 		rw_enter(&ip->i_contents, rwtype);
1551 	}
1552 #endif
1553 
1554 	/*
1555 	 * We may be getting called as a side effect of a bmap using
1556 	 * fbread() when the blocks might be being allocated and the
1557 	 * size has not yet been up'ed.  In this case we want to be
1558 	 * able to return zero pages if we get back UDF_HOLE from
1559 	 * calling bmap for a non write case here.  We also might have
1560 	 * to read some frags from the disk into a page if we are
1561 	 * extending the number of frags for a given lbn in bmap().
1562 	 */
1563 	beyond_eof = uoff + len > ip->i_size + PAGEOFFSET;
1564 	if (beyond_eof && seg != segkmap) {
1565 #ifdef	__lock_lint
1566 		rw_exit(&ip->i_contents);
1567 #else
1568 		if (dolock) {
1569 			rw_exit(&ip->i_contents);
1570 		}
1571 #endif
1572 		return (EFAULT);
1573 	}
1574 
1575 	/*
1576 	 * Must hold i_contents lock throughout the call to pvn_getpages
1577 	 * since locked pages are returned from each call to ud_getapage.
1578 	 * Must *not* return locked pages and then try for contents lock
1579 	 * due to lock ordering requirements (inode > page)
1580 	 */
1581 
1582 	has_holes = ud_bmap_has_holes(ip);
1583 
1584 	if ((rw == S_WRITE || rw == S_CREATE) && (has_holes || beyond_eof)) {
1585 		int32_t	blk_size, count;
1586 		u_offset_t offset;
1587 
1588 		/*
1589 		 * We must acquire the RW_WRITER lock in order to
1590 		 * call bmap_write().
1591 		 */
1592 		if (dolock && rwtype == RW_READER) {
1593 			rwtype = RW_WRITER;
1594 
1595 			if (!rw_tryupgrade(&ip->i_contents)) {
1596 
1597 				rw_exit(&ip->i_contents);
1598 
1599 				goto retrylock;
1600 			}
1601 		}
1602 
1603 		/*
1604 		 * May be allocating disk blocks for holes here as
1605 		 * a result of mmap faults. write(2) does the bmap_write
1606 		 * in rdip/wrip, not here. We are not dealing with frags
1607 		 * in this case.
1608 		 */
1609 		offset = uoff;
1610 		while ((offset < uoff + len) &&
1611 			(offset < ip->i_size)) {
1612 			/*
1613 			 * the variable "bnp" is to simplify the expression for
1614 			 * the compiler; * just passing in &bn to bmap_write
1615 			 * causes a compiler "loop"
1616 			 */
1617 
1618 			blk_size = udf_vfsp->udf_lbsize;
1619 			if ((offset + blk_size) > ip->i_size) {
1620 				count = ip->i_size - offset;
1621 			} else {
1622 				count = blk_size;
1623 			}
1624 			error = ud_bmap_write(ip, offset, count, 0, cr);
1625 			if (error) {
1626 				goto update_inode;
1627 			}
1628 			offset += count; /* XXX - make this contig */
1629 		}
1630 	}
1631 
1632 	/*
1633 	 * Can be a reader from now on.
1634 	 */
1635 #ifdef	__lock_lint
1636 	if (rwtype == RW_WRITER) {
1637 		rw_downgrade(&ip->i_contents);
1638 	}
1639 #else
1640 	if (dolock && rwtype == RW_WRITER) {
1641 		rw_downgrade(&ip->i_contents);
1642 	}
1643 #endif
1644 
1645 	/*
1646 	 * We remove PROT_WRITE in cases when the file has UDF holes
1647 	 * because we don't  want to call bmap_read() to check each
1648 	 * page if it is backed with a disk block.
1649 	 */
1650 	if (protp && has_holes && rw != S_WRITE && rw != S_CREATE) {
1651 		*protp &= ~PROT_WRITE;
1652 	}
1653 
1654 	error = 0;
1655 
1656 	/*
1657 	 * The loop looks up pages in the range <off, off + len).
1658 	 * For each page, we first check if we should initiate an asynchronous
1659 	 * read ahead before we call page_lookup (we may sleep in page_lookup
1660 	 * for a previously initiated disk read).
1661 	 */
1662 	eoff = (uoff + len);
1663 	for (pgoff = uoff, pgaddr = addr, pl = plarr;
1664 			pgoff < eoff; /* empty */) {
1665 		page_t	*pp;
1666 		u_offset_t	nextrio;
1667 		se_t	se;
1668 
1669 		se = ((rw == S_CREATE) ? SE_EXCL : SE_SHARED);
1670 
1671 		/*
1672 		 * Handle async getpage (faultahead)
1673 		 */
1674 		if (plarr == NULL) {
1675 			ip->i_nextrio = pgoff;
1676 			ud_getpage_ra(vp, pgoff, seg, pgaddr);
1677 			pgoff += pgsize;
1678 			pgaddr += pgsize;
1679 			continue;
1680 		}
1681 
1682 		/*
1683 		 * Check if we should initiate read ahead of next cluster.
1684 		 * We call page_exists only when we need to confirm that
1685 		 * we have the current page before we initiate the read ahead.
1686 		 */
1687 		nextrio = ip->i_nextrio;
1688 		if (seqmode &&
1689 		    pgoff + RD_CLUSTSZ(ip) >= nextrio && pgoff <= nextrio &&
1690 		    nextrio < ip->i_size && page_exists(vp, pgoff))
1691 			ud_getpage_ra(vp, pgoff, seg, pgaddr);
1692 
1693 		if ((pp = page_lookup(vp, pgoff, se)) != NULL) {
1694 
1695 			/*
1696 			 * We found the page in the page cache.
1697 			 */
1698 			*pl++ = pp;
1699 			pgoff += pgsize;
1700 			pgaddr += pgsize;
1701 			len -= pgsize;
1702 			plsz -= pgsize;
1703 		} else  {
1704 
1705 			/*
1706 			 * We have to create the page, or read it from disk.
1707 			 */
1708 			if (error = ud_getpage_miss(vp, pgoff, len,
1709 				seg, pgaddr, pl, plsz, rw, seqmode)) {
1710 				goto error_out;
1711 			}
1712 
1713 			while (*pl != NULL) {
1714 				pl++;
1715 				pgoff += pgsize;
1716 				pgaddr += pgsize;
1717 				len -= pgsize;
1718 				plsz -= pgsize;
1719 			}
1720 		}
1721 	}
1722 
1723 	/*
1724 	 * Return pages up to plsz if they are in the page cache.
1725 	 * We cannot return pages if there is a chance that they are
1726 	 * backed with a UDF hole and rw is S_WRITE or S_CREATE.
1727 	 */
1728 	if (plarr && !(has_holes && (rw == S_WRITE || rw == S_CREATE))) {
1729 
1730 		ASSERT((protp == NULL) ||
1731 			!(has_holes && (*protp & PROT_WRITE)));
1732 
1733 		eoff = pgoff + plsz;
1734 		while (pgoff < eoff) {
1735 			page_t		*pp;
1736 
1737 			if ((pp = page_lookup_nowait(vp, pgoff,
1738 			    SE_SHARED)) == NULL)
1739 				break;
1740 
1741 			*pl++ = pp;
1742 			pgoff += pgsize;
1743 			plsz -= pgsize;
1744 		}
1745 	}
1746 
1747 	if (plarr)
1748 		*pl = NULL;			/* Terminate page list */
1749 	ip->i_nextr = pgoff;
1750 
1751 error_out:
1752 	if (error && plarr) {
1753 		/*
1754 		 * Release any pages we have locked.
1755 		 */
1756 		while (pl > &plarr[0])
1757 			page_unlock(*--pl);
1758 
1759 		plarr[0] = NULL;
1760 	}
1761 
1762 update_inode:
1763 #ifdef	__lock_lint
1764 	rw_exit(&ip->i_contents);
1765 #else
1766 	if (dolock) {
1767 		rw_exit(&ip->i_contents);
1768 	}
1769 #endif
1770 
1771 	/*
1772 	 * If the inode is not already marked for IACC (in rwip() for read)
1773 	 * and the inode is not marked for no access time update (in rwip()
1774 	 * for write) then update the inode access time and mod time now.
1775 	 */
1776 	mutex_enter(&ip->i_tlock);
1777 	if ((ip->i_flag & (IACC | INOACC)) == 0) {
1778 		if ((rw != S_OTHER) && (ip->i_type != VDIR)) {
1779 			ip->i_flag |= IACC;
1780 		}
1781 		if (rw == S_WRITE) {
1782 			ip->i_flag |= IUPD;
1783 		}
1784 		ITIMES_NOLOCK(ip);
1785 	}
1786 	mutex_exit(&ip->i_tlock);
1787 
1788 	return (error);
1789 }
1790 
1791 int32_t ud_delay = 1;
1792 
1793 /* ARGSUSED */
1794 static int32_t
1795 udf_putpage(struct vnode *vp, offset_t off,
1796 	size_t len, int32_t flags, struct cred *cr)
1797 {
1798 	struct ud_inode *ip;
1799 	int32_t error = 0;
1800 
1801 	ud_printf("udf_putpage\n");
1802 
1803 	ip = VTOI(vp);
1804 #ifdef	__lock_lint
1805 	rw_enter(&ip->i_contents, RW_WRITER);
1806 #endif
1807 
1808 	if (vp->v_count == 0) {
1809 		cmn_err(CE_WARN, "ud_putpage : bad v_count");
1810 		error = EINVAL;
1811 		goto out;
1812 	}
1813 
1814 	if (vp->v_flag & VNOMAP) {
1815 		error = ENOSYS;
1816 		goto out;
1817 	}
1818 
1819 	if (flags & B_ASYNC) {
1820 		if (ud_delay && len &&
1821 		    (flags & ~(B_ASYNC|B_DONTNEED|B_FREE)) == 0) {
1822 			mutex_enter(&ip->i_tlock);
1823 
1824 			/*
1825 			 * If nobody stalled, start a new cluster.
1826 			 */
1827 			if (ip->i_delaylen == 0) {
1828 				ip->i_delayoff = off;
1829 				ip->i_delaylen = len;
1830 				mutex_exit(&ip->i_tlock);
1831 				goto out;
1832 			}
1833 
1834 			/*
1835 			 * If we have a full cluster or they are not contig,
1836 			 * then push last cluster and start over.
1837 			 */
1838 			if (ip->i_delaylen >= WR_CLUSTSZ(ip) ||
1839 			    ip->i_delayoff + ip->i_delaylen != off) {
1840 				u_offset_t doff;
1841 				size_t dlen;
1842 
1843 				doff = ip->i_delayoff;
1844 				dlen = ip->i_delaylen;
1845 				ip->i_delayoff = off;
1846 				ip->i_delaylen = len;
1847 				mutex_exit(&ip->i_tlock);
1848 				error = ud_putpages(vp, doff, dlen, flags, cr);
1849 				/* LMXXX - flags are new val, not old */
1850 				goto out;
1851 			}
1852 
1853 			/*
1854 			 * There is something there, it's not full, and
1855 			 * it is contig.
1856 			 */
1857 			ip->i_delaylen += len;
1858 			mutex_exit(&ip->i_tlock);
1859 			goto out;
1860 		}
1861 
1862 		/*
1863 		 * Must have weird flags or we are not clustering.
1864 		 */
1865 	}
1866 
1867 	error = ud_putpages(vp, off, len, flags, cr);
1868 
1869 out:
1870 #ifdef	__lock_lint
1871 	rw_exit(&ip->i_contents);
1872 #endif
1873 	return (error);
1874 }
1875 
1876 static int32_t
1877 udf_map(struct vnode *vp, offset_t off,
1878 	struct as *as, caddr_t *addrp, size_t len,
1879 	uint8_t prot, uint8_t maxprot, uint32_t flags,
1880 	struct cred *cr)
1881 {
1882 	struct segvn_crargs vn_a;
1883 	int32_t error = 0;
1884 
1885 	ud_printf("udf_map\n");
1886 
1887 	if (vp->v_flag & VNOMAP) {
1888 		error = ENOSYS;
1889 		goto end;
1890 	}
1891 
1892 	if ((off < (offset_t)0) ||
1893 		((off + len) < (offset_t)0)) {
1894 		error = EINVAL;
1895 		goto end;
1896 	}
1897 
1898 	if (vp->v_type != VREG) {
1899 		error = ENODEV;
1900 		goto end;
1901 	}
1902 
1903 	/*
1904 	 * If file is being locked, disallow mapping.
1905 	 */
1906 	if (vn_has_mandatory_locks(vp, VTOI(vp)->i_char)) {
1907 		error = EAGAIN;
1908 		goto end;
1909 	}
1910 
1911 	as_rangelock(as);
1912 	if ((flags & MAP_FIXED) == 0) {
1913 		map_addr(addrp, len, off, 1, flags);
1914 		if (*addrp == NULL) {
1915 			as_rangeunlock(as);
1916 			error = ENOMEM;
1917 			goto end;
1918 		}
1919 	} else {
1920 		/*
1921 		 * User specified address - blow away any previous mappings
1922 		 */
1923 		(void) as_unmap(as, *addrp, len);
1924 	}
1925 
1926 	vn_a.vp = vp;
1927 	vn_a.offset = off;
1928 	vn_a.type = flags & MAP_TYPE;
1929 	vn_a.prot = prot;
1930 	vn_a.maxprot = maxprot;
1931 	vn_a.cred = cr;
1932 	vn_a.amp = NULL;
1933 	vn_a.flags = flags & ~MAP_TYPE;
1934 	vn_a.szc = 0;
1935 	vn_a.lgrp_mem_policy_flags = 0;
1936 
1937 	error = as_map(as, *addrp, len, segvn_create, (caddr_t)&vn_a);
1938 	as_rangeunlock(as);
1939 
1940 end:
1941 	return (error);
1942 }
1943 
1944 /* ARGSUSED */
1945 static int32_t
1946 udf_addmap(struct vnode *vp, offset_t off,
1947 	struct as *as, caddr_t addr, size_t len,
1948 	uint8_t prot, uint8_t maxprot, uint32_t flags,
1949 	struct cred *cr)
1950 {
1951 	struct ud_inode *ip = VTOI(vp);
1952 
1953 	ud_printf("udf_addmap\n");
1954 
1955 	if (vp->v_flag & VNOMAP) {
1956 		return (ENOSYS);
1957 	}
1958 
1959 	mutex_enter(&ip->i_tlock);
1960 	ip->i_mapcnt += btopr(len);
1961 	mutex_exit(&ip->i_tlock);
1962 
1963 	return (0);
1964 }
1965 
1966 /* ARGSUSED */
1967 static int32_t
1968 udf_delmap(struct vnode *vp, offset_t off,
1969 	struct as *as, caddr_t addr, size_t len,
1970 	uint32_t prot, uint32_t maxprot, uint32_t flags,
1971 	struct cred *cr)
1972 {
1973 	struct ud_inode *ip = VTOI(vp);
1974 
1975 	ud_printf("udf_delmap\n");
1976 
1977 	if (vp->v_flag & VNOMAP) {
1978 		return (ENOSYS);
1979 	}
1980 
1981 	mutex_enter(&ip->i_tlock);
1982 	ip->i_mapcnt -= btopr(len); 	/* Count released mappings */
1983 	ASSERT(ip->i_mapcnt >= 0);
1984 	mutex_exit(&ip->i_tlock);
1985 
1986 	return (0);
1987 }
1988 
1989 static int32_t
1990 udf_l_pathconf(struct vnode *vp, int32_t cmd,
1991 	ulong_t *valp, struct cred *cr)
1992 {
1993 	int32_t error = 0;
1994 
1995 	ud_printf("udf_l_pathconf\n");
1996 
1997 	if (cmd == _PC_FILESIZEBITS) {
1998 		/*
1999 		 * udf supports 64 bits as file size
2000 		 * but there are several other restrictions
2001 		 * it only supports 32-bit block numbers and
2002 		 * daddr32_t is only and int32_t so taking these
2003 		 * into account we can stay just as where ufs is
2004 		 */
2005 		*valp = 41;
2006 	} else {
2007 		error = fs_pathconf(vp, cmd, valp, cr);
2008 	}
2009 
2010 	return (error);
2011 }
2012 
2013 uint32_t ud_pageio_reads = 0, ud_pageio_writes = 0;
2014 #ifndef	__lint
2015 _NOTE(SCHEME_PROTECTS_DATA("safe sharing", ud_pageio_reads))
2016 _NOTE(SCHEME_PROTECTS_DATA("safe sharing", ud_pageio_writes))
2017 #endif
2018 /*
2019  * Assumption is that there will not be a pageio request
2020  * to a enbedded file
2021  */
2022 /* ARGSUSED */
2023 static int32_t
2024 udf_pageio(struct vnode *vp, struct page *pp,
2025 	u_offset_t io_off, size_t io_len,
2026 	int32_t flags, struct cred *cr)
2027 {
2028 	daddr_t bn;
2029 	struct buf *bp;
2030 	struct ud_inode *ip = VTOI(vp);
2031 	int32_t dolock, error = 0, contig, multi_io;
2032 	size_t done_len = 0, cur_len = 0;
2033 	page_t *npp = NULL, *opp = NULL, *cpp = pp;
2034 
2035 	if (pp == NULL) {
2036 		return (EINVAL);
2037 	}
2038 
2039 	dolock = (rw_owner(&ip->i_contents) != curthread);
2040 
2041 	/*
2042 	 * We need a better check.  Ideally, we would use another
2043 	 * vnodeops so that hlocked and forcibly unmounted file
2044 	 * systems would return EIO where appropriate and w/o the
2045 	 * need for these checks.
2046 	 */
2047 	if (ip->i_udf == NULL) {
2048 		return (EIO);
2049 	}
2050 
2051 #ifdef	__lock_lint
2052 	rw_enter(&ip->i_contents, RW_READER);
2053 #else
2054 	if (dolock) {
2055 		rw_enter(&ip->i_contents, RW_READER);
2056 	}
2057 #endif
2058 
2059 	/*
2060 	 * Break the io request into chunks, one for each contiguous
2061 	 * stretch of disk blocks in the target file.
2062 	 */
2063 	while (done_len < io_len) {
2064 		ASSERT(cpp);
2065 		bp = NULL;
2066 		contig = 0;
2067 		if (error = ud_bmap_read(ip, (u_offset_t)(io_off + done_len),
2068 					&bn, &contig)) {
2069 			break;
2070 		}
2071 
2072 		if (bn == UDF_HOLE) {   /* No holey swapfiles */
2073 			cmn_err(CE_WARN, "SWAP file has HOLES");
2074 			error = EINVAL;
2075 			break;
2076 		}
2077 
2078 		cur_len = MIN(io_len - done_len, contig);
2079 
2080 		/*
2081 		 * Check if more than one I/O is
2082 		 * required to complete the given
2083 		 * I/O operation
2084 		 */
2085 		if (ip->i_udf->udf_lbsize < PAGESIZE) {
2086 			if (cur_len >= PAGESIZE) {
2087 				multi_io = 0;
2088 				cur_len &= PAGEMASK;
2089 			} else {
2090 				multi_io = 1;
2091 				cur_len = MIN(io_len - done_len, PAGESIZE);
2092 			}
2093 		}
2094 		page_list_break(&cpp, &npp, btop(cur_len));
2095 
2096 		bp = pageio_setup(cpp, cur_len, ip->i_devvp, flags);
2097 		ASSERT(bp != NULL);
2098 
2099 		bp->b_edev = ip->i_dev;
2100 		bp->b_dev = cmpdev(ip->i_dev);
2101 		bp->b_blkno = bn;
2102 		bp->b_un.b_addr = (caddr_t)0;
2103 		bp->b_file = vp;
2104 		bp->b_offset = (offset_t)(io_off + done_len);
2105 
2106 /*
2107  *		ub.ub_pageios.value.ul++;
2108  */
2109 		if (multi_io == 0) {
2110 			(void) bdev_strategy(bp);
2111 		} else {
2112 			error = ud_multi_strat(ip, cpp, bp,
2113 				(u_offset_t)(io_off + done_len));
2114 			if (error != 0) {
2115 				pageio_done(bp);
2116 				break;
2117 			}
2118 		}
2119 		if (flags & B_READ) {
2120 			ud_pageio_reads++;
2121 		} else {
2122 			ud_pageio_writes++;
2123 		}
2124 
2125 		/*
2126 		 * If the request is not B_ASYNC, wait for i/o to complete
2127 		 * and re-assemble the page list to return to the caller.
2128 		 * If it is B_ASYNC we leave the page list in pieces and
2129 		 * cleanup() will dispose of them.
2130 		 */
2131 		if ((flags & B_ASYNC) == 0) {
2132 			error = biowait(bp);
2133 			pageio_done(bp);
2134 			if (error) {
2135 				break;
2136 			}
2137 			page_list_concat(&opp, &cpp);
2138 		}
2139 		cpp = npp;
2140 		npp = NULL;
2141 		done_len += cur_len;
2142 	}
2143 
2144 	ASSERT(error || (cpp == NULL && npp == NULL && done_len == io_len));
2145 	if (error) {
2146 		if (flags & B_ASYNC) {
2147 			/* Cleanup unprocessed parts of list */
2148 			page_list_concat(&cpp, &npp);
2149 			if (flags & B_READ) {
2150 				pvn_read_done(cpp, B_ERROR);
2151 			} else {
2152 				pvn_write_done(cpp, B_ERROR);
2153 			}
2154 		} else {
2155 			/* Re-assemble list and let caller clean up */
2156 			page_list_concat(&opp, &cpp);
2157 			page_list_concat(&opp, &npp);
2158 		}
2159 	}
2160 
2161 #ifdef	__lock_lint
2162 	rw_exit(&ip->i_contents);
2163 #else
2164 	if (dolock) {
2165 		rw_exit(&ip->i_contents);
2166 	}
2167 #endif
2168 	return (error);
2169 }
2170 
2171 
2172 
2173 
2174 /* -------------------- local functions --------------------------- */
2175 
2176 
2177 
2178 int32_t
2179 ud_rdwri(enum uio_rw rw, int32_t ioflag,
2180 	struct ud_inode *ip, caddr_t base, int32_t len,
2181 	offset_t offset, enum uio_seg seg, int32_t *aresid, struct cred *cr)
2182 {
2183 	int32_t error;
2184 	struct uio auio;
2185 	struct iovec aiov;
2186 
2187 	ud_printf("ud_rdwri\n");
2188 
2189 	bzero((caddr_t)&auio, sizeof (uio_t));
2190 	bzero((caddr_t)&aiov, sizeof (iovec_t));
2191 
2192 	aiov.iov_base = base;
2193 	aiov.iov_len = len;
2194 	auio.uio_iov = &aiov;
2195 	auio.uio_iovcnt = 1;
2196 	auio.uio_loffset = offset;
2197 	auio.uio_segflg = (int16_t)seg;
2198 	auio.uio_resid = len;
2199 
2200 	if (rw == UIO_WRITE) {
2201 		auio.uio_fmode = FWRITE;
2202 		auio.uio_extflg = UIO_COPY_DEFAULT;
2203 		auio.uio_llimit = curproc->p_fsz_ctl;
2204 		error = ud_wrip(ip, &auio, ioflag, cr);
2205 	} else {
2206 		auio.uio_fmode = FREAD;
2207 		auio.uio_extflg = UIO_COPY_CACHED;
2208 		auio.uio_llimit = MAXOFFSET_T;
2209 		error = ud_rdip(ip, &auio, ioflag, cr);
2210 	}
2211 
2212 	if (aresid) {
2213 		*aresid = auio.uio_resid;
2214 	} else if (auio.uio_resid) {
2215 		error = EIO;
2216 	}
2217 	return (error);
2218 }
2219 
2220 /*
2221  * Free behind hacks.  The pager is busted.
2222  * XXX - need to pass the information down to writedone() in a flag like B_SEQ
2223  * or B_FREE_IF_TIGHT_ON_MEMORY.
2224  */
2225 int32_t ud_freebehind = 1;
2226 int32_t ud_smallfile = 32 * 1024;
2227 
2228 /* ARGSUSED */
2229 int32_t
2230 ud_getpage_miss(struct vnode *vp, u_offset_t off,
2231 	size_t len, struct seg *seg, caddr_t addr, page_t *pl[],
2232 	size_t plsz, enum seg_rw rw, int32_t seq)
2233 {
2234 	struct ud_inode *ip = VTOI(vp);
2235 	int32_t err = 0;
2236 	size_t io_len;
2237 	u_offset_t io_off;
2238 	u_offset_t pgoff;
2239 	page_t *pp;
2240 
2241 	pl[0] = NULL;
2242 
2243 	/*
2244 	 * Figure out whether the page can be created, or must be
2245 	 * read from the disk
2246 	 */
2247 	if (rw == S_CREATE) {
2248 		if ((pp = page_create_va(vp, off,
2249 				PAGESIZE, PG_WAIT, seg, addr)) == NULL) {
2250 			cmn_err(CE_WARN, "ud_getpage_miss: page_create");
2251 			return (EINVAL);
2252 		}
2253 		io_len = PAGESIZE;
2254 	} else {
2255 		pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
2256 				&io_len, off, PAGESIZE, 0);
2257 
2258 		/*
2259 		 * Some other thread has entered the page.
2260 		 * ud_getpage will retry page_lookup.
2261 		 */
2262 		if (pp == NULL) {
2263 			return (0);
2264 		}
2265 
2266 		/*
2267 		 * Fill the page with as much data as we can from the file.
2268 		 */
2269 		err = ud_page_fill(ip, pp, off, B_READ, &pgoff);
2270 		if (err) {
2271 			pvn_read_done(pp, B_ERROR);
2272 			return (err);
2273 		}
2274 
2275 		/*
2276 		 * XXX ??? ufs has io_len instead of pgoff below
2277 		 */
2278 		ip->i_nextrio = off + ((pgoff + PAGESIZE - 1) & PAGEMASK);
2279 
2280 		/*
2281 		 * If the file access is sequential, initiate read ahead
2282 		 * of the next cluster.
2283 		 */
2284 		if (seq && ip->i_nextrio < ip->i_size) {
2285 			ud_getpage_ra(vp, off, seg, addr);
2286 		}
2287 	}
2288 
2289 outmiss:
2290 	pvn_plist_init(pp, pl, plsz, (offset_t)off, io_len, rw);
2291 	return (err);
2292 }
2293 
2294 /* ARGSUSED */
2295 void
2296 ud_getpage_ra(struct vnode *vp,
2297 	u_offset_t off, struct seg *seg, caddr_t addr)
2298 {
2299 	page_t *pp;
2300 	size_t io_len;
2301 	struct ud_inode *ip = VTOI(vp);
2302 	u_offset_t io_off = ip->i_nextrio, pgoff;
2303 	caddr_t addr2 = addr + (io_off - off);
2304 	daddr_t bn;
2305 	int32_t contig = 0;
2306 
2307 	/*
2308 	 * Is this test needed?
2309 	 */
2310 
2311 	if (addr2 >= seg->s_base + seg->s_size) {
2312 		return;
2313 	}
2314 
2315 	contig = 0;
2316 	if (ud_bmap_read(ip, io_off, &bn, &contig) != 0 || bn == UDF_HOLE) {
2317 		return;
2318 	}
2319 
2320 	pp = pvn_read_kluster(vp, io_off, seg, addr2,
2321 		&io_off, &io_len, io_off, PAGESIZE, 1);
2322 
2323 	/*
2324 	 * Some other thread has entered the page.
2325 	 * So no read head done here (ie we will have to and wait
2326 	 * for the read when needed).
2327 	 */
2328 
2329 	if (pp == NULL) {
2330 		return;
2331 	}
2332 
2333 	(void) ud_page_fill(ip, pp, io_off, (B_READ|B_ASYNC), &pgoff);
2334 	ip->i_nextrio =  io_off + ((pgoff + PAGESIZE - 1) & PAGEMASK);
2335 }
2336 
2337 int
2338 ud_page_fill(struct ud_inode *ip, page_t *pp, u_offset_t off,
2339 	uint32_t bflgs, u_offset_t *pg_off)
2340 {
2341 	daddr_t bn;
2342 	struct buf *bp;
2343 	caddr_t kaddr, caddr;
2344 	int32_t error = 0, contig = 0, multi_io = 0;
2345 	int32_t lbsize = ip->i_udf->udf_lbsize;
2346 	int32_t lbmask = ip->i_udf->udf_lbmask;
2347 	uint64_t isize;
2348 
2349 	isize = (ip->i_size + lbmask) & (~lbmask);
2350 	if (ip->i_desc_type == ICB_FLAG_ONE_AD) {
2351 
2352 		/*
2353 		 * Embedded file read file_entry
2354 		 * from buffer cache and copy the required
2355 		 * portions
2356 		 */
2357 		bp = ud_bread(ip->i_dev,
2358 		ip->i_icb_lbano << ip->i_udf->udf_l2d_shift, lbsize);
2359 		if ((bp->b_error == 0) &&
2360 			(bp->b_resid == 0)) {
2361 
2362 			caddr = bp->b_un.b_addr + ip->i_data_off;
2363 
2364 			/*
2365 			 * mapin to kvm
2366 			 */
2367 			kaddr = (caddr_t)ppmapin(pp,
2368 				PROT_READ | PROT_WRITE, (caddr_t)-1);
2369 			(void) kcopy(caddr, kaddr, ip->i_size);
2370 
2371 			/*
2372 			 * mapout of kvm
2373 			 */
2374 			ppmapout(kaddr);
2375 		}
2376 		brelse(bp);
2377 		contig = ip->i_size;
2378 	} else {
2379 
2380 		/*
2381 		 * Get the continuous size and block number
2382 		 * at offset "off"
2383 		 */
2384 		if (error = ud_bmap_read(ip, off, &bn, &contig))
2385 			goto out;
2386 		contig = MIN(contig, PAGESIZE);
2387 		contig = (contig + lbmask) & (~lbmask);
2388 
2389 		/*
2390 		 * Zero part of the page which we are not
2391 		 * going to read from the disk.
2392 		 */
2393 
2394 		if (bn == UDF_HOLE) {
2395 
2396 			/*
2397 			 * This is a HOLE. Just zero out
2398 			 * the page
2399 			 */
2400 			if (((off + contig) == isize) ||
2401 				(contig == PAGESIZE)) {
2402 				pagezero(pp->p_prev, 0, PAGESIZE);
2403 				goto out;
2404 			}
2405 		}
2406 
2407 		if (contig < PAGESIZE) {
2408 			uint64_t count;
2409 
2410 			count = isize - off;
2411 			if (contig != count) {
2412 				multi_io = 1;
2413 				contig = (int32_t)(MIN(count, PAGESIZE));
2414 			} else {
2415 				pagezero(pp->p_prev, contig, PAGESIZE - contig);
2416 			}
2417 		}
2418 
2419 		/*
2420 		 * Get a bp and initialize it
2421 		 */
2422 		bp = pageio_setup(pp, contig, ip->i_devvp, bflgs);
2423 		ASSERT(bp != NULL);
2424 
2425 		bp->b_edev = ip->i_dev;
2426 		bp->b_dev = cmpdev(ip->i_dev);
2427 		bp->b_blkno = bn;
2428 		bp->b_un.b_addr = 0;
2429 		bp->b_file = ip->i_vnode;
2430 
2431 		/*
2432 		 * Start I/O
2433 		 */
2434 		if (multi_io == 0) {
2435 
2436 			/*
2437 			 * Single I/O is sufficient for this page
2438 			 */
2439 			(void) bdev_strategy(bp);
2440 		} else {
2441 
2442 			/*
2443 			 * We need to do the I/O in
2444 			 * piece's
2445 			 */
2446 			error = ud_multi_strat(ip, pp, bp, off);
2447 			if (error != 0) {
2448 				goto out;
2449 			}
2450 		}
2451 		if ((bflgs & B_ASYNC) == 0) {
2452 
2453 			/*
2454 			 * Wait for i/o to complete.
2455 			 */
2456 
2457 			error = biowait(bp);
2458 			pageio_done(bp);
2459 			if (error) {
2460 				goto out;
2461 			}
2462 		}
2463 	}
2464 	if ((off + contig) >= ip->i_size) {
2465 		contig = ip->i_size - off;
2466 	}
2467 
2468 out:
2469 	*pg_off = contig;
2470 	return (error);
2471 }
2472 
2473 int32_t
2474 ud_putpages(struct vnode *vp, offset_t off,
2475 	size_t len, int32_t flags, struct cred *cr)
2476 {
2477 	struct ud_inode *ip;
2478 	page_t *pp;
2479 	u_offset_t io_off;
2480 	size_t io_len;
2481 	u_offset_t eoff;
2482 	int32_t err = 0;
2483 	int32_t dolock;
2484 
2485 	ud_printf("ud_putpages\n");
2486 
2487 	if (vp->v_count == 0) {
2488 		cmn_err(CE_WARN, "ud_putpages: bad v_count");
2489 		return (EINVAL);
2490 	}
2491 
2492 	ip = VTOI(vp);
2493 
2494 	/*
2495 	 * Acquire the readers/write inode lock before locking
2496 	 * any pages in this inode.
2497 	 * The inode lock is held during i/o.
2498 	 */
2499 	if (len == 0) {
2500 		mutex_enter(&ip->i_tlock);
2501 		ip->i_delayoff = ip->i_delaylen = 0;
2502 		mutex_exit(&ip->i_tlock);
2503 	}
2504 #ifdef	__lock_lint
2505 	rw_enter(&ip->i_contents, RW_READER);
2506 #else
2507 	dolock = (rw_owner(&ip->i_contents) != curthread);
2508 	if (dolock) {
2509 		rw_enter(&ip->i_contents, RW_READER);
2510 	}
2511 #endif
2512 
2513 	if (!vn_has_cached_data(vp)) {
2514 #ifdef	__lock_lint
2515 		rw_exit(&ip->i_contents);
2516 #else
2517 		if (dolock) {
2518 			rw_exit(&ip->i_contents);
2519 		}
2520 #endif
2521 		return (0);
2522 	}
2523 
2524 	if (len == 0) {
2525 		/*
2526 		 * Search the entire vp list for pages >= off.
2527 		 */
2528 		err = pvn_vplist_dirty(vp, (u_offset_t)off, ud_putapage,
2529 					flags, cr);
2530 	} else {
2531 		/*
2532 		 * Loop over all offsets in the range looking for
2533 		 * pages to deal with.
2534 		 */
2535 		if ((eoff = blkroundup(ip->i_udf, ip->i_size)) != 0) {
2536 			eoff = MIN(off + len, eoff);
2537 		} else {
2538 			eoff = off + len;
2539 		}
2540 
2541 		for (io_off = off; io_off < eoff; io_off += io_len) {
2542 			/*
2543 			 * If we are not invalidating, synchronously
2544 			 * freeing or writing pages, use the routine
2545 			 * page_lookup_nowait() to prevent reclaiming
2546 			 * them from the free list.
2547 			 */
2548 			if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
2549 				pp = page_lookup(vp, io_off,
2550 					(flags & (B_INVAL | B_FREE)) ?
2551 					    SE_EXCL : SE_SHARED);
2552 			} else {
2553 				pp = page_lookup_nowait(vp, io_off,
2554 					(flags & B_FREE) ? SE_EXCL : SE_SHARED);
2555 			}
2556 
2557 			if (pp == NULL || pvn_getdirty(pp, flags) == 0) {
2558 				io_len = PAGESIZE;
2559 			} else {
2560 
2561 				err = ud_putapage(vp, pp,
2562 					&io_off, &io_len, flags, cr);
2563 				if (err != 0) {
2564 					break;
2565 				}
2566 				/*
2567 				 * "io_off" and "io_len" are returned as
2568 				 * the range of pages we actually wrote.
2569 				 * This allows us to skip ahead more quickly
2570 				 * since several pages may've been dealt
2571 				 * with by this iteration of the loop.
2572 				 */
2573 			}
2574 		}
2575 	}
2576 	if (err == 0 && off == 0 && (len == 0 || len >= ip->i_size)) {
2577 		/*
2578 		 * We have just sync'ed back all the pages on
2579 		 * the inode, turn off the IMODTIME flag.
2580 		 */
2581 		mutex_enter(&ip->i_tlock);
2582 		ip->i_flag &= ~IMODTIME;
2583 		mutex_exit(&ip->i_tlock);
2584 	}
2585 #ifdef	__lock_lint
2586 	rw_exit(&ip->i_contents);
2587 #else
2588 	if (dolock) {
2589 		rw_exit(&ip->i_contents);
2590 	}
2591 #endif
2592 	return (err);
2593 }
2594 
2595 /* ARGSUSED */
2596 int32_t
2597 ud_putapage(struct vnode *vp,
2598 	page_t *pp, u_offset_t *offp,
2599 	size_t *lenp, int32_t flags, struct cred *cr)
2600 {
2601 	daddr_t bn;
2602 	size_t io_len;
2603 	struct ud_inode *ip;
2604 	int32_t error = 0, contig, multi_io = 0;
2605 	struct udf_vfs *udf_vfsp;
2606 	u_offset_t off, io_off;
2607 	caddr_t kaddr, caddr;
2608 	struct buf *bp = NULL;
2609 	int32_t lbmask;
2610 	uint64_t isize;
2611 	int32_t crc_len;
2612 	struct file_entry *fe;
2613 
2614 	ud_printf("ud_putapage\n");
2615 
2616 	ip = VTOI(vp);
2617 	ASSERT(ip);
2618 	ASSERT(RW_LOCK_HELD(&ip->i_contents));
2619 	lbmask = ip->i_udf->udf_lbmask;
2620 	isize = (ip->i_size + lbmask) & (~lbmask);
2621 
2622 	udf_vfsp = ip->i_udf;
2623 	ASSERT(udf_vfsp->udf_flags & UDF_FL_RW);
2624 
2625 	/*
2626 	 * If the modified time on the inode has not already been
2627 	 * set elsewhere (e.g. for write/setattr) we set the time now.
2628 	 * This gives us approximate modified times for mmap'ed files
2629 	 * which are modified via stores in the user address space.
2630 	 */
2631 	if (((ip->i_flag & IMODTIME) == 0) || (flags & B_FORCE)) {
2632 		mutex_enter(&ip->i_tlock);
2633 		ip->i_flag |= IUPD;
2634 		ITIMES_NOLOCK(ip);
2635 		mutex_exit(&ip->i_tlock);
2636 	}
2637 
2638 
2639 	/*
2640 	 * Align the request to a block boundry (for old file systems),
2641 	 * and go ask bmap() how contiguous things are for this file.
2642 	 */
2643 	off = pp->p_offset & ~(offset_t)lbmask;
2644 				/* block align it */
2645 
2646 
2647 	if (ip->i_desc_type == ICB_FLAG_ONE_AD) {
2648 		ASSERT(ip->i_size <= ip->i_max_emb);
2649 
2650 		pp = pvn_write_kluster(vp, pp, &io_off,
2651 			&io_len, off, PAGESIZE, flags);
2652 		if (io_len == 0) {
2653 			io_len = PAGESIZE;
2654 		}
2655 
2656 		bp = ud_bread(ip->i_dev,
2657 			ip->i_icb_lbano << udf_vfsp->udf_l2d_shift,
2658 			udf_vfsp->udf_lbsize);
2659 		fe = (struct file_entry *)bp->b_un.b_addr;
2660 		if ((bp->b_flags & B_ERROR) ||
2661 		    (ud_verify_tag_and_desc(&fe->fe_tag, UD_FILE_ENTRY,
2662 		    ip->i_icb_block,
2663 		    1, udf_vfsp->udf_lbsize) != 0)) {
2664 			if (pp != NULL)
2665 				pvn_write_done(pp, B_ERROR | B_WRITE | flags);
2666 			if (bp->b_flags & B_ERROR) {
2667 				error = EIO;
2668 			} else {
2669 				error = EINVAL;
2670 			}
2671 			brelse(bp);
2672 			return (error);
2673 		}
2674 		if ((bp->b_error == 0) &&
2675 			(bp->b_resid == 0)) {
2676 
2677 			caddr = bp->b_un.b_addr + ip->i_data_off;
2678 			kaddr = (caddr_t)ppmapin(pp,
2679 				PROT_READ | PROT_WRITE, (caddr_t)-1);
2680 			(void) kcopy(kaddr, caddr, ip->i_size);
2681 			ppmapout(kaddr);
2682 		}
2683 		crc_len = ((uint32_t)&((struct file_entry *)0)->fe_spec) +
2684 				SWAP_32(fe->fe_len_ear);
2685 		crc_len += ip->i_size;
2686 		ud_make_tag(ip->i_udf, &fe->fe_tag,
2687 			UD_FILE_ENTRY, ip->i_icb_block, crc_len);
2688 
2689 		bwrite(bp);
2690 
2691 		if (flags & B_ASYNC) {
2692 			pvn_write_done(pp, flags);
2693 		}
2694 		contig = ip->i_size;
2695 	} else {
2696 
2697 		if (error = ud_bmap_read(ip, off, &bn, &contig)) {
2698 			goto out;
2699 		}
2700 		contig = MIN(contig, PAGESIZE);
2701 		contig = (contig + lbmask) & (~lbmask);
2702 
2703 		if (contig < PAGESIZE) {
2704 			uint64_t count;
2705 
2706 			count = isize - off;
2707 			if (contig != count) {
2708 				multi_io = 1;
2709 				contig = (int32_t)(MIN(count, PAGESIZE));
2710 			}
2711 		}
2712 
2713 		if ((off + contig) > isize) {
2714 			contig = isize - off;
2715 		}
2716 
2717 		if (contig > PAGESIZE) {
2718 			if (contig & PAGEOFFSET) {
2719 				contig &= PAGEMASK;
2720 			}
2721 		}
2722 
2723 		pp = pvn_write_kluster(vp, pp, &io_off,
2724 			&io_len, off, contig, flags);
2725 		if (io_len == 0) {
2726 			io_len = PAGESIZE;
2727 		}
2728 
2729 		bp = pageio_setup(pp, contig, ip->i_devvp, B_WRITE | flags);
2730 		ASSERT(bp != NULL);
2731 
2732 		bp->b_edev = ip->i_dev;
2733 		bp->b_dev = cmpdev(ip->i_dev);
2734 		bp->b_blkno = bn;
2735 		bp->b_un.b_addr = 0;
2736 		bp->b_file = vp;
2737 		bp->b_offset = (offset_t)off;
2738 
2739 
2740 		/*
2741 		 * write throttle
2742 		 */
2743 		ASSERT(bp->b_iodone == NULL);
2744 		bp->b_iodone = ud_iodone;
2745 		mutex_enter(&ip->i_tlock);
2746 		ip->i_writes += bp->b_bcount;
2747 		mutex_exit(&ip->i_tlock);
2748 
2749 		if (multi_io == 0) {
2750 
2751 			(void) bdev_strategy(bp);
2752 		} else {
2753 			error = ud_multi_strat(ip, pp, bp, off);
2754 			if (error != 0) {
2755 				goto out;
2756 			}
2757 		}
2758 
2759 		if ((flags & B_ASYNC) == 0) {
2760 			/*
2761 			 * Wait for i/o to complete.
2762 			 */
2763 			error = biowait(bp);
2764 			pageio_done(bp);
2765 		}
2766 	}
2767 
2768 	if ((flags & B_ASYNC) == 0) {
2769 		pvn_write_done(pp, ((error) ? B_ERROR : 0) | B_WRITE | flags);
2770 	}
2771 
2772 	pp = NULL;
2773 
2774 out:
2775 	if (error != 0 && pp != NULL) {
2776 		pvn_write_done(pp, B_ERROR | B_WRITE | flags);
2777 	}
2778 
2779 	if (offp) {
2780 		*offp = io_off;
2781 	}
2782 	if (lenp) {
2783 		*lenp = io_len;
2784 	}
2785 
2786 	return (error);
2787 }
2788 
2789 
2790 int32_t
2791 ud_iodone(struct buf *bp)
2792 {
2793 	struct ud_inode *ip;
2794 
2795 	ASSERT((bp->b_pages->p_vnode != NULL) && !(bp->b_flags & B_READ));
2796 
2797 	bp->b_iodone = NULL;
2798 
2799 	ip = VTOI(bp->b_pages->p_vnode);
2800 
2801 	mutex_enter(&ip->i_tlock);
2802 	if (ip->i_writes >= ud_LW) {
2803 		if ((ip->i_writes -= bp->b_bcount) <= ud_LW) {
2804 			if (ud_WRITES) {
2805 				cv_broadcast(&ip->i_wrcv); /* wake all up */
2806 			}
2807 		}
2808 	} else {
2809 		ip->i_writes -= bp->b_bcount;
2810 	}
2811 	mutex_exit(&ip->i_tlock);
2812 	iodone(bp);
2813 	return (0);
2814 }
2815 
2816 /* ARGSUSED3 */
2817 int32_t
2818 ud_rdip(struct ud_inode *ip, struct uio *uio, int32_t ioflag, cred_t *cr)
2819 {
2820 	struct vnode *vp;
2821 	struct udf_vfs *udf_vfsp;
2822 	krw_t rwtype;
2823 	caddr_t base;
2824 	uint32_t flags;
2825 	int32_t error, n, on, mapon, dofree;
2826 	u_offset_t off;
2827 	long oresid = uio->uio_resid;
2828 
2829 	ASSERT(RW_LOCK_HELD(&ip->i_contents));
2830 	if ((ip->i_type != VREG) &&
2831 		(ip->i_type != VDIR) &&
2832 		(ip->i_type != VLNK)) {
2833 		return (EIO);
2834 	}
2835 
2836 	if (uio->uio_loffset > MAXOFFSET_T) {
2837 		return (0);
2838 	}
2839 
2840 	if ((uio->uio_loffset < (offset_t)0) ||
2841 		((uio->uio_loffset + uio->uio_resid) < 0)) {
2842 		return (EINVAL);
2843 	}
2844 	if (uio->uio_resid == 0) {
2845 		return (0);
2846 	}
2847 
2848 	vp = ITOV(ip);
2849 	udf_vfsp = ip->i_udf;
2850 	mutex_enter(&ip->i_tlock);
2851 	ip->i_flag |= IACC;
2852 	mutex_exit(&ip->i_tlock);
2853 
2854 	rwtype = (rw_write_held(&ip->i_contents)?RW_WRITER:RW_READER);
2855 
2856 	do {
2857 		offset_t diff;
2858 		u_offset_t uoff = uio->uio_loffset;
2859 		off = uoff & (offset_t)MAXBMASK;
2860 		mapon = (int)(uoff & (offset_t)MAXBOFFSET);
2861 		on = (int)blkoff(udf_vfsp, uoff);
2862 		n = (int)MIN(udf_vfsp->udf_lbsize - on, uio->uio_resid);
2863 
2864 		diff = ip->i_size - uoff;
2865 
2866 		if (diff <= (offset_t)0) {
2867 			error = 0;
2868 			goto out;
2869 		}
2870 		if (diff < (offset_t)n) {
2871 			n = (int)diff;
2872 		}
2873 		dofree = ud_freebehind &&
2874 			ip->i_nextr == (off & PAGEMASK) &&
2875 				off > ud_smallfile;
2876 
2877 #ifndef	__lock_lint
2878 		if (rwtype == RW_READER) {
2879 			rw_exit(&ip->i_contents);
2880 		}
2881 #endif
2882 
2883 		base = segmap_getmapflt(segkmap, vp, (off + mapon),
2884 					(uint32_t)n, 1, S_READ);
2885 		error = uiomove(base + mapon, (long)n, UIO_READ, uio);
2886 
2887 		flags = 0;
2888 		if (!error) {
2889 			/*
2890 			 * If read a whole block, or read to eof,
2891 			 * won't need this buffer again soon.
2892 			 */
2893 			if (n + on == MAXBSIZE && ud_freebehind && dofree &&
2894 				freemem < lotsfree + pages_before_pager) {
2895 				flags = SM_FREE | SM_DONTNEED |SM_ASYNC;
2896 			}
2897 			/*
2898 			 * In POSIX SYNC (FSYNC and FDSYNC) read mode,
2899 			 * we want to make sure that the page which has
2900 			 * been read, is written on disk if it is dirty.
2901 			 * And corresponding indirect blocks should also
2902 			 * be flushed out.
2903 			 */
2904 			if ((ioflag & FRSYNC) && (ioflag & (FSYNC|FDSYNC))) {
2905 				flags &= ~SM_ASYNC;
2906 				flags |= SM_WRITE;
2907 			}
2908 			error = segmap_release(segkmap, base, flags);
2909 		} else    {
2910 			(void) segmap_release(segkmap, base, flags);
2911 		}
2912 
2913 #ifndef __lock_lint
2914 		if (rwtype == RW_READER) {
2915 			rw_enter(&ip->i_contents, rwtype);
2916 		}
2917 #endif
2918 	} while (error == 0 && uio->uio_resid > 0 && n != 0);
2919 out:
2920 	/*
2921 	 * Inode is updated according to this table if FRSYNC is set.
2922 	 *
2923 	 *	FSYNC	FDSYNC(posix.4)
2924 	 *	--------------------------
2925 	 *	always	IATTCHG|IBDWRITE
2926 	 */
2927 	if (ioflag & FRSYNC) {
2928 		if ((ioflag & FSYNC) ||
2929 		((ioflag & FDSYNC) && (ip->i_flag & (IATTCHG|IBDWRITE)))) {
2930 		rw_exit(&ip->i_contents);
2931 		rw_enter(&ip->i_contents, RW_WRITER);
2932 		ud_iupdat(ip, 1);
2933 		}
2934 	}
2935 	/*
2936 	 * If we've already done a partial read, terminate
2937 	 * the read but return no error.
2938 	 */
2939 	if (oresid != uio->uio_resid) {
2940 		error = 0;
2941 	}
2942 	ITIMES(ip);
2943 
2944 	return (error);
2945 }
2946 
2947 int32_t
2948 ud_wrip(struct ud_inode *ip, struct uio *uio, int ioflag, struct cred *cr)
2949 {
2950 	caddr_t base;
2951 	struct vnode *vp;
2952 	struct udf_vfs *udf_vfsp;
2953 	uint32_t flags;
2954 	int32_t error = 0, iupdat_flag, n, on, mapon, i_size_changed = 0;
2955 	int32_t pagecreate, newpage;
2956 	uint64_t old_i_size;
2957 	u_offset_t off;
2958 	long start_resid = uio->uio_resid, premove_resid;
2959 	rlim64_t limit = uio->uio_limit;
2960 
2961 
2962 	ASSERT(RW_WRITE_HELD(&ip->i_contents));
2963 	if ((ip->i_type != VREG) &&
2964 		(ip->i_type != VDIR) &&
2965 		(ip->i_type != VLNK)) {
2966 		return (EIO);
2967 	}
2968 
2969 	if (uio->uio_loffset >= MAXOFFSET_T) {
2970 		return (EFBIG);
2971 	}
2972 	/*
2973 	 * see udf_l_pathconf
2974 	 */
2975 	if (limit > (((uint64_t)1 << 40) - 1)) {
2976 		limit = ((uint64_t)1 << 40) - 1;
2977 	}
2978 	if (uio->uio_loffset >= limit) {
2979 		proc_t *p = ttoproc(curthread);
2980 
2981 		mutex_enter(&p->p_lock);
2982 		(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE], p->p_rctls,
2983 		    p, RCA_UNSAFE_SIGINFO);
2984 		mutex_exit(&p->p_lock);
2985 		return (EFBIG);
2986 	}
2987 	if ((uio->uio_loffset < (offset_t)0) ||
2988 		((uio->uio_loffset + uio->uio_resid) < 0)) {
2989 		return (EINVAL);
2990 	}
2991 	if (uio->uio_resid == 0) {
2992 		return (0);
2993 	}
2994 
2995 	mutex_enter(&ip->i_tlock);
2996 	ip->i_flag |= INOACC;
2997 
2998 	if (ioflag & (FSYNC | FDSYNC)) {
2999 		ip->i_flag |= ISYNC;
3000 		iupdat_flag = 1;
3001 	}
3002 	mutex_exit(&ip->i_tlock);
3003 
3004 	udf_vfsp = ip->i_udf;
3005 	vp = ITOV(ip);
3006 
3007 	do {
3008 		u_offset_t uoff = uio->uio_loffset;
3009 		off = uoff & (offset_t)MAXBMASK;
3010 		mapon = (int)(uoff & (offset_t)MAXBOFFSET);
3011 		on = (int)blkoff(udf_vfsp, uoff);
3012 		n = (int)MIN(udf_vfsp->udf_lbsize - on, uio->uio_resid);
3013 
3014 		if (ip->i_type == VREG && uoff + n >= limit) {
3015 			if (uoff >= limit) {
3016 				error = EFBIG;
3017 				goto out;
3018 			}
3019 			n = (int)(limit - (rlim64_t)uoff);
3020 		}
3021 		if (uoff + n > ip->i_size) {
3022 			/*
3023 			 * We are extending the length of the file.
3024 			 * bmap is used so that we are sure that
3025 			 * if we need to allocate new blocks, that it
3026 			 * is done here before we up the file size.
3027 			 */
3028 			error = ud_bmap_write(ip, uoff,
3029 					(int)(on + n), mapon == 0, cr);
3030 			if (error) {
3031 				break;
3032 			}
3033 			i_size_changed = 1;
3034 			old_i_size = ip->i_size;
3035 			ip->i_size = uoff + n;
3036 			/*
3037 			 * If we are writing from the beginning of
3038 			 * the mapping, we can just create the
3039 			 * pages without having to read them.
3040 			 */
3041 			pagecreate = (mapon == 0);
3042 		} else if (n == MAXBSIZE) {
3043 			/*
3044 			 * Going to do a whole mappings worth,
3045 			 * so we can just create the pages w/o
3046 			 * having to read them in.  But before
3047 			 * we do that, we need to make sure any
3048 			 * needed blocks are allocated first.
3049 			 */
3050 			error = ud_bmap_write(ip, uoff,
3051 					(int)(on + n), 1, cr);
3052 			if (error) {
3053 				break;
3054 			}
3055 			pagecreate = 1;
3056 		} else {
3057 			pagecreate = 0;
3058 		}
3059 
3060 		rw_exit(&ip->i_contents);
3061 
3062 		base = segmap_getmapflt(segkmap, vp, (off + mapon),
3063 				(uint32_t)n, !pagecreate, S_WRITE);
3064 
3065 		/*
3066 		 * segmap_pagecreate() returns 1 if it calls
3067 		 * page_create_va() to allocate any pages.
3068 		 */
3069 		newpage = 0;
3070 		if (pagecreate) {
3071 			newpage = segmap_pagecreate(segkmap, base,
3072 					(size_t)n, 0);
3073 		}
3074 
3075 		premove_resid = uio->uio_resid;
3076 		error = uiomove(base + mapon, (long)n, UIO_WRITE, uio);
3077 
3078 		if (pagecreate &&
3079 			uio->uio_loffset < roundup(off + mapon + n, PAGESIZE)) {
3080 			/*
3081 			 * We created pages w/o initializing them completely,
3082 			 * thus we need to zero the part that wasn't set up.
3083 			 * This happens on most EOF write cases and if
3084 			 * we had some sort of error during the uiomove.
3085 			 */
3086 			int nzero, nmoved;
3087 
3088 			nmoved = (int)(uio->uio_loffset - (off + mapon));
3089 			ASSERT(nmoved >= 0 && nmoved <= n);
3090 			nzero = roundup(on + n, PAGESIZE) - nmoved;
3091 			ASSERT(nzero > 0 && mapon + nmoved + nzero <= MAXBSIZE);
3092 			(void) kzero(base + mapon + nmoved, (uint32_t)nzero);
3093 		}
3094 
3095 		/*
3096 		 * Unlock the pages allocated by page_create_va()
3097 		 * in segmap_pagecreate()
3098 		 */
3099 		if (newpage) {
3100 			segmap_pageunlock(segkmap, base, (size_t)n, S_WRITE);
3101 		}
3102 
3103 		if (error) {
3104 			/*
3105 			 * If we failed on a write, we may have already
3106 			 * allocated file blocks as well as pages.  It's
3107 			 * hard to undo the block allocation, but we must
3108 			 * be sure to invalidate any pages that may have
3109 			 * been allocated.
3110 			 */
3111 			(void) segmap_release(segkmap, base, SM_INVAL);
3112 		} else {
3113 			flags = 0;
3114 			/*
3115 			 * Force write back for synchronous write cases.
3116 			 */
3117 			if ((ioflag & (FSYNC|FDSYNC)) || ip->i_type == VDIR) {
3118 				/*
3119 				 * If the sticky bit is set but the
3120 				 * execute bit is not set, we do a
3121 				 * synchronous write back and free
3122 				 * the page when done.  We set up swap
3123 				 * files to be handled this way to
3124 				 * prevent servers from keeping around
3125 				 * the client's swap pages too long.
3126 				 * XXX - there ought to be a better way.
3127 				 */
3128 				if (IS_SWAPVP(vp)) {
3129 					flags = SM_WRITE | SM_FREE |
3130 							SM_DONTNEED;
3131 					iupdat_flag = 0;
3132 				} else {
3133 					flags = SM_WRITE;
3134 				}
3135 			} else if (((mapon + n) == MAXBSIZE) ||
3136 					IS_SWAPVP(vp)) {
3137 				/*
3138 				 * Have written a whole block.
3139 				 * Start an asynchronous write and
3140 				 * mark the buffer to indicate that
3141 				 * it won't be needed again soon.
3142 				 */
3143 				flags = SM_WRITE |SM_ASYNC | SM_DONTNEED;
3144 			}
3145 			error = segmap_release(segkmap, base, flags);
3146 
3147 			/*
3148 			 * If the operation failed and is synchronous,
3149 			 * then we need to unwind what uiomove() last
3150 			 * did so we can potentially return an error to
3151 			 * the caller.  If this write operation was
3152 			 * done in two pieces and the first succeeded,
3153 			 * then we won't return an error for the second
3154 			 * piece that failed.  However, we only want to
3155 			 * return a resid value that reflects what was
3156 			 * really done.
3157 			 *
3158 			 * Failures for non-synchronous operations can
3159 			 * be ignored since the page subsystem will
3160 			 * retry the operation until it succeeds or the
3161 			 * file system is unmounted.
3162 			 */
3163 			if (error) {
3164 				if ((ioflag & (FSYNC | FDSYNC)) ||
3165 					ip->i_type == VDIR) {
3166 					uio->uio_resid = premove_resid;
3167 				} else {
3168 					error = 0;
3169 				}
3170 			}
3171 		}
3172 
3173 		/*
3174 		 * Re-acquire contents lock.
3175 		 */
3176 		rw_enter(&ip->i_contents, RW_WRITER);
3177 		/*
3178 		 * If the uiomove() failed or if a synchronous
3179 		 * page push failed, fix up i_size.
3180 		 */
3181 		if (error) {
3182 			if (i_size_changed) {
3183 				/*
3184 				 * The uiomove failed, and we
3185 				 * allocated blocks,so get rid
3186 				 * of them.
3187 				 */
3188 				(void) ud_itrunc(ip, old_i_size, 0, cr);
3189 			}
3190 		} else {
3191 			/*
3192 			 * XXX - Can this be out of the loop?
3193 			 */
3194 			ip->i_flag |= IUPD | ICHG;
3195 			if (i_size_changed) {
3196 				ip->i_flag |= IATTCHG;
3197 			}
3198 			if ((ip->i_perm & (IEXEC | (IEXEC >> 5) |
3199 				(IEXEC >> 10))) != 0 &&
3200 			    (ip->i_char & (ISUID | ISGID)) != 0 &&
3201 			    secpolicy_vnode_setid_retain(cr,
3202 			    (ip->i_char & ISUID) != 0 && ip->i_uid == 0) != 0) {
3203 				/*
3204 				 * Clear Set-UID & Set-GID bits on
3205 				 * successful write if not privileged
3206 				 * and at least one of the execute bits
3207 				 * is set.  If we always clear Set-GID,
3208 				 * mandatory file and record locking is
3209 				 * unuseable.
3210 				 */
3211 				ip->i_char &= ~(ISUID | ISGID);
3212 			}
3213 		}
3214 	} while (error == 0 && uio->uio_resid > 0 && n != 0);
3215 
3216 out:
3217 	/*
3218 	 * Inode is updated according to this table -
3219 	 *
3220 	 *	FSYNC	FDSYNC(posix.4)
3221 	 *	--------------------------
3222 	 *	always@	IATTCHG|IBDWRITE
3223 	 *
3224 	 * @ -  If we are doing synchronous write the only time we should
3225 	 *	not be sync'ing the ip here is if we have the stickyhack
3226 	 *	activated, the file is marked with the sticky bit and
3227 	 *	no exec bit, the file length has not been changed and
3228 	 *	no new blocks have been allocated during this write.
3229 	 */
3230 	if ((ip->i_flag & ISYNC) != 0) {
3231 		/*
3232 		 * we have eliminated nosync
3233 		 */
3234 		if ((ip->i_flag & (IATTCHG|IBDWRITE)) ||
3235 			((ioflag & FSYNC) && iupdat_flag)) {
3236 			ud_iupdat(ip, 1);
3237 		}
3238 	}
3239 
3240 	/*
3241 	 * If we've already done a partial-write, terminate
3242 	 * the write but return no error.
3243 	 */
3244 	if (start_resid != uio->uio_resid) {
3245 		error = 0;
3246 	}
3247 	ip->i_flag &= ~(INOACC | ISYNC);
3248 	ITIMES_NOLOCK(ip);
3249 
3250 	return (error);
3251 }
3252 
3253 int32_t
3254 ud_multi_strat(struct ud_inode *ip,
3255 	page_t *pp, struct buf *bp, u_offset_t start)
3256 {
3257 	daddr_t bn;
3258 	int32_t error = 0, io_count, contig, alloc_sz, i;
3259 	uint32_t io_off;
3260 	mio_master_t *mm = NULL;
3261 	mio_slave_t *ms = NULL;
3262 	struct buf *rbp;
3263 
3264 	ASSERT(!(start & PAGEOFFSET));
3265 
3266 	/*
3267 	 * Figure out how many buffers to allocate
3268 	 */
3269 	io_count = 0;
3270 	for (io_off = 0; io_off < bp->b_bcount; io_off += contig) {
3271 		contig = 0;
3272 		if (error = ud_bmap_read(ip, (u_offset_t)(start + io_off),
3273 					&bn, &contig)) {
3274 			goto end;
3275 		}
3276 		if (contig == 0) {
3277 			goto end;
3278 		}
3279 		contig = MIN(contig, PAGESIZE - io_off);
3280 		if (bn != UDF_HOLE) {
3281 			io_count ++;
3282 		} else {
3283 			/*
3284 			 * HOLE
3285 			 */
3286 			if (bp->b_flags & B_READ) {
3287 
3288 				/*
3289 				 * This is a hole and is read
3290 				 * it should be filled with 0's
3291 				 */
3292 				pagezero(pp, io_off, contig);
3293 			}
3294 		}
3295 	}
3296 
3297 
3298 	if (io_count != 0) {
3299 
3300 		/*
3301 		 * Allocate memory for all the
3302 		 * required number of buffers
3303 		 */
3304 		alloc_sz = sizeof (mio_master_t) +
3305 			(sizeof (mio_slave_t) * io_count);
3306 		mm = (mio_master_t *)kmem_zalloc(alloc_sz, KM_SLEEP);
3307 		if (mm == NULL) {
3308 			error = ENOMEM;
3309 			goto end;
3310 		}
3311 
3312 		/*
3313 		 * initialize master
3314 		 */
3315 		mutex_init(&mm->mm_mutex, NULL, MUTEX_DEFAULT, NULL);
3316 		mm->mm_size = alloc_sz;
3317 		mm->mm_bp = bp;
3318 		mm->mm_resid = 0;
3319 		mm->mm_error = 0;
3320 		mm->mm_index = master_index++;
3321 
3322 		ms = (mio_slave_t *)(((caddr_t)mm) + sizeof (mio_master_t));
3323 
3324 		/*
3325 		 * Initialize buffers
3326 		 */
3327 		io_count = 0;
3328 		for (io_off = 0; io_off < bp->b_bcount; io_off += contig) {
3329 			contig = 0;
3330 			if (error = ud_bmap_read(ip,
3331 					(u_offset_t)(start + io_off),
3332 					&bn, &contig)) {
3333 				goto end;
3334 			}
3335 			ASSERT(contig);
3336 			if ((io_off + contig) > bp->b_bcount) {
3337 				contig = bp->b_bcount - io_off;
3338 			}
3339 			if (bn != UDF_HOLE) {
3340 				/*
3341 				 * Clone the buffer
3342 				 * and prepare to start I/O
3343 				 */
3344 				ms->ms_ptr = mm;
3345 				bioinit(&ms->ms_buf);
3346 				rbp = bioclone(bp, io_off, (size_t)contig,
3347 					bp->b_edev, bn, ud_slave_done,
3348 					&ms->ms_buf, KM_NOSLEEP);
3349 				ASSERT(rbp == &ms->ms_buf);
3350 				mm->mm_resid += contig;
3351 				io_count++;
3352 				ms ++;
3353 			}
3354 		}
3355 
3356 		/*
3357 		 * Start I/O's
3358 		 */
3359 		ms = (mio_slave_t *)(((caddr_t)mm) + sizeof (mio_master_t));
3360 		for (i = 0; i < io_count; i++) {
3361 			(void) bdev_strategy(&ms->ms_buf);
3362 			ms ++;
3363 		}
3364 	}
3365 
3366 end:
3367 	if (error != 0) {
3368 		bp->b_flags |= B_ERROR;
3369 		bp->b_error = error;
3370 		if (mm != NULL) {
3371 			mutex_destroy(&mm->mm_mutex);
3372 			kmem_free(mm, mm->mm_size);
3373 		}
3374 	}
3375 	return (error);
3376 }
3377 
3378 int32_t
3379 ud_slave_done(struct buf *bp)
3380 {
3381 	mio_master_t *mm;
3382 	int32_t resid;
3383 
3384 	ASSERT(SEMA_HELD(&bp->b_sem));
3385 	ASSERT((bp->b_flags & B_DONE) == 0);
3386 
3387 	mm = ((mio_slave_t *)bp)->ms_ptr;
3388 
3389 	/*
3390 	 * Propagate error and byte count info from slave struct to
3391 	 * the master struct
3392 	 */
3393 	mutex_enter(&mm->mm_mutex);
3394 	if (bp->b_flags & B_ERROR) {
3395 
3396 		/*
3397 		 * If multiple slave buffers get
3398 		 * error we forget the old errors
3399 		 * this is ok because we any way
3400 		 * cannot return multiple errors
3401 		 */
3402 		mm->mm_error = bp->b_error;
3403 	}
3404 	mm->mm_resid -= bp->b_bcount;
3405 	resid = mm->mm_resid;
3406 	mutex_exit(&mm->mm_mutex);
3407 
3408 	/*
3409 	 * free up the resources allocated to cloned buffers.
3410 	 */
3411 	bp_mapout(bp);
3412 	biofini(bp);
3413 
3414 	if (resid == 0) {
3415 
3416 		/*
3417 		 * This is the last I/O operation
3418 		 * clean up and return the original buffer
3419 		 */
3420 		if (mm->mm_error) {
3421 			mm->mm_bp->b_flags |= B_ERROR;
3422 			mm->mm_bp->b_error = mm->mm_error;
3423 		}
3424 		biodone(mm->mm_bp);
3425 		mutex_destroy(&mm->mm_mutex);
3426 		kmem_free(mm, mm->mm_size);
3427 	}
3428 	return (0);
3429 }
3430