xref: /titanic_50/usr/src/uts/common/fs/hsfs/hsfs_vnops.c (revision d89fccd8788afe1e920f842edd883fe192a1b8fe)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * Vnode operations for the High Sierra filesystem
30  */
31 
32 #include <sys/types.h>
33 #include <sys/t_lock.h>
34 #include <sys/param.h>
35 #include <sys/time.h>
36 #include <sys/systm.h>
37 #include <sys/sysmacros.h>
38 #include <sys/resource.h>
39 #include <sys/signal.h>
40 #include <sys/cred.h>
41 #include <sys/user.h>
42 #include <sys/buf.h>
43 #include <sys/vfs.h>
44 #include <sys/stat.h>
45 #include <sys/vnode.h>
46 #include <sys/mode.h>
47 #include <sys/proc.h>
48 #include <sys/disp.h>
49 #include <sys/file.h>
50 #include <sys/fcntl.h>
51 #include <sys/flock.h>
52 #include <sys/kmem.h>
53 #include <sys/uio.h>
54 #include <sys/conf.h>
55 #include <sys/errno.h>
56 #include <sys/mman.h>
57 #include <sys/pathname.h>
58 #include <sys/debug.h>
59 #include <sys/vmsystm.h>
60 #include <sys/cmn_err.h>
61 #include <sys/fbuf.h>
62 #include <sys/dirent.h>
63 #include <sys/errno.h>
64 
65 #include <vm/hat.h>
66 #include <vm/page.h>
67 #include <vm/pvn.h>
68 #include <vm/as.h>
69 #include <vm/seg.h>
70 #include <vm/seg_map.h>
71 #include <vm/seg_kmem.h>
72 #include <vm/seg_vn.h>
73 #include <vm/rm.h>
74 #include <vm/page.h>
75 #include <sys/swap.h>
76 
77 #include <sys/fs/hsfs_spec.h>
78 #include <sys/fs/hsfs_node.h>
79 #include <sys/fs/hsfs_impl.h>
80 #include <sys/fs/hsfs_susp.h>
81 #include <sys/fs/hsfs_rrip.h>
82 
83 #include <fs/fs_subr.h>
84 
85 /* ARGSUSED */
86 static int
87 hsfs_fsync(vnode_t *cp, int syncflag, cred_t *cred)
88 {
89 	return (0);
90 }
91 
92 
93 /*ARGSUSED*/
94 static int
95 hsfs_read(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cred,
96 	struct caller_context *ct)
97 {
98 	caddr_t base;
99 	offset_t diff;
100 	int error;
101 	struct hsnode *hp;
102 	uint_t filesize;
103 
104 	hp = VTOH(vp);
105 	/*
106 	 * if vp is of type VDIR, make sure dirent
107 	 * is filled up with all info (because of ptbl)
108 	 */
109 	if (vp->v_type == VDIR) {
110 		if (hp->hs_dirent.ext_size == 0)
111 			hs_filldirent(vp, &hp->hs_dirent);
112 	}
113 	filesize = hp->hs_dirent.ext_size;
114 
115 	/* Sanity checks. */
116 	if (uiop->uio_resid == 0 ||		/* No data wanted. */
117 	    uiop->uio_loffset > HS_MAXFILEOFF ||	/* Offset too big. */
118 	    uiop->uio_loffset >= filesize)	/* Past EOF. */
119 		return (0);
120 
121 	do {
122 		/*
123 		 * We want to ask for only the "right" amount of data.
124 		 * In this case that means:-
125 		 *
126 		 * We can't get data from beyond our EOF. If asked,
127 		 * we will give a short read.
128 		 *
129 		 * segmap_getmapflt returns buffers of MAXBSIZE bytes.
130 		 * These buffers are always MAXBSIZE aligned.
131 		 * If our starting offset is not MAXBSIZE aligned,
132 		 * we can only ask for less than MAXBSIZE bytes.
133 		 *
134 		 * If our requested offset and length are such that
135 		 * they belong in different MAXBSIZE aligned slots
136 		 * then we'll be making more than one call on
137 		 * segmap_getmapflt.
138 		 *
139 		 * This diagram shows the variables we use and their
140 		 * relationships.
141 		 *
142 		 * |<-----MAXBSIZE----->|
143 		 * +--------------------------...+
144 		 * |.....mapon->|<--n-->|....*...|EOF
145 		 * +--------------------------...+
146 		 * uio_loffset->|
147 		 * uio_resid....|<---------->|
148 		 * diff.........|<-------------->|
149 		 *
150 		 * So, in this case our offset is not aligned
151 		 * and our request takes us outside of the
152 		 * MAXBSIZE window. We will break this up into
153 		 * two segmap_getmapflt calls.
154 		 */
155 		size_t nbytes;
156 		offset_t mapon;
157 		size_t n;
158 		uint_t flags;
159 
160 		mapon = uiop->uio_loffset & MAXBOFFSET;
161 		diff = filesize - uiop->uio_loffset;
162 		nbytes = (size_t)MIN(MAXBSIZE - mapon, uiop->uio_resid);
163 		n = MIN(diff, nbytes);
164 		if (n <= 0) {
165 			/* EOF or request satisfied. */
166 			return (0);
167 		}
168 
169 		base = segmap_getmapflt(segkmap, vp,
170 		    (u_offset_t)uiop->uio_loffset, n, 1, S_READ);
171 
172 		error = uiomove(base + mapon, n, UIO_READ, uiop);
173 
174 		if (error == 0) {
175 			/*
176 			 * if read a whole block, or read to eof,
177 			 *  won't need this buffer again soon.
178 			 */
179 			if (n + mapon == MAXBSIZE ||
180 			    uiop->uio_loffset == filesize)
181 				flags = SM_DONTNEED;
182 			else
183 				flags = 0;
184 			error = segmap_release(segkmap, base, flags);
185 		} else
186 			(void) segmap_release(segkmap, base, 0);
187 	} while (error == 0 && uiop->uio_resid > 0);
188 
189 	return (error);
190 }
191 
192 /*ARGSUSED2*/
193 static int
194 hsfs_getattr(
195 	struct vnode *vp,
196 	struct vattr *vap,
197 	int flags,
198 	struct cred *cred)
199 {
200 	struct hsnode *hp;
201 	struct vfs *vfsp;
202 	struct hsfs *fsp;
203 
204 	hp = VTOH(vp);
205 	fsp = VFS_TO_HSFS(vp->v_vfsp);
206 	vfsp = vp->v_vfsp;
207 
208 	if ((hp->hs_dirent.ext_size == 0) && (vp->v_type == VDIR)) {
209 		hs_filldirent(vp, &hp->hs_dirent);
210 	}
211 	vap->va_type = IFTOVT(hp->hs_dirent.mode);
212 	vap->va_mode = hp->hs_dirent.mode;
213 	vap->va_uid = hp->hs_dirent.uid;
214 	vap->va_gid = hp->hs_dirent.gid;
215 
216 	vap->va_fsid = vfsp->vfs_dev;
217 	vap->va_nodeid = (ino64_t)hp->hs_nodeid;
218 	vap->va_nlink = hp->hs_dirent.nlink;
219 	vap->va_size =	(offset_t)hp->hs_dirent.ext_size;
220 
221 	vap->va_atime.tv_sec = hp->hs_dirent.adate.tv_sec;
222 	vap->va_atime.tv_nsec = hp->hs_dirent.adate.tv_usec*1000;
223 	vap->va_mtime.tv_sec = hp->hs_dirent.mdate.tv_sec;
224 	vap->va_mtime.tv_nsec = hp->hs_dirent.mdate.tv_usec*1000;
225 	vap->va_ctime.tv_sec = hp->hs_dirent.cdate.tv_sec;
226 	vap->va_ctime.tv_nsec = hp->hs_dirent.cdate.tv_usec*1000;
227 	if (vp->v_type == VCHR || vp->v_type == VBLK)
228 		vap->va_rdev = hp->hs_dirent.r_dev;
229 	else
230 		vap->va_rdev = 0;
231 	vap->va_blksize = vfsp->vfs_bsize;
232 	/* no. of blocks = no. of data blocks + no. of xar blocks */
233 	vap->va_nblocks = (fsblkcnt64_t)howmany(vap->va_size + (u_longlong_t)
234 	    (hp->hs_dirent.xar_len << fsp->hsfs_vol.lbn_shift), DEV_BSIZE);
235 	vap->va_seq = hp->hs_seq;
236 	return (0);
237 }
238 
239 /*ARGSUSED*/
240 static int
241 hsfs_readlink(struct vnode *vp, struct uio *uiop, struct cred *cred)
242 {
243 	struct hsnode *hp;
244 
245 	if (vp->v_type != VLNK)
246 		return (EINVAL);
247 
248 	hp = VTOH(vp);
249 
250 	if (hp->hs_dirent.sym_link == (char *)NULL)
251 		return (ENOENT);
252 
253 	return (uiomove(hp->hs_dirent.sym_link,
254 	    (size_t)MIN(hp->hs_dirent.ext_size,
255 	    uiop->uio_resid), UIO_READ, uiop));
256 }
257 
258 /*ARGSUSED*/
259 static void
260 hsfs_inactive(struct vnode *vp, struct cred *cred)
261 {
262 	struct hsnode *hp;
263 	struct hsfs *fsp;
264 
265 	int nopage;
266 
267 	hp = VTOH(vp);
268 	fsp = VFS_TO_HSFS(vp->v_vfsp);
269 	/*
270 	 * Note: acquiring and holding v_lock for quite a while
271 	 * here serializes on the vnode; this is unfortunate, but
272 	 * likely not to overly impact performance, as the underlying
273 	 * device (CDROM drive) is quite slow.
274 	 */
275 	rw_enter(&fsp->hsfs_hash_lock, RW_WRITER);
276 	mutex_enter(&hp->hs_contents_lock);
277 	mutex_enter(&vp->v_lock);
278 
279 	if (vp->v_count < 1) {
280 		panic("hsfs_inactive: v_count < 1");
281 		/*NOTREACHED*/
282 	}
283 
284 	if (vp->v_count > 1 || (hp->hs_flags & HREF) == 0) {
285 		vp->v_count--;	/* release hold from vn_rele */
286 		mutex_exit(&vp->v_lock);
287 		mutex_exit(&hp->hs_contents_lock);
288 		rw_exit(&fsp->hsfs_hash_lock);
289 		return;
290 	}
291 	vp->v_count--;	/* release hold from vn_rele */
292 	if (vp->v_count == 0) {
293 		/*
294 		 * Free the hsnode.
295 		 * If there are no pages associated with the
296 		 * hsnode, give it back to the kmem_cache,
297 		 * else put at the end of this file system's
298 		 * internal free list.
299 		 */
300 		nopage = !vn_has_cached_data(vp);
301 		hp->hs_flags = 0;
302 		/*
303 		 * exit these locks now, since hs_freenode may
304 		 * kmem_free the hsnode and embedded vnode
305 		 */
306 		mutex_exit(&vp->v_lock);
307 		mutex_exit(&hp->hs_contents_lock);
308 		hs_freenode(vp, fsp, nopage);
309 	} else {
310 		mutex_exit(&vp->v_lock);
311 		mutex_exit(&hp->hs_contents_lock);
312 	}
313 	rw_exit(&fsp->hsfs_hash_lock);
314 }
315 
316 
317 /*ARGSUSED*/
318 static int
319 hsfs_lookup(
320 	struct vnode *dvp,
321 	char *nm,
322 	struct vnode **vpp,
323 	struct pathname *pnp,
324 	int flags,
325 	struct vnode *rdir,
326 	struct cred *cred)
327 {
328 	int error;
329 	int namelen = (int)strlen(nm);
330 
331 	if (*nm == '\0') {
332 		VN_HOLD(dvp);
333 		*vpp = dvp;
334 		return (0);
335 	}
336 
337 	/*
338 	 * If we're looking for ourself, life is simple.
339 	 */
340 	if (namelen == 1 && *nm == '.') {
341 		if (error = hs_access(dvp, (mode_t)VEXEC, cred))
342 			return (error);
343 		VN_HOLD(dvp);
344 		*vpp = dvp;
345 		return (0);
346 	}
347 
348 	return (hs_dirlook(dvp, nm, namelen, vpp, cred));
349 }
350 
351 
352 /*ARGSUSED*/
353 static int
354 hsfs_readdir(
355 	struct vnode	*vp,
356 	struct uio	*uiop,
357 	struct cred	*cred,
358 	int		*eofp)
359 {
360 	struct hsnode	*dhp;
361 	struct hsfs	*fsp;
362 	struct hs_direntry hd;
363 	struct dirent64	*nd;
364 	int		error;
365 	uint_t		offset;		/* real offset in directory */
366 	uint_t		dirsiz;		/* real size of directory */
367 	uchar_t		*blkp;
368 	int		hdlen;		/* length of hs directory entry */
369 	long		ndlen;		/* length of dirent entry */
370 	int		bytes_wanted;
371 	size_t		bufsize;	/* size of dirent buffer */
372 	char		*outbuf;	/* ptr to dirent buffer */
373 	char		*dname;
374 	int		dnamelen;
375 	size_t		dname_size;
376 	struct fbuf	*fbp;
377 	uint_t		last_offset;	/* last index into current dir block */
378 	ulong_t		dir_lbn;	/* lbn of directory */
379 	ino64_t		dirino;	/* temporary storage before storing in dirent */
380 	off_t		diroff;
381 
382 	dhp = VTOH(vp);
383 	fsp = VFS_TO_HSFS(vp->v_vfsp);
384 	if (dhp->hs_dirent.ext_size == 0)
385 		hs_filldirent(vp, &dhp->hs_dirent);
386 	dirsiz = dhp->hs_dirent.ext_size;
387 	dir_lbn = dhp->hs_dirent.ext_lbn;
388 	if (uiop->uio_loffset >= dirsiz) {	/* at or beyond EOF */
389 		if (eofp)
390 			*eofp = 1;
391 		return (0);
392 	}
393 	ASSERT(uiop->uio_loffset <= HS_MAXFILEOFF);
394 	offset = uiop->uio_loffset;
395 
396 	dname_size = fsp->hsfs_namemax + 1;	/* 1 for the ending NUL */
397 	dname = kmem_alloc(dname_size, KM_SLEEP);
398 	bufsize = uiop->uio_resid + sizeof (struct dirent64);
399 
400 	outbuf = kmem_alloc(bufsize, KM_SLEEP);
401 	nd = (struct dirent64 *)outbuf;
402 
403 	while (offset < dirsiz) {
404 		bytes_wanted = MIN(MAXBSIZE, dirsiz - (offset & MAXBMASK));
405 
406 		error = fbread(vp, (offset_t)(offset & MAXBMASK),
407 			(unsigned int)bytes_wanted, S_READ, &fbp);
408 		if (error)
409 			goto done;
410 
411 		blkp = (uchar_t *)fbp->fb_addr;
412 		last_offset = (offset & MAXBMASK) + fbp->fb_count;
413 
414 #define	rel_offset(offset) ((offset) & MAXBOFFSET)	/* index into blkp */
415 
416 		while (offset < last_offset) {
417 			/*
418 			 * Very similar validation code is found in
419 			 * process_dirblock(), hsfs_node.c.
420 			 * For an explanation, see there.
421 			 * It may make sense for the future to
422 			 * "consolidate" the code in hs_parsedir(),
423 			 * process_dirblock() and hsfs_readdir() into
424 			 * a single utility function.
425 			 */
426 			hdlen = (int)((uchar_t)
427 				HDE_DIR_LEN(&blkp[rel_offset(offset)]));
428 			if (hdlen < HDE_ROOT_DIR_REC_SIZE ||
429 			    offset + hdlen > last_offset) {
430 				/*
431 				 * advance to next sector boundary
432 				 */
433 				offset = roundup(offset + 1, HS_SECTOR_SIZE);
434 				if (hdlen)
435 					hs_log_bogus_disk_warning(fsp,
436 					    HSFS_ERR_TRAILING_JUNK, 0);
437 
438 				continue;
439 			}
440 
441 			bzero(&hd, sizeof (hd));
442 
443 			/*
444 			 * Just ignore invalid directory entries.
445 			 * XXX - maybe hs_parsedir() will detect EXISTENCE bit
446 			 */
447 			if (!hs_parsedir(fsp, &blkp[rel_offset(offset)],
448 				&hd, dname, &dnamelen)) {
449 				/*
450 				 * Determine if there is enough room
451 				 */
452 				ndlen = (long)DIRENT64_RECLEN((dnamelen));
453 
454 				if ((ndlen + ((char *)nd - outbuf)) >
455 				    uiop->uio_resid) {
456 					fbrelse(fbp, S_READ);
457 					goto done; /* output buffer full */
458 				}
459 
460 				diroff = offset + hdlen;
461 				/*
462 				 * Generate nodeid.
463 				 * If a directory, nodeid points to the
464 				 * canonical dirent describing the directory:
465 				 * the dirent of the "." entry for the
466 				 * directory, which is pointed to by all
467 				 * dirents for that directory.
468 				 * Otherwise, nodeid points to dirent of file.
469 				 */
470 				if (hd.type == VDIR) {
471 					dirino = (ino64_t)
472 					    MAKE_NODEID(hd.ext_lbn, 0,
473 					    vp->v_vfsp);
474 				} else {
475 					struct hs_volume *hvp;
476 					offset_t lbn, off;
477 
478 					/*
479 					 * Normalize lbn and off
480 					 */
481 					hvp = &fsp->hsfs_vol;
482 					lbn = dir_lbn +
483 					    (offset >> hvp->lbn_shift);
484 					off = offset & hvp->lbn_maxoffset;
485 					dirino = (ino64_t)MAKE_NODEID(lbn,
486 					    off, vp->v_vfsp);
487 				}
488 
489 
490 				/* strncpy(9f) will zero uninitialized bytes */
491 
492 				ASSERT(strlen(dname) + 1 <=
493 				    DIRENT64_NAMELEN(ndlen));
494 				(void) strncpy(nd->d_name, dname,
495 				    DIRENT64_NAMELEN(ndlen));
496 				nd->d_reclen = (ushort_t)ndlen;
497 				nd->d_off = (offset_t)diroff;
498 				nd->d_ino = dirino;
499 				nd = (struct dirent64 *)((char *)nd + ndlen);
500 
501 				/*
502 				 * free up space allocated for symlink
503 				 */
504 				if (hd.sym_link != (char *)NULL) {
505 					kmem_free(hd.sym_link,
506 					    (size_t)(hd.ext_size+1));
507 					hd.sym_link = (char *)NULL;
508 				}
509 			}
510 			offset += hdlen;
511 		}
512 		fbrelse(fbp, S_READ);
513 	}
514 
515 	/*
516 	 * Got here for one of the following reasons:
517 	 *	1) outbuf is full (error == 0)
518 	 *	2) end of directory reached (error == 0)
519 	 *	3) error reading directory sector (error != 0)
520 	 *	4) directory entry crosses sector boundary (error == 0)
521 	 *
522 	 * If any directory entries have been copied, don't report
523 	 * case 4.  Instead, return the valid directory entries.
524 	 *
525 	 * If no entries have been copied, report the error.
526 	 * If case 4, this will be indistiguishable from EOF.
527 	 */
528 done:
529 	ndlen = ((char *)nd - outbuf);
530 	if (ndlen != 0) {
531 		error = uiomove(outbuf, (size_t)ndlen, UIO_READ, uiop);
532 		uiop->uio_loffset = offset;
533 	}
534 	kmem_free(dname, dname_size);
535 	kmem_free(outbuf, bufsize);
536 	if (eofp && error == 0)
537 		*eofp = (uiop->uio_loffset >= dirsiz);
538 	return (error);
539 }
540 
541 static int
542 hsfs_fid(struct vnode *vp, struct fid *fidp)
543 {
544 	struct hsnode *hp;
545 	struct hsfid *fid;
546 
547 	if (fidp->fid_len < (sizeof (*fid) - sizeof (fid->hf_len))) {
548 		fidp->fid_len = sizeof (*fid) - sizeof (fid->hf_len);
549 		return (ENOSPC);
550 	}
551 
552 	fid = (struct hsfid *)fidp;
553 	fid->hf_len = sizeof (*fid) - sizeof (fid->hf_len);
554 	hp = VTOH(vp);
555 	mutex_enter(&hp->hs_contents_lock);
556 	fid->hf_dir_lbn = hp->hs_dir_lbn;
557 	fid->hf_dir_off = (ushort_t)hp->hs_dir_off;
558 	mutex_exit(&hp->hs_contents_lock);
559 	return (0);
560 }
561 
/*
 * open vnode operation.
 *
 * Nothing needs to be set up at open time; the arguments are not
 * examined and success is always reported.
 */
/*ARGSUSED*/
static int
hsfs_open(struct vnode **vpp, int flag, struct cred *cred)
{
	/* no per-open state to establish */
	return (0);
}
568 
569 /*ARGSUSED*/
570 static int
571 hsfs_close(
572 	struct vnode *vp,
573 	int flag,
574 	int count,
575 	offset_t offset,
576 	struct cred *cred)
577 {
578 	(void) cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
579 	cleanshares(vp, ttoproc(curthread)->p_pid);
580 	return (0);
581 }
582 
583 /*ARGSUSED2*/
584 static int
585 hsfs_access(struct vnode *vp, int mode, int flags, cred_t *cred)
586 {
587 	return (hs_access(vp, (mode_t)mode, cred));
588 }
589 
/*
 * The seek time of a CD-ROM is very long, and the data transfer
 * rate is even worse (max. 150K per sec).  The design decision
 * is to reduce access to the CD-ROM as much as possible, and to
 * transfer a sizable block (read-ahead) of data at a time.
 * UFS-style read-ahead of one block at a time is not appropriate
 * and is not supported.
 */
598 
/*
 * Cluster size constant, in bytes (56K).
 * Constraint: KLUSTSIZE must be a multiple of PAGESIZE and must not
 * exceed MAXPHYS.
 */
#define	KLUSTSIZE	(56 * 1024)

/* we don't support read ahead */

/*
 * Statistic: number of times hsfs_getapage() found that a page
 * reported present by page_exists() was gone again by the time
 * page_lookup() was called, forcing a re-read.
 */
int hsfs_lostpage;
605 
606 /*
607  * Used to prevent biodone() from releasing buf resources that
608  * we didn't allocate in quite the usual way.
609  */
610 /*ARGSUSED*/
611 int
612 hsfs_iodone(struct buf *bp)
613 {
614 	sema_v(&bp->b_io);
615 	return (0);
616 }
617 
618 /*
619  * Each file may have a different interleaving on disk.  This makes
620  * things somewhat interesting.  The gist is that there are some
621  * number of contiguous data sectors, followed by some other number
622  * of contiguous skip sectors.  The sum of those two sets of sectors
623  * defines the interleave size.  Unfortunately, it means that we generally
624  * can't simply read N sectors starting at a given offset to satisfy
625  * any given request.
626  *
627  * What we do is get the relevant memory pages via pvn_read_kluster(),
628  * then stride through the interleaves, setting up a buf for each
629  * sector that needs to be brought in.  Instead of kmem_alloc'ing
630  * space for the sectors, though, we just point at the appropriate
631  * spot in the relevant page for each of them.  This saves us a bunch
632  * of copying.
633  */
634 /*ARGSUSED*/
635 static int
636 hsfs_getapage(
637 	struct vnode *vp,
638 	u_offset_t off,
639 	size_t len,
640 	uint_t *protp,
641 	struct page *pl[],
642 	size_t plsz,
643 	struct seg *seg,
644 	caddr_t addr,
645 	enum seg_rw rw,
646 	struct cred *cred)
647 {
648 	struct hsnode *hp;
649 	struct hsfs *fsp;
650 	int	err;
651 	struct buf *bufs;
652 	caddr_t *vas;
653 	caddr_t va;
654 	struct page *pp, *searchp, *lastp;
655 	page_t	*pagefound;
656 	offset_t	bof;
657 	struct vnode *devvp;
658 	ulong_t	byte_offset;
659 	size_t	io_len_tmp;
660 	uint_t	io_off, io_len;
661 	uint_t	xlen;
662 	uint_t	filsiz;
663 	uint_t	secsize;
664 	uint_t	bufcnt;
665 	uint_t	bufsused;
666 	uint_t	count;
667 	uint_t	io_end;
668 	uint_t	which_chunk_lbn;
669 	uint_t	offset_lbn;
670 	uint_t	offset_extra;
671 	offset_t	offset_bytes;
672 	uint_t	remaining_bytes;
673 	uint_t	extension;
674 	int	remainder;	/* must be signed */
675 	int	chunk_lbn_count;
676 	int	chunk_data_bytes;
677 	int	xarsiz;
678 	diskaddr_t driver_block;
679 	u_offset_t io_off_tmp;
680 
681 	/*
682 	 * We don't support asynchronous operation at the moment, so
683 	 * just pretend we did it.  If the pages are ever actually
684 	 * needed, they'll get brought in then.
685 	 */
686 	if (pl == NULL)
687 		return (0);
688 
689 	hp = VTOH(vp);
690 	fsp = VFS_TO_HSFS(vp->v_vfsp);
691 	devvp = fsp->hsfs_devvp;
692 	secsize = fsp->hsfs_vol.lbn_size;  /* bytes per logical block */
693 
694 	/* file data size */
695 	filsiz = hp->hs_dirent.ext_size;
696 
697 	/* disk addr for start of file */
698 	bof = LBN_TO_BYTE((offset_t)hp->hs_dirent.ext_lbn, vp->v_vfsp);
699 
700 	/* xarsiz byte must be skipped for data */
701 	xarsiz = hp->hs_dirent.xar_len << fsp->hsfs_vol.lbn_shift;
702 
703 	/* how many logical blocks in an interleave (data+skip) */
704 	chunk_lbn_count = hp->hs_dirent.intlf_sz + hp->hs_dirent.intlf_sk;
705 
706 	if (chunk_lbn_count == 0) {
707 		chunk_lbn_count = 1;
708 	}
709 
710 	/*
711 	 * Convert interleaving size into bytes.  The zero case
712 	 * (no interleaving) optimization is handled as a side-
713 	 * effect of the read-ahead logic.
714 	 */
715 	if (hp->hs_dirent.intlf_sz == 0) {
716 		chunk_data_bytes = LBN_TO_BYTE(1, vp->v_vfsp);
717 	} else {
718 		chunk_data_bytes = LBN_TO_BYTE(hp->hs_dirent.intlf_sz,
719 			vp->v_vfsp);
720 	}
721 
722 reread:
723 	err = 0;
724 	pagefound = 0;
725 
726 	/*
727 	 * Do some read-ahead.  This mostly saves us a bit of
728 	 * system cpu time more than anything else when doing
729 	 * sequential reads.  At some point, could do the
730 	 * read-ahead asynchronously which might gain us something
731 	 * on wall time, but it seems unlikely....
732 	 *
733 	 * We do the easy case here, which is to read through
734 	 * the end of the chunk, minus whatever's at the end that
735 	 * won't exactly fill a page.
736 	 */
737 	which_chunk_lbn = (off + len) / chunk_data_bytes;
738 	extension = ((which_chunk_lbn + 1) * chunk_data_bytes) - off;
739 	extension -= (extension % PAGESIZE);
740 	if (extension != 0 && extension < filsiz - off) {
741 		len = extension;
742 	} else {
743 		len = PAGESIZE;
744 	}
745 	/*
746 	 * Some cd writers don't write sectors that aren't used.  Also,
747 	 * there's no point in reading sectors we'll never look at.  So,
748 	 * if we're asked to go beyond the end of a file, truncate to the
749 	 * length of that file.
750 	 *
751 	 * Additionally, this behaviour is required by section 6.4.5 of
752 	 * ISO 9660:1988(E).
753 	 */
754 	if (len > (filsiz - off)) {
755 		len = filsiz - off;
756 	}
757 
758 	/* A little paranoia. */
759 	ASSERT(len > 0);
760 
761 	/*
762 	 * After all that, make sure we're asking for things in units
763 	 * that bdev_strategy() will understand (see bug 4202551).
764 	 */
765 	len = roundup(len, DEV_BSIZE);
766 
767 	pp = NULL;
768 again:
769 	/* search for page in buffer */
770 	if ((pagefound = page_exists(vp, off)) == 0) {
771 		/*
772 		 * Need to really do disk IO to get the page.
773 		 */
774 		pp = pvn_read_kluster(vp, off, seg, addr, &io_off_tmp,
775 		    &io_len_tmp, off, len, 0);
776 
777 		if (pp == NULL)
778 			goto again;
779 
780 		io_off = (uint_t)io_off_tmp;
781 		io_len = (uint_t)io_len_tmp;
782 
783 		/* check for truncation */
784 		/*
785 		 * xxx Clean up and return EIO instead?
786 		 * xxx Ought to go to u_offset_t for everything, but we
787 		 * xxx call lots of things that want uint_t arguments.
788 		 */
789 		ASSERT(io_off == io_off_tmp);
790 
791 		/*
792 		 * get enough buffers for worst-case scenario
793 		 * (i.e., no coalescing possible).
794 		 */
795 		bufcnt = (len + secsize - 1) / secsize;
796 		bufs = kmem_zalloc(bufcnt * sizeof (struct buf), KM_SLEEP);
797 		vas = kmem_alloc(bufcnt * sizeof (caddr_t), KM_SLEEP);
798 		for (count = 0; count < bufcnt; count++) {
799 			bufs[count].b_edev = devvp->v_rdev;
800 			bufs[count].b_dev = cmpdev(devvp->v_rdev);
801 			bufs[count].b_flags = B_NOCACHE|B_BUSY|B_READ;
802 			bufs[count].b_iodone = hsfs_iodone;
803 			bufs[count].b_vp = vp;
804 			bufs[count].b_file = vp;
805 			sema_init(&bufs[count].b_io, 0, NULL,
806 			    SEMA_DEFAULT, NULL);
807 			sema_init(&bufs[count].b_sem, 0, NULL,
808 			    SEMA_DEFAULT, NULL);
809 		}
810 
811 		/*
812 		 * If our filesize is not an integer multiple of PAGESIZE,
813 		 * we zero that part of the last page that's between EOF and
814 		 * the PAGESIZE boundary.
815 		 */
816 		xlen = io_len & PAGEOFFSET;
817 		if (xlen != 0)
818 			pagezero(pp->p_prev, xlen, PAGESIZE - xlen);
819 
820 		va = NULL;
821 		lastp = NULL;
822 		searchp = pp;
823 		io_end = io_off + io_len;
824 		for (count = 0, byte_offset = io_off;
825 			byte_offset < io_end;
826 			count++) {
827 			ASSERT(count < bufcnt);
828 
829 			/* Compute disk address for interleaving. */
830 
831 			/* considered without skips */
832 			which_chunk_lbn = byte_offset / chunk_data_bytes;
833 
834 			/* factor in skips */
835 			offset_lbn = which_chunk_lbn * chunk_lbn_count;
836 
837 			/* convert to physical byte offset for lbn */
838 			offset_bytes = LBN_TO_BYTE(offset_lbn, vp->v_vfsp);
839 
840 			/* don't forget offset into lbn */
841 			offset_extra = byte_offset % chunk_data_bytes;
842 
843 			/* get virtual block number for driver */
844 			driver_block = lbtodb(bof + xarsiz
845 				+ offset_bytes + offset_extra);
846 
847 			if (lastp != searchp) {
848 				/* this branch taken first time through loop */
849 				va = vas[count]
850 					= ppmapin(searchp, PROT_WRITE,
851 						(caddr_t)-1);
852 				/* ppmapin() guarantees not to return NULL */
853 			} else {
854 				vas[count] = NULL;
855 			}
856 
857 			bufs[count].b_un.b_addr = va + byte_offset % PAGESIZE;
858 			bufs[count].b_offset =
859 			    (offset_t)(byte_offset - io_off + off);
860 
861 			/*
862 			 * We specifically use the b_lblkno member here
863 			 * as even in the 32 bit world driver_block can
864 			 * get very large in line with the ISO9660 spec.
865 			 */
866 
867 			bufs[count].b_lblkno = driver_block;
868 
869 			remaining_bytes = ((which_chunk_lbn + 1)
870 				* chunk_data_bytes)
871 				- byte_offset;
872 
873 			/*
874 			 * remaining_bytes can't be zero, as we derived
875 			 * which_chunk_lbn directly from byte_offset.
876 			 */
877 			if ((remaining_bytes + byte_offset) < (off + len)) {
878 				/* coalesce-read the rest of the chunk */
879 				bufs[count].b_bcount = remaining_bytes;
880 			} else {
881 				/* get the final bits */
882 				bufs[count].b_bcount = off + len - byte_offset;
883 			}
884 
885 			/*
886 			 * It would be nice to do multiple pages'
887 			 * worth at once here when the opportunity
888 			 * arises, as that has been shown to improve
889 			 * our wall time.  However, to do that
890 			 * requires that we use the pageio subsystem,
891 			 * which doesn't mix well with what we're
892 			 * already using here.  We can't use pageio
893 			 * all the time, because that subsystem
894 			 * assumes that a page is stored in N
895 			 * contiguous blocks on the device.
896 			 * Interleaving violates that assumption.
897 			 */
898 
899 			remainder = PAGESIZE - (byte_offset % PAGESIZE);
900 			if (bufs[count].b_bcount > remainder) {
901 				bufs[count].b_bcount = remainder;
902 			}
903 
904 			bufs[count].b_bufsize = bufs[count].b_bcount;
905 			if (((offset_t)byte_offset + bufs[count].b_bcount) >
906 				HS_MAXFILEOFF) {
907 				break;
908 			}
909 			byte_offset += bufs[count].b_bcount;
910 
911 			(void) bdev_strategy(&bufs[count]);
912 
913 			lwp_stat_update(LWP_STAT_INBLK, 1);
914 			lastp = searchp;
915 			if ((remainder - bufs[count].b_bcount) < 1) {
916 				searchp = searchp->p_next;
917 			}
918 		}
919 
920 		bufsused = count;
921 		/* Now wait for everything to come in */
922 		for (count = 0; count < bufsused; count++) {
923 			if (err == 0) {
924 				err = biowait(&bufs[count]);
925 			} else
926 				(void) biowait(&bufs[count]);
927 		}
928 
929 		/* Don't leak resources */
930 		for (count = 0; count < bufcnt; count++) {
931 			sema_destroy(&bufs[count].b_io);
932 			sema_destroy(&bufs[count].b_sem);
933 			if (count < bufsused && vas[count] != NULL) {
934 				ppmapout(vas[count]);
935 			}
936 		}
937 
938 		kmem_free(vas, bufcnt * sizeof (caddr_t));
939 		kmem_free(bufs, bufcnt * sizeof (struct buf));
940 	}
941 
942 	if (err) {
943 		pvn_read_done(pp, B_ERROR);
944 		return (err);
945 	}
946 
947 	/*
948 	 * Lock the requested page, and the one after it if possible.
949 	 * Don't bother if our caller hasn't given us a place to stash
950 	 * the page pointers, since otherwise we'd lock pages that would
951 	 * never get unlocked.
952 	 */
953 	if (pagefound) {
954 		int index;
955 		ulong_t soff;
956 
957 		/*
958 		 * Make sure it's in memory before we say it's here.
959 		 */
960 		if ((pp = page_lookup(vp, off, SE_SHARED)) == NULL) {
961 			hsfs_lostpage++;
962 			goto reread;
963 		}
964 
965 		pl[0] = pp;
966 		index = 1;
967 
968 		/*
969 		 * Try to lock the next page, if it exists, without
970 		 * blocking.
971 		 */
972 		plsz -= PAGESIZE;
973 		/* LINTED (plsz is unsigned) */
974 		for (soff = off + PAGESIZE; plsz > 0;
975 		    soff += PAGESIZE, plsz -= PAGESIZE) {
976 			pp = page_lookup_nowait(vp, (u_offset_t)soff,
977 					SE_SHARED);
978 			if (pp == NULL)
979 				break;
980 			pl[index++] = pp;
981 		}
982 		pl[index] = NULL;
983 		return (0);
984 	}
985 
986 	if (pp != NULL) {
987 		pvn_plist_init(pp, pl, plsz, off, io_len, rw);
988 	}
989 
990 	return (err);
991 }
992 
993 static int
994 hsfs_getpage(
995 	struct vnode *vp,
996 	offset_t off,
997 	size_t len,
998 	uint_t *protp,
999 	struct page *pl[],
1000 	size_t plsz,
1001 	struct seg *seg,
1002 	caddr_t addr,
1003 	enum seg_rw rw,
1004 	struct cred *cred)
1005 {
1006 	int err;
1007 	uint_t filsiz;
1008 	struct hsnode *hp = VTOH(vp);
1009 
1010 	/* does not support write */
1011 	if (rw == S_WRITE) {
1012 		panic("write attempt on READ ONLY HSFS");
1013 		/*NOTREACHED*/
1014 	}
1015 
1016 	if (vp->v_flag & VNOMAP) {
1017 		return (ENOSYS);
1018 	}
1019 
1020 	ASSERT(off <= HS_MAXFILEOFF);
1021 
1022 	/*
1023 	 * Determine file data size for EOF check.
1024 	 */
1025 	filsiz = hp->hs_dirent.ext_size;
1026 	if ((off + len) > (offset_t)(filsiz + PAGEOFFSET) && seg != segkmap)
1027 		return (EFAULT);	/* beyond EOF */
1028 
1029 	if (protp != NULL)
1030 		*protp = PROT_ALL;
1031 
1032 	if (len <= PAGESIZE)
1033 		err = hsfs_getapage(vp, (u_offset_t)off, len, protp, pl, plsz,
1034 		    seg, addr, rw, cred);
1035 	else
1036 		err = pvn_getpages(hsfs_getapage, vp, off, len, protp,
1037 		    pl, plsz, seg, addr, rw, cred);
1038 
1039 	return (err);
1040 }
1041 
1042 
1043 
1044 /*
1045  * This function should never be called. We need to have it to pass
1046  * it as an argument to other functions.
1047  */
1048 /*ARGSUSED*/
1049 int
1050 hsfs_putapage(
1051 	vnode_t		*vp,
1052 	page_t		*pp,
1053 	u_offset_t	*offp,
1054 	size_t		*lenp,
1055 	int		flags,
1056 	cred_t		*cr)
1057 {
1058 	/* should never happen - just destroy it */
1059 	cmn_err(CE_NOTE, "hsfs_putapage: dirty HSFS page");
1060 	pvn_write_done(pp, B_ERROR | B_WRITE | B_INVAL | B_FORCE | flags);
1061 	return (0);
1062 }
1063 
1064 
1065 /*
1066  * The only flags we support are B_INVAL, B_FREE and B_DONTNEED.
1067  * B_INVAL is set by:
1068  *
1069  *	1) the MC_SYNC command of memcntl(2) to support the MS_INVALIDATE flag.
1070  *	2) the MC_ADVISE command of memcntl(2) with the MADV_DONTNEED advice
1071  *	   which translates to an MC_SYNC with the MS_INVALIDATE flag.
1072  *
1073  * The B_FREE (as well as the B_DONTNEED) flag is set when the
1074  * MADV_SEQUENTIAL advice has been used. VOP_PUTPAGE is invoked
1075  * from SEGVN to release pages behind a pagefault.
1076  */
1077 /*ARGSUSED*/
1078 static int
1079 hsfs_putpage(
1080 	struct vnode	*vp,
1081 	offset_t	off,
1082 	size_t		len,
1083 	int		flags,
1084 	struct cred	*cr)
1085 {
1086 	int error = 0;
1087 
1088 	if (vp->v_count == 0) {
1089 		panic("hsfs_putpage: bad v_count");
1090 		/*NOTREACHED*/
1091 	}
1092 
1093 	if (vp->v_flag & VNOMAP)
1094 		return (ENOSYS);
1095 
1096 	ASSERT(off <= HS_MAXFILEOFF);
1097 
1098 	if (!vn_has_cached_data(vp))	/* no pages mapped */
1099 		return (0);
1100 
1101 	if (len == 0)		/* from 'off' to EOF */
1102 		error = pvn_vplist_dirty(vp, off,
1103 					hsfs_putapage, flags, cr);
1104 	else {
1105 		offset_t end_off = off + len;
1106 		offset_t file_size = VTOH(vp)->hs_dirent.ext_size;
1107 		offset_t io_off;
1108 
1109 		file_size = (file_size + PAGESIZE - 1) & PAGEMASK;
1110 		if (end_off > file_size)
1111 			end_off = file_size;
1112 
1113 		for (io_off = off; io_off < end_off; io_off += PAGESIZE) {
1114 			page_t *pp;
1115 
1116 			/*
1117 			 * We insist on getting the page only if we are
1118 			 * about to invalidate, free or write it and
1119 			 * the B_ASYNC flag is not set.
1120 			 */
1121 			if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
1122 				pp = page_lookup(vp, io_off,
1123 					(flags & (B_INVAL | B_FREE)) ?
1124 					    SE_EXCL : SE_SHARED);
1125 			} else {
1126 				pp = page_lookup_nowait(vp, io_off,
1127 					(flags & B_FREE) ? SE_EXCL : SE_SHARED);
1128 			}
1129 
1130 			if (pp == NULL)
1131 				continue;
1132 			/*
1133 			 * Normally pvn_getdirty() should return 0, which
1134 			 * impies that it has done the job for us.
1135 			 * The shouldn't-happen scenario is when it returns 1.
1136 			 * This means that the page has been modified and
1137 			 * needs to be put back.
1138 			 * Since we can't write on a CD, we fake a failed
1139 			 * I/O and force pvn_write_done() to destroy the page.
1140 			 */
1141 			if (pvn_getdirty(pp, flags) == 1) {
1142 				cmn_err(CE_NOTE,
1143 					"hsfs_putpage: dirty HSFS page");
1144 				pvn_write_done(pp, flags |
1145 				    B_ERROR | B_WRITE | B_INVAL | B_FORCE);
1146 			}
1147 		}
1148 	}
1149 	return (error);
1150 }
1151 
1152 
1153 /*ARGSUSED*/
1154 static int
1155 hsfs_map(
1156 	struct vnode *vp,
1157 	offset_t off,
1158 	struct as *as,
1159 	caddr_t *addrp,
1160 	size_t len,
1161 	uchar_t prot,
1162 	uchar_t maxprot,
1163 	uint_t flags,
1164 	struct cred *cred)
1165 {
1166 	struct segvn_crargs vn_a;
1167 	int error;
1168 
1169 	/* VFS_RECORD(vp->v_vfsp, VS_MAP, VS_CALL); */
1170 
1171 	if (vp->v_flag & VNOMAP)
1172 		return (ENOSYS);
1173 
1174 	if (off > HS_MAXFILEOFF || off < 0 ||
1175 	    (off + len) < 0 || (off + len) > HS_MAXFILEOFF)
1176 		return (ENXIO);
1177 
1178 	if (vp->v_type != VREG) {
1179 		return (ENODEV);
1180 	}
1181 
1182 	/*
1183 	 * If file is being locked, disallow mapping.
1184 	 */
1185 	if (vn_has_mandatory_locks(vp, VTOH(vp)->hs_dirent.mode))
1186 		return (EAGAIN);
1187 
1188 	as_rangelock(as);
1189 
1190 	if ((flags & MAP_FIXED) == 0) {
1191 		map_addr(addrp, len, off, 1, flags);
1192 		if (*addrp == NULL) {
1193 			as_rangeunlock(as);
1194 			return (ENOMEM);
1195 		}
1196 	} else {
1197 		/*
1198 		 * User specified address - blow away any previous mappings
1199 		 */
1200 		(void) as_unmap(as, *addrp, len);
1201 	}
1202 
1203 	vn_a.vp = vp;
1204 	vn_a.offset = off;
1205 	vn_a.type = flags & MAP_TYPE;
1206 	vn_a.prot = prot;
1207 	vn_a.maxprot = maxprot;
1208 	vn_a.flags = flags & ~MAP_TYPE;
1209 	vn_a.cred = cred;
1210 	vn_a.amp = NULL;
1211 	vn_a.szc = 0;
1212 	vn_a.lgrp_mem_policy_flags = 0;
1213 
1214 	error = as_map(as, *addrp, len, segvn_create, &vn_a);
1215 	as_rangeunlock(as);
1216 	return (error);
1217 }
1218 
1219 /* ARGSUSED */
1220 static int
1221 hsfs_addmap(
1222 	struct vnode *vp,
1223 	offset_t off,
1224 	struct as *as,
1225 	caddr_t addr,
1226 	size_t len,
1227 	uchar_t prot,
1228 	uchar_t maxprot,
1229 	uint_t flags,
1230 	struct cred *cr)
1231 {
1232 	struct hsnode *hp;
1233 
1234 	if (vp->v_flag & VNOMAP)
1235 		return (ENOSYS);
1236 
1237 	hp = VTOH(vp);
1238 	mutex_enter(&hp->hs_contents_lock);
1239 	hp->hs_mapcnt += btopr(len);
1240 	mutex_exit(&hp->hs_contents_lock);
1241 	return (0);
1242 }
1243 
1244 /*ARGSUSED*/
1245 static int
1246 hsfs_delmap(
1247 	struct vnode *vp,
1248 	offset_t off,
1249 	struct as *as,
1250 	caddr_t addr,
1251 	size_t len,
1252 	uint_t prot,
1253 	uint_t maxprot,
1254 	uint_t flags,
1255 	struct cred *cr)
1256 {
1257 	struct hsnode *hp;
1258 
1259 	if (vp->v_flag & VNOMAP)
1260 		return (ENOSYS);
1261 
1262 	hp = VTOH(vp);
1263 	mutex_enter(&hp->hs_contents_lock);
1264 	hp->hs_mapcnt -= btopr(len);	/* Count released mappings */
1265 	ASSERT(hp->hs_mapcnt >= 0);
1266 	mutex_exit(&hp->hs_contents_lock);
1267 	return (0);
1268 }
1269 
1270 /* ARGSUSED */
1271 static int
1272 hsfs_seek(struct vnode *vp, offset_t ooff, offset_t *noffp)
1273 {
1274 	return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
1275 }
1276 
1277 /* ARGSUSED */
1278 static int
1279 hsfs_frlock(
1280 	struct vnode *vp,
1281 	int cmd,
1282 	struct flock64 *bfp,
1283 	int flag,
1284 	offset_t offset,
1285 	struct flk_callback *flk_cbp,
1286 	cred_t *cr)
1287 {
1288 	struct hsnode *hp = VTOH(vp);
1289 
1290 	/*
1291 	 * If the file is being mapped, disallow fs_frlock.
1292 	 * We are not holding the hs_contents_lock while checking
1293 	 * hs_mapcnt because the current locking strategy drops all
1294 	 * locks before calling fs_frlock.
1295 	 * So, hs_mapcnt could change before we enter fs_frlock making
1296 	 * it meaningless to have held hs_contents_lock in the first place.
1297 	 */
1298 	if (hp->hs_mapcnt > 0 && MANDLOCK(vp, hp->hs_dirent.mode))
1299 		return (EAGAIN);
1300 
1301 	return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr));
1302 }
1303 
1304 const fs_operation_def_t hsfs_vnodeops_template[] = {
1305 	VOPNAME_OPEN, hsfs_open,
1306 	VOPNAME_CLOSE, hsfs_close,
1307 	VOPNAME_READ, hsfs_read,
1308 	VOPNAME_GETATTR, hsfs_getattr,
1309 	VOPNAME_ACCESS, hsfs_access,
1310 	VOPNAME_LOOKUP, hsfs_lookup,
1311 	VOPNAME_READDIR, hsfs_readdir,
1312 	VOPNAME_READLINK, hsfs_readlink,
1313 	VOPNAME_FSYNC, hsfs_fsync,
1314 	VOPNAME_INACTIVE, (fs_generic_func_p) hsfs_inactive,
1315 	VOPNAME_FID, hsfs_fid,
1316 	VOPNAME_SEEK, hsfs_seek,
1317 	VOPNAME_FRLOCK, hsfs_frlock,
1318 	VOPNAME_GETPAGE, hsfs_getpage,
1319 	VOPNAME_PUTPAGE, hsfs_putpage,
1320 	VOPNAME_MAP, (fs_generic_func_p) hsfs_map,
1321 	VOPNAME_ADDMAP, (fs_generic_func_p) hsfs_addmap,
1322 	VOPNAME_DELMAP, hsfs_delmap,
1323 	NULL, NULL
1324 };
1325 
1326 struct vnodeops *hsfs_vnodeops;
1327