xref: /illumos-gate/usr/src/uts/common/fs/pcfs/pc_vfsops.c (revision 7f3d7c9289dee6488b3cd2848a68c0b8580d750c)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 /*
27  * Copyright (c) 2017 by Delphix. All rights reserved.
28  * Copyright 2024 Oxide Computer Company
29  */
30 
31 #include <sys/param.h>
32 #include <sys/systm.h>
33 #include <sys/kmem.h>
34 #include <sys/user.h>
35 #include <sys/proc.h>
36 #include <sys/cred.h>
37 #include <sys/disp.h>
38 #include <sys/buf.h>
39 #include <sys/vfs.h>
40 #include <sys/vfs_opreg.h>
41 #include <sys/vnode.h>
42 #include <sys/fdio.h>
43 #include <sys/file.h>
44 #include <sys/uio.h>
45 #include <sys/conf.h>
46 #include <sys/statvfs.h>
47 #include <sys/mount.h>
48 #include <sys/pathname.h>
49 #include <sys/cmn_err.h>
50 #include <sys/debug.h>
51 #include <sys/sysmacros.h>
52 #include <sys/conf.h>
53 #include <sys/mkdev.h>
54 #include <sys/swap.h>
55 #include <sys/sunddi.h>
56 #include <sys/sunldi.h>
57 #include <sys/dktp/fdisk.h>
58 #include <sys/fs/pc_label.h>
59 #include <sys/fs/pc_fs.h>
60 #include <sys/fs/pc_dir.h>
61 #include <sys/fs/pc_node.h>
62 #include <fs/fs_subr.h>
63 #include <sys/modctl.h>
64 #include <sys/dkio.h>
65 #include <sys/open.h>
66 #include <sys/mntent.h>
67 #include <sys/policy.h>
68 #include <sys/atomic.h>
69 #include <sys/sdt.h>
70 
/*
 * The majority of PC media use a 512 byte sector size, but
 * occasionally you will run across a 1k sector size.
 * For media with a 1k sector size, fd_strategy() requires
 * the I/O size to be a 1k multiple; so when the sector size
 * is not yet known, always read 1k (twice PC_SECSIZE).
 */
#define	PC_SAFESECSIZE	(PC_SECSIZE * 2)

static int pcfs_pseudo_floppy(dev_t);

/*
 * Filesystem type initialization and the VFS operations that pcfsinit()
 * registers for PCFS, plus the pc_syncfsnodes() helper used by the sync
 * entry points.
 */
static int pcfsinit(int, char *);
static int pcfs_mount(struct vfs *, struct vnode *, struct mounta *,
	struct cred *);
static int pcfs_unmount(struct vfs *, int, struct cred *);
static int pcfs_root(struct vfs *, struct vnode **);
static int pcfs_statvfs(struct vfs *, struct statvfs64 *);
static int pc_syncfsnodes(struct pcfs *);
static int pcfs_sync(struct vfs *, short, struct cred *);
static int pcfs_vget(struct vfs *vfsp, struct vnode **vpp, struct fid *fidp);
static void pcfs_freevfs(vfs_t *vfsp);
static int pcfs_syncfs(struct vfs *, uint64_t, struct cred *);

/* FAT table I/O helpers. */
static int pc_readfat(struct pcfs *fsp, uchar_t *fatp);
static int pc_writefat(struct pcfs *fsp, daddr_t start);

/* Mount-time helpers: FAT type / BPB detection and mount option parsing. */
static int pc_getfattype(struct pcfs *fsp);
static void pcfs_parse_mntopts(struct pcfs *fsp);
99 
100 
/*
 * pcfs mount options table
 */

/*
 * Cancel lists. Each NULL-terminated array is attached to one option in
 * the table below and names the mutually exclusive counterpart of that
 * option (e.g. "nohidden" lists "hidden").
 */
static char *nohidden_cancel[] = { MNTOPT_PCFS_HIDDEN, NULL };
static char *hidden_cancel[] = { MNTOPT_PCFS_NOHIDDEN, NULL };
static char *nofoldcase_cancel[] = { MNTOPT_PCFS_FOLDCASE, NULL };
static char *foldcase_cancel[] = { MNTOPT_PCFS_NOFOLDCASE, NULL };
static char *clamptime_cancel[] = { MNTOPT_PCFS_NOCLAMPTIME, NULL };
static char *noclamptime_cancel[] = { MNTOPT_PCFS_CLAMPTIME, NULL };
static char *atime_cancel[] = { MNTOPT_NOATIME, NULL };
static char *noatime_cancel[] = { MNTOPT_ATIME, NULL };

/*
 * Options flagged MO_DEFAULT are "hidden", "nofoldcase", "clamptime" and
 * "timezone" (with default argument "+0"). Neither "atime" nor "noatime"
 * is a table default; pcfs_mount() sets one of them explicitly.
 * "timezone" and "secsize" carry a value (MO_HASVALUE).
 */
static mntopt_t mntopts[] = {
/*
 *	option name	cancel option	default arg	flags	opt data
 */
	{ MNTOPT_PCFS_NOHIDDEN, nohidden_cancel, NULL, 0, NULL },
	{ MNTOPT_PCFS_HIDDEN, hidden_cancel, NULL, MO_DEFAULT, NULL },
	{ MNTOPT_PCFS_NOFOLDCASE, nofoldcase_cancel, NULL, MO_DEFAULT, NULL },
	{ MNTOPT_PCFS_FOLDCASE, foldcase_cancel, NULL, 0, NULL },
	{ MNTOPT_PCFS_CLAMPTIME, clamptime_cancel, NULL, MO_DEFAULT, NULL },
	{ MNTOPT_PCFS_NOCLAMPTIME, noclamptime_cancel, NULL, 0, NULL },
	{ MNTOPT_NOATIME, noatime_cancel, NULL, 0, NULL },
	{ MNTOPT_ATIME, atime_cancel, NULL, 0, NULL },
	{ MNTOPT_PCFS_TIMEZONE, NULL, "+0", MO_DEFAULT | MO_HASVALUE, NULL },
	{ MNTOPT_PCFS_SECSIZE, NULL, NULL, MO_HASVALUE, NULL }
};

/* Option count and table, handed to the VFS framework through "vfw". */
static mntopts_t pcfs_mntopts = {
	sizeof (mntopts) / sizeof (mntopt_t),
	mntopts
};
134 
int pcfsdebuglevel = 0;

/*
 * pcfslock:	protects the list of mounted pc filesystems "pc_mounttab".
 * pcfs_lock:	(inside per filesystem structure "pcfs")
 *		per filesystem lock. Most of the vfsops and vnodeops are
 *		protected by this lock.
 * pcnodes_lock: protects the pcnode hash table "pcdhead", "pcfhead".
 *
 * Lock hierarchy: pcfslock > pcfs_lock > pcnodes_lock
 *
 * pcfs_mountcount:	used to prevent module unloads while there is still
 *			pcfs state from a former mount hanging around. With
 *			forced umount support, the filesystem module must not
 *			be allowed to go away before the last VFS_FREEVFS()
 *			call has been made.
 *			Since this is just an atomic counter, there's no need
 *			for locking.
 */
kmutex_t	pcfslock;
krwlock_t	pcnodes_lock;
uint32_t	pcfs_mountcount;

/* Filesystem type index passed to pcfsinit(); recorded there. */
static int pcfstype;

/* Filesystem definition: name, init routine, capability flags, options. */
static vfsdef_t vfw = {
	VFSDEF_VERSION,
	"pcfs",
	pcfsinit,
	VSW_HASPROTO|VSW_CANREMOUNT|VSW_STATS|VSW_CANLOFI|VSW_MOUNTDEV,
	&pcfs_mntopts
};

extern struct mod_ops mod_fsops;

/* Loadable module linkage used by _init(), _fini() and _info(). */
static struct modlfs modlfs = {
	&mod_fsops,
	"PC filesystem",
	&vfw
};

static struct modlinkage modlinkage = {
	MODREV_1,
	&modlfs,
	NULL
};
181 
182 int
183 _init(void)
184 {
185 	int	error;
186 
187 #if !defined(lint)
188 	/* make sure the on-disk structures are sane */
189 	ASSERT(sizeof (struct pcdir) == 32);
190 	ASSERT(sizeof (struct pcdir_lfn) == 32);
191 #endif
192 	mutex_init(&pcfslock, NULL, MUTEX_DEFAULT, NULL);
193 	rw_init(&pcnodes_lock, NULL, RW_DEFAULT, NULL);
194 	error = mod_install(&modlinkage);
195 	if (error) {
196 		mutex_destroy(&pcfslock);
197 		rw_destroy(&pcnodes_lock);
198 	}
199 	return (error);
200 }
201 
202 int
203 _fini(void)
204 {
205 	int	error;
206 
207 	/*
208 	 * If a forcedly unmounted instance is still hanging around,
209 	 * we cannot allow the module to be unloaded because that would
210 	 * cause panics once the VFS framework decides it's time to call
211 	 * into VFS_FREEVFS().
212 	 */
213 	if (pcfs_mountcount)
214 		return (EBUSY);
215 
216 	error = mod_remove(&modlinkage);
217 	if (error)
218 		return (error);
219 	mutex_destroy(&pcfslock);
220 	rw_destroy(&pcnodes_lock);
221 	/*
222 	 * Tear down the operations vectors
223 	 */
224 	(void) vfs_freevfsops_by_type(pcfstype);
225 	vn_freevnodeops(pcfs_fvnodeops);
226 	vn_freevnodeops(pcfs_dvnodeops);
227 	return (0);
228 }
229 
230 int
231 _info(struct modinfo *modinfop)
232 {
233 	return (mod_info(&modlinkage, modinfop));
234 }
235 
236 /* ARGSUSED1 */
237 static int
238 pcfsinit(int fstype, char *name)
239 {
240 	static const fs_operation_def_t pcfs_vfsops_template[] = {
241 		VFSNAME_MOUNT,		{ .vfs_mount = pcfs_mount },
242 		VFSNAME_UNMOUNT,	{ .vfs_unmount = pcfs_unmount },
243 		VFSNAME_ROOT,		{ .vfs_root = pcfs_root },
244 		VFSNAME_STATVFS,	{ .vfs_statvfs = pcfs_statvfs },
245 		VFSNAME_SYNC,		{ .vfs_sync = pcfs_sync },
246 		VFSNAME_VGET,		{ .vfs_vget = pcfs_vget },
247 		VFSNAME_FREEVFS,	{ .vfs_freevfs = pcfs_freevfs },
248 		VFSNAME_SYNCFS,		{ .vfs_syncfs = pcfs_syncfs },
249 		NULL,			NULL
250 	};
251 	int error;
252 
253 	error = vfs_setfsops(fstype, pcfs_vfsops_template, NULL);
254 	if (error != 0) {
255 		cmn_err(CE_WARN, "pcfsinit: bad vfs ops template");
256 		return (error);
257 	}
258 
259 	error = vn_make_ops("pcfs", pcfs_fvnodeops_template, &pcfs_fvnodeops);
260 	if (error != 0) {
261 		(void) vfs_freevfsops_by_type(fstype);
262 		cmn_err(CE_WARN, "pcfsinit: bad file vnode ops template");
263 		return (error);
264 	}
265 
266 	error = vn_make_ops("pcfsd", pcfs_dvnodeops_template, &pcfs_dvnodeops);
267 	if (error != 0) {
268 		(void) vfs_freevfsops_by_type(fstype);
269 		vn_freevnodeops(pcfs_fvnodeops);
270 		cmn_err(CE_WARN, "pcfsinit: bad dir vnode ops template");
271 		return (error);
272 	}
273 
274 	pcfstype = fstype;
275 	(void) pc_init();
276 	pcfs_mountcount = 0;
277 	return (0);
278 }
279 
/*
 * Head of the list of mounted PCFS instances, linked through pcfs_nxt.
 * Protected by pcfslock.
 */
static struct pcfs *pc_mounttab = NULL;

extern struct pcfs_args pc_tz;

/*
 *  Define some special logical drives we use internal to this file.
 *
 *  BOOT_PARTITION_DRIVE is selected by the ":boot" device name suffix.
 *  UNPARTITIONED_DRIVE is used when the device node exists as named, or
 *  when a floppy-style ":a" / ":b" suffix was given.
 *  PRIMARY_DOS_DRIVE is not referenced in this part of the file;
 *  presumably it is the first DOS logical drive (":c" / ":1") - verify
 *  against its users.
 */
#define	BOOT_PARTITION_DRIVE	99
#define	PRIMARY_DOS_DRIVE	1
#define	UNPARTITIONED_DRIVE	0
290 
291 static int
292 pcfs_device_identify(
293 	struct vfs *vfsp,
294 	struct mounta *uap,
295 	struct cred *cr,
296 	int *dos_ldrive,
297 	dev_t *xdev)
298 {
299 	struct pathname special;
300 	char *c;
301 	struct vnode *svp = NULL;
302 	struct vnode *lvp = NULL;
303 	int oflag, aflag;
304 	int error;
305 
306 	/*
307 	 * Resolve path name of special file being mounted.
308 	 */
309 	if (error = pn_get(uap->spec, UIO_USERSPACE, &special)) {
310 		return (error);
311 	}
312 
313 	*dos_ldrive = -1;
314 
315 	if (error =
316 	    lookupname(special.pn_path, UIO_SYSSPACE, FOLLOW, NULLVPP, &svp)) {
317 		/*
318 		 * If there's no device node, the name specified most likely
319 		 * maps to a PCFS-style "partition specifier" to select a
320 		 * harddisk primary/logical partition. Disable floppy-specific
321 		 * checks in such cases unless an explicit :A or :B is
322 		 * requested.
323 		 */
324 
325 		/*
326 		 * Split the pathname string at the last ':' separator.
327 		 * If there's no ':' in the device name, or the ':' is the
328 		 * last character in the string, the name is invalid and
329 		 * the error from the previous lookup will be returned.
330 		 */
331 		c = strrchr(special.pn_path, ':');
332 		if (c == NULL || strlen(c) == 0)
333 			goto devlookup_done;
334 
335 		*c++ = '\0';
336 
337 		/*
338 		 * PCFS partition name suffixes can be:
339 		 *	- "boot" to indicate the X86BOOT partition
340 		 *	- a drive letter [c-z] for the "DOS logical drive"
341 		 *	- a drive number 1..24 for the "DOS logical drive"
342 		 *	- a "floppy name letter", 'a' or 'b' (just strip this)
343 		 */
344 		if (strcasecmp(c, "boot") == 0) {
345 			/*
346 			 * The Solaris boot partition is requested.
347 			 */
348 			*dos_ldrive = BOOT_PARTITION_DRIVE;
349 		} else if (strspn(c, "0123456789") == strlen(c)) {
350 			/*
351 			 * All digits - parse the partition number.
352 			 */
353 			long drvnum = 0;
354 
355 			if ((error = ddi_strtol(c, NULL, 10, &drvnum)) == 0) {
356 				/*
357 				 * A number alright - in the allowed range ?
358 				 */
359 				if (drvnum > 24 || drvnum == 0)
360 					error = ENXIO;
361 			}
362 			if (error)
363 				goto devlookup_done;
364 			*dos_ldrive = (int)drvnum;
365 		} else if (strlen(c) == 1) {
366 			/*
367 			 * A single trailing character was specified.
368 			 *	- [c-zC-Z] means a harddisk partition, and
369 			 *	  we retrieve the partition number.
370 			 *	- [abAB] means a floppy drive, so we swallow
371 			 *	  the "drive specifier" and test later
372 			 *	  whether the physical device is a floppy.
373 			 */
374 			*c = tolower(*c);
375 			if (*c == 'a' || *c == 'b') {
376 				*dos_ldrive = UNPARTITIONED_DRIVE;
377 			} else if (*c < 'c' || *c > 'z') {
378 				error = ENXIO;
379 				goto devlookup_done;
380 			} else {
381 				*dos_ldrive = 1 + *c - 'c';
382 			}
383 		} else {
384 			/*
385 			 * Can't parse this - pass through previous error.
386 			 */
387 			goto devlookup_done;
388 		}
389 
390 
391 		error = lookupname(special.pn_path, UIO_SYSSPACE, FOLLOW,
392 		    NULLVPP, &svp);
393 	} else {
394 		*dos_ldrive = UNPARTITIONED_DRIVE;
395 	}
396 devlookup_done:
397 	pn_free(&special);
398 	if (error)
399 		return (error);
400 
401 	ASSERT(*dos_ldrive >= UNPARTITIONED_DRIVE);
402 
403 	/*
404 	 * Verify caller's permission to open the device special file.
405 	 */
406 	if ((vfsp->vfs_flag & VFS_RDONLY) != 0 ||
407 	    ((uap->flags & MS_RDONLY) != 0)) {
408 		oflag = FREAD;
409 		aflag = VREAD;
410 	} else {
411 		oflag = FREAD | FWRITE;
412 		aflag = VREAD | VWRITE;
413 	}
414 
415 	error = vfs_get_lofi(vfsp, &lvp);
416 
417 	if (error > 0) {
418 		if (error == ENOENT)
419 			error = ENODEV;
420 		goto out;
421 	} else if (error == 0) {
422 		*xdev = lvp->v_rdev;
423 	} else {
424 		*xdev = svp->v_rdev;
425 
426 		if (svp->v_type != VBLK) {
427 			error = ENOTBLK;
428 			goto out;
429 		}
430 
431 		if ((error = secpolicy_spec_open(cr, svp, oflag)) != 0)
432 			goto out;
433 	}
434 
435 	if (getmajor(*xdev) >= devcnt) {
436 		error = ENXIO;
437 		goto out;
438 	}
439 
440 	if ((error = VOP_ACCESS(svp, aflag, 0, cr, NULL)) != 0)
441 		goto out;
442 
443 out:
444 	if (svp != NULL)
445 		VN_RELE(svp);
446 	if (lvp != NULL)
447 		VN_RELE(lvp);
448 	return (error);
449 }
450 
451 static int
452 pcfs_device_ismounted(
453 	struct vfs *vfsp,
454 	int dos_ldrive,
455 	dev_t xdev,
456 	int *remounting,
457 	dev_t *pseudodev)
458 {
459 	struct pcfs *fsp;
460 	int remount = *remounting;
461 
462 	/*
463 	 * Ensure that this logical drive isn't already mounted, unless
464 	 * this is a REMOUNT request.
465 	 * Note: The framework will perform this check if the "...:c"
466 	 * PCFS-style "logical drive" syntax has not been used and an
467 	 * actually existing physical device is backing this filesystem.
468 	 * Once all block device drivers support PC-style partitioning,
469 	 * this codeblock can be dropped.
470 	 */
471 	*pseudodev = xdev;
472 
473 	if (dos_ldrive) {
474 		mutex_enter(&pcfslock);
475 		for (fsp = pc_mounttab; fsp; fsp = fsp->pcfs_nxt)
476 			if (fsp->pcfs_xdev == xdev &&
477 			    fsp->pcfs_ldrive == dos_ldrive) {
478 				mutex_exit(&pcfslock);
479 				if (remount) {
480 					return (0);
481 				} else {
482 					return (EBUSY);
483 				}
484 			}
485 		/*
486 		 * Assign a unique device number for the vfs
487 		 * The old way (getudev() + a constantly incrementing
488 		 * major number) was wrong because it changes vfs_dev
489 		 * across mounts and reboots, which breaks nfs file handles.
490 		 * UFS just uses the real dev_t. We can't do that because
491 		 * of the way pcfs opens fdisk partitons (the :c and :d
492 		 * partitions are on the same dev_t). Though that _might_
493 		 * actually be ok, since the file handle contains an
494 		 * absolute block number, it's probably better to make them
495 		 * different. So I think we should retain the original
496 		 * dev_t, but come up with a different minor number based
497 		 * on the logical drive that will _always_ come up the same.
498 		 * For now, we steal the upper 6 bits.
499 		 */
500 #ifdef notdef
501 		/* what should we do here? */
502 		if (((getminor(xdev) >> 12) & 0x3F) != 0)
503 			printf("whoops - upper bits used!\n");
504 #endif
505 		*pseudodev = makedevice(getmajor(xdev),
506 		    ((dos_ldrive << 12) | getminor(xdev)) & MAXMIN32);
507 		if (vfs_devmounting(*pseudodev, vfsp)) {
508 			mutex_exit(&pcfslock);
509 			return (EBUSY);
510 		}
511 		if (vfs_devismounted(*pseudodev)) {
512 			mutex_exit(&pcfslock);
513 			if (remount) {
514 				return (0);
515 			} else {
516 				return (EBUSY);
517 			}
518 		}
519 		mutex_exit(&pcfslock);
520 	} else {
521 		*pseudodev = xdev;
522 		if (vfs_devmounting(*pseudodev, vfsp)) {
523 			return (EBUSY);
524 		}
525 		if (vfs_devismounted(*pseudodev))
526 			if (remount) {
527 				return (0);
528 			} else {
529 				return (EBUSY);
530 			}
531 	}
532 
533 	/*
534 	 * This is not a remount. Even if MS_REMOUNT was requested,
535 	 * the caller needs to proceed as it would on an ordinary
536 	 * mount.
537 	 */
538 	*remounting = 0;
539 
540 	ASSERT(*pseudodev);
541 	return (0);
542 }
543 
544 /*
545  * Get the PCFS-specific mount options from the VFS framework.
546  * For "timezone" and "secsize", we need to parse the number
547  * ourselves and ensure its validity.
548  * Note: "secsize" is deliberately undocumented at this time,
549  * it's a workaround for devices (particularly: lofi image files)
550  * that don't support the DKIOCGMEDIAINFO ioctl for autodetection.
551  */
552 static void
553 pcfs_parse_mntopts(struct pcfs *fsp)
554 {
555 	char *c;
556 	char *endptr;
557 	long l;
558 	struct vfs *vfsp = fsp->pcfs_vfs;
559 
560 	ASSERT(fsp->pcfs_secondswest == 0);
561 	ASSERT(fsp->pcfs_secsize == 0);
562 
563 	if (vfs_optionisset(vfsp, MNTOPT_PCFS_HIDDEN, NULL))
564 		fsp->pcfs_flags |= PCFS_HIDDEN;
565 	if (vfs_optionisset(vfsp, MNTOPT_PCFS_FOLDCASE, NULL))
566 		fsp->pcfs_flags |= PCFS_FOLDCASE;
567 	if (vfs_optionisset(vfsp, MNTOPT_PCFS_NOCLAMPTIME, NULL))
568 		fsp->pcfs_flags |= PCFS_NOCLAMPTIME;
569 	if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL))
570 		fsp->pcfs_flags |= PCFS_NOATIME;
571 
572 	if (vfs_optionisset(vfsp, MNTOPT_PCFS_TIMEZONE, &c)) {
573 		if (ddi_strtol(c, &endptr, 10, &l) == 0 &&
574 		    endptr == c + strlen(c)) {
575 			/*
576 			 * A number alright - in the allowed range ?
577 			 */
578 			if (l <= -12*3600 || l >= 12*3600) {
579 				cmn_err(CE_WARN, "!pcfs: invalid use of "
580 				    "'timezone' mount option - %ld "
581 				    "is out of range. Assuming 0.", l);
582 				l = 0;
583 			}
584 		} else {
585 			cmn_err(CE_WARN, "!pcfs: invalid use of "
586 			    "'timezone' mount option - argument %s "
587 			    "is not a valid number. Assuming 0.", c);
588 			l = 0;
589 		}
590 		fsp->pcfs_secondswest = l;
591 	}
592 
593 	/*
594 	 * The "secsize=..." mount option is a workaround for the lack of
595 	 * lofi(4D) support for DKIOCGMEDIAINFO. If PCFS wants to parse the
596 	 * partition table of a disk image and it has been partitioned with
597 	 * sector sizes other than 512 bytes, we'd fail on loopback'ed disk
598 	 * images.
599 	 * That should really be fixed in lofi ... this is a workaround.
600 	 */
601 	if (vfs_optionisset(vfsp, MNTOPT_PCFS_SECSIZE, &c)) {
602 		if (ddi_strtol(c, &endptr, 10, &l) == 0 &&
603 		    endptr == c + strlen(c)) {
604 			/*
605 			 * A number alright - a valid sector size as well ?
606 			 */
607 			if (!VALID_SECSIZE(l)) {
608 				cmn_err(CE_WARN, "!pcfs: invalid use of "
609 				    "'secsize' mount option - %ld is "
610 				    "unsupported. Autodetecting.", l);
611 				l = 0;
612 			}
613 		} else {
614 			cmn_err(CE_WARN, "!pcfs: invalid use of "
615 			    "'secsize' mount option - argument %s "
616 			    "is not a valid number. Autodetecting.", c);
617 			l = 0;
618 		}
619 		fsp->pcfs_secsize = l;
620 		fsp->pcfs_sdshift = ddi_ffs(l / DEV_BSIZE) - 1;
621 	}
622 }
623 
624 /*
625  * vfs operations
626  */
627 
628 /*
629  * pcfs_mount - backend for VFS_MOUNT() on PCFS.
630  */
631 static int
632 pcfs_mount(
633 	struct vfs *vfsp,
634 	struct vnode *mvp,
635 	struct mounta *uap,
636 	struct cred *cr)
637 {
638 	struct pcfs *fsp;
639 	struct vnode *devvp;
640 	dev_t pseudodev;
641 	dev_t xdev;
642 	int dos_ldrive = 0;
643 	int error;
644 	int remounting;
645 
646 	if ((error = secpolicy_fs_mount(cr, mvp, vfsp)) != 0)
647 		return (error);
648 
649 	if (mvp->v_type != VDIR)
650 		return (ENOTDIR);
651 
652 	mutex_enter(&mvp->v_lock);
653 	if ((uap->flags & MS_REMOUNT) == 0 &&
654 	    (uap->flags & MS_OVERLAY) == 0 &&
655 	    (mvp->v_count != 1 || (mvp->v_flag & VROOT))) {
656 		mutex_exit(&mvp->v_lock);
657 		return (EBUSY);
658 	}
659 	mutex_exit(&mvp->v_lock);
660 
661 	/*
662 	 * PCFS doesn't do mount arguments anymore - everything's a mount
663 	 * option these days. In order not to break existing callers, we
664 	 * don't reject it yet, just warn that the data (if any) is ignored.
665 	 */
666 	if (uap->datalen != 0)
667 		cmn_err(CE_WARN, "!pcfs: deprecated use of mount(2) with "
668 		    "mount argument structures instead of mount options. "
669 		    "Ignoring mount(2) 'dataptr' argument.");
670 
671 	/*
672 	 * This is needed early, to make sure the access / open calls
673 	 * are done using the correct mode. Processing this mount option
674 	 * only when calling pcfs_parse_mntopts() would lead us to attempt
675 	 * a read/write access to a possibly writeprotected device, and
676 	 * a readonly mount attempt might fail because of that.
677 	 */
678 	if (uap->flags & MS_RDONLY) {
679 		vfsp->vfs_flag |= VFS_RDONLY;
680 		vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
681 	}
682 
683 	/*
684 	 * For most filesystems, this is just a lookupname() on the
685 	 * mount pathname string. PCFS historically has to do its own
686 	 * partition table parsing because not all Solaris architectures
687 	 * support all styles of partitioning that PC media can have, and
688 	 * hence PCFS understands "device names" that don't map to actual
689 	 * physical device nodes. Parsing the "PCFS syntax" for device
690 	 * names is done in pcfs_device_identify() - see there.
691 	 *
692 	 * Once all block device drivers that can host FAT filesystems have
693 	 * been enhanced to create device nodes for all PC-style partitions,
694 	 * this code can go away.
695 	 */
696 	if (error = pcfs_device_identify(vfsp, uap, cr, &dos_ldrive, &xdev))
697 		return (error);
698 
699 	/*
700 	 * As with looking up the actual device to mount, PCFS cannot rely
701 	 * on just the checks done by vfs_ismounted() whether a given device
702 	 * is mounted already. The additional check against the "PCFS syntax"
703 	 * is done in  pcfs_device_ismounted().
704 	 */
705 	remounting = (uap->flags & MS_REMOUNT);
706 
707 	if (error = pcfs_device_ismounted(vfsp, dos_ldrive, xdev, &remounting,
708 	    &pseudodev))
709 		return (error);
710 
711 	if (remounting)
712 		return (0);
713 
714 	/*
715 	 * Mount the filesystem.
716 	 * An instance structure is required before the attempt to locate
717 	 * and parse the FAT BPB. This is because mount options may change
718 	 * the behaviour of the filesystem type matching code. Precreate
719 	 * it and fill it in to a degree that allows parsing the mount
720 	 * options.
721 	 */
722 	devvp = makespecvp(xdev, VBLK);
723 	if (IS_SWAPVP(devvp)) {
724 		VN_RELE(devvp);
725 		return (EBUSY);
726 	}
727 	error = VOP_OPEN(&devvp,
728 	    (vfsp->vfs_flag & VFS_RDONLY) ? FREAD : FREAD | FWRITE, cr, NULL);
729 	if (error) {
730 		VN_RELE(devvp);
731 		return (error);
732 	}
733 
734 	fsp = kmem_zalloc(sizeof (*fsp), KM_SLEEP);
735 	fsp->pcfs_vfs = vfsp;
736 	fsp->pcfs_xdev = xdev;
737 	fsp->pcfs_devvp = devvp;
738 	fsp->pcfs_ldrive = dos_ldrive;
739 	mutex_init(&fsp->pcfs_lock, NULL, MUTEX_DEFAULT, NULL);
740 
741 	pcfs_parse_mntopts(fsp);
742 
743 	/*
744 	 * This is the actual "mount" - the PCFS superblock check.
745 	 *
746 	 * Find the requested logical drive and the FAT BPB therein.
747 	 * Check device type and flag the instance if media is removeable.
748 	 *
749 	 * Initializes most members of the filesystem instance structure.
750 	 * Returns EINVAL if no valid BPB can be found. Other errors may
751 	 * occur after I/O failures, or when invalid / unparseable partition
752 	 * tables are encountered.
753 	 */
754 	if (error = pc_getfattype(fsp))
755 		goto errout;
756 
757 	/*
758 	 * Now that the BPB has been parsed, this structural information
759 	 * is available and known to be valid. Initialize the VFS.
760 	 */
761 	vfsp->vfs_data = fsp;
762 	vfsp->vfs_dev = pseudodev;
763 	vfsp->vfs_fstype = pcfstype;
764 	vfs_make_fsid(&vfsp->vfs_fsid, pseudodev, pcfstype);
765 	vfsp->vfs_bcount = 0;
766 	vfsp->vfs_bsize = fsp->pcfs_clsize;
767 
768 	/*
769 	 * Validate that we can access the FAT and that it is, to the
770 	 * degree we can verify here, self-consistent.
771 	 */
772 	if (error = pc_verify(fsp))
773 		goto errout;
774 
775 	/*
776 	 * Record the time of the mount, to return as an "approximate"
777 	 * timestamp for the FAT root directory. Since FAT roots don't
778 	 * have timestamps, this is less confusing to the user than
779 	 * claiming "zero" / Jan/01/1970.
780 	 */
781 	gethrestime(&fsp->pcfs_mounttime);
782 
783 	/*
784 	 * Fix up the mount options. Because "noatime" is made default on
785 	 * removeable media only, a fixed disk will have neither "atime"
786 	 * nor "noatime" set. We set the options explicitly depending on
787 	 * the PCFS_NOATIME flag, to inform the user of what applies.
788 	 * Mount option cancellation will take care that the mutually
789 	 * exclusive 'other' is cleared.
790 	 */
791 	vfs_setmntopt(vfsp,
792 	    fsp->pcfs_flags & PCFS_NOATIME ? MNTOPT_NOATIME : MNTOPT_ATIME,
793 	    NULL, 0);
794 
795 	/*
796 	 * All clear - insert the FS instance into PCFS' list.
797 	 */
798 	mutex_enter(&pcfslock);
799 	fsp->pcfs_nxt = pc_mounttab;
800 	pc_mounttab = fsp;
801 	mutex_exit(&pcfslock);
802 	atomic_inc_32(&pcfs_mountcount);
803 	return (0);
804 
805 errout:
806 	(void) VOP_CLOSE(devvp,
807 	    vfsp->vfs_flag & VFS_RDONLY ? FREAD : FREAD | FWRITE,
808 	    1, (offset_t)0, cr, NULL);
809 	VN_RELE(devvp);
810 	mutex_destroy(&fsp->pcfs_lock);
811 	kmem_free(fsp, sizeof (*fsp));
812 	return (error);
813 
814 }
815 
816 static int
817 pcfs_unmount(
818 	struct vfs *vfsp,
819 	int flag,
820 	struct cred *cr)
821 {
822 	struct pcfs *fsp, *fsp1;
823 
824 	if (secpolicy_fs_unmount(cr, vfsp) != 0)
825 		return (EPERM);
826 
827 	fsp = VFSTOPCFS(vfsp);
828 
829 	/*
830 	 * We don't have to lock fsp because the VVFSLOCK in vfs layer will
831 	 * prevent lookuppn from crossing the mount point.
832 	 * If this is not a forced umount request and there's ongoing I/O,
833 	 * don't allow the mount to proceed.
834 	 */
835 	if (flag & MS_FORCE)
836 		vfsp->vfs_flag |= VFS_UNMOUNTED;
837 	else if (fsp->pcfs_nrefs)
838 		return (EBUSY);
839 
840 	mutex_enter(&pcfslock);
841 
842 	/*
843 	 * If this is a forced umount request or if the fs instance has
844 	 * been marked as beyond recovery, allow the umount to proceed
845 	 * regardless of state. pc_diskchanged() forcibly releases all
846 	 * inactive vnodes/pcnodes.
847 	 */
848 	if (flag & MS_FORCE || fsp->pcfs_flags & PCFS_IRRECOV) {
849 		rw_enter(&pcnodes_lock, RW_WRITER);
850 		pc_diskchanged(fsp);
851 		rw_exit(&pcnodes_lock);
852 	}
853 
854 	/* now there should be no pcp node on pcfhead or pcdhead. */
855 
856 	if (fsp == pc_mounttab) {
857 		pc_mounttab = fsp->pcfs_nxt;
858 	} else {
859 		for (fsp1 = pc_mounttab; fsp1 != NULL; fsp1 = fsp1->pcfs_nxt)
860 			if (fsp1->pcfs_nxt == fsp)
861 				fsp1->pcfs_nxt = fsp->pcfs_nxt;
862 	}
863 
864 	mutex_exit(&pcfslock);
865 
866 	/*
867 	 * Since we support VFS_FREEVFS(), there's no need to
868 	 * free the fsp right now. The framework will tell us
869 	 * when the right time to do so has arrived by calling
870 	 * into pcfs_freevfs.
871 	 */
872 	return (0);
873 }
874 
875 /*
876  * find root of pcfs
877  */
878 static int
879 pcfs_root(
880 	struct vfs *vfsp,
881 	struct vnode **vpp)
882 {
883 	struct pcfs *fsp;
884 	struct pcnode *pcp;
885 	int error;
886 
887 	fsp = VFSTOPCFS(vfsp);
888 	if (error = pc_lockfs(fsp, 0, 0))
889 		return (error);
890 
891 	pcp = pc_getnode(fsp, (daddr_t)0, 0, NULL);
892 	pc_unlockfs(fsp);
893 	*vpp = PCTOV(pcp);
894 	pcp->pc_flags |= PC_EXTERNAL;
895 	return (0);
896 }
897 
898 /*
899  * Get file system statistics.
900  */
901 static int
902 pcfs_statvfs(
903 	struct vfs *vfsp,
904 	struct statvfs64 *sp)
905 {
906 	struct pcfs *fsp;
907 	int error;
908 	dev32_t d32;
909 
910 	fsp = VFSTOPCFS(vfsp);
911 	error = pc_getfat(fsp);
912 	if (error)
913 		return (error);
914 	bzero(sp, sizeof (*sp));
915 	sp->f_bsize = sp->f_frsize = fsp->pcfs_clsize;
916 	sp->f_blocks = (fsblkcnt64_t)fsp->pcfs_ncluster;
917 	sp->f_bavail = sp->f_bfree = (fsblkcnt64_t)pc_freeclusters(fsp);
918 	sp->f_files = (fsfilcnt64_t)-1;
919 	sp->f_ffree = (fsfilcnt64_t)-1;
920 	sp->f_favail = (fsfilcnt64_t)-1;
921 #ifdef notdef
922 	(void) cmpldev(&d32, fsp->pcfs_devvp->v_rdev);
923 #endif /* notdef */
924 	(void) cmpldev(&d32, vfsp->vfs_dev);
925 	sp->f_fsid = d32;
926 	(void) strcpy(sp->f_basetype, vfssw[vfsp->vfs_fstype].vsw_name);
927 	sp->f_flag = vf_to_stf(vfsp->vfs_flag);
928 	sp->f_namemax = PCMAXNAMLEN;
929 	return (0);
930 }
931 
932 static int
933 pc_syncfsnodes(struct pcfs *fsp)
934 {
935 	struct pchead *hp;
936 	struct pcnode *pcp;
937 	int error;
938 
939 	if (error = pc_lockfs(fsp, 0, 0))
940 		return (error);
941 
942 	if (!(error = pc_syncfat(fsp))) {
943 		hp = pcfhead;
944 		while (hp < & pcfhead [ NPCHASH ]) {
945 			rw_enter(&pcnodes_lock, RW_READER);
946 			pcp = hp->pch_forw;
947 			while (pcp != (struct pcnode *)hp) {
948 				if (VFSTOPCFS(PCTOV(pcp) -> v_vfsp) == fsp)
949 					if (error = pc_nodesync(pcp))
950 						break;
951 				pcp = pcp -> pc_forw;
952 			}
953 			rw_exit(&pcnodes_lock);
954 			if (error)
955 				break;
956 			hp++;
957 		}
958 	}
959 	pc_unlockfs(fsp);
960 	return (error);
961 }
962 
963 /*
964  * Flush any pending I/O.
965  */
966 static int
967 pcfs_sync(struct vfs *vfsp, short flag, struct cred *cr)
968 {
969 	struct pcfs *fsp;
970 	int error = 0;
971 
972 	/* this prevents the filesystem from being umounted. */
973 	mutex_enter(&pcfslock);
974 	if (vfsp != NULL) {
975 		fsp = VFSTOPCFS(vfsp);
976 		if (!(fsp->pcfs_flags & PCFS_IRRECOV)) {
977 			error = pc_syncfsnodes(fsp);
978 		} else {
979 			rw_enter(&pcnodes_lock, RW_WRITER);
980 			pc_diskchanged(fsp);
981 			rw_exit(&pcnodes_lock);
982 			error = EIO;
983 		}
984 	} else {
985 		fsp = pc_mounttab;
986 		while (fsp != NULL) {
987 			if (fsp->pcfs_flags & PCFS_IRRECOV) {
988 				rw_enter(&pcnodes_lock, RW_WRITER);
989 				pc_diskchanged(fsp);
990 				rw_exit(&pcnodes_lock);
991 				error = EIO;
992 				break;
993 			}
994 			error = pc_syncfsnodes(fsp);
995 			if (error) break;
996 			fsp = fsp->pcfs_nxt;
997 		}
998 	}
999 	mutex_exit(&pcfslock);
1000 	return (error);
1001 }
1002 
1003 static int
1004 pcfs_syncfs(vfs_t *vfsp, uint64_t flags, cred_t *cr)
1005 {
1006 	int ret;
1007 	struct pcfs *fsp;
1008 
1009 	if (flags != 0) {
1010 		return (ENOTSUP);
1011 	}
1012 
1013 	fsp = VFSTOPCFS(vfsp);
1014 	if ((fsp->pcfs_flags & PCFS_IRRECOV) == 0) {
1015 		ret = pc_syncfsnodes(fsp);
1016 	} else {
1017 		rw_enter(&pcnodes_lock, RW_WRITER);
1018 		pc_diskchanged(fsp);
1019 		rw_exit(&pcnodes_lock);
1020 		ret = EIO;
1021 	}
1022 
1023 	return (ret);
1024 }
1025 
1026 int
1027 pc_lockfs(struct pcfs *fsp, int diskchanged, int releasing)
1028 {
1029 	int err;
1030 
1031 	if ((fsp->pcfs_flags & PCFS_IRRECOV) && !releasing)
1032 		return (EIO);
1033 
1034 	if ((fsp->pcfs_flags & PCFS_LOCKED) && (fsp->pcfs_owner == curthread)) {
1035 		fsp->pcfs_count++;
1036 	} else {
1037 		mutex_enter(&fsp->pcfs_lock);
1038 		if (fsp->pcfs_flags & PCFS_LOCKED)
1039 			panic("pc_lockfs");
1040 		/*
1041 		 * We check the IRRECOV bit again just in case somebody
1042 		 * snuck past the initial check but then got held up before
1043 		 * they could grab the lock.  (And in the meantime someone
1044 		 * had grabbed the lock and set the bit)
1045 		 */
1046 		if (!diskchanged && !(fsp->pcfs_flags & PCFS_IRRECOV)) {
1047 			if ((err = pc_getfat(fsp))) {
1048 				mutex_exit(&fsp->pcfs_lock);
1049 				return (err);
1050 			}
1051 		}
1052 		fsp->pcfs_flags |= PCFS_LOCKED;
1053 		fsp->pcfs_owner = curthread;
1054 		fsp->pcfs_count++;
1055 	}
1056 	return (0);
1057 }
1058 
1059 void
1060 pc_unlockfs(struct pcfs *fsp)
1061 {
1062 
1063 	if ((fsp->pcfs_flags & PCFS_LOCKED) == 0)
1064 		panic("pc_unlockfs");
1065 	if (--fsp->pcfs_count < 0)
1066 		panic("pc_unlockfs: count");
1067 	if (fsp->pcfs_count == 0) {
1068 		fsp->pcfs_flags &= ~PCFS_LOCKED;
1069 		fsp->pcfs_owner = 0;
1070 		mutex_exit(&fsp->pcfs_lock);
1071 	}
1072 }
1073 
1074 int
1075 pc_syncfat(struct pcfs *fsp)
1076 {
1077 	struct buf *bp;
1078 	int nfat;
1079 	int	error = 0;
1080 	struct fat_od_fsi *fsinfo_disk;
1081 
1082 	if ((fsp->pcfs_fatp == NULL) ||
1083 	    !(fsp->pcfs_flags & PCFS_FATMOD))
1084 		return (0);
1085 	/*
1086 	 * write out all copies of FATs
1087 	 */
1088 	fsp->pcfs_flags &= ~PCFS_FATMOD;
1089 	fsp->pcfs_fattime = gethrestime_sec() + PCFS_DISKTIMEOUT;
1090 	for (nfat = 0; nfat < fsp->pcfs_numfat; nfat++) {
1091 		error = pc_writefat(fsp, pc_dbdaddr(fsp,
1092 		    fsp->pcfs_fatstart + nfat * fsp->pcfs_fatsec));
1093 		if (error) {
1094 			pc_mark_irrecov(fsp);
1095 			return (EIO);
1096 		}
1097 	}
1098 	pc_clear_fatchanges(fsp);
1099 
1100 	/*
1101 	 * Write out fsinfo sector.
1102 	 */
1103 	if (IS_FAT32(fsp)) {
1104 		bp = bread(fsp->pcfs_xdev,
1105 		    pc_dbdaddr(fsp, fsp->pcfs_fsistart), fsp->pcfs_secsize);
1106 		if (bp->b_flags & (B_ERROR | B_STALE)) {
1107 			error = geterror(bp);
1108 		}
1109 		fsinfo_disk = (fat_od_fsi_t *)(bp->b_un.b_addr);
1110 		if (!error && FSISIG_OK(fsinfo_disk)) {
1111 			fsinfo_disk->fsi_incore.fs_free_clusters =
1112 			    LE_32(fsp->pcfs_fsinfo.fs_free_clusters);
1113 			fsinfo_disk->fsi_incore.fs_next_free =
1114 			    LE_32(FSINFO_UNKNOWN);
1115 			bwrite2(bp);
1116 			error = geterror(bp);
1117 		}
1118 		brelse(bp);
1119 		if (error) {
1120 			pc_mark_irrecov(fsp);
1121 			return (EIO);
1122 		}
1123 	}
1124 	return (0);
1125 }
1126 
1127 void
1128 pc_invalfat(struct pcfs *fsp)
1129 {
1130 	struct pcfs *xfsp;
1131 	int mount_cnt = 0;
1132 
1133 	if (fsp->pcfs_fatp == NULL)
1134 		panic("pc_invalfat");
1135 	/*
1136 	 * Release FAT
1137 	 */
1138 	kmem_free(fsp->pcfs_fatp, fsp->pcfs_fatsec * fsp->pcfs_secsize);
1139 	fsp->pcfs_fatp = NULL;
1140 	kmem_free(fsp->pcfs_fat_changemap, fsp->pcfs_fat_changemapsize);
1141 	fsp->pcfs_fat_changemap = NULL;
1142 	/*
1143 	 * Invalidate all the blocks associated with the device.
1144 	 * Not needed if stateless.
1145 	 */
1146 	for (xfsp = pc_mounttab; xfsp; xfsp = xfsp->pcfs_nxt)
1147 		if (xfsp != fsp && xfsp->pcfs_xdev == fsp->pcfs_xdev)
1148 			mount_cnt++;
1149 
1150 	if (!mount_cnt)
1151 		binval(fsp->pcfs_xdev);
1152 	/*
1153 	 * close mounted device
1154 	 */
1155 	(void) VOP_CLOSE(fsp->pcfs_devvp,
1156 	    (PCFSTOVFS(fsp)->vfs_flag & VFS_RDONLY) ? FREAD : FREAD|FWRITE,
1157 	    1, (offset_t)0, CRED(), NULL);
1158 }
1159 
1160 void
1161 pc_badfs(struct pcfs *fsp)
1162 {
1163 	cmn_err(CE_WARN, "corrupted PC file system on dev (%x.%x):%d\n",
1164 	    getmajor(fsp->pcfs_devvp->v_rdev),
1165 	    getminor(fsp->pcfs_devvp->v_rdev), fsp->pcfs_ldrive);
1166 }
1167 
1168 /*
1169  * The problem with supporting NFS on the PCFS filesystem is that there
1170  * is no good place to keep the generation number. The only possible
1171  * place is inside a directory entry. There are a few words that we
1172  * don't use - they store NT & OS/2 attributes, and the creation/last access
1173  * time of the file - but it seems wrong to use them. In addition, directory
1174  * entries come and go. If a directory is removed completely, its directory
1175  * blocks are freed and the generation numbers are lost. Whereas in ufs,
1176  * inode blocks are dedicated for inodes, so the generation numbers are
1177  * permanently kept on the disk.
1178  */
1179 static int
1180 pcfs_vget(struct vfs *vfsp, struct vnode **vpp, struct fid *fidp)
1181 {
1182 	struct pcnode *pcp;
1183 	struct pc_fid *pcfid;
1184 	struct pcfs *fsp;
1185 	struct pcdir *ep;
1186 	daddr_t eblkno;
1187 	int eoffset;
1188 	struct buf *bp;
1189 	int error;
1190 	pc_cluster32_t	cn;
1191 
1192 	pcfid = (struct pc_fid *)fidp;
1193 	fsp = VFSTOPCFS(vfsp);
1194 
1195 	error = pc_lockfs(fsp, 0, 0);
1196 	if (error) {
1197 		*vpp = NULL;
1198 		return (error);
1199 	}
1200 
1201 	if (pcfid->pcfid_block == 0) {
1202 		pcp = pc_getnode(fsp, (daddr_t)0, 0, NULL);
1203 		pcp->pc_flags |= PC_EXTERNAL;
1204 		*vpp = PCTOV(pcp);
1205 		pc_unlockfs(fsp);
1206 		return (0);
1207 	}
1208 	eblkno = pcfid->pcfid_block;
1209 	eoffset = pcfid->pcfid_offset;
1210 
1211 	if ((pc_dbtocl(fsp,
1212 	    eblkno - fsp->pcfs_dosstart) >= fsp->pcfs_ncluster) ||
1213 	    (eoffset > fsp->pcfs_clsize)) {
1214 		pc_unlockfs(fsp);
1215 		*vpp = NULL;
1216 		return (EINVAL);
1217 	}
1218 
1219 	if (eblkno >= fsp->pcfs_datastart || (eblkno - fsp->pcfs_rdirstart)
1220 	    < (fsp->pcfs_rdirsec & ~(fsp->pcfs_spcl - 1))) {
1221 		bp = bread(fsp->pcfs_xdev, pc_dbdaddr(fsp, eblkno),
1222 		    fsp->pcfs_clsize);
1223 	} else {
1224 		/*
1225 		 * This is an access "backwards" into the FAT12/FAT16
1226 		 * root directory. A better code structure would
1227 		 * significantly improve maintainability here ...
1228 		 */
1229 		bp = bread(fsp->pcfs_xdev, pc_dbdaddr(fsp, eblkno),
1230 		    (int)(fsp->pcfs_datastart - eblkno) * fsp->pcfs_secsize);
1231 	}
1232 	if (bp->b_flags & (B_ERROR | B_STALE)) {
1233 		error = geterror(bp);
1234 		brelse(bp);
1235 		if (error)
1236 			pc_mark_irrecov(fsp);
1237 		*vpp = NULL;
1238 		pc_unlockfs(fsp);
1239 		return (error);
1240 	}
1241 	ep = (struct pcdir *)(bp->b_un.b_addr + eoffset);
1242 	/*
1243 	 * Ok, if this is a valid file handle that we gave out,
1244 	 * then simply ensuring that the creation time matches,
1245 	 * the entry has not been deleted, and it has a valid first
1246 	 * character should be enough.
1247 	 *
1248 	 * Unfortunately, verifying that the <blkno, offset> _still_
1249 	 * refers to a directory entry is not easy, since we'd have
1250 	 * to search _all_ directories starting from root to find it.
1251 	 * That's a high price to pay just in case somebody is forging
1252 	 * file handles. So instead we verify that as much of the
1253 	 * entry is valid as we can:
1254 	 *
1255 	 * 1. The starting cluster is 0 (unallocated) or valid
1256 	 * 2. It is not an LFN entry
1257 	 * 3. It is not hidden (unless mounted as such)
1258 	 * 4. It is not the label
1259 	 */
1260 	cn = pc_getstartcluster(fsp, ep);
1261 	/*
1262 	 * if the starting cluster is valid, but not valid according
1263 	 * to pc_validcl(), force it to be to simplify the following if.
1264 	 */
1265 	if (cn == 0)
1266 		cn = PCF_FIRSTCLUSTER;
1267 	if (IS_FAT32(fsp)) {
1268 		if (cn >= PCF_LASTCLUSTER32)
1269 			cn = PCF_FIRSTCLUSTER;
1270 	} else {
1271 		if (cn >= PCF_LASTCLUSTER)
1272 			cn = PCF_FIRSTCLUSTER;
1273 	}
1274 	if ((!pc_validcl(fsp, cn)) ||
1275 	    (PCDL_IS_LFN(ep)) ||
1276 	    (PCA_IS_HIDDEN(fsp, ep->pcd_attr)) ||
1277 	    ((ep->pcd_attr & PCA_LABEL) == PCA_LABEL)) {
1278 		bp->b_flags |= B_STALE | B_AGE;
1279 		brelse(bp);
1280 		pc_unlockfs(fsp);
1281 		return (EINVAL);
1282 	}
1283 	if ((ep->pcd_crtime.pct_time == pcfid->pcfid_ctime) &&
1284 	    (ep->pcd_filename[0] != PCD_ERASED) &&
1285 	    (pc_validchar(ep->pcd_filename[0]) ||
1286 	    (ep->pcd_filename[0] == '.' && ep->pcd_filename[1] == '.'))) {
1287 		pcp = pc_getnode(fsp, eblkno, eoffset, ep);
1288 		pcp->pc_flags |= PC_EXTERNAL;
1289 		*vpp = PCTOV(pcp);
1290 	} else {
1291 		*vpp = NULL;
1292 	}
1293 	bp->b_flags |= B_STALE | B_AGE;
1294 	brelse(bp);
1295 	pc_unlockfs(fsp);
1296 	return (0);
1297 }
1298 
1299 /*
1300  * Unfortunately, FAT32 fat's can be pretty big (On a 1 gig jaz drive, about
1301  * a meg), so we can't bread() it all in at once. This routine reads a
1302  * fat a chunk at a time.
1303  */
1304 static int
1305 pc_readfat(struct pcfs *fsp, uchar_t *fatp)
1306 {
1307 	struct buf *bp;
1308 	size_t off;
1309 	size_t readsize;
1310 	daddr_t diskblk;
1311 	size_t fatsize = fsp->pcfs_fatsec * fsp->pcfs_secsize;
1312 	daddr_t start = fsp->pcfs_fatstart;
1313 
1314 	readsize = fsp->pcfs_clsize;
1315 	for (off = 0; off < fatsize; off += readsize, fatp += readsize) {
1316 		if (readsize > (fatsize - off))
1317 			readsize = fatsize - off;
1318 		diskblk = pc_dbdaddr(fsp, start +
1319 		    pc_cltodb(fsp, pc_lblkno(fsp, off)));
1320 		bp = bread(fsp->pcfs_xdev, diskblk, readsize);
1321 		if (bp->b_flags & (B_ERROR | B_STALE)) {
1322 			brelse(bp);
1323 			return (EIO);
1324 		}
1325 		bp->b_flags |= B_STALE | B_AGE;
1326 		bcopy(bp->b_un.b_addr, fatp, readsize);
1327 		brelse(bp);
1328 	}
1329 	return (0);
1330 }
1331 
1332 /*
1333  * We write the FAT out a _lot_, in order to make sure that it
1334  * is up-to-date. But on a FAT32 system (large drive, small clusters)
1335  * the FAT might be a couple of megabytes, and writing it all out just
1336  * because we created or deleted a small file is painful (especially
1337  * since we do it for each alternate FAT too). So instead, for FAT16 and
1338  * FAT32 we only write out the bit that has changed. We don't clear
1339  * the 'updated' fields here because the caller might be writing out
1340  * several FATs, so the caller must use pc_clear_fatchanges() after
1341  * all FATs have been updated.
1342  * This function doesn't take "start" from fsp->pcfs_dosstart because
1343  * callers can use it to write either the primary or any of the alternate
1344  * FAT tables.
1345  */
1346 static int
1347 pc_writefat(struct pcfs *fsp, daddr_t start)
1348 {
1349 	struct buf *bp;
1350 	size_t off;
1351 	size_t writesize;
1352 	int	error;
1353 	uchar_t *fatp = fsp->pcfs_fatp;
1354 	size_t fatsize = fsp->pcfs_fatsec * fsp->pcfs_secsize;
1355 
1356 	writesize = fsp->pcfs_clsize;
1357 	for (off = 0; off < fatsize; off += writesize, fatp += writesize) {
1358 		if (writesize > (fatsize - off))
1359 			writesize = fatsize - off;
1360 		if (!pc_fat_is_changed(fsp, pc_lblkno(fsp, off))) {
1361 			continue;
1362 		}
1363 		bp = ngeteblk(writesize);
1364 		bp->b_edev = fsp->pcfs_xdev;
1365 		bp->b_dev = cmpdev(bp->b_edev);
1366 		bp->b_blkno = start + pc_dbdaddr(fsp,
1367 		    pc_cltodb(fsp, pc_lblkno(fsp, off)));
1368 		DTRACE_PROBE3(pc_writefat, longlong_t, bp->b_blkno,
1369 		    uchar_t *, fatp,
1370 		    size_t, writesize);
1371 		bcopy(fatp, bp->b_un.b_addr, writesize);
1372 		bwrite2(bp);
1373 		error = geterror(bp);
1374 		brelse(bp);
1375 		if (error) {
1376 			return (error);
1377 		}
1378 	}
1379 	return (0);
1380 }
1381 
1382 /*
1383  * Mark the FAT cluster that 'cn' is stored in as modified.
1384  */
1385 void
1386 pc_mark_fat_updated(struct pcfs *fsp, pc_cluster32_t cn)
1387 {
1388 	pc_cluster32_t	bn;
1389 	size_t		size;
1390 
1391 	/* which fat block is the cluster number stored in? */
1392 	if (IS_FAT32(fsp)) {
1393 		size = sizeof (pc_cluster32_t);
1394 		bn = pc_lblkno(fsp, cn * size);
1395 		fsp->pcfs_fat_changemap[bn] = 1;
1396 	} else if (IS_FAT16(fsp)) {
1397 		size = sizeof (pc_cluster16_t);
1398 		bn = pc_lblkno(fsp, cn * size);
1399 		fsp->pcfs_fat_changemap[bn] = 1;
1400 	} else {
1401 		offset_t off;
1402 		pc_cluster32_t nbn;
1403 
1404 		ASSERT(IS_FAT12(fsp));
1405 		off = cn + (cn >> 1);
1406 		bn = pc_lblkno(fsp, off);
1407 		fsp->pcfs_fat_changemap[bn] = 1;
1408 		/* does this field wrap into the next fat cluster? */
1409 		nbn = pc_lblkno(fsp, off + 1);
1410 		if (nbn != bn) {
1411 			fsp->pcfs_fat_changemap[nbn] = 1;
1412 		}
1413 	}
1414 }
1415 
1416 /*
1417  * return whether the FAT cluster 'bn' is updated and needs to
1418  * be written out.
1419  */
1420 int
1421 pc_fat_is_changed(struct pcfs *fsp, pc_cluster32_t bn)
1422 {
1423 	return (fsp->pcfs_fat_changemap[bn] == 1);
1424 }
1425 
1426 /*
1427  * Implementation of VFS_FREEVFS() to support forced umounts.
1428  * This is called by the vfs framework after umount, to trigger
1429  * the release of any resources still associated with the given
1430  * vfs_t once the need to keep them has gone away.
1431  */
1432 void
1433 pcfs_freevfs(vfs_t *vfsp)
1434 {
1435 	struct pcfs *fsp = VFSTOPCFS(vfsp);
1436 
1437 	mutex_enter(&pcfslock);
1438 	/*
1439 	 * Purging the FAT closes the device - can't do any more
1440 	 * I/O after this.
1441 	 */
1442 	if (fsp->pcfs_fatp != NULL)
1443 		pc_invalfat(fsp);
1444 	mutex_exit(&pcfslock);
1445 
1446 	VN_RELE(fsp->pcfs_devvp);
1447 	mutex_destroy(&fsp->pcfs_lock);
1448 	kmem_free(fsp, sizeof (*fsp));
1449 
1450 	/*
1451 	 * Allow _fini() to succeed now, if so desired.
1452 	 */
1453 	atomic_dec_32(&pcfs_mountcount);
1454 }
1455 
1456 
1457 /*
1458  * PC-style partition parsing and FAT BPB identification/validation code.
1459  * The partition parsers here assume:
1460  *	- a FAT filesystem will be in a partition that has one of a set of
1461  *	  recognized partition IDs
1462  *	- the user wants the 'numbering' (C:, D:, ...) that one would get
1463  *	  on MSDOS 6.x.
1464  *	  That means any non-FAT partition type (NTFS, HPFS, or any Linux fs)
1465  *	  will not factor in the enumeration.
1466  * These days, such assumptions should be revisited. FAT is no longer the
1467  * only game in 'PC town'.
1468  */
1469 /*
1470  * isDosDrive()
1471  *	Boolean function.  Give it the systid field for an fdisk partition
1472  *	and it decides if that's a systid that describes a DOS drive.  We
1473  *	use systid values defined in sys/dktp/fdisk.h.
1474  */
1475 static int
1476 isDosDrive(uchar_t checkMe)
1477 {
1478 	return ((checkMe == DOSOS12) || (checkMe == DOSOS16) ||
1479 	    (checkMe == DOSHUGE) || (checkMe == FDISK_WINDOWS) ||
1480 	    (checkMe == FDISK_EXT_WIN) || (checkMe == FDISK_FAT95) ||
1481 	    (checkMe == DIAGPART));
1482 }
1483 
1484 
1485 /*
1486  * isDosExtended()
1487  *	Boolean function.  Give it the systid field for an fdisk partition
1488  *	and it decides if that's a systid that describes an extended DOS
1489  *	partition.
1490  */
1491 static int
1492 isDosExtended(uchar_t checkMe)
1493 {
1494 	return ((checkMe == EXTDOS) || (checkMe == FDISK_EXTLBA));
1495 }
1496 
1497 
1498 /*
1499  * isBootPart()
1500  *	Boolean function.  Give it the systid field for an fdisk partition
1501  *	and it decides if that's a systid that describes a Solaris boot
1502  *	partition.
1503  */
1504 static int
1505 isBootPart(uchar_t checkMe)
1506 {
1507 	return (checkMe == X86BOOT);
1508 }
1509 
1510 
1511 /*
1512  * noLogicalDrive()
1513  *	Display error message about not being able to find a logical
1514  *	drive.
1515  */
1516 static void
1517 noLogicalDrive(int ldrive)
1518 {
1519 	if (ldrive == BOOT_PARTITION_DRIVE) {
1520 		cmn_err(CE_NOTE, "!pcfs: no boot partition");
1521 	} else {
1522 		cmn_err(CE_NOTE, "!pcfs: %d: no such logical drive", ldrive);
1523 	}
1524 }
1525 
1526 
1527 /*
1528  * findTheDrive()
1529  *	Discover offset of the requested logical drive, and return
1530  *	that offset (startSector), the systid of that drive (sysid),
1531  *	and a buffer pointer (bp), with the buffer contents being
1532  *	the first sector of the logical drive (i.e., the sector that
1533  *	contains the BPB for that drive).
1534  *
1535  * Note: this code is not capable of addressing >2TB disks, as it uses
1536  *       daddr_t not diskaddr_t, some of the calculations would overflow
1537  */
1538 #define	COPY_PTBL(mbr, ptblp)					\
1539 	bcopy(&(((struct mboot *)(mbr))->parts), (ptblp),	\
1540 	    FD_NUMPART * sizeof (struct ipart))
1541 
1542 static int
1543 findTheDrive(struct pcfs *fsp, buf_t **bp)
1544 {
1545 	int ldrive = fsp->pcfs_ldrive;
1546 	dev_t dev = fsp->pcfs_devvp->v_rdev;
1547 
1548 	struct ipart dosp[FD_NUMPART];	/* incore fdisk partition structure */
1549 	daddr_t lastseek = 0;		/* Disk block we sought previously */
1550 	daddr_t diskblk = 0;		/* Disk block to get */
1551 	daddr_t xstartsect;		/* base of Extended DOS partition */
1552 	int logicalDriveCount = 0;	/* Count of logical drives seen */
1553 	int extendedPart = -1;		/* index of extended dos partition */
1554 	int primaryPart = -1;		/* index of primary dos partition */
1555 	int bootPart = -1;		/* index of a Solaris boot partition */
1556 	uint32_t xnumsect = 0;		/* length of extended DOS partition */
1557 	int driveIndex;			/* computed FDISK table index */
1558 	daddr_t startsec;
1559 	len_t mediasize;
1560 	int i;
1561 	/*
1562 	 * Count of drives in the current extended partition's
1563 	 * FDISK table, and indexes of the drives themselves.
1564 	 */
1565 	int extndDrives[FD_NUMPART];
1566 	int numDrives = 0;
1567 
1568 	/*
1569 	 * Count of drives (beyond primary) in master boot record's
1570 	 * FDISK table, and indexes of the drives themselves.
1571 	 */
1572 	int extraDrives[FD_NUMPART];
1573 	int numExtraDrives = 0;
1574 
1575 	/*
1576 	 * "ldrive == 0" should never happen, as this is a request to
1577 	 * mount the physical device (and ignore partitioning). The code
1578 	 * in pcfs_mount() should have made sure that a logical drive number
1579 	 * is at least 1, meaning we're looking for drive "C:". It is not
1580 	 * safe (and a bug in the callers of this function) to request logical
1581 	 * drive number 0; we could ASSERT() but a graceful EIO is a more
1582 	 * polite way.
1583 	 */
1584 	if (ldrive == 0) {
1585 		cmn_err(CE_NOTE, "!pcfs: request for logical partition zero");
1586 		noLogicalDrive(ldrive);
1587 		return (EIO);
1588 	}
1589 
1590 	/*
1591 	 *  Copy from disk block into memory aligned structure for fdisk usage.
1592 	 */
1593 	COPY_PTBL((*bp)->b_un.b_addr, dosp);
1594 
1595 	/*
1596 	 * This check is ok because a FAT BPB and a master boot record (MBB)
1597 	 * have the same signature, in the same position within the block.
1598 	 */
1599 	if (bpb_get_BPBSig((*bp)->b_un.b_addr) != MBB_MAGIC) {
1600 		cmn_err(CE_NOTE, "!pcfs: MBR partition table signature err, "
1601 		    "device (%x.%x):%d\n",
1602 		    getmajor(dev), getminor(dev), ldrive);
1603 		return (EINVAL);
1604 	}
1605 
1606 	/*
1607 	 * Get a summary of what is in the Master FDISK table.
1608 	 * Normally we expect to find one partition marked as a DOS drive.
1609 	 * This partition is the one Windows calls the primary dos partition.
1610 	 * If the machine has any logical drives then we also expect
1611 	 * to find a partition marked as an extended DOS partition.
1612 	 *
1613 	 * Sometimes we'll find multiple partitions marked as DOS drives.
1614 	 * The Solaris fdisk program allows these partitions
1615 	 * to be created, but Windows fdisk no longer does.  We still need
1616 	 * to support these, though, since Windows does.  We also need to fix
1617 	 * our fdisk to behave like the Windows version.
1618 	 *
1619 	 * It turns out that some off-the-shelf media have *only* an
1620 	 * Extended partition, so we need to deal with that case as well.
1621 	 *
1622 	 * Only a single (the first) Extended or Boot Partition will
1623 	 * be recognized.  Any others will be ignored.
1624 	 */
1625 	for (i = 0; i < FD_NUMPART; i++) {
1626 		DTRACE_PROBE4(primarypart, struct pcfs *, fsp,
1627 		    uint_t, (uint_t)dosp[i].systid,
1628 		    uint_t, LE_32(dosp[i].relsect),
1629 		    uint_t, LE_32(dosp[i].numsect));
1630 
1631 		if (isDosDrive(dosp[i].systid)) {
1632 			if (primaryPart < 0) {
1633 				logicalDriveCount++;
1634 				primaryPart = i;
1635 			} else {
1636 				extraDrives[numExtraDrives++] = i;
1637 			}
1638 			continue;
1639 		}
1640 		if ((extendedPart < 0) && isDosExtended(dosp[i].systid)) {
1641 			extendedPart = i;
1642 			continue;
1643 		}
1644 		if ((bootPart < 0) && isBootPart(dosp[i].systid)) {
1645 			bootPart = i;
1646 			continue;
1647 		}
1648 	}
1649 
1650 	if (ldrive == BOOT_PARTITION_DRIVE) {
1651 		if (bootPart < 0) {
1652 			noLogicalDrive(ldrive);
1653 			return (EINVAL);
1654 		}
1655 		startsec = LE_32(dosp[bootPart].relsect);
1656 		mediasize = LE_32(dosp[bootPart].numsect);
1657 		goto found;
1658 	}
1659 
1660 	if (ldrive == PRIMARY_DOS_DRIVE && primaryPart >= 0) {
1661 		startsec = LE_32(dosp[primaryPart].relsect);
1662 		mediasize = LE_32(dosp[primaryPart].numsect);
1663 		goto found;
1664 	}
1665 
1666 	/*
1667 	 * We are not looking for the C: drive (or the primary drive
1668 	 * was not found), so we had better have an extended partition
1669 	 * or extra drives in the Master FDISK table.
1670 	 */
1671 	if ((extendedPart < 0) && (numExtraDrives == 0)) {
1672 		cmn_err(CE_NOTE, "!pcfs: no extended dos partition");
1673 		noLogicalDrive(ldrive);
1674 		return (EINVAL);
1675 	}
1676 
1677 	if (extendedPart >= 0) {
1678 		diskblk = xstartsect = LE_32(dosp[extendedPart].relsect);
1679 		xnumsect = LE_32(dosp[extendedPart].numsect);
1680 		do {
1681 			/*
1682 			 *  If the seek would not cause us to change
1683 			 *  position on the drive, then we're out of
1684 			 *  extended partitions to examine.
1685 			 */
1686 			if (diskblk == lastseek)
1687 				break;
1688 			logicalDriveCount += numDrives;
1689 			/*
1690 			 *  Seek the next extended partition, and find
1691 			 *  logical drives within it.
1692 			 */
1693 			brelse(*bp);
1694 			/*
1695 			 * bread() block numbers are multiples of DEV_BSIZE
1696 			 * but the device sector size (the unit of partitioning)
1697 			 * might be larger than that; pcfs_get_device_info()
1698 			 * has calculated the multiplicator for us.
1699 			 */
1700 			*bp = bread(dev,
1701 			    pc_dbdaddr(fsp, diskblk), fsp->pcfs_secsize);
1702 			if ((*bp)->b_flags & B_ERROR) {
1703 				return (EIO);
1704 			}
1705 
1706 			lastseek = diskblk;
1707 			COPY_PTBL((*bp)->b_un.b_addr, dosp);
1708 			if (bpb_get_BPBSig((*bp)->b_un.b_addr) != MBB_MAGIC) {
1709 				cmn_err(CE_NOTE, "!pcfs: "
1710 				    "extended partition table signature err, "
1711 				    "device (%x.%x):%d, LBA %u",
1712 				    getmajor(dev), getminor(dev), ldrive,
1713 				    (uint_t)pc_dbdaddr(fsp, diskblk));
1714 				return (EINVAL);
1715 			}
1716 			/*
1717 			 *  Count up drives, and track where the next
1718 			 *  extended partition is in case we need it.  We
1719 			 *  are expecting only one extended partition.  If
1720 			 *  there is more than one we'll only go to the
1721 			 *  first one we see, but warn about ignoring.
1722 			 */
1723 			numDrives = 0;
1724 			for (i = 0; i < FD_NUMPART; i++) {
1725 				DTRACE_PROBE4(extendedpart,
1726 				    struct pcfs *, fsp,
1727 				    uint_t, (uint_t)dosp[i].systid,
1728 				    uint_t, LE_32(dosp[i].relsect),
1729 				    uint_t, LE_32(dosp[i].numsect));
1730 				if (isDosDrive(dosp[i].systid)) {
1731 					extndDrives[numDrives++] = i;
1732 				} else if (isDosExtended(dosp[i].systid)) {
1733 					if (diskblk != lastseek) {
1734 						/*
1735 						 * Already found an extended
1736 						 * partition in this table.
1737 						 */
1738 						cmn_err(CE_NOTE,
1739 						    "!pcfs: ignoring unexpected"
1740 						    " additional extended"
1741 						    " partition");
1742 					} else {
1743 						diskblk = xstartsect +
1744 						    LE_32(dosp[i].relsect);
1745 					}
1746 				}
1747 			}
1748 		} while (ldrive > logicalDriveCount + numDrives);
1749 
1750 		ASSERT(numDrives <= FD_NUMPART);
1751 
1752 		if (ldrive <= logicalDriveCount + numDrives) {
1753 			/*
1754 			 * The number of logical drives we've found thus
1755 			 * far is enough to get us to the one we were
1756 			 * searching for.
1757 			 */
1758 			driveIndex = logicalDriveCount + numDrives - ldrive;
1759 			mediasize =
1760 			    LE_32(dosp[extndDrives[driveIndex]].numsect);
1761 			startsec =
1762 			    LE_32(dosp[extndDrives[driveIndex]].relsect) +
1763 			    lastseek;
1764 			if (startsec > (xstartsect + xnumsect)) {
1765 				cmn_err(CE_NOTE, "!pcfs: extended partition "
1766 				    "values bad");
1767 				return (EINVAL);
1768 			}
1769 			goto found;
1770 		} else {
1771 			/*
1772 			 * We ran out of extended dos partition
1773 			 * drives.  The only hope now is to go
1774 			 * back to extra drives defined in the master
1775 			 * fdisk table.  But we overwrote that table
1776 			 * already, so we must load it in again.
1777 			 */
1778 			logicalDriveCount += numDrives;
1779 			brelse(*bp);
1780 			ASSERT(fsp->pcfs_dosstart == 0);
1781 			*bp = bread(dev, pc_dbdaddr(fsp, fsp->pcfs_dosstart),
1782 			    fsp->pcfs_secsize);
1783 			if ((*bp)->b_flags & B_ERROR) {
1784 				return (EIO);
1785 			}
1786 			COPY_PTBL((*bp)->b_un.b_addr, dosp);
1787 		}
1788 	}
1789 	/*
1790 	 *  Still haven't found the drive, is it an extra
1791 	 *  drive defined in the main FDISK table?
1792 	 */
1793 	if (ldrive <= logicalDriveCount + numExtraDrives) {
1794 		driveIndex = logicalDriveCount + numExtraDrives - ldrive;
1795 		ASSERT(driveIndex < MIN(numExtraDrives, FD_NUMPART));
1796 		mediasize = LE_32(dosp[extraDrives[driveIndex]].numsect);
1797 		startsec = LE_32(dosp[extraDrives[driveIndex]].relsect);
1798 		goto found;
1799 	}
1800 	/*
1801 	 *  Still haven't found the drive, and there is
1802 	 *  nowhere else to look.
1803 	 */
1804 	noLogicalDrive(ldrive);
1805 	return (EINVAL);
1806 
1807 found:
1808 	/*
1809 	 * We need this value in units of sectorsize, because PCFS' internal
1810 	 * offset calculations go haywire for > 512Byte sectors unless all
1811 	 * pcfs_.*start values are in units of sectors.
1812 	 * So, assign before the capacity check (that's done in DEV_BSIZE)
1813 	 */
1814 	fsp->pcfs_dosstart = startsec;
1815 
1816 	/*
1817 	 * convert from device sectors to proper units:
1818 	 *	- starting sector: DEV_BSIZE (as argument to bread())
1819 	 *	- media size: Bytes
1820 	 */
1821 	startsec = pc_dbdaddr(fsp, startsec);
1822 	mediasize *= fsp->pcfs_secsize;
1823 
1824 	/*
1825 	 * some additional validation / warnings in case the partition table
1826 	 * and the actual media capacity are not in accordance ...
1827 	 */
1828 	if (fsp->pcfs_mediasize != 0) {
1829 		diskaddr_t startoff =
1830 		    (diskaddr_t)startsec * (diskaddr_t)DEV_BSIZE;
1831 
1832 		if (startoff >= fsp->pcfs_mediasize ||
1833 		    startoff + mediasize > fsp->pcfs_mediasize) {
1834 			cmn_err(CE_WARN,
1835 			    "!pcfs: partition size (LBA start %u, %lld bytes, "
1836 			    "device (%x.%x):%d) smaller than "
1837 			    "mediasize (%lld bytes).\n"
1838 			    "filesystem may be truncated, access errors "
1839 			    "may result.\n",
1840 			    (uint_t)startsec, (long long)mediasize,
1841 			    getmajor(fsp->pcfs_xdev), getminor(fsp->pcfs_xdev),
1842 			    fsp->pcfs_ldrive, (long long)fsp->pcfs_mediasize);
1843 		}
1844 	} else {
1845 		fsp->pcfs_mediasize = mediasize;
1846 	}
1847 
1848 	return (0);
1849 }
1850 
1851 
1852 static fattype_t
1853 secondaryBPBChecks(struct pcfs *fsp, uchar_t *bpb, size_t secsize)
1854 {
1855 	uint32_t ncl = fsp->pcfs_ncluster;
1856 
1857 	if (ncl <= 4096) {
1858 		if (bpb_get_FatSz16(bpb) == 0)
1859 			return (FAT_UNKNOWN);
1860 
1861 		if (bpb_get_FatSz16(bpb) * secsize < ncl * 2 &&
1862 		    bpb_get_FatSz16(bpb) * secsize >= (3 * ncl / 2))
1863 			return (FAT12);
1864 		if (bcmp(bpb_FilSysType16(bpb), "FAT12", 5) == 0)
1865 			return (FAT12);
1866 		if (bcmp(bpb_FilSysType16(bpb), "FAT16", 5) == 0)
1867 			return (FAT16);
1868 
1869 		switch (bpb_get_Media(bpb)) {
1870 			case SS8SPT:
1871 			case DS8SPT:
1872 			case SS9SPT:
1873 			case DS9SPT:
1874 			case DS18SPT:
1875 			case DS9_15SPT:
1876 				/*
1877 				 * Is this reliable - all floppies are FAT12 ?
1878 				 */
1879 				return (FAT12);
1880 			case MD_FIXED:
1881 				/*
1882 				 * Is this reliable - disks are always FAT16 ?
1883 				 */
1884 				return (FAT16);
1885 			default:
1886 				break;
1887 		}
1888 	} else if (ncl <= 65536) {
1889 		if (bpb_get_FatSz16(bpb) == 0 && bpb_get_FatSz32(bpb) > 0)
1890 			return (FAT32);
1891 		if (VALID_BOOTSIG(bpb_get_BootSig32(bpb)))
1892 			return (FAT32);
1893 		if (VALID_FSTYPSTR32(bpb_FilSysType32(bpb)))
1894 			return (FAT32);
1895 
1896 		if (VALID_BOOTSIG(bpb_get_BootSig16(bpb)))
1897 			return (FAT16);
1898 		if (bpb_get_FatSz16(bpb) * secsize < ncl * 4)
1899 			return (FAT16);
1900 	}
1901 
1902 	/*
1903 	 * We don't know
1904 	 */
1905 	return (FAT_UNKNOWN);
1906 }
1907 
1908 /*
1909  * Check to see if the BPB we found is correct.
1910  *
 * This looks far more complicated than it needs to be for pure structural
1912  * validation. The reason for this is that parseBPB() is also used for
1913  * debugging purposes (mdb dcmd) and we therefore want a bitmap of which
1914  * BPB fields (do not) have 'known good' values, even if we (do not) reject
1915  * the BPB when attempting to mount the filesystem.
1916  *
1917  * Real-world usage of FAT shows there are a lot of corner-case situations
1918  * and, following the specification strictly, invalid filesystems out there.
1919  * Known are situations such as:
1920  *	- FAT12/FAT16 filesystems with garbage in either totsec16/32
1921  *	  instead of the zero in one of the fields mandated by the spec
1922  *	- filesystems that claim to be larger than the partition they're in
1923  *	- filesystems without valid media descriptor
1924  *	- FAT32 filesystems with RootEntCnt != 0
1925  *	- FAT32 filesystems with less than 65526 clusters
1926  *	- FAT32 filesystems without valid FSI sector
1927  *	- FAT32 filesystems with FAT size in fatsec16 instead of fatsec32
1928  *
1929  * Such filesystems are accessible by PCFS - if it'd know to start with that
1930  * the filesystem should be treated as a specific FAT type. Before S10, it
1931  * relied on the PC/fdisk partition type for the purpose and almost completely
1932  * ignored the BPB; now it ignores the partition type for anything else but
1933  * logical drive enumeration, which can result in rejection of (invalid)
1934  * FAT32 - if the partition ID says FAT32, but the filesystem, for example
1935  * has less than 65526 clusters.
1936  *
1937  * Without a "force this fs as FAT{12,16,32}" tunable or mount option, it's
1938  * not possible to allow all such mostly-compliant filesystems in unless one
1939  * accepts false positives (definitely invalid filesystems that cause problems
1940  * later). This at least allows to pinpoint why the mount failed.
1941  *
 * Due to the use of FAT on removable media, all relaxations of the rules
 * here need to be carefully evaluated wrt. potential effects on PCFS
 * resilience. A faulty/"mis-crafted" filesystem must not cause a panic, so
 * beware.
1946  */
1947 static int
1948 parseBPB(struct pcfs *fsp, uchar_t *bpb, int *valid)
1949 {
1950 	fattype_t type;
1951 
1952 	uint32_t	ncl;	/* number of clusters in file area */
1953 	uint32_t	rec;
1954 	uint32_t	reserved;
1955 	uint32_t	fsisec, bkbootsec;
1956 	blkcnt_t	totsec, totsec16, totsec32, datasec;
1957 	size_t		fatsec, fatsec16, fatsec32, rdirsec;
1958 	size_t		secsize;
1959 	len_t		mediasize;
1960 	uint64_t	validflags = 0;
1961 
1962 	if (VALID_BPBSIG(bpb_get_BPBSig(bpb)))
1963 		validflags |= BPB_BPBSIG_OK;
1964 
1965 	rec = bpb_get_RootEntCnt(bpb);
1966 	reserved = bpb_get_RsvdSecCnt(bpb);
1967 	fsisec = bpb_get_FSInfo32(bpb);
1968 	bkbootsec = bpb_get_BkBootSec32(bpb);
1969 	totsec16 = (blkcnt_t)bpb_get_TotSec16(bpb);
1970 	totsec32 = (blkcnt_t)bpb_get_TotSec32(bpb);
1971 	fatsec16 = bpb_get_FatSz16(bpb);
1972 	fatsec32 = bpb_get_FatSz32(bpb);
1973 
1974 	totsec = totsec16 ? totsec16 : totsec32;
1975 	fatsec = fatsec16 ? fatsec16 : fatsec32;
1976 
1977 	secsize = bpb_get_BytesPerSec(bpb);
1978 	if (!VALID_SECSIZE(secsize))
1979 		secsize = fsp->pcfs_secsize;
1980 	if (secsize != fsp->pcfs_secsize) {
1981 		PC_DPRINTF3(3, "!pcfs: parseBPB, device (%x.%x):%d:\n",
1982 		    getmajor(fsp->pcfs_xdev),
1983 		    getminor(fsp->pcfs_xdev), fsp->pcfs_ldrive);
1984 		PC_DPRINTF2(3, "!BPB secsize %d != "
1985 		    "autodetected media block size %d\n",
1986 		    (int)secsize, (int)fsp->pcfs_secsize);
1987 		if (fsp->pcfs_ldrive) {
1988 			/*
1989 			 * We've already attempted to parse the partition
1990 			 * table. If the block size used for that don't match
1991 			 * the PCFS sector size, we're hosed one way or the
1992 			 * other. Just try what happens.
1993 			 */
1994 			secsize = fsp->pcfs_secsize;
1995 			PC_DPRINTF1(3,
1996 			    "!pcfs: Using autodetected secsize %d\n",
1997 			    (int)secsize);
1998 		} else {
1999 			/*
2000 			 * This allows mounting lofi images of PCFS partitions
2001 			 * with sectorsize != DEV_BSIZE. We can't parse the
2002 			 * partition table on whole-disk images unless the
2003 			 * (undocumented) "secsize=..." mount option is used,
2004 			 * but at least this allows us to mount if we have
2005 			 * an image of a partition.
2006 			 */
2007 			PC_DPRINTF1(3,
2008 			    "!pcfs: Using BPB secsize %d\n", (int)secsize);
2009 		}
2010 	}
2011 
2012 	if (fsp->pcfs_mediasize == 0) {
2013 		mediasize = (len_t)totsec * (len_t)secsize;
2014 		/*
2015 		 * This is not an error because not all devices support the
2016 		 * dkio(4I) mediasize queries, and/or not all devices are
2017 		 * partitioned. If we have not been able to figure out the
2018 		 * size of the underlaying medium, we have to trust the BPB.
2019 		 */
2020 		PC_DPRINTF4(3, "!pcfs: parseBPB: mediasize autodetect failed "
2021 		    "on device (%x.%x):%d, trusting BPB totsec (%lld Bytes)\n",
2022 		    getmajor(fsp->pcfs_xdev), getminor(fsp->pcfs_xdev),
2023 		    fsp->pcfs_ldrive, (long long)fsp->pcfs_mediasize);
2024 	} else if ((len_t)totsec * (len_t)secsize > fsp->pcfs_mediasize) {
2025 		cmn_err(CE_WARN,
2026 		    "!pcfs: autodetected mediasize (%lld Bytes) smaller than "
2027 		    "FAT BPB mediasize (%lld Bytes).\n"
2028 		    "truncated filesystem on device (%x.%x):%d, access errors "
2029 		    "possible.\n",
2030 		    (long long)fsp->pcfs_mediasize,
2031 		    (long long)(totsec * (blkcnt_t)secsize),
2032 		    getmajor(fsp->pcfs_xdev), getminor(fsp->pcfs_xdev),
2033 		    fsp->pcfs_ldrive);
2034 		mediasize = fsp->pcfs_mediasize;
2035 	} else {
2036 		/*
2037 		 * This is actually ok. A FAT needs not occupy the maximum
2038 		 * space available in its partition, it can be shorter.
2039 		 */
2040 		mediasize = (len_t)totsec * (len_t)secsize;
2041 	}
2042 
2043 	/*
2044 	 * Since we let just about anything pass through this function,
2045 	 * fence against divide-by-zero here.
2046 	 */
2047 	if (secsize)
2048 		rdirsec = roundup(rec * 32, secsize) / secsize;
2049 	else
2050 		rdirsec = 0;
2051 
2052 	/*
2053 	 * This assignment is necessary before pc_dbdaddr() can first be
2054 	 * used. Must initialize the value here.
2055 	 */
2056 	fsp->pcfs_secsize = secsize;
2057 	fsp->pcfs_sdshift = ddi_ffs(secsize / DEV_BSIZE) - 1;
2058 
2059 	fsp->pcfs_mediasize = mediasize;
2060 
2061 	fsp->pcfs_spcl = bpb_get_SecPerClus(bpb);
2062 	fsp->pcfs_numfat = bpb_get_NumFATs(bpb);
2063 	fsp->pcfs_mediadesc = bpb_get_Media(bpb);
2064 	fsp->pcfs_clsize = secsize * fsp->pcfs_spcl;
2065 	fsp->pcfs_rdirsec = rdirsec;
2066 
2067 	/*
2068 	 * Remember: All PCFS offset calculations in sectors. Before I/O
2069 	 * is done, convert to DEV_BSIZE units via pc_dbdaddr(). This is
2070 	 * necessary so that media with > 512Byte sector sizes work correctly.
2071 	 */
2072 	fsp->pcfs_fatstart = fsp->pcfs_dosstart + reserved;
2073 	fsp->pcfs_rdirstart = fsp->pcfs_fatstart + fsp->pcfs_numfat * fatsec;
2074 	fsp->pcfs_datastart = fsp->pcfs_rdirstart + rdirsec;
2075 	datasec = totsec -
2076 	    (blkcnt_t)fatsec * fsp->pcfs_numfat -
2077 	    (blkcnt_t)rdirsec -
2078 	    (blkcnt_t)reserved;
2079 
2080 	DTRACE_PROBE4(fatgeometry,
2081 	    blkcnt_t, totsec, size_t, fatsec,
2082 	    size_t, rdirsec, blkcnt_t, datasec);
2083 
2084 	/*
2085 	 * 'totsec' is taken directly from the BPB and guaranteed to fit
2086 	 * into a 32bit unsigned integer. The calculation of 'datasec',
2087 	 * on the other hand, could underflow for incorrect values in
2088 	 * rdirsec/reserved/fatsec. Check for that.
2089 	 * We also check that the BPB conforms to the FAT specification's
2090 	 * requirement that either of the 16/32bit total sector counts
2091 	 * must be zero.
2092 	 */
2093 	if (totsec != 0 &&
2094 	    (totsec16 == totsec32 || totsec16 == 0 || totsec32 == 0) &&
2095 	    datasec < totsec && datasec <= UINT32_MAX)
2096 		validflags |= BPB_TOTSEC_OK;
2097 
2098 	if ((len_t)totsec * (len_t)secsize <= mediasize)
2099 		validflags |= BPB_MEDIASZ_OK;
2100 
2101 	if (VALID_SECSIZE(secsize))
2102 		validflags |= BPB_SECSIZE_OK;
2103 	if (VALID_SPCL(fsp->pcfs_spcl))
2104 		validflags |= BPB_SECPERCLUS_OK;
2105 	if (VALID_CLSIZE(fsp->pcfs_clsize))
2106 		validflags |= BPB_CLSIZE_OK;
2107 	if (VALID_NUMFATS(fsp->pcfs_numfat))
2108 		validflags |= BPB_NUMFAT_OK;
2109 	if (VALID_RSVDSEC(reserved) && reserved < totsec)
2110 		validflags |= BPB_RSVDSECCNT_OK;
2111 	if (VALID_MEDIA(fsp->pcfs_mediadesc))
2112 		validflags |= BPB_MEDIADESC_OK;
2113 	if (VALID_BOOTSIG(bpb_get_BootSig16(bpb)))
2114 		validflags |= BPB_BOOTSIG16_OK;
2115 	if (VALID_BOOTSIG(bpb_get_BootSig32(bpb)))
2116 		validflags |= BPB_BOOTSIG32_OK;
2117 	if (VALID_FSTYPSTR16(bpb_FilSysType16(bpb)))
2118 		validflags |= BPB_FSTYPSTR16_OK;
2119 	if (VALID_FSTYPSTR32(bpb_FilSysType32(bpb)))
2120 		validflags |= BPB_FSTYPSTR32_OK;
2121 	if (VALID_OEMNAME(bpb_OEMName(bpb)))
2122 		validflags |= BPB_OEMNAME_OK;
2123 	if (bkbootsec > 0 && bkbootsec <= reserved && fsisec != bkbootsec)
2124 		validflags |= BPB_BKBOOTSEC_OK;
2125 	if (fsisec > 0 && fsisec <= reserved)
2126 		validflags |= BPB_FSISEC_OK;
2127 	if (VALID_JMPBOOT(bpb_jmpBoot(bpb)))
2128 		validflags |= BPB_JMPBOOT_OK;
2129 	if (VALID_FSVER32(bpb_get_FSVer32(bpb)))
2130 		validflags |= BPB_FSVER_OK;
2131 	if (VALID_VOLLAB(bpb_VolLab16(bpb)))
2132 		validflags |= BPB_VOLLAB16_OK;
2133 	if (VALID_VOLLAB(bpb_VolLab32(bpb)))
2134 		validflags |= BPB_VOLLAB32_OK;
2135 	if (VALID_EXTFLAGS(bpb_get_ExtFlags32(bpb)))
2136 		validflags |= BPB_EXTFLAGS_OK;
2137 
2138 	/*
2139 	 * Try to determine which FAT format to use.
2140 	 *
2141 	 * Calculate the number of clusters in order to determine
2142 	 * the type of FAT we are looking at.  This is the only
2143 	 * recommended way of determining FAT type, though there
2144 	 * are other hints in the data, this is the best way.
2145 	 *
2146 	 * Since we let just about "anything" pass through this function
2147 	 * without early exits, fence against divide-by-zero here.
2148 	 *
2149 	 * datasec was already validated against UINT32_MAX so we know
2150 	 * the result will not overflow the 32bit calculation.
2151 	 */
2152 	if (fsp->pcfs_spcl)
2153 		ncl = (uint32_t)datasec / fsp->pcfs_spcl;
2154 	else
2155 		ncl = 0;
2156 
2157 	fsp->pcfs_ncluster = ncl;
2158 
2159 	/*
2160 	 * From the Microsoft FAT specification:
2161 	 * In the following example, when it says <, it does not mean <=.
2162 	 * Note also that the numbers are correct.  The first number for
2163 	 * FAT12 is 4085; the second number for FAT16 is 65525. These numbers
2164 	 * and the '<' signs are not wrong.
2165 	 *
2166 	 * We "specialdetect" the corner cases, and use at least one "extra"
2167 	 * criterion to decide whether it's FAT16 or FAT32 if the cluster
2168 	 * count is dangerously close to the boundaries.
2169 	 */
2170 
2171 	if (ncl <= PCF_FIRSTCLUSTER) {
2172 		type = FAT_UNKNOWN;
2173 	} else if (ncl < 4085) {
2174 		type = FAT12;
2175 	} else if (ncl <= 4096) {
2176 		type = FAT_QUESTIONABLE;
2177 	} else if (ncl < 65525) {
2178 		type = FAT16;
2179 	} else if (ncl <= 65536) {
2180 		type = FAT_QUESTIONABLE;
2181 	} else if (ncl < PCF_LASTCLUSTER32) {
2182 		type = FAT32;
2183 	} else {
2184 		type = FAT_UNKNOWN;
2185 	}
2186 
2187 	DTRACE_PROBE4(parseBPB__initial,
2188 	    struct pcfs *, fsp, unsigned char *, bpb,
2189 	    int, validflags, fattype_t, type);
2190 
2191 recheck:
2192 	fsp->pcfs_fatsec = fatsec;
2193 
2194 	/* Do some final sanity checks for each specific type of FAT */
2195 	switch (type) {
2196 		case FAT12:
2197 			if (rec != 0)
2198 				validflags |= BPB_ROOTENTCNT_OK;
2199 			if ((blkcnt_t)bpb_get_TotSec16(bpb) == totsec ||
2200 			    bpb_get_TotSec16(bpb) == 0)
2201 				validflags |= BPB_TOTSEC16_OK;
2202 			if ((blkcnt_t)bpb_get_TotSec32(bpb) == totsec ||
2203 			    bpb_get_TotSec32(bpb) == 0)
2204 				validflags |= BPB_TOTSEC32_OK;
2205 			if (bpb_get_FatSz16(bpb) == fatsec)
2206 				validflags |= BPB_FATSZ16_OK;
2207 			if (fatsec * secsize >= (ncl + PCF_FIRSTCLUSTER)
2208 			    * 3 / 2)
2209 				validflags |= BPB_FATSZ_OK;
2210 			if (ncl < 4085)
2211 				validflags |= BPB_NCLUSTERS_OK;
2212 
2213 			fsp->pcfs_lastclmark = (PCF_LASTCLUSTER & 0xfff);
2214 			fsp->pcfs_rootblksize =
2215 			    fsp->pcfs_rdirsec * secsize;
2216 			fsp->pcfs_fsistart = 0;
2217 
2218 			if ((validflags & FAT12_VALIDMSK) != FAT12_VALIDMSK)
2219 				type = FAT_UNKNOWN;
2220 			break;
2221 		case FAT16:
2222 			if (rec != 0)
2223 				validflags |= BPB_ROOTENTCNT_OK;
2224 			if ((blkcnt_t)bpb_get_TotSec16(bpb) == totsec ||
2225 			    bpb_get_TotSec16(bpb) == 0)
2226 				validflags |= BPB_TOTSEC16_OK;
2227 			if ((blkcnt_t)bpb_get_TotSec32(bpb) == totsec ||
2228 			    bpb_get_TotSec32(bpb) == 0)
2229 				validflags |= BPB_TOTSEC32_OK;
2230 			if (bpb_get_FatSz16(bpb) == fatsec)
2231 				validflags |= BPB_FATSZ16_OK;
2232 			if (fatsec * secsize >= (ncl + PCF_FIRSTCLUSTER) * 2)
2233 				validflags |= BPB_FATSZ_OK;
2234 			if (ncl >= 4085 && ncl < 65525)
2235 				validflags |= BPB_NCLUSTERS_OK;
2236 
2237 			fsp->pcfs_lastclmark = PCF_LASTCLUSTER;
2238 			fsp->pcfs_rootblksize =
2239 			    fsp->pcfs_rdirsec * secsize;
2240 			fsp->pcfs_fsistart = 0;
2241 
2242 			if ((validflags & FAT16_VALIDMSK) != FAT16_VALIDMSK)
2243 				type = FAT_UNKNOWN;
2244 			break;
2245 		case FAT32:
2246 			if (rec == 0)
2247 				validflags |= BPB_ROOTENTCNT_OK;
2248 			if (bpb_get_TotSec16(bpb) == 0)
2249 				validflags |= BPB_TOTSEC16_OK;
2250 			if ((blkcnt_t)bpb_get_TotSec32(bpb) == totsec)
2251 				validflags |= BPB_TOTSEC32_OK;
2252 			if (bpb_get_FatSz16(bpb) == 0)
2253 				validflags |= BPB_FATSZ16_OK;
2254 			if (bpb_get_FatSz32(bpb) == fatsec)
2255 				validflags |= BPB_FATSZ32_OK;
2256 			if (fatsec * secsize >= (ncl + PCF_FIRSTCLUSTER) * 4)
2257 				validflags |= BPB_FATSZ_OK;
2258 			if (ncl >= 65525 && ncl < PCF_LASTCLUSTER32)
2259 				validflags |= BPB_NCLUSTERS_OK;
2260 
2261 			fsp->pcfs_lastclmark = PCF_LASTCLUSTER32;
2262 			fsp->pcfs_rootblksize = fsp->pcfs_clsize;
2263 			fsp->pcfs_fsistart = fsp->pcfs_dosstart + fsisec;
2264 			if (validflags & BPB_FSISEC_OK)
2265 				fsp->pcfs_flags |= PCFS_FSINFO_OK;
2266 			fsp->pcfs_rootclnum = bpb_get_RootClus32(bpb);
2267 			if (pc_validcl(fsp, fsp->pcfs_rootclnum))
2268 				validflags |= BPB_ROOTCLUSTER_OK;
2269 
2270 			/*
2271 			 * Current PCFS code only works if 'pcfs_rdirstart'
2272 			 * contains the root cluster number on FAT32.
2273 			 * That's a mis-use and would better be changed.
2274 			 */
2275 			fsp->pcfs_rdirstart = (daddr_t)fsp->pcfs_rootclnum;
2276 
2277 			if ((validflags & FAT32_VALIDMSK) != FAT32_VALIDMSK)
2278 				type = FAT_UNKNOWN;
2279 			break;
2280 		case FAT_QUESTIONABLE:
2281 			type = secondaryBPBChecks(fsp, bpb, secsize);
2282 			goto recheck;
2283 		default:
2284 			ASSERT(type == FAT_UNKNOWN);
2285 			break;
2286 	}
2287 
2288 	ASSERT(type != FAT_QUESTIONABLE);
2289 
2290 	fsp->pcfs_fattype = type;
2291 
2292 	if (valid)
2293 		*valid = validflags;
2294 
2295 	DTRACE_PROBE4(parseBPB__final,
2296 	    struct pcfs *, fsp, unsigned char *, bpb,
2297 	    int, validflags, fattype_t, type);
2298 
2299 	if (type != FAT_UNKNOWN) {
2300 		ASSERT((secsize & (DEV_BSIZE - 1)) == 0);
2301 		ASSERT(ISP2(secsize / DEV_BSIZE));
2302 		return (1);
2303 	}
2304 
2305 	return (0);
2306 }
2307 
2308 
2309 /*
2310  * Detect the device's native block size (sector size).
2311  *
2312  * Test whether the device is:
2313  *	- a floppy device from a known controller type via DKIOCINFO
2314  *	- a real floppy using the fd(4D) driver and capable of fdio(4I) ioctls
2315  *	- a USB floppy drive (identified by drive geometry)
2316  *
2317  * Detecting a floppy will make PCFS metadata updates on such media synchronous,
2318  * to minimize risks due to slow I/O and user hotplugging / device ejection.
2319  *
2320  * This might be a bit wasteful on kernel stack space; if anyone's
2321  * bothered by this, kmem_alloc/kmem_free the ioctl arguments...
2322  */
2323 static void
2324 pcfs_device_getinfo(struct pcfs *fsp)
2325 {
2326 	dev_t			rdev = fsp->pcfs_xdev;
2327 	int			error;
2328 	union {
2329 		struct dk_minfo		mi;
2330 		struct dk_cinfo		ci;
2331 		struct dk_geom		gi;
2332 		struct fd_char		fc;
2333 	} arg;				/* save stackspace ... */
2334 	intptr_t argp = (intptr_t)&arg;
2335 	ldi_handle_t		lh;
2336 	ldi_ident_t		li;
2337 	int isfloppy, isremoveable, ishotpluggable;
2338 	cred_t			*cr = CRED();
2339 
2340 	if (ldi_ident_from_dev(rdev, &li))
2341 		goto out;
2342 
2343 	error = ldi_open_by_dev(&rdev, OTYP_CHR, FREAD, cr, &lh, li);
2344 	ldi_ident_release(li);
2345 	if (error)
2346 		goto out;
2347 
2348 	/*
2349 	 * Not sure if this could possibly happen. It'd be a bit like
2350 	 * VOP_OPEN() changing the passed-in vnode ptr. We're just not
2351 	 * expecting it, needs some thought if triggered ...
2352 	 */
2353 	ASSERT(fsp->pcfs_xdev == rdev);
2354 
2355 	/*
2356 	 * Check for removeable/hotpluggable media.
2357 	 */
2358 	if (ldi_ioctl(lh, DKIOCREMOVABLE,
2359 	    (intptr_t)&isremoveable, FKIOCTL, cr, NULL)) {
2360 		isremoveable = 0;
2361 	}
2362 	if (ldi_ioctl(lh, DKIOCHOTPLUGGABLE,
2363 	    (intptr_t)&ishotpluggable, FKIOCTL, cr, NULL)) {
2364 		ishotpluggable = 0;
2365 	}
2366 
2367 	/*
2368 	 * Make sure we don't use "half-initialized" values if the ioctls fail.
2369 	 */
2370 	if (ldi_ioctl(lh, DKIOCGMEDIAINFO, argp, FKIOCTL, cr, NULL)) {
2371 		bzero(&arg, sizeof (arg));
2372 		fsp->pcfs_mediasize = 0;
2373 	} else {
2374 		fsp->pcfs_mediasize =
2375 		    (len_t)arg.mi.dki_lbsize *
2376 		    (len_t)arg.mi.dki_capacity;
2377 	}
2378 
2379 	if (VALID_SECSIZE(arg.mi.dki_lbsize)) {
2380 		if (fsp->pcfs_secsize == 0) {
2381 			fsp->pcfs_secsize = arg.mi.dki_lbsize;
2382 			fsp->pcfs_sdshift =
2383 			    ddi_ffs(arg.mi.dki_lbsize / DEV_BSIZE) - 1;
2384 		} else {
2385 			PC_DPRINTF4(1, "!pcfs: autodetected media block size "
2386 			    "%d, device (%x.%x), different from user-provided "
2387 			    "%d. User override - ignoring autodetect result.\n",
2388 			    arg.mi.dki_lbsize,
2389 			    getmajor(fsp->pcfs_xdev), getminor(fsp->pcfs_xdev),
2390 			    fsp->pcfs_secsize);
2391 		}
2392 	} else if (arg.mi.dki_lbsize) {
2393 		PC_DPRINTF3(1, "!pcfs: autodetected media block size "
2394 		    "%d, device (%x.%x), invalid (not 512, 1024, 2048, 4096). "
2395 		    "Ignoring autodetect result.\n",
2396 		    arg.mi.dki_lbsize,
2397 		    getmajor(fsp->pcfs_xdev), getminor(fsp->pcfs_xdev));
2398 	}
2399 
2400 	/*
2401 	 * We treat the following media types as a floppy by default.
2402 	 */
2403 	isfloppy =
2404 	    (arg.mi.dki_media_type == DK_FLOPPY ||
2405 	    arg.mi.dki_media_type == DK_ZIP ||
2406 	    arg.mi.dki_media_type == DK_JAZ);
2407 
2408 	/*
2409 	 * if this device understands fdio(4I) requests it's
2410 	 * obviously a floppy drive.
2411 	 */
2412 	if (!isfloppy &&
2413 	    !ldi_ioctl(lh, FDIOGCHAR, argp, FKIOCTL, cr, NULL))
2414 		isfloppy = 1;
2415 
2416 	/*
2417 	 * some devices we like to treat as floppies, but they don't
2418 	 * understand fdio(4I) requests.
2419 	 */
2420 	if (!isfloppy &&
2421 	    !ldi_ioctl(lh, DKIOCINFO, argp, FKIOCTL, cr, NULL) &&
2422 	    (arg.ci.dki_ctype == DKC_WDC2880 ||
2423 	    arg.ci.dki_ctype == DKC_NCRFLOPPY ||
2424 	    arg.ci.dki_ctype == DKC_SMSFLOPPY ||
2425 	    arg.ci.dki_ctype == DKC_INTEL82077))
2426 		isfloppy = 1;
2427 
2428 	/*
2429 	 * This is the "final fallback" test - media with
2430 	 * 2 heads and 80 cylinders are assumed to be floppies.
2431 	 * This is normally true for USB floppy drives ...
2432 	 */
2433 	if (!isfloppy &&
2434 	    !ldi_ioctl(lh, DKIOCGGEOM, argp, FKIOCTL, cr, NULL) &&
2435 	    (arg.gi.dkg_ncyl == 80 && arg.gi.dkg_nhead == 2))
2436 		isfloppy = 1;
2437 
2438 	/*
2439 	 * This is similar to the "old" PCFS code that sets this flag
2440 	 * just based on the media descriptor being 0xf8 (MD_FIXED).
2441 	 * Should be re-worked. We really need some specialcasing for
2442 	 * removeable media.
2443 	 */
2444 	if (!isfloppy) {
2445 		fsp->pcfs_flags |= PCFS_NOCHK;
2446 	}
2447 
2448 	/*
2449 	 * We automatically disable access time updates if the medium is
2450 	 * removeable and/or hotpluggable, and the admin did not explicitly
2451 	 * request access time updates (via the "atime" mount option).
2452 	 * The majority of flash-based media should fit this category.
2453 	 * Minimizing write access extends the lifetime of your memory stick !
2454 	 */
2455 	if (!vfs_optionisset(fsp->pcfs_vfs, MNTOPT_ATIME, NULL) &&
2456 	    (isremoveable || ishotpluggable | isfloppy)) {
2457 		fsp->pcfs_flags |= PCFS_NOATIME;
2458 	}
2459 
2460 	(void) ldi_close(lh, FREAD, cr);
2461 out:
2462 	if (fsp->pcfs_secsize == 0) {
2463 		PC_DPRINTF3(1, "!pcfs: media block size autodetection "
2464 		    "device (%x.%x) failed, no user-provided fallback. "
2465 		    "Using %d bytes.\n",
2466 		    getmajor(fsp->pcfs_xdev), getminor(fsp->pcfs_xdev),
2467 		    DEV_BSIZE);
2468 		fsp->pcfs_secsize = DEV_BSIZE;
2469 		fsp->pcfs_sdshift = 0;
2470 	}
2471 	ASSERT(fsp->pcfs_secsize % DEV_BSIZE == 0);
2472 	ASSERT(VALID_SECSIZE(fsp->pcfs_secsize));
2473 }
2474 
2475 /*
2476  * Get the FAT type for the DOS medium.
2477  *
2478  * -------------------------
2479  * According to Microsoft:
2480  *   The FAT type one of FAT12, FAT16, or FAT32 is determined by the
2481  * count of clusters on the volume and nothing else.
2482  * -------------------------
2483  *
2484  */
2485 static int
2486 pc_getfattype(struct pcfs *fsp)
2487 {
2488 	int error = 0;
2489 	buf_t *bp = NULL;
2490 	struct vnode *devvp = fsp->pcfs_devvp;
2491 	dev_t	dev = devvp->v_rdev;
2492 
2493 	/*
2494 	 * Detect the native block size of the medium, and attempt to
2495 	 * detect whether the medium is removeable.
2496 	 * We do treat removable media (floppies, USB and FireWire disks)
2497 	 * differently wrt. to the frequency and synchronicity of FAT updates.
2498 	 * We need to know the media block size in order to be able to
2499 	 * parse the partition table.
2500 	 */
2501 	pcfs_device_getinfo(fsp);
2502 
2503 	/*
2504 	 * Unpartitioned media (floppies and some removeable devices)
2505 	 * don't have a partition table, the FAT BPB is at disk block 0.
2506 	 * Start out by reading block 0.
2507 	 */
2508 	fsp->pcfs_dosstart = 0;
2509 	bp = bread(dev, pc_dbdaddr(fsp, fsp->pcfs_dosstart), fsp->pcfs_secsize);
2510 
2511 	if (error = geterror(bp))
2512 		goto out;
2513 
2514 	/*
2515 	 * If a logical drive number is requested, parse the partition table
2516 	 * and attempt to locate it. Otherwise, proceed immediately to the
2517 	 * BPB check. findTheDrive(), if successful, returns the disk block
2518 	 * number where the requested partition starts in "startsec".
2519 	 */
2520 	if (fsp->pcfs_ldrive != 0) {
2521 		PC_DPRINTF3(5, "!pcfs: pc_getfattype: using FDISK table on "
2522 		    "device (%x,%x):%d to find BPB\n",
2523 		    getmajor(dev), getminor(dev), fsp->pcfs_ldrive);
2524 
2525 		if (error = findTheDrive(fsp, &bp))
2526 			goto out;
2527 
2528 		ASSERT(fsp->pcfs_dosstart != 0);
2529 
2530 		brelse(bp);
2531 		bp = bread(dev, pc_dbdaddr(fsp, fsp->pcfs_dosstart),
2532 		    fsp->pcfs_secsize);
2533 		if (error = geterror(bp))
2534 			goto out;
2535 	}
2536 
2537 	/*
2538 	 * Validate the BPB and fill in the instance structure.
2539 	 */
2540 	if (!parseBPB(fsp, (uchar_t *)bp->b_un.b_addr, NULL)) {
2541 		PC_DPRINTF4(1, "!pcfs: pc_getfattype: No FAT BPB on "
2542 		    "device (%x.%x):%d, disk LBA %u\n",
2543 		    getmajor(dev), getminor(dev), fsp->pcfs_ldrive,
2544 		    (uint_t)pc_dbdaddr(fsp, fsp->pcfs_dosstart));
2545 		error = EINVAL;
2546 		goto out;
2547 	}
2548 
2549 	ASSERT(fsp->pcfs_fattype != FAT_UNKNOWN);
2550 
2551 out:
2552 	/*
2553 	 * Release the buffer used
2554 	 */
2555 	if (bp != NULL)
2556 		brelse(bp);
2557 	return (error);
2558 }
2559 
2560 
2561 /*
2562  * Get the file allocation table.
2563  * If there is an old FAT, invalidate it.
2564  */
2565 int
2566 pc_getfat(struct pcfs *fsp)
2567 {
2568 	struct buf *bp = NULL;
2569 	uchar_t *fatp = NULL;
2570 	uchar_t *fat_changemap = NULL;
2571 	int error;
2572 	int fat_changemapsize;
2573 	int flags = 0;
2574 	int nfat;
2575 	int altfat_mustmatch = 0;
2576 	int fatsize = fsp->pcfs_fatsec * fsp->pcfs_secsize;
2577 
2578 	if (fsp->pcfs_fatp) {
2579 		/*
2580 		 * There is a FAT in core.
2581 		 * If there are open file pcnodes or we have modified it or
2582 		 * it hasn't timed out yet use the in core FAT.
2583 		 * Otherwise invalidate it and get a new one
2584 		 */
2585 #ifdef notdef
2586 		if (fsp->pcfs_frefs ||
2587 		    (fsp->pcfs_flags & PCFS_FATMOD) ||
2588 		    (gethrestime_sec() < fsp->pcfs_fattime)) {
2589 			return (0);
2590 		} else {
2591 			mutex_enter(&pcfslock);
2592 			pc_invalfat(fsp);
2593 			mutex_exit(&pcfslock);
2594 		}
2595 #endif /* notdef */
2596 		return (0);
2597 	}
2598 
2599 	/*
2600 	 * Get FAT and check it for validity
2601 	 */
2602 	fatp = kmem_alloc(fatsize, KM_SLEEP);
2603 	error = pc_readfat(fsp, fatp);
2604 	if (error) {
2605 		flags = B_ERROR;
2606 		goto out;
2607 	}
2608 	fat_changemapsize = (fatsize / fsp->pcfs_clsize) + 1;
2609 	fat_changemap = kmem_zalloc(fat_changemapsize, KM_SLEEP);
2610 	fsp->pcfs_fatp = fatp;
2611 	fsp->pcfs_fat_changemapsize = fat_changemapsize;
2612 	fsp->pcfs_fat_changemap = fat_changemap;
2613 
2614 	/*
2615 	 * The only definite signature check is that the
2616 	 * media descriptor byte should match the first byte
2617 	 * of the FAT block.
2618 	 */
2619 	if (fatp[0] != fsp->pcfs_mediadesc) {
2620 		cmn_err(CE_NOTE, "!pcfs: FAT signature mismatch, "
2621 		    "media descriptor %x, FAT[0] lowbyte %x\n",
2622 		    (uint32_t)fsp->pcfs_mediadesc, (uint32_t)fatp[0]);
2623 		cmn_err(CE_NOTE, "!pcfs: Enforcing alternate FAT validation\n");
2624 		altfat_mustmatch = 1;
2625 	}
2626 
2627 	/*
2628 	 * Get alternate FATs and check for consistency
2629 	 * This is an inlined version of pc_readfat().
2630 	 * Since we're only comparing FAT and alternate FAT,
2631 	 * there's no reason to let pc_readfat() copy data out
2632 	 * of the buf. Instead, compare in-situ, one cluster
2633 	 * at a time.
2634 	 */
2635 	for (nfat = 1; nfat < fsp->pcfs_numfat; nfat++) {
2636 		size_t startsec;
2637 		size_t off;
2638 
2639 		startsec = pc_dbdaddr(fsp,
2640 		    fsp->pcfs_fatstart + nfat * fsp->pcfs_fatsec);
2641 
2642 		for (off = 0; off < fatsize; off += fsp->pcfs_clsize) {
2643 			daddr_t fatblk = startsec + pc_dbdaddr(fsp,
2644 			    pc_cltodb(fsp, pc_lblkno(fsp, off)));
2645 
2646 			bp = bread(fsp->pcfs_xdev, fatblk,
2647 			    MIN(fsp->pcfs_clsize, fatsize - off));
2648 			if (bp->b_flags & (B_ERROR | B_STALE)) {
2649 				cmn_err(CE_NOTE,
2650 				    "!pcfs: alternate FAT #%d (start LBA %p)"
2651 				    " read error at offset %ld on device"
2652 				    " (%x.%x):%d",
2653 				    nfat, (void *)(uintptr_t)startsec, off,
2654 				    getmajor(fsp->pcfs_xdev),
2655 				    getminor(fsp->pcfs_xdev),
2656 				    fsp->pcfs_ldrive);
2657 				flags = B_ERROR;
2658 				error = EIO;
2659 				goto out;
2660 			}
2661 			bp->b_flags |= B_STALE | B_AGE;
2662 			if (bcmp(bp->b_un.b_addr, fatp + off,
2663 			    MIN(fsp->pcfs_clsize, fatsize - off))) {
2664 				cmn_err(CE_NOTE,
2665 				    "!pcfs: alternate FAT #%d (start LBA %p)"
2666 				    " corrupted at offset %ld on device"
2667 				    " (%x.%x):%d",
2668 				    nfat, (void *)(uintptr_t)startsec, off,
2669 				    getmajor(fsp->pcfs_xdev),
2670 				    getminor(fsp->pcfs_xdev),
2671 				    fsp->pcfs_ldrive);
2672 				if (altfat_mustmatch) {
2673 					flags = B_ERROR;
2674 					error = EIO;
2675 					goto out;
2676 				}
2677 			}
2678 			brelse(bp);
2679 			bp = NULL;	/* prevent double release */
2680 		}
2681 	}
2682 
2683 	fsp->pcfs_fattime = gethrestime_sec() + PCFS_DISKTIMEOUT;
2684 	fsp->pcfs_fatjustread = 1;
2685 
2686 	/*
2687 	 * Retrieve FAT32 fsinfo sector.
2688 	 * A failure to read this is not fatal to accessing the volume.
2689 	 * It simply means operations that count or search free blocks
2690 	 * will have to do a full FAT walk, vs. a possibly quicker lookup
2691 	 * of the summary information.
2692 	 * Hence, we log a message but return success overall after this point.
2693 	 */
2694 	if (IS_FAT32(fsp) && (fsp->pcfs_flags & PCFS_FSINFO_OK)) {
2695 		struct fat_od_fsi *fsinfo_disk;
2696 
2697 		bp = bread(fsp->pcfs_xdev,
2698 		    pc_dbdaddr(fsp, fsp->pcfs_fsistart), fsp->pcfs_secsize);
2699 		fsinfo_disk = (struct fat_od_fsi *)bp->b_un.b_addr;
2700 		if (bp->b_flags & (B_ERROR | B_STALE) ||
2701 		    !FSISIG_OK(fsinfo_disk)) {
2702 			cmn_err(CE_NOTE,
2703 			    "!pcfs: error reading fat32 fsinfo from "
2704 			    "device (%x.%x):%d, block %lld",
2705 			    getmajor(fsp->pcfs_xdev), getminor(fsp->pcfs_xdev),
2706 			    fsp->pcfs_ldrive,
2707 			    (long long)pc_dbdaddr(fsp, fsp->pcfs_fsistart));
2708 			fsp->pcfs_flags &= ~PCFS_FSINFO_OK;
2709 			fsp->pcfs_fsinfo.fs_free_clusters = FSINFO_UNKNOWN;
2710 			fsp->pcfs_fsinfo.fs_next_free = FSINFO_UNKNOWN;
2711 		} else {
2712 			bp->b_flags |= B_STALE | B_AGE;
2713 			fsinfo_disk = (fat_od_fsi_t *)(bp->b_un.b_addr);
2714 			fsp->pcfs_fsinfo.fs_free_clusters =
2715 			    LE_32(fsinfo_disk->fsi_incore.fs_free_clusters);
2716 			fsp->pcfs_fsinfo.fs_next_free =
2717 			    LE_32(fsinfo_disk->fsi_incore.fs_next_free);
2718 		}
2719 		brelse(bp);
2720 		bp = NULL;
2721 	}
2722 
2723 	if (pc_validcl(fsp, (pc_cluster32_t)fsp->pcfs_fsinfo.fs_next_free))
2724 		fsp->pcfs_nxfrecls = fsp->pcfs_fsinfo.fs_next_free;
2725 	else
2726 		fsp->pcfs_nxfrecls = PCF_FIRSTCLUSTER;
2727 
2728 	return (0);
2729 
2730 out:
2731 	cmn_err(CE_NOTE, "!pcfs: illegal disk format");
2732 	if (bp)
2733 		brelse(bp);
2734 	if (fatp)
2735 		kmem_free(fatp, fatsize);
2736 	if (fat_changemap)
2737 		kmem_free(fat_changemap, fat_changemapsize);
2738 
2739 	if (flags) {
2740 		pc_mark_irrecov(fsp);
2741 	}
2742 	return (error);
2743 }
2744