xref: /freebsd/sys/kern/subr_disk.c (revision 4b2eaea43fec8e8792be611dea204071a10b655a)
1 /*
2  * ----------------------------------------------------------------------------
3  * "THE BEER-WARE LICENSE" (Revision 42):
4  * <phk@FreeBSD.ORG> wrote this file.  As long as you retain this notice you
5  * can do whatever you want with this stuff. If we meet some day, and you think
6  * this stuff is worth it, you can buy me a beer in return.   Poul-Henning Kamp
7  * ----------------------------------------------------------------------------
8  *
9  * $FreeBSD$
10  *
11  */
12 
13 #include "opt_geom.h"
14 
15 #include <sys/param.h>
16 #include <sys/systm.h>
17 #include <sys/stdint.h>
18 #include <sys/bio.h>
19 #include <sys/conf.h>
20 #include <sys/disk.h>
21 #include <sys/disklabel.h>
22 #ifdef NO_GEOM
23 #include <sys/diskslice.h>
24 #include <sys/kernel.h>
25 #include <sys/malloc.h>
26 #include <sys/sysctl.h>
27 #include <machine/md_var.h>
28 #include <sys/ctype.h>
29 
30 static MALLOC_DEFINE(M_DISK, "disk", "disk data");
31 
32 static d_strategy_t diskstrategy;
33 static d_open_t diskopen;
34 static d_close_t diskclose;
35 static d_ioctl_t diskioctl;
36 static d_psize_t diskpsize;
37 
38 static LIST_HEAD(, disk) disklist = LIST_HEAD_INITIALIZER(&disklist);
39 
40 void disk_dev_synth(dev_t dev);
41 
42 void
43 disk_dev_synth(dev_t dev)
44 {
45 	struct disk *dp;
46 	int u, s, p;
47 	dev_t pdev;
48 
49 	if (dksparebits(dev))
50 		return;
51 	LIST_FOREACH(dp, &disklist, d_list) {
52 		if (major(dev) != dp->d_devsw->d_maj)
53 			continue;
54 		u = dkunit(dev);
55 		p = RAW_PART;
56 		s = WHOLE_DISK_SLICE;
57 		pdev = makedev(dp->d_devsw->d_maj, dkmakeminor(u, s, p));
58 		if (pdev->si_devsw == NULL)
59 			return;		/* Probably a unit we don't have */
60 		s = dkslice(dev);
61 		p = dkpart(dev);
62 		if (s == WHOLE_DISK_SLICE && p == RAW_PART) {
63 			/* XXX: actually should not happen */
64 			dev = make_dev(pdev->si_devsw, dkmakeminor(u, s, p),
65 			    UID_ROOT, GID_OPERATOR, 0640, "%s%d",
66 				dp->d_devsw->d_name, u);
67 			dev_depends(pdev, dev);
68 			return;
69 		}
70 		if (s == COMPATIBILITY_SLICE) {
71 			dev = make_dev(pdev->si_devsw, dkmakeminor(u, s, p),
72 			    UID_ROOT, GID_OPERATOR, 0640, "%s%d%c",
73 				dp->d_devsw->d_name, u, 'a' + p);
74 			dev_depends(pdev, dev);
75 			return;
76 		}
77 		if (p != RAW_PART) {
78 			dev = make_dev(pdev->si_devsw, dkmakeminor(u, s, p),
79 			    UID_ROOT, GID_OPERATOR, 0640, "%s%ds%d%c",
80 				dp->d_devsw->d_name, u, s - BASE_SLICE + 1,
81 				'a' + p);
82 		} else {
83 			dev = make_dev(pdev->si_devsw, dkmakeminor(u, s, p),
84 			    UID_ROOT, GID_OPERATOR, 0640, "%s%ds%d",
85 				dp->d_devsw->d_name, u, s - BASE_SLICE + 1);
86 			make_dev_alias(dev, "%s%ds%dc",
87 			    dp->d_devsw->d_name, u, s - BASE_SLICE + 1);
88 		}
89 		dev_depends(pdev, dev);
90 		return;
91 	}
92 }
93 
94 static void
95 disk_clone(void *arg, char *name, int namelen, dev_t *dev)
96 {
97 	struct disk *dp;
98 	char const *d;
99 	char *e;
100 	int j, u, s, p;
101 	dev_t pdev;
102 
103 	if (*dev != NODEV)
104 		return;
105 
106 	LIST_FOREACH(dp, &disklist, d_list) {
107 		d = dp->d_devsw->d_name;
108 		j = dev_stdclone(name, &e, d, &u);
109 		if (j == 0)
110 			continue;
111 		if (u > DKMAXUNIT)
112 			continue;
113 		p = RAW_PART;
114 		s = WHOLE_DISK_SLICE;
115 		pdev = makedev(dp->d_devsw->d_maj, dkmakeminor(u, s, p));
116 		if (pdev->si_disk == NULL)
117 			continue;
118 		if (*e != '\0') {
119 			j = dev_stdclone(e, &e, "s", &s);
120 			if (j == 0)
121 				s = COMPATIBILITY_SLICE;
122 			else if (j == 1 || j == 2)
123 				s += BASE_SLICE - 1;
124 			if (!*e)
125 				;		/* ad0s1 case */
126 			else if (e[1] != '\0')
127 				return;		/* can never be a disk name */
128 			else if (*e < 'a' || *e > 'h')
129 				return;		/* can never be a disk name */
130 			else
131 				p = *e - 'a';
132 		}
133 		if (s == WHOLE_DISK_SLICE && p == RAW_PART) {
134 			return;
135 		} else if (s >= BASE_SLICE && p != RAW_PART) {
136 			*dev = make_dev(pdev->si_devsw, dkmakeminor(u, s, p),
137 			    UID_ROOT, GID_OPERATOR, 0640, "%s%ds%d%c",
138 			    pdev->si_devsw->d_name, u, s - BASE_SLICE + 1,
139 			    p + 'a');
140 		} else if (s >= BASE_SLICE) {
141 			*dev = make_dev(pdev->si_devsw, dkmakeminor(u, s, p),
142 			    UID_ROOT, GID_OPERATOR, 0640, "%s%ds%d",
143 			    pdev->si_devsw->d_name, u, s - BASE_SLICE + 1);
144 			make_dev_alias(*dev, "%s%ds%dc",
145 			    pdev->si_devsw->d_name, u, s - BASE_SLICE + 1);
146 		} else {
147 			*dev = make_dev(pdev->si_devsw, dkmakeminor(u, s, p),
148 			    UID_ROOT, GID_OPERATOR, 0640, "%s%d%c",
149 			    pdev->si_devsw->d_name, u, p + 'a');
150 		}
151 		dev_depends(pdev, *dev);
152 		return;
153 	}
154 }
155 
156 static void
157 inherit_raw(dev_t pdev, dev_t dev)
158 {
159 	dev->si_disk = pdev->si_disk;
160 	dev->si_drv1 = pdev->si_drv1;
161 	dev->si_drv2 = pdev->si_drv2;
162 	dev->si_iosize_max = pdev->si_iosize_max;
163 	dev->si_bsize_phys = pdev->si_bsize_phys;
164 	dev->si_bsize_best = pdev->si_bsize_best;
165 }
166 
167 dev_t
168 disk_create(int unit, struct disk *dp, int flags, struct cdevsw *cdevsw, struct cdevsw *proto)
169 {
170 	static int once;
171 	dev_t dev;
172 
173 	if (!once) {
174 		EVENTHANDLER_REGISTER(dev_clone, disk_clone, 0, 1000);
175 		once++;
176 	}
177 
178 	bzero(dp, sizeof(*dp));
179 	dp->d_label = malloc(sizeof *dp->d_label, M_DEVBUF, M_ZERO);
180 
181 	if (proto->d_open != diskopen) {
182 		*proto = *cdevsw;
183 		proto->d_open = diskopen;
184 		proto->d_close = diskclose;
185 		proto->d_ioctl = diskioctl;
186 		proto->d_strategy = diskstrategy;
187 		proto->d_psize = diskpsize;
188 	}
189 
190 	if (bootverbose)
191 		printf("Creating DISK %s%d\n", cdevsw->d_name, unit);
192 	dev = make_dev(proto, dkmakeminor(unit, WHOLE_DISK_SLICE, RAW_PART),
193 	    UID_ROOT, GID_OPERATOR, 0640, "%s%d", cdevsw->d_name, unit);
194 
195 	dev->si_disk = dp;
196 	dp->d_dev = dev;
197 	dp->d_dsflags = flags;
198 	dp->d_devsw = cdevsw;
199 	LIST_INSERT_HEAD(&disklist, dp, d_list);
200 
201 	return (dev);
202 }
203 
204 static int
205 diskdumpconf(u_int onoff, dev_t dev, struct disk *dp)
206 {
207 	struct dumperinfo di;
208 	struct disklabel *dl;
209 
210 	if (!onoff)
211 		return(set_dumper(NULL));
212 	dl = dsgetlabel(dev, dp->d_slice);
213 	if (!dl)
214 		return (ENXIO);
215 	bzero(&di, sizeof di);
216 	di.dumper = (dumper_t *)dp->d_devsw->d_dump;
217 	di.priv = dp->d_dev;
218 	di.blocksize = dl->d_secsize;
219 	di.mediaoffset = (off_t)(dl->d_partitions[dkpart(dev)].p_offset +
220 	    dp->d_slice->dss_slices[dkslice(dev)].ds_offset) * DEV_BSIZE;
221 	di.mediasize =
222 	    (off_t)(dl->d_partitions[dkpart(dev)].p_size) * DEV_BSIZE;
223 	if (di.mediasize == 0)
224 		return (EINVAL);
225 	return(set_dumper(&di));
226 }
227 
228 void
229 disk_invalidate (struct disk *disk)
230 {
231 	if (disk->d_slice)
232 		dsgone(&disk->d_slice);
233 }
234 
235 void
236 disk_destroy(dev_t dev)
237 {
238 	LIST_REMOVE(dev->si_disk, d_list);
239 	free(dev->si_disk->d_label, M_DEVBUF);
240 	bzero(dev->si_disk, sizeof(*dev->si_disk));
241     	dev->si_disk = NULL;
242 	destroy_dev(dev);
243 	return;
244 }
245 
246 struct disk *
247 disk_enumerate(struct disk *disk)
248 {
249 	if (!disk)
250 		return (LIST_FIRST(&disklist));
251 	else
252 		return (LIST_NEXT(disk, d_list));
253 }
254 
255 static int
256 sysctl_disks(SYSCTL_HANDLER_ARGS)
257 {
258 	struct disk *disk;
259 	int error, first;
260 
261 	disk = NULL;
262 	first = 1;
263 
264 	while ((disk = disk_enumerate(disk))) {
265 		if (!first) {
266 			error = SYSCTL_OUT(req, " ", 1);
267 			if (error)
268 				return error;
269 		} else {
270 			first = 0;
271 		}
272 		error = SYSCTL_OUT(req, disk->d_dev->si_name, strlen(disk->d_dev->si_name));
273 		if (error)
274 			return error;
275 	}
276 	error = SYSCTL_OUT(req, "", 1);
277 	return error;
278 }
279 
280 SYSCTL_PROC(_kern, OID_AUTO, disks, CTLTYPE_STRING | CTLFLAG_RD, 0, 0,
281     sysctl_disks, "A", "names of available disks");
282 
283 /*
284  * The cdevsw functions
285  */
286 
287 static int
288 diskopen(dev_t dev, int oflags, int devtype, struct thread *td)
289 {
290 	dev_t pdev;
291 	struct disk *dp;
292 	int error;
293 
294 	error = 0;
295 	pdev = dkmodpart(dkmodslice(dev, WHOLE_DISK_SLICE), RAW_PART);
296 
297 	dp = pdev->si_disk;
298 	if (!dp)
299 		return (ENXIO);
300 
301 	while (dp->d_flags & DISKFLAG_LOCK) {
302 		dp->d_flags |= DISKFLAG_WANTED;
303 		error = tsleep(dp, PRIBIO | PCATCH, "diskopen", hz);
304 		if (error)
305 			return (error);
306 	}
307 	dp->d_flags |= DISKFLAG_LOCK;
308 
309 	if (!dsisopen(dp->d_slice)) {
310 		if (!pdev->si_iosize_max)
311 			pdev->si_iosize_max = dev->si_iosize_max;
312 		error = dp->d_devsw->d_open(pdev, oflags, devtype, td);
313 		dp->d_label->d_secsize = dp->d_sectorsize;
314 		dp->d_label->d_secperunit = dp->d_mediasize / dp->d_sectorsize;
315 		dp->d_label->d_nsectors = dp->d_fwsectors;
316 		dp->d_label->d_ntracks = dp->d_fwheads;
317 	}
318 
319 	/* Inherit properties from the whole/raw dev_t */
320 	inherit_raw(pdev, dev);
321 
322 	if (error)
323 		goto out;
324 
325 	error = dsopen(dev, devtype, dp->d_dsflags, &dp->d_slice, dp->d_label);
326 
327 	if (!dsisopen(dp->d_slice))
328 		dp->d_devsw->d_close(pdev, oflags, devtype, td);
329 out:
330 	dp->d_flags &= ~DISKFLAG_LOCK;
331 	if (dp->d_flags & DISKFLAG_WANTED) {
332 		dp->d_flags &= ~DISKFLAG_WANTED;
333 		wakeup(dp);
334 	}
335 
336 	return(error);
337 }
338 
339 static int
340 diskclose(dev_t dev, int fflag, int devtype, struct thread *td)
341 {
342 	struct disk *dp;
343 	int error;
344 	dev_t pdev;
345 
346 	error = 0;
347 	pdev = dkmodpart(dkmodslice(dev, WHOLE_DISK_SLICE), RAW_PART);
348 	dp = pdev->si_disk;
349 	if (!dp)
350 		return (ENXIO);
351 	dsclose(dev, devtype, dp->d_slice);
352 	if (!dsisopen(dp->d_slice))
353 		error = dp->d_devsw->d_close(dp->d_dev, fflag, devtype, td);
354 	return (error);
355 }
356 
357 static void
358 diskstrategy(struct bio *bp)
359 {
360 	dev_t pdev;
361 	struct disk *dp;
362 
363 	pdev = dkmodpart(dkmodslice(bp->bio_dev, WHOLE_DISK_SLICE), RAW_PART);
364 	dp = pdev->si_disk;
365 	bp->bio_resid = bp->bio_bcount;
366 	if (dp != bp->bio_dev->si_disk)
367 		inherit_raw(pdev, bp->bio_dev);
368 
369 	if (!dp) {
370 		biofinish(bp, NULL, ENXIO);
371 		return;
372 	}
373 
374 	if (dscheck(bp, dp->d_slice) <= 0) {
375 		biodone(bp);
376 		return;
377 	}
378 
379 	if (bp->bio_bcount == 0) {
380 		biodone(bp);
381 		return;
382 	}
383 
384 	KASSERT(dp->d_devsw != NULL, ("NULL devsw"));
385 	KASSERT(dp->d_devsw->d_strategy != NULL, ("NULL d_strategy"));
386 	dp->d_devsw->d_strategy(bp);
387 	return;
388 
389 }
390 
391 static int
392 diskioctl(dev_t dev, u_long cmd, caddr_t data, int fflag, struct thread *td)
393 {
394 	struct disk *dp;
395 	int error;
396 	u_int u;
397 	dev_t pdev;
398 
399 	pdev = dkmodpart(dkmodslice(dev, WHOLE_DISK_SLICE), RAW_PART);
400 	dp = pdev->si_disk;
401 	if (!dp)
402 		return (ENXIO);
403 	if (cmd == DIOCSKERNELDUMP) {
404 		u = *(u_int *)data;
405 		return (diskdumpconf(u, dev, dp));
406 	}
407 	if (cmd == DIOCGFRONTSTUFF) {
408 		*(off_t *)data = 8192;	/* XXX: crude but enough) */
409 		return (0);
410 	}
411 	error = dsioctl(dev, cmd, data, fflag, &dp->d_slice);
412 	if (error == ENOIOCTL)
413 		error = dp->d_devsw->d_ioctl(dev, cmd, data, fflag, td);
414 	return (error);
415 }
416 
417 static int
418 diskpsize(dev_t dev)
419 {
420 	struct disk *dp;
421 	dev_t pdev;
422 
423 	pdev = dkmodpart(dkmodslice(dev, WHOLE_DISK_SLICE), RAW_PART);
424 	dp = pdev->si_disk;
425 	if (!dp)
426 		return (-1);
427 	if (dp != dev->si_disk) {
428 		dev->si_drv1 = pdev->si_drv1;
429 		dev->si_drv2 = pdev->si_drv2;
430 		/* XXX: don't set bp->b_dev->si_disk (?) */
431 	}
432 	return (dssize(dev, &dp->d_slice));
433 }
434 
435 SYSCTL_INT(_debug_sizeof, OID_AUTO, disklabel, CTLFLAG_RD,
436     0, sizeof(struct disklabel), "sizeof(struct disklabel)");
437 
438 SYSCTL_INT(_debug_sizeof, OID_AUTO, diskslices, CTLFLAG_RD,
439     0, sizeof(struct diskslices), "sizeof(struct diskslices)");
440 
441 SYSCTL_INT(_debug_sizeof, OID_AUTO, disk, CTLFLAG_RD,
442     0, sizeof(struct disk), "sizeof(struct disk)");
443 
444 #endif /* NO_GEOM */
445 
446 /*-
447  * Disk error is the preface to plaintive error messages
448  * about failing disk transfers.  It prints messages of the form
449  * 	"hp0g: BLABLABLA cmd=read fsbn 12345 of 12344-12347"
450  * blkdone should be -1 if the position of the error is unknown.
451  * The message is printed with printf.
452  */
453 void
454 disk_err(struct bio *bp, const char *what, int blkdone, int nl)
455 {
456 	daddr_t sn;
457 
458 	printf("%s: %s ", devtoname(bp->bio_dev), what);
459 	switch(bp->bio_cmd) {
460 	case BIO_READ:		printf("cmd=read "); break;
461 	case BIO_WRITE:		printf("cmd=write "); break;
462 	case BIO_DELETE:	printf("cmd=delete "); break;
463 	case BIO_GETATTR:	printf("cmd=getattr "); break;
464 	case BIO_SETATTR:	printf("cmd=setattr "); break;
465 	default:		printf("cmd=%x ", bp->bio_cmd); break;
466 	}
467 	sn = bp->bio_blkno;
468 	if (bp->bio_bcount <= DEV_BSIZE) {
469 		printf("fsbn %jd%s", (intmax_t)sn, nl ? "\n" : "");
470 		return;
471 	}
472 	if (blkdone >= 0) {
473 		sn += blkdone;
474 		printf("fsbn %jd of ", (intmax_t)sn);
475 	}
476 	printf("%jd-%jd", (intmax_t)bp->bio_blkno,
477 	    (intmax_t)(bp->bio_blkno + (bp->bio_bcount - 1) / DEV_BSIZE));
478 	if (nl)
479 		printf("\n");
480 }
481 
482 /*
483  * Seek sort for disks.
484  *
485  * The buf_queue keep two queues, sorted in ascending block order.  The first
486  * queue holds those requests which are positioned after the current block
487  * (in the first request); the second, which starts at queue->switch_point,
488  * holds requests which came in after their block number was passed.  Thus
489  * we implement a one way scan, retracting after reaching the end of the drive
490  * to the first request on the second queue, at which time it becomes the
491  * first queue.
492  *
493  * A one-way scan is natural because of the way UNIX read-ahead blocks are
494  * allocated.
495  */
496 
497 void
498 bioq_disksort(bioq, bp)
499 	struct bio_queue_head *bioq;
500 	struct bio *bp;
501 {
502 	struct bio *bq;
503 	struct bio *bn;
504 	struct bio *be;
505 
506 	if (!atomic_cmpset_int(&bioq->busy, 0, 1))
507 		panic("Recursing in bioq_disksort()");
508 	be = TAILQ_LAST(&bioq->queue, bio_queue);
509 	/*
510 	 * If the queue is empty or we are an
511 	 * ordered transaction, then it's easy.
512 	 */
513 	if ((bq = bioq_first(bioq)) == NULL) {
514 		bioq_insert_tail(bioq, bp);
515 		bioq->busy = 0;
516 		return;
517 	} else if (bioq->insert_point != NULL) {
518 
519 		/*
520 		 * A certain portion of the list is
521 		 * "locked" to preserve ordering, so
522 		 * we can only insert after the insert
523 		 * point.
524 		 */
525 		bq = bioq->insert_point;
526 	} else {
527 
528 		/*
529 		 * If we lie before the last removed (currently active)
530 		 * request, and are not inserting ourselves into the
531 		 * "locked" portion of the list, then we must add ourselves
532 		 * to the second request list.
533 		 */
534 		if (bp->bio_pblkno < bioq->last_pblkno) {
535 
536 			bq = bioq->switch_point;
537 			/*
538 			 * If we are starting a new secondary list,
539 			 * then it's easy.
540 			 */
541 			if (bq == NULL) {
542 				bioq->switch_point = bp;
543 				bioq_insert_tail(bioq, bp);
544 				bioq->busy = 0;
545 				return;
546 			}
547 			/*
548 			 * If we lie ahead of the current switch point,
549 			 * insert us before the switch point and move
550 			 * the switch point.
551 			 */
552 			if (bp->bio_pblkno < bq->bio_pblkno) {
553 				bioq->switch_point = bp;
554 				TAILQ_INSERT_BEFORE(bq, bp, bio_queue);
555 				bioq->busy = 0;
556 				return;
557 			}
558 		} else {
559 			if (bioq->switch_point != NULL)
560 				be = TAILQ_PREV(bioq->switch_point,
561 						bio_queue, bio_queue);
562 			/*
563 			 * If we lie between last_pblkno and bq,
564 			 * insert before bq.
565 			 */
566 			if (bp->bio_pblkno < bq->bio_pblkno) {
567 				TAILQ_INSERT_BEFORE(bq, bp, bio_queue);
568 				bioq->busy = 0;
569 				return;
570 			}
571 		}
572 	}
573 
574 	/*
575 	 * Request is at/after our current position in the list.
576 	 * Optimize for sequential I/O by seeing if we go at the tail.
577 	 */
578 	if (bp->bio_pblkno > be->bio_pblkno) {
579 		TAILQ_INSERT_AFTER(&bioq->queue, be, bp, bio_queue);
580 		bioq->busy = 0;
581 		return;
582 	}
583 
584 	/* Otherwise, insertion sort */
585 	while ((bn = TAILQ_NEXT(bq, bio_queue)) != NULL) {
586 
587 		/*
588 		 * We want to go after the current request if it is the end
589 		 * of the first request list, or if the next request is a
590 		 * larger cylinder than our request.
591 		 */
592 		if (bn == bioq->switch_point
593 		 || bp->bio_pblkno < bn->bio_pblkno)
594 			break;
595 		bq = bn;
596 	}
597 	TAILQ_INSERT_AFTER(&bioq->queue, bq, bp, bio_queue);
598 	bioq->busy = 0;
599 }
600 
601 
602