/*-
 * ----------------------------------------------------------------------------
 * "THE BEER-WARE LICENSE" (Revision 42):
 * <phk@FreeBSD.ORG> wrote this file.  As long as you retain this notice you
 * can do whatever you want with this stuff. If we meet some day, and you think
 * this stuff is worth it, you can buy me a beer in return.  Poul-Henning Kamp
 * ----------------------------------------------------------------------------
 *
 * $FreeBSD$
 *
 */

/*-
 * The following functions are based on the vn(4) driver: mdstart_swap(),
 * mdstart_vnode(), mdcreate_swap(), mdcreate_vnode() and mddestroy(),
 * and as such under the following copyright:
 *
 * Copyright (c) 1988 University of Utah.
 * Copyright (c) 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 * Copyright (c) 2013 The FreeBSD Foundation
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Portions of this software were developed by Konstantin Belousov
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah Hdr: vn.c 1.13 94/04/02
 *
 *	from: @(#)vn.c	8.6 (Berkeley) 4/1/94
 * From: src/sys/dev/vn/vn.c,v 1.122 2000/12/16 16:06:03
 */

#include "opt_geom.h"
#include "opt_md.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/devicestat.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/limits.h>
#include <sys/linker.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mdioctl.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/sx.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/rwlock.h>
#include <sys/sbuf.h>
#include <sys/sched.h>
#include <sys/sf_buf.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>

#include <geom/geom.h>
#include <geom/geom_int.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/swap_pager.h>
#include <vm/uma.h>

#include <machine/bus.h>

#define MD_MODVER 1

#define MD_SHUTDOWN	0x10000		/* Tell worker thread to terminate. */
#define	MD_EXITING	0x20000		/* Worker thread is exiting. */

#ifndef MD_NSECT
#define MD_NSECT (10000 * 2)
#endif

static MALLOC_DEFINE(M_MD, "md_disk", "Memory Disk");
static MALLOC_DEFINE(M_MDSECT, "md_sectors", "Memory Disk Sectors");

static int md_debug;
SYSCTL_INT(_debug, OID_AUTO, mddebug, CTLFLAG_RW, &md_debug, 0,
    "Enable md(4) debug messages");
static int md_malloc_wait;
SYSCTL_INT(_vm, OID_AUTO, md_malloc_wait, CTLFLAG_RW, &md_malloc_wait, 0,
    "Allow malloc to wait for memory allocations");

#if defined(MD_ROOT) && !defined(MD_ROOT_FSTYPE)
#define	MD_ROOT_FSTYPE	"ufs"
#endif

#if defined(MD_ROOT)
/*
 * Preloaded image gets put here.
 */
#if defined(MD_ROOT_SIZE)
/*
 * We put the mfs_root symbol into the oldmfs section of the kernel object
 * file.  Applications that patch the object with the image can determine
 * the size by looking at the oldmfs section size within the kernel.
 */
u_char mfs_root[MD_ROOT_SIZE*1024] __attribute__ ((section ("oldmfs")));
const int mfs_root_size = sizeof(mfs_root);
#else
extern volatile u_char __weak_symbol mfs_root;
extern volatile u_char __weak_symbol mfs_root_end;
__GLOBL(mfs_root);
__GLOBL(mfs_root_end);
#define mfs_root_size ((uintptr_t)(&mfs_root_end - &mfs_root))
#endif
#endif

static g_init_t g_md_init;
static g_fini_t g_md_fini;
static g_start_t g_md_start;
static g_access_t g_md_access;
static void g_md_dumpconf(struct sbuf *sb, const char *indent,
    struct g_geom *gp, struct g_consumer *cp __unused, struct g_provider *pp);

static struct cdev *status_dev = 0;
static struct sx md_sx;
static struct unrhdr *md_uh;

static d_ioctl_t mdctlioctl;

static struct cdevsw mdctl_cdevsw = {
	.d_version =	D_VERSION,
	.d_ioctl =	mdctlioctl,
	.d_name =	MD_NAME,
};

struct g_class g_md_class = {
	.name = "MD",
	.version = G_VERSION,
	.init = g_md_init,
	.fini = g_md_fini,
	.start = g_md_start,
	.access = g_md_access,
	.dumpconf = g_md_dumpconf,
};

DECLARE_GEOM_CLASS(g_md_class, g_md);


static LIST_HEAD(, md_s) md_softc_list = LIST_HEAD_INITIALIZER(md_softc_list);

#define NINDIR	(PAGE_SIZE / sizeof(uintptr_t))
#define NMASK	(NINDIR-1)
static int nshift;

static int md_vnode_pbuf_freecnt;

struct indir {
	uintptr_t	*array;
	u_int		total;
	u_int		used;
	u_int		shift;
};

struct md_s {
	int unit;
	LIST_ENTRY(md_s) list;
	struct bio_queue_head bio_queue;
	struct mtx queue_mtx;
	struct mtx stat_mtx;
	struct cdev *dev;
	enum md_types type;
	off_t mediasize;
	unsigned sectorsize;
	unsigned opencount;
	unsigned fwheads;
	unsigned fwsectors;
	unsigned flags;
	char name[20];
	struct proc *procp;
	struct g_geom *gp;
	struct g_provider *pp;
	int (*start)(struct md_s *sc, struct bio *bp);
	struct devstat *devstat;

	/* MD_MALLOC related fields */
	struct indir *indir;
	uma_zone_t uma;

	/* MD_PRELOAD related fields */
	u_char *pl_ptr;
	size_t pl_len;

	/* MD_VNODE related fields */
	struct vnode *vnode;
	char file[PATH_MAX];
	struct ucred *cred;

	/* MD_SWAP related fields */
	vm_object_t object;
};

static struct indir *
new_indir(u_int shift)
{
	struct indir *ip;

	ip = malloc(sizeof *ip, M_MD, (md_malloc_wait ? M_WAITOK : M_NOWAIT)
	    | M_ZERO);
	if (ip == NULL)
		return (NULL);
	ip->array = malloc(sizeof(uintptr_t) * NINDIR,
	    M_MDSECT, (md_malloc_wait ? M_WAITOK : M_NOWAIT) | M_ZERO);
	if (ip->array == NULL) {
		free(ip, M_MD);
		return (NULL);
	}
	ip->total = NINDIR;
	ip->shift = shift;
	return (ip);
}

static void
del_indir(struct indir *ip)
{

	free(ip->array, M_MDSECT);
	free(ip, M_MD);
}

static void
destroy_indir(struct md_s *sc, struct indir *ip)
{
	int i;

	for (i = 0; i < NINDIR; i++) {
		if (!ip->array[i])
			continue;
		if (ip->shift)
			destroy_indir(sc, (struct indir*)(ip->array[i]));
		else if (ip->array[i] > 255)
			uma_zfree(sc->uma, (void *)(ip->array[i]));
	}
	del_indir(ip);
}
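
/*
 * Worked example of the indirection scheme (illustrative only, assuming
 * the common case of 4 KiB pages and 8-byte pointers): NINDIR is then 512
 * and nshift is 9.  A leaf indir maps 512 sectors, a node one level up
 * maps 512 * 512 sectors, and so on.  For a 1 GiB device with 512-byte
 * sectors (2^21 sectors), dimension() builds a top node with shift 18,
 * middle nodes with shift 9 and leaves with shift 0; s_read() and
 * s_write() index each level with (offset >> shift) & NMASK.
 */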

/*
 * This function does the math and allocates the top level "indir" structure
 * for a device of "size" sectors.
 */

static struct indir *
dimension(off_t size)
{
	off_t rcnt;
	struct indir *ip;
	int layer;

	rcnt = size;
	layer = 0;
	while (rcnt > NINDIR) {
		rcnt /= NINDIR;
		layer++;
	}

	/*
	 * XXX: the top layer is probably not fully populated, so we allocate
	 * too much space for ip->array in here.
	 */
	ip = malloc(sizeof *ip, M_MD, M_WAITOK | M_ZERO);
	ip->array = malloc(sizeof(uintptr_t) * NINDIR,
	    M_MDSECT, M_WAITOK | M_ZERO);
	ip->total = NINDIR;
	ip->shift = layer * nshift;
	return (ip);
}

/*
 * Read a given sector
 */

static uintptr_t
s_read(struct indir *ip, off_t offset)
{
	struct indir *cip;
	int idx;
	uintptr_t up;

	if (md_debug > 1)
		printf("s_read(%jd)\n", (intmax_t)offset);
	up = 0;
	for (cip = ip; cip != NULL;) {
		if (cip->shift) {
			idx = (offset >> cip->shift) & NMASK;
			up = cip->array[idx];
			cip = (struct indir *)up;
			continue;
		}
		idx = offset & NMASK;
		return (cip->array[idx]);
	}
	return (0);
}

/*
 * Write a given sector, prune the tree if the value is 0
 */

static int
s_write(struct indir *ip, off_t offset, uintptr_t ptr)
{
	struct indir *cip, *lip[10];
	int idx, li;
	uintptr_t up;

	if (md_debug > 1)
		printf("s_write(%jd, %p)\n", (intmax_t)offset, (void *)ptr);
	up = 0;
	li = 0;
	cip = ip;
	for (;;) {
		lip[li++] = cip;
		if (cip->shift) {
			idx = (offset >> cip->shift) & NMASK;
			up = cip->array[idx];
			if (up != 0) {
				cip = (struct indir *)up;
				continue;
			}
			/* Allocate branch */
			cip->array[idx] =
			    (uintptr_t)new_indir(cip->shift - nshift);
			if (cip->array[idx] == 0)
				return (ENOSPC);
			cip->used++;
			up = cip->array[idx];
			cip = (struct indir *)up;
			continue;
		}
		/* leafnode */
		idx = offset & NMASK;
		up = cip->array[idx];
		if (up != 0)
			cip->used--;
		cip->array[idx] = ptr;
		if (ptr != 0)
			cip->used++;
		break;
	}
	if (cip->used != 0 || li == 1)
		return (0);
	li--;
	while (cip->used == 0 && cip != ip) {
		li--;
		idx = (offset >> lip[li]->shift) & NMASK;
		up = lip[li]->array[idx];
		KASSERT(up == (uintptr_t)cip, ("md screwed up"));
		del_indir(cip);
		lip[li]->array[idx] = 0;
		lip[li]->used--;
		cip = lip[li];
	}
	return (0);
}


static int
g_md_access(struct g_provider *pp, int r, int w, int e)
{
	struct md_s *sc;

	sc = pp->geom->softc;
	if (sc == NULL) {
		if (r <= 0 && w <= 0 && e <= 0)
			return (0);
		return (ENXIO);
	}
	r += pp->acr;
	w += pp->acw;
	e += pp->ace;
	if ((sc->flags & MD_READONLY) != 0 && w > 0)
		return (EROFS);
	if ((pp->acr + pp->acw + pp->ace) == 0 && (r + w + e) > 0) {
		sc->opencount = 1;
	} else if ((pp->acr + pp->acw + pp->ace) > 0 && (r + w + e) == 0) {
		sc->opencount = 0;
	}
	return (0);
}

static void
g_md_start(struct bio *bp)
{
	struct md_s *sc;

	sc = bp->bio_to->geom->softc;
	if ((bp->bio_cmd == BIO_READ) || (bp->bio_cmd == BIO_WRITE)) {
		mtx_lock(&sc->stat_mtx);
		devstat_start_transaction_bio(sc->devstat, bp);
		mtx_unlock(&sc->stat_mtx);
	}
	mtx_lock(&sc->queue_mtx);
	bioq_disksort(&sc->bio_queue, bp);
	mtx_unlock(&sc->queue_mtx);
	wakeup(sc);
}
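
/*
 * Operation codes for md_malloc_move_ma() and md_malloc_move_vlist(), the
 * helpers that move one sector between the malloc backing store and a bio
 * carrying an unmapped page array or a bus_dma segment list: ZERO clears
 * the bio data, FILL stores a single repeated byte, READ copies backing
 * data into the bio, WRITE copies bio data into the backing store, and
 * CMP checks whether every byte of the sector is identical (used by the
 * MD_COMPRESS path in mdstart_malloc()).
 */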
#define	MD_MALLOC_MOVE_ZERO	1
#define	MD_MALLOC_MOVE_FILL	2
#define	MD_MALLOC_MOVE_READ	3
#define	MD_MALLOC_MOVE_WRITE	4
#define	MD_MALLOC_MOVE_CMP	5

static int
md_malloc_move_ma(vm_page_t **mp, int *ma_offs, unsigned sectorsize,
    void *ptr, u_char fill, int op)
{
	struct sf_buf *sf;
	vm_page_t m, *mp1;
	char *p, first;
	off_t *uc;
	unsigned n;
	int error, i, ma_offs1, sz, first_read;

	m = NULL;
	error = 0;
	sf = NULL;
	/* if (op == MD_MALLOC_MOVE_CMP) { gcc */
	first = 0;
	first_read = 0;
	uc = ptr;
	mp1 = *mp;
	ma_offs1 = *ma_offs;
	/* } */
	sched_pin();
	for (n = sectorsize; n != 0; n -= sz) {
		sz = imin(PAGE_SIZE - *ma_offs, n);
		if (m != **mp) {
			if (sf != NULL)
				sf_buf_free(sf);
			m = **mp;
			sf = sf_buf_alloc(m, SFB_CPUPRIVATE |
			    (md_malloc_wait ? 0 : SFB_NOWAIT));
			if (sf == NULL) {
				error = ENOMEM;
				break;
			}
		}
		p = (char *)sf_buf_kva(sf) + *ma_offs;
		switch (op) {
		case MD_MALLOC_MOVE_ZERO:
			bzero(p, sz);
			break;
		case MD_MALLOC_MOVE_FILL:
			memset(p, fill, sz);
			break;
		case MD_MALLOC_MOVE_READ:
			bcopy(ptr, p, sz);
			cpu_flush_dcache(p, sz);
			break;
		case MD_MALLOC_MOVE_WRITE:
			bcopy(p, ptr, sz);
			break;
		case MD_MALLOC_MOVE_CMP:
			for (i = 0; i < sz; i++, p++) {
				if (!first_read) {
					*uc = (u_char)*p;
					first = *p;
					first_read = 1;
				} else if (*p != first) {
					error = EDOOFUS;
					break;
				}
			}
			break;
		default:
			KASSERT(0, ("md_malloc_move_ma unknown op %d\n", op));
			break;
		}
		if (error != 0)
			break;
		*ma_offs += sz;
		*ma_offs %= PAGE_SIZE;
		if (*ma_offs == 0)
			(*mp)++;
		ptr = (char *)ptr + sz;
	}

	if (sf != NULL)
		sf_buf_free(sf);
	sched_unpin();
	if (op == MD_MALLOC_MOVE_CMP && error != 0) {
		*mp = mp1;
		*ma_offs = ma_offs1;
	}
	return (error);
}

static int
md_malloc_move_vlist(bus_dma_segment_t **pvlist, int *pma_offs,
    unsigned len, void *ptr, u_char fill, int op)
{
	bus_dma_segment_t *vlist;
	uint8_t *p, *end, first;
	off_t *uc;
	int ma_offs, seg_len;

	vlist = *pvlist;
	ma_offs = *pma_offs;
	uc = ptr;

	for (; len != 0; len -= seg_len) {
		seg_len = imin(vlist->ds_len - ma_offs, len);
		p = (uint8_t *)(uintptr_t)vlist->ds_addr + ma_offs;
		switch (op) {
		case MD_MALLOC_MOVE_ZERO:
			bzero(p, seg_len);
			break;
		case MD_MALLOC_MOVE_FILL:
			memset(p, fill, seg_len);
			break;
		case MD_MALLOC_MOVE_READ:
			bcopy(ptr, p, seg_len);
			cpu_flush_dcache(p, seg_len);
			break;
		case MD_MALLOC_MOVE_WRITE:
			bcopy(p, ptr, seg_len);
			break;
		case MD_MALLOC_MOVE_CMP:
			end = p + seg_len;
			first = *uc = *p;
			/* Confirm all following bytes match the first */
			while (++p < end) {
				if (*p != first)
					return (EDOOFUS);
			}
			break;
		default:
			KASSERT(0, ("md_malloc_move_vlist unknown op %d\n", op));
			break;
		}

		ma_offs += seg_len;
		if (ma_offs == vlist->ds_len) {
			ma_offs = 0;
			vlist++;
		}
		ptr = (uint8_t *)ptr + seg_len;
	}
	*pvlist = vlist;
	*pma_offs = ma_offs;

	return (0);
}
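
/*
 * Storage layout used by the malloc backend: each sector is represented by
 * one uintptr_t in the indir tree.  A value of 0 means the sector was never
 * written (reads return zeros), a value in the range 1..255 means the whole
 * sector is filled with that single byte (the compressed form produced when
 * MD_COMPRESS is set), and any larger value is a pointer to a full sector
 * allocated from the per-device UMA zone.
 */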
static int
mdstart_malloc(struct md_s *sc, struct bio *bp)
{
	u_char *dst;
	vm_page_t *m;
	bus_dma_segment_t *vlist;
	int i, error, error1, ma_offs, notmapped;
	off_t secno, nsec, uc;
	uintptr_t sp, osp;

	switch (bp->bio_cmd) {
	case BIO_READ:
	case BIO_WRITE:
	case BIO_DELETE:
		break;
	default:
		return (EOPNOTSUPP);
	}

	notmapped = (bp->bio_flags & BIO_UNMAPPED) != 0;
	vlist = (bp->bio_flags & BIO_VLIST) != 0 ?
	    (bus_dma_segment_t *)bp->bio_data : NULL;
	if (notmapped) {
		m = bp->bio_ma;
		ma_offs = bp->bio_ma_offset;
		dst = NULL;
		KASSERT(vlist == NULL, ("vlists cannot be unmapped"));
	} else if (vlist != NULL) {
		ma_offs = bp->bio_ma_offset;
		dst = NULL;
	} else {
		dst = bp->bio_data;
	}

	nsec = bp->bio_length / sc->sectorsize;
	secno = bp->bio_offset / sc->sectorsize;
	error = 0;
	while (nsec--) {
		osp = s_read(sc->indir, secno);
		if (bp->bio_cmd == BIO_DELETE) {
			if (osp != 0)
				error = s_write(sc->indir, secno, 0);
		} else if (bp->bio_cmd == BIO_READ) {
			if (osp == 0) {
				if (notmapped) {
					error = md_malloc_move_ma(&m, &ma_offs,
					    sc->sectorsize, NULL, 0,
					    MD_MALLOC_MOVE_ZERO);
				} else if (vlist != NULL) {
					error = md_malloc_move_vlist(&vlist,
					    &ma_offs, sc->sectorsize, NULL, 0,
					    MD_MALLOC_MOVE_ZERO);
				} else
					bzero(dst, sc->sectorsize);
			} else if (osp <= 255) {
				if (notmapped) {
					error = md_malloc_move_ma(&m, &ma_offs,
					    sc->sectorsize, NULL, osp,
					    MD_MALLOC_MOVE_FILL);
				} else if (vlist != NULL) {
					error = md_malloc_move_vlist(&vlist,
					    &ma_offs, sc->sectorsize, NULL, osp,
					    MD_MALLOC_MOVE_FILL);
				} else
					memset(dst, osp, sc->sectorsize);
			} else {
				if (notmapped) {
					error = md_malloc_move_ma(&m, &ma_offs,
					    sc->sectorsize, (void *)osp, 0,
					    MD_MALLOC_MOVE_READ);
				} else if (vlist != NULL) {
					error = md_malloc_move_vlist(&vlist,
					    &ma_offs, sc->sectorsize,
					    (void *)osp, 0,
					    MD_MALLOC_MOVE_READ);
				} else {
					bcopy((void *)osp, dst, sc->sectorsize);
					cpu_flush_dcache(dst, sc->sectorsize);
				}
			}
			osp = 0;
		} else if (bp->bio_cmd == BIO_WRITE) {
			if (sc->flags & MD_COMPRESS) {
				if (notmapped) {
					error1 = md_malloc_move_ma(&m, &ma_offs,
					    sc->sectorsize, &uc, 0,
					    MD_MALLOC_MOVE_CMP);
					i = error1 == 0 ? sc->sectorsize : 0;
				} else if (vlist != NULL) {
					error1 = md_malloc_move_vlist(&vlist,
					    &ma_offs, sc->sectorsize, &uc, 0,
					    MD_MALLOC_MOVE_CMP);
					i = error1 == 0 ? sc->sectorsize : 0;
				} else {
					uc = dst[0];
					for (i = 1; i < sc->sectorsize; i++) {
						if (dst[i] != uc)
							break;
					}
				}
			} else {
				i = 0;
				uc = 0;
			}
			if (i == sc->sectorsize) {
				if (osp != uc)
					error = s_write(sc->indir, secno, uc);
			} else {
				if (osp <= 255) {
					sp = (uintptr_t)uma_zalloc(sc->uma,
					    md_malloc_wait ? M_WAITOK :
					    M_NOWAIT);
					if (sp == 0) {
						error = ENOSPC;
						break;
					}
					if (notmapped) {
						error = md_malloc_move_ma(&m,
						    &ma_offs, sc->sectorsize,
						    (void *)sp, 0,
						    MD_MALLOC_MOVE_WRITE);
					} else if (vlist != NULL) {
						error = md_malloc_move_vlist(
						    &vlist, &ma_offs,
						    sc->sectorsize, (void *)sp,
						    0, MD_MALLOC_MOVE_WRITE);
					} else {
						bcopy(dst, (void *)sp,
						    sc->sectorsize);
					}
					error = s_write(sc->indir, secno, sp);
				} else {
					if (notmapped) {
						error = md_malloc_move_ma(&m,
						    &ma_offs, sc->sectorsize,
						    (void *)osp, 0,
						    MD_MALLOC_MOVE_WRITE);
					} else if (vlist != NULL) {
						error = md_malloc_move_vlist(
						    &vlist, &ma_offs,
						    sc->sectorsize, (void *)osp,
						    0, MD_MALLOC_MOVE_WRITE);
					} else {
						bcopy(dst, (void *)osp,
						    sc->sectorsize);
					}
					osp = 0;
				}
			}
		} else {
			error = EOPNOTSUPP;
		}
		if (osp > 255)
			uma_zfree(sc->uma, (void*)osp);
		if (error != 0)
			break;
		secno++;
		if (!notmapped && vlist == NULL)
			dst += sc->sectorsize;
	}
	bp->bio_resid = 0;
	return (error);
}

static void
mdcopyto_vlist(void *src, bus_dma_segment_t *vlist, off_t offset, off_t len)
{
	off_t seg_len;

	while (offset >= vlist->ds_len) {
		offset -= vlist->ds_len;
		vlist++;
	}

	while (len != 0) {
		seg_len = omin(len, vlist->ds_len - offset);
		bcopy(src, (void *)(uintptr_t)(vlist->ds_addr + offset),
		    seg_len);
		offset = 0;
		src = (uint8_t *)src + seg_len;
		len -= seg_len;
		vlist++;
	}
}

static void
mdcopyfrom_vlist(bus_dma_segment_t *vlist, off_t offset, void *dst, off_t len)
{
	off_t seg_len;

	while (offset >= vlist->ds_len) {
		offset -= vlist->ds_len;
		vlist++;
	}

	while (len != 0) {
		seg_len = omin(len, vlist->ds_len - offset);
		bcopy((void *)(uintptr_t)(vlist->ds_addr + offset), dst,
		    seg_len);
		offset = 0;
		dst = (uint8_t *)dst + seg_len;
		len -= seg_len;
		vlist++;
	}
}

static int
mdstart_preload(struct md_s *sc, struct bio *bp)
{
	uint8_t *p;

	p = sc->pl_ptr + bp->bio_offset;
	switch (bp->bio_cmd) {
	case BIO_READ:
		if ((bp->bio_flags & BIO_VLIST) != 0) {
			mdcopyto_vlist(p, (bus_dma_segment_t *)bp->bio_data,
			    bp->bio_ma_offset, bp->bio_length);
		} else {
			bcopy(p, bp->bio_data, bp->bio_length);
		}
		cpu_flush_dcache(bp->bio_data, bp->bio_length);
		break;
	case BIO_WRITE:
		if ((bp->bio_flags & BIO_VLIST) != 0) {
			mdcopyfrom_vlist((bus_dma_segment_t *)bp->bio_data,
			    bp->bio_ma_offset, p, bp->bio_length);
		} else {
			bcopy(bp->bio_data, p, bp->bio_length);
		}
		break;
	}
	bp->bio_resid = 0;
	return (0);
}
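
/*
 * The vnode backend translates each bio into a struct uio and issues it
 * with VOP_READ()/VOP_WRITE().  Mapped bios use a single iovec, BIO_VLIST
 * bios get one iovec per bus_dma segment, BIO_DELETE is emulated by
 * writing from the shared zero_region, and BIO_UNMAPPED bios are mapped
 * into a pbuf with pmap_qenter() one MAXPHYS-sized chunk at a time.
 */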
static int
mdstart_vnode(struct md_s *sc, struct bio *bp)
{
	int error;
	struct uio auio;
	struct iovec aiov;
	struct iovec *piov;
	struct mount *mp;
	struct vnode *vp;
	struct buf *pb;
	bus_dma_segment_t *vlist;
	struct thread *td;
	off_t iolen, len, zerosize;
	int ma_offs, npages;

	switch (bp->bio_cmd) {
	case BIO_READ:
		auio.uio_rw = UIO_READ;
		break;
	case BIO_WRITE:
	case BIO_DELETE:
		auio.uio_rw = UIO_WRITE;
		break;
	case BIO_FLUSH:
		break;
	default:
		return (EOPNOTSUPP);
	}

	td = curthread;
	vp = sc->vnode;
	pb = NULL;
	piov = NULL;
	ma_offs = bp->bio_ma_offset;
	len = bp->bio_length;

	/*
	 * VNODE I/O
	 *
	 * If an error occurs, we set BIO_ERROR but we do not set
	 * B_INVAL because (for a write anyway), the buffer is
	 * still valid.
	 */

	if (bp->bio_cmd == BIO_FLUSH) {
		(void) vn_start_write(vp, &mp, V_WAIT);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		error = VOP_FSYNC(vp, MNT_WAIT, td);
		VOP_UNLOCK(vp, 0);
		vn_finished_write(mp);
		return (error);
	}

	auio.uio_offset = (vm_ooffset_t)bp->bio_offset;
	auio.uio_resid = bp->bio_length;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_td = td;

	if (bp->bio_cmd == BIO_DELETE) {
		/*
		 * Emulate BIO_DELETE by writing zeros.
		 */
		zerosize = ZERO_REGION_SIZE -
		    (ZERO_REGION_SIZE % sc->sectorsize);
		auio.uio_iovcnt = howmany(bp->bio_length, zerosize);
		piov = malloc(sizeof(*piov) * auio.uio_iovcnt, M_MD, M_WAITOK);
		auio.uio_iov = piov;
		while (len > 0) {
			piov->iov_base = __DECONST(void *, zero_region);
			piov->iov_len = len;
			if (len > zerosize)
				piov->iov_len = zerosize;
			len -= piov->iov_len;
			piov++;
		}
		piov = auio.uio_iov;
	} else if ((bp->bio_flags & BIO_VLIST) != 0) {
		piov = malloc(sizeof(*piov) * bp->bio_ma_n, M_MD, M_WAITOK);
		auio.uio_iov = piov;
		vlist = (bus_dma_segment_t *)bp->bio_data;
		while (len > 0) {
			piov->iov_base = (void *)(uintptr_t)(vlist->ds_addr +
			    ma_offs);
			piov->iov_len = vlist->ds_len - ma_offs;
			if (piov->iov_len > len)
				piov->iov_len = len;
			len -= piov->iov_len;
			ma_offs = 0;
			vlist++;
			piov++;
		}
		auio.uio_iovcnt = piov - auio.uio_iov;
		piov = auio.uio_iov;
	} else if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
		pb = getpbuf(&md_vnode_pbuf_freecnt);
		bp->bio_resid = len;
unmapped_step:
		npages = atop(min(MAXPHYS, round_page(len + (ma_offs &
		    PAGE_MASK))));
		iolen = min(ptoa(npages) - (ma_offs & PAGE_MASK), len);
		KASSERT(iolen > 0, ("zero iolen"));
		pmap_qenter((vm_offset_t)pb->b_data,
		    &bp->bio_ma[atop(ma_offs)], npages);
		aiov.iov_base = (void *)((vm_offset_t)pb->b_data +
		    (ma_offs & PAGE_MASK));
		aiov.iov_len = iolen;
		auio.uio_iov = &aiov;
		auio.uio_iovcnt = 1;
		auio.uio_resid = iolen;
	} else {
		aiov.iov_base = bp->bio_data;
		aiov.iov_len = bp->bio_length;
		auio.uio_iov = &aiov;
		auio.uio_iovcnt = 1;
	}
	/*
	 * When reading set IO_DIRECT to try to avoid double-caching
	 * the data.  When writing IO_DIRECT is not optimal.
	 */
	if (auio.uio_rw == UIO_READ) {
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		error = VOP_READ(vp, &auio, IO_DIRECT, sc->cred);
		VOP_UNLOCK(vp, 0);
	} else {
		(void) vn_start_write(vp, &mp, V_WAIT);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		error = VOP_WRITE(vp, &auio, sc->flags & MD_ASYNC ? 0 : IO_SYNC,
		    sc->cred);
		VOP_UNLOCK(vp, 0);
		vn_finished_write(mp);
	}

	if (pb != NULL) {
		pmap_qremove((vm_offset_t)pb->b_data, npages);
		if (error == 0) {
			len -= iolen;
			bp->bio_resid -= iolen;
			ma_offs += iolen;
			if (len > 0)
				goto unmapped_step;
		}
		relpbuf(pb, &md_vnode_pbuf_freecnt);
	}

	free(piov, M_MD);
	if (pb == NULL)
		bp->bio_resid = auio.uio_resid;
	return (error);
}

static int
mdstart_swap(struct md_s *sc, struct bio *bp)
{
	vm_page_t m;
	u_char *p;
	vm_pindex_t i, lastp;
	bus_dma_segment_t *vlist;
	int rv, ma_offs, offs, len, lastend;

	switch (bp->bio_cmd) {
	case BIO_READ:
	case BIO_WRITE:
	case BIO_DELETE:
		break;
	default:
		return (EOPNOTSUPP);
	}

	p = bp->bio_data;
	ma_offs = (bp->bio_flags & (BIO_UNMAPPED|BIO_VLIST)) != 0 ?
	    bp->bio_ma_offset : 0;
	vlist = (bp->bio_flags & BIO_VLIST) != 0 ?
	    (bus_dma_segment_t *)bp->bio_data : NULL;

	/*
	 * offs is the offset at which to start operating on the
	 * next (ie, first) page.  lastp is the last page on
	 * which we're going to operate.  lastend is the ending
	 * position within that last page (ie, PAGE_SIZE if
	 * we're operating on complete aligned pages).
	 */
	offs = bp->bio_offset % PAGE_SIZE;
	lastp = (bp->bio_offset + bp->bio_length - 1) / PAGE_SIZE;
	lastend = (bp->bio_offset + bp->bio_length - 1) % PAGE_SIZE + 1;

	rv = VM_PAGER_OK;
	VM_OBJECT_WLOCK(sc->object);
	vm_object_pip_add(sc->object, 1);
	for (i = bp->bio_offset / PAGE_SIZE; i <= lastp; i++) {
		len = ((i == lastp) ? lastend : PAGE_SIZE) - offs;
		m = vm_page_grab(sc->object, i, VM_ALLOC_SYSTEM);
		if (bp->bio_cmd == BIO_READ) {
			if (m->valid == VM_PAGE_BITS_ALL)
				rv = VM_PAGER_OK;
			else
				rv = vm_pager_get_pages(sc->object, &m, 1,
				    NULL, NULL);
			if (rv == VM_PAGER_ERROR) {
				vm_page_xunbusy(m);
				break;
			} else if (rv == VM_PAGER_FAIL) {
				/*
				 * Pager does not have the page.  Zero
				 * the allocated page, and mark it as
				 * valid.  Do not set dirty, the page
				 * can be recreated if thrown out.
				 */
				pmap_zero_page(m);
				m->valid = VM_PAGE_BITS_ALL;
			}
			if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
				pmap_copy_pages(&m, offs, bp->bio_ma,
				    ma_offs, len);
			} else if ((bp->bio_flags & BIO_VLIST) != 0) {
				physcopyout_vlist(VM_PAGE_TO_PHYS(m) + offs,
				    vlist, ma_offs, len);
				cpu_flush_dcache(p, len);
			} else {
				physcopyout(VM_PAGE_TO_PHYS(m) + offs, p, len);
				cpu_flush_dcache(p, len);
			}
		} else if (bp->bio_cmd == BIO_WRITE) {
			if (len != PAGE_SIZE && m->valid != VM_PAGE_BITS_ALL)
				rv = vm_pager_get_pages(sc->object, &m, 1,
				    NULL, NULL);
			else
				rv = VM_PAGER_OK;
			if (rv == VM_PAGER_ERROR) {
				vm_page_xunbusy(m);
				break;
			}
			if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
				pmap_copy_pages(bp->bio_ma, ma_offs, &m,
				    offs, len);
			} else if ((bp->bio_flags & BIO_VLIST) != 0) {
				physcopyin_vlist(vlist, ma_offs,
				    VM_PAGE_TO_PHYS(m) + offs, len);
			} else {
				physcopyin(p, VM_PAGE_TO_PHYS(m) + offs, len);
			}
			m->valid = VM_PAGE_BITS_ALL;
		} else if (bp->bio_cmd == BIO_DELETE) {
			if (len != PAGE_SIZE && m->valid != VM_PAGE_BITS_ALL)
				rv = vm_pager_get_pages(sc->object, &m, 1,
				    NULL, NULL);
			else
				rv = VM_PAGER_OK;
			if (rv == VM_PAGER_ERROR) {
				vm_page_xunbusy(m);
				break;
			}
			if (len != PAGE_SIZE) {
				pmap_zero_page_area(m, offs, len);
				vm_page_clear_dirty(m, offs, len);
				m->valid = VM_PAGE_BITS_ALL;
			} else
				vm_pager_page_unswapped(m);
		}
		vm_page_xunbusy(m);
		vm_page_lock(m);
		if (bp->bio_cmd == BIO_DELETE && len == PAGE_SIZE)
			vm_page_free(m);
		else
			vm_page_activate(m);
		vm_page_unlock(m);
		if (bp->bio_cmd == BIO_WRITE) {
			vm_page_dirty(m);
			vm_pager_page_unswapped(m);
		}

		/* Actions on further pages start at offset 0 */
		p += PAGE_SIZE - offs;
		offs = 0;
		ma_offs += len;
	}
	vm_object_pip_wakeup(sc->object);
	VM_OBJECT_WUNLOCK(sc->object);
	return (rv != VM_PAGER_ERROR ? 0 : ENOSPC);
}
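
/*
 * The null backend has no storage at all: reads return zeros and writes
 * are silently discarded.
 */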
static int
mdstart_null(struct md_s *sc, struct bio *bp)
{

	switch (bp->bio_cmd) {
	case BIO_READ:
		bzero(bp->bio_data, bp->bio_length);
		cpu_flush_dcache(bp->bio_data, bp->bio_length);
		break;
	case BIO_WRITE:
		break;
	}
	bp->bio_resid = 0;
	return (0);
}

static void
md_kthread(void *arg)
{
	struct md_s *sc;
	struct bio *bp;
	int error;

	sc = arg;
	thread_lock(curthread);
	sched_prio(curthread, PRIBIO);
	thread_unlock(curthread);
	if (sc->type == MD_VNODE)
		curthread->td_pflags |= TDP_NORUNNINGBUF;

	for (;;) {
		mtx_lock(&sc->queue_mtx);
		if (sc->flags & MD_SHUTDOWN) {
			sc->flags |= MD_EXITING;
			mtx_unlock(&sc->queue_mtx);
			kproc_exit(0);
		}
		bp = bioq_takefirst(&sc->bio_queue);
		if (!bp) {
			msleep(sc, &sc->queue_mtx, PRIBIO | PDROP, "mdwait", 0);
			continue;
		}
		mtx_unlock(&sc->queue_mtx);
		if (bp->bio_cmd == BIO_GETATTR) {
			if ((sc->fwsectors && sc->fwheads &&
			    (g_handleattr_int(bp, "GEOM::fwsectors",
			    sc->fwsectors) ||
			    g_handleattr_int(bp, "GEOM::fwheads",
			    sc->fwheads))) ||
			    g_handleattr_int(bp, "GEOM::candelete", 1))
				error = -1;
			else
				error = EOPNOTSUPP;
		} else {
			error = sc->start(sc, bp);
		}

		if (error != -1) {
			bp->bio_completed = bp->bio_length;
			if ((bp->bio_cmd == BIO_READ) || (bp->bio_cmd == BIO_WRITE))
				devstat_end_transaction_bio(sc->devstat, bp);
			g_io_deliver(bp, error);
		}
	}
}

static struct md_s *
mdfind(int unit)
{
	struct md_s *sc;

	LIST_FOREACH(sc, &md_softc_list, list) {
		if (sc->unit == unit)
			break;
	}
	return (sc);
}

static struct md_s *
mdnew(int unit, int *errp, enum md_types type)
{
	struct md_s *sc;
	int error;

	*errp = 0;
	if (unit == -1)
		unit = alloc_unr(md_uh);
	else
		unit = alloc_unr_specific(md_uh, unit);

	if (unit == -1) {
		*errp = EBUSY;
		return (NULL);
	}

	sc = (struct md_s *)malloc(sizeof *sc, M_MD, M_WAITOK | M_ZERO);
	sc->type = type;
	bioq_init(&sc->bio_queue);
	mtx_init(&sc->queue_mtx, "md bio queue", NULL, MTX_DEF);
	mtx_init(&sc->stat_mtx, "md stat", NULL, MTX_DEF);
	sc->unit = unit;
	sprintf(sc->name, "md%d", unit);
	LIST_INSERT_HEAD(&md_softc_list, sc, list);
	error = kproc_create(md_kthread, sc, &sc->procp, 0, 0, "%s", sc->name);
	if (error == 0)
		return (sc);
	LIST_REMOVE(sc, list);
	mtx_destroy(&sc->stat_mtx);
	mtx_destroy(&sc->queue_mtx);
	free_unr(md_uh, sc->unit);
	free(sc, M_MD);
	*errp = error;
	return (NULL);
}

static void
mdinit(struct md_s *sc)
{
	struct g_geom *gp;
	struct g_provider *pp;

	g_topology_lock();
	gp = g_new_geomf(&g_md_class, "md%d", sc->unit);
	gp->softc = sc;
	pp = g_new_providerf(gp, "md%d", sc->unit);
	pp->flags |= G_PF_DIRECT_SEND | G_PF_DIRECT_RECEIVE;
	pp->mediasize = sc->mediasize;
	pp->sectorsize = sc->sectorsize;
	switch (sc->type) {
	case MD_MALLOC:
	case MD_VNODE:
	case MD_SWAP:
		pp->flags |= G_PF_ACCEPT_UNMAPPED;
		break;
	case MD_PRELOAD:
	case MD_NULL:
		break;
	}
	sc->gp = gp;
	sc->pp = pp;
	g_error_provider(pp, 0);
	g_topology_unlock();
	sc->devstat = devstat_new_entry("md", sc->unit, sc->sectorsize,
	    DEVSTAT_ALL_SUPPORTED, DEVSTAT_TYPE_DIRECT, DEVSTAT_PRIORITY_MAX);
}
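
/*
 * Configure a malloc-backed device: validate the options and sector size,
 * build the indir tree and the per-device UMA zone, and, when MD_RESERVE
 * is given, preallocate every sector up front so later writes do not have
 * to allocate memory.
 */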
static int
mdcreate_malloc(struct md_s *sc, struct md_ioctl *mdio)
{
	uintptr_t sp;
	int error;
	off_t u;

	error = 0;
	if (mdio->md_options & ~(MD_AUTOUNIT | MD_COMPRESS | MD_RESERVE))
		return (EINVAL);
	if (mdio->md_sectorsize != 0 && !powerof2(mdio->md_sectorsize))
		return (EINVAL);
	/* Compression doesn't make sense if we have reserved space */
	if (mdio->md_options & MD_RESERVE)
		mdio->md_options &= ~MD_COMPRESS;
	if (mdio->md_fwsectors != 0)
		sc->fwsectors = mdio->md_fwsectors;
	if (mdio->md_fwheads != 0)
		sc->fwheads = mdio->md_fwheads;
	sc->flags = mdio->md_options & (MD_COMPRESS | MD_FORCE);
	sc->indir = dimension(sc->mediasize / sc->sectorsize);
	sc->uma = uma_zcreate(sc->name, sc->sectorsize, NULL, NULL, NULL, NULL,
	    0x1ff, 0);
	if (mdio->md_options & MD_RESERVE) {
		off_t nsectors;

		nsectors = sc->mediasize / sc->sectorsize;
		for (u = 0; u < nsectors; u++) {
			sp = (uintptr_t)uma_zalloc(sc->uma, (md_malloc_wait ?
			    M_WAITOK : M_NOWAIT) | M_ZERO);
			if (sp != 0)
				error = s_write(sc->indir, u, sp);
			else
				error = ENOMEM;
			if (error != 0)
				break;
		}
	}
	return (error);
}


static int
mdsetcred(struct md_s *sc, struct ucred *cred)
{
	char *tmpbuf;
	int error = 0;

	/*
	 * Set credentials in our softc
	 */

	if (sc->cred)
		crfree(sc->cred);
	sc->cred = crhold(cred);

	/*
	 * Horrible kludge to establish credentials for NFS  XXX.
	 */

	if (sc->vnode) {
		struct uio auio;
		struct iovec aiov;

		tmpbuf = malloc(sc->sectorsize, M_TEMP, M_WAITOK);
		bzero(&auio, sizeof(auio));

		aiov.iov_base = tmpbuf;
		aiov.iov_len = sc->sectorsize;
		auio.uio_iov = &aiov;
		auio.uio_iovcnt = 1;
		auio.uio_offset = 0;
		auio.uio_rw = UIO_READ;
		auio.uio_segflg = UIO_SYSSPACE;
		auio.uio_resid = aiov.iov_len;
		vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY);
		error = VOP_READ(sc->vnode, &auio, 0, sc->cred);
		VOP_UNLOCK(sc->vnode, 0);
		free(tmpbuf, M_TEMP);
	}
	return (error);
}

static int
mdcreate_vnode(struct md_s *sc, struct md_ioctl *mdio, struct thread *td)
{
	struct vattr vattr;
	struct nameidata nd;
	char *fname;
	int error, flags;

	/*
	 * Kernel-originated requests must have the filename appended
	 * to the mdio structure to protect against malicious software.
	 */
	fname = mdio->md_file;
	if ((void *)fname != (void *)(mdio + 1)) {
		error = copyinstr(fname, sc->file, sizeof(sc->file), NULL);
		if (error != 0)
			return (error);
	} else
		strlcpy(sc->file, fname, sizeof(sc->file));

	/*
	 * If the user specified that this is a read only device, don't
	 * set the FWRITE mask before trying to open the backing store.
	 */
	flags = FREAD | ((mdio->md_options & MD_READONLY) ? 0 : FWRITE);
	NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, sc->file, td);
	error = vn_open(&nd, &flags, 0, NULL);
	if (error != 0)
		return (error);
	NDFREE(&nd, NDF_ONLY_PNBUF);
	if (nd.ni_vp->v_type != VREG) {
		error = EINVAL;
		goto bad;
	}
	error = VOP_GETATTR(nd.ni_vp, &vattr, td->td_ucred);
	if (error != 0)
		goto bad;
	if (VOP_ISLOCKED(nd.ni_vp) != LK_EXCLUSIVE) {
		vn_lock(nd.ni_vp, LK_UPGRADE | LK_RETRY);
		if (nd.ni_vp->v_iflag & VI_DOOMED) {
			/* Forced unmount. */
			error = EBADF;
			goto bad;
		}
	}
	nd.ni_vp->v_vflag |= VV_MD;
	VOP_UNLOCK(nd.ni_vp, 0);

	if (mdio->md_fwsectors != 0)
		sc->fwsectors = mdio->md_fwsectors;
	if (mdio->md_fwheads != 0)
		sc->fwheads = mdio->md_fwheads;
	sc->flags = mdio->md_options & (MD_FORCE | MD_ASYNC);
	if (!(flags & FWRITE))
		sc->flags |= MD_READONLY;
	sc->vnode = nd.ni_vp;

	error = mdsetcred(sc, td->td_ucred);
	if (error != 0) {
		sc->vnode = NULL;
		vn_lock(nd.ni_vp, LK_EXCLUSIVE | LK_RETRY);
		nd.ni_vp->v_vflag &= ~VV_MD;
		goto bad;
	}
	return (0);
bad:
	VOP_UNLOCK(nd.ni_vp, 0);
	(void)vn_close(nd.ni_vp, flags, td->td_ucred, td);
	return (error);
}
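
/*
 * Tear down a device: wither the GEOM geom, tell the worker thread to
 * exit and wait for it, then release the type-specific backing resources
 * (vnode and credentials, swap object, indir tree, UMA zone) before
 * freeing the unit number and the softc.
 */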
static int
mddestroy(struct md_s *sc, struct thread *td)
{

	if (sc->gp) {
		sc->gp->softc = NULL;
		g_topology_lock();
		g_wither_geom(sc->gp, ENXIO);
		g_topology_unlock();
		sc->gp = NULL;
		sc->pp = NULL;
	}
	if (sc->devstat) {
		devstat_remove_entry(sc->devstat);
		sc->devstat = NULL;
	}
	mtx_lock(&sc->queue_mtx);
	sc->flags |= MD_SHUTDOWN;
	wakeup(sc);
	while (!(sc->flags & MD_EXITING))
		msleep(sc->procp, &sc->queue_mtx, PRIBIO, "mddestroy", hz / 10);
	mtx_unlock(&sc->queue_mtx);
	mtx_destroy(&sc->stat_mtx);
	mtx_destroy(&sc->queue_mtx);
	if (sc->vnode != NULL) {
		vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY);
		sc->vnode->v_vflag &= ~VV_MD;
		VOP_UNLOCK(sc->vnode, 0);
		(void)vn_close(sc->vnode, sc->flags & MD_READONLY ?
		    FREAD : (FREAD|FWRITE), sc->cred, td);
	}
	if (sc->cred != NULL)
		crfree(sc->cred);
	if (sc->object != NULL)
		vm_object_deallocate(sc->object);
	if (sc->indir)
		destroy_indir(sc, sc->indir);
	if (sc->uma)
		uma_zdestroy(sc->uma);

	LIST_REMOVE(sc, list);
	free_unr(md_uh, sc->unit);
	free(sc, M_MD);
	return (0);
}

static int
mdresize(struct md_s *sc, struct md_ioctl *mdio)
{
	int error, res;
	vm_pindex_t oldpages, newpages;

	switch (sc->type) {
	case MD_VNODE:
	case MD_NULL:
		break;
	case MD_SWAP:
		if (mdio->md_mediasize <= 0 ||
		    (mdio->md_mediasize % PAGE_SIZE) != 0)
			return (EDOM);
		oldpages = OFF_TO_IDX(round_page(sc->mediasize));
		newpages = OFF_TO_IDX(round_page(mdio->md_mediasize));
		if (newpages < oldpages) {
			VM_OBJECT_WLOCK(sc->object);
			vm_object_page_remove(sc->object, newpages, 0, 0);
			swap_pager_freespace(sc->object, newpages,
			    oldpages - newpages);
			swap_release_by_cred(IDX_TO_OFF(oldpages -
			    newpages), sc->cred);
			sc->object->charge = IDX_TO_OFF(newpages);
			sc->object->size = newpages;
			VM_OBJECT_WUNLOCK(sc->object);
		} else if (newpages > oldpages) {
			res = swap_reserve_by_cred(IDX_TO_OFF(newpages -
			    oldpages), sc->cred);
			if (!res)
				return (ENOMEM);
			if ((mdio->md_options & MD_RESERVE) ||
			    (sc->flags & MD_RESERVE)) {
				error = swap_pager_reserve(sc->object,
				    oldpages, newpages - oldpages);
				if (error < 0) {
					swap_release_by_cred(
					    IDX_TO_OFF(newpages - oldpages),
					    sc->cred);
					return (EDOM);
				}
			}
			VM_OBJECT_WLOCK(sc->object);
			sc->object->charge = IDX_TO_OFF(newpages);
			sc->object->size = newpages;
			VM_OBJECT_WUNLOCK(sc->object);
		}
		break;
	default:
		return (EOPNOTSUPP);
	}

	sc->mediasize = mdio->md_mediasize;
	g_topology_lock();
	g_resize_provider(sc->pp, sc->mediasize);
	g_topology_unlock();
	return (0);
}

static int
mdcreate_swap(struct md_s *sc, struct md_ioctl *mdio, struct thread *td)
{
	vm_ooffset_t npage;
	int error;

	/*
	 * Range check.  Disallow negative sizes and sizes not being
	 * multiple of page size.
	 */
	if (sc->mediasize <= 0 || (sc->mediasize % PAGE_SIZE) != 0)
		return (EDOM);

	/*
	 * Allocate an OBJT_SWAP object.
	 *
	 * Note the truncation.
	 */

	npage = mdio->md_mediasize / PAGE_SIZE;
	if (mdio->md_fwsectors != 0)
		sc->fwsectors = mdio->md_fwsectors;
	if (mdio->md_fwheads != 0)
		sc->fwheads = mdio->md_fwheads;
	sc->object = vm_pager_allocate(OBJT_SWAP, NULL, PAGE_SIZE * npage,
	    VM_PROT_DEFAULT, 0, td->td_ucred);
	if (sc->object == NULL)
		return (ENOMEM);
	sc->flags = mdio->md_options & (MD_FORCE | MD_RESERVE);
	if (mdio->md_options & MD_RESERVE) {
		if (swap_pager_reserve(sc->object, 0, npage) < 0) {
			error = EDOM;
			goto finish;
		}
	}
	error = mdsetcred(sc, td->td_ucred);
finish:
	if (error != 0) {
		vm_object_deallocate(sc->object);
		sc->object = NULL;
	}
	return (error);
}
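
/*
 * A null device needs no backing store; only the requested size is
 * sanity checked.
 */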
static int
mdcreate_null(struct md_s *sc, struct md_ioctl *mdio, struct thread *td)
{

	/*
	 * Range check.  Disallow negative sizes and sizes not being
	 * multiple of page size.
	 */
	if (sc->mediasize <= 0 || (sc->mediasize % PAGE_SIZE) != 0)
		return (EDOM);

	return (0);
}

static int
xmdctlioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, struct thread *td)
{
	struct md_ioctl *mdio;
	struct md_s *sc;
	int error, i;
	unsigned sectsize;

	if (md_debug)
		printf("mdctlioctl(%s %lx %p %x %p)\n",
			devtoname(dev), cmd, addr, flags, td);

	mdio = (struct md_ioctl *)addr;
	if (mdio->md_version != MDIOVERSION)
		return (EINVAL);

	/*
	 * We assert the version number in the individual ioctl
	 * handlers instead of out here because (a) it is possible we
	 * may add another ioctl in the future which doesn't read an
	 * mdio, and (b) the correct return value for an unknown ioctl
	 * is ENOIOCTL, not EINVAL.
	 */
	error = 0;
	switch (cmd) {
	case MDIOCATTACH:
		switch (mdio->md_type) {
		case MD_MALLOC:
		case MD_PRELOAD:
		case MD_VNODE:
		case MD_SWAP:
		case MD_NULL:
			break;
		default:
			return (EINVAL);
		}
		if (mdio->md_sectorsize == 0)
			sectsize = DEV_BSIZE;
		else
			sectsize = mdio->md_sectorsize;
		if (sectsize > MAXPHYS || mdio->md_mediasize < sectsize)
			return (EINVAL);
		if (mdio->md_options & MD_AUTOUNIT)
			sc = mdnew(-1, &error, mdio->md_type);
		else {
			if (mdio->md_unit > INT_MAX)
				return (EINVAL);
			sc = mdnew(mdio->md_unit, &error, mdio->md_type);
		}
		if (sc == NULL)
			return (error);
		if (mdio->md_options & MD_AUTOUNIT)
			mdio->md_unit = sc->unit;
		sc->mediasize = mdio->md_mediasize;
		sc->sectorsize = sectsize;
		error = EDOOFUS;
		switch (sc->type) {
		case MD_MALLOC:
			sc->start = mdstart_malloc;
			error = mdcreate_malloc(sc, mdio);
			break;
		case MD_PRELOAD:
			/*
			 * We disallow attaching preloaded memory disks via
			 * ioctl.  Preloaded memory disks are automatically
			 * attached in g_md_init().
			 */
			error = EOPNOTSUPP;
			break;
		case MD_VNODE:
			sc->start = mdstart_vnode;
			error = mdcreate_vnode(sc, mdio, td);
			break;
		case MD_SWAP:
			sc->start = mdstart_swap;
			error = mdcreate_swap(sc, mdio, td);
			break;
		case MD_NULL:
			sc->start = mdstart_null;
			error = mdcreate_null(sc, mdio, td);
			break;
		}
		if (error != 0) {
			mddestroy(sc, td);
			return (error);
		}

		/* Prune off any residual fractional sector */
		i = sc->mediasize % sc->sectorsize;
		sc->mediasize -= i;

		mdinit(sc);
		return (0);
	case MDIOCDETACH:
		if (mdio->md_mediasize != 0 ||
		    (mdio->md_options & ~MD_FORCE) != 0)
			return (EINVAL);

		sc = mdfind(mdio->md_unit);
		if (sc == NULL)
			return (ENOENT);
		if (sc->opencount != 0 && !(sc->flags & MD_FORCE) &&
		    !(mdio->md_options & MD_FORCE))
			return (EBUSY);
		return (mddestroy(sc, td));
	case MDIOCRESIZE:
		if ((mdio->md_options & ~(MD_FORCE | MD_RESERVE)) != 0)
			return (EINVAL);

		sc = mdfind(mdio->md_unit);
		if (sc == NULL)
			return (ENOENT);
		if (mdio->md_mediasize < sc->sectorsize)
			return (EINVAL);
		if (mdio->md_mediasize < sc->mediasize &&
		    !(sc->flags & MD_FORCE) &&
		    !(mdio->md_options & MD_FORCE))
			return (EBUSY);
		return (mdresize(sc, mdio));
	case MDIOCQUERY:
		sc = mdfind(mdio->md_unit);
		if (sc == NULL)
			return (ENOENT);
		mdio->md_type = sc->type;
		mdio->md_options = sc->flags;
		mdio->md_mediasize = sc->mediasize;
		mdio->md_sectorsize = sc->sectorsize;
		if (sc->type == MD_VNODE)
			error = copyout(sc->file, mdio->md_file,
			    strlen(sc->file) + 1);
		return (error);
	case MDIOCLIST:
		i = 1;
		LIST_FOREACH(sc, &md_softc_list, list) {
			if (i == MDNPAD - 1)
				mdio->md_pad[i] = -1;
			else
				mdio->md_pad[i++] = sc->unit;
		}
		mdio->md_pad[0] = i - 1;
		return (0);
	default:
		return (ENOIOCTL);
	}
}

static int
mdctlioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, struct thread *td)
{
	int error;

	sx_xlock(&md_sx);
	error = xmdctlioctl(dev, cmd, addr, flags, td);
	sx_xunlock(&md_sx);
	return (error);
}

static void
md_preloaded(u_char *image, size_t length, const char *name)
{
	struct md_s *sc;
	int error;

	sc = mdnew(-1, &error, MD_PRELOAD);
	if (sc == NULL)
		return;
	sc->mediasize = length;
	sc->sectorsize = DEV_BSIZE;
	sc->pl_ptr = image;
	sc->pl_len = length;
	sc->start = mdstart_preload;
#ifdef MD_ROOT
	if (sc->unit == 0)
		rootdevnames[0] = MD_ROOT_FSTYPE ":/dev/md0";
#endif
	mdinit(sc);
	if (name != NULL) {
		printf("%s%d: Preloaded image <%s> %zd bytes at %p\n",
		    MD_NAME, sc->unit, name, length, image);
	} else {
		printf("%s%d: Embedded image %zd bytes at %p\n",
		    MD_NAME, sc->unit, length, image);
	}
}

static void
g_md_init(struct g_class *mp __unused)
{
	caddr_t mod;
	u_char *ptr, *name, *type;
	unsigned len;
	int i;

	/* figure out log2(NINDIR) */
	for (i = NINDIR, nshift = -1; i; nshift++)
		i >>= 1;

	mod = NULL;
	sx_init(&md_sx, "MD config lock");
	g_topology_unlock();
	md_uh = new_unrhdr(0, INT_MAX, NULL);
#ifdef MD_ROOT
	if (mfs_root_size != 0) {
		sx_xlock(&md_sx);
		md_preloaded(__DEVOLATILE(u_char *, &mfs_root), mfs_root_size,
		    NULL);
		sx_xunlock(&md_sx);
	}
#endif
	/* XXX: are preload_* static or do they need Giant ? */
	while ((mod = preload_search_next_name(mod)) != NULL) {
		name = (char *)preload_search_info(mod, MODINFO_NAME);
		if (name == NULL)
			continue;
		type = (char *)preload_search_info(mod, MODINFO_TYPE);
		if (type == NULL)
			continue;
		if (strcmp(type, "md_image") && strcmp(type, "mfs_root"))
			continue;
		ptr = preload_fetch_addr(mod);
		len = preload_fetch_size(mod);
		if (ptr != NULL && len != 0) {
			sx_xlock(&md_sx);
			md_preloaded(ptr, len, name);
			sx_xunlock(&md_sx);
		}
	}
	md_vnode_pbuf_freecnt = nswbuf / 10;
	status_dev = make_dev(&mdctl_cdevsw, INT_MAX, UID_ROOT, GID_WHEEL,
	    0600, MDCTL_NAME);
	g_topology_lock();
}

static void
g_md_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
    struct g_consumer *cp __unused, struct g_provider *pp)
{
	struct md_s *mp;
	char *type;

	mp = gp->softc;
	if (mp == NULL)
		return;

	switch (mp->type) {
	case MD_MALLOC:
		type = "malloc";
		break;
	case MD_PRELOAD:
		type = "preload";
		break;
	case MD_VNODE:
		type = "vnode";
		break;
	case MD_SWAP:
		type = "swap";
		break;
	case MD_NULL:
		type = "null";
		break;
	default:
		type = "unknown";
		break;
	}

	if (pp != NULL) {
		if (indent == NULL) {
			sbuf_printf(sb, " u %d", mp->unit);
			sbuf_printf(sb, " s %ju", (uintmax_t) mp->sectorsize);
			sbuf_printf(sb, " f %ju", (uintmax_t) mp->fwheads);
			sbuf_printf(sb, " fs %ju", (uintmax_t) mp->fwsectors);
			sbuf_printf(sb, " l %ju", (uintmax_t) mp->mediasize);
			sbuf_printf(sb, " t %s", type);
			if (mp->type == MD_VNODE && mp->vnode != NULL)
				sbuf_printf(sb, " file %s", mp->file);
		} else {
			sbuf_printf(sb, "%s<unit>%d</unit>\n", indent,
			    mp->unit);
			sbuf_printf(sb, "%s<sectorsize>%ju</sectorsize>\n",
			    indent, (uintmax_t) mp->sectorsize);
			sbuf_printf(sb, "%s<fwheads>%ju</fwheads>\n",
			    indent, (uintmax_t) mp->fwheads);
			sbuf_printf(sb, "%s<fwsectors>%ju</fwsectors>\n",
			    indent, (uintmax_t) mp->fwsectors);
			sbuf_printf(sb, "%s<length>%ju</length>\n",
			    indent, (uintmax_t) mp->mediasize);
			sbuf_printf(sb, "%s<compression>%s</compression>\n", indent,
			    (mp->flags & MD_COMPRESS) == 0 ? "off": "on");
			sbuf_printf(sb, "%s<access>%s</access>\n", indent,
			    (mp->flags & MD_READONLY) == 0 ? "read-write":
			    "read-only");
			sbuf_printf(sb, "%s<type>%s</type>\n", indent,
			    type);
			if (mp->type == MD_VNODE && mp->vnode != NULL) {
				sbuf_printf(sb, "%s<file>", indent);
				g_conf_printf_escaped(sb, "%s", mp->file);
				sbuf_printf(sb, "</file>\n");
			}
		}
	}
}

static void
g_md_fini(struct g_class *mp __unused)
{

	sx_destroy(&md_sx);
	if (status_dev != NULL)
		destroy_dev(status_dev);
	delete_unrhdr(md_uh);
}