/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2012 Garrett D'Amore <garrett@damore.org>. All rights reserved.
 * Copyright 2012 Alexey Zaytsev <alexey.zaytsev@gmail.com> All rights reserved.
 * Copyright 2017 The MathWorks, Inc. All rights reserved.
 * Copyright 2020 Joyent, Inc.
 * Copyright 2022 OmniOS Community Edition (OmniOSce) Association.
 * Copyright 2022 Tintri by DDN, Inc. All rights reserved.
 * Copyright 2023 Oxide Computer Company
 */

#include <sys/types.h>
#include <sys/ksynch.h>
#include <sys/kmem.h>
#include <sys/file.h>
#include <sys/errno.h>
#include <sys/open.h>
#include <sys/buf.h>
#include <sys/uio.h>
#include <sys/aio_req.h>
#include <sys/cred.h>
#include <sys/modctl.h>
#include <sys/cmlb.h>
#include <sys/conf.h>
#include <sys/devops.h>
#include <sys/list.h>
#include <sys/sysmacros.h>
#include <sys/dkio.h>
#include <sys/dkioc_free_util.h>
#include <sys/vtoc.h>
#include <sys/scsi/scsi.h>	/* for DTYPE_DIRECT */
#include <sys/kstat.h>
#include <sys/fs/dv_node.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/note.h>
#include <sys/blkdev.h>
#include <sys/scsi/impl/inquiry.h>
#include <sys/taskq.h>
#include <sys/taskq_impl.h>
#include <sys/disp.h>
#include <sys/sysevent/eventdefs.h>
#include <sys/sysevent/dev.h>

/*
 * blkdev is a driver which provides a lot of the common functionality
 * a block device driver may need and helps by removing code which
 * is frequently duplicated in block device drivers.
 *
 * Within this driver all the struct cb_ops functions required for a
 * block device driver are written with appropriate call back functions
 * to be provided by the parent driver.
 *
 * To use blkdev, a driver needs to:
 *	1. Create a bd_ops_t structure which has the call back operations
 *	   blkdev will use.
 *	2. Create a handle by calling bd_alloc_handle(). One of the
 *	   arguments to this function is the bd_ops_t.
 *	3. Call bd_attach_handle(). This will instantiate a blkdev device
 *	   as a child device node of the calling driver.
 *
 * A parent driver is not restricted to just allocating and attaching a
 * single instance, it may attach as many as it wishes. For each handle
 * attached, appropriate entries in /dev/[r]dsk are created.
 *
 * The bd_ops_t routines that a parent of blkdev needs to provide are:
 *
 * o_drive_info: Provide information to blkdev such as how many I/O queues
 *		to create and the size of those queues. Also some device
 *		specifics such as EUI, vendor, product, model, serial
 *		number ....
 *
 * o_media_info: Provide information about the media, e.g. size and
 *		block size.
 *
 * o_devid_init: Creates and initializes the device id. Typically calls
 *		ddi_devid_init().
 *
 * o_sync_cache: Issues a device appropriate command to flush any write
 *		caches.
 *
 * o_read: Read data as described by bd_xfer_t argument.
 *
 * o_write: Write data as described by bd_xfer_t argument.
 *
 * o_free_space: Free the space described by bd_xfer_t argument (optional).
 *
 * Queues
 * ------
 * Part of the drive_info data is a queue count. blkdev will create
 * "queue count" number of waitq/runq pairs. Each waitq/runq pair
 * operates independently. As an I/O is scheduled up to the parent
 * driver via o_read or o_write its queue number is given. If the
 * parent driver supports multiple hardware queues it can then select
 * where to submit the I/O request.
 *
 * Currently blkdev uses a simplistic round-robin queue selection method.
 * It has the advantage that it is lockless. In the future it will be
 * worthwhile reviewing this strategy for something which prioritizes queues
 * depending on how busy they are.
 *
 * Each waitq/runq pair is protected by its mutex (q_iomutex). Incoming
 * I/O requests are initially added to the waitq. They are taken off the
 * waitq, added to the runq and submitted, providing the runq is less
 * than the qsize as specified in the drive_info. As an I/O request
 * completes, the parent driver is required to call bd_xfer_done(), which
 * will remove the I/O request from the runq and pass I/O completion
 * status up the stack.
 *
 * Locks
 * -----
 * There are five instance-global locks: d_ocmutex, d_ksmutex, d_errmutex,
 * d_statemutex and d_dle_mutex, as well as a q_iomutex per waitq/runq pair.
 *
 * Lock Hierarchy
 * --------------
 * The only two locks which may be held simultaneously are q_iomutex and
 * d_ksmutex. In all cases q_iomutex must be acquired before d_ksmutex.
 */
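
/*
 * For illustration only, a minimal parent driver might wire itself up to
 * blkdev roughly as sketched below. This is not part of blkdev itself;
 * the xd_* names (xd_state_t, xd_dma_attr, xd_drive_info, xd_media_info,
 * xd_sync_cache, xd_read, xd_write) are hypothetical stand-ins for
 * whatever the parent driver provides:
 *
 *	static bd_ops_t xd_bd_ops = {
 *		.o_version	= BD_OPS_VERSION_2,
 *		.o_drive_info	= xd_drive_info,
 *		.o_media_info	= xd_media_info,
 *		.o_devid_init	= NULL,
 *		.o_sync_cache	= xd_sync_cache,
 *		.o_read		= xd_read,
 *		.o_write	= xd_write,
 *		.o_free_space	= NULL,
 *	};
 *
 *	static int
 *	xd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
 *	{
 *		xd_state_t *xd;
 *
 *		... obtain soft state, set up the hardware ...
 *
 *		xd->xd_bdh = bd_alloc_handle(xd, &xd_bd_ops,
 *		    &xd_dma_attr, KM_SLEEP);
 *		if (xd->xd_bdh == NULL)
 *			return (DDI_FAILURE);
 *		if (bd_attach_handle(dip, xd->xd_bdh) != DDI_SUCCESS) {
 *			bd_free_handle(xd->xd_bdh);
 *			xd->xd_bdh = NULL;
 *			return (DDI_FAILURE);
 *		}
 *		return (DDI_SUCCESS);
 *	}
 */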

#define	BD_MAXPART	64
#define	BDINST(dev)	(getminor(dev) / BD_MAXPART)
#define	BDPART(dev)	(getminor(dev) % BD_MAXPART)

typedef struct bd bd_t;
typedef struct bd_xfer_impl bd_xfer_impl_t;
typedef struct bd_queue bd_queue_t;

typedef enum {
	BD_DLE_PENDING	= 1 << 0,
	BD_DLE_RUNNING	= 1 << 1
} bd_dle_state_t;

struct bd {
	void		*d_private;
	dev_info_t	*d_dip;
	kmutex_t	d_ocmutex;		/* open/close */
	kmutex_t	d_ksmutex;		/* kstat */
	kmutex_t	d_errmutex;
	kmutex_t	d_statemutex;
	kcondvar_t	d_statecv;
	enum dkio_state	d_state;
	cmlb_handle_t	d_cmlbh;
	unsigned	d_open_lyr[BD_MAXPART];	/* open count */
	uint64_t	d_open_excl;	/* bit mask indexed by partition */
	uint64_t	d_open_reg[OTYPCNT];	/* bit mask */
	uint64_t	d_io_counter;

	uint32_t	d_qcount;
	uint32_t	d_qactive;
	uint32_t	d_maxxfer;
	uint32_t	d_blkshift;
	uint32_t	d_pblkshift;
	uint64_t	d_numblks;
	ddi_devid_t	d_devid;

	uint64_t	d_max_free_seg;
	uint64_t	d_max_free_blks;
	uint64_t	d_max_free_seg_blks;
	uint64_t	d_free_align;

	kmem_cache_t	*d_cache;
	bd_queue_t	*d_queues;
	kstat_t		*d_ksp;
	kstat_io_t	*d_kiop;
	kstat_t		*d_errstats;
	struct bd_errstats *d_kerr;

	boolean_t	d_rdonly;
	boolean_t	d_ssd;
	boolean_t	d_removable;
	boolean_t	d_hotpluggable;
	boolean_t	d_use_dma;

	ddi_dma_attr_t	d_dma;
	bd_ops_t	d_ops;
	bd_handle_t	d_handle;

	kmutex_t	d_dle_mutex;
	taskq_ent_t	d_dle_ent;
	bd_dle_state_t	d_dle_state;
};

struct bd_handle {
	bd_ops_t	h_ops;
	ddi_dma_attr_t	*h_dma;
	dev_info_t	*h_parent;
	dev_info_t	*h_child;
	void		*h_private;
	bd_t		*h_bd;
	char		*h_name;
	char		h_addr[50];	/* enough for w%0.32x,%X */
};

struct bd_xfer_impl {
	bd_xfer_t	i_public;
	list_node_t	i_linkage;
	bd_t		*i_bd;
	buf_t		*i_bp;
	bd_queue_t	*i_bq;
	uint_t		i_num_win;
	uint_t		i_cur_win;
	off_t		i_offset;
	int		(*i_func)(void *, bd_xfer_t *);
	uint32_t	i_blkshift;
	size_t		i_len;
	size_t		i_resid;
};

struct bd_queue {
	kmutex_t	q_iomutex;
	uint32_t	q_qsize;
	uint32_t	q_qactive;
	list_t		q_runq;
	list_t		q_waitq;
};

#define	i_dmah		i_public.x_dmah
#define	i_dmac		i_public.x_dmac
#define	i_ndmac		i_public.x_ndmac
#define	i_kaddr		i_public.x_kaddr
#define	i_nblks		i_public.x_nblks
#define	i_blkno		i_public.x_blkno
#define	i_flags		i_public.x_flags
#define	i_qnum		i_public.x_qnum
#define	i_dfl		i_public.x_dfl

#define	CAN_FREESPACE(bd) \
	(((bd)->d_ops.o_free_space == NULL) ? B_FALSE : B_TRUE)

/*
 * Private prototypes.
 */

static void bd_prop_update_inqstring(dev_info_t *, char *, char *, size_t);
static void bd_create_inquiry_props(dev_info_t *, bd_drive_t *);
static void bd_create_errstats(bd_t *, int, bd_drive_t *);
static void bd_destroy_errstats(bd_t *);
static void bd_errstats_setstr(kstat_named_t *, char *, size_t, char *);
static void bd_init_errstats(bd_t *, bd_drive_t *);
static void bd_fini_errstats(bd_t *);

static int bd_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
static int bd_attach(dev_info_t *, ddi_attach_cmd_t);
static int bd_detach(dev_info_t *, ddi_detach_cmd_t);

static int bd_open(dev_t *, int, int, cred_t *);
static int bd_close(dev_t, int, int, cred_t *);
static int bd_strategy(struct buf *);
static int bd_ioctl(dev_t, int, intptr_t, int, cred_t *, int *);
static int bd_dump(dev_t, caddr_t, daddr_t, int);
static int bd_read(dev_t, struct uio *, cred_t *);
static int bd_write(dev_t, struct uio *, cred_t *);
static int bd_aread(dev_t, struct aio_req *, cred_t *);
static int bd_awrite(dev_t, struct aio_req *, cred_t *);
static int bd_prop_op(dev_t, dev_info_t *, ddi_prop_op_t, int, char *,
    caddr_t, int *);

static int bd_tg_rdwr(dev_info_t *, uchar_t, void *, diskaddr_t, size_t,
    void *);
static int bd_tg_getinfo(dev_info_t *, int, void *, void *);
static int bd_xfer_ctor(void *, void *, int);
static void bd_xfer_dtor(void *, void *);
static void bd_sched(bd_t *, bd_queue_t *);
static void bd_submit(bd_t *, bd_xfer_impl_t *);
static void bd_runq_exit(bd_xfer_impl_t *, int);
static void bd_update_state(bd_t *);
static int bd_check_state(bd_t *, enum dkio_state *);
static int bd_flush_write_cache(bd_t *, struct dk_callback *);
static int bd_check_uio(dev_t, struct uio *);
static int bd_free_space(dev_t, bd_t *, dkioc_free_list_t *);

struct cmlb_tg_ops bd_tg_ops = {
	TG_DK_OPS_VERSION_1,
	bd_tg_rdwr,
	bd_tg_getinfo,
};

static struct cb_ops bd_cb_ops = {
	bd_open,		/* open */
	bd_close,		/* close */
	bd_strategy,		/* strategy */
	nodev,			/* print */
	bd_dump,		/* dump */
	bd_read,		/* read */
	bd_write,		/* write */
	bd_ioctl,		/* ioctl */
	nodev,			/* devmap */
	nodev,			/* mmap */
	nodev,			/* segmap */
	nochpoll,		/* poll */
	bd_prop_op,		/* cb_prop_op */
	0,			/* streamtab */
	D_64BIT | D_MP,		/* Driver compatibility flag */
	CB_REV,			/* cb_rev */
	bd_aread,		/* async read */
	bd_awrite		/* async write */
};

struct dev_ops bd_dev_ops = {
	DEVO_REV,		/* devo_rev, */
	0,			/* refcnt */
	bd_getinfo,		/* getinfo */
	nulldev,		/* identify */
	nulldev,		/* probe */
	bd_attach,		/* attach */
	bd_detach,		/* detach */
	nodev,			/* reset */
	&bd_cb_ops,		/* driver operations */
	NULL,			/* bus operations */
	NULL,			/* power */
	ddi_quiesce_not_needed,	/* quiesce */
};

static struct modldrv modldrv = {
	&mod_driverops,
	"Generic Block Device",
	&bd_dev_ops,
};

static struct modlinkage modlinkage = {
	MODREV_1, { &modldrv, NULL }
};

static void *bd_state;
static krwlock_t bd_lock;
static taskq_t *bd_taskq;

int
_init(void)
{
	char taskq_name[TASKQ_NAMELEN];
	const char *name;
	int rv;

	rv = ddi_soft_state_init(&bd_state, sizeof (struct bd), 2);
	if (rv != DDI_SUCCESS)
		return (rv);

	name = mod_modname(&modlinkage);
	(void) snprintf(taskq_name, sizeof (taskq_name), "%s_taskq", name);
	bd_taskq = taskq_create(taskq_name, 1, minclsyspri, 0, 0, 0);
	if (bd_taskq == NULL) {
		cmn_err(CE_WARN, "%s: unable to create %s", name, taskq_name);
		ddi_soft_state_fini(&bd_state);
		return (DDI_FAILURE);
	}

	rw_init(&bd_lock, NULL, RW_DRIVER, NULL);

	rv = mod_install(&modlinkage);
	if (rv != DDI_SUCCESS) {
		rw_destroy(&bd_lock);
		taskq_destroy(bd_taskq);
		ddi_soft_state_fini(&bd_state);
	}
	return (rv);
}

int
_fini(void)
{
	int rv;

	rv = mod_remove(&modlinkage);
	if (rv == DDI_SUCCESS) {
		rw_destroy(&bd_lock);
		taskq_destroy(bd_taskq);
		ddi_soft_state_fini(&bd_state);
	}
	return (rv);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}

static int
bd_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resultp)
{
	bd_t	*bd;
	minor_t	inst;

	_NOTE(ARGUNUSED(dip));

	inst = BDINST((dev_t)arg);

	switch (cmd) {
	case DDI_INFO_DEVT2DEVINFO:
		bd = ddi_get_soft_state(bd_state, inst);
		if (bd == NULL) {
			return (DDI_FAILURE);
		}
		*resultp = (void *)bd->d_dip;
		break;

	case DDI_INFO_DEVT2INSTANCE:
		*resultp = (void *)(intptr_t)inst;
		break;

	default:
		return (DDI_FAILURE);
	}
	return (DDI_SUCCESS);
}

static void
bd_prop_update_inqstring(dev_info_t *dip, char *name, char *data, size_t len)
{
	int	ilen;
	char	*data_string;

	ilen = scsi_ascii_inquiry_len(data, len);
	ASSERT3U(ilen, <=, len);
	if (ilen <= 0)
		return;
	/* ensure null termination */
	data_string = kmem_zalloc(ilen + 1, KM_SLEEP);
	bcopy(data, data_string, ilen);
	(void) ndi_prop_update_string(DDI_DEV_T_NONE, dip, name, data_string);
	kmem_free(data_string, ilen + 1);
}

static void
bd_create_inquiry_props(dev_info_t *dip, bd_drive_t *drive)
{
	if (drive->d_vendor_len > 0)
		bd_prop_update_inqstring(dip, INQUIRY_VENDOR_ID,
		    drive->d_vendor, drive->d_vendor_len);

	if (drive->d_product_len > 0)
		bd_prop_update_inqstring(dip, INQUIRY_PRODUCT_ID,
		    drive->d_product, drive->d_product_len);

	if (drive->d_serial_len > 0)
		bd_prop_update_inqstring(dip, INQUIRY_SERIAL_NO,
		    drive->d_serial, drive->d_serial_len);

	if (drive->d_revision_len > 0)
		bd_prop_update_inqstring(dip, INQUIRY_REVISION_ID,
		    drive->d_revision, drive->d_revision_len);
}

static void
bd_create_errstats(bd_t *bd, int inst, bd_drive_t *drive)
{
	char	ks_module[KSTAT_STRLEN];
	char	ks_name[KSTAT_STRLEN];
	int	ndata = sizeof (struct bd_errstats) / sizeof (kstat_named_t);

	if (bd->d_errstats != NULL)
		return;

	(void) snprintf(ks_module, sizeof (ks_module), "%serr",
	    ddi_driver_name(bd->d_dip));
	(void) snprintf(ks_name, sizeof (ks_name), "%s%d,err",
	    ddi_driver_name(bd->d_dip), inst);

	bd->d_errstats = kstat_create(ks_module, inst, ks_name, "device_error",
	    KSTAT_TYPE_NAMED, ndata, KSTAT_FLAG_PERSISTENT);

	mutex_init(&bd->d_errmutex, NULL, MUTEX_DRIVER, NULL);
	if (bd->d_errstats == NULL) {
		/*
		 * Even if we cannot create the kstat, we create a
		 * scratch kstat. The reason for this is to ensure
		 * that we can update the kstat all of the time,
		 * without adding an extra branch instruction.
		 */
		bd->d_kerr = kmem_zalloc(sizeof (struct bd_errstats),
		    KM_SLEEP);
	} else {
		bd->d_errstats->ks_lock = &bd->d_errmutex;
		bd->d_kerr = (struct bd_errstats *)bd->d_errstats->ks_data;
	}

	kstat_named_init(&bd->d_kerr->bd_softerrs, "Soft Errors",
	    KSTAT_DATA_UINT32);
	kstat_named_init(&bd->d_kerr->bd_harderrs, "Hard Errors",
	    KSTAT_DATA_UINT32);
	kstat_named_init(&bd->d_kerr->bd_transerrs, "Transport Errors",
	    KSTAT_DATA_UINT32);

	if (drive->d_model_len > 0) {
		kstat_named_init(&bd->d_kerr->bd_model, "Model",
		    KSTAT_DATA_STRING);
	} else {
		kstat_named_init(&bd->d_kerr->bd_vid, "Vendor",
		    KSTAT_DATA_STRING);
		kstat_named_init(&bd->d_kerr->bd_pid, "Product",
		    KSTAT_DATA_STRING);
	}

	kstat_named_init(&bd->d_kerr->bd_revision, "Revision",
	    KSTAT_DATA_STRING);
	kstat_named_init(&bd->d_kerr->bd_serial, "Serial No",
	    KSTAT_DATA_STRING);
	kstat_named_init(&bd->d_kerr->bd_capacity, "Size",
	    KSTAT_DATA_ULONGLONG);
	kstat_named_init(&bd->d_kerr->bd_rq_media_err, "Media Error",
	    KSTAT_DATA_UINT32);
	kstat_named_init(&bd->d_kerr->bd_rq_ntrdy_err, "Device Not Ready",
	    KSTAT_DATA_UINT32);
	kstat_named_init(&bd->d_kerr->bd_rq_nodev_err, "No Device",
	    KSTAT_DATA_UINT32);
	kstat_named_init(&bd->d_kerr->bd_rq_recov_err, "Recoverable",
	    KSTAT_DATA_UINT32);
	kstat_named_init(&bd->d_kerr->bd_rq_illrq_err, "Illegal Request",
	    KSTAT_DATA_UINT32);
	kstat_named_init(&bd->d_kerr->bd_rq_pfa_err,
	    "Predictive Failure Analysis", KSTAT_DATA_UINT32);

	bd->d_errstats->ks_private = bd;

	kstat_install(bd->d_errstats);
	bd_init_errstats(bd, drive);
}

static void
bd_destroy_errstats(bd_t *bd)
{
	if (bd->d_errstats != NULL) {
		bd_fini_errstats(bd);
		kstat_delete(bd->d_errstats);
		bd->d_errstats = NULL;
	} else {
		kmem_free(bd->d_kerr, sizeof (struct bd_errstats));
		bd->d_kerr = NULL;
		mutex_destroy(&bd->d_errmutex);
	}
}

static void
bd_errstats_setstr(kstat_named_t *k, char *str, size_t len, char *alt)
{
	char	*tmp;
	size_t	km_len;

	if (KSTAT_NAMED_STR_PTR(k) == NULL) {
		if (len > 0)
			km_len = strnlen(str, len);
		else if (alt != NULL)
			km_len = strlen(alt);
		else
			return;

		tmp = kmem_alloc(km_len + 1, KM_SLEEP);
		bcopy(len > 0 ? str : alt, tmp, km_len);
		tmp[km_len] = '\0';

		kstat_named_setstr(k, tmp);
	}
}

static void
bd_errstats_clrstr(kstat_named_t *k)
{
	if (KSTAT_NAMED_STR_PTR(k) == NULL)
		return;

	kmem_free(KSTAT_NAMED_STR_PTR(k), KSTAT_NAMED_STR_BUFLEN(k));
	kstat_named_setstr(k, NULL);
}

static void
bd_init_errstats(bd_t *bd, bd_drive_t *drive)
{
	struct bd_errstats	*est = bd->d_kerr;

	mutex_enter(&bd->d_errmutex);

	if (drive->d_model_len > 0 &&
	    KSTAT_NAMED_STR_PTR(&est->bd_model) == NULL) {
		bd_errstats_setstr(&est->bd_model, drive->d_model,
		    drive->d_model_len, NULL);
	} else {
		bd_errstats_setstr(&est->bd_vid, drive->d_vendor,
		    drive->d_vendor_len, "Unknown ");
		bd_errstats_setstr(&est->bd_pid, drive->d_product,
		    drive->d_product_len, "Unknown ");
	}

	bd_errstats_setstr(&est->bd_revision, drive->d_revision,
	    drive->d_revision_len, "0001");
	bd_errstats_setstr(&est->bd_serial, drive->d_serial,
	    drive->d_serial_len, "0 ");

	mutex_exit(&bd->d_errmutex);
}

static void
bd_fini_errstats(bd_t *bd)
{
	struct bd_errstats	*est = bd->d_kerr;

	mutex_enter(&bd->d_errmutex);

	bd_errstats_clrstr(&est->bd_model);
	bd_errstats_clrstr(&est->bd_vid);
	bd_errstats_clrstr(&est->bd_pid);
	bd_errstats_clrstr(&est->bd_revision);
	bd_errstats_clrstr(&est->bd_serial);

	mutex_exit(&bd->d_errmutex);
}

static void
bd_queues_free(bd_t *bd)
{
	uint32_t i;

	for (i = 0; i < bd->d_qcount; i++) {
		bd_queue_t *bq = &bd->d_queues[i];

		mutex_destroy(&bq->q_iomutex);
		list_destroy(&bq->q_waitq);
		list_destroy(&bq->q_runq);
	}

	kmem_free(bd->d_queues, sizeof (*bd->d_queues) * bd->d_qcount);
}

static int
bd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	int		inst;
	bd_handle_t	hdl;
	bd_t		*bd;
	bd_drive_t	drive;
	uint32_t	i;
	int		rv;
	char		name[16];
	char		kcache[32];
	char		*node_type;

	switch (cmd) {
	case DDI_ATTACH:
		break;
	case DDI_RESUME:
		/* We don't do anything native for suspend/resume */
		return (DDI_SUCCESS);
	default:
		return (DDI_FAILURE);
	}

	inst = ddi_get_instance(dip);
	hdl = ddi_get_parent_data(dip);

	(void) snprintf(name, sizeof (name), "%s%d",
	    ddi_driver_name(dip), ddi_get_instance(dip));
	(void) snprintf(kcache, sizeof (kcache), "%s_xfer", name);

	if (hdl == NULL) {
		cmn_err(CE_WARN, "%s: missing parent data!", name);
		return (DDI_FAILURE);
	}

	if (ddi_soft_state_zalloc(bd_state, inst) != DDI_SUCCESS) {
		cmn_err(CE_WARN, "%s: unable to zalloc soft state!", name);
		return (DDI_FAILURE);
	}
	bd = ddi_get_soft_state(bd_state, inst);

	if (hdl->h_dma) {
		bd->d_dma = *(hdl->h_dma);
		bd->d_dma.dma_attr_granular =
		    max(DEV_BSIZE, bd->d_dma.dma_attr_granular);
		bd->d_use_dma = B_TRUE;

		if (bd->d_maxxfer &&
		    (bd->d_maxxfer != bd->d_dma.dma_attr_maxxfer)) {
			cmn_err(CE_WARN,
			    "%s: inconsistent maximum transfer size!",
			    name);
			/* We force it */
			bd->d_maxxfer = bd->d_dma.dma_attr_maxxfer;
		} else {
			bd->d_maxxfer = bd->d_dma.dma_attr_maxxfer;
		}
	} else {
		bd->d_use_dma = B_FALSE;
		if (bd->d_maxxfer == 0) {
			bd->d_maxxfer = 1024 * 1024;
		}
	}
	bd->d_ops = hdl->h_ops;
	bd->d_private = hdl->h_private;
	bd->d_blkshift = DEV_BSHIFT;	/* 512 bytes, to start */

	if (bd->d_maxxfer % DEV_BSIZE) {
		cmn_err(CE_WARN, "%s: maximum transfer misaligned!", name);
		bd->d_maxxfer &= ~(DEV_BSIZE - 1);
	}
	if (bd->d_maxxfer < DEV_BSIZE) {
		cmn_err(CE_WARN, "%s: maximum transfer size too small!", name);
		ddi_soft_state_free(bd_state, inst);
		return (DDI_FAILURE);
	}

	bd->d_dip = dip;
	bd->d_handle = hdl;
	ddi_set_driver_private(dip, bd);

	mutex_init(&bd->d_ksmutex, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&bd->d_ocmutex, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&bd->d_statemutex, NULL, MUTEX_DRIVER, NULL);
	cv_init(&bd->d_statecv, NULL, CV_DRIVER, NULL);
	mutex_init(&bd->d_dle_mutex, NULL, MUTEX_DRIVER, NULL);
	bd->d_dle_state = 0;

	bd->d_cache = kmem_cache_create(kcache, sizeof (bd_xfer_impl_t), 8,
	    bd_xfer_ctor, bd_xfer_dtor, NULL, bd, NULL, 0);

	bd->d_ksp = kstat_create(ddi_driver_name(dip), inst, NULL, "disk",
	    KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT);
	if (bd->d_ksp != NULL) {
		bd->d_ksp->ks_lock = &bd->d_ksmutex;
		kstat_install(bd->d_ksp);
		bd->d_kiop = bd->d_ksp->ks_data;
	} else {
		/*
		 * Even if we cannot create the kstat, we create a
		 * scratch kstat. The reason for this is to ensure
		 * that we can update the kstat all of the time,
		 * without adding an extra branch instruction.
		 */
		bd->d_kiop = kmem_zalloc(sizeof (kstat_io_t), KM_SLEEP);
	}

	cmlb_alloc_handle(&bd->d_cmlbh);

	bd->d_state = DKIO_NONE;

	bzero(&drive, sizeof (drive));
	/*
	 * Default to one queue, and no restrictions on free space requests
	 * (if the driver provides a method); the parent driver can override.
	 */
	drive.d_qcount = 1;
	drive.d_free_align = 1;
	bd->d_ops.o_drive_info(bd->d_private, &drive);

	/*
	 * Several checks to make sure o_drive_info() didn't return bad
	 * values:
	 *
	 * There must be at least one queue
	 */
	if (drive.d_qcount == 0)
		goto fail_drive_info;

	/* FREE/UNMAP/TRIM alignment needs to be at least 1 block */
	if (drive.d_free_align == 0)
		goto fail_drive_info;

	/*
	 * If d_max_free_blks is not unlimited (not 0), then we cannot allow
	 * an unlimited segment size. It is however permissible to not impose
	 * a limit on the total number of blocks freed while limiting the
	 * amount allowed in an individual segment.
	 */
	if ((drive.d_max_free_blks > 0 && drive.d_max_free_seg_blks == 0))
		goto fail_drive_info;

	/*
	 * If a limit is set on d_max_free_blks (by the above check, we know
	 * if there's a limit on d_max_free_blks, d_max_free_seg_blks cannot
	 * be unlimited), it cannot be smaller than the limit on an individual
	 * segment.
	 */
	if ((drive.d_max_free_blks > 0 &&
	    drive.d_max_free_seg_blks > drive.d_max_free_blks)) {
		goto fail_drive_info;
	}

	bd->d_qcount = drive.d_qcount;
	bd->d_removable = drive.d_removable;
	bd->d_hotpluggable = drive.d_hotpluggable;

	if (drive.d_maxxfer && drive.d_maxxfer < bd->d_maxxfer)
		bd->d_maxxfer = drive.d_maxxfer;

	bd->d_free_align = drive.d_free_align;
	bd->d_max_free_seg = drive.d_max_free_seg;
	bd->d_max_free_blks = drive.d_max_free_blks;
	bd->d_max_free_seg_blks = drive.d_max_free_seg_blks;

	bd_create_inquiry_props(dip, &drive);
	bd_create_errstats(bd, inst, &drive);
	bd_update_state(bd);

	bd->d_queues = kmem_alloc(sizeof (*bd->d_queues) * bd->d_qcount,
	    KM_SLEEP);
	for (i = 0; i < bd->d_qcount; i++) {
		bd_queue_t *bq = &bd->d_queues[i];

		bq->q_qsize = drive.d_qsize;
		bq->q_qactive = 0;
		mutex_init(&bq->q_iomutex, NULL, MUTEX_DRIVER, NULL);

		list_create(&bq->q_waitq, sizeof (bd_xfer_impl_t),
		    offsetof(struct bd_xfer_impl, i_linkage));
		list_create(&bq->q_runq, sizeof (bd_xfer_impl_t),
		    offsetof(struct bd_xfer_impl, i_linkage));
	}

	if (*(uint64_t *)drive.d_eui64 != 0 ||
	    *(uint64_t *)drive.d_guid != 0 ||
	    *((uint64_t *)drive.d_guid + 1) != 0)
		node_type = DDI_NT_BLOCK_BLKDEV;
	else if (drive.d_lun >= 0)
		node_type = DDI_NT_BLOCK_CHAN;
	else
		node_type = DDI_NT_BLOCK;

	rv = cmlb_attach(dip, &bd_tg_ops, DTYPE_DIRECT,
	    bd->d_removable, bd->d_hotpluggable, node_type,
	    CMLB_FAKE_LABEL_ONE_PARTITION, bd->d_cmlbh, 0);
	if (rv != 0) {
		goto fail_cmlb_attach;
	}

	if (bd->d_ops.o_devid_init != NULL) {
		rv = bd->d_ops.o_devid_init(bd->d_private, dip, &bd->d_devid);
		if (rv == DDI_SUCCESS) {
			if (ddi_devid_register(dip, bd->d_devid) !=
			    DDI_SUCCESS) {
				cmn_err(CE_WARN,
				    "%s: unable to register devid", name);
			}
		}
	}

	/*
	 * Add a zero-length attribute to tell the world we support
	 * kernel ioctls (for layered drivers). Also set up properties
	 * used by HAL to identify removable media.
	 */
	(void) ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
	    DDI_KERNEL_IOCTL, NULL, 0);
	if (bd->d_removable) {
		(void) ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
		    "removable-media", NULL, 0);
	}
	if (bd->d_hotpluggable) {
		(void) ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
		    "hotpluggable", NULL, 0);
	}

	/*
	 * Before we proceed, we need to ensure that the geometry and labels on
	 * the cmlb disk are reasonable. When cmlb first attaches, it does not
	 * perform label validation and creates minor nodes based on the
	 * assumption of the size. This may not be correct and the rest of the
	 * system assumes that this will have been done before we allow opens
	 * to proceed. Otherwise, on first open, this'll all end up changing
	 * around on users. We do not care if it succeeds or not. It is totally
	 * acceptable for this device to be unlabeled or not to have anything on
	 * it.
	 */
	(void) cmlb_validate(bd->d_cmlbh, 0, 0);

	hdl->h_bd = bd;
	ddi_report_dev(dip);

	return (DDI_SUCCESS);

fail_cmlb_attach:
	bd_queues_free(bd);
	bd_destroy_errstats(bd);

fail_drive_info:
	cmlb_free_handle(&bd->d_cmlbh);

	if (bd->d_ksp != NULL) {
		kstat_delete(bd->d_ksp);
		bd->d_ksp = NULL;
	} else {
		kmem_free(bd->d_kiop, sizeof (kstat_io_t));
	}

	kmem_cache_destroy(bd->d_cache);
	cv_destroy(&bd->d_statecv);
	mutex_destroy(&bd->d_statemutex);
	mutex_destroy(&bd->d_ocmutex);
	mutex_destroy(&bd->d_ksmutex);
	mutex_destroy(&bd->d_dle_mutex);
	ddi_soft_state_free(bd_state, inst);
	return (DDI_FAILURE);
}

static int
bd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	bd_handle_t	hdl;
	bd_t		*bd;

	bd = ddi_get_driver_private(dip);
	hdl = ddi_get_parent_data(dip);

	switch (cmd) {
	case DDI_DETACH:
		break;
	case DDI_SUSPEND:
		/* We don't suspend, but our parent does */
		return (DDI_SUCCESS);
	default:
		return (DDI_FAILURE);
	}

	hdl->h_bd = NULL;

	if (bd->d_ksp != NULL) {
		kstat_delete(bd->d_ksp);
		bd->d_ksp = NULL;
	} else {
		kmem_free(bd->d_kiop, sizeof (kstat_io_t));
	}

	bd_destroy_errstats(bd);
	cmlb_detach(bd->d_cmlbh, 0);
	cmlb_free_handle(&bd->d_cmlbh);
	if (bd->d_devid)
		ddi_devid_free(bd->d_devid);
	kmem_cache_destroy(bd->d_cache);
	mutex_destroy(&bd->d_ksmutex);
	mutex_destroy(&bd->d_ocmutex);
	mutex_destroy(&bd->d_statemutex);
	cv_destroy(&bd->d_statecv);
	mutex_destroy(&bd->d_dle_mutex);
	bd_queues_free(bd);
	ddi_soft_state_free(bd_state, ddi_get_instance(dip));
	return (DDI_SUCCESS);
}

static int
bd_xfer_ctor(void *buf, void *arg, int kmflag)
{
	bd_xfer_impl_t	*xi;
	bd_t		*bd = arg;
	int		(*dcb)(caddr_t);

	if (kmflag == KM_PUSHPAGE || kmflag == KM_SLEEP) {
		dcb = DDI_DMA_SLEEP;
	} else {
		dcb = DDI_DMA_DONTWAIT;
	}

	xi = buf;
	bzero(xi, sizeof (*xi));
	xi->i_bd = bd;

	if (bd->d_use_dma) {
		if (ddi_dma_alloc_handle(bd->d_dip, &bd->d_dma, dcb, NULL,
		    &xi->i_dmah) != DDI_SUCCESS) {
			return (-1);
		}
	}

	return (0);
}

static void
bd_xfer_dtor(void *buf, void *arg)
{
	bd_xfer_impl_t	*xi = buf;

	_NOTE(ARGUNUSED(arg));

	if (xi->i_dmah)
		ddi_dma_free_handle(&xi->i_dmah);
	xi->i_dmah = NULL;
}

static bd_xfer_impl_t *
bd_xfer_alloc(bd_t *bd, struct buf *bp, int (*func)(void *, bd_xfer_t *),
    int kmflag)
{
	bd_xfer_impl_t	*xi;
	int		rv = 0;
	int		status;
	unsigned	dir;
	int		(*cb)(caddr_t);
	size_t		len;
	uint32_t	shift;

	if (kmflag == KM_SLEEP) {
		cb = DDI_DMA_SLEEP;
	} else {
		cb = DDI_DMA_DONTWAIT;
	}

	xi = kmem_cache_alloc(bd->d_cache, kmflag);
	if (xi == NULL) {
		bioerror(bp, ENOMEM);
		return (NULL);
	}

	ASSERT(bp);

	xi->i_bp = bp;
	xi->i_func = func;
	xi->i_blkno = bp->b_lblkno >> (bd->d_blkshift - DEV_BSHIFT);

	if (bp->b_bcount == 0) {
		xi->i_len = 0;
		xi->i_nblks = 0;
		xi->i_kaddr = NULL;
		xi->i_resid = 0;
		xi->i_num_win = 0;
		goto done;
	}

	if (bp->b_flags & B_READ) {
		dir = DDI_DMA_READ;
		xi->i_func = bd->d_ops.o_read;
	} else {
		dir = DDI_DMA_WRITE;
		xi->i_func = bd->d_ops.o_write;
	}

	shift = bd->d_blkshift;
	xi->i_blkshift = shift;

	if (!bd->d_use_dma) {
		bp_mapin(bp);
		rv = 0;
		xi->i_offset = 0;
		xi->i_num_win =
		    (bp->b_bcount + (bd->d_maxxfer - 1)) / bd->d_maxxfer;
		xi->i_cur_win = 0;
		xi->i_len = min(bp->b_bcount, bd->d_maxxfer);
		xi->i_nblks = xi->i_len >> shift;
		xi->i_kaddr = bp->b_un.b_addr;
		xi->i_resid = bp->b_bcount;
	} else {

		/*
		 * We have to use consistent DMA if the address is misaligned.
		 */
		if (((bp->b_flags & (B_PAGEIO | B_REMAPPED)) != B_PAGEIO) &&
		    ((uintptr_t)bp->b_un.b_addr & 0x7)) {
			dir |= DDI_DMA_CONSISTENT | DDI_DMA_PARTIAL;
		} else {
			dir |= DDI_DMA_STREAMING | DDI_DMA_PARTIAL;
		}

		status = ddi_dma_buf_bind_handle(xi->i_dmah, bp, dir, cb,
		    NULL, &xi->i_dmac, &xi->i_ndmac);
		switch (status) {
		case DDI_DMA_MAPPED:
			xi->i_num_win = 1;
			xi->i_cur_win = 0;
			xi->i_offset = 0;
			xi->i_len = bp->b_bcount;
			xi->i_nblks = xi->i_len >> shift;
			xi->i_resid = bp->b_bcount;
			rv = 0;
			break;
		case DDI_DMA_PARTIAL_MAP:
			xi->i_cur_win = 0;

			if ((ddi_dma_numwin(xi->i_dmah, &xi->i_num_win) !=
			    DDI_SUCCESS) ||
			    (ddi_dma_getwin(xi->i_dmah, 0, &xi->i_offset,
			    &len, &xi->i_dmac, &xi->i_ndmac) !=
			    DDI_SUCCESS) ||
			    (P2PHASE(len, (1U << shift)) != 0)) {
				(void) ddi_dma_unbind_handle(xi->i_dmah);
				rv = EFAULT;
				goto done;
			}
			xi->i_len = len;
			xi->i_nblks = xi->i_len >> shift;
			xi->i_resid = bp->b_bcount;
			rv = 0;
			break;
		case DDI_DMA_NORESOURCES:
			rv = EAGAIN;
			goto done;
		case DDI_DMA_TOOBIG:
			rv = EINVAL;
			goto done;
		case DDI_DMA_NOMAPPING:
		case DDI_DMA_INUSE:
		default:
			rv = EFAULT;
			goto done;
		}
	}

done:
	if (rv != 0) {
		kmem_cache_free(bd->d_cache, xi);
		bioerror(bp, rv);
		return (NULL);
	}

	return (xi);
}

static void
bd_xfer_free(bd_xfer_impl_t *xi)
{
	if (xi->i_dmah) {
		(void) ddi_dma_unbind_handle(xi->i_dmah);
	}
	if (xi->i_dfl != NULL) {
		dfl_free((dkioc_free_list_t *)xi->i_dfl);
		xi->i_dfl = NULL;
	}
	kmem_cache_free(xi->i_bd->d_cache, xi);
}

static int
bd_open(dev_t *devp, int flag, int otyp, cred_t *credp)
{
	dev_t		dev = *devp;
	bd_t		*bd;
	minor_t		part;
	minor_t		inst;
	uint64_t	mask;
	boolean_t	ndelay;
	int		rv;
	diskaddr_t	nblks;
	diskaddr_t	lba;

	_NOTE(ARGUNUSED(credp));

	part = BDPART(dev);
	inst = BDINST(dev);

	if (otyp >= OTYPCNT)
		return (EINVAL);

	ndelay = (flag & (FNDELAY | FNONBLOCK)) ? B_TRUE : B_FALSE;

	/*
	 * Block any DR events from changing the set of registered
	 * devices while we function.
	 */
	rw_enter(&bd_lock, RW_READER);
	if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
		rw_exit(&bd_lock);
		return (ENXIO);
	}

	mutex_enter(&bd->d_ocmutex);

	ASSERT(part < 64);
	mask = (1U << part);

	bd_update_state(bd);

	if (cmlb_validate(bd->d_cmlbh, 0, 0) != 0) {

		/* non-blocking opens are allowed to succeed */
		if (!ndelay) {
			rv = ENXIO;
			goto done;
		}
	} else if (cmlb_partinfo(bd->d_cmlbh, part, &nblks, &lba,
	    NULL, NULL, 0) == 0) {

		/*
		 * We read the partinfo, verify valid ranges. If the
		 * partition is invalid, and we aren't blocking or
		 * doing a raw access, then fail. (Non-blocking and
		 * raw accesses can still succeed to allow a disk with
		 * bad partition data to be opened by format and fdisk.)
		 */
		if ((!nblks) && ((!ndelay) || (otyp != OTYP_CHR))) {
			rv = ENXIO;
			goto done;
		}
	} else if (!ndelay) {
		/*
		 * cmlb_partinfo failed -- invalid partition or no
		 * disk label.
		 */
		rv = ENXIO;
		goto done;
	}

	if ((flag & FWRITE) && bd->d_rdonly) {
		rv = EROFS;
		goto done;
	}

	if ((bd->d_open_excl) & (mask)) {
		rv = EBUSY;
		goto done;
	}
	if (flag & FEXCL) {
		if (bd->d_open_lyr[part]) {
			rv = EBUSY;
			goto done;
		}
		for (int i = 0; i < OTYP_LYR; i++) {
			if (bd->d_open_reg[i] & mask) {
				rv = EBUSY;
				goto done;
			}
		}
	}

	if (otyp == OTYP_LYR) {
		bd->d_open_lyr[part]++;
	} else {
		bd->d_open_reg[otyp] |= mask;
	}
	if (flag & FEXCL) {
		bd->d_open_excl |= mask;
	}

	rv = 0;
done:
	mutex_exit(&bd->d_ocmutex);
	rw_exit(&bd_lock);

	return (rv);
}

static int
bd_close(dev_t dev, int flag, int otyp, cred_t *credp)
{
	bd_t		*bd;
	minor_t		inst;
	minor_t		part;
	uint64_t	mask;
	boolean_t	last = B_TRUE;

	_NOTE(ARGUNUSED(flag));
	_NOTE(ARGUNUSED(credp));

	part = BDPART(dev);
	inst = BDINST(dev);

	ASSERT(part < 64);
	mask = (1U << part);

	rw_enter(&bd_lock, RW_READER);

	if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
		rw_exit(&bd_lock);
		return (ENXIO);
	}

	mutex_enter(&bd->d_ocmutex);
	if (bd->d_open_excl & mask) {
		bd->d_open_excl &= ~mask;
	}
	if (otyp == OTYP_LYR) {
		bd->d_open_lyr[part]--;
	} else {
		bd->d_open_reg[otyp] &= ~mask;
	}
	for (int i = 0; i < 64; i++) {
		if (bd->d_open_lyr[part]) {
			last = B_FALSE;
		}
	}
	for (int i = 0; last && (i < OTYP_LYR); i++) {
		if (bd->d_open_reg[i]) {
			last = B_FALSE;
		}
	}
	mutex_exit(&bd->d_ocmutex);

	if (last) {
		cmlb_invalidate(bd->d_cmlbh, 0);
	}
	rw_exit(&bd_lock);

	return (0);
}

static int
bd_dump(dev_t dev, caddr_t caddr, daddr_t blkno, int nblk)
{
	minor_t		inst;
	minor_t		part;
	diskaddr_t	pstart;
	diskaddr_t	psize;
	bd_t		*bd;
	bd_xfer_impl_t	*xi;
	buf_t		*bp;
	int		rv;
	uint32_t	shift;
	daddr_t		d_blkno;
	int		d_nblk;

	rw_enter(&bd_lock, RW_READER);

	part = BDPART(dev);
	inst = BDINST(dev);

	if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
		rw_exit(&bd_lock);
		return (ENXIO);
	}
	shift = bd->d_blkshift;
	d_blkno = blkno >> (shift - DEV_BSHIFT);
	d_nblk = nblk >> (shift - DEV_BSHIFT);
	/*
	 * do cmlb, but do it synchronously unless we already have the
	 * partition (which we probably should.)
	 */
	if (cmlb_partinfo(bd->d_cmlbh, part, &psize, &pstart, NULL, NULL,
	    (void *)1)) {
		rw_exit(&bd_lock);
		return (ENXIO);
	}

	if ((d_blkno + d_nblk) > psize) {
		rw_exit(&bd_lock);
		return (EINVAL);
	}
	bp = getrbuf(KM_NOSLEEP);
	if (bp == NULL) {
		rw_exit(&bd_lock);
		return (ENOMEM);
	}

	bp->b_bcount = nblk << DEV_BSHIFT;
	bp->b_resid = bp->b_bcount;
	bp->b_lblkno = blkno;
	bp->b_un.b_addr = caddr;

	xi = bd_xfer_alloc(bd, bp, bd->d_ops.o_write, KM_NOSLEEP);
	if (xi == NULL) {
		rw_exit(&bd_lock);
		freerbuf(bp);
		return (ENOMEM);
	}
	xi->i_blkno = d_blkno + pstart;
	xi->i_flags = BD_XFER_POLL;
	bd_submit(bd, xi);
	rw_exit(&bd_lock);

	/*
	 * Generally, we should have run this entirely synchronously
	 * at this point and the biowait call should be a no-op. If
	 * it didn't happen this way, it's a bug in the underlying
	 * driver not honoring BD_XFER_POLL.
	 */
	(void) biowait(bp);
	rv = geterror(bp);
	freerbuf(bp);
	return (rv);
}

void
bd_minphys(struct buf *bp)
{
	minor_t inst;
	bd_t	*bd;
	inst = BDINST(bp->b_edev);

	bd = ddi_get_soft_state(bd_state, inst);

	/*
	 * In a non-debug kernel, bd_strategy will catch !bd as
	 * well, and will fail nicely.
	 */
	ASSERT(bd);

	if (bp->b_bcount > bd->d_maxxfer)
		bp->b_bcount = bd->d_maxxfer;
}

static int
bd_check_uio(dev_t dev, struct uio *uio)
{
	bd_t		*bd;
	uint32_t	shift;

	if ((bd = ddi_get_soft_state(bd_state, BDINST(dev))) == NULL) {
		return (ENXIO);
	}

	shift = bd->d_blkshift;
	if ((P2PHASE(uio->uio_loffset, (1U << shift)) != 0) ||
	    (P2PHASE(uio->uio_iov->iov_len, (1U << shift)) != 0)) {
		return (EINVAL);
	}

	return (0);
}

static int
bd_read(dev_t dev, struct uio *uio, cred_t *credp)
{
	_NOTE(ARGUNUSED(credp));
	int ret = bd_check_uio(dev, uio);
	if (ret != 0) {
		return (ret);
	}
	return (physio(bd_strategy, NULL, dev, B_READ, bd_minphys, uio));
}

static int
bd_write(dev_t dev, struct uio *uio, cred_t *credp)
{
	_NOTE(ARGUNUSED(credp));
	int ret = bd_check_uio(dev, uio);
	if (ret != 0) {
		return (ret);
	}
	return (physio(bd_strategy, NULL, dev, B_WRITE, bd_minphys, uio));
}

static int
bd_aread(dev_t dev, struct aio_req *aio, cred_t *credp)
{
	_NOTE(ARGUNUSED(credp));
	int ret = bd_check_uio(dev, aio->aio_uio);
	if (ret != 0) {
		return (ret);
	}
	return (aphysio(bd_strategy, anocancel, dev, B_READ, bd_minphys, aio));
}

static int
bd_awrite(dev_t dev, struct aio_req *aio, cred_t *credp)
{
	_NOTE(ARGUNUSED(credp));
	int ret = bd_check_uio(dev, aio->aio_uio);
	if (ret != 0) {
		return (ret);
	}
	return (aphysio(bd_strategy, anocancel, dev, B_WRITE, bd_minphys, aio));
}

static int
bd_strategy(struct buf *bp)
{
	minor_t		inst;
	minor_t		part;
	bd_t		*bd;
	diskaddr_t	p_lba;
	diskaddr_t	p_nblks;
	diskaddr_t	b_nblks;
	bd_xfer_impl_t	*xi;
	uint32_t	shift;
	int		(*func)(void *, bd_xfer_t *);
	diskaddr_t	lblkno;

	part = BDPART(bp->b_edev);
	inst = BDINST(bp->b_edev);

	ASSERT(bp);

	bp->b_resid = bp->b_bcount;

	if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
		bioerror(bp, ENXIO);
		biodone(bp);
		return (0);
	}

	if (cmlb_partinfo(bd->d_cmlbh, part, &p_nblks, &p_lba,
	    NULL, NULL, 0)) {
		bioerror(bp, ENXIO);
		biodone(bp);
		return (0);
	}

	shift = bd->d_blkshift;
	lblkno = bp->b_lblkno >> (shift - DEV_BSHIFT);
	if ((P2PHASE(bp->b_lblkno, (1U << (shift - DEV_BSHIFT))) != 0) ||
	    (P2PHASE(bp->b_bcount, (1U << shift)) != 0) ||
	    (lblkno > p_nblks)) {
		bioerror(bp, EINVAL);
		biodone(bp);
		return (0);
	}
	b_nblks = bp->b_bcount >> shift;
	if ((lblkno == p_nblks) || (bp->b_bcount == 0)) {
		biodone(bp);
		return (0);
	}

	if ((b_nblks + lblkno) > p_nblks) {
		bp->b_resid = ((lblkno + b_nblks - p_nblks) << shift);
		bp->b_bcount -= bp->b_resid;
	} else {
		bp->b_resid = 0;
	}
	func = (bp->b_flags & B_READ) ? bd->d_ops.o_read : bd->d_ops.o_write;

	xi = bd_xfer_alloc(bd, bp, func, KM_NOSLEEP);
	if (xi == NULL) {
		xi = bd_xfer_alloc(bd, bp, func, KM_PUSHPAGE);
	}
	if (xi == NULL) {
		/* bd_request_alloc will have done bioerror */
		biodone(bp);
		return (0);
	}
	xi->i_blkno = lblkno + p_lba;

	bd_submit(bd, xi);

	return (0);
}

static int
bd_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *credp, int *rvalp)
{
	minor_t		inst;
	uint16_t	part;
	bd_t		*bd;
	void		*ptr = (void *)arg;
	int		rv;

	part = BDPART(dev);
	inst = BDINST(dev);

	if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
		return (ENXIO);
	}

	rv = cmlb_ioctl(bd->d_cmlbh, dev, cmd, arg, flag, credp, rvalp, 0);
	if (rv != ENOTTY)
		return (rv);

	if (rvalp != NULL) {
		/* the return value of the ioctl is 0 by default */
		*rvalp = 0;
	}

	switch (cmd) {
	case DKIOCGMEDIAINFO: {
		struct dk_minfo minfo;

		/* make sure our state information is current */
		bd_update_state(bd);
		bzero(&minfo, sizeof (minfo));
		minfo.dki_media_type = DK_FIXED_DISK;
		minfo.dki_lbsize = (1U << bd->d_blkshift);
		minfo.dki_capacity = bd->d_numblks;
		if (ddi_copyout(&minfo, ptr, sizeof (minfo), flag)) {
			return (EFAULT);
		}
		return (0);
	}
	case DKIOCGMEDIAINFOEXT: {
		struct dk_minfo_ext miext;
		size_t len;

		/* make sure our state information is current */
		bd_update_state(bd);
		bzero(&miext, sizeof (miext));
		miext.dki_media_type = DK_FIXED_DISK;
		miext.dki_lbsize = (1U << bd->d_blkshift);
		miext.dki_pbsize = (1U << bd->d_pblkshift);
		miext.dki_capacity = bd->d_numblks;

		switch (ddi_model_convert_from(flag & FMODELS)) {
		case DDI_MODEL_ILP32:
			len = sizeof (struct dk_minfo_ext32);
			break;
		default:
			len = sizeof (struct dk_minfo_ext);
			break;
		}

		if (ddi_copyout(&miext, ptr, len, flag)) {
			return (EFAULT);
		}
		return (0);
	}
	case DKIOCINFO: {
		struct dk_cinfo cinfo;
		bzero(&cinfo, sizeof (cinfo));
		cinfo.dki_ctype = DKC_BLKDEV;
		cinfo.dki_cnum = ddi_get_instance(ddi_get_parent(bd->d_dip));
		(void) snprintf(cinfo.dki_cname, sizeof (cinfo.dki_cname),
		    "%s", ddi_driver_name(ddi_get_parent(bd->d_dip)));
		(void) snprintf(cinfo.dki_dname, sizeof (cinfo.dki_dname),
		    "%s", ddi_driver_name(bd->d_dip));
		cinfo.dki_unit = inst;
		cinfo.dki_flags = DKI_FMTVOL;
		cinfo.dki_partition = part;
		cinfo.dki_maxtransfer = bd->d_maxxfer / DEV_BSIZE;
		cinfo.dki_addr = 0;
		cinfo.dki_slave = 0;
		cinfo.dki_space = 0;
		cinfo.dki_prio = 0;
		cinfo.dki_vec = 0;
		if (ddi_copyout(&cinfo, ptr, sizeof (cinfo), flag)) {
			return (EFAULT);
		}
		return (0);
	}
	case DKIOCREMOVABLE: {
		int i;
		i = bd->d_removable ? 1 : 0;
		if (ddi_copyout(&i, ptr, sizeof (i), flag)) {
			return (EFAULT);
		}
		return (0);
	}
	case DKIOCHOTPLUGGABLE: {
		int i;
		i = bd->d_hotpluggable ? 1 : 0;
		if (ddi_copyout(&i, ptr, sizeof (i), flag)) {
			return (EFAULT);
		}
		return (0);
	}
	case DKIOCREADONLY: {
		int i;
		i = bd->d_rdonly ? 1 : 0;
		if (ddi_copyout(&i, ptr, sizeof (i), flag)) {
			return (EFAULT);
		}
		return (0);
	}
	case DKIOCSOLIDSTATE: {
		int i;
		i = bd->d_ssd ? 1 : 0;
		if (ddi_copyout(&i, ptr, sizeof (i), flag)) {
			return (EFAULT);
		}
		return (0);
	}
	case DKIOCSTATE: {
		enum dkio_state state;
		if (ddi_copyin(ptr, &state, sizeof (state), flag)) {
			return (EFAULT);
		}
		if ((rv = bd_check_state(bd, &state)) != 0) {
			return (rv);
		}
		if (ddi_copyout(&state, ptr, sizeof (state), flag)) {
			return (EFAULT);
		}
		return (0);
	}
	case DKIOCFLUSHWRITECACHE: {
		struct dk_callback *dkc = NULL;

		if (flag & FKIOCTL)
			dkc = (void *)arg;

		rv = bd_flush_write_cache(bd, dkc);
		return (rv);
	}
	case DKIOCFREE: {
		dkioc_free_list_t *dfl = NULL;

		/*
		 * Check free space support early to avoid copyin/allocation
		 * when unnecessary.
		 */
		if (!CAN_FREESPACE(bd))
			return (ENOTSUP);

		rv = dfl_copyin(ptr, &dfl, flag, KM_SLEEP);
		if (rv != 0)
			return (rv);

		/*
		 * bd_free_space() consumes 'dfl'. bd_free_space() will
		 * call dfl_iter() which will normally try to pass dfl through
		 * to bd_free_space_cb() which attaches dfl to the bd_xfer_t
		 * that is then queued for the underlying driver. Once the
		 * driver processes the request, the bd_xfer_t instance is
		 * disposed of, including any attached dkioc_free_list_t.
		 *
		 * If dfl cannot be processed by the underlying driver due to
		 * size or alignment requirements of the driver, dfl_iter()
		 * will replace dfl with one or more new dkioc_free_list_t
		 * instances with the correct alignment and sizes for the driver
		 * (and free the original dkioc_free_list_t).
		 */
		rv = bd_free_space(dev, bd, dfl);
		return (rv);
	}

	case DKIOC_CANFREE: {
		boolean_t supported = CAN_FREESPACE(bd);

		if (ddi_copyout(&supported, (void *)arg, sizeof (supported),
		    flag) != 0) {
			return (EFAULT);
		}

		return (0);
	}

	default:
		break;

	}
	return (ENOTTY);
}

static int
bd_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int mod_flags,
    char *name, caddr_t valuep, int *lengthp)
{
	bd_t	*bd;

	bd = ddi_get_soft_state(bd_state, ddi_get_instance(dip));
	if (bd == NULL)
		return (ddi_prop_op(dev, dip, prop_op, mod_flags,
		    name, valuep, lengthp));

	return (cmlb_prop_op(bd->d_cmlbh, dev, dip, prop_op, mod_flags, name,
	    valuep, lengthp, BDPART(dev), 0));
}


static int
bd_tg_rdwr(dev_info_t *dip, uchar_t cmd, void *bufaddr, diskaddr_t start,
    size_t length, void *tg_cookie)
{
	bd_t		*bd;
	buf_t		*bp;
	bd_xfer_impl_t	*xi;
	int		rv;
	int		(*func)(void *, bd_xfer_t *);
	int		kmflag;

	/*
	 * If we are running in polled mode (such as during dump(9e)
	 * execution), then we cannot sleep for kernel allocations.
	 */
	kmflag = tg_cookie ? KM_NOSLEEP : KM_SLEEP;

	bd = ddi_get_soft_state(bd_state, ddi_get_instance(dip));

	if (P2PHASE(length, (1U << bd->d_blkshift)) != 0) {
		/* We can only transfer whole blocks at a time! */
		return (EINVAL);
	}

	if ((bp = getrbuf(kmflag)) == NULL) {
		return (ENOMEM);
	}

	switch (cmd) {
	case TG_READ:
		bp->b_flags = B_READ;
		func = bd->d_ops.o_read;
		break;
	case TG_WRITE:
		bp->b_flags = B_WRITE;
		func = bd->d_ops.o_write;
		break;
	default:
		freerbuf(bp);
		return (EINVAL);
	}

	bp->b_un.b_addr = bufaddr;
	bp->b_bcount = length;
	xi = bd_xfer_alloc(bd, bp, func, kmflag);
	if (xi == NULL) {
		rv = geterror(bp);
		freerbuf(bp);
		return (rv);
	}
	xi->i_flags = tg_cookie ? BD_XFER_POLL : 0;
	xi->i_blkno = start;
	bd_submit(bd, xi);
	(void) biowait(bp);
	rv = geterror(bp);
	freerbuf(bp);

	return (rv);
}

static int
bd_tg_getinfo(dev_info_t *dip, int cmd, void *arg, void *tg_cookie)
{
	bd_t		*bd;

	_NOTE(ARGUNUSED(tg_cookie));
	bd = ddi_get_soft_state(bd_state, ddi_get_instance(dip));

	switch (cmd) {
	case TG_GETPHYGEOM:
	case TG_GETVIRTGEOM:
		/*
		 * We don't have any "geometry" as such; let cmlb
		 * fabricate something.
		 */
		return (ENOTTY);

	case TG_GETCAPACITY:
		bd_update_state(bd);
		*(diskaddr_t *)arg = bd->d_numblks;
		return (0);

	case TG_GETBLOCKSIZE:
		*(uint32_t *)arg = (1U << bd->d_blkshift);
		return (0);

	case TG_GETATTR:
		/*
		 * It turns out that cmlb really doesn't do much for
		 * non-writable media, but let's make the information
		 * available for it in case it does more in the
		 * future. (The value is currently used for
		 * triggering special behavior for CD-ROMs.)
		 */
		bd_update_state(bd);
		((tg_attribute_t *)arg)->media_is_writable =
		    bd->d_rdonly ? B_FALSE : B_TRUE;
		((tg_attribute_t *)arg)->media_is_solid_state = bd->d_ssd;
		((tg_attribute_t *)arg)->media_is_rotational = B_FALSE;
		return (0);

	default:
		return (EINVAL);
	}
}


static void
bd_sched(bd_t *bd, bd_queue_t *bq)
{
	bd_xfer_impl_t	*xi;
	struct buf	*bp;
	int		rv;

	mutex_enter(&bq->q_iomutex);

	while ((bq->q_qactive < bq->q_qsize) &&
	    ((xi = list_remove_head(&bq->q_waitq)) != NULL)) {
		mutex_enter(&bd->d_ksmutex);
		kstat_waitq_to_runq(bd->d_kiop);
		mutex_exit(&bd->d_ksmutex);

		bq->q_qactive++;
		list_insert_tail(&bq->q_runq, xi);

		/*
		 * Submit the job to the driver. We drop the I/O mutex
		 * so that we can deal with the case where the driver
		 * completion routine calls back into us synchronously.
		 */

		mutex_exit(&bq->q_iomutex);

		rv = xi->i_func(bd->d_private, &xi->i_public);
		if (rv != 0) {
			bp = xi->i_bp;
			bioerror(bp, rv);
			biodone(bp);

			atomic_inc_32(&bd->d_kerr->bd_transerrs.value.ui32);

			mutex_enter(&bq->q_iomutex);

			mutex_enter(&bd->d_ksmutex);
			kstat_runq_exit(bd->d_kiop);
			mutex_exit(&bd->d_ksmutex);

			bq->q_qactive--;
			list_remove(&bq->q_runq, xi);
			bd_xfer_free(xi);
		} else {
			mutex_enter(&bq->q_iomutex);
		}
	}

	mutex_exit(&bq->q_iomutex);
}

static void
bd_submit(bd_t *bd, bd_xfer_impl_t *xi)
{
	uint64_t	nv = atomic_inc_64_nv(&bd->d_io_counter);
	unsigned	q = nv % bd->d_qcount;
	bd_queue_t	*bq = &bd->d_queues[q];

	xi->i_bq = bq;
	xi->i_qnum = q;

	mutex_enter(&bq->q_iomutex);

	list_insert_tail(&bq->q_waitq, xi);

	mutex_enter(&bd->d_ksmutex);
	kstat_waitq_enter(bd->d_kiop);
	mutex_exit(&bd->d_ksmutex);

	mutex_exit(&bq->q_iomutex);

	bd_sched(bd, bq);
}

static void
bd_runq_exit(bd_xfer_impl_t *xi, int err)
{
	bd_t		*bd = xi->i_bd;
	buf_t		*bp = xi->i_bp;
	bd_queue_t	*bq = xi->i_bq;

	mutex_enter(&bq->q_iomutex);
	bq->q_qactive--;

	mutex_enter(&bd->d_ksmutex);
	kstat_runq_exit(bd->d_kiop);
	mutex_exit(&bd->d_ksmutex);

	list_remove(&bq->q_runq, xi);
	mutex_exit(&bq->q_iomutex);

	if (err == 0) {
		if (bp->b_flags & B_READ) {
			atomic_inc_uint(&bd->d_kiop->reads);
			atomic_add_64((uint64_t *)&bd->d_kiop->nread,
			    bp->b_bcount - xi->i_resid);
		} else {
			atomic_inc_uint(&bd->d_kiop->writes);
			atomic_add_64((uint64_t *)&bd->d_kiop->nwritten,
			    bp->b_bcount - xi->i_resid);
		}
	}
	bd_sched(bd, bq);
}

static void
bd_dle_sysevent_task(void *arg)
{
	nvlist_t	*attr = NULL;
	char		*path = NULL;
	bd_t		*bd = arg;
	dev_info_t	*dip = bd->d_dip;
	size_t		n;

	mutex_enter(&bd->d_dle_mutex);
	bd->d_dle_state &= ~BD_DLE_PENDING;
	bd->d_dle_state |= BD_DLE_RUNNING;
	mutex_exit(&bd->d_dle_mutex);

	dev_err(dip, CE_NOTE, "!dynamic LUN expansion");

	if (nvlist_alloc(&attr, NV_UNIQUE_NAME_TYPE, KM_SLEEP) != 0) {
		mutex_enter(&bd->d_dle_mutex);
		bd->d_dle_state &= ~(BD_DLE_RUNNING|BD_DLE_PENDING);
		mutex_exit(&bd->d_dle_mutex);
		return;
	}

	path = kmem_zalloc(MAXPATHLEN, KM_SLEEP);

	n = snprintf(path, MAXPATHLEN, "/devices");
	(void) ddi_pathname(dip, path + n);
	n = strlen(path);
	n += snprintf(path + n, MAXPATHLEN - n, ":x");

	for (;;) {
		/*
		 * On receipt of this event, the ZFS sysevent module will scan
		 * active zpools for child vdevs matching this physical path.
		 * In order to catch both whole disk pools and those with an
		 * EFI boot partition, generate separate sysevents for minor
		 * node 'a' and 'b'.
		 */
		for (char c = 'a'; c < 'c'; c++) {
			path[n - 1] = c;

			if (nvlist_add_string(attr, DEV_PHYS_PATH, path) != 0)
				break;

			(void) ddi_log_sysevent(dip, DDI_VENDOR_SUNW,
			    EC_DEV_STATUS, ESC_DEV_DLE, attr, NULL, DDI_SLEEP);
		}

		mutex_enter(&bd->d_dle_mutex);
		if ((bd->d_dle_state & BD_DLE_PENDING) == 0) {
			bd->d_dle_state &= ~BD_DLE_RUNNING;
			mutex_exit(&bd->d_dle_mutex);
			break;
		}
		bd->d_dle_state &= ~BD_DLE_PENDING;
		mutex_exit(&bd->d_dle_mutex);
	}

	nvlist_free(attr);
	kmem_free(path, MAXPATHLEN);
}

static void
bd_update_state(bd_t *bd)
{
	enum dkio_state	state = DKIO_INSERTED;
	boolean_t	docmlb = B_FALSE;
	bd_media_t	media;

	bzero(&media, sizeof (media));

	mutex_enter(&bd->d_statemutex);
	if (bd->d_ops.o_media_info(bd->d_private, &media) != 0) {
		bd->d_numblks = 0;
		state = DKIO_EJECTED;
		goto done;
	}

	if ((media.m_blksize < 512) ||
	    (!ISP2(media.m_blksize)) ||
	    (P2PHASE(bd->d_maxxfer, media.m_blksize))) {
		dev_err(bd->d_dip, CE_WARN, "Invalid media block size (%d)",
		    media.m_blksize);
		/*
		 * We can't use the media, treat it as not present.
		 */
		state = DKIO_EJECTED;
		bd->d_numblks = 0;
		goto done;
	}

	if (((1U << bd->d_blkshift) != media.m_blksize) ||
	    (bd->d_numblks != media.m_nblks)) {
		/* Device size changed */
		docmlb = B_TRUE;
	}

	bd->d_blkshift = ddi_ffs(media.m_blksize) - 1;
	bd->d_pblkshift = bd->d_blkshift;
	bd->d_numblks = media.m_nblks;
	bd->d_rdonly = media.m_readonly;
	bd->d_ssd = media.m_solidstate;

	/*
	 * Only use the supplied physical block size if it is non-zero,
	 * greater than or equal to the block size, and a power of 2.
	 * Ignore it if not; it's just informational and we can still use
	 * the media.
	 */
	if ((media.m_pblksize != 0) &&
	    (media.m_pblksize >= media.m_blksize) &&
	    (ISP2(media.m_pblksize)))
		bd->d_pblkshift = ddi_ffs(media.m_pblksize) - 1;

done:
	if (state != bd->d_state) {
		bd->d_state = state;
		cv_broadcast(&bd->d_statecv);
		docmlb = B_TRUE;
	}
	mutex_exit(&bd->d_statemutex);

	bd->d_kerr->bd_capacity.value.ui64 = bd->d_numblks << bd->d_blkshift;

	if (docmlb) {
		if (state == DKIO_INSERTED) {
			(void) cmlb_validate(bd->d_cmlbh, 0, 0);

			mutex_enter(&bd->d_dle_mutex);
			/*
			 * If there is already an event pending, there's
			 * nothing to do; we coalesce multiple events.
			 */
			if ((bd->d_dle_state & BD_DLE_PENDING) == 0) {
				if ((bd->d_dle_state & BD_DLE_RUNNING) == 0) {
					taskq_dispatch_ent(bd_taskq,
					    bd_dle_sysevent_task, bd, 0,
					    &bd->d_dle_ent);
				}
				bd->d_dle_state |= BD_DLE_PENDING;
			}
			mutex_exit(&bd->d_dle_mutex);
		} else {
			cmlb_invalidate(bd->d_cmlbh, 0);
		}
	}
}

static int
bd_check_state(bd_t *bd, enum dkio_state *state)
{
	clock_t		when;

	for (;;) {

		bd_update_state(bd);

		mutex_enter(&bd->d_statemutex);

		if (bd->d_state != *state) {
			*state = bd->d_state;
			mutex_exit(&bd->d_statemutex);
			break;
		}

		when = drv_usectohz(1000000);
		if (cv_reltimedwait_sig(&bd->d_statecv, &bd->d_statemutex,
		    when, TR_CLOCK_TICK) == 0) {
			mutex_exit(&bd->d_statemutex);
			return (EINTR);
		}

		mutex_exit(&bd->d_statemutex);
	}

	return (0);
}

static int
bd_flush_write_cache_done(struct buf *bp)
{
	struct dk_callback *dc = (void *)bp->b_private;

	(*dc->dkc_callback)(dc->dkc_cookie, geterror(bp));
	kmem_free(dc, sizeof (*dc));
	freerbuf(bp);
	return (0);
}

static int
bd_flush_write_cache(bd_t *bd, struct dk_callback *dkc)
{
	buf_t			*bp;
	struct dk_callback	*dc;
	bd_xfer_impl_t		*xi;
	int			rv;

	if (bd->d_ops.o_sync_cache == NULL) {
		return (ENOTSUP);
	}
	if ((bp = getrbuf(KM_SLEEP)) == NULL) {
		return (ENOMEM);
	}
	bp->b_resid = 0;
	bp->b_bcount = 0;

	xi = bd_xfer_alloc(bd, bp, bd->d_ops.o_sync_cache, KM_SLEEP);
	if (xi == NULL) {
		rv = geterror(bp);
		freerbuf(bp);
		return (rv);
	}

	/* Make an asynchronous flush, but only if there is a callback */
	if (dkc != NULL && dkc->dkc_callback != NULL) {
		/* Make a private copy of the callback structure */
		dc = kmem_alloc(sizeof (*dc), KM_SLEEP);
		*dc = *dkc;
		bp->b_private = dc;
		bp->b_iodone = bd_flush_write_cache_done;

		bd_submit(bd, xi);
		return (0);
	}

	/* In case there is no callback, perform a synchronous flush */
	bd_submit(bd, xi);
	(void) biowait(bp);
	rv = geterror(bp);
	freerbuf(bp);

	return (rv);
}

static int
bd_free_space_done(struct buf *bp)
{
	freerbuf(bp);
	return (0);
}

static int
bd_free_space_cb(dkioc_free_list_t *dfl, void *arg, int kmflag)
{
	bd_t		*bd = arg;
	buf_t		*bp = NULL;
	bd_xfer_impl_t	*xi = NULL;

static int
bd_free_space_done(struct buf *bp)
{
	freerbuf(bp);
	return (0);
}

static int
bd_free_space_cb(dkioc_free_list_t *dfl, void *arg, int kmflag)
{
	bd_t		*bd = arg;
	buf_t		*bp = NULL;
	bd_xfer_impl_t	*xi = NULL;
	boolean_t	sync = DFL_ISSYNC(dfl) ? B_TRUE : B_FALSE;
	int		rv = 0;

	bp = getrbuf(KM_SLEEP);
	bp->b_resid = 0;
	bp->b_bcount = 0;
	bp->b_lblkno = 0;

	xi = bd_xfer_alloc(bd, bp, bd->d_ops.o_free_space, kmflag);
	xi->i_dfl = dfl;

	if (!sync) {
		bp->b_iodone = bd_free_space_done;
		bd_submit(bd, xi);
		return (0);
	}

	xi->i_flags |= BD_XFER_POLL;
	bd_submit(bd, xi);

	(void) biowait(bp);
	rv = geterror(bp);
	freerbuf(bp);

	return (rv);
}

static int
bd_free_space(dev_t dev, bd_t *bd, dkioc_free_list_t *dfl)
{
	diskaddr_t	p_len, p_offset;
	uint64_t	offset_bytes, len_bytes;
	minor_t		part = BDPART(dev);
	const uint_t	bshift = bd->d_blkshift;
	dkioc_free_info_t dfi = {
		.dfi_bshift = bshift,
		.dfi_align = bd->d_free_align << bshift,
		.dfi_max_bytes = bd->d_max_free_blks << bshift,
		.dfi_max_ext = bd->d_max_free_seg,
		.dfi_max_ext_bytes = bd->d_max_free_seg_blks << bshift,
	};

	if (cmlb_partinfo(bd->d_cmlbh, part, &p_len, &p_offset, NULL,
	    NULL, 0) != 0) {
		dfl_free(dfl);
		return (ENXIO);
	}

	/*
	 * bd_ioctl created our own copy of dfl, so we can modify it as
	 * necessary.
	 */
	offset_bytes = (uint64_t)p_offset << bshift;
	len_bytes = (uint64_t)p_len << bshift;

	dfl->dfl_offset += offset_bytes;
	if (dfl->dfl_offset < offset_bytes) {
		dfl_free(dfl);
		return (EOVERFLOW);
	}

	return (dfl_iter(dfl, &dfi, offset_bytes + len_bytes, bd_free_space_cb,
	    bd, KM_SLEEP));
}
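
/*
 * Illustration only (hypothetical names): a parent driver's optional
 * o_free_space entry point.  The extents arrive via the dkioc_free_list_t
 * attached to the transfer, already clipped to the partition by
 * bd_free_space() above, and are translated into the device's own
 * TRIM/UNMAP/deallocate command.
 *
 *	static int
 *	my_free_space(void *private, bd_xfer_t *xfer)
 *	{
 *		my_state_t *s = private;
 *		dkioc_free_list_t *dfl = xfer->x_dfl;
 *		dkioc_free_list_ext_t *ext = dfl->dfl_exts;
 *
 *		for (uint64_t i = 0; i < dfl->dfl_num_exts; i++, ext++) {
 *			if (my_submit_trim(s, xfer, dfl->dfl_offset +
 *			    ext->dfle_start, ext->dfle_length) != 0)
 *				return (EIO);
 *		}
 *		return (0);
 *	}
 */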

/*
 * Nexus support.
 */
int
bd_bus_ctl(dev_info_t *dip, dev_info_t *rdip, ddi_ctl_enum_t ctlop,
    void *arg, void *result)
{
	bd_handle_t	hdl;

	switch (ctlop) {
	case DDI_CTLOPS_REPORTDEV:
		cmn_err(CE_CONT, "?Block device: %s@%s, %s%d\n",
		    ddi_node_name(rdip), ddi_get_name_addr(rdip),
		    ddi_driver_name(rdip), ddi_get_instance(rdip));
		return (DDI_SUCCESS);

	case DDI_CTLOPS_INITCHILD:
		hdl = ddi_get_parent_data((dev_info_t *)arg);
		if (hdl == NULL) {
			return (DDI_NOT_WELL_FORMED);
		}
		ddi_set_name_addr((dev_info_t *)arg, hdl->h_addr);
		return (DDI_SUCCESS);

	case DDI_CTLOPS_UNINITCHILD:
		ddi_set_name_addr((dev_info_t *)arg, NULL);
		ndi_prop_remove_all((dev_info_t *)arg);
		return (DDI_SUCCESS);

	default:
		return (ddi_ctlops(dip, rdip, ctlop, arg, result));
	}
}

/*
 * Functions for device drivers.
 */
bd_handle_t
bd_alloc_handle(void *private, bd_ops_t *ops, ddi_dma_attr_t *dma, int kmflag)
{
	bd_handle_t hdl;

	switch (ops->o_version) {
	case BD_OPS_VERSION_0:
	case BD_OPS_VERSION_1:
	case BD_OPS_VERSION_2:
		break;

	default:
		/* Unsupported version */
		return (NULL);
	}

	hdl = kmem_zalloc(sizeof (*hdl), kmflag);
	if (hdl == NULL) {
		return (NULL);
	}

	switch (ops->o_version) {
	case BD_OPS_VERSION_2:
		hdl->h_ops.o_free_space = ops->o_free_space;
		/*FALLTHRU*/
	case BD_OPS_VERSION_1:
	case BD_OPS_VERSION_0:
		hdl->h_ops.o_drive_info = ops->o_drive_info;
		hdl->h_ops.o_media_info = ops->o_media_info;
		hdl->h_ops.o_devid_init = ops->o_devid_init;
		hdl->h_ops.o_sync_cache = ops->o_sync_cache;
		hdl->h_ops.o_read = ops->o_read;
		hdl->h_ops.o_write = ops->o_write;
		break;
	}

	hdl->h_dma = dma;
	hdl->h_private = private;

	return (hdl);
}

void
bd_free_handle(bd_handle_t hdl)
{
	kmem_free(hdl, sizeof (*hdl));
}
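
/*
 * Illustration only (hypothetical names): a parent driver allocating its
 * blkdev handle.  Note that o_free_space is only copied out of ops
 * vectors declaring BD_OPS_VERSION_2, per bd_alloc_handle() above.
 *
 *	static bd_ops_t my_bd_ops = {
 *		.o_version	= BD_OPS_VERSION_2,
 *		.o_drive_info	= my_drive_info,
 *		.o_media_info	= my_media_info,
 *		.o_devid_init	= my_devid_init,
 *		.o_sync_cache	= my_sync_cache,
 *		.o_read		= my_read,
 *		.o_write	= my_write,
 *		.o_free_space	= my_free_space,
 *	};
 *
 *	s->my_bdh = bd_alloc_handle(s, &my_bd_ops, &my_dma_attr, KM_SLEEP);
 *	if (s->my_bdh == NULL)
 *		return (DDI_FAILURE);
 */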

int
bd_attach_handle(dev_info_t *dip, bd_handle_t hdl)
{
	bd_drive_t	drive = { 0 };
	dev_info_t	*child;
	size_t		len;

	/*
	 * It's not an error if bd_attach_handle() is called on a handle
	 * that is already attached.  We just ignore the request and
	 * return.  This way drivers using blkdev don't have to keep track
	 * of blkdev state; they can simply call this function to make
	 * sure the handle is attached.
	 */
	if (hdl->h_child != NULL) {
		return (DDI_SUCCESS);
	}

	/* If the driver doesn't override this, assume there is no LUN */
	drive.d_lun = -1;
	hdl->h_ops.o_drive_info(hdl->h_private, &drive);

	hdl->h_parent = dip;
	hdl->h_name = "blkdev";

	/*
	 * Prefer the GUID over the EUI64.
	 */
	if (*(uint64_t *)drive.d_guid != 0 ||
	    *((uint64_t *)drive.d_guid + 1) != 0) {
		len = snprintf(hdl->h_addr, sizeof (hdl->h_addr),
		    "w%02X%02X%02X%02X%02X%02X%02X%02X"
		    "%02X%02X%02X%02X%02X%02X%02X%02X",
		    drive.d_guid[0], drive.d_guid[1], drive.d_guid[2],
		    drive.d_guid[3], drive.d_guid[4], drive.d_guid[5],
		    drive.d_guid[6], drive.d_guid[7], drive.d_guid[8],
		    drive.d_guid[9], drive.d_guid[10], drive.d_guid[11],
		    drive.d_guid[12], drive.d_guid[13], drive.d_guid[14],
		    drive.d_guid[15]);
	} else if (*(uint64_t *)drive.d_eui64 != 0) {
		len = snprintf(hdl->h_addr, sizeof (hdl->h_addr),
		    "w%02X%02X%02X%02X%02X%02X%02X%02X",
		    drive.d_eui64[0], drive.d_eui64[1],
		    drive.d_eui64[2], drive.d_eui64[3],
		    drive.d_eui64[4], drive.d_eui64[5],
		    drive.d_eui64[6], drive.d_eui64[7]);
	} else {
		len = snprintf(hdl->h_addr, sizeof (hdl->h_addr),
		    "%X", drive.d_target);
	}

	VERIFY(len <= sizeof (hdl->h_addr));

	if (drive.d_lun >= 0) {
		(void) snprintf(hdl->h_addr + len, sizeof (hdl->h_addr) - len,
		    ",%X", drive.d_lun);
	}

	if (ndi_devi_alloc(dip, hdl->h_name, (pnode_t)DEVI_SID_NODEID,
	    &child) != NDI_SUCCESS) {
		cmn_err(CE_WARN, "%s%d: unable to allocate node %s@%s",
		    ddi_driver_name(dip), ddi_get_instance(dip),
		    "blkdev", hdl->h_addr);
		return (DDI_FAILURE);
	}

	ddi_set_parent_data(child, hdl);
	hdl->h_child = child;

	if (ndi_devi_online(child, 0) != NDI_SUCCESS) {
		cmn_err(CE_WARN, "%s%d: failed bringing node %s@%s online",
		    ddi_driver_name(dip), ddi_get_instance(dip),
		    hdl->h_name, hdl->h_addr);
		(void) ndi_devi_free(child);
		hdl->h_child = NULL;
		return (DDI_FAILURE);
	}

	return (DDI_SUCCESS);
}
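
/*
 * Illustration only (hypothetical names): attaching the handle from the
 * parent's attach(9E) entry point once the device can accept I/O.  The
 * call is idempotent, as noted above.
 *
 *	static int
 *	my_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
 *	{
 *		my_state_t *s = my_get_soft_state(dip);
 *
 *		if (cmd != DDI_ATTACH)
 *			return (DDI_FAILURE);
 *		if (bd_attach_handle(dip, s->my_bdh) != DDI_SUCCESS)
 *			return (DDI_FAILURE);
 *		return (DDI_SUCCESS);
 *	}
 */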

int
bd_detach_handle(bd_handle_t hdl)
{
	int	rv;
	char	*devnm;

	/*
	 * It's not an error if bd_detach_handle() is called on a handle
	 * that is already detached.  We just ignore the request and
	 * return.  This way drivers using blkdev don't have to keep track
	 * of blkdev state; they can simply call this function to make
	 * sure the handle is detached.
	 */
	if (hdl->h_child == NULL) {
		return (DDI_SUCCESS);
	}
	ndi_devi_enter(hdl->h_parent);
	if (i_ddi_node_state(hdl->h_child) < DS_INITIALIZED) {
		rv = ddi_remove_child(hdl->h_child, 0);
	} else {
		devnm = kmem_alloc(MAXNAMELEN + 1, KM_SLEEP);
		(void) ddi_deviname(hdl->h_child, devnm);
		(void) devfs_clean(hdl->h_parent, devnm + 1, DV_CLEAN_FORCE);
		rv = ndi_devi_unconfig_one(hdl->h_parent, devnm + 1, NULL,
		    NDI_DEVI_REMOVE | NDI_UNCONFIG);
		kmem_free(devnm, MAXNAMELEN + 1);
	}
	if (rv == 0) {
		hdl->h_child = NULL;
	}

	ndi_devi_exit(hdl->h_parent);
	return (rv == NDI_SUCCESS ? DDI_SUCCESS : DDI_FAILURE);
}

void
bd_xfer_done(bd_xfer_t *xfer, int err)
{
	bd_xfer_impl_t	*xi = (void *)xfer;
	buf_t		*bp = xi->i_bp;
	int		rv = DDI_SUCCESS;
	bd_t		*bd = xi->i_bd;
	size_t		len;

	if (err != 0) {
		bd_runq_exit(xi, err);
		atomic_inc_32(&bd->d_kerr->bd_harderrs.value.ui32);

		bp->b_resid += xi->i_resid;
		bd_xfer_free(xi);
		bioerror(bp, err);
		biodone(bp);
		return;
	}

	xi->i_cur_win++;
	xi->i_resid -= xi->i_len;

	if (xi->i_resid == 0) {
		/* Job completed successfully! */
		bd_runq_exit(xi, 0);

		bd_xfer_free(xi);
		biodone(bp);
		return;
	}

	xi->i_blkno += xi->i_nblks;

	if (bd->d_use_dma) {
		/* More transfer still pending... advance to next DMA window */
		rv = ddi_dma_getwin(xi->i_dmah, xi->i_cur_win,
		    &xi->i_offset, &len, &xi->i_dmac, &xi->i_ndmac);
	} else {
		/* Advance memory window */
		xi->i_kaddr += xi->i_len;
		xi->i_offset += xi->i_len;
		len = min(bp->b_bcount - xi->i_offset, bd->d_maxxfer);
	}

	if ((rv != DDI_SUCCESS) ||
	    (P2PHASE(len, (1U << xi->i_blkshift)) != 0)) {
		bd_runq_exit(xi, EFAULT);

		bp->b_resid += xi->i_resid;
		bd_xfer_free(xi);
		bioerror(bp, EFAULT);
		biodone(bp);
		return;
	}
	xi->i_len = len;
	xi->i_nblks = len >> xi->i_blkshift;

	/* Submit next window to hardware */
	rv = xi->i_func(bd->d_private, &xi->i_public);
	if (rv != 0) {
		bd_runq_exit(xi, rv);

		atomic_inc_32(&bd->d_kerr->bd_transerrs.value.ui32);

		bp->b_resid += xi->i_resid;
		bd_xfer_free(xi);
		bioerror(bp, rv);
		biodone(bp);
	}
}

void
bd_error(bd_xfer_t *xfer, int error)
{
	bd_xfer_impl_t	*xi = (void *)xfer;
	bd_t		*bd = xi->i_bd;

	switch (error) {
	case BD_ERR_MEDIA:
		atomic_inc_32(&bd->d_kerr->bd_rq_media_err.value.ui32);
		break;
	case BD_ERR_NTRDY:
		atomic_inc_32(&bd->d_kerr->bd_rq_ntrdy_err.value.ui32);
		break;
	case BD_ERR_NODEV:
		atomic_inc_32(&bd->d_kerr->bd_rq_nodev_err.value.ui32);
		break;
	case BD_ERR_RECOV:
		atomic_inc_32(&bd->d_kerr->bd_rq_recov_err.value.ui32);
		break;
	case BD_ERR_ILLRQ:
		atomic_inc_32(&bd->d_kerr->bd_rq_illrq_err.value.ui32);
		break;
	case BD_ERR_PFA:
		atomic_inc_32(&bd->d_kerr->bd_rq_pfa_err.value.ui32);
		break;
	default:
		cmn_err(CE_PANIC, "bd_error: unknown error type %d", error);
		break;
	}
}

void
bd_state_change(bd_handle_t hdl)
{
	bd_t	*bd;

	if ((bd = hdl->h_bd) != NULL) {
		bd_update_state(bd);
	}
}
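
/*
 * Illustration only (hypothetical names): a parent driver completing a
 * transfer from its interrupt or completion handler.  A failure is first
 * classified with bd_error() for kstat accounting and then finished with
 * bd_xfer_done(); blkdev advances to the next window or completes the
 * buf as appropriate.
 *
 *	static void
 *	my_io_done(my_cmd_t *cmd)
 *	{
 *		bd_xfer_t *xfer = cmd->mc_xfer;
 *
 *		if (cmd->mc_failed) {
 *			bd_error(xfer, BD_ERR_MEDIA);
 *			bd_xfer_done(xfer, EIO);
 *		} else {
 *			bd_xfer_done(xfer, 0);
 *		}
 *	}
 */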

void
bd_mod_init(struct dev_ops *devops)
{
	static struct bus_ops bd_bus_ops = {
		BUSO_REV,		/* busops_rev */
		nullbusmap,		/* bus_map */
		NULL,			/* bus_get_intrspec (OBSOLETE) */
		NULL,			/* bus_add_intrspec (OBSOLETE) */
		NULL,			/* bus_remove_intrspec (OBSOLETE) */
		i_ddi_map_fault,	/* bus_map_fault */
		NULL,			/* bus_dma_map (OBSOLETE) */
		ddi_dma_allochdl,	/* bus_dma_allochdl */
		ddi_dma_freehdl,	/* bus_dma_freehdl */
		ddi_dma_bindhdl,	/* bus_dma_bindhdl */
		ddi_dma_unbindhdl,	/* bus_dma_unbindhdl */
		ddi_dma_flush,		/* bus_dma_flush */
		ddi_dma_win,		/* bus_dma_win */
		ddi_dma_mctl,		/* bus_dma_ctl */
		bd_bus_ctl,		/* bus_ctl */
		ddi_bus_prop_op,	/* bus_prop_op */
		NULL,			/* bus_get_eventcookie */
		NULL,			/* bus_add_eventcall */
		NULL,			/* bus_remove_eventcall */
		NULL,			/* bus_post_event */
		NULL,			/* bus_intr_ctl (OBSOLETE) */
		NULL,			/* bus_config */
		NULL,			/* bus_unconfig */
		NULL,			/* bus_fm_init */
		NULL,			/* bus_fm_fini */
		NULL,			/* bus_fm_access_enter */
		NULL,			/* bus_fm_access_exit */
		NULL,			/* bus_power */
		NULL,			/* bus_intr_op */
	};

	devops->devo_bus_ops = &bd_bus_ops;

	/*
	 * NB: The device driver is free to supply its own
	 * character entry device support.
	 */
}

void
bd_mod_fini(struct dev_ops *devops)
{
	devops->devo_bus_ops = NULL;
}
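
/*
 * Illustration only (hypothetical names): a parent driver wiring blkdev's
 * bus_ops into its dev_ops from _init(9E)/_fini(9E) so that the blkdev
 * child nodes it attaches can be configured.
 *
 *	int
 *	_init(void)
 *	{
 *		int rv;
 *
 *		bd_mod_init(&my_dev_ops);
 *		if ((rv = mod_install(&my_modlinkage)) != 0)
 *			bd_mod_fini(&my_dev_ops);
 *		return (rv);
 *	}
 *
 *	int
 *	_fini(void)
 *	{
 *		int rv;
 *
 *		if ((rv = mod_remove(&my_modlinkage)) == 0)
 *			bd_mod_fini(&my_dev_ops);
 *		return (rv);
 *	}
 */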