/* SPDX-License-Identifier: MIT */
/*
 * Description: uring_cmd based ublk
 */

#include "kublk.h"

#define MAX_NR_TGT_ARG 64

unsigned int ublk_dbg_mask = UBLK_LOG;
static const struct ublk_tgt_ops *tgt_ops_list[] = {
	&null_tgt_ops,
	&loop_tgt_ops,
	&stripe_tgt_ops,
	&fault_inject_tgt_ops,
};

static const struct ublk_tgt_ops *ublk_find_tgt(const char *name)
{
	int i;

	if (name == NULL)
		return NULL;

	for (i = 0; i < ARRAY_SIZE(tgt_ops_list); i++)
		if (strcmp(tgt_ops_list[i]->name, name) == 0)
			return tgt_ops_list[i];
	return NULL;
}

static inline int ublk_setup_ring(struct io_uring *r, int depth,
				  int cq_depth, unsigned flags)
{
	struct io_uring_params p;

	memset(&p, 0, sizeof(p));
	p.flags = flags | IORING_SETUP_CQSIZE;
	p.cq_entries = cq_depth;

	return io_uring_queue_init_params(depth, r, &p);
}

static void ublk_ctrl_init_cmd(struct ublk_dev *dev,
			       struct io_uring_sqe *sqe,
			       struct ublk_ctrl_cmd_data *data)
{
	struct ublksrv_ctrl_dev_info *info = &dev->dev_info;
	struct ublksrv_ctrl_cmd *cmd = (struct ublksrv_ctrl_cmd *)ublk_get_sqe_cmd(sqe);

	sqe->fd = dev->ctrl_fd;
	sqe->opcode = IORING_OP_URING_CMD;
	sqe->ioprio = 0;

	if (data->flags & CTRL_CMD_HAS_BUF) {
		cmd->addr = data->addr;
		cmd->len = data->len;
	}

	if (data->flags & CTRL_CMD_HAS_DATA)
		cmd->data[0] = data->data[0];

	cmd->dev_id = info->dev_id;
	cmd->queue_id = -1;

	ublk_set_sqe_cmd_op(sqe, data->cmd_op);

	io_uring_sqe_set_data(sqe, cmd);
}

static int __ublk_ctrl_cmd(struct ublk_dev *dev,
			   struct ublk_ctrl_cmd_data *data)
{
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	int ret = -EINVAL;

	sqe = io_uring_get_sqe(&dev->ring);
	if (!sqe) {
		ublk_err("%s: can't get sqe ret %d\n", __func__, ret);
		return ret;
	}

	ublk_ctrl_init_cmd(dev, sqe, data);

	ret = io_uring_submit(&dev->ring);
	if (ret < 0) {
		ublk_err("uring submit ret %d\n", ret);
		return ret;
	}

	ret = io_uring_wait_cqe(&dev->ring, &cqe);
	if (ret < 0) {
		ublk_err("wait cqe: %s\n", strerror(-ret));
		return ret;
	}
	io_uring_cqe_seen(&dev->ring, cqe);

	return cqe->res;
}

static int ublk_ctrl_stop_dev(struct ublk_dev *dev)
{
	struct ublk_ctrl_cmd_data data = {
		.cmd_op = UBLK_U_CMD_STOP_DEV,
	};

	return __ublk_ctrl_cmd(dev, &data);
}

static int ublk_ctrl_start_dev(struct ublk_dev *dev,
			       int daemon_pid)
{
	struct ublk_ctrl_cmd_data data = {
		.cmd_op = UBLK_U_CMD_START_DEV,
		.flags = CTRL_CMD_HAS_DATA,
	};

	dev->dev_info.ublksrv_pid = data.data[0] = daemon_pid;

	return __ublk_ctrl_cmd(dev, &data);
}

static int ublk_ctrl_start_user_recovery(struct ublk_dev *dev)
{
	struct ublk_ctrl_cmd_data data = {
		.cmd_op = UBLK_U_CMD_START_USER_RECOVERY,
	};

	return __ublk_ctrl_cmd(dev, &data);
}

static int ublk_ctrl_end_user_recovery(struct ublk_dev *dev, int daemon_pid)
{
	struct ublk_ctrl_cmd_data data = {
		.cmd_op = UBLK_U_CMD_END_USER_RECOVERY,
		.flags = CTRL_CMD_HAS_DATA,
	};

	dev->dev_info.ublksrv_pid = data.data[0] = daemon_pid;

	return __ublk_ctrl_cmd(dev, &data);
}
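
/*
 * Control-plane helpers: each wrapper below fills a ublk_ctrl_cmd_data and
 * funnels it through __ublk_ctrl_cmd(), which queues a single
 * IORING_OP_URING_CMD SQE on the control ring (the CTRL_DEV fd) and waits
 * synchronously for its CQE. The CQE result is returned as the command
 * status.
 */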

static int ublk_ctrl_add_dev(struct ublk_dev *dev)
{
	struct ublk_ctrl_cmd_data data = {
		.cmd_op = UBLK_U_CMD_ADD_DEV,
		.flags = CTRL_CMD_HAS_BUF,
		.addr = (__u64) (uintptr_t) &dev->dev_info,
		.len = sizeof(struct ublksrv_ctrl_dev_info),
	};

	return __ublk_ctrl_cmd(dev, &data);
}

static int ublk_ctrl_del_dev(struct ublk_dev *dev)
{
	struct ublk_ctrl_cmd_data data = {
		.cmd_op = UBLK_U_CMD_DEL_DEV,
		.flags = 0,
	};

	return __ublk_ctrl_cmd(dev, &data);
}

static int ublk_ctrl_get_info(struct ublk_dev *dev)
{
	struct ublk_ctrl_cmd_data data = {
		.cmd_op = UBLK_U_CMD_GET_DEV_INFO,
		.flags = CTRL_CMD_HAS_BUF,
		.addr = (__u64) (uintptr_t) &dev->dev_info,
		.len = sizeof(struct ublksrv_ctrl_dev_info),
	};

	return __ublk_ctrl_cmd(dev, &data);
}

static int ublk_ctrl_set_params(struct ublk_dev *dev,
				struct ublk_params *params)
{
	struct ublk_ctrl_cmd_data data = {
		.cmd_op = UBLK_U_CMD_SET_PARAMS,
		.flags = CTRL_CMD_HAS_BUF,
		.addr = (__u64) (uintptr_t) params,
		.len = sizeof(*params),
	};
	params->len = sizeof(*params);
	return __ublk_ctrl_cmd(dev, &data);
}

static int ublk_ctrl_get_params(struct ublk_dev *dev,
				struct ublk_params *params)
{
	struct ublk_ctrl_cmd_data data = {
		.cmd_op = UBLK_U_CMD_GET_PARAMS,
		.flags = CTRL_CMD_HAS_BUF,
		.addr = (__u64)params,
		.len = sizeof(*params),
	};

	params->len = sizeof(*params);

	return __ublk_ctrl_cmd(dev, &data);
}

static int ublk_ctrl_get_features(struct ublk_dev *dev,
				  __u64 *features)
{
	struct ublk_ctrl_cmd_data data = {
		.cmd_op = UBLK_U_CMD_GET_FEATURES,
		.flags = CTRL_CMD_HAS_BUF,
		.addr = (__u64) (uintptr_t) features,
		.len = sizeof(*features),
	};

	return __ublk_ctrl_cmd(dev, &data);
}

static int ublk_ctrl_update_size(struct ublk_dev *dev,
				 __u64 nr_sects)
{
	struct ublk_ctrl_cmd_data data = {
		.cmd_op = UBLK_U_CMD_UPDATE_SIZE,
		.flags = CTRL_CMD_HAS_DATA,
	};

	data.data[0] = nr_sects;
	return __ublk_ctrl_cmd(dev, &data);
}

static int ublk_ctrl_quiesce_dev(struct ublk_dev *dev,
				 unsigned int timeout_ms)
{
	struct ublk_ctrl_cmd_data data = {
		.cmd_op = UBLK_U_CMD_QUIESCE_DEV,
		.flags = CTRL_CMD_HAS_DATA,
	};

	data.data[0] = timeout_ms;
	return __ublk_ctrl_cmd(dev, &data);
}

static const char *ublk_dev_state_desc(struct ublk_dev *dev)
{
	switch (dev->dev_info.state) {
	case UBLK_S_DEV_DEAD:
		return "DEAD";
	case UBLK_S_DEV_LIVE:
		return "LIVE";
	case UBLK_S_DEV_QUIESCED:
		return "QUIESCED";
	default:
		return "UNKNOWN";
	}
}

static void ublk_print_cpu_set(const cpu_set_t *set, char *buf, unsigned len)
{
	unsigned done = 0;
	int i;

	for (i = 0; i < CPU_SETSIZE; i++) {
		if (CPU_ISSET(i, set))
			done += snprintf(&buf[done], len - done, "%d ", i);
	}
}

static void ublk_adjust_affinity(cpu_set_t *set)
{
	int j, updated = 0;

	/*
	 * Just keep the 1st CPU now.
	 *
	 * In future, auto affinity selection can be tried.
	 */
	for (j = 0; j < CPU_SETSIZE; j++) {
		if (CPU_ISSET(j, set)) {
			if (!updated) {
				updated = 1;
				continue;
			}
			CPU_CLR(j, set);
		}
	}
}

/* Caller must free the allocated buffer */
static int ublk_ctrl_get_affinity(struct ublk_dev *ctrl_dev, cpu_set_t **ptr_buf)
{
	struct ublk_ctrl_cmd_data data = {
		.cmd_op = UBLK_U_CMD_GET_QUEUE_AFFINITY,
		.flags = CTRL_CMD_HAS_DATA | CTRL_CMD_HAS_BUF,
	};
	cpu_set_t *buf;
	int i, ret;

	buf = malloc(sizeof(cpu_set_t) * ctrl_dev->dev_info.nr_hw_queues);
	if (!buf)
		return -ENOMEM;

	for (i = 0; i < ctrl_dev->dev_info.nr_hw_queues; i++) {
		data.data[0] = i;
		data.len = sizeof(cpu_set_t);
		data.addr = (__u64)&buf[i];

		ret = __ublk_ctrl_cmd(ctrl_dev, &data);
		if (ret < 0) {
			free(buf);
			return ret;
		}
		ublk_adjust_affinity(&buf[i]);
	}

	*ptr_buf = buf;
	return 0;
}

static void ublk_ctrl_dump(struct ublk_dev *dev)
{
	struct ublksrv_ctrl_dev_info *info = &dev->dev_info;
	struct ublk_params p;
	cpu_set_t *affinity;
	int ret;

	ret = ublk_ctrl_get_params(dev, &p);
	if (ret < 0) {
		ublk_err("failed to get params %d %s\n", ret, strerror(-ret));
		return;
	}

	ret = ublk_ctrl_get_affinity(dev, &affinity);
	if (ret < 0) {
		ublk_err("failed to get affinity %m\n");
		return;
	}

	ublk_log("dev id %d: nr_hw_queues %d queue_depth %d block size %d dev_capacity %lld\n",
			info->dev_id, info->nr_hw_queues, info->queue_depth,
			1 << p.basic.logical_bs_shift, p.basic.dev_sectors);
	ublk_log("\tmax rq size %d daemon pid %d flags 0x%llx state %s\n",
			info->max_io_buf_bytes, info->ublksrv_pid, info->flags,
			ublk_dev_state_desc(dev));

	if (affinity) {
		char buf[512];
		int i;

		for (i = 0; i < info->nr_hw_queues; i++) {
			ublk_print_cpu_set(&affinity[i], buf, sizeof(buf));
			printf("\tqueue %u: affinity(%s)\n",
					i, buf);
		}
		free(affinity);
	}

	fflush(stdout);
}

static void ublk_ctrl_deinit(struct ublk_dev *dev)
{
	close(dev->ctrl_fd);
	free(dev);
}

static struct ublk_dev *ublk_ctrl_init(void)
{
	struct ublk_dev *dev = (struct ublk_dev *)calloc(1, sizeof(*dev));
	struct ublksrv_ctrl_dev_info *info;
	int ret;

	/* calloc() may fail; bail out before touching the device */
	if (!dev)
		return NULL;

	info = &dev->dev_info;
	dev->ctrl_fd = open(CTRL_DEV, O_RDWR);
	if (dev->ctrl_fd < 0) {
		free(dev);
		return NULL;
	}

	info->max_io_buf_bytes = UBLK_IO_MAX_BYTES;

	ret = ublk_setup_ring(&dev->ring, UBLK_CTRL_RING_DEPTH,
			UBLK_CTRL_RING_DEPTH, IORING_SETUP_SQE128);
	if (ret < 0) {
		ublk_err("queue_init: %s\n", strerror(-ret));
		close(dev->ctrl_fd);
		free(dev);
		return NULL;
	}
	dev->nr_fds = 1;

	return dev;
}

static int __ublk_queue_cmd_buf_sz(unsigned depth)
{
	int size = depth * sizeof(struct ublksrv_io_desc);
	unsigned int page_sz = getpagesize();

	return round_up(size, page_sz);
}

static int ublk_queue_max_cmd_buf_sz(void)
{
	return __ublk_queue_cmd_buf_sz(UBLK_MAX_QUEUE_DEPTH);
}

static int ublk_queue_cmd_buf_sz(struct ublk_queue *q)
{
	return __ublk_queue_cmd_buf_sz(q->q_depth);
}

static void ublk_queue_deinit(struct ublk_queue *q)
{
	int i;
	int nr_ios = q->q_depth;

	if (q->io_cmd_buf)
		munmap(q->io_cmd_buf, ublk_queue_cmd_buf_sz(q));

	for (i = 0; i < nr_ios; i++)
		free(q->ios[i].buf_addr);
}
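
/*
 * Per-queue resources (the mmap'ed io_cmd_buf and the per-io buffers) are
 * released in ublk_queue_deinit() above; each worker thread additionally
 * owns its own io_uring instance, which is unregistered and closed in
 * ublk_thread_deinit() below.
 */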

static void ublk_thread_deinit(struct ublk_thread *t)
{
	io_uring_unregister_buffers(&t->ring);

	io_uring_unregister_ring_fd(&t->ring);

	if (t->ring.ring_fd > 0) {
		io_uring_unregister_files(&t->ring);
		close(t->ring.ring_fd);
		t->ring.ring_fd = -1;
	}
}

static int ublk_queue_init(struct ublk_queue *q, unsigned long long extra_flags)
{
	struct ublk_dev *dev = q->dev;
	int depth = dev->dev_info.queue_depth;
	int i;
	int cmd_buf_size, io_buf_size;
	unsigned long off;

	q->tgt_ops = dev->tgt.ops;
	q->flags = 0;
	q->q_depth = depth;
	q->flags = dev->dev_info.flags;
	q->flags |= extra_flags;

	/* Cache fd in queue for fast path access */
	q->ublk_fd = dev->fds[0];

	cmd_buf_size = ublk_queue_cmd_buf_sz(q);
	off = UBLKSRV_CMD_BUF_OFFSET + q->q_id * ublk_queue_max_cmd_buf_sz();
	q->io_cmd_buf = mmap(0, cmd_buf_size, PROT_READ,
			MAP_SHARED | MAP_POPULATE, dev->fds[0], off);
	if (q->io_cmd_buf == MAP_FAILED) {
		ublk_err("ublk dev %d queue %d map io_cmd_buf failed %m\n",
				q->dev->dev_info.dev_id, q->q_id);
		goto fail;
	}

	io_buf_size = dev->dev_info.max_io_buf_bytes;
	for (i = 0; i < q->q_depth; i++) {
		q->ios[i].buf_addr = NULL;
		q->ios[i].flags = UBLKS_IO_NEED_FETCH_RQ | UBLKS_IO_FREE;
		q->ios[i].tag = i;

		if (ublk_queue_no_buf(q))
			continue;

		if (posix_memalign((void **)&q->ios[i].buf_addr,
					getpagesize(), io_buf_size)) {
			ublk_err("ublk dev %d queue %d io %d posix_memalign failed %m\n",
					dev->dev_info.dev_id, q->q_id, i);
			goto fail;
		}
	}

	return 0;
fail:
	ublk_queue_deinit(q);
	ublk_err("ublk dev %d queue %d failed\n",
			dev->dev_info.dev_id, q->q_id);
	return -ENOMEM;
}

static int ublk_thread_init(struct ublk_thread *t, unsigned long long extra_flags)
{
	struct ublk_dev *dev = t->dev;
	unsigned long long flags = dev->dev_info.flags | extra_flags;
	int ring_depth = dev->tgt.sq_depth, cq_depth = dev->tgt.cq_depth;
	int ret;

	ret = ublk_setup_ring(&t->ring, ring_depth, cq_depth,
			IORING_SETUP_COOP_TASKRUN |
			IORING_SETUP_SINGLE_ISSUER |
			IORING_SETUP_DEFER_TASKRUN);
	if (ret < 0) {
		ublk_err("ublk dev %d thread %d setup io_uring failed %d\n",
				dev->dev_info.dev_id, t->idx, ret);
		goto fail;
	}

	if (dev->dev_info.flags & (UBLK_F_SUPPORT_ZERO_COPY | UBLK_F_AUTO_BUF_REG)) {
		unsigned nr_ios = dev->dev_info.queue_depth * dev->dev_info.nr_hw_queues;
		unsigned max_nr_ios_per_thread = nr_ios / dev->nthreads;
		max_nr_ios_per_thread += !!(nr_ios % dev->nthreads);
		ret = io_uring_register_buffers_sparse(
			&t->ring, max_nr_ios_per_thread);
		if (ret) {
			ublk_err("ublk dev %d thread %d register sparse buffers failed %d\n",
					dev->dev_info.dev_id, t->idx, ret);
			goto fail;
		}
	}

	io_uring_register_ring_fd(&t->ring);

	if (flags & UBLKS_Q_NO_UBLK_FIXED_FD) {
		/* Register only backing files starting from index 1, excluding the ublk char device */
		if (dev->nr_fds > 1) {
			ret = io_uring_register_files(&t->ring, &dev->fds[1], dev->nr_fds - 1);
		} else {
			/* No backing files to register, skip file registration */
			ret = 0;
		}
	} else {
		ret = io_uring_register_files(&t->ring, dev->fds, dev->nr_fds);
	}
	if (ret) {
		ublk_err("ublk dev %d thread %d register files failed %d\n",
				t->dev->dev_info.dev_id, t->idx, ret);
		goto fail;
	}

	return 0;
fail:
	ublk_thread_deinit(t);
	ublk_err("ublk dev %d thread %d init failed\n",
			dev->dev_info.dev_id, t->idx);
	return -ENOMEM;
}
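
/*
 * Once the device has been created (or recovery started), the char device
 * node shows up asynchronously, so ublk_dev_prep() below polls for the
 * UBLKC_DEV<dev_id> node (up to MAX_WAIT_USEC) before opening it and
 * calling the target's ->init_tgt() callback.
 */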

#define WAIT_USEC 100000
#define MAX_WAIT_USEC (3 * 1000000)
static int ublk_dev_prep(const struct dev_ctx *ctx, struct ublk_dev *dev)
{
	int dev_id = dev->dev_info.dev_id;
	unsigned int wait_usec = 0;
	int ret = 0, fd = -1;
	char buf[64];

	snprintf(buf, sizeof(buf), "%s%d", UBLKC_DEV, dev_id);

	while (wait_usec < MAX_WAIT_USEC) {
		fd = open(buf, O_RDWR);
		if (fd >= 0)
			break;
		usleep(WAIT_USEC);
		wait_usec += WAIT_USEC;
	}
	if (fd < 0) {
		ublk_err("can't open %s %s\n", buf, strerror(errno));
		return -1;
	}

	dev->fds[0] = fd;
	if (dev->tgt.ops->init_tgt)
		ret = dev->tgt.ops->init_tgt(ctx, dev);
	if (ret)
		close(dev->fds[0]);
	return ret;
}

static void ublk_dev_unprep(struct ublk_dev *dev)
{
	if (dev->tgt.ops->deinit_tgt)
		dev->tgt.ops->deinit_tgt(dev);
	close(dev->fds[0]);
}

static void ublk_set_auto_buf_reg(const struct ublk_queue *q,
				  struct io_uring_sqe *sqe,
				  unsigned short tag)
{
	struct ublk_auto_buf_reg buf = {};

	if (q->tgt_ops->buf_index)
		buf.index = q->tgt_ops->buf_index(q, tag);
	else
		buf.index = q->ios[tag].buf_index;

	if (ublk_queue_auto_zc_fallback(q))
		buf.flags = UBLK_AUTO_BUF_REG_FALLBACK;

	sqe->addr = ublk_auto_buf_reg_to_sqe_addr(&buf);
}

/* Copy in pieces to test the buffer offset logic */
#define UBLK_USER_COPY_LEN 2048

static void ublk_user_copy(const struct ublk_io *io, __u8 match_ublk_op)
{
	const struct ublk_queue *q = ublk_io_to_queue(io);
	const struct ublksrv_io_desc *iod = ublk_get_iod(q, io->tag);
	__u64 off = ublk_user_copy_offset(q->q_id, io->tag);
	__u8 ublk_op = ublksrv_get_op(iod);
	__u32 len = iod->nr_sectors << 9;
	void *addr = io->buf_addr;

	if (ublk_op != match_ublk_op)
		return;

	while (len) {
		__u32 copy_len = min(len, UBLK_USER_COPY_LEN);
		ssize_t copied;

		if (ublk_op == UBLK_IO_OP_WRITE)
			copied = pread(q->ublk_fd, addr, copy_len, off);
		else if (ublk_op == UBLK_IO_OP_READ)
			copied = pwrite(q->ublk_fd, addr, copy_len, off);
		else
			assert(0);
		assert(copied == (ssize_t)copy_len);
		addr += copy_len;
		off += copy_len;
		len -= copy_len;
	}
}

int ublk_queue_io_cmd(struct ublk_thread *t, struct ublk_io *io)
{
	struct ublk_queue *q = ublk_io_to_queue(io);
	struct ublksrv_io_cmd *cmd;
	struct io_uring_sqe *sqe[1];
	unsigned int cmd_op = 0;
	__u64 user_data;

	/* only freed io can be issued */
	if (!(io->flags & UBLKS_IO_FREE))
		return 0;

	/*
	 * We only issue an io command when we need to fetch a request,
	 * commit a completed one, or get data.
	 */
	if (!(io->flags &
		(UBLKS_IO_NEED_FETCH_RQ | UBLKS_IO_NEED_COMMIT_RQ_COMP | UBLKS_IO_NEED_GET_DATA)))
		return 0;

	if (io->flags & UBLKS_IO_NEED_GET_DATA)
		cmd_op = UBLK_U_IO_NEED_GET_DATA;
	else if (io->flags & UBLKS_IO_NEED_COMMIT_RQ_COMP) {
		if (ublk_queue_use_user_copy(q))
			ublk_user_copy(io, UBLK_IO_OP_READ);

		cmd_op = UBLK_U_IO_COMMIT_AND_FETCH_REQ;
	} else if (io->flags & UBLKS_IO_NEED_FETCH_RQ)
		cmd_op = UBLK_U_IO_FETCH_REQ;

	if (io_uring_sq_space_left(&t->ring) < 1)
		io_uring_submit(&t->ring);

	ublk_io_alloc_sqes(t, sqe, 1);
	if (!sqe[0]) {
		ublk_err("%s: run out of sqe. thread %u, tag %d\n",
				__func__, t->idx, io->tag);
		return -1;
	}

	cmd = (struct ublksrv_io_cmd *)ublk_get_sqe_cmd(sqe[0]);

	if (cmd_op == UBLK_U_IO_COMMIT_AND_FETCH_REQ)
		cmd->result = io->result;

	/* These fields should be written once, never change */
	ublk_set_sqe_cmd_op(sqe[0], cmd_op);
	sqe[0]->fd = ublk_get_registered_fd(q, 0);	/* dev->fds[0] */
	sqe[0]->opcode = IORING_OP_URING_CMD;
	if (q->flags & UBLKS_Q_NO_UBLK_FIXED_FD)
		sqe[0]->flags = 0;	/* Use raw FD, not fixed file */
	else
		sqe[0]->flags = IOSQE_FIXED_FILE;
	sqe[0]->rw_flags = 0;
	cmd->tag = io->tag;
	cmd->q_id = q->q_id;
	if (!ublk_queue_no_buf(q) && !ublk_queue_use_user_copy(q))
		cmd->addr = (__u64) (uintptr_t) io->buf_addr;
	else
		cmd->addr = 0;

	if (ublk_queue_use_auto_zc(q))
		ublk_set_auto_buf_reg(q, sqe[0], io->tag);

	user_data = build_user_data(io->tag, _IOC_NR(cmd_op), 0, q->q_id, 0);
	io_uring_sqe_set_data64(sqe[0], user_data);

	io->flags = 0;

	t->cmd_inflight += 1;

	ublk_dbg(UBLK_DBG_IO_CMD, "%s: (thread %u qid %d tag %u cmd_op %u) iof %x stopping %d\n",
			__func__, t->idx, q->q_id, io->tag, cmd_op,
			io->flags, !!(t->state & UBLKS_T_STOPPING));
	return 1;
}
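
/*
 * Prime the queues: at thread start every io slot is issued to the driver as
 * UBLK_U_IO_FETCH_REQ (its flags are initialized to NEED_FETCH_RQ | FREE in
 * ublk_queue_init()), so the driver has a command it can complete whenever a
 * block request arrives. How (queue, tag) pairs map to threads depends on
 * whether per_io_tasks is enabled, as described in the comments below.
 */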

static void ublk_submit_fetch_commands(struct ublk_thread *t)
{
	struct ublk_queue *q;
	struct ublk_io *io;
	int i = 0, j = 0;

	if (t->dev->per_io_tasks) {
		/*
		 * Lexicographically order all the (qid,tag) pairs, with
		 * qid taking priority (so (1,0) > (0,1)). Then make
		 * this thread the daemon for every Nth entry in this
		 * list (N is the number of threads), starting at this
		 * thread's index. This ensures that each queue is
		 * handled by as many ublk server threads as possible,
		 * so that load that is concentrated on one or a few
		 * queues can make use of all ublk server threads.
		 */
		const struct ublksrv_ctrl_dev_info *dinfo = &t->dev->dev_info;
		int nr_ios = dinfo->nr_hw_queues * dinfo->queue_depth;

		for (i = t->idx; i < nr_ios; i += t->dev->nthreads) {
			int q_id = i / dinfo->queue_depth;
			int tag = i % dinfo->queue_depth;

			q = &t->dev->q[q_id];
			io = &q->ios[tag];
			io->buf_index = j++;
			ublk_queue_io_cmd(t, io);
		}
	} else {
		/*
		 * Service exclusively the queue whose q_id matches our
		 * thread index.
		 */
		struct ublk_queue *q = &t->dev->q[t->idx];

		for (i = 0; i < q->q_depth; i++) {
			io = &q->ios[i];
			io->buf_index = i;
			ublk_queue_io_cmd(t, io);
		}
	}
}

static int ublk_thread_is_idle(struct ublk_thread *t)
{
	return !io_uring_sq_ready(&t->ring) && !t->io_inflight;
}

static int ublk_thread_is_done(struct ublk_thread *t)
{
	return (t->state & UBLKS_T_STOPPING) && ublk_thread_is_idle(t) && !t->cmd_inflight;
}

static inline void ublksrv_handle_tgt_cqe(struct ublk_thread *t,
					  struct ublk_queue *q,
					  struct io_uring_cqe *cqe)
{
	if (cqe->res < 0 && cqe->res != -EAGAIN)
		ublk_err("%s: failed tgt io: res %d qid %u tag %u, cmd_op %u\n",
			__func__, cqe->res, q->q_id,
			user_data_to_tag(cqe->user_data),
			user_data_to_op(cqe->user_data));

	if (q->tgt_ops->tgt_io_done)
		q->tgt_ops->tgt_io_done(t, q, cqe);
}
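
/*
 * Completion of a ublk io command: UBLK_IO_RES_OK carries a new request for
 * the target's ->queue_io(), UBLK_IO_RES_NEED_GET_DATA asks us to re-issue
 * the command as NEED_GET_DATA before the request can be served, and
 * UBLK_IO_RES_ABORT (or a stopping thread) means no further fetch is queued
 * for this tag.
 */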

static void ublk_handle_uring_cmd(struct ublk_thread *t,
				  struct ublk_queue *q,
				  const struct io_uring_cqe *cqe)
{
	int fetch = (cqe->res != UBLK_IO_RES_ABORT) &&
		!(t->state & UBLKS_T_STOPPING);
	unsigned tag = user_data_to_tag(cqe->user_data);
	struct ublk_io *io = &q->ios[tag];

	if (!fetch) {
		t->state |= UBLKS_T_STOPPING;
		io->flags &= ~UBLKS_IO_NEED_FETCH_RQ;
	}

	if (cqe->res == UBLK_IO_RES_OK) {
		assert(tag < q->q_depth);

		if (ublk_queue_use_user_copy(q))
			ublk_user_copy(io, UBLK_IO_OP_WRITE);

		if (q->tgt_ops->queue_io)
			q->tgt_ops->queue_io(t, q, tag);
	} else if (cqe->res == UBLK_IO_RES_NEED_GET_DATA) {
		io->flags |= UBLKS_IO_NEED_GET_DATA | UBLKS_IO_FREE;
		ublk_queue_io_cmd(t, io);
	} else {
		/*
		 * COMMIT_REQ will be completed immediately since no fetching
		 * piggyback is required.
		 *
		 * Mark IO_FREE only, so this io won't be issued again: we
		 * only issue an io with (UBLKS_IO_FREE | UBLKS_IO_NEED_*)
		 * set.
		 */
		io->flags = UBLKS_IO_FREE;
	}
}
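
/*
 * CQE demultiplexing: build_user_data() packs tag, command op, optional
 * target data and queue id into cqe->user_data. Target CQEs are routed to
 * the target's ->tgt_io_done(), while ublk command CQEs decrement
 * cmd_inflight and are handled by ublk_handle_uring_cmd() above.
 */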

static void ublk_handle_cqe(struct ublk_thread *t,
			    struct io_uring_cqe *cqe, void *data)
{
	struct ublk_dev *dev = t->dev;
	unsigned q_id = user_data_to_q_id(cqe->user_data);
	struct ublk_queue *q = &dev->q[q_id];
	unsigned cmd_op = user_data_to_op(cqe->user_data);

	if (cqe->res < 0 && cqe->res != -ENODEV)
		ublk_err("%s: res %d userdata %llx queue state %x\n", __func__,
				cqe->res, cqe->user_data, q->flags);

	ublk_dbg(UBLK_DBG_IO_CMD, "%s: res %d (qid %d tag %u cmd_op %u target %d/%d) stopping %d\n",
			__func__, cqe->res, q->q_id, user_data_to_tag(cqe->user_data),
			cmd_op, is_target_io(cqe->user_data),
			user_data_to_tgt_data(cqe->user_data),
			(t->state & UBLKS_T_STOPPING));

	/* Don't retrieve io in case of target io */
	if (is_target_io(cqe->user_data)) {
		ublksrv_handle_tgt_cqe(t, q, cqe);
		return;
	}

	t->cmd_inflight--;

	ublk_handle_uring_cmd(t, q, cqe);
}

static int ublk_reap_events_uring(struct ublk_thread *t)
{
	struct io_uring_cqe *cqe;
	unsigned head;
	int count = 0;

	io_uring_for_each_cqe(&t->ring, head, cqe) {
		ublk_handle_cqe(t, cqe, NULL);
		count += 1;
	}
	io_uring_cq_advance(&t->ring, count);

	return count;
}

static int ublk_process_io(struct ublk_thread *t)
{
	int ret, reapped;

	ublk_dbg(UBLK_DBG_THREAD, "dev%d-t%u: to_submit %d inflight cmd %u stopping %d\n",
			t->dev->dev_info.dev_id,
			t->idx, io_uring_sq_ready(&t->ring),
			t->cmd_inflight,
			(t->state & UBLKS_T_STOPPING));

	if (ublk_thread_is_done(t))
		return -ENODEV;

	ret = io_uring_submit_and_wait(&t->ring, 1);
	reapped = ublk_reap_events_uring(t);

	ublk_dbg(UBLK_DBG_THREAD, "submit result %d, reapped %d stop %d idle %d\n",
			ret, reapped, (t->state & UBLKS_T_STOPPING),
			(t->state & UBLKS_T_IDLE));

	return reapped;
}
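
/*
 * Per-thread event loop: ublk_process_io() submits whatever is pending,
 * waits for at least one CQE, then reaps the whole CQ. The loop ends once
 * the thread is STOPPING with no inflight ublk commands, no pending SQEs
 * and no inflight target io (see ublk_thread_is_done()).
 */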

struct ublk_thread_info {
	struct ublk_dev *dev;
	pthread_t thread;
	unsigned idx;
	sem_t *ready;
	cpu_set_t *affinity;
	unsigned long long extra_flags;
};

static void ublk_thread_set_sched_affinity(const struct ublk_thread_info *info)
{
	if (pthread_setaffinity_np(pthread_self(), sizeof(*info->affinity), info->affinity) < 0)
		ublk_err("ublk dev %u thread %u set affinity failed\n",
				info->dev->dev_info.dev_id, info->idx);
}

static __attribute__((noinline)) int __ublk_io_handler_fn(struct ublk_thread_info *info)
{
	struct ublk_thread t = {
		.dev = info->dev,
		.idx = info->idx,
	};
	int dev_id = info->dev->dev_info.dev_id;
	int ret;

	ret = ublk_thread_init(&t, info->extra_flags);
	if (ret) {
		ublk_err("ublk dev %d thread %u init failed\n",
				dev_id, t.idx);
		return ret;
	}
	sem_post(info->ready);

	ublk_dbg(UBLK_DBG_THREAD, "tid %d: ublk dev %d thread %u started\n",
			gettid(), dev_id, t.idx);

	/* submit all io commands to ublk driver */
	ublk_submit_fetch_commands(&t);
	do {
		if (ublk_process_io(&t) < 0)
			break;
	} while (1);

	ublk_dbg(UBLK_DBG_THREAD, "tid %d: ublk dev %d thread %d exiting\n",
			gettid(), dev_id, t.idx);
	ublk_thread_deinit(&t);
	return 0;
}

static void *ublk_io_handler_fn(void *data)
{
	struct ublk_thread_info *info = data;

	/*
	 * IO performance is sensitive to queue pthread affinity on NUMA
	 * machines.
	 *
	 * Set sched_affinity at the beginning, so memory/pages allocated
	 * afterwards can be CPU/NUMA aware.
	 */
	if (info->affinity)
		ublk_thread_set_sched_affinity(info);

	__ublk_io_handler_fn(info);

	return NULL;
}

static void ublk_set_parameters(struct ublk_dev *dev)
{
	int ret;

	ret = ublk_ctrl_set_params(dev, &dev->tgt.params);
	if (ret)
		ublk_err("dev %d set basic parameter failed %d\n",
				dev->dev_info.dev_id, ret);
}

static int ublk_send_dev_event(const struct dev_ctx *ctx, struct ublk_dev *dev, int dev_id)
{
	uint64_t id;
	int evtfd = ctx->_evtfd;

	if (evtfd < 0)
		return -EBADF;

	if (dev_id >= 0)
		id = dev_id + 1;
	else
		id = ERROR_EVTFD_DEVID;

	if (dev && ctx->shadow_dev)
		memcpy(&ctx->shadow_dev->q, &dev->q, sizeof(dev->q));

	if (write(evtfd, &id, sizeof(id)) != sizeof(id))
		return -EINVAL;

	close(evtfd);
	shmdt(ctx->shadow_dev);

	return 0;
}
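
/*
 * Daemon bring-up: open the char device, fetch per-queue CPU affinity,
 * initialize the queues, then spawn dev->nthreads worker threads and wait
 * for each to signal readiness. Only then is the device started (or user
 * recovery ended), and the result is either dumped to stdout (--foreground)
 * or reported to the parent process via eventfd.
 */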

static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev)
{
	const struct ublksrv_ctrl_dev_info *dinfo = &dev->dev_info;
	struct ublk_thread_info *tinfo;
	unsigned long long extra_flags = 0;
	cpu_set_t *affinity_buf;
	void *thread_ret;
	sem_t ready;
	int ret, i;

	ublk_dbg(UBLK_DBG_DEV, "%s enter\n", __func__);

	tinfo = calloc(sizeof(struct ublk_thread_info), dev->nthreads);
	if (!tinfo)
		return -ENOMEM;

	sem_init(&ready, 0, 0);
	ret = ublk_dev_prep(ctx, dev);
	if (ret)
		return ret;

	ret = ublk_ctrl_get_affinity(dev, &affinity_buf);
	if (ret)
		return ret;

	if (ctx->auto_zc_fallback)
		extra_flags = UBLKS_Q_AUTO_BUF_REG_FALLBACK;
	if (ctx->no_ublk_fixed_fd)
		extra_flags |= UBLKS_Q_NO_UBLK_FIXED_FD;

	for (i = 0; i < dinfo->nr_hw_queues; i++) {
		dev->q[i].dev = dev;
		dev->q[i].q_id = i;

		ret = ublk_queue_init(&dev->q[i], extra_flags);
		if (ret) {
			ublk_err("ublk dev %d queue %d init queue failed\n",
					dinfo->dev_id, i);
			goto fail;
		}
	}

	for (i = 0; i < dev->nthreads; i++) {
		tinfo[i].dev = dev;
		tinfo[i].idx = i;
		tinfo[i].ready = &ready;
		tinfo[i].extra_flags = extra_flags;

		/*
		 * If threads are not tied 1:1 to queues, setting thread
		 * affinity based on queue affinity makes little sense.
		 * However, thread CPU affinity has significant impact
		 * on performance, so to compare fairly, we'll still set
		 * thread CPU affinity based on queue affinity where
		 * possible.
		 */
		if (dev->nthreads == dinfo->nr_hw_queues)
			tinfo[i].affinity = &affinity_buf[i];
		pthread_create(&tinfo[i].thread, NULL,
				ublk_io_handler_fn,
				&tinfo[i]);
	}

	for (i = 0; i < dev->nthreads; i++)
		sem_wait(&ready);
	free(affinity_buf);

	/* everything is fine now, start us */
	if (ctx->recovery)
		ret = ublk_ctrl_end_user_recovery(dev, getpid());
	else {
		ublk_set_parameters(dev);
		ret = ublk_ctrl_start_dev(dev, getpid());
	}
	if (ret < 0) {
		ublk_err("%s: ublk_ctrl_start_dev failed: %d\n", __func__, ret);
		/* stop device so that inflight uring_cmd can be cancelled */
		ublk_ctrl_stop_dev(dev);
		goto fail_start;
	}

	ublk_ctrl_get_info(dev);
	if (ctx->fg)
		ublk_ctrl_dump(dev);
	else
		ublk_send_dev_event(ctx, dev, dev->dev_info.dev_id);
fail_start:
	/* wait until we are terminated */
	for (i = 0; i < dev->nthreads; i++)
		pthread_join(tinfo[i].thread, &thread_ret);
	free(tinfo);
fail:
	for (i = 0; i < dinfo->nr_hw_queues; i++)
		ublk_queue_deinit(&dev->q[i]);
	ublk_dev_unprep(dev);
	ublk_dbg(UBLK_DBG_DEV, "%s exit\n", __func__);

	return ret;
}

static int wait_ublk_dev(const char *path, int evt_mask, unsigned timeout)
{
#define EV_SIZE (sizeof(struct inotify_event))
#define EV_BUF_LEN (128 * (EV_SIZE + 16))
	struct pollfd pfd;
	int fd, wd;
	int ret = -EINVAL;
	const char *dev_name = basename(path);

	fd = inotify_init();
	if (fd < 0) {
		ublk_dbg(UBLK_DBG_DEV, "%s: inotify init failed\n", __func__);
		return fd;
	}

	wd = inotify_add_watch(fd, "/dev", evt_mask);
	if (wd == -1) {
		ublk_dbg(UBLK_DBG_DEV, "%s: add watch for /dev failed\n", __func__);
		goto fail;
	}

	pfd.fd = fd;
	pfd.events = POLLIN;
	while (1) {
		int i = 0;
		char buffer[EV_BUF_LEN];
		ret = poll(&pfd, 1, 1000 * timeout);

		if (ret == -1) {
			ublk_err("%s: poll inotify failed: %d\n", __func__, ret);
			goto rm_watch;
		} else if (ret == 0) {
			ublk_err("%s: poll inotify timeout\n", __func__);
			ret = -ETIMEDOUT;
			goto rm_watch;
		}

		ret = read(fd, buffer, EV_BUF_LEN);
		if (ret < 0) {
			ublk_err("%s: read inotify fd failed\n", __func__);
			goto rm_watch;
		}

		while (i < ret) {
			struct inotify_event *event = (struct inotify_event *)&buffer[i];

			ublk_dbg(UBLK_DBG_DEV, "%s: inotify event %x %s\n",
					__func__, event->mask, event->name);
			if (event->mask & evt_mask) {
				if (!strcmp(event->name, dev_name)) {
					ret = 0;
					goto rm_watch;
				}
			}
			i += EV_SIZE + event->len;
		}
	}
rm_watch:
	inotify_rm_watch(fd, wd);
fail:
	close(fd);
	return ret;
}
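
/*
 * Tearing down a device: if the daemon is still alive, wait (via an inotify
 * watch on /dev) for the ublk char device to be closed, then reap the daemon
 * with waitpid() before the caller deletes the device.
 */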

static int ublk_stop_io_daemon(const struct ublk_dev *dev)
{
	int daemon_pid = dev->dev_info.ublksrv_pid;
	int dev_id = dev->dev_info.dev_id;
	char ublkc[64];
	int ret = 0;

	if (daemon_pid < 0)
		return 0;

	/* daemon may be dead already */
	if (kill(daemon_pid, 0) < 0)
		goto wait;

	snprintf(ublkc, sizeof(ublkc), "/dev/%s%d", "ublkc", dev_id);

	/* ublk char device may be gone already */
	if (access(ublkc, F_OK) != 0)
		goto wait;

	/* Wait until the ublk char device is closed, which happens when the daemon shuts down */
	ret = wait_ublk_dev(ublkc, IN_CLOSE, 10);
	/* double check, since it may have been closed before inotify was started */
	if (ret == -ETIMEDOUT)
		ret = kill(daemon_pid, 0) < 0;
wait:
	waitpid(daemon_pid, NULL, 0);
	ublk_dbg(UBLK_DBG_DEV, "%s: pid %d dev_id %d ret %d\n",
			__func__, daemon_pid, dev_id, ret);

	return ret;
}

static int __cmd_dev_add(const struct dev_ctx *ctx)
{
	unsigned nthreads = ctx->nthreads;
	unsigned nr_queues = ctx->nr_hw_queues;
	const char *tgt_type = ctx->tgt_type;
	unsigned depth = ctx->queue_depth;
	__u64 features;
	const struct ublk_tgt_ops *ops;
	struct ublksrv_ctrl_dev_info *info;
	struct ublk_dev *dev = NULL;
	int dev_id = ctx->dev_id;
	int ret, i;

	ops = ublk_find_tgt(tgt_type);
	if (!ops) {
		ublk_err("%s: no such tgt type, type %s\n",
				__func__, tgt_type);
		ret = -ENODEV;
		goto fail;
	}

	if (nr_queues > UBLK_MAX_QUEUES || depth > UBLK_QUEUE_DEPTH) {
		ublk_err("%s: invalid nr_queues or depth queues %u depth %u\n",
				__func__, nr_queues, depth);
		ret = -EINVAL;
		goto fail;
	}

	/* default to 1:1 threads:queues if nthreads is unspecified */
	if (!nthreads)
		nthreads = nr_queues;

	if (nthreads > UBLK_MAX_THREADS) {
		ublk_err("%s: %u is too many threads (max %u)\n",
				__func__, nthreads, UBLK_MAX_THREADS);
		ret = -EINVAL;
		goto fail;
	}

	if (nthreads != nr_queues && !ctx->per_io_tasks) {
		ublk_err("%s: threads %u must be same as queues %u if "
				"not using per_io_tasks\n",
				__func__, nthreads, nr_queues);
		ret = -EINVAL;
		goto fail;
	}

	dev = ublk_ctrl_init();
	if (!dev) {
		ublk_err("%s: can't alloc dev id %d, type %s\n",
				__func__, dev_id, tgt_type);
		ret = -ENOMEM;
		goto fail;
	}

	/* fail early if the kernel doesn't support GET_FEATURES */
	ret = ublk_ctrl_get_features(dev, &features);
	if (ret < 0) {
		ret = -EINVAL;
		goto fail;
	}

	if (!(features & UBLK_F_CMD_IOCTL_ENCODE)) {
		ret = -ENOTSUP;
		goto fail;
	}

	info = &dev->dev_info;
	info->dev_id = ctx->dev_id;
	info->nr_hw_queues = nr_queues;
	info->queue_depth = depth;
	info->flags = ctx->flags;
	if ((features & UBLK_F_QUIESCE) &&
			(info->flags & UBLK_F_USER_RECOVERY))
		info->flags |= UBLK_F_QUIESCE;
	dev->nthreads = nthreads;
	dev->per_io_tasks = ctx->per_io_tasks;
	dev->tgt.ops = ops;
	dev->tgt.sq_depth = depth;
	dev->tgt.cq_depth = depth;

	for (i = 0; i < MAX_BACK_FILES; i++) {
		if (ctx->files[i]) {
			strcpy(dev->tgt.backing_file[i], ctx->files[i]);
			dev->tgt.nr_backing_files++;
		}
	}

	if (ctx->recovery)
		ret = ublk_ctrl_start_user_recovery(dev);
	else
		ret = ublk_ctrl_add_dev(dev);
	if (ret < 0) {
		ublk_err("%s: can't add dev id %d, type %s ret %d\n",
				__func__, dev_id, tgt_type, ret);
		goto fail;
	}

	ret = ublk_start_daemon(ctx, dev);
	ublk_dbg(UBLK_DBG_DEV, "%s: daemon exit %d\n", __func__, ret);
	if (ret < 0)
		ublk_ctrl_del_dev(dev);

fail:
	if (ret < 0)
		ublk_send_dev_event(ctx, dev, -1);
	if (dev)
		ublk_ctrl_deinit(dev);
	return ret;
}

static int __cmd_dev_list(struct dev_ctx *ctx);
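
/*
 * "add" runs the io daemon as a double-forked, detached child. Parent and
 * child share a small shm segment (shadow_dev) plus an eventfd: the child
 * writes dev_id + 1 on success or ERROR_EVTFD_DEVID on failure, and the
 * parent then prints the device info via __cmd_dev_list(). With
 * --foreground everything happens in the current process instead.
 */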

static int cmd_dev_add(struct dev_ctx *ctx)
{
	int res;

	if (ctx->fg)
		goto run;

	ctx->_shmid = shmget(IPC_PRIVATE, sizeof(struct ublk_dev), IPC_CREAT | 0666);
	if (ctx->_shmid < 0) {
		ublk_err("%s: failed to shmget %s\n", __func__, strerror(errno));
		exit(-1);
	}
	ctx->shadow_dev = (struct ublk_dev *)shmat(ctx->_shmid, NULL, 0);
	if (ctx->shadow_dev == (struct ublk_dev *)-1) {
		ublk_err("%s: failed to shmat %s\n", __func__, strerror(errno));
		exit(-1);
	}
	ctx->_evtfd = eventfd(0, 0);
	if (ctx->_evtfd < 0) {
		ublk_err("%s: failed to create eventfd %s\n", __func__, strerror(errno));
		exit(-1);
	}

	res = fork();
	if (res == 0) {
		int res2;

		setsid();
		res2 = fork();
		if (res2 == 0) {
			/* prepare for detaching */
			close(STDIN_FILENO);
			close(STDOUT_FILENO);
			close(STDERR_FILENO);
run:
			res = __cmd_dev_add(ctx);
			return res;
		} else {
			/* detached from the foreground task */
			exit(EXIT_SUCCESS);
		}
	} else if (res > 0) {
		uint64_t id;
		int exit_code = EXIT_FAILURE;

		res = read(ctx->_evtfd, &id, sizeof(id));
		close(ctx->_evtfd);
		if (res == sizeof(id) && id != ERROR_EVTFD_DEVID) {
			ctx->dev_id = id - 1;
			if (__cmd_dev_list(ctx) >= 0)
				exit_code = EXIT_SUCCESS;
		}
		shmdt(ctx->shadow_dev);
		shmctl(ctx->_shmid, IPC_RMID, NULL);
		/* wait for child and detach from it */
		wait(NULL);
		if (exit_code == EXIT_FAILURE)
			ublk_err("%s: command failed\n", __func__);
		exit(exit_code);
	} else {
		exit(EXIT_FAILURE);
	}
}

static int __cmd_dev_del(struct dev_ctx *ctx)
{
	int number = ctx->dev_id;
	struct ublk_dev *dev;
	int ret;

	dev = ublk_ctrl_init();
	if (!dev)
		return -ENODEV;

	dev->dev_info.dev_id = number;

	ret = ublk_ctrl_get_info(dev);
	if (ret < 0)
		goto fail;

	ret = ublk_ctrl_stop_dev(dev);
	if (ret < 0)
		ublk_err("%s: stop dev %d failed ret %d\n", __func__, number, ret);

	ret = ublk_stop_io_daemon(dev);
	if (ret < 0)
		ublk_err("%s: stop daemon id %d dev %d, ret %d\n",
				__func__, dev->dev_info.ublksrv_pid, number, ret);
	ublk_ctrl_del_dev(dev);
fail:
	ublk_ctrl_deinit(dev);

	return (ret >= 0) ? 0 : ret;
}

static int cmd_dev_del(struct dev_ctx *ctx)
{
	int i;

	if (ctx->dev_id >= 0 || !ctx->all)
		return __cmd_dev_del(ctx);

	for (i = 0; i < 255; i++) {
		ctx->dev_id = i;
		__cmd_dev_del(ctx);
	}
	return 0;
}

static int __cmd_dev_list(struct dev_ctx *ctx)
{
	struct ublk_dev *dev = ublk_ctrl_init();
	int ret;

	if (!dev)
		return -ENODEV;

	dev->dev_info.dev_id = ctx->dev_id;

	ret = ublk_ctrl_get_info(dev);
	if (ret < 0) {
		if (ctx->logging)
			ublk_err("%s: can't get dev info from %d: %d\n",
					__func__, ctx->dev_id, ret);
	} else {
		if (ctx->shadow_dev)
			memcpy(&dev->q, ctx->shadow_dev->q, sizeof(dev->q));

		ublk_ctrl_dump(dev);
	}

	ublk_ctrl_deinit(dev);

	return ret;
}

static int cmd_dev_list(struct dev_ctx *ctx)
{
	int i;

	if (ctx->dev_id >= 0 || !ctx->all)
		return __cmd_dev_list(ctx);

	ctx->logging = false;
	for (i = 0; i < 255; i++) {
		ctx->dev_id = i;
		__cmd_dev_list(ctx);
	}
	return 0;
}
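
/*
 * The feature dump below maps each feature bit to its name: FEAT_NAME()
 * indexes feat_map[] by const_ilog2(flag), i.e. by the bit number, so any
 * bit without an entry simply prints as "unknown".
 */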

static int cmd_dev_get_features(void)
{
#define const_ilog2(x) (63 - __builtin_clzll(x))
#define FEAT_NAME(f) [const_ilog2(f)] = #f
	static const char *feat_map[] = {
		FEAT_NAME(UBLK_F_SUPPORT_ZERO_COPY),
		FEAT_NAME(UBLK_F_URING_CMD_COMP_IN_TASK),
		FEAT_NAME(UBLK_F_NEED_GET_DATA),
		FEAT_NAME(UBLK_F_USER_RECOVERY),
		FEAT_NAME(UBLK_F_USER_RECOVERY_REISSUE),
		FEAT_NAME(UBLK_F_UNPRIVILEGED_DEV),
		FEAT_NAME(UBLK_F_CMD_IOCTL_ENCODE),
		FEAT_NAME(UBLK_F_USER_COPY),
		FEAT_NAME(UBLK_F_ZONED),
		FEAT_NAME(UBLK_F_USER_RECOVERY_FAIL_IO),
		FEAT_NAME(UBLK_F_UPDATE_SIZE),
		FEAT_NAME(UBLK_F_AUTO_BUF_REG),
		FEAT_NAME(UBLK_F_QUIESCE),
		FEAT_NAME(UBLK_F_PER_IO_DAEMON),
		FEAT_NAME(UBLK_F_BUF_REG_OFF_DAEMON),
	};
	struct ublk_dev *dev;
	__u64 features = 0;
	int ret;

	dev = ublk_ctrl_init();
	if (!dev) {
		fprintf(stderr, "ublksrv_ctrl_init failed id\n");
		return -EOPNOTSUPP;
	}

	ret = ublk_ctrl_get_features(dev, &features);
	if (!ret) {
		int i;

		printf("ublk_drv features: 0x%llx\n", features);

		for (i = 0; i < sizeof(features) * 8; i++) {
			const char *feat;

			if (!((1ULL << i) & features))
				continue;
			if (i < ARRAY_SIZE(feat_map))
				feat = feat_map[i];
			else
				feat = "unknown";
			printf("0x%-16llx: %s\n", 1ULL << i, feat);
		}
	}

	ublk_ctrl_deinit(dev);
	return ret;
}

static int cmd_dev_update_size(struct dev_ctx *ctx)
{
	struct ublk_dev *dev = ublk_ctrl_init();
	struct ublk_params p;
	int ret = -EINVAL;

	if (!dev)
		return -ENODEV;

	if (ctx->dev_id < 0) {
		fprintf(stderr, "device id isn't provided\n");
		goto out;
	}

	dev->dev_info.dev_id = ctx->dev_id;
	ret = ublk_ctrl_get_params(dev, &p);
	if (ret < 0) {
		ublk_err("failed to get params %d %s\n", ret, strerror(-ret));
		goto out;
	}

	if (ctx->size & ((1 << p.basic.logical_bs_shift) - 1)) {
		ublk_err("size isn't aligned with logical block size\n");
		ret = -EINVAL;
		goto out;
	}

	ret = ublk_ctrl_update_size(dev, ctx->size >> 9);
out:
	ublk_ctrl_deinit(dev);
	return ret;
}

static int cmd_dev_quiesce(struct dev_ctx *ctx)
{
	struct ublk_dev *dev = ublk_ctrl_init();
	int ret = -EINVAL;

	if (!dev)
		return -ENODEV;

	if (ctx->dev_id < 0) {
		fprintf(stderr, "device id isn't provided for quiesce\n");
		goto out;
	}
	dev->dev_info.dev_id = ctx->dev_id;
	ret = ublk_ctrl_quiesce_dev(dev, 10000);

out:
	ublk_ctrl_deinit(dev);
	return ret;
}

static void __cmd_create_help(char *exe, bool recovery)
{
	int i;

	printf("%s %s -t [null|loop|stripe|fault_inject] [-q nr_queues] [-d depth] [-n dev_id]\n",
			exe, recovery ? "recover" : "add");
	printf("\t[--foreground] [--quiet] [-z] [--auto_zc] [--auto_zc_fallback] [--debug_mask mask] [-r 0|1] [-g] [-u]\n");
	printf("\t[-e 0|1 ] [-i 0|1] [--no_ublk_fixed_fd]\n");
	printf("\t[--nthreads threads] [--per_io_tasks]\n");
	printf("\t[target options] [backfile1] [backfile2] ...\n");
	printf("\tdefault: nr_queues=2(max 32), depth=128(max 1024), dev_id=-1(auto allocation)\n");
	printf("\tdefault: nthreads=nr_queues\n");

	for (i = 0; i < ARRAY_SIZE(tgt_ops_list); i++) {
		const struct ublk_tgt_ops *ops = tgt_ops_list[i];

		if (ops->usage)
			ops->usage(ops);
	}
}

static void cmd_add_help(char *exe)
{
	__cmd_create_help(exe, false);
	printf("\n");
}

static void cmd_recover_help(char *exe)
{
	__cmd_create_help(exe, true);
	printf("\tPlease provide the exact command line used to create this device, with the real dev_id\n");
	printf("\n");
}

static int cmd_dev_help(char *exe)
{
	cmd_add_help(exe);
	cmd_recover_help(exe);

	printf("%s del [-n dev_id] -a\n", exe);
	printf("\t -a delete all devices, -n delete specified device\n\n");
	printf("%s list [-n dev_id] -a\n", exe);
	printf("\t -a list all devices, -n list specified device, default -a\n\n");
	printf("%s features\n", exe);
	printf("%s update_size -n dev_id -s|--size size_in_bytes\n", exe);
	printf("%s quiesce -n dev_id\n", exe);
	return 0;
}
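
/*
 * Typical invocations, assuming the tool is built as "kublk" (as in the
 * kernel selftests):
 *
 *	kublk add -t null -q 2 -d 128
 *	kublk add -t loop /path/to/backing.img
 *	kublk list -a
 *	kublk del -n 0
 *	kublk features
 *
 * See cmd_dev_help() above for the full option list.
 */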

int main(int argc, char *argv[])
{
	static const struct option longopts[] = {
		{ "all", 0, NULL, 'a' },
		{ "type", 1, NULL, 't' },
		{ "number", 1, NULL, 'n' },
		{ "queues", 1, NULL, 'q' },
		{ "depth", 1, NULL, 'd' },
		{ "debug_mask", 1, NULL, 0 },
		{ "quiet", 0, NULL, 0 },
		{ "zero_copy", 0, NULL, 'z' },
		{ "foreground", 0, NULL, 0 },
		{ "recovery", 1, NULL, 'r' },
		{ "recovery_fail_io", 1, NULL, 'e'},
		{ "recovery_reissue", 1, NULL, 'i'},
		{ "get_data", 1, NULL, 'g'},
		{ "auto_zc", 0, NULL, 0 },
		{ "auto_zc_fallback", 0, NULL, 0 },
		{ "user_copy", 0, NULL, 'u'},
		{ "size", 1, NULL, 's'},
		{ "nthreads", 1, NULL, 0 },
		{ "per_io_tasks", 0, NULL, 0 },
		{ "no_ublk_fixed_fd", 0, NULL, 0 },
		{ 0, 0, 0, 0 }
	};
	const struct ublk_tgt_ops *ops = NULL;
	int option_idx, opt;
	const char *cmd = argv[1];
	struct dev_ctx ctx = {
		._evtfd = -1,
		.queue_depth = 128,
		.nr_hw_queues = 2,
		.dev_id = -1,
		.tgt_type = "unknown",
	};
	int ret = -EINVAL, i;
	int tgt_argc = 1;
	char *tgt_argv[MAX_NR_TGT_ARG] = { NULL };
	int value;

	if (argc == 1)
		return ret;

	opterr = 0;
	optind = 2;
	while ((opt = getopt_long(argc, argv, "t:n:d:q:r:e:i:s:gazu",
					longopts, &option_idx)) != -1) {
		switch (opt) {
		case 'a':
			ctx.all = 1;
			break;
		case 'n':
			ctx.dev_id = strtol(optarg, NULL, 10);
			break;
		case 't':
			if (strlen(optarg) < sizeof(ctx.tgt_type))
				strcpy(ctx.tgt_type, optarg);
			break;
		case 'q':
			ctx.nr_hw_queues = strtol(optarg, NULL, 10);
			break;
		case 'd':
			ctx.queue_depth = strtol(optarg, NULL, 10);
			break;
		case 'z':
			ctx.flags |= UBLK_F_SUPPORT_ZERO_COPY;
			break;
		case 'r':
			value = strtol(optarg, NULL, 10);
			if (value)
				ctx.flags |= UBLK_F_USER_RECOVERY;
			break;
		case 'e':
			value = strtol(optarg, NULL, 10);
			if (value)
				ctx.flags |= UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_FAIL_IO;
			break;
		case 'i':
			value = strtol(optarg, NULL, 10);
			if (value)
				ctx.flags |= UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_REISSUE;
			break;
		case 'g':
			ctx.flags |= UBLK_F_NEED_GET_DATA;
			break;
		case 'u':
			ctx.flags |= UBLK_F_USER_COPY;
			break;
		case 's':
			ctx.size = strtoull(optarg, NULL, 10);
			break;
		case 0:
			if (!strcmp(longopts[option_idx].name, "debug_mask"))
				ublk_dbg_mask = strtol(optarg, NULL, 16);
			if (!strcmp(longopts[option_idx].name, "quiet"))
				ublk_dbg_mask = 0;
			if (!strcmp(longopts[option_idx].name, "foreground"))
				ctx.fg = 1;
			if (!strcmp(longopts[option_idx].name, "auto_zc"))
				ctx.flags |= UBLK_F_AUTO_BUF_REG;
			if (!strcmp(longopts[option_idx].name, "auto_zc_fallback"))
				ctx.auto_zc_fallback = 1;
			if (!strcmp(longopts[option_idx].name, "nthreads"))
				ctx.nthreads = strtol(optarg, NULL, 10);
			if (!strcmp(longopts[option_idx].name, "per_io_tasks"))
				ctx.per_io_tasks = 1;
			if (!strcmp(longopts[option_idx].name, "no_ublk_fixed_fd"))
				ctx.no_ublk_fixed_fd = 1;
			break;
		case '?':
			/*
			 * Unknown options are treated as target options, and
			 * every target option must carry an argument.
			 */
			if (!argv[optind] || argv[optind][0] == '-' ||
					argv[optind - 1][0] != '-') {
				fprintf(stderr, "every target option requires argument: %s %s\n",
						argv[optind - 1], argv[optind]);
				exit(EXIT_FAILURE);
			}

			if (tgt_argc < (MAX_NR_TGT_ARG - 1) / 2) {
				tgt_argv[tgt_argc++] = argv[optind - 1];
				tgt_argv[tgt_argc++] = argv[optind];
			} else {
				fprintf(stderr, "too many target options\n");
				exit(EXIT_FAILURE);
			}
			optind += 1;
			break;
		}
	}

	/* auto_zc_fallback depends on F_AUTO_BUF_REG & F_SUPPORT_ZERO_COPY */
	if (ctx.auto_zc_fallback &&
			!((ctx.flags & UBLK_F_AUTO_BUF_REG) &&
				(ctx.flags & UBLK_F_SUPPORT_ZERO_COPY))) {
		ublk_err("%s: auto_zc_fallback is set but F_AUTO_BUF_REG and "
				"F_SUPPORT_ZERO_COPY are not both enabled\n",
				__func__);
		return -EINVAL;
	}

	if (!!(ctx.flags & UBLK_F_NEED_GET_DATA) +
			!!(ctx.flags & UBLK_F_USER_COPY) +
			(ctx.flags & UBLK_F_SUPPORT_ZERO_COPY && !ctx.auto_zc_fallback) +
			(ctx.flags & UBLK_F_AUTO_BUF_REG && !ctx.auto_zc_fallback) +
			ctx.auto_zc_fallback > 1) {
		fprintf(stderr, "too many data copy modes specified\n");
		return -EINVAL;
	}

	i = optind;
	while (i < argc && ctx.nr_files < MAX_BACK_FILES) {
		ctx.files[ctx.nr_files++] = argv[i++];
	}

	ops = ublk_find_tgt(ctx.tgt_type);
	if (ops && ops->parse_cmd_line) {
		optind = 0;

		tgt_argv[0] = ctx.tgt_type;
		ops->parse_cmd_line(&ctx, tgt_argc, tgt_argv);
	}

	if (!strcmp(cmd, "add"))
		ret = cmd_dev_add(&ctx);
	else if (!strcmp(cmd, "recover")) {
		if (ctx.dev_id < 0) {
			fprintf(stderr, "device id isn't provided for recovering\n");
			ret = -EINVAL;
		} else {
			ctx.recovery = 1;
			ret = cmd_dev_add(&ctx);
		}
	} else if (!strcmp(cmd, "del"))
		ret = cmd_dev_del(&ctx);
	else if (!strcmp(cmd, "list")) {
		ctx.all = 1;
		ret = cmd_dev_list(&ctx);
	} else if (!strcmp(cmd, "help"))
		ret = cmd_dev_help(argv[0]);
	else if (!strcmp(cmd, "features"))
		ret = cmd_dev_get_features();
	else if (!strcmp(cmd, "update_size"))
		ret = cmd_dev_update_size(&ctx);
	else if (!strcmp(cmd, "quiesce"))
		ret = cmd_dev_quiesce(&ctx);
	else
		cmd_dev_help(argv[0]);

	return ret;
}