1 /* SPDX-License-Identifier: MIT */ 2 /* 3 * Description: uring_cmd based ublk 4 */ 5 6 #include <linux/fs.h> 7 #include <sys/un.h> 8 #include "kublk.h" 9 10 #define MAX_NR_TGT_ARG 64 11 12 unsigned int ublk_dbg_mask = UBLK_LOG; 13 static const struct ublk_tgt_ops *tgt_ops_list[] = { 14 &null_tgt_ops, 15 &loop_tgt_ops, 16 &stripe_tgt_ops, 17 &fault_inject_tgt_ops, 18 }; 19 20 static const struct ublk_tgt_ops *ublk_find_tgt(const char *name) 21 { 22 int i; 23 24 if (name == NULL) 25 return NULL; 26 27 for (i = 0; i < ARRAY_SIZE(tgt_ops_list); i++) 28 if (strcmp(tgt_ops_list[i]->name, name) == 0) 29 return tgt_ops_list[i]; 30 return NULL; 31 } 32 33 static inline int ublk_setup_ring(struct io_uring *r, int depth, 34 int cq_depth, unsigned flags) 35 { 36 struct io_uring_params p; 37 38 memset(&p, 0, sizeof(p)); 39 p.flags = flags | IORING_SETUP_CQSIZE; 40 p.cq_entries = cq_depth; 41 42 return io_uring_queue_init_params(depth, r, &p); 43 } 44 45 static void ublk_ctrl_init_cmd(struct ublk_dev *dev, 46 struct io_uring_sqe *sqe, 47 struct ublk_ctrl_cmd_data *data) 48 { 49 struct ublksrv_ctrl_dev_info *info = &dev->dev_info; 50 struct ublksrv_ctrl_cmd *cmd = (struct ublksrv_ctrl_cmd *)ublk_get_sqe_cmd(sqe); 51 52 sqe->fd = dev->ctrl_fd; 53 sqe->opcode = IORING_OP_URING_CMD; 54 sqe->ioprio = 0; 55 56 if (data->flags & CTRL_CMD_HAS_BUF) { 57 cmd->addr = data->addr; 58 cmd->len = data->len; 59 } 60 61 if (data->flags & CTRL_CMD_HAS_DATA) 62 cmd->data[0] = data->data[0]; 63 64 cmd->dev_id = info->dev_id; 65 cmd->queue_id = -1; 66 67 ublk_set_sqe_cmd_op(sqe, data->cmd_op); 68 69 io_uring_sqe_set_data(sqe, cmd); 70 } 71 72 static int __ublk_ctrl_cmd(struct ublk_dev *dev, 73 struct ublk_ctrl_cmd_data *data) 74 { 75 struct io_uring_sqe *sqe; 76 struct io_uring_cqe *cqe; 77 int ret = -EINVAL; 78 79 sqe = io_uring_get_sqe(&dev->ring); 80 if (!sqe) { 81 ublk_err("%s: can't get sqe ret %d\n", __func__, ret); 82 return ret; 83 } 84 85 ublk_ctrl_init_cmd(dev, sqe, data); 86 87 ret = io_uring_submit(&dev->ring); 88 if (ret < 0) { 89 ublk_err("uring submit ret %d\n", ret); 90 return ret; 91 } 92 93 ret = io_uring_wait_cqe(&dev->ring, &cqe); 94 if (ret < 0) { 95 ublk_err("wait cqe: %s\n", strerror(-ret)); 96 return ret; 97 } 98 io_uring_cqe_seen(&dev->ring, cqe); 99 100 return cqe->res; 101 } 102 103 static int ublk_ctrl_stop_dev(struct ublk_dev *dev) 104 { 105 struct ublk_ctrl_cmd_data data = { 106 .cmd_op = UBLK_U_CMD_STOP_DEV, 107 }; 108 109 return __ublk_ctrl_cmd(dev, &data); 110 } 111 112 static int ublk_ctrl_try_stop_dev(struct ublk_dev *dev) 113 { 114 struct ublk_ctrl_cmd_data data = { 115 .cmd_op = UBLK_U_CMD_TRY_STOP_DEV, 116 }; 117 118 return __ublk_ctrl_cmd(dev, &data); 119 } 120 121 static int ublk_ctrl_start_dev(struct ublk_dev *dev, 122 int daemon_pid) 123 { 124 struct ublk_ctrl_cmd_data data = { 125 .cmd_op = UBLK_U_CMD_START_DEV, 126 .flags = CTRL_CMD_HAS_DATA, 127 }; 128 129 dev->dev_info.ublksrv_pid = data.data[0] = daemon_pid; 130 131 return __ublk_ctrl_cmd(dev, &data); 132 } 133 134 static int ublk_ctrl_start_user_recovery(struct ublk_dev *dev) 135 { 136 struct ublk_ctrl_cmd_data data = { 137 .cmd_op = UBLK_U_CMD_START_USER_RECOVERY, 138 }; 139 140 return __ublk_ctrl_cmd(dev, &data); 141 } 142 143 static int ublk_ctrl_end_user_recovery(struct ublk_dev *dev, int daemon_pid) 144 { 145 struct ublk_ctrl_cmd_data data = { 146 .cmd_op = UBLK_U_CMD_END_USER_RECOVERY, 147 .flags = CTRL_CMD_HAS_DATA, 148 }; 149 150 dev->dev_info.ublksrv_pid = data.data[0] = daemon_pid; 151 152 return __ublk_ctrl_cmd(dev, &data); 153 } 154 155 static int ublk_ctrl_add_dev(struct ublk_dev *dev) 156 { 157 struct ublk_ctrl_cmd_data data = { 158 .cmd_op = UBLK_U_CMD_ADD_DEV, 159 .flags = CTRL_CMD_HAS_BUF, 160 .addr = (__u64) (uintptr_t) &dev->dev_info, 161 .len = sizeof(struct ublksrv_ctrl_dev_info), 162 }; 163 164 return __ublk_ctrl_cmd(dev, &data); 165 } 166 167 static int ublk_ctrl_del_dev(struct ublk_dev *dev) 168 { 169 struct ublk_ctrl_cmd_data data = { 170 .cmd_op = UBLK_U_CMD_DEL_DEV, 171 .flags = 0, 172 }; 173 174 return __ublk_ctrl_cmd(dev, &data); 175 } 176 177 static int ublk_ctrl_get_info(struct ublk_dev *dev) 178 { 179 struct ublk_ctrl_cmd_data data = { 180 .cmd_op = UBLK_U_CMD_GET_DEV_INFO, 181 .flags = CTRL_CMD_HAS_BUF, 182 .addr = (__u64) (uintptr_t) &dev->dev_info, 183 .len = sizeof(struct ublksrv_ctrl_dev_info), 184 }; 185 186 return __ublk_ctrl_cmd(dev, &data); 187 } 188 189 static int ublk_ctrl_set_params(struct ublk_dev *dev, 190 struct ublk_params *params) 191 { 192 struct ublk_ctrl_cmd_data data = { 193 .cmd_op = UBLK_U_CMD_SET_PARAMS, 194 .flags = CTRL_CMD_HAS_BUF, 195 .addr = (__u64) (uintptr_t) params, 196 .len = sizeof(*params), 197 }; 198 params->len = sizeof(*params); 199 return __ublk_ctrl_cmd(dev, &data); 200 } 201 202 static int ublk_ctrl_get_params(struct ublk_dev *dev, 203 struct ublk_params *params) 204 { 205 struct ublk_ctrl_cmd_data data = { 206 .cmd_op = UBLK_U_CMD_GET_PARAMS, 207 .flags = CTRL_CMD_HAS_BUF, 208 .addr = (__u64)params, 209 .len = sizeof(*params), 210 }; 211 212 params->len = sizeof(*params); 213 214 return __ublk_ctrl_cmd(dev, &data); 215 } 216 217 static int ublk_ctrl_get_features(struct ublk_dev *dev, 218 __u64 *features) 219 { 220 struct ublk_ctrl_cmd_data data = { 221 .cmd_op = UBLK_U_CMD_GET_FEATURES, 222 .flags = CTRL_CMD_HAS_BUF, 223 .addr = (__u64) (uintptr_t) features, 224 .len = sizeof(*features), 225 }; 226 227 return __ublk_ctrl_cmd(dev, &data); 228 } 229 230 static int ublk_ctrl_update_size(struct ublk_dev *dev, 231 __u64 nr_sects) 232 { 233 struct ublk_ctrl_cmd_data data = { 234 .cmd_op = UBLK_U_CMD_UPDATE_SIZE, 235 .flags = CTRL_CMD_HAS_DATA, 236 }; 237 238 data.data[0] = nr_sects; 239 return __ublk_ctrl_cmd(dev, &data); 240 } 241 242 static int ublk_ctrl_quiesce_dev(struct ublk_dev *dev, 243 unsigned int timeout_ms) 244 { 245 struct ublk_ctrl_cmd_data data = { 246 .cmd_op = UBLK_U_CMD_QUIESCE_DEV, 247 .flags = CTRL_CMD_HAS_DATA, 248 }; 249 250 data.data[0] = timeout_ms; 251 return __ublk_ctrl_cmd(dev, &data); 252 } 253 254 static const char *ublk_dev_state_desc(struct ublk_dev *dev) 255 { 256 switch (dev->dev_info.state) { 257 case UBLK_S_DEV_DEAD: 258 return "DEAD"; 259 case UBLK_S_DEV_LIVE: 260 return "LIVE"; 261 case UBLK_S_DEV_QUIESCED: 262 return "QUIESCED"; 263 default: 264 return "UNKNOWN"; 265 }; 266 } 267 268 static void ublk_print_cpu_set(const cpu_set_t *set, char *buf, unsigned len) 269 { 270 unsigned done = 0; 271 int i; 272 273 for (i = 0; i < CPU_SETSIZE; i++) { 274 if (CPU_ISSET(i, set)) 275 done += snprintf(&buf[done], len - done, "%d ", i); 276 } 277 } 278 279 static void ublk_adjust_affinity(cpu_set_t *set) 280 { 281 int j, updated = 0; 282 283 /* 284 * Just keep the 1st CPU now. 285 * 286 * In future, auto affinity selection can be tried. 287 */ 288 for (j = 0; j < CPU_SETSIZE; j++) { 289 if (CPU_ISSET(j, set)) { 290 if (!updated) { 291 updated = 1; 292 continue; 293 } 294 CPU_CLR(j, set); 295 } 296 } 297 } 298 299 /* Caller must free the allocated buffer */ 300 static int ublk_ctrl_get_affinity(struct ublk_dev *ctrl_dev, cpu_set_t **ptr_buf) 301 { 302 struct ublk_ctrl_cmd_data data = { 303 .cmd_op = UBLK_U_CMD_GET_QUEUE_AFFINITY, 304 .flags = CTRL_CMD_HAS_DATA | CTRL_CMD_HAS_BUF, 305 }; 306 cpu_set_t *buf; 307 int i, ret; 308 309 buf = malloc(sizeof(cpu_set_t) * ctrl_dev->dev_info.nr_hw_queues); 310 if (!buf) 311 return -ENOMEM; 312 313 for (i = 0; i < ctrl_dev->dev_info.nr_hw_queues; i++) { 314 data.data[0] = i; 315 data.len = sizeof(cpu_set_t); 316 data.addr = (__u64)&buf[i]; 317 318 ret = __ublk_ctrl_cmd(ctrl_dev, &data); 319 if (ret < 0) { 320 free(buf); 321 return ret; 322 } 323 ublk_adjust_affinity(&buf[i]); 324 } 325 326 *ptr_buf = buf; 327 return 0; 328 } 329 330 static void ublk_ctrl_dump(struct ublk_dev *dev) 331 { 332 struct ublksrv_ctrl_dev_info *info = &dev->dev_info; 333 struct ublk_params p; 334 cpu_set_t *affinity; 335 int ret; 336 337 ret = ublk_ctrl_get_params(dev, &p); 338 if (ret < 0) { 339 ublk_err("failed to get params %d %s\n", ret, strerror(-ret)); 340 return; 341 } 342 343 ret = ublk_ctrl_get_affinity(dev, &affinity); 344 if (ret < 0) { 345 ublk_err("failed to get affinity %m\n"); 346 return; 347 } 348 349 ublk_log("dev id %d: nr_hw_queues %d queue_depth %d block size %d dev_capacity %lld\n", 350 info->dev_id, info->nr_hw_queues, info->queue_depth, 351 1 << p.basic.logical_bs_shift, p.basic.dev_sectors); 352 ublk_log("\tmax rq size %d daemon pid %d flags 0x%llx state %s\n", 353 info->max_io_buf_bytes, info->ublksrv_pid, info->flags, 354 ublk_dev_state_desc(dev)); 355 356 if (affinity) { 357 char buf[512]; 358 int i; 359 360 for (i = 0; i < info->nr_hw_queues; i++) { 361 ublk_print_cpu_set(&affinity[i], buf, sizeof(buf)); 362 printf("\tqueue %u: affinity(%s)\n", 363 i, buf); 364 } 365 free(affinity); 366 } 367 368 fflush(stdout); 369 } 370 371 static void ublk_ctrl_deinit(struct ublk_dev *dev) 372 { 373 close(dev->ctrl_fd); 374 free(dev); 375 } 376 377 static struct ublk_dev *ublk_ctrl_init(void) 378 { 379 struct ublk_dev *dev = (struct ublk_dev *)calloc(1, sizeof(*dev)); 380 struct ublksrv_ctrl_dev_info *info = &dev->dev_info; 381 int ret; 382 383 dev->ctrl_fd = open(CTRL_DEV, O_RDWR); 384 if (dev->ctrl_fd < 0) { 385 free(dev); 386 return NULL; 387 } 388 389 info->max_io_buf_bytes = UBLK_IO_MAX_BYTES; 390 391 ret = ublk_setup_ring(&dev->ring, UBLK_CTRL_RING_DEPTH, 392 UBLK_CTRL_RING_DEPTH, IORING_SETUP_SQE128); 393 if (ret < 0) { 394 ublk_err("queue_init: %s\n", strerror(-ret)); 395 free(dev); 396 return NULL; 397 } 398 dev->nr_fds = 1; 399 400 return dev; 401 } 402 403 static int __ublk_queue_cmd_buf_sz(unsigned depth) 404 { 405 int size = depth * sizeof(struct ublksrv_io_desc); 406 unsigned int page_sz = getpagesize(); 407 408 return round_up(size, page_sz); 409 } 410 411 static int ublk_queue_max_cmd_buf_sz(void) 412 { 413 return __ublk_queue_cmd_buf_sz(UBLK_MAX_QUEUE_DEPTH); 414 } 415 416 static int ublk_queue_cmd_buf_sz(struct ublk_queue *q) 417 { 418 return __ublk_queue_cmd_buf_sz(q->q_depth); 419 } 420 421 static void ublk_queue_deinit(struct ublk_queue *q) 422 { 423 int i; 424 int nr_ios = q->q_depth; 425 426 if (q->io_cmd_buf) 427 munmap(q->io_cmd_buf, ublk_queue_cmd_buf_sz(q)); 428 429 for (i = 0; i < nr_ios; i++) { 430 free(q->ios[i].buf_addr); 431 free(q->ios[i].integrity_buf); 432 } 433 } 434 435 static void ublk_thread_deinit(struct ublk_thread *t) 436 { 437 io_uring_unregister_buffers(&t->ring); 438 439 ublk_batch_free_buf(t); 440 441 io_uring_unregister_ring_fd(&t->ring); 442 443 if (t->ring.ring_fd > 0) { 444 io_uring_unregister_files(&t->ring); 445 close(t->ring.ring_fd); 446 t->ring.ring_fd = -1; 447 } 448 } 449 450 static int ublk_queue_init(struct ublk_queue *q, unsigned long long extra_flags, 451 __u8 metadata_size) 452 { 453 struct ublk_dev *dev = q->dev; 454 int depth = dev->dev_info.queue_depth; 455 int i; 456 int cmd_buf_size, io_buf_size, integrity_size; 457 unsigned long off; 458 459 pthread_spin_init(&q->lock, PTHREAD_PROCESS_PRIVATE); 460 q->tgt_ops = dev->tgt.ops; 461 q->flags = 0; 462 q->q_depth = depth; 463 q->flags = dev->dev_info.flags; 464 q->flags |= extra_flags; 465 q->metadata_size = metadata_size; 466 467 /* Cache fd in queue for fast path access */ 468 q->ublk_fd = dev->fds[0]; 469 470 cmd_buf_size = ublk_queue_cmd_buf_sz(q); 471 off = UBLKSRV_CMD_BUF_OFFSET + q->q_id * ublk_queue_max_cmd_buf_sz(); 472 q->io_cmd_buf = mmap(0, cmd_buf_size, PROT_READ, 473 MAP_SHARED | MAP_POPULATE, dev->fds[0], off); 474 if (q->io_cmd_buf == MAP_FAILED) { 475 ublk_err("ublk dev %d queue %d map io_cmd_buf failed %m\n", 476 q->dev->dev_info.dev_id, q->q_id); 477 goto fail; 478 } 479 480 io_buf_size = dev->dev_info.max_io_buf_bytes; 481 integrity_size = ublk_integrity_len(q, io_buf_size); 482 for (i = 0; i < q->q_depth; i++) { 483 q->ios[i].buf_addr = NULL; 484 q->ios[i].flags = UBLKS_IO_NEED_FETCH_RQ | UBLKS_IO_FREE; 485 q->ios[i].tag = i; 486 487 if (integrity_size) { 488 q->ios[i].integrity_buf = malloc(integrity_size); 489 if (!q->ios[i].integrity_buf) { 490 ublk_err("ublk dev %d queue %d io %d malloc(%d) failed: %m\n", 491 dev->dev_info.dev_id, q->q_id, i, 492 integrity_size); 493 goto fail; 494 } 495 } 496 497 498 if (ublk_queue_no_buf(q)) 499 continue; 500 501 if (posix_memalign((void **)&q->ios[i].buf_addr, 502 getpagesize(), io_buf_size)) { 503 ublk_err("ublk dev %d queue %d io %d posix_memalign failed %m\n", 504 dev->dev_info.dev_id, q->q_id, i); 505 goto fail; 506 } 507 } 508 509 return 0; 510 fail: 511 ublk_queue_deinit(q); 512 ublk_err("ublk dev %d queue %d failed\n", 513 dev->dev_info.dev_id, q->q_id); 514 return -ENOMEM; 515 } 516 517 static int ublk_thread_init(struct ublk_thread *t, unsigned long long extra_flags) 518 { 519 struct ublk_dev *dev = t->dev; 520 unsigned long long flags = dev->dev_info.flags | extra_flags; 521 int ring_depth = dev->tgt.sq_depth, cq_depth = dev->tgt.cq_depth; 522 int ret; 523 524 /* FETCH_IO_CMDS is multishot, so increase cq depth for BATCH_IO */ 525 if (ublk_dev_batch_io(dev)) 526 cq_depth += dev->dev_info.queue_depth * 2; 527 528 ret = ublk_setup_ring(&t->ring, ring_depth, cq_depth, 529 IORING_SETUP_COOP_TASKRUN | 530 IORING_SETUP_SINGLE_ISSUER | 531 IORING_SETUP_DEFER_TASKRUN); 532 if (ret < 0) { 533 ublk_err("ublk dev %d thread %d setup io_uring failed %d\n", 534 dev->dev_info.dev_id, t->idx, ret); 535 goto fail; 536 } 537 538 if (dev->dev_info.flags & (UBLK_F_SUPPORT_ZERO_COPY | UBLK_F_AUTO_BUF_REG)) { 539 unsigned nr_ios = dev->dev_info.queue_depth * dev->dev_info.nr_hw_queues; 540 unsigned max_nr_ios_per_thread = nr_ios / dev->nthreads; 541 max_nr_ios_per_thread += !!(nr_ios % dev->nthreads); 542 543 t->nr_bufs = max_nr_ios_per_thread; 544 } else { 545 t->nr_bufs = 0; 546 } 547 548 if (ublk_dev_batch_io(dev)) 549 ublk_batch_prepare(t); 550 551 if (t->nr_bufs) { 552 ret = io_uring_register_buffers_sparse(&t->ring, t->nr_bufs); 553 if (ret) { 554 ublk_err("ublk dev %d thread %d register spare buffers failed %d\n", 555 dev->dev_info.dev_id, t->idx, ret); 556 goto fail; 557 } 558 } 559 560 if (ublk_dev_batch_io(dev)) { 561 ret = ublk_batch_alloc_buf(t); 562 if (ret) { 563 ublk_err("ublk dev %d thread %d alloc batch buf failed %d\n", 564 dev->dev_info.dev_id, t->idx, ret); 565 goto fail; 566 } 567 } 568 569 io_uring_register_ring_fd(&t->ring); 570 571 if (flags & UBLKS_Q_NO_UBLK_FIXED_FD) { 572 /* Register only backing files starting from index 1, exclude ublk control device */ 573 if (dev->nr_fds > 1) { 574 ret = io_uring_register_files(&t->ring, &dev->fds[1], dev->nr_fds - 1); 575 } else { 576 /* No backing files to register, skip file registration */ 577 ret = 0; 578 } 579 } else { 580 ret = io_uring_register_files(&t->ring, dev->fds, dev->nr_fds); 581 } 582 if (ret) { 583 ublk_err("ublk dev %d thread %d register files failed %d\n", 584 t->dev->dev_info.dev_id, t->idx, ret); 585 goto fail; 586 } 587 588 return 0; 589 fail: 590 ublk_thread_deinit(t); 591 ublk_err("ublk dev %d thread %d init failed\n", 592 dev->dev_info.dev_id, t->idx); 593 return -ENOMEM; 594 } 595 596 #define WAIT_USEC 100000 597 #define MAX_WAIT_USEC (3 * 1000000) 598 static int ublk_dev_prep(const struct dev_ctx *ctx, struct ublk_dev *dev) 599 { 600 int dev_id = dev->dev_info.dev_id; 601 unsigned int wait_usec = 0; 602 int ret = 0, fd = -1; 603 char buf[64]; 604 605 snprintf(buf, 64, "%s%d", UBLKC_DEV, dev_id); 606 607 while (wait_usec < MAX_WAIT_USEC) { 608 fd = open(buf, O_RDWR); 609 if (fd >= 0) 610 break; 611 usleep(WAIT_USEC); 612 wait_usec += WAIT_USEC; 613 } 614 if (fd < 0) { 615 ublk_err("can't open %s %s\n", buf, strerror(errno)); 616 return -1; 617 } 618 619 dev->fds[0] = fd; 620 if (dev->tgt.ops->init_tgt) 621 ret = dev->tgt.ops->init_tgt(ctx, dev); 622 if (ret) 623 close(dev->fds[0]); 624 return ret; 625 } 626 627 static void ublk_dev_unprep(struct ublk_dev *dev) 628 { 629 if (dev->tgt.ops->deinit_tgt) 630 dev->tgt.ops->deinit_tgt(dev); 631 close(dev->fds[0]); 632 } 633 634 static void ublk_set_auto_buf_reg(const struct ublk_thread *t, 635 const struct ublk_queue *q, 636 struct io_uring_sqe *sqe, 637 unsigned short tag) 638 { 639 struct ublk_auto_buf_reg buf = {}; 640 641 if (q->tgt_ops->buf_index) 642 buf.index = q->tgt_ops->buf_index(t, q, tag); 643 else 644 buf.index = ublk_io_buf_idx(t, q, tag); 645 646 if (ublk_queue_auto_zc_fallback(q)) 647 buf.flags = UBLK_AUTO_BUF_REG_FALLBACK; 648 649 sqe->addr = ublk_auto_buf_reg_to_sqe_addr(&buf); 650 } 651 652 /* Copy in pieces to test the buffer offset logic */ 653 #define UBLK_USER_COPY_LEN 2048 654 655 static void ublk_user_copy(const struct ublk_io *io, __u8 match_ublk_op) 656 { 657 const struct ublk_queue *q = ublk_io_to_queue(io); 658 const struct ublksrv_io_desc *iod = ublk_get_iod(q, io->tag); 659 __u64 off = ublk_user_copy_offset(q->q_id, io->tag); 660 __u8 ublk_op = ublksrv_get_op(iod); 661 __u32 len = iod->nr_sectors << 9; 662 void *addr = io->buf_addr; 663 ssize_t copied; 664 665 if (ublk_op != match_ublk_op) 666 return; 667 668 while (len) { 669 __u32 copy_len = min(len, UBLK_USER_COPY_LEN); 670 671 if (ublk_op == UBLK_IO_OP_WRITE) 672 copied = pread(q->ublk_fd, addr, copy_len, off); 673 else if (ublk_op == UBLK_IO_OP_READ) 674 copied = pwrite(q->ublk_fd, addr, copy_len, off); 675 else 676 assert(0); 677 assert(copied == (ssize_t)copy_len); 678 addr += copy_len; 679 off += copy_len; 680 len -= copy_len; 681 } 682 683 if (!(iod->op_flags & UBLK_IO_F_INTEGRITY)) 684 return; 685 686 len = ublk_integrity_len(q, iod->nr_sectors << 9); 687 off = ublk_user_copy_offset(q->q_id, io->tag); 688 off |= UBLKSRV_IO_INTEGRITY_FLAG; 689 if (ublk_op == UBLK_IO_OP_WRITE) 690 copied = pread(q->ublk_fd, io->integrity_buf, len, off); 691 else if (ublk_op == UBLK_IO_OP_READ) 692 copied = pwrite(q->ublk_fd, io->integrity_buf, len, off); 693 else 694 assert(0); 695 assert(copied == (ssize_t)len); 696 } 697 698 int ublk_queue_io_cmd(struct ublk_thread *t, struct ublk_io *io) 699 { 700 struct ublk_queue *q = ublk_io_to_queue(io); 701 struct ublksrv_io_cmd *cmd; 702 struct io_uring_sqe *sqe[1]; 703 unsigned int cmd_op = 0; 704 __u64 user_data; 705 706 /* only freed io can be issued */ 707 if (!(io->flags & UBLKS_IO_FREE)) 708 return 0; 709 710 /* 711 * we issue because we need either fetching or committing or 712 * getting data 713 */ 714 if (!(io->flags & 715 (UBLKS_IO_NEED_FETCH_RQ | UBLKS_IO_NEED_COMMIT_RQ_COMP | UBLKS_IO_NEED_GET_DATA))) 716 return 0; 717 718 if (io->flags & UBLKS_IO_NEED_GET_DATA) 719 cmd_op = UBLK_U_IO_NEED_GET_DATA; 720 else if (io->flags & UBLKS_IO_NEED_COMMIT_RQ_COMP) { 721 if (ublk_queue_use_user_copy(q)) 722 ublk_user_copy(io, UBLK_IO_OP_READ); 723 724 cmd_op = UBLK_U_IO_COMMIT_AND_FETCH_REQ; 725 } else if (io->flags & UBLKS_IO_NEED_FETCH_RQ) 726 cmd_op = UBLK_U_IO_FETCH_REQ; 727 728 if (io_uring_sq_space_left(&t->ring) < 1) 729 io_uring_submit(&t->ring); 730 731 ublk_io_alloc_sqes(t, sqe, 1); 732 if (!sqe[0]) { 733 ublk_err("%s: run out of sqe. thread %u, tag %d\n", 734 __func__, t->idx, io->tag); 735 return -1; 736 } 737 738 cmd = (struct ublksrv_io_cmd *)ublk_get_sqe_cmd(sqe[0]); 739 740 if (cmd_op == UBLK_U_IO_COMMIT_AND_FETCH_REQ) 741 cmd->result = io->result; 742 743 /* These fields should be written once, never change */ 744 ublk_set_sqe_cmd_op(sqe[0], cmd_op); 745 sqe[0]->fd = ublk_get_registered_fd(q, 0); /* dev->fds[0] */ 746 sqe[0]->opcode = IORING_OP_URING_CMD; 747 if (q->flags & UBLKS_Q_NO_UBLK_FIXED_FD) 748 sqe[0]->flags = 0; /* Use raw FD, not fixed file */ 749 else 750 sqe[0]->flags = IOSQE_FIXED_FILE; 751 sqe[0]->rw_flags = 0; 752 cmd->tag = io->tag; 753 cmd->q_id = q->q_id; 754 if (!ublk_queue_no_buf(q) && !ublk_queue_use_user_copy(q)) 755 cmd->addr = (__u64) (uintptr_t) io->buf_addr; 756 else 757 cmd->addr = 0; 758 759 if (ublk_queue_use_auto_zc(q)) 760 ublk_set_auto_buf_reg(t, q, sqe[0], io->tag); 761 762 user_data = build_user_data(io->tag, _IOC_NR(cmd_op), 0, q->q_id, 0); 763 io_uring_sqe_set_data64(sqe[0], user_data); 764 765 io->flags = 0; 766 767 t->cmd_inflight += 1; 768 769 ublk_dbg(UBLK_DBG_IO_CMD, "%s: (thread %u qid %d tag %u cmd_op %u) iof %x stopping %d\n", 770 __func__, t->idx, q->q_id, io->tag, cmd_op, 771 io->flags, !!(t->state & UBLKS_T_STOPPING)); 772 return 1; 773 } 774 775 static void ublk_submit_fetch_commands(struct ublk_thread *t) 776 { 777 struct ublk_queue *q; 778 struct ublk_io *io; 779 int i = 0, j = 0; 780 781 if (t->dev->per_io_tasks) { 782 /* 783 * Lexicographically order all the (qid,tag) pairs, with 784 * qid taking priority (so (1,0) > (0,1)). Then make 785 * this thread the daemon for every Nth entry in this 786 * list (N is the number of threads), starting at this 787 * thread's index. This ensures that each queue is 788 * handled by as many ublk server threads as possible, 789 * so that load that is concentrated on one or a few 790 * queues can make use of all ublk server threads. 791 */ 792 const struct ublksrv_ctrl_dev_info *dinfo = &t->dev->dev_info; 793 int nr_ios = dinfo->nr_hw_queues * dinfo->queue_depth; 794 for (i = t->idx; i < nr_ios; i += t->dev->nthreads) { 795 int q_id = i / dinfo->queue_depth; 796 int tag = i % dinfo->queue_depth; 797 q = &t->dev->q[q_id]; 798 io = &q->ios[tag]; 799 io->buf_index = j++; 800 if (q->tgt_ops->pre_fetch_io) 801 q->tgt_ops->pre_fetch_io(t, q, tag, false); 802 ublk_queue_io_cmd(t, io); 803 } 804 } else { 805 /* 806 * Service exclusively the queue whose q_id matches our 807 * thread index. 808 */ 809 struct ublk_queue *q = &t->dev->q[t->idx]; 810 for (i = 0; i < q->q_depth; i++) { 811 io = &q->ios[i]; 812 io->buf_index = i; 813 if (q->tgt_ops->pre_fetch_io) 814 q->tgt_ops->pre_fetch_io(t, q, i, false); 815 ublk_queue_io_cmd(t, io); 816 } 817 } 818 } 819 820 static int ublk_thread_is_idle(struct ublk_thread *t) 821 { 822 return !io_uring_sq_ready(&t->ring) && !t->io_inflight; 823 } 824 825 static int ublk_thread_is_done(struct ublk_thread *t) 826 { 827 return (t->state & UBLKS_T_STOPPING) && ublk_thread_is_idle(t) && !t->cmd_inflight; 828 } 829 830 static inline void ublksrv_handle_tgt_cqe(struct ublk_thread *t, 831 struct ublk_queue *q, 832 struct io_uring_cqe *cqe) 833 { 834 if (cqe->res < 0 && cqe->res != -EAGAIN) 835 ublk_err("%s: failed tgt io: res %d qid %u tag %u, cmd_op %u\n", 836 __func__, cqe->res, q->q_id, 837 user_data_to_tag(cqe->user_data), 838 user_data_to_op(cqe->user_data)); 839 840 if (q->tgt_ops->tgt_io_done) 841 q->tgt_ops->tgt_io_done(t, q, cqe); 842 } 843 844 static void ublk_handle_uring_cmd(struct ublk_thread *t, 845 struct ublk_queue *q, 846 const struct io_uring_cqe *cqe) 847 { 848 int fetch = (cqe->res != UBLK_IO_RES_ABORT) && 849 !(t->state & UBLKS_T_STOPPING); 850 unsigned tag = user_data_to_tag(cqe->user_data); 851 struct ublk_io *io = &q->ios[tag]; 852 853 t->cmd_inflight--; 854 855 if (!fetch) { 856 t->state |= UBLKS_T_STOPPING; 857 io->flags &= ~UBLKS_IO_NEED_FETCH_RQ; 858 } 859 860 if (cqe->res == UBLK_IO_RES_OK) { 861 ublk_assert(tag < q->q_depth); 862 863 if (ublk_queue_use_user_copy(q)) 864 ublk_user_copy(io, UBLK_IO_OP_WRITE); 865 866 if (q->tgt_ops->queue_io) 867 q->tgt_ops->queue_io(t, q, tag); 868 } else if (cqe->res == UBLK_IO_RES_NEED_GET_DATA) { 869 io->flags |= UBLKS_IO_NEED_GET_DATA | UBLKS_IO_FREE; 870 ublk_queue_io_cmd(t, io); 871 } else { 872 /* 873 * COMMIT_REQ will be completed immediately since no fetching 874 * piggyback is required. 875 * 876 * Marking IO_FREE only, then this io won't be issued since 877 * we only issue io with (UBLKS_IO_FREE | UBLKSRV_NEED_*) 878 * 879 * */ 880 io->flags = UBLKS_IO_FREE; 881 } 882 } 883 884 static void ublk_handle_cqe(struct ublk_thread *t, 885 struct io_uring_cqe *cqe, void *data) 886 { 887 struct ublk_dev *dev = t->dev; 888 unsigned q_id = user_data_to_q_id(cqe->user_data); 889 unsigned cmd_op = user_data_to_op(cqe->user_data); 890 891 if (cqe->res < 0 && cqe->res != -ENODEV && cqe->res != -ENOBUFS) 892 ublk_err("%s: res %d userdata %llx thread state %x\n", __func__, 893 cqe->res, cqe->user_data, t->state); 894 895 ublk_dbg(UBLK_DBG_IO_CMD, "%s: res %d (thread %d qid %d tag %u cmd_op %x " 896 "data %lx target %d/%d) stopping %d\n", 897 __func__, cqe->res, t->idx, q_id, 898 user_data_to_tag(cqe->user_data), 899 cmd_op, cqe->user_data, is_target_io(cqe->user_data), 900 user_data_to_tgt_data(cqe->user_data), 901 (t->state & UBLKS_T_STOPPING)); 902 903 /* Don't retrieve io in case of target io */ 904 if (is_target_io(cqe->user_data)) { 905 ublksrv_handle_tgt_cqe(t, &dev->q[q_id], cqe); 906 return; 907 } 908 909 if (ublk_thread_batch_io(t)) 910 ublk_batch_compl_cmd(t, cqe); 911 else 912 ublk_handle_uring_cmd(t, &dev->q[q_id], cqe); 913 } 914 915 static int ublk_reap_events_uring(struct ublk_thread *t) 916 { 917 struct io_uring_cqe *cqe; 918 unsigned head; 919 int count = 0; 920 921 io_uring_for_each_cqe(&t->ring, head, cqe) { 922 ublk_handle_cqe(t, cqe, NULL); 923 count += 1; 924 } 925 io_uring_cq_advance(&t->ring, count); 926 927 return count; 928 } 929 930 static int ublk_process_io(struct ublk_thread *t) 931 { 932 int ret, reapped; 933 934 ublk_dbg(UBLK_DBG_THREAD, "dev%d-t%u: to_submit %d inflight cmd %u stopping %d\n", 935 t->dev->dev_info.dev_id, 936 t->idx, io_uring_sq_ready(&t->ring), 937 t->cmd_inflight, 938 (t->state & UBLKS_T_STOPPING)); 939 940 if (ublk_thread_is_done(t)) 941 return -ENODEV; 942 943 ret = io_uring_submit_and_wait(&t->ring, 1); 944 if (ublk_thread_batch_io(t)) { 945 ublk_batch_prep_commit(t); 946 reapped = ublk_reap_events_uring(t); 947 ublk_batch_commit_io_cmds(t); 948 } else { 949 reapped = ublk_reap_events_uring(t); 950 } 951 952 ublk_dbg(UBLK_DBG_THREAD, "submit result %d, reapped %d stop %d idle %d\n", 953 ret, reapped, (t->state & UBLKS_T_STOPPING), 954 (t->state & UBLKS_T_IDLE)); 955 956 return reapped; 957 } 958 959 struct ublk_thread_info { 960 struct ublk_dev *dev; 961 pthread_t thread; 962 unsigned idx; 963 sem_t *ready; 964 cpu_set_t *affinity; 965 unsigned long long extra_flags; 966 unsigned char (*q_thread_map)[UBLK_MAX_QUEUES]; 967 }; 968 969 static void ublk_thread_set_sched_affinity(const struct ublk_thread_info *info) 970 { 971 if (pthread_setaffinity_np(pthread_self(), sizeof(*info->affinity), info->affinity) < 0) 972 ublk_err("ublk dev %u thread %u set affinity failed", 973 info->dev->dev_info.dev_id, info->idx); 974 } 975 976 static void ublk_batch_setup_queues(struct ublk_thread *t) 977 { 978 int i; 979 980 for (i = 0; i < t->dev->dev_info.nr_hw_queues; i++) { 981 struct ublk_queue *q = &t->dev->q[i]; 982 int ret; 983 984 /* 985 * Only prepare io commands in the mapped thread context, 986 * otherwise io command buffer index may not work as expected 987 */ 988 if (t->q_map[i] == 0) 989 continue; 990 991 if (q->tgt_ops->pre_fetch_io) 992 q->tgt_ops->pre_fetch_io(t, q, 0, true); 993 994 ret = ublk_batch_queue_prep_io_cmds(t, q); 995 ublk_assert(ret >= 0); 996 } 997 } 998 999 static __attribute__((noinline)) int __ublk_io_handler_fn(struct ublk_thread_info *info) 1000 { 1001 struct ublk_thread t = { 1002 .dev = info->dev, 1003 .idx = info->idx, 1004 }; 1005 int dev_id = info->dev->dev_info.dev_id; 1006 int ret; 1007 1008 /* Copy per-thread queue mapping into thread-local variable */ 1009 if (info->q_thread_map) 1010 memcpy(t.q_map, info->q_thread_map[info->idx], sizeof(t.q_map)); 1011 1012 ret = ublk_thread_init(&t, info->extra_flags); 1013 if (ret) { 1014 ublk_err("ublk dev %d thread %u init failed\n", 1015 dev_id, t.idx); 1016 return ret; 1017 } 1018 sem_post(info->ready); 1019 1020 ublk_dbg(UBLK_DBG_THREAD, "tid %d: ublk dev %d thread %u started\n", 1021 gettid(), dev_id, t.idx); 1022 1023 if (!ublk_thread_batch_io(&t)) { 1024 /* submit all io commands to ublk driver */ 1025 ublk_submit_fetch_commands(&t); 1026 } else { 1027 ublk_batch_setup_queues(&t); 1028 ublk_batch_start_fetch(&t); 1029 } 1030 1031 do { 1032 if (ublk_process_io(&t) < 0) 1033 break; 1034 } while (1); 1035 1036 ublk_dbg(UBLK_DBG_THREAD, "tid %d: ublk dev %d thread %d exiting\n", 1037 gettid(), dev_id, t.idx); 1038 ublk_thread_deinit(&t); 1039 return 0; 1040 } 1041 1042 static void *ublk_io_handler_fn(void *data) 1043 { 1044 struct ublk_thread_info *info = data; 1045 1046 /* 1047 * IO perf is sensitive with queue pthread affinity on NUMA machine 1048 * 1049 * Set sched_affinity at beginning, so following allocated memory/pages 1050 * could be CPU/NUMA aware. 1051 */ 1052 if (info->affinity) 1053 ublk_thread_set_sched_affinity(info); 1054 1055 __ublk_io_handler_fn(info); 1056 1057 return NULL; 1058 } 1059 1060 static void ublk_set_parameters(struct ublk_dev *dev) 1061 { 1062 int ret; 1063 1064 ret = ublk_ctrl_set_params(dev, &dev->tgt.params); 1065 if (ret) 1066 ublk_err("dev %d set basic parameter failed %d\n", 1067 dev->dev_info.dev_id, ret); 1068 } 1069 1070 static int ublk_send_dev_event(const struct dev_ctx *ctx, struct ublk_dev *dev, int dev_id) 1071 { 1072 uint64_t id; 1073 int evtfd = ctx->_evtfd; 1074 1075 if (evtfd < 0) 1076 return -EBADF; 1077 1078 if (dev_id >= 0) 1079 id = dev_id + 1; 1080 else 1081 id = ERROR_EVTFD_DEVID; 1082 1083 if (dev && ctx->shadow_dev) 1084 memcpy(&ctx->shadow_dev->q, &dev->q, sizeof(dev->q)); 1085 1086 if (write(evtfd, &id, sizeof(id)) != sizeof(id)) 1087 return -EINVAL; 1088 1089 close(evtfd); 1090 shmdt(ctx->shadow_dev); 1091 1092 return 0; 1093 } 1094 1095 1096 /* 1097 * Shared memory registration socket listener. 1098 * 1099 * The parent daemon context listens on a per-device unix socket at 1100 * /run/ublk/ublkb<dev_id>.sock for shared memory registration requests 1101 * from clients. Clients send a memfd via SCM_RIGHTS; the server 1102 * registers it with the kernel, mmaps it, and returns the assigned index. 1103 */ 1104 #define UBLK_SHMEM_SOCK_DIR "/run/ublk" 1105 1106 /* defined in kublk.h, shared with file_backed.c (loop target) */ 1107 struct ublk_shmem_entry shmem_table[UBLK_BUF_MAX]; 1108 int shmem_count; 1109 1110 static void ublk_shmem_sock_path(int dev_id, char *buf, size_t len) 1111 { 1112 snprintf(buf, len, "%s/ublkb%d.sock", UBLK_SHMEM_SOCK_DIR, dev_id); 1113 } 1114 1115 static int ublk_shmem_sock_create(int dev_id) 1116 { 1117 struct sockaddr_un addr = { .sun_family = AF_UNIX }; 1118 char path[108]; 1119 int fd; 1120 1121 mkdir(UBLK_SHMEM_SOCK_DIR, 0755); 1122 ublk_shmem_sock_path(dev_id, path, sizeof(path)); 1123 unlink(path); 1124 1125 fd = socket(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK, 0); 1126 if (fd < 0) 1127 return -1; 1128 1129 snprintf(addr.sun_path, sizeof(addr.sun_path), "%s", path); 1130 if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) { 1131 close(fd); 1132 return -1; 1133 } 1134 1135 listen(fd, 4); 1136 ublk_dbg(UBLK_DBG_DEV, "shmem socket created: %s\n", path); 1137 return fd; 1138 } 1139 1140 static void ublk_shmem_sock_destroy(int dev_id, int sock_fd) 1141 { 1142 char path[108]; 1143 1144 if (sock_fd >= 0) 1145 close(sock_fd); 1146 ublk_shmem_sock_path(dev_id, path, sizeof(path)); 1147 unlink(path); 1148 } 1149 1150 /* Receive a memfd from a client via SCM_RIGHTS */ 1151 static int ublk_shmem_recv_fd(int client_fd) 1152 { 1153 char buf[1]; 1154 struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) }; 1155 union { 1156 char cmsg_buf[CMSG_SPACE(sizeof(int))]; 1157 struct cmsghdr align; 1158 } u; 1159 struct msghdr msg = { 1160 .msg_iov = &iov, 1161 .msg_iovlen = 1, 1162 .msg_control = u.cmsg_buf, 1163 .msg_controllen = sizeof(u.cmsg_buf), 1164 }; 1165 struct cmsghdr *cmsg; 1166 1167 if (recvmsg(client_fd, &msg, 0) <= 0) 1168 return -1; 1169 1170 cmsg = CMSG_FIRSTHDR(&msg); 1171 if (!cmsg || cmsg->cmsg_level != SOL_SOCKET || 1172 cmsg->cmsg_type != SCM_RIGHTS) 1173 return -1; 1174 1175 return *(int *)CMSG_DATA(cmsg); 1176 } 1177 1178 /* Register a shared memory buffer: store fd, mmap it, return index */ 1179 static int ublk_shmem_register(int shmem_fd) 1180 { 1181 off_t size; 1182 void *base; 1183 int idx; 1184 1185 if (shmem_count >= UBLK_BUF_MAX) 1186 return -1; 1187 1188 size = lseek(shmem_fd, 0, SEEK_END); 1189 if (size <= 0) 1190 return -1; 1191 1192 base = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, 1193 shmem_fd, 0); 1194 if (base == MAP_FAILED) 1195 return -1; 1196 1197 idx = shmem_count++; 1198 shmem_table[idx].fd = shmem_fd; 1199 shmem_table[idx].mmap_base = base; 1200 shmem_table[idx].size = size; 1201 1202 ublk_dbg(UBLK_DBG_DEV, "shmem registered: index=%d fd=%d size=%zu\n", 1203 idx, shmem_fd, (size_t)size); 1204 return idx; 1205 } 1206 1207 static void ublk_shmem_unregister_all(void) 1208 { 1209 int i; 1210 1211 for (i = 0; i < shmem_count; i++) { 1212 if (shmem_table[i].mmap_base) { 1213 munmap(shmem_table[i].mmap_base, 1214 shmem_table[i].size); 1215 close(shmem_table[i].fd); 1216 shmem_table[i].mmap_base = NULL; 1217 } 1218 } 1219 shmem_count = 0; 1220 } 1221 1222 static int ublk_ctrl_reg_buf(struct ublk_dev *dev, void *addr, size_t size, 1223 __u32 flags) 1224 { 1225 struct ublk_shmem_buf_reg buf_reg = { 1226 .addr = (unsigned long)addr, 1227 .len = size, 1228 .flags = flags, 1229 }; 1230 struct ublk_ctrl_cmd_data data = { 1231 .cmd_op = UBLK_U_CMD_REG_BUF, 1232 .flags = CTRL_CMD_HAS_BUF, 1233 .addr = (unsigned long)&buf_reg, 1234 .len = sizeof(buf_reg), 1235 }; 1236 1237 return __ublk_ctrl_cmd(dev, &data); 1238 } 1239 1240 /* 1241 * Handle one client connection: receive memfd, mmap it, register 1242 * the VA range with kernel, send back the assigned index. 1243 */ 1244 static void ublk_shmem_handle_client(int sock_fd, struct ublk_dev *dev) 1245 { 1246 int client_fd, memfd, idx, ret; 1247 int32_t reply; 1248 off_t size; 1249 void *base; 1250 1251 client_fd = accept(sock_fd, NULL, NULL); 1252 if (client_fd < 0) 1253 return; 1254 1255 memfd = ublk_shmem_recv_fd(client_fd); 1256 if (memfd < 0) { 1257 reply = -1; 1258 goto out; 1259 } 1260 1261 /* mmap the memfd in server address space */ 1262 size = lseek(memfd, 0, SEEK_END); 1263 if (size <= 0) { 1264 reply = -1; 1265 close(memfd); 1266 goto out; 1267 } 1268 base = mmap(NULL, size, PROT_READ | PROT_WRITE, 1269 MAP_SHARED | MAP_POPULATE, memfd, 0); 1270 if (base == MAP_FAILED) { 1271 reply = -1; 1272 close(memfd); 1273 goto out; 1274 } 1275 1276 /* Register server's VA range with kernel for PFN matching */ 1277 ret = ublk_ctrl_reg_buf(dev, base, size, 0); 1278 if (ret < 0) { 1279 ublk_dbg(UBLK_DBG_DEV, 1280 "shmem_zc: kernel reg failed %d\n", ret); 1281 munmap(base, size); 1282 close(memfd); 1283 reply = ret; 1284 goto out; 1285 } 1286 1287 /* Store in table for I/O handling */ 1288 idx = ublk_shmem_register(memfd); 1289 if (idx >= 0) { 1290 shmem_table[idx].mmap_base = base; 1291 shmem_table[idx].size = size; 1292 } 1293 reply = idx; 1294 out: 1295 send(client_fd, &reply, sizeof(reply), 0); 1296 close(client_fd); 1297 } 1298 1299 struct shmem_listener_info { 1300 int dev_id; 1301 int stop_efd; /* eventfd to signal listener to stop */ 1302 int sock_fd; /* listener socket fd (output) */ 1303 struct ublk_dev *dev; 1304 }; 1305 1306 /* 1307 * Socket listener thread: runs in the parent daemon context alongside 1308 * the I/O threads. Accepts shared memory registration requests from 1309 * clients via SCM_RIGHTS. Exits when stop_efd is signaled. 1310 */ 1311 static void *ublk_shmem_listener_fn(void *data) 1312 { 1313 struct shmem_listener_info *info = data; 1314 struct pollfd pfds[2]; 1315 1316 info->sock_fd = ublk_shmem_sock_create(info->dev_id); 1317 if (info->sock_fd < 0) 1318 return NULL; 1319 1320 pfds[0].fd = info->sock_fd; 1321 pfds[0].events = POLLIN; 1322 pfds[1].fd = info->stop_efd; 1323 pfds[1].events = POLLIN; 1324 1325 while (1) { 1326 int ret = poll(pfds, 2, -1); 1327 1328 if (ret < 0) 1329 break; 1330 1331 /* Stop signal from parent */ 1332 if (pfds[1].revents & POLLIN) 1333 break; 1334 1335 /* Client connection */ 1336 if (pfds[0].revents & POLLIN) 1337 ublk_shmem_handle_client(info->sock_fd, info->dev); 1338 } 1339 1340 return NULL; 1341 } 1342 1343 static int ublk_shmem_htlb_setup(const struct dev_ctx *ctx, 1344 struct ublk_dev *dev) 1345 { 1346 int fd, idx, ret; 1347 struct stat st; 1348 void *base; 1349 1350 fd = open(ctx->htlb_path, O_RDWR); 1351 if (fd < 0) { 1352 ublk_err("htlb: can't open %s\n", ctx->htlb_path); 1353 return -errno; 1354 } 1355 1356 if (fstat(fd, &st) < 0 || st.st_size <= 0) { 1357 ublk_err("htlb: invalid file size\n"); 1358 close(fd); 1359 return -EINVAL; 1360 } 1361 1362 base = mmap(NULL, st.st_size, 1363 ctx->rdonly_shmem_buf ? PROT_READ : PROT_READ | PROT_WRITE, 1364 MAP_SHARED | MAP_POPULATE, fd, 0); 1365 if (base == MAP_FAILED) { 1366 ublk_err("htlb: mmap failed\n"); 1367 close(fd); 1368 return -ENOMEM; 1369 } 1370 1371 ret = ublk_ctrl_reg_buf(dev, base, st.st_size, 1372 ctx->rdonly_shmem_buf ? UBLK_SHMEM_BUF_READ_ONLY : 0); 1373 if (ret < 0) { 1374 ublk_err("htlb: reg_buf failed: %d\n", ret); 1375 munmap(base, st.st_size); 1376 close(fd); 1377 return ret; 1378 } 1379 1380 if (shmem_count >= UBLK_BUF_MAX) { 1381 munmap(base, st.st_size); 1382 close(fd); 1383 return -ENOMEM; 1384 } 1385 1386 idx = shmem_count++; 1387 shmem_table[idx].fd = fd; 1388 shmem_table[idx].mmap_base = base; 1389 shmem_table[idx].size = st.st_size; 1390 1391 ublk_dbg(UBLK_DBG_DEV, "htlb registered: index=%d size=%zu\n", 1392 idx, (size_t)st.st_size); 1393 return 0; 1394 } 1395 1396 static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev) 1397 { 1398 const struct ublksrv_ctrl_dev_info *dinfo = &dev->dev_info; 1399 struct shmem_listener_info linfo = {}; 1400 struct ublk_thread_info *tinfo; 1401 unsigned long long extra_flags = 0; 1402 cpu_set_t *affinity_buf; 1403 unsigned char (*q_thread_map)[UBLK_MAX_QUEUES] = NULL; 1404 uint64_t stop_val = 1; 1405 pthread_t listener; 1406 void *thread_ret; 1407 sem_t ready; 1408 int ret, i; 1409 1410 ublk_dbg(UBLK_DBG_DEV, "%s enter\n", __func__); 1411 1412 tinfo = calloc(sizeof(struct ublk_thread_info), dev->nthreads); 1413 if (!tinfo) 1414 return -ENOMEM; 1415 1416 sem_init(&ready, 0, 0); 1417 ret = ublk_dev_prep(ctx, dev); 1418 if (ret) 1419 return ret; 1420 1421 ret = ublk_ctrl_get_affinity(dev, &affinity_buf); 1422 if (ret) 1423 return ret; 1424 1425 if (ublk_dev_batch_io(dev)) { 1426 q_thread_map = calloc(dev->nthreads, sizeof(*q_thread_map)); 1427 if (!q_thread_map) { 1428 ret = -ENOMEM; 1429 goto fail; 1430 } 1431 ublk_batch_setup_map(q_thread_map, dev->nthreads, 1432 dinfo->nr_hw_queues); 1433 } 1434 1435 if (ctx->auto_zc_fallback) 1436 extra_flags = UBLKS_Q_AUTO_BUF_REG_FALLBACK; 1437 if (ctx->no_ublk_fixed_fd) 1438 extra_flags |= UBLKS_Q_NO_UBLK_FIXED_FD; 1439 1440 for (i = 0; i < dinfo->nr_hw_queues; i++) { 1441 dev->q[i].dev = dev; 1442 dev->q[i].q_id = i; 1443 1444 ret = ublk_queue_init(&dev->q[i], extra_flags, 1445 ctx->metadata_size); 1446 if (ret) { 1447 ublk_err("ublk dev %d queue %d init queue failed\n", 1448 dinfo->dev_id, i); 1449 goto fail; 1450 } 1451 } 1452 1453 for (i = 0; i < dev->nthreads; i++) { 1454 tinfo[i].dev = dev; 1455 tinfo[i].idx = i; 1456 tinfo[i].ready = &ready; 1457 tinfo[i].extra_flags = extra_flags; 1458 tinfo[i].q_thread_map = q_thread_map; 1459 1460 /* 1461 * If threads are not tied 1:1 to queues, setting thread 1462 * affinity based on queue affinity makes little sense. 1463 * However, thread CPU affinity has significant impact 1464 * on performance, so to compare fairly, we'll still set 1465 * thread CPU affinity based on queue affinity where 1466 * possible. 1467 */ 1468 if (dev->nthreads == dinfo->nr_hw_queues) 1469 tinfo[i].affinity = &affinity_buf[i]; 1470 pthread_create(&tinfo[i].thread, NULL, 1471 ublk_io_handler_fn, 1472 &tinfo[i]); 1473 } 1474 1475 for (i = 0; i < dev->nthreads; i++) 1476 sem_wait(&ready); 1477 free(affinity_buf); 1478 free(q_thread_map); 1479 1480 /* everything is fine now, start us */ 1481 if (ctx->recovery) 1482 ret = ublk_ctrl_end_user_recovery(dev, getpid()); 1483 else { 1484 ublk_set_parameters(dev); 1485 ret = ublk_ctrl_start_dev(dev, getpid()); 1486 } 1487 if (ret < 0) { 1488 ublk_err("%s: ublk_ctrl_start_dev failed: %d\n", __func__, ret); 1489 /* stop device so that inflight uring_cmd can be cancelled */ 1490 ublk_ctrl_stop_dev(dev); 1491 goto fail_start; 1492 } 1493 1494 if (ctx->htlb_path) { 1495 ret = ublk_shmem_htlb_setup(ctx, dev); 1496 if (ret < 0) { 1497 ublk_err("htlb setup failed: %d\n", ret); 1498 ublk_ctrl_stop_dev(dev); 1499 goto fail_start; 1500 } 1501 } 1502 1503 ublk_ctrl_get_info(dev); 1504 if (ctx->fg) 1505 ublk_ctrl_dump(dev); 1506 else 1507 ublk_send_dev_event(ctx, dev, dev->dev_info.dev_id); 1508 fail_start: 1509 /* 1510 * Wait for I/O threads to exit. While waiting, a listener 1511 * thread accepts shared memory registration requests from 1512 * clients via a per-device unix socket (SCM_RIGHTS fd passing). 1513 */ 1514 linfo.dev_id = dinfo->dev_id; 1515 linfo.dev = dev; 1516 linfo.stop_efd = eventfd(0, 0); 1517 if (linfo.stop_efd >= 0) 1518 pthread_create(&listener, NULL, 1519 ublk_shmem_listener_fn, &linfo); 1520 1521 for (i = 0; i < (int)dev->nthreads; i++) 1522 pthread_join(tinfo[i].thread, &thread_ret); 1523 1524 /* Signal listener thread to stop and wait for it */ 1525 if (linfo.stop_efd >= 0) { 1526 write(linfo.stop_efd, &stop_val, sizeof(stop_val)); 1527 pthread_join(listener, NULL); 1528 close(linfo.stop_efd); 1529 ublk_shmem_sock_destroy(dinfo->dev_id, linfo.sock_fd); 1530 } 1531 ublk_shmem_unregister_all(); 1532 free(tinfo); 1533 fail: 1534 for (i = 0; i < dinfo->nr_hw_queues; i++) 1535 ublk_queue_deinit(&dev->q[i]); 1536 ublk_dev_unprep(dev); 1537 ublk_dbg(UBLK_DBG_DEV, "%s exit\n", __func__); 1538 1539 return ret; 1540 } 1541 1542 static int wait_ublk_dev(const char *path, int evt_mask, unsigned timeout) 1543 { 1544 #define EV_SIZE (sizeof(struct inotify_event)) 1545 #define EV_BUF_LEN (128 * (EV_SIZE + 16)) 1546 struct pollfd pfd; 1547 int fd, wd; 1548 int ret = -EINVAL; 1549 const char *dev_name = basename(path); 1550 1551 fd = inotify_init(); 1552 if (fd < 0) { 1553 ublk_dbg(UBLK_DBG_DEV, "%s: inotify init failed\n", __func__); 1554 return fd; 1555 } 1556 1557 wd = inotify_add_watch(fd, "/dev", evt_mask); 1558 if (wd == -1) { 1559 ublk_dbg(UBLK_DBG_DEV, "%s: add watch for /dev failed\n", __func__); 1560 goto fail; 1561 } 1562 1563 pfd.fd = fd; 1564 pfd.events = POLL_IN; 1565 while (1) { 1566 int i = 0; 1567 char buffer[EV_BUF_LEN]; 1568 ret = poll(&pfd, 1, 1000 * timeout); 1569 1570 if (ret == -1) { 1571 ublk_err("%s: poll inotify failed: %d\n", __func__, ret); 1572 goto rm_watch; 1573 } else if (ret == 0) { 1574 ublk_err("%s: poll inotify timeout\n", __func__); 1575 ret = -ETIMEDOUT; 1576 goto rm_watch; 1577 } 1578 1579 ret = read(fd, buffer, EV_BUF_LEN); 1580 if (ret < 0) { 1581 ublk_err("%s: read inotify fd failed\n", __func__); 1582 goto rm_watch; 1583 } 1584 1585 while (i < ret) { 1586 struct inotify_event *event = (struct inotify_event *)&buffer[i]; 1587 1588 ublk_dbg(UBLK_DBG_DEV, "%s: inotify event %x %s\n", 1589 __func__, event->mask, event->name); 1590 if (event->mask & evt_mask) { 1591 if (!strcmp(event->name, dev_name)) { 1592 ret = 0; 1593 goto rm_watch; 1594 } 1595 } 1596 i += EV_SIZE + event->len; 1597 } 1598 } 1599 rm_watch: 1600 inotify_rm_watch(fd, wd); 1601 fail: 1602 close(fd); 1603 return ret; 1604 } 1605 1606 static int ublk_stop_io_daemon(const struct ublk_dev *dev) 1607 { 1608 int daemon_pid = dev->dev_info.ublksrv_pid; 1609 int dev_id = dev->dev_info.dev_id; 1610 char ublkc[64]; 1611 int ret = 0; 1612 1613 if (daemon_pid < 0) 1614 return 0; 1615 1616 /* daemon may be dead already */ 1617 if (kill(daemon_pid, 0) < 0) 1618 goto wait; 1619 1620 snprintf(ublkc, sizeof(ublkc), "/dev/%s%d", "ublkc", dev_id); 1621 1622 /* ublk char device may be gone already */ 1623 if (access(ublkc, F_OK) != 0) 1624 goto wait; 1625 1626 /* Wait until ublk char device is closed, when the daemon is shutdown */ 1627 ret = wait_ublk_dev(ublkc, IN_CLOSE, 10); 1628 /* double check and since it may be closed before starting inotify */ 1629 if (ret == -ETIMEDOUT) 1630 ret = kill(daemon_pid, 0) < 0; 1631 wait: 1632 waitpid(daemon_pid, NULL, 0); 1633 ublk_dbg(UBLK_DBG_DEV, "%s: pid %d dev_id %d ret %d\n", 1634 __func__, daemon_pid, dev_id, ret); 1635 1636 return ret; 1637 } 1638 1639 static int __cmd_dev_add(const struct dev_ctx *ctx) 1640 { 1641 unsigned nthreads = ctx->nthreads; 1642 unsigned nr_queues = ctx->nr_hw_queues; 1643 const char *tgt_type = ctx->tgt_type; 1644 unsigned depth = ctx->queue_depth; 1645 __u64 features; 1646 const struct ublk_tgt_ops *ops; 1647 struct ublksrv_ctrl_dev_info *info; 1648 struct ublk_dev *dev = NULL; 1649 int dev_id = ctx->dev_id; 1650 int ret, i; 1651 1652 ops = ublk_find_tgt(tgt_type); 1653 if (!ops) { 1654 ublk_err("%s: no such tgt type, type %s\n", 1655 __func__, tgt_type); 1656 ret = -ENODEV; 1657 goto fail; 1658 } 1659 1660 if (nr_queues > UBLK_MAX_QUEUES || depth > UBLK_QUEUE_DEPTH) { 1661 ublk_err("%s: invalid nr_queues or depth queues %u depth %u\n", 1662 __func__, nr_queues, depth); 1663 ret = -EINVAL; 1664 goto fail; 1665 } 1666 1667 /* default to 1:1 threads:queues if nthreads is unspecified */ 1668 if (!nthreads) 1669 nthreads = nr_queues; 1670 1671 if (nthreads > UBLK_MAX_THREADS) { 1672 ublk_err("%s: %u is too many threads (max %u)\n", 1673 __func__, nthreads, UBLK_MAX_THREADS); 1674 ret = -EINVAL; 1675 goto fail; 1676 } 1677 1678 if (nthreads != nr_queues && (!ctx->per_io_tasks && 1679 !(ctx->flags & UBLK_F_BATCH_IO))) { 1680 ublk_err("%s: threads %u must be same as queues %u if " 1681 "not using per_io_tasks\n", 1682 __func__, nthreads, nr_queues); 1683 ret = -EINVAL; 1684 goto fail; 1685 } 1686 1687 dev = ublk_ctrl_init(); 1688 if (!dev) { 1689 ublk_err("%s: can't alloc dev id %d, type %s\n", 1690 __func__, dev_id, tgt_type); 1691 ret = -ENOMEM; 1692 goto fail; 1693 } 1694 1695 /* kernel doesn't support get_features */ 1696 ret = ublk_ctrl_get_features(dev, &features); 1697 if (ret < 0) { 1698 ret = -EINVAL; 1699 goto fail; 1700 } 1701 1702 if (!(features & UBLK_F_CMD_IOCTL_ENCODE)) { 1703 ret = -ENOTSUP; 1704 goto fail; 1705 } 1706 1707 info = &dev->dev_info; 1708 info->dev_id = ctx->dev_id; 1709 info->nr_hw_queues = nr_queues; 1710 info->queue_depth = depth; 1711 info->flags = ctx->flags; 1712 if ((features & UBLK_F_QUIESCE) && 1713 (info->flags & UBLK_F_USER_RECOVERY)) 1714 info->flags |= UBLK_F_QUIESCE; 1715 dev->nthreads = nthreads; 1716 dev->per_io_tasks = ctx->per_io_tasks; 1717 dev->tgt.ops = ops; 1718 dev->tgt.sq_depth = depth; 1719 dev->tgt.cq_depth = depth; 1720 1721 for (i = 0; i < MAX_BACK_FILES; i++) { 1722 if (ctx->files[i]) { 1723 strcpy(dev->tgt.backing_file[i], ctx->files[i]); 1724 dev->tgt.nr_backing_files++; 1725 } 1726 } 1727 1728 if (ctx->recovery) 1729 ret = ublk_ctrl_start_user_recovery(dev); 1730 else 1731 ret = ublk_ctrl_add_dev(dev); 1732 if (ret < 0) { 1733 ublk_err("%s: can't add dev id %d, type %s ret %d\n", 1734 __func__, dev_id, tgt_type, ret); 1735 goto fail; 1736 } 1737 1738 ret = ublk_start_daemon(ctx, dev); 1739 ublk_dbg(UBLK_DBG_DEV, "%s: daemon exit %d\n", __func__, ret); 1740 if (ret < 0) 1741 ublk_ctrl_del_dev(dev); 1742 1743 fail: 1744 if (ret < 0) 1745 ublk_send_dev_event(ctx, dev, -1); 1746 if (dev) 1747 ublk_ctrl_deinit(dev); 1748 return ret; 1749 } 1750 1751 static int __cmd_dev_list(struct dev_ctx *ctx); 1752 1753 static int cmd_dev_add(struct dev_ctx *ctx) 1754 { 1755 int res; 1756 1757 if (ctx->fg) 1758 goto run; 1759 1760 ctx->_shmid = shmget(IPC_PRIVATE, sizeof(struct ublk_dev), IPC_CREAT | 0666); 1761 if (ctx->_shmid < 0) { 1762 ublk_err("%s: failed to shmget %s\n", __func__, strerror(errno)); 1763 exit(-1); 1764 } 1765 ctx->shadow_dev = (struct ublk_dev *)shmat(ctx->_shmid, NULL, 0); 1766 if (ctx->shadow_dev == (struct ublk_dev *)-1) { 1767 ublk_err("%s: failed to shmat %s\n", __func__, strerror(errno)); 1768 exit(-1); 1769 } 1770 ctx->_evtfd = eventfd(0, 0); 1771 if (ctx->_evtfd < 0) { 1772 ublk_err("%s: failed to create eventfd %s\n", __func__, strerror(errno)); 1773 exit(-1); 1774 } 1775 1776 res = fork(); 1777 if (res == 0) { 1778 int res2; 1779 1780 setsid(); 1781 res2 = fork(); 1782 if (res2 == 0) { 1783 /* prepare for detaching */ 1784 close(STDIN_FILENO); 1785 close(STDOUT_FILENO); 1786 close(STDERR_FILENO); 1787 run: 1788 res = __cmd_dev_add(ctx); 1789 return res; 1790 } else { 1791 /* detached from the foreground task */ 1792 exit(EXIT_SUCCESS); 1793 } 1794 } else if (res > 0) { 1795 uint64_t id; 1796 int exit_code = EXIT_FAILURE; 1797 1798 res = read(ctx->_evtfd, &id, sizeof(id)); 1799 close(ctx->_evtfd); 1800 if (res == sizeof(id) && id != ERROR_EVTFD_DEVID) { 1801 ctx->dev_id = id - 1; 1802 if (__cmd_dev_list(ctx) >= 0) 1803 exit_code = EXIT_SUCCESS; 1804 } 1805 shmdt(ctx->shadow_dev); 1806 shmctl(ctx->_shmid, IPC_RMID, NULL); 1807 /* wait for child and detach from it */ 1808 wait(NULL); 1809 if (exit_code == EXIT_FAILURE) 1810 ublk_err("%s: command failed\n", __func__); 1811 exit(exit_code); 1812 } else { 1813 exit(EXIT_FAILURE); 1814 } 1815 } 1816 1817 static int __cmd_dev_del(struct dev_ctx *ctx) 1818 { 1819 int number = ctx->dev_id; 1820 struct ublk_dev *dev; 1821 int ret; 1822 1823 dev = ublk_ctrl_init(); 1824 dev->dev_info.dev_id = number; 1825 1826 ret = ublk_ctrl_get_info(dev); 1827 if (ret < 0) 1828 goto fail; 1829 1830 ret = ublk_ctrl_stop_dev(dev); 1831 if (ret < 0) 1832 ublk_err("%s: stop dev %d failed ret %d\n", __func__, number, ret); 1833 1834 ret = ublk_stop_io_daemon(dev); 1835 if (ret < 0) 1836 ublk_err("%s: stop daemon id %d dev %d, ret %d\n", 1837 __func__, dev->dev_info.ublksrv_pid, number, ret); 1838 ublk_ctrl_del_dev(dev); 1839 fail: 1840 ublk_ctrl_deinit(dev); 1841 1842 return (ret >= 0) ? 0 : ret; 1843 } 1844 1845 static int cmd_dev_del(struct dev_ctx *ctx) 1846 { 1847 int i; 1848 1849 if (ctx->dev_id >= 0 || !ctx->all) 1850 return __cmd_dev_del(ctx); 1851 1852 for (i = 0; i < 255; i++) { 1853 ctx->dev_id = i; 1854 __cmd_dev_del(ctx); 1855 } 1856 return 0; 1857 } 1858 1859 static int cmd_dev_stop(struct dev_ctx *ctx) 1860 { 1861 int number = ctx->dev_id; 1862 struct ublk_dev *dev; 1863 int ret; 1864 1865 if (number < 0) { 1866 ublk_err("%s: device id is required\n", __func__); 1867 return -EINVAL; 1868 } 1869 1870 dev = ublk_ctrl_init(); 1871 dev->dev_info.dev_id = number; 1872 1873 ret = ublk_ctrl_get_info(dev); 1874 if (ret < 0) 1875 goto fail; 1876 1877 if (ctx->safe_stop) { 1878 ret = ublk_ctrl_try_stop_dev(dev); 1879 if (ret < 0) 1880 ublk_err("%s: try_stop dev %d failed ret %d\n", 1881 __func__, number, ret); 1882 } else { 1883 ret = ublk_ctrl_stop_dev(dev); 1884 if (ret < 0) 1885 ublk_err("%s: stop dev %d failed ret %d\n", 1886 __func__, number, ret); 1887 } 1888 1889 fail: 1890 ublk_ctrl_deinit(dev); 1891 1892 return ret; 1893 } 1894 1895 static int __cmd_dev_list(struct dev_ctx *ctx) 1896 { 1897 struct ublk_dev *dev = ublk_ctrl_init(); 1898 int ret; 1899 1900 if (!dev) 1901 return -ENODEV; 1902 1903 dev->dev_info.dev_id = ctx->dev_id; 1904 1905 ret = ublk_ctrl_get_info(dev); 1906 if (ret < 0) { 1907 if (ctx->logging) 1908 ublk_err("%s: can't get dev info from %d: %d\n", 1909 __func__, ctx->dev_id, ret); 1910 } else { 1911 if (ctx->shadow_dev) 1912 memcpy(&dev->q, ctx->shadow_dev->q, sizeof(dev->q)); 1913 1914 ublk_ctrl_dump(dev); 1915 } 1916 1917 ublk_ctrl_deinit(dev); 1918 1919 return ret; 1920 } 1921 1922 static int cmd_dev_list(struct dev_ctx *ctx) 1923 { 1924 int i; 1925 1926 if (ctx->dev_id >= 0 || !ctx->all) 1927 return __cmd_dev_list(ctx); 1928 1929 ctx->logging = false; 1930 for (i = 0; i < 255; i++) { 1931 ctx->dev_id = i; 1932 __cmd_dev_list(ctx); 1933 } 1934 return 0; 1935 } 1936 1937 static int cmd_dev_get_features(void) 1938 { 1939 #define const_ilog2(x) (63 - __builtin_clzll(x)) 1940 #define FEAT_NAME(f) [const_ilog2(f)] = #f 1941 static const char *feat_map[] = { 1942 FEAT_NAME(UBLK_F_SUPPORT_ZERO_COPY), 1943 FEAT_NAME(UBLK_F_URING_CMD_COMP_IN_TASK), 1944 FEAT_NAME(UBLK_F_NEED_GET_DATA), 1945 FEAT_NAME(UBLK_F_USER_RECOVERY), 1946 FEAT_NAME(UBLK_F_USER_RECOVERY_REISSUE), 1947 FEAT_NAME(UBLK_F_UNPRIVILEGED_DEV), 1948 FEAT_NAME(UBLK_F_CMD_IOCTL_ENCODE), 1949 FEAT_NAME(UBLK_F_USER_COPY), 1950 FEAT_NAME(UBLK_F_ZONED), 1951 FEAT_NAME(UBLK_F_USER_RECOVERY_FAIL_IO), 1952 FEAT_NAME(UBLK_F_UPDATE_SIZE), 1953 FEAT_NAME(UBLK_F_AUTO_BUF_REG), 1954 FEAT_NAME(UBLK_F_QUIESCE), 1955 FEAT_NAME(UBLK_F_PER_IO_DAEMON), 1956 FEAT_NAME(UBLK_F_BUF_REG_OFF_DAEMON), 1957 FEAT_NAME(UBLK_F_INTEGRITY), 1958 FEAT_NAME(UBLK_F_SAFE_STOP_DEV), 1959 FEAT_NAME(UBLK_F_BATCH_IO), 1960 FEAT_NAME(UBLK_F_NO_AUTO_PART_SCAN), 1961 FEAT_NAME(UBLK_F_SHMEM_ZC), 1962 }; 1963 struct ublk_dev *dev; 1964 __u64 features = 0; 1965 int ret; 1966 1967 dev = ublk_ctrl_init(); 1968 if (!dev) { 1969 fprintf(stderr, "ublksrv_ctrl_init failed id\n"); 1970 return -EOPNOTSUPP; 1971 } 1972 1973 ret = ublk_ctrl_get_features(dev, &features); 1974 if (!ret) { 1975 int i; 1976 1977 printf("ublk_drv features: 0x%llx\n", features); 1978 1979 for (i = 0; i < sizeof(features) * 8; i++) { 1980 const char *feat; 1981 1982 if (!((1ULL << i) & features)) 1983 continue; 1984 if (i < ARRAY_SIZE(feat_map)) 1985 feat = feat_map[i]; 1986 else 1987 feat = "unknown"; 1988 printf("0x%-16llx: %s\n", 1ULL << i, feat); 1989 } 1990 } 1991 1992 return ret; 1993 } 1994 1995 static int cmd_dev_update_size(struct dev_ctx *ctx) 1996 { 1997 struct ublk_dev *dev = ublk_ctrl_init(); 1998 struct ublk_params p; 1999 int ret = -EINVAL; 2000 2001 if (!dev) 2002 return -ENODEV; 2003 2004 if (ctx->dev_id < 0) { 2005 fprintf(stderr, "device id isn't provided\n"); 2006 goto out; 2007 } 2008 2009 dev->dev_info.dev_id = ctx->dev_id; 2010 ret = ublk_ctrl_get_params(dev, &p); 2011 if (ret < 0) { 2012 ublk_err("failed to get params %d %s\n", ret, strerror(-ret)); 2013 goto out; 2014 } 2015 2016 if (ctx->size & ((1 << p.basic.logical_bs_shift) - 1)) { 2017 ublk_err("size isn't aligned with logical block size\n"); 2018 ret = -EINVAL; 2019 goto out; 2020 } 2021 2022 ret = ublk_ctrl_update_size(dev, ctx->size >> 9); 2023 out: 2024 ublk_ctrl_deinit(dev); 2025 return ret; 2026 } 2027 2028 static int cmd_dev_quiesce(struct dev_ctx *ctx) 2029 { 2030 struct ublk_dev *dev = ublk_ctrl_init(); 2031 int ret = -EINVAL; 2032 2033 if (!dev) 2034 return -ENODEV; 2035 2036 if (ctx->dev_id < 0) { 2037 fprintf(stderr, "device id isn't provided for quiesce\n"); 2038 goto out; 2039 } 2040 dev->dev_info.dev_id = ctx->dev_id; 2041 ret = ublk_ctrl_quiesce_dev(dev, 10000); 2042 2043 out: 2044 ublk_ctrl_deinit(dev); 2045 return ret; 2046 } 2047 2048 static void __cmd_create_help(char *exe, bool recovery) 2049 { 2050 int i; 2051 2052 printf("%s %s -t [null|loop|stripe|fault_inject] [-q nr_queues] [-d depth] [-n dev_id]\n", 2053 exe, recovery ? "recover" : "add"); 2054 printf("\t[--foreground] [--quiet] [-z] [--auto_zc] [--auto_zc_fallback] [--debug_mask mask] [-r 0|1] [-g] [-u]\n"); 2055 printf("\t[-e 0|1 ] [-i 0|1] [--no_ublk_fixed_fd]\n"); 2056 printf("\t[--nthreads threads] [--per_io_tasks]\n"); 2057 printf("\t[--integrity_capable] [--integrity_reftag] [--metadata_size SIZE] " 2058 "[--pi_offset OFFSET] [--csum_type ip|t10dif|nvme] [--tag_size SIZE]\n"); 2059 printf("\t[--batch|-b] [--no_auto_part_scan]\n"); 2060 printf("\t[target options] [backfile1] [backfile2] ...\n"); 2061 printf("\tdefault: nr_queues=2(max 32), depth=128(max 1024), dev_id=-1(auto allocation)\n"); 2062 printf("\tdefault: nthreads=nr_queues"); 2063 2064 for (i = 0; i < ARRAY_SIZE(tgt_ops_list); i++) { 2065 const struct ublk_tgt_ops *ops = tgt_ops_list[i]; 2066 2067 if (ops->usage) 2068 ops->usage(ops); 2069 } 2070 } 2071 2072 static void cmd_add_help(char *exe) 2073 { 2074 __cmd_create_help(exe, false); 2075 printf("\n"); 2076 } 2077 2078 static void cmd_recover_help(char *exe) 2079 { 2080 __cmd_create_help(exe, true); 2081 printf("\tPlease provide exact command line for creating this device with real dev_id\n"); 2082 printf("\n"); 2083 } 2084 2085 static int cmd_dev_help(char *exe) 2086 { 2087 cmd_add_help(exe); 2088 cmd_recover_help(exe); 2089 2090 printf("%s del [-n dev_id] -a \n", exe); 2091 printf("\t -a delete all devices -n delete specified device\n\n"); 2092 printf("%s stop -n dev_id [--safe]\n", exe); 2093 printf("\t --safe only stop if device has no active openers\n\n"); 2094 printf("%s list [-n dev_id] -a \n", exe); 2095 printf("\t -a list all devices, -n list specified device, default -a \n\n"); 2096 printf("%s features\n", exe); 2097 printf("%s update_size -n dev_id -s|--size size_in_bytes \n", exe); 2098 printf("%s quiesce -n dev_id\n", exe); 2099 return 0; 2100 } 2101 2102 int main(int argc, char *argv[]) 2103 { 2104 static const struct option longopts[] = { 2105 { "all", 0, NULL, 'a' }, 2106 { "type", 1, NULL, 't' }, 2107 { "number", 1, NULL, 'n' }, 2108 { "queues", 1, NULL, 'q' }, 2109 { "depth", 1, NULL, 'd' }, 2110 { "debug_mask", 1, NULL, 0 }, 2111 { "quiet", 0, NULL, 0 }, 2112 { "zero_copy", 0, NULL, 'z' }, 2113 { "foreground", 0, NULL, 0 }, 2114 { "recovery", 1, NULL, 'r' }, 2115 { "recovery_fail_io", 1, NULL, 'e'}, 2116 { "recovery_reissue", 1, NULL, 'i'}, 2117 { "get_data", 1, NULL, 'g'}, 2118 { "auto_zc", 0, NULL, 0 }, 2119 { "auto_zc_fallback", 0, NULL, 0 }, 2120 { "user_copy", 0, NULL, 'u'}, 2121 { "size", 1, NULL, 's'}, 2122 { "nthreads", 1, NULL, 0 }, 2123 { "per_io_tasks", 0, NULL, 0 }, 2124 { "no_ublk_fixed_fd", 0, NULL, 0 }, 2125 { "integrity_capable", 0, NULL, 0 }, 2126 { "integrity_reftag", 0, NULL, 0 }, 2127 { "metadata_size", 1, NULL, 0 }, 2128 { "pi_offset", 1, NULL, 0 }, 2129 { "csum_type", 1, NULL, 0 }, 2130 { "tag_size", 1, NULL, 0 }, 2131 { "safe", 0, NULL, 0 }, 2132 { "batch", 0, NULL, 'b'}, 2133 { "no_auto_part_scan", 0, NULL, 0 }, 2134 { "shmem_zc", 0, NULL, 0 }, 2135 { "htlb", 1, NULL, 0 }, 2136 { "rdonly_shmem_buf", 0, NULL, 0 }, 2137 { 0, 0, 0, 0 } 2138 }; 2139 const struct ublk_tgt_ops *ops = NULL; 2140 int option_idx, opt; 2141 const char *cmd = argv[1]; 2142 struct dev_ctx ctx = { 2143 ._evtfd = -1, 2144 .queue_depth = 128, 2145 .nr_hw_queues = 2, 2146 .dev_id = -1, 2147 .tgt_type = "unknown", 2148 .csum_type = LBMD_PI_CSUM_NONE, 2149 }; 2150 int ret = -EINVAL, i; 2151 int tgt_argc = 1; 2152 char *tgt_argv[MAX_NR_TGT_ARG] = { NULL }; 2153 int value; 2154 2155 if (argc == 1) 2156 return ret; 2157 2158 opterr = 0; 2159 optind = 2; 2160 while ((opt = getopt_long(argc, argv, "t:n:d:q:r:e:i:s:gazub", 2161 longopts, &option_idx)) != -1) { 2162 switch (opt) { 2163 case 'a': 2164 ctx.all = 1; 2165 break; 2166 case 'b': 2167 ctx.flags |= UBLK_F_BATCH_IO; 2168 break; 2169 case 'n': 2170 ctx.dev_id = strtol(optarg, NULL, 10); 2171 break; 2172 case 't': 2173 if (strlen(optarg) < sizeof(ctx.tgt_type)) 2174 strcpy(ctx.tgt_type, optarg); 2175 break; 2176 case 'q': 2177 ctx.nr_hw_queues = strtol(optarg, NULL, 10); 2178 break; 2179 case 'd': 2180 ctx.queue_depth = strtol(optarg, NULL, 10); 2181 break; 2182 case 'z': 2183 ctx.flags |= UBLK_F_SUPPORT_ZERO_COPY; 2184 break; 2185 case 'r': 2186 value = strtol(optarg, NULL, 10); 2187 if (value) 2188 ctx.flags |= UBLK_F_USER_RECOVERY; 2189 break; 2190 case 'e': 2191 value = strtol(optarg, NULL, 10); 2192 if (value) 2193 ctx.flags |= UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_FAIL_IO; 2194 break; 2195 case 'i': 2196 value = strtol(optarg, NULL, 10); 2197 if (value) 2198 ctx.flags |= UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_REISSUE; 2199 break; 2200 case 'g': 2201 ctx.flags |= UBLK_F_NEED_GET_DATA; 2202 break; 2203 case 'u': 2204 ctx.flags |= UBLK_F_USER_COPY; 2205 break; 2206 case 's': 2207 ctx.size = strtoull(optarg, NULL, 10); 2208 break; 2209 case 0: 2210 if (!strcmp(longopts[option_idx].name, "debug_mask")) 2211 ublk_dbg_mask = strtol(optarg, NULL, 16); 2212 if (!strcmp(longopts[option_idx].name, "quiet")) 2213 ublk_dbg_mask = 0; 2214 if (!strcmp(longopts[option_idx].name, "foreground")) 2215 ctx.fg = 1; 2216 if (!strcmp(longopts[option_idx].name, "auto_zc")) 2217 ctx.flags |= UBLK_F_AUTO_BUF_REG; 2218 if (!strcmp(longopts[option_idx].name, "auto_zc_fallback")) 2219 ctx.auto_zc_fallback = 1; 2220 if (!strcmp(longopts[option_idx].name, "nthreads")) 2221 ctx.nthreads = strtol(optarg, NULL, 10); 2222 if (!strcmp(longopts[option_idx].name, "per_io_tasks")) 2223 ctx.per_io_tasks = 1; 2224 if (!strcmp(longopts[option_idx].name, "no_ublk_fixed_fd")) 2225 ctx.no_ublk_fixed_fd = 1; 2226 if (!strcmp(longopts[option_idx].name, "integrity_capable")) 2227 ctx.integrity_flags |= LBMD_PI_CAP_INTEGRITY; 2228 if (!strcmp(longopts[option_idx].name, "integrity_reftag")) 2229 ctx.integrity_flags |= LBMD_PI_CAP_REFTAG; 2230 if (!strcmp(longopts[option_idx].name, "metadata_size")) 2231 ctx.metadata_size = strtoul(optarg, NULL, 0); 2232 if (!strcmp(longopts[option_idx].name, "pi_offset")) 2233 ctx.pi_offset = strtoul(optarg, NULL, 0); 2234 if (!strcmp(longopts[option_idx].name, "csum_type")) { 2235 if (!strcmp(optarg, "ip")) { 2236 ctx.csum_type = LBMD_PI_CSUM_IP; 2237 } else if (!strcmp(optarg, "t10dif")) { 2238 ctx.csum_type = LBMD_PI_CSUM_CRC16_T10DIF; 2239 } else if (!strcmp(optarg, "nvme")) { 2240 ctx.csum_type = LBMD_PI_CSUM_CRC64_NVME; 2241 } else { 2242 ublk_err("invalid csum_type: %s\n", optarg); 2243 return -EINVAL; 2244 } 2245 } 2246 if (!strcmp(longopts[option_idx].name, "tag_size")) 2247 ctx.tag_size = strtoul(optarg, NULL, 0); 2248 if (!strcmp(longopts[option_idx].name, "safe")) 2249 ctx.safe_stop = 1; 2250 if (!strcmp(longopts[option_idx].name, "no_auto_part_scan")) 2251 ctx.flags |= UBLK_F_NO_AUTO_PART_SCAN; 2252 if (!strcmp(longopts[option_idx].name, "shmem_zc")) 2253 ctx.flags |= UBLK_F_SHMEM_ZC; 2254 if (!strcmp(longopts[option_idx].name, "htlb")) 2255 ctx.htlb_path = strdup(optarg); 2256 if (!strcmp(longopts[option_idx].name, "rdonly_shmem_buf")) 2257 ctx.rdonly_shmem_buf = 1; 2258 break; 2259 case '?': 2260 /* 2261 * target requires every option must have argument 2262 */ 2263 if (argv[optind][0] == '-' || argv[optind - 1][0] != '-') { 2264 fprintf(stderr, "every target option requires argument: %s %s\n", 2265 argv[optind - 1], argv[optind]); 2266 exit(EXIT_FAILURE); 2267 } 2268 2269 if (tgt_argc < (MAX_NR_TGT_ARG - 1) / 2) { 2270 tgt_argv[tgt_argc++] = argv[optind - 1]; 2271 tgt_argv[tgt_argc++] = argv[optind]; 2272 } else { 2273 fprintf(stderr, "too many target options\n"); 2274 exit(EXIT_FAILURE); 2275 } 2276 optind += 1; 2277 break; 2278 } 2279 } 2280 2281 if (ctx.per_io_tasks && (ctx.flags & UBLK_F_BATCH_IO)) { 2282 ublk_err("per_io_task and F_BATCH_IO conflict\n"); 2283 return -EINVAL; 2284 } 2285 2286 /* auto_zc_fallback depends on F_AUTO_BUF_REG & F_SUPPORT_ZERO_COPY */ 2287 if (ctx.auto_zc_fallback && 2288 !((ctx.flags & UBLK_F_AUTO_BUF_REG) && 2289 (ctx.flags & UBLK_F_SUPPORT_ZERO_COPY))) { 2290 ublk_err("%s: auto_zc_fallback is set but neither " 2291 "F_AUTO_BUF_REG nor F_SUPPORT_ZERO_COPY is enabled\n", 2292 __func__); 2293 return -EINVAL; 2294 } 2295 2296 if (!!(ctx.flags & UBLK_F_NEED_GET_DATA) + 2297 !!(ctx.flags & UBLK_F_USER_COPY) + 2298 (ctx.flags & UBLK_F_SUPPORT_ZERO_COPY && !ctx.auto_zc_fallback) + 2299 (ctx.flags & UBLK_F_AUTO_BUF_REG && !ctx.auto_zc_fallback) + 2300 ctx.auto_zc_fallback > 1) { 2301 fprintf(stderr, "too many data copy modes specified\n"); 2302 return -EINVAL; 2303 } 2304 2305 if (ctx.metadata_size) { 2306 if (!(ctx.flags & UBLK_F_USER_COPY)) { 2307 ublk_err("integrity requires user_copy\n"); 2308 return -EINVAL; 2309 } 2310 2311 ctx.flags |= UBLK_F_INTEGRITY; 2312 } else if (ctx.integrity_flags || 2313 ctx.pi_offset || 2314 ctx.csum_type != LBMD_PI_CSUM_NONE || 2315 ctx.tag_size) { 2316 ublk_err("integrity parameters require metadata_size\n"); 2317 return -EINVAL; 2318 } 2319 2320 if ((ctx.flags & UBLK_F_AUTO_BUF_REG) && 2321 (ctx.flags & UBLK_F_BATCH_IO) && 2322 (ctx.nthreads > ctx.nr_hw_queues)) { 2323 ublk_err("too many threads for F_AUTO_BUF_REG & F_BATCH_IO\n"); 2324 return -EINVAL; 2325 } 2326 2327 i = optind; 2328 while (i < argc && ctx.nr_files < MAX_BACK_FILES) { 2329 ctx.files[ctx.nr_files++] = argv[i++]; 2330 } 2331 2332 ops = ublk_find_tgt(ctx.tgt_type); 2333 if (ops && ops->parse_cmd_line) { 2334 optind = 0; 2335 2336 tgt_argv[0] = ctx.tgt_type; 2337 ops->parse_cmd_line(&ctx, tgt_argc, tgt_argv); 2338 } 2339 2340 if (!strcmp(cmd, "add")) 2341 ret = cmd_dev_add(&ctx); 2342 else if (!strcmp(cmd, "recover")) { 2343 if (ctx.dev_id < 0) { 2344 fprintf(stderr, "device id isn't provided for recovering\n"); 2345 ret = -EINVAL; 2346 } else { 2347 ctx.recovery = 1; 2348 ret = cmd_dev_add(&ctx); 2349 } 2350 } else if (!strcmp(cmd, "del")) 2351 ret = cmd_dev_del(&ctx); 2352 else if (!strcmp(cmd, "stop")) 2353 ret = cmd_dev_stop(&ctx); 2354 else if (!strcmp(cmd, "list")) { 2355 ctx.all = 1; 2356 ret = cmd_dev_list(&ctx); 2357 } else if (!strcmp(cmd, "help")) 2358 ret = cmd_dev_help(argv[0]); 2359 else if (!strcmp(cmd, "features")) 2360 ret = cmd_dev_get_features(); 2361 else if (!strcmp(cmd, "update_size")) 2362 ret = cmd_dev_update_size(&ctx); 2363 else if (!strcmp(cmd, "quiesce")) 2364 ret = cmd_dev_quiesce(&ctx); 2365 else 2366 cmd_dev_help(argv[0]); 2367 2368 return ret; 2369 } 2370