1 /* SPDX-License-Identifier: GPL-2.0 */ 2 #ifndef KUBLK_INTERNAL_H 3 #define KUBLK_INTERNAL_H 4 5 #include <unistd.h> 6 #include <stdlib.h> 7 #include <assert.h> 8 #include <stdio.h> 9 #include <stdarg.h> 10 #include <string.h> 11 #include <pthread.h> 12 #include <getopt.h> 13 #include <limits.h> 14 #include <poll.h> 15 #include <fcntl.h> 16 #include <sys/syscall.h> 17 #include <sys/mman.h> 18 #include <sys/ioctl.h> 19 #include <sys/inotify.h> 20 #include <sys/wait.h> 21 #include <sys/eventfd.h> 22 #include <sys/ipc.h> 23 #include <sys/shm.h> 24 #include <linux/io_uring.h> 25 #include <liburing.h> 26 #include <semaphore.h> 27 28 /* allow ublk_dep.h to override ublk_cmd.h */ 29 #include "ublk_dep.h" 30 #include <linux/ublk_cmd.h> 31 32 #include "utils.h" 33 34 #define MAX_BACK_FILES 4 35 36 /****************** part 1: libublk ********************/ 37 38 #define CTRL_DEV "/dev/ublk-control" 39 #define UBLKC_DEV "/dev/ublkc" 40 #define UBLKB_DEV "/dev/ublkb" 41 #define UBLK_CTRL_RING_DEPTH 32 42 #define ERROR_EVTFD_DEVID -2 43 44 #define UBLK_IO_MAX_BYTES (1 << 20) 45 #define UBLK_MAX_QUEUES_SHIFT 5 46 #define UBLK_MAX_QUEUES (1 << UBLK_MAX_QUEUES_SHIFT) 47 #define UBLK_MAX_THREADS_SHIFT 5 48 #define UBLK_MAX_THREADS (1 << UBLK_MAX_THREADS_SHIFT) 49 #define UBLK_QUEUE_DEPTH 1024 50 51 struct ublk_dev; 52 struct ublk_queue; 53 struct ublk_thread; 54 55 struct stripe_ctx { 56 /* stripe */ 57 unsigned int chunk_size; 58 }; 59 60 struct fault_inject_ctx { 61 /* fault_inject */ 62 unsigned long delay_us; 63 bool die_during_fetch; 64 }; 65 66 struct dev_ctx { 67 char tgt_type[16]; 68 unsigned long flags; 69 unsigned nr_hw_queues; 70 unsigned short nthreads; 71 unsigned queue_depth; 72 int dev_id; 73 int nr_files; 74 char *files[MAX_BACK_FILES]; 75 unsigned int logging:1; 76 unsigned int all:1; 77 unsigned int fg:1; 78 unsigned int recovery:1; 79 unsigned int auto_zc_fallback:1; 80 unsigned int per_io_tasks:1; 81 unsigned int no_ublk_fixed_fd:1; 82 unsigned int safe_stop:1; 83 unsigned int no_auto_part_scan:1; 84 unsigned int rdonly_shmem_buf:1; 85 __u32 integrity_flags; 86 __u8 metadata_size; 87 __u8 pi_offset; 88 __u8 csum_type; 89 __u8 tag_size; 90 91 int _evtfd; 92 int _shmid; 93 94 /* built from shmem, only for ublk_dump_dev() */ 95 struct ublk_dev *shadow_dev; 96 97 /* for 'update_size' command */ 98 unsigned long long size; 99 100 char *htlb_path; 101 102 union { 103 struct stripe_ctx stripe; 104 struct fault_inject_ctx fault_inject; 105 }; 106 }; 107 108 struct ublk_ctrl_cmd_data { 109 __u32 cmd_op; 110 #define CTRL_CMD_HAS_DATA 1 111 #define CTRL_CMD_HAS_BUF 2 112 __u32 flags; 113 114 __u64 data[2]; 115 __u64 addr; 116 __u32 len; 117 }; 118 119 struct ublk_io { 120 char *buf_addr; 121 void *integrity_buf; 122 123 #define UBLKS_IO_NEED_FETCH_RQ (1UL << 0) 124 #define UBLKS_IO_NEED_COMMIT_RQ_COMP (1UL << 1) 125 #define UBLKS_IO_FREE (1UL << 2) 126 #define UBLKS_IO_NEED_GET_DATA (1UL << 3) 127 #define UBLKS_IO_NEED_REG_BUF (1UL << 4) 128 unsigned short flags; 129 unsigned short refs; /* used by target code only */ 130 131 int tag; 132 133 int result; 134 135 unsigned short buf_index; 136 unsigned short tgt_ios; 137 void *private_data; 138 }; 139 140 struct ublk_tgt_ops { 141 const char *name; 142 int (*init_tgt)(const struct dev_ctx *ctx, struct ublk_dev *); 143 void (*deinit_tgt)(struct ublk_dev *); 144 145 void (*pre_fetch_io)(struct ublk_thread *t, struct ublk_queue *q, 146 int tag, bool batch); 147 int (*queue_io)(struct ublk_thread *, struct ublk_queue *, int tag); 148 void (*tgt_io_done)(struct ublk_thread *, struct ublk_queue *, 149 const struct io_uring_cqe *); 150 151 /* 152 * Target specific command line handling 153 * 154 * each option requires argument for target command line 155 */ 156 void (*parse_cmd_line)(struct dev_ctx *ctx, int argc, char *argv[]); 157 void (*usage)(const struct ublk_tgt_ops *ops); 158 159 /* return buffer index for UBLK_F_AUTO_BUF_REG */ 160 unsigned short (*buf_index)(const struct ublk_thread *t, 161 const struct ublk_queue *, int tag); 162 }; 163 164 struct ublk_tgt { 165 unsigned long dev_size; 166 unsigned int sq_depth; 167 unsigned int cq_depth; 168 const struct ublk_tgt_ops *ops; 169 struct ublk_params params; 170 171 int nr_backing_files; 172 unsigned long backing_file_size[MAX_BACK_FILES]; 173 char backing_file[MAX_BACK_FILES][PATH_MAX]; 174 }; 175 176 struct ublk_queue { 177 int q_id; 178 int q_depth; 179 struct ublk_dev *dev; 180 const struct ublk_tgt_ops *tgt_ops; 181 struct ublksrv_io_desc *io_cmd_buf; 182 183 /* borrow three bit of ublk uapi flags, which may never be used */ 184 #define UBLKS_Q_AUTO_BUF_REG_FALLBACK (1ULL << 63) 185 #define UBLKS_Q_NO_UBLK_FIXED_FD (1ULL << 62) 186 #define UBLKS_Q_PREPARED (1ULL << 61) 187 __u64 flags; 188 int ublk_fd; /* cached ublk char device fd */ 189 __u8 metadata_size; 190 struct ublk_io ios[UBLK_QUEUE_DEPTH]; 191 192 /* used for prep io commands */ 193 pthread_spinlock_t lock; 194 }; 195 196 /* align with `ublk_elem_header` */ 197 struct ublk_batch_elem { 198 __u16 tag; 199 __u16 buf_index; 200 __s32 result; 201 __u64 buf_addr; 202 }; 203 204 struct batch_commit_buf { 205 unsigned short q_id; 206 unsigned short buf_idx; 207 void *elem; 208 unsigned short done; 209 unsigned short count; 210 }; 211 212 struct batch_fetch_buf { 213 struct io_uring_buf_ring *br; 214 void *fetch_buf; 215 unsigned int fetch_buf_size; 216 unsigned int fetch_buf_off; 217 }; 218 219 struct ublk_thread { 220 /* Thread-local copy of queue-to-thread mapping for this thread */ 221 unsigned char q_map[UBLK_MAX_QUEUES]; 222 223 struct ublk_dev *dev; 224 unsigned short idx; 225 unsigned short nr_queues; 226 227 #define UBLKS_T_STOPPING (1U << 0) 228 #define UBLKS_T_IDLE (1U << 1) 229 #define UBLKS_T_BATCH_IO (1U << 31) /* readonly */ 230 unsigned state; 231 unsigned int cmd_inflight; 232 unsigned int io_inflight; 233 234 unsigned short nr_bufs; 235 236 /* followings are for BATCH_IO */ 237 unsigned short commit_buf_start; 238 unsigned char commit_buf_elem_size; 239 /* 240 * We just support single device, so pre-calculate commit/prep flags 241 */ 242 unsigned short cmd_flags; 243 unsigned int nr_commit_buf; 244 unsigned int commit_buf_size; 245 void *commit_buf; 246 #define UBLKS_T_COMMIT_BUF_INV_IDX ((unsigned short)-1) 247 struct allocator commit_buf_alloc; 248 struct batch_commit_buf *commit; 249 /* FETCH_IO_CMDS buffer */ 250 unsigned short nr_fetch_bufs; 251 struct batch_fetch_buf *fetch; 252 253 struct io_uring ring; 254 }; 255 256 struct ublk_dev { 257 struct ublk_tgt tgt; 258 struct ublksrv_ctrl_dev_info dev_info; 259 struct ublk_queue q[UBLK_MAX_QUEUES]; 260 unsigned nthreads; 261 unsigned per_io_tasks; 262 263 int fds[MAX_BACK_FILES + 1]; /* fds[0] points to /dev/ublkcN */ 264 int nr_fds; 265 int ctrl_fd; 266 struct io_uring ring; 267 268 void *private_data; 269 }; 270 271 extern int ublk_queue_io_cmd(struct ublk_thread *t, struct ublk_io *io); 272 273 static inline int __ublk_use_batch_io(__u64 flags) 274 { 275 return flags & UBLK_F_BATCH_IO; 276 } 277 278 static inline int ublk_queue_batch_io(const struct ublk_queue *q) 279 { 280 return __ublk_use_batch_io(q->flags); 281 } 282 283 static inline int ublk_dev_batch_io(const struct ublk_dev *dev) 284 { 285 return __ublk_use_batch_io(dev->dev_info.flags); 286 } 287 288 /* only work for handle single device in this pthread context */ 289 static inline int ublk_thread_batch_io(const struct ublk_thread *t) 290 { 291 return t->state & UBLKS_T_BATCH_IO; 292 } 293 294 static inline void ublk_set_integrity_params(const struct dev_ctx *ctx, 295 struct ublk_params *params) 296 { 297 if (!ctx->metadata_size) 298 return; 299 300 params->types |= UBLK_PARAM_TYPE_INTEGRITY; 301 params->integrity = (struct ublk_param_integrity) { 302 .flags = ctx->integrity_flags, 303 .interval_exp = params->basic.logical_bs_shift, 304 .metadata_size = ctx->metadata_size, 305 .pi_offset = ctx->pi_offset, 306 .csum_type = ctx->csum_type, 307 .tag_size = ctx->tag_size, 308 }; 309 } 310 311 static inline size_t ublk_integrity_len(const struct ublk_queue *q, size_t len) 312 { 313 /* All targets currently use interval_exp = logical_bs_shift = 9 */ 314 return (len >> 9) * q->metadata_size; 315 } 316 317 static inline size_t 318 ublk_integrity_data_len(const struct ublk_queue *q, size_t integrity_len) 319 { 320 return (integrity_len / q->metadata_size) << 9; 321 } 322 323 static inline int ublk_io_auto_zc_fallback(const struct ublksrv_io_desc *iod) 324 { 325 return !!(iod->op_flags & UBLK_IO_F_NEED_REG_BUF); 326 } 327 328 static inline __u64 ublk_user_copy_offset(unsigned q_id, unsigned tag) 329 { 330 return UBLKSRV_IO_BUF_OFFSET + 331 ((__u64)q_id << UBLK_QID_OFF | (__u64)tag << UBLK_TAG_OFF); 332 } 333 334 static inline int is_target_io(__u64 user_data) 335 { 336 return (user_data & (1ULL << 63)) != 0; 337 } 338 339 static inline __u64 build_user_data(unsigned tag, unsigned op, 340 unsigned tgt_data, unsigned q_id, unsigned is_target_io) 341 { 342 /* we only have 7 bits to encode q_id */ 343 _Static_assert(UBLK_MAX_QUEUES_SHIFT <= 7, "UBLK_MAX_QUEUES_SHIFT must be <= 7"); 344 ublk_assert(!(tag >> 16) && !(op >> 8) && !(tgt_data >> 16) && !(q_id >> 7)); 345 346 return tag | ((__u64)op << 16) | ((__u64)tgt_data << 24) | 347 (__u64)q_id << 56 | (__u64)is_target_io << 63; 348 } 349 350 static inline unsigned int user_data_to_tag(__u64 user_data) 351 { 352 return user_data & 0xffff; 353 } 354 355 static inline unsigned int user_data_to_op(__u64 user_data) 356 { 357 return (user_data >> 16) & 0xff; 358 } 359 360 static inline unsigned int user_data_to_tgt_data(__u64 user_data) 361 { 362 return (user_data >> 24) & 0xffff; 363 } 364 365 static inline unsigned int user_data_to_q_id(__u64 user_data) 366 { 367 return (user_data >> 56) & 0x7f; 368 } 369 370 static inline unsigned short ublk_cmd_op_nr(unsigned int op) 371 { 372 return _IOC_NR(op); 373 } 374 375 static inline struct ublk_queue *ublk_io_to_queue(const struct ublk_io *io) 376 { 377 return container_of(io, struct ublk_queue, ios[io->tag]); 378 } 379 380 static inline int ublk_io_alloc_sqes(struct ublk_thread *t, 381 struct io_uring_sqe *sqes[], int nr_sqes) 382 { 383 struct io_uring *ring = &t->ring; 384 unsigned left = io_uring_sq_space_left(ring); 385 int i; 386 387 if (left < nr_sqes) 388 io_uring_submit(ring); 389 390 for (i = 0; i < nr_sqes; i++) { 391 sqes[i] = io_uring_get_sqe(ring); 392 if (!sqes[i]) 393 return i; 394 } 395 396 return nr_sqes; 397 } 398 399 static inline int ublk_get_registered_fd(struct ublk_queue *q, int fd_index) 400 { 401 if (q->flags & UBLKS_Q_NO_UBLK_FIXED_FD) { 402 if (fd_index == 0) 403 /* Return the raw ublk FD for index 0 */ 404 return q->ublk_fd; 405 /* Adjust index for backing files (index 1 becomes 0, etc.) */ 406 return fd_index - 1; 407 } 408 return fd_index; 409 } 410 411 static inline void __io_uring_prep_buf_reg_unreg(struct io_uring_sqe *sqe, 412 struct ublk_queue *q, int tag, int q_id, __u64 index) 413 { 414 struct ublksrv_io_cmd *cmd = (struct ublksrv_io_cmd *)sqe->cmd; 415 int dev_fd = ublk_get_registered_fd(q, 0); 416 417 io_uring_prep_read(sqe, dev_fd, 0, 0, 0); 418 sqe->opcode = IORING_OP_URING_CMD; 419 if (q->flags & UBLKS_Q_NO_UBLK_FIXED_FD) 420 sqe->flags &= ~IOSQE_FIXED_FILE; 421 else 422 sqe->flags |= IOSQE_FIXED_FILE; 423 424 cmd->tag = tag; 425 cmd->addr = index; 426 cmd->q_id = q_id; 427 } 428 429 static inline void io_uring_prep_buf_register(struct io_uring_sqe *sqe, 430 struct ublk_queue *q, int tag, int q_id, __u64 index) 431 { 432 __io_uring_prep_buf_reg_unreg(sqe, q, tag, q_id, index); 433 sqe->cmd_op = UBLK_U_IO_REGISTER_IO_BUF; 434 } 435 436 static inline void io_uring_prep_buf_unregister(struct io_uring_sqe *sqe, 437 struct ublk_queue *q, int tag, int q_id, __u64 index) 438 { 439 __io_uring_prep_buf_reg_unreg(sqe, q, tag, q_id, index); 440 sqe->cmd_op = UBLK_U_IO_UNREGISTER_IO_BUF; 441 } 442 443 static inline void *ublk_get_sqe_cmd(const struct io_uring_sqe *sqe) 444 { 445 return (void *)&sqe->cmd; 446 } 447 448 static inline void ublk_set_io_res(struct ublk_queue *q, int tag, int res) 449 { 450 q->ios[tag].result = res; 451 } 452 453 static inline int ublk_get_io_res(const struct ublk_queue *q, unsigned tag) 454 { 455 return q->ios[tag].result; 456 } 457 458 static inline void ublk_mark_io_done(struct ublk_io *io, int res) 459 { 460 io->flags |= (UBLKS_IO_NEED_COMMIT_RQ_COMP | UBLKS_IO_FREE); 461 io->result = res; 462 } 463 464 static inline const struct ublksrv_io_desc *ublk_get_iod(const struct ublk_queue *q, int tag) 465 { 466 return &q->io_cmd_buf[tag]; 467 } 468 469 static inline void ublk_set_sqe_cmd_op(struct io_uring_sqe *sqe, __u32 cmd_op) 470 { 471 __u32 *addr = (__u32 *)&sqe->off; 472 473 addr[0] = cmd_op; 474 addr[1] = 0; 475 } 476 477 static inline unsigned short ublk_batch_io_buf_idx( 478 const struct ublk_thread *t, const struct ublk_queue *q, 479 unsigned tag); 480 481 static inline unsigned short ublk_io_buf_idx(const struct ublk_thread *t, 482 const struct ublk_queue *q, 483 unsigned tag) 484 { 485 if (ublk_queue_batch_io(q)) 486 return ublk_batch_io_buf_idx(t, q, tag); 487 return q->ios[tag].buf_index; 488 } 489 490 static inline struct ublk_io *ublk_get_io(struct ublk_queue *q, unsigned tag) 491 { 492 return &q->ios[tag]; 493 } 494 495 static inline int ublk_completed_tgt_io(struct ublk_thread *t, 496 struct ublk_queue *q, unsigned tag) 497 { 498 struct ublk_io *io = ublk_get_io(q, tag); 499 500 t->io_inflight--; 501 502 return --io->tgt_ios == 0; 503 } 504 505 static inline bool ublk_queue_use_zc(const struct ublk_queue *q) 506 { 507 return !!(q->flags & UBLK_F_SUPPORT_ZERO_COPY); 508 } 509 510 static inline bool ublk_queue_use_auto_zc(const struct ublk_queue *q) 511 { 512 return !!(q->flags & UBLK_F_AUTO_BUF_REG); 513 } 514 515 static inline bool ublk_queue_auto_zc_fallback(const struct ublk_queue *q) 516 { 517 return !!(q->flags & UBLKS_Q_AUTO_BUF_REG_FALLBACK); 518 } 519 520 static inline bool ublk_queue_use_user_copy(const struct ublk_queue *q) 521 { 522 return !!(q->flags & UBLK_F_USER_COPY); 523 } 524 525 static inline int ublk_queue_no_buf(const struct ublk_queue *q) 526 { 527 return ublk_queue_use_zc(q) || ublk_queue_use_auto_zc(q); 528 } 529 530 static inline int ublk_batch_commit_prepared(struct batch_commit_buf *cb) 531 { 532 return cb->buf_idx != UBLKS_T_COMMIT_BUF_INV_IDX; 533 } 534 535 static inline unsigned ublk_queue_idx_in_thread(const struct ublk_thread *t, 536 const struct ublk_queue *q) 537 { 538 unsigned char idx; 539 540 idx = t->q_map[q->q_id]; 541 ublk_assert(idx != 0); 542 return idx - 1; 543 } 544 545 /* 546 * Each IO's buffer index has to be calculated by this helper for 547 * UBLKS_T_BATCH_IO 548 */ 549 static inline unsigned short ublk_batch_io_buf_idx( 550 const struct ublk_thread *t, const struct ublk_queue *q, 551 unsigned tag) 552 { 553 return ublk_queue_idx_in_thread(t, q) * q->q_depth + tag; 554 } 555 556 /* Queue UBLK_U_IO_PREP_IO_CMDS for a specific queue with batch elements */ 557 int ublk_batch_queue_prep_io_cmds(struct ublk_thread *t, struct ublk_queue *q); 558 /* Start fetching I/O commands using multishot UBLK_U_IO_FETCH_IO_CMDS */ 559 void ublk_batch_start_fetch(struct ublk_thread *t); 560 /* Handle completion of batch I/O commands (prep/commit) */ 561 void ublk_batch_compl_cmd(struct ublk_thread *t, 562 const struct io_uring_cqe *cqe); 563 /* Initialize batch I/O state and calculate buffer parameters */ 564 void ublk_batch_prepare(struct ublk_thread *t); 565 /* Allocate and register commit buffers for batch operations */ 566 int ublk_batch_alloc_buf(struct ublk_thread *t); 567 /* Free commit buffers and cleanup batch allocator */ 568 void ublk_batch_free_buf(struct ublk_thread *t); 569 570 /* Prepare a new commit buffer for batching completed I/O operations */ 571 void ublk_batch_prep_commit(struct ublk_thread *t); 572 /* Submit UBLK_U_IO_COMMIT_IO_CMDS with batched completed I/O operations */ 573 void ublk_batch_commit_io_cmds(struct ublk_thread *t); 574 /* Add a completed I/O operation to the current batch commit buffer */ 575 void ublk_batch_complete_io(struct ublk_thread *t, struct ublk_queue *q, 576 unsigned tag, int res); 577 void ublk_batch_setup_map(unsigned char (*q_thread_map)[UBLK_MAX_QUEUES], 578 int nthreads, int queues); 579 580 static inline int ublk_complete_io(struct ublk_thread *t, struct ublk_queue *q, 581 unsigned tag, int res) 582 { 583 if (ublk_queue_batch_io(q)) { 584 ublk_batch_complete_io(t, q, tag, res); 585 return 0; 586 } else { 587 struct ublk_io *io = &q->ios[tag]; 588 589 ublk_mark_io_done(io, res); 590 return ublk_queue_io_cmd(t, io); 591 } 592 } 593 594 static inline void ublk_queued_tgt_io(struct ublk_thread *t, struct ublk_queue *q, 595 unsigned tag, int queued) 596 { 597 if (queued < 0) 598 ublk_complete_io(t, q, tag, queued); 599 else { 600 struct ublk_io *io = ublk_get_io(q, tag); 601 602 t->io_inflight += queued; 603 io->tgt_ios = queued; 604 io->result = 0; 605 } 606 } 607 608 /* shared memory zero-copy support */ 609 #define UBLK_BUF_MAX 256 610 611 struct ublk_shmem_entry { 612 int fd; 613 void *mmap_base; 614 size_t size; 615 }; 616 617 extern struct ublk_shmem_entry shmem_table[UBLK_BUF_MAX]; 618 extern int shmem_count; 619 620 extern const struct ublk_tgt_ops null_tgt_ops; 621 extern const struct ublk_tgt_ops loop_tgt_ops; 622 extern const struct ublk_tgt_ops stripe_tgt_ops; 623 extern const struct ublk_tgt_ops fault_inject_tgt_ops; 624 625 void backing_file_tgt_deinit(struct ublk_dev *dev); 626 int backing_file_tgt_init(struct ublk_dev *dev, unsigned int nr_direct); 627 628 #endif 629