/* SPDX-License-Identifier: MIT */
/*
 * Description: uring_cmd based ublk
 */

#include "kublk.h"

#define MAX_NR_TGT_ARG 64

unsigned int ublk_dbg_mask = UBLK_LOG;
static const struct ublk_tgt_ops *tgt_ops_list[] = {
	&null_tgt_ops,
	&loop_tgt_ops,
	&stripe_tgt_ops,
	&fault_inject_tgt_ops,
};

static const struct ublk_tgt_ops *ublk_find_tgt(const char *name)
{
	int i;

	if (name == NULL)
		return NULL;

	for (i = 0; i < ARRAY_SIZE(tgt_ops_list); i++)
		if (strcmp(tgt_ops_list[i]->name, name) == 0)
			return tgt_ops_list[i];
	return NULL;
}

static inline int ublk_setup_ring(struct io_uring *r, int depth,
				  int cq_depth, unsigned flags)
{
	struct io_uring_params p;

	memset(&p, 0, sizeof(p));
	p.flags = flags | IORING_SETUP_CQSIZE;
	p.cq_entries = cq_depth;

	return io_uring_queue_init_params(depth, r, &p);
}
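
/*
 * Note: control commands are issued as IORING_OP_URING_CMD and their payload
 * (struct ublksrv_ctrl_cmd) is larger than the 16 bytes of command space in a
 * regular 64-byte SQE, so the control ring below is created with
 * IORING_SETUP_SQE128.  IORING_SETUP_CQSIZE lets the CQ depth be chosen
 * independently of the SQ depth.
 */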

static void ublk_ctrl_init_cmd(struct ublk_dev *dev,
			       struct io_uring_sqe *sqe,
			       struct ublk_ctrl_cmd_data *data)
{
	struct ublksrv_ctrl_dev_info *info = &dev->dev_info;
	struct ublksrv_ctrl_cmd *cmd = (struct ublksrv_ctrl_cmd *)ublk_get_sqe_cmd(sqe);

	sqe->fd = dev->ctrl_fd;
	sqe->opcode = IORING_OP_URING_CMD;
	sqe->ioprio = 0;

	if (data->flags & CTRL_CMD_HAS_BUF) {
		cmd->addr = data->addr;
		cmd->len = data->len;
	}

	if (data->flags & CTRL_CMD_HAS_DATA)
		cmd->data[0] = data->data[0];

	cmd->dev_id = info->dev_id;
	cmd->queue_id = -1;

	ublk_set_sqe_cmd_op(sqe, data->cmd_op);

	io_uring_sqe_set_data(sqe, cmd);
}

static int __ublk_ctrl_cmd(struct ublk_dev *dev,
			   struct ublk_ctrl_cmd_data *data)
{
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	int ret = -EINVAL;

	sqe = io_uring_get_sqe(&dev->ring);
	if (!sqe) {
		ublk_err("%s: can't get sqe ret %d\n", __func__, ret);
		return ret;
	}

	ublk_ctrl_init_cmd(dev, sqe, data);

	ret = io_uring_submit(&dev->ring);
	if (ret < 0) {
		ublk_err("uring submit ret %d\n", ret);
		return ret;
	}

	ret = io_uring_wait_cqe(&dev->ring, &cqe);
	if (ret < 0) {
		ublk_err("wait cqe: %s\n", strerror(-ret));
		return ret;
	}
	io_uring_cqe_seen(&dev->ring, cqe);

	return cqe->res;
}
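
/*
 * __ublk_ctrl_cmd() is fully synchronous: one SQE is queued on the control
 * ring, submitted, and the single CQE is awaited; cqe->res is returned
 * (negative errno on failure).  The wrappers below only fill in
 * struct ublk_ctrl_cmd_data, e.g. (illustrative sketch, mirroring
 * ublk_ctrl_get_info() further down):
 *
 *	struct ublk_ctrl_cmd_data data = {
 *		.cmd_op = UBLK_U_CMD_GET_DEV_INFO,
 *		.flags = CTRL_CMD_HAS_BUF,
 *		.addr = (__u64) (uintptr_t) &dev->dev_info,
 *		.len = sizeof(struct ublksrv_ctrl_dev_info),
 *	};
 *	ret = __ublk_ctrl_cmd(dev, &data);
 */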

static int ublk_ctrl_stop_dev(struct ublk_dev *dev)
{
	struct ublk_ctrl_cmd_data data = {
		.cmd_op = UBLK_U_CMD_STOP_DEV,
	};

	return __ublk_ctrl_cmd(dev, &data);
}

static int ublk_ctrl_start_dev(struct ublk_dev *dev,
			       int daemon_pid)
{
	struct ublk_ctrl_cmd_data data = {
		.cmd_op = UBLK_U_CMD_START_DEV,
		.flags = CTRL_CMD_HAS_DATA,
	};

	dev->dev_info.ublksrv_pid = data.data[0] = daemon_pid;

	return __ublk_ctrl_cmd(dev, &data);
}

static int ublk_ctrl_start_user_recovery(struct ublk_dev *dev)
{
	struct ublk_ctrl_cmd_data data = {
		.cmd_op = UBLK_U_CMD_START_USER_RECOVERY,
	};

	return __ublk_ctrl_cmd(dev, &data);
}

static int ublk_ctrl_end_user_recovery(struct ublk_dev *dev, int daemon_pid)
{
	struct ublk_ctrl_cmd_data data = {
		.cmd_op = UBLK_U_CMD_END_USER_RECOVERY,
		.flags = CTRL_CMD_HAS_DATA,
	};

	dev->dev_info.ublksrv_pid = data.data[0] = daemon_pid;

	return __ublk_ctrl_cmd(dev, &data);
}

static int ublk_ctrl_add_dev(struct ublk_dev *dev)
{
	struct ublk_ctrl_cmd_data data = {
		.cmd_op = UBLK_U_CMD_ADD_DEV,
		.flags = CTRL_CMD_HAS_BUF,
		.addr = (__u64) (uintptr_t) &dev->dev_info,
		.len = sizeof(struct ublksrv_ctrl_dev_info),
	};

	return __ublk_ctrl_cmd(dev, &data);
}

static int ublk_ctrl_del_dev(struct ublk_dev *dev)
{
	struct ublk_ctrl_cmd_data data = {
		.cmd_op = UBLK_U_CMD_DEL_DEV,
		.flags = 0,
	};

	return __ublk_ctrl_cmd(dev, &data);
}

static int ublk_ctrl_get_info(struct ublk_dev *dev)
{
	struct ublk_ctrl_cmd_data data = {
		.cmd_op = UBLK_U_CMD_GET_DEV_INFO,
		.flags = CTRL_CMD_HAS_BUF,
		.addr = (__u64) (uintptr_t) &dev->dev_info,
		.len = sizeof(struct ublksrv_ctrl_dev_info),
	};

	return __ublk_ctrl_cmd(dev, &data);
}

static int ublk_ctrl_set_params(struct ublk_dev *dev,
				struct ublk_params *params)
{
	struct ublk_ctrl_cmd_data data = {
		.cmd_op = UBLK_U_CMD_SET_PARAMS,
		.flags = CTRL_CMD_HAS_BUF,
		.addr = (__u64) (uintptr_t) params,
		.len = sizeof(*params),
	};
	params->len = sizeof(*params);
	return __ublk_ctrl_cmd(dev, &data);
}

static int ublk_ctrl_get_params(struct ublk_dev *dev,
				struct ublk_params *params)
{
	struct ublk_ctrl_cmd_data data = {
		.cmd_op = UBLK_U_CMD_GET_PARAMS,
		.flags = CTRL_CMD_HAS_BUF,
		.addr = (__u64)params,
		.len = sizeof(*params),
	};

	params->len = sizeof(*params);

	return __ublk_ctrl_cmd(dev, &data);
}

static int ublk_ctrl_get_features(struct ublk_dev *dev,
				  __u64 *features)
{
	struct ublk_ctrl_cmd_data data = {
		.cmd_op = UBLK_U_CMD_GET_FEATURES,
		.flags = CTRL_CMD_HAS_BUF,
		.addr = (__u64) (uintptr_t) features,
		.len = sizeof(*features),
	};

	return __ublk_ctrl_cmd(dev, &data);
}

static int ublk_ctrl_update_size(struct ublk_dev *dev,
				 __u64 nr_sects)
{
	struct ublk_ctrl_cmd_data data = {
		.cmd_op = UBLK_U_CMD_UPDATE_SIZE,
		.flags = CTRL_CMD_HAS_DATA,
	};

	data.data[0] = nr_sects;
	return __ublk_ctrl_cmd(dev, &data);
}
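
/*
 * UBLK_U_CMD_UPDATE_SIZE takes the new capacity in 512-byte sectors, e.g.
 * growing a device to 1 GiB means data[0] = (1ULL << 30) >> 9 = 2097152
 * sectors; cmd_dev_update_size() below does exactly that >> 9 conversion.
 */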

static int ublk_ctrl_quiesce_dev(struct ublk_dev *dev,
				 unsigned int timeout_ms)
{
	struct ublk_ctrl_cmd_data data = {
		.cmd_op = UBLK_U_CMD_QUIESCE_DEV,
		.flags = CTRL_CMD_HAS_DATA,
	};

	data.data[0] = timeout_ms;
	return __ublk_ctrl_cmd(dev, &data);
}

static const char *ublk_dev_state_desc(struct ublk_dev *dev)
{
	switch (dev->dev_info.state) {
	case UBLK_S_DEV_DEAD:
		return "DEAD";
	case UBLK_S_DEV_LIVE:
		return "LIVE";
	case UBLK_S_DEV_QUIESCED:
		return "QUIESCED";
	default:
		return "UNKNOWN";
	}
}

static void ublk_print_cpu_set(const cpu_set_t *set, char *buf, unsigned len)
{
	unsigned done = 0;
	int i;

	/* keep buf a valid string even when the set is empty */
	buf[0] = '\0';

	for (i = 0; i < CPU_SETSIZE; i++) {
		if (CPU_ISSET(i, set))
			done += snprintf(&buf[done], len - done, "%d ", i);
	}
}

static void ublk_adjust_affinity(cpu_set_t *set)
{
	int j, updated = 0;

	/*
	 * Keep only the first CPU for now.
	 *
	 * Automatic affinity selection could be tried in the future.
	 */
	for (j = 0; j < CPU_SETSIZE; j++) {
		if (CPU_ISSET(j, set)) {
			if (!updated) {
				updated = 1;
				continue;
			}
			CPU_CLR(j, set);
		}
	}
}

/* Caller must free the allocated buffer */
static int ublk_ctrl_get_affinity(struct ublk_dev *ctrl_dev, cpu_set_t **ptr_buf)
{
	struct ublk_ctrl_cmd_data data = {
		.cmd_op = UBLK_U_CMD_GET_QUEUE_AFFINITY,
		.flags = CTRL_CMD_HAS_DATA | CTRL_CMD_HAS_BUF,
	};
	cpu_set_t *buf;
	int i, ret;

	buf = malloc(sizeof(cpu_set_t) * ctrl_dev->dev_info.nr_hw_queues);
	if (!buf)
		return -ENOMEM;

	for (i = 0; i < ctrl_dev->dev_info.nr_hw_queues; i++) {
		data.data[0] = i;
		data.len = sizeof(cpu_set_t);
		data.addr = (__u64)&buf[i];

		ret = __ublk_ctrl_cmd(ctrl_dev, &data);
		if (ret < 0) {
			free(buf);
			return ret;
		}
		ublk_adjust_affinity(&buf[i]);
	}

	*ptr_buf = buf;
	return 0;
}

static void ublk_ctrl_dump(struct ublk_dev *dev)
{
	struct ublksrv_ctrl_dev_info *info = &dev->dev_info;
	struct ublk_params p;
	cpu_set_t *affinity;
	int ret;

	ret = ublk_ctrl_get_params(dev, &p);
	if (ret < 0) {
		ublk_err("failed to get params %d %s\n", ret, strerror(-ret));
		return;
	}

	ret = ublk_ctrl_get_affinity(dev, &affinity);
	if (ret < 0) {
		ublk_err("failed to get affinity %m\n");
		return;
	}

	ublk_log("dev id %d: nr_hw_queues %d queue_depth %d block size %d dev_capacity %lld\n",
			info->dev_id, info->nr_hw_queues, info->queue_depth,
			1 << p.basic.logical_bs_shift, p.basic.dev_sectors);
	ublk_log("\tmax rq size %d daemon pid %d flags 0x%llx state %s\n",
			info->max_io_buf_bytes, info->ublksrv_pid, info->flags,
			ublk_dev_state_desc(dev));

	if (affinity) {
		char buf[512];
		int i;

		for (i = 0; i < info->nr_hw_queues; i++) {
			ublk_print_cpu_set(&affinity[i], buf, sizeof(buf));
			printf("\tqueue %u: affinity(%s)\n", i, buf);
		}
		free(affinity);
	}

	fflush(stdout);
}

static void ublk_ctrl_deinit(struct ublk_dev *dev)
{
	close(dev->ctrl_fd);
	free(dev);
}

static struct ublk_dev *ublk_ctrl_init(void)
{
	struct ublk_dev *dev = (struct ublk_dev *)calloc(1, sizeof(*dev));
	struct ublksrv_ctrl_dev_info *info;
	int ret;

	/* callers already handle a NULL return, so bail out on calloc failure */
	if (!dev)
		return NULL;

	info = &dev->dev_info;
	dev->ctrl_fd = open(CTRL_DEV, O_RDWR);
	if (dev->ctrl_fd < 0) {
		free(dev);
		return NULL;
	}

	info->max_io_buf_bytes = UBLK_IO_MAX_BYTES;

	ret = ublk_setup_ring(&dev->ring, UBLK_CTRL_RING_DEPTH,
			      UBLK_CTRL_RING_DEPTH, IORING_SETUP_SQE128);
	if (ret < 0) {
		ublk_err("queue_init: %s\n", strerror(-ret));
		close(dev->ctrl_fd);
		free(dev);
		return NULL;
	}
	dev->nr_fds = 1;

	return dev;
}

static int __ublk_queue_cmd_buf_sz(unsigned depth)
{
	int size = depth * sizeof(struct ublksrv_io_desc);
	unsigned int page_sz = getpagesize();

	return round_up(size, page_sz);
}
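
/*
 * Worked example (assuming a 24-byte struct ublksrv_io_desc and 4 KiB pages):
 * depth 128 gives 128 * 24 = 3072 bytes, rounded up to one 4096-byte page.
 * The max variant below (UBLK_MAX_QUEUE_DEPTH) is what spaces the per-queue
 * command buffers apart in the mmap offset range, see ublk_queue_init().
 */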

static int ublk_queue_max_cmd_buf_sz(void)
{
	return __ublk_queue_cmd_buf_sz(UBLK_MAX_QUEUE_DEPTH);
}

static int ublk_queue_cmd_buf_sz(struct ublk_queue *q)
{
	return __ublk_queue_cmd_buf_sz(q->q_depth);
}

static void ublk_queue_deinit(struct ublk_queue *q)
{
	int i;
	int nr_ios = q->q_depth;

	if (q->io_cmd_buf)
		munmap(q->io_cmd_buf, ublk_queue_cmd_buf_sz(q));

	for (i = 0; i < nr_ios; i++)
		free(q->ios[i].buf_addr);
}

static void ublk_thread_deinit(struct ublk_thread *t)
{
	io_uring_unregister_buffers(&t->ring);

	io_uring_unregister_ring_fd(&t->ring);

	if (t->ring.ring_fd > 0) {
		io_uring_unregister_files(&t->ring);
		close(t->ring.ring_fd);
		t->ring.ring_fd = -1;
	}
}

static int ublk_queue_init(struct ublk_queue *q, unsigned long long extra_flags)
{
	struct ublk_dev *dev = q->dev;
	int depth = dev->dev_info.queue_depth;
	int i;
	int cmd_buf_size, io_buf_size;
	unsigned long off;

	q->tgt_ops = dev->tgt.ops;
	q->q_depth = depth;
	q->flags = dev->dev_info.flags;
	q->flags |= extra_flags;

	/* Cache the fd in the queue for fast path access */
	q->ublk_fd = dev->fds[0];

	cmd_buf_size = ublk_queue_cmd_buf_sz(q);
	off = UBLKSRV_CMD_BUF_OFFSET + q->q_id * ublk_queue_max_cmd_buf_sz();
	q->io_cmd_buf = mmap(0, cmd_buf_size, PROT_READ,
			     MAP_SHARED | MAP_POPULATE, dev->fds[0], off);
	if (q->io_cmd_buf == MAP_FAILED) {
		ublk_err("ublk dev %d queue %d map io_cmd_buf failed %m\n",
				q->dev->dev_info.dev_id, q->q_id);
		goto fail;
	}

	io_buf_size = dev->dev_info.max_io_buf_bytes;
	for (i = 0; i < q->q_depth; i++) {
		q->ios[i].buf_addr = NULL;
		q->ios[i].flags = UBLKS_IO_NEED_FETCH_RQ | UBLKS_IO_FREE;
		q->ios[i].tag = i;

		if (ublk_queue_no_buf(q))
			continue;

		if (posix_memalign((void **)&q->ios[i].buf_addr,
				   getpagesize(), io_buf_size)) {
			ublk_err("ublk dev %d queue %d io %d posix_memalign failed %m\n",
					dev->dev_info.dev_id, q->q_id, i);
			goto fail;
		}
	}

	return 0;
fail:
	ublk_queue_deinit(q);
	ublk_err("ublk dev %d queue %d failed\n",
			dev->dev_info.dev_id, q->q_id);
	return -ENOMEM;
}
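
/*
 * The command descriptor buffer of queue N is mapped read-only from the ublk
 * char device at offset UBLKSRV_CMD_BUF_OFFSET + N * ublk_queue_max_cmd_buf_sz(),
 * so every queue gets a fixed-size, page-aligned slot regardless of its actual
 * depth.
 */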

static int ublk_thread_init(struct ublk_thread *t, unsigned long long extra_flags)
{
	struct ublk_dev *dev = t->dev;
	unsigned long long flags = dev->dev_info.flags | extra_flags;
	int ring_depth = dev->tgt.sq_depth, cq_depth = dev->tgt.cq_depth;
	int ret;

	ret = ublk_setup_ring(&t->ring, ring_depth, cq_depth,
			      IORING_SETUP_COOP_TASKRUN |
			      IORING_SETUP_SINGLE_ISSUER |
			      IORING_SETUP_DEFER_TASKRUN);
	if (ret < 0) {
		ublk_err("ublk dev %d thread %d setup io_uring failed %d\n",
				dev->dev_info.dev_id, t->idx, ret);
		goto fail;
	}

	if (dev->dev_info.flags & (UBLK_F_SUPPORT_ZERO_COPY | UBLK_F_AUTO_BUF_REG)) {
		unsigned nr_ios = dev->dev_info.queue_depth * dev->dev_info.nr_hw_queues;
		unsigned max_nr_ios_per_thread = nr_ios / dev->nthreads;
		max_nr_ios_per_thread += !!(nr_ios % dev->nthreads);
		ret = io_uring_register_buffers_sparse(
			&t->ring, max_nr_ios_per_thread);
		if (ret) {
			ublk_err("ublk dev %d thread %d register sparse buffers failed %d\n",
					dev->dev_info.dev_id, t->idx, ret);
			goto fail;
		}
	}

	io_uring_register_ring_fd(&t->ring);

	if (flags & UBLKS_Q_NO_UBLK_FIXED_FD) {
		/* Register only the backing files from index 1 on, excluding the ublk control device */
		if (dev->nr_fds > 1) {
			ret = io_uring_register_files(&t->ring, &dev->fds[1], dev->nr_fds - 1);
		} else {
			/* No backing files to register, skip file registration */
			ret = 0;
		}
	} else {
		ret = io_uring_register_files(&t->ring, dev->fds, dev->nr_fds);
	}
	if (ret) {
		ublk_err("ublk dev %d thread %d register files failed %d\n",
				t->dev->dev_info.dev_id, t->idx, ret);
		goto fail;
	}

	return 0;
fail:
	ublk_thread_deinit(t);
	ublk_err("ublk dev %d thread %d init failed\n",
			dev->dev_info.dev_id, t->idx);
	return -ENOMEM;
}

#define WAIT_USEC	100000
#define MAX_WAIT_USEC	(3 * 1000000)
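/*
 * ublk_dev_prep() polls every WAIT_USEC (100ms) for up to MAX_WAIT_USEC (3s),
 * i.e. roughly 30 attempts, while waiting for the ublk char device node to
 * show up after ADD_DEV.
 */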
static int ublk_dev_prep(const struct dev_ctx *ctx, struct ublk_dev *dev)
{
	int dev_id = dev->dev_info.dev_id;
	unsigned int wait_usec = 0;
	int ret = 0, fd = -1;
	char buf[64];

	snprintf(buf, 64, "%s%d", UBLKC_DEV, dev_id);

	while (wait_usec < MAX_WAIT_USEC) {
		fd = open(buf, O_RDWR);
		if (fd >= 0)
			break;
		usleep(WAIT_USEC);
		wait_usec += WAIT_USEC;
	}
	if (fd < 0) {
		ublk_err("can't open %s %s\n", buf, strerror(errno));
		return -1;
	}

	dev->fds[0] = fd;
	if (dev->tgt.ops->init_tgt)
		ret = dev->tgt.ops->init_tgt(ctx, dev);
	if (ret)
		close(dev->fds[0]);
	return ret;
}

static void ublk_dev_unprep(struct ublk_dev *dev)
{
	if (dev->tgt.ops->deinit_tgt)
		dev->tgt.ops->deinit_tgt(dev);
	close(dev->fds[0]);
}

static void ublk_set_auto_buf_reg(const struct ublk_queue *q,
				  struct io_uring_sqe *sqe,
				  unsigned short tag)
{
	struct ublk_auto_buf_reg buf = {};

	if (q->tgt_ops->buf_index)
		buf.index = q->tgt_ops->buf_index(q, tag);
	else
		buf.index = q->ios[tag].buf_index;

	if (ublk_queue_auto_zc_fallback(q))
		buf.flags = UBLK_AUTO_BUF_REG_FALLBACK;

	sqe->addr = ublk_auto_buf_reg_to_sqe_addr(&buf);
}

/* Copy in pieces to test the buffer offset logic */
#define UBLK_USER_COPY_LEN 2048
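/*
 * With UBLK_F_USER_COPY the request data is moved via pread()/pwrite() on the
 * ublk char device at ublk_user_copy_offset(q_id, tag): pread() pulls the
 * payload of a WRITE request into the local buffer, pwrite() pushes READ data
 * back.  Copying in 2048-byte pieces is deliberate: a 4 KiB request is moved
 * in two chunks, which also exercises the offset arithmetic.
 */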

static void ublk_user_copy(const struct ublk_io *io, __u8 match_ublk_op)
{
	const struct ublk_queue *q = ublk_io_to_queue(io);
	const struct ublksrv_io_desc *iod = ublk_get_iod(q, io->tag);
	__u64 off = ublk_user_copy_offset(q->q_id, io->tag);
	__u8 ublk_op = ublksrv_get_op(iod);
	__u32 len = iod->nr_sectors << 9;
	void *addr = io->buf_addr;

	if (ublk_op != match_ublk_op)
		return;

	while (len) {
		__u32 copy_len = min(len, UBLK_USER_COPY_LEN);
		ssize_t copied;

		if (ublk_op == UBLK_IO_OP_WRITE)
			copied = pread(q->ublk_fd, addr, copy_len, off);
		else if (ublk_op == UBLK_IO_OP_READ)
			copied = pwrite(q->ublk_fd, addr, copy_len, off);
		else
			assert(0);
		assert(copied == (ssize_t)copy_len);
		addr += copy_len;
		off += copy_len;
		len -= copy_len;
	}
}

int ublk_queue_io_cmd(struct ublk_thread *t, struct ublk_io *io)
{
	struct ublk_queue *q = ublk_io_to_queue(io);
	struct ublksrv_io_cmd *cmd;
	struct io_uring_sqe *sqe[1];
	unsigned int cmd_op = 0;
	__u64 user_data;

	/* only a freed io can be issued */
	if (!(io->flags & UBLKS_IO_FREE))
		return 0;

	/*
	 * Only issue the command if we need to fetch a request, commit a
	 * completion, or get data.
	 */
	if (!(io->flags &
	    (UBLKS_IO_NEED_FETCH_RQ | UBLKS_IO_NEED_COMMIT_RQ_COMP | UBLKS_IO_NEED_GET_DATA)))
		return 0;

	if (io->flags & UBLKS_IO_NEED_GET_DATA)
		cmd_op = UBLK_U_IO_NEED_GET_DATA;
	else if (io->flags & UBLKS_IO_NEED_COMMIT_RQ_COMP) {
		if (ublk_queue_use_user_copy(q))
			ublk_user_copy(io, UBLK_IO_OP_READ);

		cmd_op = UBLK_U_IO_COMMIT_AND_FETCH_REQ;
	} else if (io->flags & UBLKS_IO_NEED_FETCH_RQ)
		cmd_op = UBLK_U_IO_FETCH_REQ;

	if (io_uring_sq_space_left(&t->ring) < 1)
		io_uring_submit(&t->ring);

	ublk_io_alloc_sqes(t, sqe, 1);
	if (!sqe[0]) {
		ublk_err("%s: run out of sqe. thread %u, tag %d\n",
				__func__, t->idx, io->tag);
		return -1;
	}

	cmd = (struct ublksrv_io_cmd *)ublk_get_sqe_cmd(sqe[0]);

	if (cmd_op == UBLK_U_IO_COMMIT_AND_FETCH_REQ)
		cmd->result = io->result;

	/* These fields are written once and never change */
	ublk_set_sqe_cmd_op(sqe[0], cmd_op);
	sqe[0]->fd = ublk_get_registered_fd(q, 0);	/* dev->fds[0] */
	sqe[0]->opcode = IORING_OP_URING_CMD;
	if (q->flags & UBLKS_Q_NO_UBLK_FIXED_FD)
		sqe[0]->flags = 0;	/* use the raw fd, not a fixed file */
	else
		sqe[0]->flags = IOSQE_FIXED_FILE;
	sqe[0]->rw_flags = 0;
	cmd->tag = io->tag;
	cmd->q_id = q->q_id;
	if (!ublk_queue_no_buf(q) && !ublk_queue_use_user_copy(q))
		cmd->addr = (__u64) (uintptr_t) io->buf_addr;
	else
		cmd->addr = 0;

	if (ublk_queue_use_auto_zc(q))
		ublk_set_auto_buf_reg(q, sqe[0], io->tag);

	user_data = build_user_data(io->tag, _IOC_NR(cmd_op), 0, q->q_id, 0);
	io_uring_sqe_set_data64(sqe[0], user_data);

	io->flags = 0;

	t->cmd_inflight += 1;

	ublk_dbg(UBLK_DBG_IO_CMD, "%s: (thread %u qid %d tag %u cmd_op %u) iof %x stopping %d\n",
			__func__, t->idx, q->q_id, io->tag, cmd_op,
			io->flags, !!(t->state & UBLKS_T_STOPPING));
	return 1;
}
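
/*
 * Typical life cycle of one tag (summary, see ublk_handle_uring_cmd()):
 *
 *   FETCH_REQ -> driver completes the uring_cmd with UBLK_IO_RES_OK
 *             -> the target's ->queue_io() handles the request
 *             -> the completion path sets NEED_COMMIT_RQ_COMP | IO_FREE
 *             -> COMMIT_AND_FETCH_REQ re-arms the tag for the next request.
 *
 * UBLK_IO_RES_NEED_GET_DATA (only with UBLK_F_NEED_GET_DATA) inserts a
 * NEED_GET_DATA round trip before the target sees the request.
 */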

static void ublk_submit_fetch_commands(struct ublk_thread *t)
{
	struct ublk_queue *q;
	struct ublk_io *io;
	int i = 0, j = 0;

	if (t->dev->per_io_tasks) {
		/*
		 * Lexicographically order all the (qid,tag) pairs, with
		 * qid taking priority (so (1,0) > (0,1)). Then make
		 * this thread the daemon for every Nth entry in this
		 * list (N is the number of threads), starting at this
		 * thread's index. This ensures that each queue is
		 * handled by as many ublk server threads as possible,
		 * so that load that is concentrated on one or a few
		 * queues can make use of all ublk server threads.
		 */
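		/*
		 * Example: nr_hw_queues = 2, queue_depth = 4, nthreads = 3
		 * orders the ios as (q0,t0) (q0,t1) (q0,t2) (q0,t3) (q1,t0)
		 * (q1,t1) (q1,t2) (q1,t3); thread 0 then serves (q0,t0),
		 * (q0,t3), (q1,t2), thread 1 serves (q0,t1), (q1,t0),
		 * (q1,t3) and thread 2 serves (q0,t2), (q1,t1).
		 */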
		const struct ublksrv_ctrl_dev_info *dinfo = &t->dev->dev_info;
		int nr_ios = dinfo->nr_hw_queues * dinfo->queue_depth;
		for (i = t->idx; i < nr_ios; i += t->dev->nthreads) {
			int q_id = i / dinfo->queue_depth;
			int tag = i % dinfo->queue_depth;
			q = &t->dev->q[q_id];
			io = &q->ios[tag];
			io->buf_index = j++;
			ublk_queue_io_cmd(t, io);
		}
	} else {
		/*
		 * Service exclusively the queue whose q_id matches our
		 * thread index.
		 */
		struct ublk_queue *q = &t->dev->q[t->idx];
		for (i = 0; i < q->q_depth; i++) {
			io = &q->ios[i];
			io->buf_index = i;
			ublk_queue_io_cmd(t, io);
		}
	}
}

static int ublk_thread_is_idle(struct ublk_thread *t)
{
	return !io_uring_sq_ready(&t->ring) && !t->io_inflight;
}

static int ublk_thread_is_done(struct ublk_thread *t)
{
	return (t->state & UBLKS_T_STOPPING) && ublk_thread_is_idle(t) && !t->cmd_inflight;
}

static inline void ublksrv_handle_tgt_cqe(struct ublk_thread *t,
					  struct ublk_queue *q,
					  struct io_uring_cqe *cqe)
{
	if (cqe->res < 0 && cqe->res != -EAGAIN)
		ublk_err("%s: failed tgt io: res %d qid %u tag %u, cmd_op %u\n",
			__func__, cqe->res, q->q_id,
			user_data_to_tag(cqe->user_data),
			user_data_to_op(cqe->user_data));

	if (q->tgt_ops->tgt_io_done)
		q->tgt_ops->tgt_io_done(t, q, cqe);
}

static void ublk_handle_uring_cmd(struct ublk_thread *t,
				  struct ublk_queue *q,
				  const struct io_uring_cqe *cqe)
{
	int fetch = (cqe->res != UBLK_IO_RES_ABORT) &&
		!(t->state & UBLKS_T_STOPPING);
	unsigned tag = user_data_to_tag(cqe->user_data);
	struct ublk_io *io = &q->ios[tag];

	if (!fetch) {
		t->state |= UBLKS_T_STOPPING;
		io->flags &= ~UBLKS_IO_NEED_FETCH_RQ;
	}

	if (cqe->res == UBLK_IO_RES_OK) {
		assert(tag < q->q_depth);

		if (ublk_queue_use_user_copy(q))
			ublk_user_copy(io, UBLK_IO_OP_WRITE);

		if (q->tgt_ops->queue_io)
			q->tgt_ops->queue_io(t, q, tag);
	} else if (cqe->res == UBLK_IO_RES_NEED_GET_DATA) {
		io->flags |= UBLKS_IO_NEED_GET_DATA | UBLKS_IO_FREE;
		ublk_queue_io_cmd(t, io);
	} else {
		/*
		 * COMMIT_REQ is completed immediately since no fetch is
		 * piggybacked.
		 *
		 * Mark the io as UBLKS_IO_FREE only; it won't be issued
		 * again since we only issue ios with both UBLKS_IO_FREE
		 * and one of the UBLKS_IO_NEED_* flags set.
		 */
		io->flags = UBLKS_IO_FREE;
	}
}

static void ublk_handle_cqe(struct ublk_thread *t,
			    struct io_uring_cqe *cqe, void *data)
{
	struct ublk_dev *dev = t->dev;
	unsigned q_id = user_data_to_q_id(cqe->user_data);
	struct ublk_queue *q = &dev->q[q_id];
	unsigned cmd_op = user_data_to_op(cqe->user_data);

	if (cqe->res < 0 && cqe->res != -ENODEV)
		ublk_err("%s: res %d userdata %llx queue state %x\n", __func__,
				cqe->res, cqe->user_data, q->flags);

	ublk_dbg(UBLK_DBG_IO_CMD, "%s: res %d (qid %d tag %u cmd_op %u target %d/%d) stopping %d\n",
			__func__, cqe->res, q->q_id, user_data_to_tag(cqe->user_data),
			cmd_op, is_target_io(cqe->user_data),
			user_data_to_tgt_data(cqe->user_data),
			(t->state & UBLKS_T_STOPPING));

	/* Don't retrieve io in case of target io */
	if (is_target_io(cqe->user_data)) {
		ublksrv_handle_tgt_cqe(t, q, cqe);
		return;
	}

	t->cmd_inflight--;

	ublk_handle_uring_cmd(t, q, cqe);
}

static int ublk_reap_events_uring(struct ublk_thread *t)
{
	struct io_uring_cqe *cqe;
	unsigned head;
	int count = 0;

	io_uring_for_each_cqe(&t->ring, head, cqe) {
		ublk_handle_cqe(t, cqe, NULL);
		count += 1;
	}
	io_uring_cq_advance(&t->ring, count);

	return count;
}

static int ublk_process_io(struct ublk_thread *t)
{
	int ret, reapped;

	ublk_dbg(UBLK_DBG_THREAD, "dev%d-t%u: to_submit %d inflight cmd %u stopping %d\n",
			t->dev->dev_info.dev_id,
			t->idx, io_uring_sq_ready(&t->ring),
			t->cmd_inflight,
			(t->state & UBLKS_T_STOPPING));

	if (ublk_thread_is_done(t))
		return -ENODEV;

	ret = io_uring_submit_and_wait(&t->ring, 1);
	reapped = ublk_reap_events_uring(t);

	ublk_dbg(UBLK_DBG_THREAD, "submit result %d, reapped %d stop %d idle %d\n",
			ret, reapped, (t->state & UBLKS_T_STOPPING),
			(t->state & UBLKS_T_IDLE));

	return reapped;
}

struct ublk_thread_info {
	struct ublk_dev *dev;
	pthread_t thread;
	unsigned idx;
	sem_t *ready;
	cpu_set_t *affinity;
	unsigned long long extra_flags;
};

static void ublk_thread_set_sched_affinity(const struct ublk_thread_info *info)
{
	if (pthread_setaffinity_np(pthread_self(), sizeof(*info->affinity), info->affinity) < 0)
		ublk_err("ublk dev %u thread %u set affinity failed\n",
				info->dev->dev_info.dev_id, info->idx);
}

static __attribute__((noinline)) int __ublk_io_handler_fn(struct ublk_thread_info *info)
{
	struct ublk_thread t = {
		.dev = info->dev,
		.idx = info->idx,
	};
	int dev_id = info->dev->dev_info.dev_id;
	int ret;

	ret = ublk_thread_init(&t, info->extra_flags);
	if (ret) {
		ublk_err("ublk dev %d thread %u init failed\n",
				dev_id, t.idx);
		return ret;
	}
	sem_post(info->ready);

	ublk_dbg(UBLK_DBG_THREAD, "tid %d: ublk dev %d thread %u started\n",
			gettid(), dev_id, t.idx);

	/* submit all io commands to the ublk driver */
	ublk_submit_fetch_commands(&t);
	do {
		if (ublk_process_io(&t) < 0)
			break;
	} while (1);

	ublk_dbg(UBLK_DBG_THREAD, "tid %d: ublk dev %d thread %d exiting\n",
			gettid(), dev_id, t.idx);
	ublk_thread_deinit(&t);
	return 0;
}

static void *ublk_io_handler_fn(void *data)
{
	struct ublk_thread_info *info = data;

	/*
	 * IO performance is sensitive to queue pthread affinity on NUMA
	 * machines.
	 *
	 * Set the scheduler affinity up front so that memory and pages
	 * allocated afterwards are CPU/NUMA local.
	 */
	if (info->affinity)
		ublk_thread_set_sched_affinity(info);

	__ublk_io_handler_fn(info);

	return NULL;
}

static void ublk_set_parameters(struct ublk_dev *dev)
{
	int ret;

	ret = ublk_ctrl_set_params(dev, &dev->tgt.params);
	if (ret)
		ublk_err("dev %d set basic parameter failed %d\n",
				dev->dev_info.dev_id, ret);
}

static int ublk_send_dev_event(const struct dev_ctx *ctx, struct ublk_dev *dev, int dev_id)
{
	uint64_t id;
	int evtfd = ctx->_evtfd;

	if (evtfd < 0)
		return -EBADF;

	if (dev_id >= 0)
		id = dev_id + 1;
	else
		id = ERROR_EVTFD_DEVID;

	if (dev && ctx->shadow_dev)
		memcpy(&ctx->shadow_dev->q, &dev->q, sizeof(dev->q));

	if (write(evtfd, &id, sizeof(id)) != sizeof(id))
		return -EINVAL;

	close(evtfd);
	shmdt(ctx->shadow_dev);

	return 0;
}
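
/*
 * The daemonized child reports back over the eventfd: dev_id + 1 on success
 * (an eventfd write of zero would not wake the reader) or ERROR_EVTFD_DEVID
 * on failure; cmd_dev_add() in the parent reads the value and subtracts 1 to
 * recover the device id.
 */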


static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev)
{
	const struct ublksrv_ctrl_dev_info *dinfo = &dev->dev_info;
	struct ublk_thread_info *tinfo;
	unsigned long long extra_flags = 0;
	cpu_set_t *affinity_buf;
	void *thread_ret;
	sem_t ready;
	int ret, i;

	ublk_dbg(UBLK_DBG_DEV, "%s enter\n", __func__);

	tinfo = calloc(sizeof(struct ublk_thread_info), dev->nthreads);
	if (!tinfo)
		return -ENOMEM;

	sem_init(&ready, 0, 0);
	ret = ublk_dev_prep(ctx, dev);
	if (ret)
		return ret;

	ret = ublk_ctrl_get_affinity(dev, &affinity_buf);
	if (ret)
		return ret;

	if (ctx->auto_zc_fallback)
		extra_flags = UBLKS_Q_AUTO_BUF_REG_FALLBACK;
	if (ctx->no_ublk_fixed_fd)
		extra_flags |= UBLKS_Q_NO_UBLK_FIXED_FD;

	for (i = 0; i < dinfo->nr_hw_queues; i++) {
		dev->q[i].dev = dev;
		dev->q[i].q_id = i;

		ret = ublk_queue_init(&dev->q[i], extra_flags);
		if (ret) {
			ublk_err("ublk dev %d queue %d init queue failed\n",
					dinfo->dev_id, i);
			goto fail;
		}
	}

	for (i = 0; i < dev->nthreads; i++) {
		tinfo[i].dev = dev;
		tinfo[i].idx = i;
		tinfo[i].ready = &ready;
		tinfo[i].extra_flags = extra_flags;

		/*
		 * If threads are not tied 1:1 to queues, setting thread
		 * affinity based on queue affinity makes little sense.
		 * However, thread CPU affinity has significant impact
		 * on performance, so to compare fairly, we'll still set
		 * thread CPU affinity based on queue affinity where
		 * possible.
		 */
		if (dev->nthreads == dinfo->nr_hw_queues)
			tinfo[i].affinity = &affinity_buf[i];
		pthread_create(&tinfo[i].thread, NULL,
				ublk_io_handler_fn,
				&tinfo[i]);
	}

	for (i = 0; i < dev->nthreads; i++)
		sem_wait(&ready);
	free(affinity_buf);

	/* everything is fine now, start us */
	if (ctx->recovery)
		ret = ublk_ctrl_end_user_recovery(dev, getpid());
	else {
		ublk_set_parameters(dev);
		ret = ublk_ctrl_start_dev(dev, getpid());
	}
	if (ret < 0) {
		ublk_err("%s: ublk_ctrl_start_dev failed: %d\n", __func__, ret);
		/* stop the device so that inflight uring_cmd can be cancelled */
		ublk_ctrl_stop_dev(dev);
		goto fail_start;
	}

	ublk_ctrl_get_info(dev);
	if (ctx->fg)
		ublk_ctrl_dump(dev);
	else
		ublk_send_dev_event(ctx, dev, dev->dev_info.dev_id);
fail_start:
	/* wait until we are terminated */
	for (i = 0; i < dev->nthreads; i++)
		pthread_join(tinfo[i].thread, &thread_ret);
	free(tinfo);
fail:
	for (i = 0; i < dinfo->nr_hw_queues; i++)
		ublk_queue_deinit(&dev->q[i]);
	ublk_dev_unprep(dev);
	ublk_dbg(UBLK_DBG_DEV, "%s exit\n", __func__);

	return ret;
}

static int wait_ublk_dev(const char *path, int evt_mask, unsigned timeout)
{
#define EV_SIZE (sizeof(struct inotify_event))
#define EV_BUF_LEN (128 * (EV_SIZE + 16))
	struct pollfd pfd;
	int fd, wd;
	int ret = -EINVAL;
	const char *dev_name = basename(path);

	fd = inotify_init();
	if (fd < 0) {
		ublk_dbg(UBLK_DBG_DEV, "%s: inotify init failed\n", __func__);
		return fd;
	}

	wd = inotify_add_watch(fd, "/dev", evt_mask);
	if (wd == -1) {
		ublk_dbg(UBLK_DBG_DEV, "%s: add watch for /dev failed\n", __func__);
		goto fail;
	}

	pfd.fd = fd;
	pfd.events = POLL_IN;
	while (1) {
		int i = 0;
		char buffer[EV_BUF_LEN];
		ret = poll(&pfd, 1, 1000 * timeout);

		if (ret == -1) {
			ublk_err("%s: poll inotify failed: %d\n", __func__, ret);
			goto rm_watch;
		} else if (ret == 0) {
			ublk_err("%s: poll inotify timeout\n", __func__);
			ret = -ETIMEDOUT;
			goto rm_watch;
		}

		ret = read(fd, buffer, EV_BUF_LEN);
		if (ret < 0) {
			ublk_err("%s: read inotify fd failed\n", __func__);
			goto rm_watch;
		}

		while (i < ret) {
			struct inotify_event *event = (struct inotify_event *)&buffer[i];

			ublk_dbg(UBLK_DBG_DEV, "%s: inotify event %x %s\n",
					__func__, event->mask, event->name);
			if (event->mask & evt_mask) {
				if (!strcmp(event->name, dev_name)) {
					ret = 0;
					goto rm_watch;
				}
			}
			i += EV_SIZE + event->len;
		}
	}
rm_watch:
	inotify_rm_watch(fd, wd);
fail:
	close(fd);
	return ret;
}

static int ublk_stop_io_daemon(const struct ublk_dev *dev)
{
	int daemon_pid = dev->dev_info.ublksrv_pid;
	int dev_id = dev->dev_info.dev_id;
	char ublkc[64];
	int ret = 0;

	if (daemon_pid < 0)
		return 0;

	/* the daemon may be dead already */
	if (kill(daemon_pid, 0) < 0)
		goto wait;

	snprintf(ublkc, sizeof(ublkc), "/dev/%s%d", "ublkc", dev_id);

	/* the ublk char device may be gone already */
	if (access(ublkc, F_OK) != 0)
		goto wait;

	/* Wait until the ublk char device is closed, which happens when the daemon shuts down */
	ret = wait_ublk_dev(ublkc, IN_CLOSE, 10);
	/* double check, since it may have been closed before inotify was started */
	if (ret == -ETIMEDOUT)
		ret = kill(daemon_pid, 0) < 0;
wait:
	waitpid(daemon_pid, NULL, 0);
	ublk_dbg(UBLK_DBG_DEV, "%s: pid %d dev_id %d ret %d\n",
			__func__, daemon_pid, dev_id, ret);

	return ret;
}

static int __cmd_dev_add(const struct dev_ctx *ctx)
{
	unsigned nthreads = ctx->nthreads;
	unsigned nr_queues = ctx->nr_hw_queues;
	const char *tgt_type = ctx->tgt_type;
	unsigned depth = ctx->queue_depth;
	__u64 features;
	const struct ublk_tgt_ops *ops;
	struct ublksrv_ctrl_dev_info *info;
	struct ublk_dev *dev = NULL;
	int dev_id = ctx->dev_id;
	int ret, i;

	ops = ublk_find_tgt(tgt_type);
	if (!ops) {
		ublk_err("%s: no such tgt type, type %s\n",
				__func__, tgt_type);
		ret = -ENODEV;
		goto fail;
	}

	if (nr_queues > UBLK_MAX_QUEUES || depth > UBLK_QUEUE_DEPTH) {
		ublk_err("%s: invalid nr_queues or depth, queues %u depth %u\n",
				__func__, nr_queues, depth);
		ret = -EINVAL;
		goto fail;
	}

	/* default to 1:1 threads:queues if nthreads is unspecified */
	if (!nthreads)
		nthreads = nr_queues;

	if (nthreads > UBLK_MAX_THREADS) {
		ublk_err("%s: %u is too many threads (max %u)\n",
				__func__, nthreads, UBLK_MAX_THREADS);
		ret = -EINVAL;
		goto fail;
	}

	if (nthreads != nr_queues && !ctx->per_io_tasks) {
		ublk_err("%s: number of threads %u must equal number of queues %u "
			 "when not using per_io_tasks\n",
			 __func__, nthreads, nr_queues);
		ret = -EINVAL;
		goto fail;
	}

	dev = ublk_ctrl_init();
	if (!dev) {
		ublk_err("%s: can't alloc dev id %d, type %s\n",
				__func__, dev_id, tgt_type);
		ret = -ENOMEM;
		goto fail;
	}

	/* fail if the kernel doesn't support GET_FEATURES */
	ret = ublk_ctrl_get_features(dev, &features);
	if (ret < 0) {
		ret = -EINVAL;
		goto fail;
	}

	if (!(features & UBLK_F_CMD_IOCTL_ENCODE)) {
		ret = -ENOTSUP;
		goto fail;
	}
1244
1245 info = &dev->dev_info;
1246 info->dev_id = ctx->dev_id;
1247 info->nr_hw_queues = nr_queues;
1248 info->queue_depth = depth;
1249 info->flags = ctx->flags;
1250 if ((features & UBLK_F_QUIESCE) &&
1251 (info->flags & UBLK_F_USER_RECOVERY))
1252 info->flags |= UBLK_F_QUIESCE;
1253 dev->nthreads = nthreads;
1254 dev->per_io_tasks = ctx->per_io_tasks;
1255 dev->tgt.ops = ops;
1256 dev->tgt.sq_depth = depth;
1257 dev->tgt.cq_depth = depth;
1258
1259 for (i = 0; i < MAX_BACK_FILES; i++) {
1260 if (ctx->files[i]) {
1261 strcpy(dev->tgt.backing_file[i], ctx->files[i]);
1262 dev->tgt.nr_backing_files++;
1263 }
1264 }
1265
1266 if (ctx->recovery)
1267 ret = ublk_ctrl_start_user_recovery(dev);
1268 else
1269 ret = ublk_ctrl_add_dev(dev);
1270 if (ret < 0) {
1271 ublk_err("%s: can't add dev id %d, type %s ret %d\n",
1272 __func__, dev_id, tgt_type, ret);
1273 goto fail;
1274 }
1275
1276 ret = ublk_start_daemon(ctx, dev);
1277 ublk_dbg(UBLK_DBG_DEV, "%s: daemon exit %d\n", __func__, ret);
1278 if (ret < 0)
1279 ublk_ctrl_del_dev(dev);
1280
1281 fail:
1282 if (ret < 0)
1283 ublk_send_dev_event(ctx, dev, -1);
1284 if (dev)
1285 ublk_ctrl_deinit(dev);
1286 return ret;
1287 }

static int __cmd_dev_list(struct dev_ctx *ctx);

static int cmd_dev_add(struct dev_ctx *ctx)
{
	int res;

	if (ctx->fg)
		goto run;

	ctx->_shmid = shmget(IPC_PRIVATE, sizeof(struct ublk_dev), IPC_CREAT | 0666);
	if (ctx->_shmid < 0) {
		ublk_err("%s: failed to shmget %s\n", __func__, strerror(errno));
		exit(-1);
	}
	ctx->shadow_dev = (struct ublk_dev *)shmat(ctx->_shmid, NULL, 0);
	if (ctx->shadow_dev == (struct ublk_dev *)-1) {
		ublk_err("%s: failed to shmat %s\n", __func__, strerror(errno));
		exit(-1);
	}
	ctx->_evtfd = eventfd(0, 0);
	if (ctx->_evtfd < 0) {
		ublk_err("%s: failed to create eventfd %s\n", __func__, strerror(errno));
		exit(-1);
	}

	res = fork();
	if (res == 0) {
		int res2;

		setsid();
		res2 = fork();
		if (res2 == 0) {
			/* prepare for detaching */
			close(STDIN_FILENO);
			close(STDOUT_FILENO);
			close(STDERR_FILENO);
run:
			res = __cmd_dev_add(ctx);
			return res;
		} else {
			/* detached from the foreground task */
			exit(EXIT_SUCCESS);
		}
	} else if (res > 0) {
		uint64_t id;
		int exit_code = EXIT_FAILURE;

		res = read(ctx->_evtfd, &id, sizeof(id));
		close(ctx->_evtfd);
		if (res == sizeof(id) && id != ERROR_EVTFD_DEVID) {
			ctx->dev_id = id - 1;
			if (__cmd_dev_list(ctx) >= 0)
				exit_code = EXIT_SUCCESS;
		}
		shmdt(ctx->shadow_dev);
		shmctl(ctx->_shmid, IPC_RMID, NULL);
		/* wait for the child and detach from it */
		wait(NULL);
		if (exit_code == EXIT_FAILURE)
			ublk_err("%s: command failed\n", __func__);
		exit(exit_code);
	} else {
		exit(EXIT_FAILURE);
	}
}
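
/*
 * cmd_dev_add() daemonizes with the classic double fork: the first child
 * calls setsid() and forks again, the grandchild (now detached from the
 * controlling terminal) runs __cmd_dev_add(), while the parent blocks on the
 * eventfd until the daemon reports the allocated device id (or
 * ERROR_EVTFD_DEVID) and then prints the device via __cmd_dev_list().
 */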

static int __cmd_dev_del(struct dev_ctx *ctx)
{
	int number = ctx->dev_id;
	struct ublk_dev *dev;
	int ret;

	dev = ublk_ctrl_init();
	if (!dev)
		return -ENODEV;

	dev->dev_info.dev_id = number;

	ret = ublk_ctrl_get_info(dev);
	if (ret < 0)
		goto fail;

	ret = ublk_ctrl_stop_dev(dev);
	if (ret < 0)
		ublk_err("%s: stop dev %d failed ret %d\n", __func__, number, ret);

	ret = ublk_stop_io_daemon(dev);
	if (ret < 0)
		ublk_err("%s: stop daemon id %d dev %d, ret %d\n",
				__func__, dev->dev_info.ublksrv_pid, number, ret);
	ublk_ctrl_del_dev(dev);
fail:
	ublk_ctrl_deinit(dev);

	return (ret >= 0) ? 0 : ret;
}

static int cmd_dev_del(struct dev_ctx *ctx)
{
	int i;

	if (ctx->dev_id >= 0 || !ctx->all)
		return __cmd_dev_del(ctx);

	for (i = 0; i < 255; i++) {
		ctx->dev_id = i;
		__cmd_dev_del(ctx);
	}
	return 0;
}

static int __cmd_dev_list(struct dev_ctx *ctx)
{
	struct ublk_dev *dev = ublk_ctrl_init();
	int ret;

	if (!dev)
		return -ENODEV;

	dev->dev_info.dev_id = ctx->dev_id;

	ret = ublk_ctrl_get_info(dev);
	if (ret < 0) {
		if (ctx->logging)
			ublk_err("%s: can't get dev info from %d: %d\n",
					__func__, ctx->dev_id, ret);
	} else {
		if (ctx->shadow_dev)
			memcpy(&dev->q, ctx->shadow_dev->q, sizeof(dev->q));

		ublk_ctrl_dump(dev);
	}

	ublk_ctrl_deinit(dev);

	return ret;
}

static int cmd_dev_list(struct dev_ctx *ctx)
{
	int i;

	if (ctx->dev_id >= 0 || !ctx->all)
		return __cmd_dev_list(ctx);

	ctx->logging = false;
	for (i = 0; i < 255; i++) {
		ctx->dev_id = i;
		__cmd_dev_list(ctx);
	}
	return 0;
}

static int cmd_dev_get_features(void)
{
#define const_ilog2(x) (63 - __builtin_clzll(x))
#define FEAT_NAME(f) [const_ilog2(f)] = #f
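	/*
	 * Example: const_ilog2(0x80) = 7, so a feature whose value is 1 << 7
	 * lands at index 7; the table below is indexed by feature bit number.
	 */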
	static const char *feat_map[] = {
		FEAT_NAME(UBLK_F_SUPPORT_ZERO_COPY),
		FEAT_NAME(UBLK_F_URING_CMD_COMP_IN_TASK),
		FEAT_NAME(UBLK_F_NEED_GET_DATA),
		FEAT_NAME(UBLK_F_USER_RECOVERY),
		FEAT_NAME(UBLK_F_USER_RECOVERY_REISSUE),
		FEAT_NAME(UBLK_F_UNPRIVILEGED_DEV),
		FEAT_NAME(UBLK_F_CMD_IOCTL_ENCODE),
		FEAT_NAME(UBLK_F_USER_COPY),
		FEAT_NAME(UBLK_F_ZONED),
		FEAT_NAME(UBLK_F_USER_RECOVERY_FAIL_IO),
		FEAT_NAME(UBLK_F_UPDATE_SIZE),
		FEAT_NAME(UBLK_F_AUTO_BUF_REG),
		FEAT_NAME(UBLK_F_QUIESCE),
		FEAT_NAME(UBLK_F_PER_IO_DAEMON),
		FEAT_NAME(UBLK_F_BUF_REG_OFF_DAEMON),
	};
	struct ublk_dev *dev;
	__u64 features = 0;
	int ret;

	dev = ublk_ctrl_init();
	if (!dev) {
		fprintf(stderr, "ublk_ctrl_init failed\n");
		return -EOPNOTSUPP;
	}

	ret = ublk_ctrl_get_features(dev, &features);
	if (!ret) {
		int i;

		printf("ublk_drv features: 0x%llx\n", features);

		for (i = 0; i < sizeof(features) * 8; i++) {
			const char *feat;

			if (!((1ULL << i) & features))
				continue;
			if (i < ARRAY_SIZE(feat_map) && feat_map[i])
				feat = feat_map[i];
			else
				feat = "unknown";
			printf("0x%-16llx: %s\n", 1ULL << i, feat);
		}
	}

	return ret;
}

static int cmd_dev_update_size(struct dev_ctx *ctx)
{
	struct ublk_dev *dev = ublk_ctrl_init();
	struct ublk_params p;
	int ret = -EINVAL;

	if (!dev)
		return -ENODEV;

	if (ctx->dev_id < 0) {
		fprintf(stderr, "device id isn't provided\n");
		goto out;
	}

	dev->dev_info.dev_id = ctx->dev_id;
	ret = ublk_ctrl_get_params(dev, &p);
	if (ret < 0) {
		ublk_err("failed to get params %d %s\n", ret, strerror(-ret));
		goto out;
	}

	if (ctx->size & ((1 << p.basic.logical_bs_shift) - 1)) {
		ublk_err("size isn't aligned with logical block size\n");
		ret = -EINVAL;
		goto out;
	}

	ret = ublk_ctrl_update_size(dev, ctx->size >> 9);
out:
	ublk_ctrl_deinit(dev);
	return ret;
}

static int cmd_dev_quiesce(struct dev_ctx *ctx)
{
	struct ublk_dev *dev = ublk_ctrl_init();
	int ret = -EINVAL;

	if (!dev)
		return -ENODEV;

	if (ctx->dev_id < 0) {
		fprintf(stderr, "device id isn't provided for quiesce\n");
		goto out;
	}
	dev->dev_info.dev_id = ctx->dev_id;
	ret = ublk_ctrl_quiesce_dev(dev, 10000);

out:
	ublk_ctrl_deinit(dev);
	return ret;
}

static void __cmd_create_help(char *exe, bool recovery)
{
	int i;

	printf("%s %s -t [null|loop|stripe|fault_inject] [-q nr_queues] [-d depth] [-n dev_id]\n",
			exe, recovery ? "recover" : "add");
	printf("\t[--foreground] [--quiet] [-z] [--auto_zc] [--auto_zc_fallback] [--debug_mask mask] [-r 0|1] [-g] [-u]\n");
	printf("\t[-e 0|1] [-i 0|1] [--no_ublk_fixed_fd]\n");
	printf("\t[--nthreads threads] [--per_io_tasks]\n");
	printf("\t[target options] [backfile1] [backfile2] ...\n");
	printf("\tdefault: nr_queues=2(max 32), depth=128(max 1024), dev_id=-1(auto allocation)\n");
	printf("\tdefault: nthreads=nr_queues\n");

	for (i = 0; i < ARRAY_SIZE(tgt_ops_list); i++) {
		const struct ublk_tgt_ops *ops = tgt_ops_list[i];

		if (ops->usage)
			ops->usage(ops);
	}
}

static void cmd_add_help(char *exe)
{
	__cmd_create_help(exe, false);
	printf("\n");
}

static void cmd_recover_help(char *exe)
{
	__cmd_create_help(exe, true);
	printf("\tPlease provide the exact command line used to create this device, with the real dev_id\n");
	printf("\n");
}

static int cmd_dev_help(char *exe)
{
	cmd_add_help(exe);
	cmd_recover_help(exe);

	printf("%s del [-n dev_id] -a\n", exe);
	printf("\t -a delete all devices, -n delete the specified device\n\n");
	printf("%s list [-n dev_id] -a\n", exe);
	printf("\t -a list all devices, -n list the specified device, default -a\n\n");
	printf("%s features\n", exe);
	printf("%s update_size -n dev_id -s|--size size_in_bytes\n", exe);
	printf("%s quiesce -n dev_id\n", exe);
	return 0;
}

int main(int argc, char *argv[])
{
	static const struct option longopts[] = {
		{ "all", 0, NULL, 'a' },
		{ "type", 1, NULL, 't' },
		{ "number", 1, NULL, 'n' },
		{ "queues", 1, NULL, 'q' },
		{ "depth", 1, NULL, 'd' },
		{ "debug_mask", 1, NULL, 0 },
		{ "quiet", 0, NULL, 0 },
		{ "zero_copy", 0, NULL, 'z' },
		{ "foreground", 0, NULL, 0 },
		{ "recovery", 1, NULL, 'r' },
		{ "recovery_fail_io", 1, NULL, 'e' },
		{ "recovery_reissue", 1, NULL, 'i' },
		{ "get_data", 1, NULL, 'g' },
		{ "auto_zc", 0, NULL, 0 },
		{ "auto_zc_fallback", 0, NULL, 0 },
		{ "user_copy", 0, NULL, 'u' },
		{ "size", 1, NULL, 's' },
		{ "nthreads", 1, NULL, 0 },
		{ "per_io_tasks", 0, NULL, 0 },
		{ "no_ublk_fixed_fd", 0, NULL, 0 },
		{ 0, 0, 0, 0 }
	};
	const struct ublk_tgt_ops *ops = NULL;
	int option_idx, opt;
	const char *cmd = argv[1];
	struct dev_ctx ctx = {
		._evtfd = -1,
		.queue_depth = 128,
		.nr_hw_queues = 2,
		.dev_id = -1,
		.tgt_type = "unknown",
	};
	int ret = -EINVAL, i;
	int tgt_argc = 1;
	char *tgt_argv[MAX_NR_TGT_ARG] = { NULL };
	int value;

	if (argc == 1)
		return ret;

	opterr = 0;
	optind = 2;
	while ((opt = getopt_long(argc, argv, "t:n:d:q:r:e:i:s:gazu",
				  longopts, &option_idx)) != -1) {
		switch (opt) {
		case 'a':
			ctx.all = 1;
			break;
		case 'n':
			ctx.dev_id = strtol(optarg, NULL, 10);
			break;
		case 't':
			if (strlen(optarg) < sizeof(ctx.tgt_type))
				strcpy(ctx.tgt_type, optarg);
			break;
		case 'q':
			ctx.nr_hw_queues = strtol(optarg, NULL, 10);
			break;
		case 'd':
			ctx.queue_depth = strtol(optarg, NULL, 10);
			break;
		case 'z':
			ctx.flags |= UBLK_F_SUPPORT_ZERO_COPY;
			break;
		case 'r':
			value = strtol(optarg, NULL, 10);
			if (value)
				ctx.flags |= UBLK_F_USER_RECOVERY;
			break;
		case 'e':
			value = strtol(optarg, NULL, 10);
			if (value)
				ctx.flags |= UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_FAIL_IO;
			break;
		case 'i':
			value = strtol(optarg, NULL, 10);
			if (value)
				ctx.flags |= UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_REISSUE;
			break;
		case 'g':
			ctx.flags |= UBLK_F_NEED_GET_DATA;
			break;
		case 'u':
			ctx.flags |= UBLK_F_USER_COPY;
			break;
		case 's':
			ctx.size = strtoull(optarg, NULL, 10);
			break;
		case 0:
			if (!strcmp(longopts[option_idx].name, "debug_mask"))
				ublk_dbg_mask = strtol(optarg, NULL, 16);
			if (!strcmp(longopts[option_idx].name, "quiet"))
				ublk_dbg_mask = 0;
			if (!strcmp(longopts[option_idx].name, "foreground"))
				ctx.fg = 1;
			if (!strcmp(longopts[option_idx].name, "auto_zc"))
				ctx.flags |= UBLK_F_AUTO_BUF_REG;
			if (!strcmp(longopts[option_idx].name, "auto_zc_fallback"))
				ctx.auto_zc_fallback = 1;
			if (!strcmp(longopts[option_idx].name, "nthreads"))
				ctx.nthreads = strtol(optarg, NULL, 10);
			if (!strcmp(longopts[option_idx].name, "per_io_tasks"))
				ctx.per_io_tasks = 1;
			if (!strcmp(longopts[option_idx].name, "no_ublk_fixed_fd"))
				ctx.no_ublk_fixed_fd = 1;
			break;
		case '?':
			/*
			 * Every target option must have an argument.
			 */
			if (argv[optind][0] == '-' || argv[optind - 1][0] != '-') {
				fprintf(stderr, "every target option requires an argument: %s %s\n",
						argv[optind - 1], argv[optind]);
				exit(EXIT_FAILURE);
			}

			if (tgt_argc < (MAX_NR_TGT_ARG - 1) / 2) {
				tgt_argv[tgt_argc++] = argv[optind - 1];
				tgt_argv[tgt_argc++] = argv[optind];
			} else {
				fprintf(stderr, "too many target options\n");
				exit(EXIT_FAILURE);
			}
			optind += 1;
			break;
		}
	}

	/* auto_zc_fallback depends on F_AUTO_BUF_REG & F_SUPPORT_ZERO_COPY */
	if (ctx.auto_zc_fallback &&
	    !((ctx.flags & UBLK_F_AUTO_BUF_REG) &&
	      (ctx.flags & UBLK_F_SUPPORT_ZERO_COPY))) {
		ublk_err("%s: auto_zc_fallback is set but F_AUTO_BUF_REG and "
			 "F_SUPPORT_ZERO_COPY are not both enabled\n",
			 __func__);
		return -EINVAL;
	}

	if (!!(ctx.flags & UBLK_F_NEED_GET_DATA) +
	    !!(ctx.flags & UBLK_F_USER_COPY) +
	    (ctx.flags & UBLK_F_SUPPORT_ZERO_COPY && !ctx.auto_zc_fallback) +
	    (ctx.flags & UBLK_F_AUTO_BUF_REG && !ctx.auto_zc_fallback) +
	    ctx.auto_zc_fallback > 1) {
		fprintf(stderr, "too many data copy modes specified\n");
		return -EINVAL;
	}

	i = optind;
	while (i < argc && ctx.nr_files < MAX_BACK_FILES) {
		ctx.files[ctx.nr_files++] = argv[i++];
	}

	ops = ublk_find_tgt(ctx.tgt_type);
	if (ops && ops->parse_cmd_line) {
		optind = 0;

		tgt_argv[0] = ctx.tgt_type;
		ops->parse_cmd_line(&ctx, tgt_argc, tgt_argv);
	}

	if (!strcmp(cmd, "add"))
		ret = cmd_dev_add(&ctx);
	else if (!strcmp(cmd, "recover")) {
		if (ctx.dev_id < 0) {
			fprintf(stderr, "device id isn't provided for recovery\n");
			ret = -EINVAL;
		} else {
			ctx.recovery = 1;
			ret = cmd_dev_add(&ctx);
		}
	} else if (!strcmp(cmd, "del"))
		ret = cmd_dev_del(&ctx);
	else if (!strcmp(cmd, "list")) {
		ctx.all = 1;
		ret = cmd_dev_list(&ctx);
	} else if (!strcmp(cmd, "help"))
		ret = cmd_dev_help(argv[0]);
	else if (!strcmp(cmd, "features"))
		ret = cmd_dev_get_features();
	else if (!strcmp(cmd, "update_size"))
		ret = cmd_dev_update_size(&ctx);
	else if (!strcmp(cmd, "quiesce"))
		ret = cmd_dev_quiesce(&ctx);
	else
		cmd_dev_help(argv[0]);

	return ret;
}