1 /* SPDX-License-Identifier: MIT */
2 /*
3 * Description: uring_cmd based ublk
4 */
5
6 #include <linux/fs.h>
7 #include <sys/un.h>
8 #include "kublk.h"
9
10 #define MAX_NR_TGT_ARG 64
11
12 unsigned int ublk_dbg_mask = UBLK_LOG;
13 static const struct ublk_tgt_ops *tgt_ops_list[] = {
14 &null_tgt_ops,
15 &loop_tgt_ops,
16 &stripe_tgt_ops,
17 &fault_inject_tgt_ops,
18 };
19
20 static const struct ublk_tgt_ops *ublk_find_tgt(const char *name)
21 {
22 int i;
23
24 if (name == NULL)
25 return NULL;
26
27 for (i = 0; i < ARRAY_SIZE(tgt_ops_list); i++)
28 if (strcmp(tgt_ops_list[i]->name, name) == 0)
29 return tgt_ops_list[i];
30 return NULL;
31 }
32
33 static inline int ublk_setup_ring(struct io_uring *r, int depth,
34 int cq_depth, unsigned flags)
35 {
36 struct io_uring_params p;
37
38 memset(&p, 0, sizeof(p));
39 p.flags = flags | IORING_SETUP_CQSIZE;
40 p.cq_entries = cq_depth;
41
42 return io_uring_queue_init_params(depth, r, &p);
43 }
44
45 static void ublk_ctrl_init_cmd(struct ublk_dev *dev,
46 struct io_uring_sqe *sqe,
47 struct ublk_ctrl_cmd_data *data)
48 {
49 struct ublksrv_ctrl_dev_info *info = &dev->dev_info;
50 struct ublksrv_ctrl_cmd *cmd = (struct ublksrv_ctrl_cmd *)ublk_get_sqe_cmd(sqe);
51
52 sqe->fd = dev->ctrl_fd;
53 sqe->opcode = IORING_OP_URING_CMD;
54 sqe->ioprio = 0;
55
56 if (data->flags & CTRL_CMD_HAS_BUF) {
57 cmd->addr = data->addr;
58 cmd->len = data->len;
59 }
60
61 if (data->flags & CTRL_CMD_HAS_DATA)
62 cmd->data[0] = data->data[0];
63
64 cmd->dev_id = info->dev_id;
65 cmd->queue_id = -1;
66
67 ublk_set_sqe_cmd_op(sqe, data->cmd_op);
68
69 io_uring_sqe_set_data(sqe, cmd);
70 }
71
72 static int __ublk_ctrl_cmd(struct ublk_dev *dev,
73 struct ublk_ctrl_cmd_data *data)
74 {
75 struct io_uring_sqe *sqe;
76 struct io_uring_cqe *cqe;
77 int ret = -EINVAL;
78
79 sqe = io_uring_get_sqe(&dev->ring);
80 if (!sqe) {
81 ublk_err("%s: can't get sqe ret %d\n", __func__, ret);
82 return ret;
83 }
84
85 ublk_ctrl_init_cmd(dev, sqe, data);
86
87 ret = io_uring_submit(&dev->ring);
88 if (ret < 0) {
89 ublk_err("uring submit ret %d\n", ret);
90 return ret;
91 }
92
93 ret = io_uring_wait_cqe(&dev->ring, &cqe);
94 if (ret < 0) {
95 ublk_err("wait cqe: %s\n", strerror(-ret));
96 return ret;
97 }
98 io_uring_cqe_seen(&dev->ring, cqe);
99
100 return cqe->res;
101 }
102
103 static int ublk_ctrl_stop_dev(struct ublk_dev *dev)
104 {
105 struct ublk_ctrl_cmd_data data = {
106 .cmd_op = UBLK_U_CMD_STOP_DEV,
107 };
108
109 return __ublk_ctrl_cmd(dev, &data);
110 }
111
112 static int ublk_ctrl_try_stop_dev(struct ublk_dev *dev)
113 {
114 struct ublk_ctrl_cmd_data data = {
115 .cmd_op = UBLK_U_CMD_TRY_STOP_DEV,
116 };
117
118 return __ublk_ctrl_cmd(dev, &data);
119 }
120
121 static int ublk_ctrl_start_dev(struct ublk_dev *dev,
122 int daemon_pid)
123 {
124 struct ublk_ctrl_cmd_data data = {
125 .cmd_op = UBLK_U_CMD_START_DEV,
126 .flags = CTRL_CMD_HAS_DATA,
127 };
128
129 dev->dev_info.ublksrv_pid = data.data[0] = daemon_pid;
130
131 return __ublk_ctrl_cmd(dev, &data);
132 }
133
134 static int ublk_ctrl_start_user_recovery(struct ublk_dev *dev)
135 {
136 struct ublk_ctrl_cmd_data data = {
137 .cmd_op = UBLK_U_CMD_START_USER_RECOVERY,
138 };
139
140 return __ublk_ctrl_cmd(dev, &data);
141 }
142
143 static int ublk_ctrl_end_user_recovery(struct ublk_dev *dev, int daemon_pid)
144 {
145 struct ublk_ctrl_cmd_data data = {
146 .cmd_op = UBLK_U_CMD_END_USER_RECOVERY,
147 .flags = CTRL_CMD_HAS_DATA,
148 };
149
150 dev->dev_info.ublksrv_pid = data.data[0] = daemon_pid;
151
152 return __ublk_ctrl_cmd(dev, &data);
153 }
154
155 static int ublk_ctrl_add_dev(struct ublk_dev *dev)
156 {
157 struct ublk_ctrl_cmd_data data = {
158 .cmd_op = UBLK_U_CMD_ADD_DEV,
159 .flags = CTRL_CMD_HAS_BUF,
160 .addr = (__u64) (uintptr_t) &dev->dev_info,
161 .len = sizeof(struct ublksrv_ctrl_dev_info),
162 };
163
164 return __ublk_ctrl_cmd(dev, &data);
165 }
166
167 static int ublk_ctrl_del_dev(struct ublk_dev *dev)
168 {
169 struct ublk_ctrl_cmd_data data = {
170 .cmd_op = UBLK_U_CMD_DEL_DEV,
171 .flags = 0,
172 };
173
174 return __ublk_ctrl_cmd(dev, &data);
175 }
176
177 static int ublk_ctrl_get_info(struct ublk_dev *dev)
178 {
179 struct ublk_ctrl_cmd_data data = {
180 .cmd_op = UBLK_U_CMD_GET_DEV_INFO,
181 .flags = CTRL_CMD_HAS_BUF,
182 .addr = (__u64) (uintptr_t) &dev->dev_info,
183 .len = sizeof(struct ublksrv_ctrl_dev_info),
184 };
185
186 return __ublk_ctrl_cmd(dev, &data);
187 }
188
189 static int ublk_ctrl_set_params(struct ublk_dev *dev,
190 struct ublk_params *params)
191 {
192 struct ublk_ctrl_cmd_data data = {
193 .cmd_op = UBLK_U_CMD_SET_PARAMS,
194 .flags = CTRL_CMD_HAS_BUF,
195 .addr = (__u64) (uintptr_t) params,
196 .len = sizeof(*params),
197 };
198 params->len = sizeof(*params);
199 return __ublk_ctrl_cmd(dev, &data);
200 }
201
202 static int ublk_ctrl_get_params(struct ublk_dev *dev,
203 struct ublk_params *params)
204 {
205 struct ublk_ctrl_cmd_data data = {
206 .cmd_op = UBLK_U_CMD_GET_PARAMS,
207 .flags = CTRL_CMD_HAS_BUF,
208 .addr = (__u64)params,
209 .len = sizeof(*params),
210 };
211
212 params->len = sizeof(*params);
213
214 return __ublk_ctrl_cmd(dev, &data);
215 }
216
217 static int ublk_ctrl_get_features(struct ublk_dev *dev,
218 __u64 *features)
219 {
220 struct ublk_ctrl_cmd_data data = {
221 .cmd_op = UBLK_U_CMD_GET_FEATURES,
222 .flags = CTRL_CMD_HAS_BUF,
223 .addr = (__u64) (uintptr_t) features,
224 .len = sizeof(*features),
225 };
226
227 return __ublk_ctrl_cmd(dev, &data);
228 }
229
230 static int ublk_ctrl_update_size(struct ublk_dev *dev,
231 __u64 nr_sects)
232 {
233 struct ublk_ctrl_cmd_data data = {
234 .cmd_op = UBLK_U_CMD_UPDATE_SIZE,
235 .flags = CTRL_CMD_HAS_DATA,
236 };
237
238 data.data[0] = nr_sects;
239 return __ublk_ctrl_cmd(dev, &data);
240 }
241
242 static int ublk_ctrl_quiesce_dev(struct ublk_dev *dev,
243 unsigned int timeout_ms)
244 {
245 struct ublk_ctrl_cmd_data data = {
246 .cmd_op = UBLK_U_CMD_QUIESCE_DEV,
247 .flags = CTRL_CMD_HAS_DATA,
248 };
249
250 data.data[0] = timeout_ms;
251 return __ublk_ctrl_cmd(dev, &data);
252 }
253
254 static const char *ublk_dev_state_desc(struct ublk_dev *dev)
255 {
256 switch (dev->dev_info.state) {
257 case UBLK_S_DEV_DEAD:
258 return "DEAD";
259 case UBLK_S_DEV_LIVE:
260 return "LIVE";
261 case UBLK_S_DEV_QUIESCED:
262 return "QUIESCED";
263 default:
264 return "UNKNOWN";
265 };
266 }
267
268 static void ublk_print_cpu_set(const cpu_set_t *set, char *buf, unsigned len)
269 {
270 unsigned done = 0;
271 int i;
272
273 for (i = 0; i < CPU_SETSIZE; i++) {
274 if (CPU_ISSET(i, set))
275 done += snprintf(&buf[done], len - done, "%d ", i);
276 }
277 }
278
279 static void ublk_adjust_affinity(cpu_set_t *set)
280 {
281 int j, updated = 0;
282
283 /*
284 * Just keep the 1st CPU now.
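 * (e.g. an affinity mask covering CPUs 0-3 is trimmed down to CPU 0 only).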
285 *
286 * In future, auto affinity selection can be tried.
287 */
288 for (j = 0; j < CPU_SETSIZE; j++) {
289 if (CPU_ISSET(j, set)) {
290 if (!updated) {
291 updated = 1;
292 continue;
293 }
294 CPU_CLR(j, set);
295 }
296 }
297 }
298
299 /* Caller must free the allocated buffer */
300 static int ublk_ctrl_get_affinity(struct ublk_dev *ctrl_dev, cpu_set_t **ptr_buf)
301 {
302 struct ublk_ctrl_cmd_data data = {
303 .cmd_op = UBLK_U_CMD_GET_QUEUE_AFFINITY,
304 .flags = CTRL_CMD_HAS_DATA | CTRL_CMD_HAS_BUF,
305 };
306 cpu_set_t *buf;
307 int i, ret;
308
309 buf = malloc(sizeof(cpu_set_t) * ctrl_dev->dev_info.nr_hw_queues);
310 if (!buf)
311 return -ENOMEM;
312
313 for (i = 0; i < ctrl_dev->dev_info.nr_hw_queues; i++) {
314 data.data[0] = i;
315 data.len = sizeof(cpu_set_t);
316 data.addr = (__u64)&buf[i];
317
318 ret = __ublk_ctrl_cmd(ctrl_dev, &data);
319 if (ret < 0) {
320 free(buf);
321 return ret;
322 }
323 ublk_adjust_affinity(&buf[i]);
324 }
325
326 *ptr_buf = buf;
327 return 0;
328 }
329
330 static void ublk_ctrl_dump(struct ublk_dev *dev)
331 {
332 struct ublksrv_ctrl_dev_info *info = &dev->dev_info;
333 struct ublk_params p;
334 cpu_set_t *affinity;
335 int ret;
336
337 ret = ublk_ctrl_get_params(dev, &p);
338 if (ret < 0) {
339 ublk_err("failed to get params %d %s\n", ret, strerror(-ret));
340 return;
341 }
342
343 ret = ublk_ctrl_get_affinity(dev, &affinity);
344 if (ret < 0) {
345 ublk_err("failed to get affinity %m\n");
346 return;
347 }
348
349 ublk_log("dev id %d: nr_hw_queues %d queue_depth %d block size %d dev_capacity %lld\n",
350 info->dev_id, info->nr_hw_queues, info->queue_depth,
351 1 << p.basic.logical_bs_shift, p.basic.dev_sectors);
352 ublk_log("\tmax rq size %d daemon pid %d flags 0x%llx state %s\n",
353 info->max_io_buf_bytes, info->ublksrv_pid, info->flags,
354 ublk_dev_state_desc(dev));
355
356 if (affinity) {
357 char buf[512];
358 int i;
359
360 for (i = 0; i < info->nr_hw_queues; i++) {
361 ublk_print_cpu_set(&affinity[i], buf, sizeof(buf));
362 printf("\tqueue %u: affinity(%s)\n",
363 i, buf);
364 }
365 free(affinity);
366 }
367
368 fflush(stdout);
369 }
370
371 static void ublk_ctrl_deinit(struct ublk_dev *dev)
372 {
373 close(dev->ctrl_fd);
374 free(dev);
375 }
376
377 static struct ublk_dev *ublk_ctrl_init(void)
378 {
379 struct ublk_dev *dev = (struct ublk_dev *)calloc(1, sizeof(*dev));
380 struct ublksrv_ctrl_dev_info *info = &dev->dev_info;
381 int ret;
382
383 dev->ctrl_fd = open(CTRL_DEV, O_RDWR);
384 if (dev->ctrl_fd < 0) {
385 free(dev);
386 return NULL;
387 }
388
389 info->max_io_buf_bytes = UBLK_IO_MAX_BYTES;
390
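	/* uring_cmd needs 128-byte SQEs so the ublk ctrl command payload fits in the SQE */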
391 ret = ublk_setup_ring(&dev->ring, UBLK_CTRL_RING_DEPTH,
392 UBLK_CTRL_RING_DEPTH, IORING_SETUP_SQE128);
393 if (ret < 0) {
394 ublk_err("queue_init: %s\n", strerror(-ret));
395 free(dev);
396 return NULL;
397 }
398 dev->nr_fds = 1;
399
400 return dev;
401 }
402
403 static int __ublk_queue_cmd_buf_sz(unsigned depth)
404 {
405 int size = depth * sizeof(struct ublksrv_io_desc);
406 unsigned int page_sz = getpagesize();
407
408 return round_up(size, page_sz);
409 }
410
411 static int ublk_queue_max_cmd_buf_sz(void)
412 {
413 return __ublk_queue_cmd_buf_sz(UBLK_MAX_QUEUE_DEPTH);
414 }
415
416 static int ublk_queue_cmd_buf_sz(struct ublk_queue *q)
417 {
418 return __ublk_queue_cmd_buf_sz(q->q_depth);
419 }
420
421 static void ublk_queue_deinit(struct ublk_queue *q)
422 {
423 int i;
424 int nr_ios = q->q_depth;
425
426 if (q->io_cmd_buf)
427 munmap(q->io_cmd_buf, ublk_queue_cmd_buf_sz(q));
428
429 for (i = 0; i < nr_ios; i++) {
430 free(q->ios[i].buf_addr);
431 free(q->ios[i].integrity_buf);
432 }
433 }
434
435 static void ublk_thread_deinit(struct ublk_thread *t)
436 {
437 io_uring_unregister_buffers(&t->ring);
438
439 ublk_batch_free_buf(t);
440
441 io_uring_unregister_ring_fd(&t->ring);
442
443 if (t->ring.ring_fd > 0) {
444 io_uring_unregister_files(&t->ring);
445 close(t->ring.ring_fd);
446 t->ring.ring_fd = -1;
447 }
448 }
449
450 static int ublk_queue_init(struct ublk_queue *q, unsigned long long extra_flags,
451 __u8 metadata_size)
452 {
453 struct ublk_dev *dev = q->dev;
454 int depth = dev->dev_info.queue_depth;
455 int i;
456 int cmd_buf_size, io_buf_size, integrity_size;
457 unsigned long off;
458
459 pthread_spin_init(&q->lock, PTHREAD_PROCESS_PRIVATE);
460 q->tgt_ops = dev->tgt.ops;
461 q->flags = 0;
462 q->q_depth = depth;
463 q->flags = dev->dev_info.flags;
464 q->flags |= extra_flags;
465 q->metadata_size = metadata_size;
466
467 /* Cache fd in queue for fast path access */
468 q->ublk_fd = dev->fds[0];
469
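	/*
	 * Each queue's ublksrv_io_desc array is exported by the ublk char
	 * device at UBLKSRV_CMD_BUF_OFFSET + q_id * max_cmd_buf_size; map it
	 * read-only for retrieving io descriptors.
	 */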
470 cmd_buf_size = ublk_queue_cmd_buf_sz(q);
471 off = UBLKSRV_CMD_BUF_OFFSET + q->q_id * ublk_queue_max_cmd_buf_sz();
472 q->io_cmd_buf = mmap(0, cmd_buf_size, PROT_READ,
473 MAP_SHARED | MAP_POPULATE, dev->fds[0], off);
474 if (q->io_cmd_buf == MAP_FAILED) {
475 ublk_err("ublk dev %d queue %d map io_cmd_buf failed %m\n",
476 q->dev->dev_info.dev_id, q->q_id);
477 goto fail;
478 }
479
480 io_buf_size = dev->dev_info.max_io_buf_bytes;
481 integrity_size = ublk_integrity_len(q, io_buf_size);
482 for (i = 0; i < q->q_depth; i++) {
483 q->ios[i].buf_addr = NULL;
484 q->ios[i].flags = UBLKS_IO_NEED_FETCH_RQ | UBLKS_IO_FREE;
485 q->ios[i].tag = i;
486
487 if (integrity_size) {
488 q->ios[i].integrity_buf = malloc(integrity_size);
489 if (!q->ios[i].integrity_buf) {
490 ublk_err("ublk dev %d queue %d io %d malloc(%d) failed: %m\n",
491 dev->dev_info.dev_id, q->q_id, i,
492 integrity_size);
493 goto fail;
494 }
495 }
496
497
498 if (ublk_queue_no_buf(q))
499 continue;
500
501 if (posix_memalign((void **)&q->ios[i].buf_addr,
502 getpagesize(), io_buf_size)) {
503 ublk_err("ublk dev %d queue %d io %d posix_memalign failed %m\n",
504 dev->dev_info.dev_id, q->q_id, i);
505 goto fail;
506 }
507 }
508
509 return 0;
510 fail:
511 ublk_queue_deinit(q);
512 ublk_err("ublk dev %d queue %d failed\n",
513 dev->dev_info.dev_id, q->q_id);
514 return -ENOMEM;
515 }
516
517 static int ublk_thread_init(struct ublk_thread *t, unsigned long long extra_flags)
518 {
519 struct ublk_dev *dev = t->dev;
520 unsigned long long flags = dev->dev_info.flags | extra_flags;
521 int ring_depth = dev->tgt.sq_depth, cq_depth = dev->tgt.cq_depth;
522 int ret;
523
524 /* FETCH_IO_CMDS is multishot, so increase cq depth for BATCH_IO */
525 if (ublk_dev_batch_io(dev))
526 cq_depth += dev->dev_info.queue_depth * 2;
527
528 ret = ublk_setup_ring(&t->ring, ring_depth, cq_depth,
529 IORING_SETUP_COOP_TASKRUN |
530 IORING_SETUP_SINGLE_ISSUER |
531 IORING_SETUP_DEFER_TASKRUN);
532 if (ret < 0) {
533 ublk_err("ublk dev %d thread %d setup io_uring failed %d\n",
534 dev->dev_info.dev_id, t->idx, ret);
535 goto fail;
536 }
537
538 if (dev->dev_info.flags & (UBLK_F_SUPPORT_ZERO_COPY | UBLK_F_AUTO_BUF_REG)) {
539 unsigned nr_ios = dev->dev_info.queue_depth * dev->dev_info.nr_hw_queues;
540 unsigned max_nr_ios_per_thread = nr_ios / dev->nthreads;
541 max_nr_ios_per_thread += !!(nr_ios % dev->nthreads);
542
543 t->nr_bufs = max_nr_ios_per_thread;
544 } else {
545 t->nr_bufs = 0;
546 }
547
548 if (ublk_dev_batch_io(dev))
549 ublk_batch_prepare(t);
550
551 if (t->nr_bufs) {
552 ret = io_uring_register_buffers_sparse(&t->ring, t->nr_bufs);
553 if (ret) {
554 ublk_err("ublk dev %d thread %d register spare buffers failed %d\n",
555 dev->dev_info.dev_id, t->idx, ret);
556 goto fail;
557 }
558 }
559
560 if (ublk_dev_batch_io(dev)) {
561 ret = ublk_batch_alloc_buf(t);
562 if (ret) {
563 ublk_err("ublk dev %d thread %d alloc batch buf failed %d\n",
564 dev->dev_info.dev_id, t->idx, ret);
565 goto fail;
566 }
567 }
568
569 io_uring_register_ring_fd(&t->ring);
570
571 if (flags & UBLKS_Q_NO_UBLK_FIXED_FD) {
572 /* Register only backing files starting from index 1, exclude ublk control device */
573 if (dev->nr_fds > 1) {
574 ret = io_uring_register_files(&t->ring, &dev->fds[1], dev->nr_fds - 1);
575 } else {
576 /* No backing files to register, skip file registration */
577 ret = 0;
578 }
579 } else {
580 ret = io_uring_register_files(&t->ring, dev->fds, dev->nr_fds);
581 }
582 if (ret) {
583 ublk_err("ublk dev %d thread %d register files failed %d\n",
584 t->dev->dev_info.dev_id, t->idx, ret);
585 goto fail;
586 }
587
588 return 0;
589 fail:
590 ublk_thread_deinit(t);
591 ublk_err("ublk dev %d thread %d init failed\n",
592 dev->dev_info.dev_id, t->idx);
593 return -ENOMEM;
594 }
595
596 #define WAIT_USEC 100000
597 #define MAX_WAIT_USEC (3 * 1000000)
598 static int ublk_dev_prep(const struct dev_ctx *ctx, struct ublk_dev *dev)
599 {
600 int dev_id = dev->dev_info.dev_id;
601 unsigned int wait_usec = 0;
602 int ret = 0, fd = -1;
603 char buf[64];
604
605 snprintf(buf, 64, "%s%d", UBLKC_DEV, dev_id);
606
607 while (wait_usec < MAX_WAIT_USEC) {
608 fd = open(buf, O_RDWR);
609 if (fd >= 0)
610 break;
611 usleep(WAIT_USEC);
612 wait_usec += WAIT_USEC;
613 }
614 if (fd < 0) {
615 ublk_err("can't open %s %s\n", buf, strerror(errno));
616 return -1;
617 }
618
619 dev->fds[0] = fd;
620 if (dev->tgt.ops->init_tgt)
621 ret = dev->tgt.ops->init_tgt(ctx, dev);
622 if (ret)
623 close(dev->fds[0]);
624 return ret;
625 }
626
627 static void ublk_dev_unprep(struct ublk_dev *dev)
628 {
629 if (dev->tgt.ops->deinit_tgt)
630 dev->tgt.ops->deinit_tgt(dev);
631 close(dev->fds[0]);
632 }
633
634 static void ublk_set_auto_buf_reg(const struct ublk_thread *t,
635 const struct ublk_queue *q,
636 struct io_uring_sqe *sqe,
637 unsigned short tag)
638 {
639 struct ublk_auto_buf_reg buf = {};
640
641 if (q->tgt_ops->buf_index)
642 buf.index = q->tgt_ops->buf_index(t, q, tag);
643 else
644 buf.index = ublk_io_buf_idx(t, q, tag);
645
646 if (ublk_queue_auto_zc_fallback(q))
647 buf.flags = UBLK_AUTO_BUF_REG_FALLBACK;
648
649 sqe->addr = ublk_auto_buf_reg_to_sqe_addr(&buf);
650 }
651
652 /* Copy in pieces to test the buffer offset logic */
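/*
 * e.g. a 4096-byte request is transferred as two 2048-byte pread()/pwrite()
 * calls on the ublk char device at increasing user-copy offsets.
 */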
653 #define UBLK_USER_COPY_LEN 2048
654
655 static void ublk_user_copy(const struct ublk_io *io, __u8 match_ublk_op)
656 {
657 const struct ublk_queue *q = ublk_io_to_queue(io);
658 const struct ublksrv_io_desc *iod = ublk_get_iod(q, io->tag);
659 __u64 off = ublk_user_copy_offset(q->q_id, io->tag);
660 __u8 ublk_op = ublksrv_get_op(iod);
661 __u32 len = iod->nr_sectors << 9;
662 void *addr = io->buf_addr;
663 ssize_t copied;
664
665 if (ublk_op != match_ublk_op)
666 return;
667
668 while (len) {
669 __u32 copy_len = min(len, UBLK_USER_COPY_LEN);
670
671 if (ublk_op == UBLK_IO_OP_WRITE)
672 copied = pread(q->ublk_fd, addr, copy_len, off);
673 else if (ublk_op == UBLK_IO_OP_READ)
674 copied = pwrite(q->ublk_fd, addr, copy_len, off);
675 else
676 assert(0);
677 assert(copied == (ssize_t)copy_len);
678 addr += copy_len;
679 off += copy_len;
680 len -= copy_len;
681 }
682
683 if (!(iod->op_flags & UBLK_IO_F_INTEGRITY))
684 return;
685
686 len = ublk_integrity_len(q, iod->nr_sectors << 9);
687 off = ublk_user_copy_offset(q->q_id, io->tag);
688 off |= UBLKSRV_IO_INTEGRITY_FLAG;
689 if (ublk_op == UBLK_IO_OP_WRITE)
690 copied = pread(q->ublk_fd, io->integrity_buf, len, off);
691 else if (ublk_op == UBLK_IO_OP_READ)
692 copied = pwrite(q->ublk_fd, io->integrity_buf, len, off);
693 else
694 assert(0);
695 assert(copied == (ssize_t)len);
696 }
697
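/*
 * Typical command flow (sketch): an io starts as UBLKS_IO_NEED_FETCH_RQ |
 * UBLKS_IO_FREE and a FETCH_REQ is issued; once the request arrives and the
 * target has handled it, the io is re-queued with UBLKS_IO_NEED_COMMIT_RQ_COMP
 * so that a single COMMIT_AND_FETCH_REQ returns the result and re-arms fetch.
 */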
698 int ublk_queue_io_cmd(struct ublk_thread *t, struct ublk_io *io)
699 {
700 struct ublk_queue *q = ublk_io_to_queue(io);
701 struct ublksrv_io_cmd *cmd;
702 struct io_uring_sqe *sqe[1];
703 unsigned int cmd_op = 0;
704 __u64 user_data;
705
706 /* only freed io can be issued */
707 if (!(io->flags & UBLKS_IO_FREE))
708 return 0;
709
710 /*
711 * Only issue the command when we need to fetch a request, commit a
712 * completion, or get data
713 */
714 if (!(io->flags &
715 (UBLKS_IO_NEED_FETCH_RQ | UBLKS_IO_NEED_COMMIT_RQ_COMP | UBLKS_IO_NEED_GET_DATA)))
716 return 0;
717
718 if (io->flags & UBLKS_IO_NEED_GET_DATA)
719 cmd_op = UBLK_U_IO_NEED_GET_DATA;
720 else if (io->flags & UBLKS_IO_NEED_COMMIT_RQ_COMP) {
721 if (ublk_queue_use_user_copy(q))
722 ublk_user_copy(io, UBLK_IO_OP_READ);
723
724 cmd_op = UBLK_U_IO_COMMIT_AND_FETCH_REQ;
725 } else if (io->flags & UBLKS_IO_NEED_FETCH_RQ)
726 cmd_op = UBLK_U_IO_FETCH_REQ;
727
728 if (io_uring_sq_space_left(&t->ring) < 1)
729 io_uring_submit(&t->ring);
730
731 ublk_io_alloc_sqes(t, sqe, 1);
732 if (!sqe[0]) {
733 ublk_err("%s: run out of sqe. thread %u, tag %d\n",
734 __func__, t->idx, io->tag);
735 return -1;
736 }
737
738 cmd = (struct ublksrv_io_cmd *)ublk_get_sqe_cmd(sqe[0]);
739
740 if (cmd_op == UBLK_U_IO_COMMIT_AND_FETCH_REQ)
741 cmd->result = io->result;
742
743 /* These fields should be written once, never change */
744 ublk_set_sqe_cmd_op(sqe[0], cmd_op);
745 sqe[0]->fd = ublk_get_registered_fd(q, 0); /* dev->fds[0] */
746 sqe[0]->opcode = IORING_OP_URING_CMD;
747 if (q->flags & UBLKS_Q_NO_UBLK_FIXED_FD)
748 sqe[0]->flags = 0; /* Use raw FD, not fixed file */
749 else
750 sqe[0]->flags = IOSQE_FIXED_FILE;
751 sqe[0]->rw_flags = 0;
752 cmd->tag = io->tag;
753 cmd->q_id = q->q_id;
754 if (!ublk_queue_no_buf(q) && !ublk_queue_use_user_copy(q))
755 cmd->addr = (__u64) (uintptr_t) io->buf_addr;
756 else
757 cmd->addr = 0;
758
759 if (ublk_queue_use_auto_zc(q))
760 ublk_set_auto_buf_reg(t, q, sqe[0], io->tag);
761
762 user_data = build_user_data(io->tag, _IOC_NR(cmd_op), 0, q->q_id, 0);
763 io_uring_sqe_set_data64(sqe[0], user_data);
764
765 io->flags = 0;
766
767 t->cmd_inflight += 1;
768
769 ublk_dbg(UBLK_DBG_IO_CMD, "%s: (thread %u qid %d tag %u cmd_op %u) iof %x stopping %d\n",
770 __func__, t->idx, q->q_id, io->tag, cmd_op,
771 io->flags, !!(t->state & UBLKS_T_STOPPING));
772 return 1;
773 }
774
775 static void ublk_submit_fetch_commands(struct ublk_thread *t)
776 {
777 struct ublk_queue *q;
778 struct ublk_io *io;
779 int i = 0, j = 0;
780
781 if (t->dev->per_io_tasks) {
782 /*
783 * Lexicographically order all the (qid,tag) pairs, with
784 * qid taking priority (so (1,0) > (0,1)). Then make
785 * this thread the daemon for every Nth entry in this
786 * list (N is the number of threads), starting at this
787 * thread's index. This ensures that each queue is
788 * handled by as many ublk server threads as possible,
789 * so that load that is concentrated on one or a few
790 * queues can make use of all ublk server threads.
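 *
 * For example (hypothetical numbers): with 2 queues of depth 4 and 3
 * threads, thread 0 serves (0,0) (0,3) (1,2), thread 1 serves (0,1)
 * (1,0) (1,3), and thread 2 serves (0,2) (1,1).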
791 */
792 const struct ublksrv_ctrl_dev_info *dinfo = &t->dev->dev_info;
793 int nr_ios = dinfo->nr_hw_queues * dinfo->queue_depth;
794 for (i = t->idx; i < nr_ios; i += t->dev->nthreads) {
795 int q_id = i / dinfo->queue_depth;
796 int tag = i % dinfo->queue_depth;
797 q = &t->dev->q[q_id];
798 io = &q->ios[tag];
799 io->buf_index = j++;
800 if (q->tgt_ops->pre_fetch_io)
801 q->tgt_ops->pre_fetch_io(t, q, tag, false);
802 ublk_queue_io_cmd(t, io);
803 }
804 } else {
805 /*
806 * Service exclusively the queue whose q_id matches our
807 * thread index.
808 */
809 struct ublk_queue *q = &t->dev->q[t->idx];
810 for (i = 0; i < q->q_depth; i++) {
811 io = &q->ios[i];
812 io->buf_index = i;
813 if (q->tgt_ops->pre_fetch_io)
814 q->tgt_ops->pre_fetch_io(t, q, i, false);
815 ublk_queue_io_cmd(t, io);
816 }
817 }
818 }
819
820 static int ublk_thread_is_idle(struct ublk_thread *t)
821 {
822 return !io_uring_sq_ready(&t->ring) && !t->io_inflight;
823 }
824
825 static int ublk_thread_is_done(struct ublk_thread *t)
826 {
827 return (t->state & UBLKS_T_STOPPING) && ublk_thread_is_idle(t) && !t->cmd_inflight;
828 }
829
830 static inline void ublksrv_handle_tgt_cqe(struct ublk_thread *t,
831 struct ublk_queue *q,
832 struct io_uring_cqe *cqe)
833 {
834 if (cqe->res < 0 && cqe->res != -EAGAIN)
835 ublk_err("%s: failed tgt io: res %d qid %u tag %u, cmd_op %u\n",
836 __func__, cqe->res, q->q_id,
837 user_data_to_tag(cqe->user_data),
838 user_data_to_op(cqe->user_data));
839
840 if (q->tgt_ops->tgt_io_done)
841 q->tgt_ops->tgt_io_done(t, q, cqe);
842 }
843
844 static void ublk_handle_uring_cmd(struct ublk_thread *t,
845 struct ublk_queue *q,
846 const struct io_uring_cqe *cqe)
847 {
848 int fetch = (cqe->res != UBLK_IO_RES_ABORT) &&
849 !(t->state & UBLKS_T_STOPPING);
850 unsigned tag = user_data_to_tag(cqe->user_data);
851 struct ublk_io *io = &q->ios[tag];
852
853 t->cmd_inflight--;
854
855 if (!fetch) {
856 t->state |= UBLKS_T_STOPPING;
857 io->flags &= ~UBLKS_IO_NEED_FETCH_RQ;
858 }
859
860 if (cqe->res == UBLK_IO_RES_OK) {
861 ublk_assert(tag < q->q_depth);
862
863 if (ublk_queue_use_user_copy(q))
864 ublk_user_copy(io, UBLK_IO_OP_WRITE);
865
866 if (q->tgt_ops->queue_io)
867 q->tgt_ops->queue_io(t, q, tag);
868 } else if (cqe->res == UBLK_IO_RES_NEED_GET_DATA) {
869 io->flags |= UBLKS_IO_NEED_GET_DATA | UBLKS_IO_FREE;
870 ublk_queue_io_cmd(t, io);
871 } else {
872 /*
873 * COMMIT_REQ will be completed immediately since no fetching
874 * piggyback is required.
875 *
876 * Mark IO_FREE only; then this io won't be re-issued, since we
877 * only issue an io with (UBLKS_IO_FREE | UBLKS_IO_NEED_*) set.
878 */
880 io->flags = UBLKS_IO_FREE;
881 }
882 }
883
884 static void ublk_handle_cqe(struct ublk_thread *t,
885 struct io_uring_cqe *cqe, void *data)
886 {
887 struct ublk_dev *dev = t->dev;
888 unsigned q_id = user_data_to_q_id(cqe->user_data);
889 unsigned cmd_op = user_data_to_op(cqe->user_data);
890
891 if (cqe->res < 0 && cqe->res != -ENODEV && cqe->res != -ENOBUFS)
892 ublk_err("%s: res %d userdata %llx thread state %x\n", __func__,
893 cqe->res, cqe->user_data, t->state);
894
895 ublk_dbg(UBLK_DBG_IO_CMD, "%s: res %d (thread %d qid %d tag %u cmd_op %x "
896 "data %lx target %d/%d) stopping %d\n",
897 __func__, cqe->res, t->idx, q_id,
898 user_data_to_tag(cqe->user_data),
899 cmd_op, cqe->user_data, is_target_io(cqe->user_data),
900 user_data_to_tgt_data(cqe->user_data),
901 (t->state & UBLKS_T_STOPPING));
902
903 /* Don't retrieve io in case of target io */
904 if (is_target_io(cqe->user_data)) {
905 ublksrv_handle_tgt_cqe(t, &dev->q[q_id], cqe);
906 return;
907 }
908
909 if (ublk_thread_batch_io(t))
910 ublk_batch_compl_cmd(t, cqe);
911 else
912 ublk_handle_uring_cmd(t, &dev->q[q_id], cqe);
913 }
914
915 static int ublk_reap_events_uring(struct ublk_thread *t)
916 {
917 struct io_uring_cqe *cqe;
918 unsigned head;
919 int count = 0;
920
921 io_uring_for_each_cqe(&t->ring, head, cqe) {
922 ublk_handle_cqe(t, cqe, NULL);
923 count += 1;
924 }
925 io_uring_cq_advance(&t->ring, count);
926
927 return count;
928 }
929
930 static int ublk_process_io(struct ublk_thread *t)
931 {
932 int ret, reapped;
933
934 ublk_dbg(UBLK_DBG_THREAD, "dev%d-t%u: to_submit %d inflight cmd %u stopping %d\n",
935 t->dev->dev_info.dev_id,
936 t->idx, io_uring_sq_ready(&t->ring),
937 t->cmd_inflight,
938 (t->state & UBLKS_T_STOPPING));
939
940 if (ublk_thread_is_done(t))
941 return -ENODEV;
942
943 ret = io_uring_submit_and_wait(&t->ring, 1);
944 if (ublk_thread_batch_io(t)) {
945 ublk_batch_prep_commit(t);
946 reapped = ublk_reap_events_uring(t);
947 ublk_batch_commit_io_cmds(t);
948 } else {
949 reapped = ublk_reap_events_uring(t);
950 }
951
952 ublk_dbg(UBLK_DBG_THREAD, "submit result %d, reapped %d stop %d idle %d\n",
953 ret, reapped, (t->state & UBLKS_T_STOPPING),
954 (t->state & UBLKS_T_IDLE));
955
956 return reapped;
957 }
958
959 struct ublk_thread_info {
960 struct ublk_dev *dev;
961 pthread_t thread;
962 unsigned idx;
963 sem_t *ready;
964 cpu_set_t *affinity;
965 unsigned long long extra_flags;
966 unsigned char (*q_thread_map)[UBLK_MAX_QUEUES];
967 };
968
969 static void ublk_thread_set_sched_affinity(const struct ublk_thread_info *info)
970 {
971 if (pthread_setaffinity_np(pthread_self(), sizeof(*info->affinity), info->affinity) < 0)
972 ublk_err("ublk dev %u thread %u set affinity failed",
973 info->dev->dev_info.dev_id, info->idx);
974 }
975
976 static void ublk_batch_setup_queues(struct ublk_thread *t)
977 {
978 int i;
979
980 for (i = 0; i < t->dev->dev_info.nr_hw_queues; i++) {
981 struct ublk_queue *q = &t->dev->q[i];
982 int ret;
983
984 /*
985 * Only prepare io commands in the mapped thread context,
986 * otherwise io command buffer index may not work as expected
987 */
988 if (t->q_map[i] == 0)
989 continue;
990
991 if (q->tgt_ops->pre_fetch_io)
992 q->tgt_ops->pre_fetch_io(t, q, 0, true);
993
994 ret = ublk_batch_queue_prep_io_cmds(t, q);
995 ublk_assert(ret >= 0);
996 }
997 }
998
999 static __attribute__((noinline)) int __ublk_io_handler_fn(struct ublk_thread_info *info)
1000 {
1001 struct ublk_thread t = {
1002 .dev = info->dev,
1003 .idx = info->idx,
1004 };
1005 int dev_id = info->dev->dev_info.dev_id;
1006 int ret;
1007
1008 /* Copy per-thread queue mapping into thread-local variable */
1009 if (info->q_thread_map)
1010 memcpy(t.q_map, info->q_thread_map[info->idx], sizeof(t.q_map));
1011
1012 ret = ublk_thread_init(&t, info->extra_flags);
1013 if (ret) {
1014 ublk_err("ublk dev %d thread %u init failed\n",
1015 dev_id, t.idx);
1016 return ret;
1017 }
1018 sem_post(info->ready);
1019
1020 ublk_dbg(UBLK_DBG_THREAD, "tid %d: ublk dev %d thread %u started\n",
1021 gettid(), dev_id, t.idx);
1022
1023 if (!ublk_thread_batch_io(&t)) {
1024 /* submit all io commands to ublk driver */
1025 ublk_submit_fetch_commands(&t);
1026 } else {
1027 ublk_batch_setup_queues(&t);
1028 ublk_batch_start_fetch(&t);
1029 }
1030
1031 do {
1032 if (ublk_process_io(&t) < 0)
1033 break;
1034 } while (1);
1035
1036 ublk_dbg(UBLK_DBG_THREAD, "tid %d: ublk dev %d thread %d exiting\n",
1037 gettid(), dev_id, t.idx);
1038 ublk_thread_deinit(&t);
1039 return 0;
1040 }
1041
1042 static void *ublk_io_handler_fn(void *data)
1043 {
1044 struct ublk_thread_info *info = data;
1045
1046 /*
1047 * IO performance is sensitive to queue pthread affinity on NUMA machines.
1048 *
1049 * Set the sched affinity at the beginning, so memory/pages allocated
1050 * afterwards can be CPU/NUMA local.
1051 */
1052 if (info->affinity)
1053 ublk_thread_set_sched_affinity(info);
1054
1055 __ublk_io_handler_fn(info);
1056
1057 return NULL;
1058 }
1059
1060 static void ublk_set_parameters(struct ublk_dev *dev)
1061 {
1062 int ret;
1063
1064 ret = ublk_ctrl_set_params(dev, &dev->tgt.params);
1065 if (ret)
1066 ublk_err("dev %d set basic parameter failed %d\n",
1067 dev->dev_info.dev_id, ret);
1068 }
1069
1070 static int ublk_send_dev_event(const struct dev_ctx *ctx, struct ublk_dev *dev, int dev_id)
1071 {
1072 uint64_t id;
1073 int evtfd = ctx->_evtfd;
1074
1075 if (evtfd < 0)
1076 return -EBADF;
1077
1078 if (dev_id >= 0)
1079 id = dev_id + 1;
1080 else
1081 id = ERROR_EVTFD_DEVID;
1082
1083 if (dev && ctx->shadow_dev)
1084 memcpy(&ctx->shadow_dev->q, &dev->q, sizeof(dev->q));
1085
1086 if (write(evtfd, &id, sizeof(id)) != sizeof(id))
1087 return -EINVAL;
1088
1089 close(evtfd);
1090 shmdt(ctx->shadow_dev);
1091
1092 return 0;
1093 }
1094
1095
1096 /*
1097 * Shared memory registration socket listener.
1098 *
1099 * The parent daemon context listens on a per-device unix socket at
1100 * /run/ublk/ublkb<dev_id>.sock for shared memory registration requests
1101 * from clients. Clients send a memfd via SCM_RIGHTS; the server
1102 * registers it with the kernel, mmaps it, and returns the assigned index.
1103 */
1104 #define UBLK_SHMEM_SOCK_DIR "/run/ublk"
1105
1106 /* defined in kublk.h, shared with file_backed.c (loop target) */
1107 struct ublk_shmem_entry shmem_table[UBLK_BUF_MAX];
1108 int shmem_count;
1109
1110 static void ublk_shmem_sock_path(int dev_id, char *buf, size_t len)
1111 {
1112 snprintf(buf, len, "%s/ublkb%d.sock", UBLK_SHMEM_SOCK_DIR, dev_id);
1113 }
1114
1115 static int ublk_shmem_sock_create(int dev_id)
1116 {
1117 struct sockaddr_un addr = { .sun_family = AF_UNIX };
1118 char path[108];
1119 int fd;
1120
1121 mkdir(UBLK_SHMEM_SOCK_DIR, 0755);
1122 ublk_shmem_sock_path(dev_id, path, sizeof(path));
1123 unlink(path);
1124
1125 fd = socket(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK, 0);
1126 if (fd < 0)
1127 return -1;
1128
1129 snprintf(addr.sun_path, sizeof(addr.sun_path), "%s", path);
1130 if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
1131 close(fd);
1132 return -1;
1133 }
1134
1135 listen(fd, 4);
1136 ublk_dbg(UBLK_DBG_DEV, "shmem socket created: %s\n", path);
1137 return fd;
1138 }
1139
1140 static void ublk_shmem_sock_destroy(int dev_id, int sock_fd)
1141 {
1142 char path[108];
1143
1144 if (sock_fd >= 0)
1145 close(sock_fd);
1146 ublk_shmem_sock_path(dev_id, path, sizeof(path));
1147 unlink(path);
1148 }
1149
1150 /* Receive a memfd from a client via SCM_RIGHTS */
1151 static int ublk_shmem_recv_fd(int client_fd)
1152 {
1153 char buf[1];
1154 struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
1155 union {
1156 char cmsg_buf[CMSG_SPACE(sizeof(int))];
1157 struct cmsghdr align;
1158 } u;
1159 struct msghdr msg = {
1160 .msg_iov = &iov,
1161 .msg_iovlen = 1,
1162 .msg_control = u.cmsg_buf,
1163 .msg_controllen = sizeof(u.cmsg_buf),
1164 };
1165 struct cmsghdr *cmsg;
1166
1167 if (recvmsg(client_fd, &msg, 0) <= 0)
1168 return -1;
1169
1170 cmsg = CMSG_FIRSTHDR(&msg);
1171 if (!cmsg || cmsg->cmsg_level != SOL_SOCKET ||
1172 cmsg->cmsg_type != SCM_RIGHTS)
1173 return -1;
1174
1175 return *(int *)CMSG_DATA(cmsg);
1176 }
1177
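/*
 * Illustrative client-side counterpart (a sketch, not used by this server):
 * how a client could connect to the per-device socket, pass a memfd via
 * SCM_RIGHTS and read back the assigned buffer index.  The helper name is
 * hypothetical; only the socket path and reply format mirror the code above.
 */
static __attribute__((unused)) int ublk_shmem_client_send_fd(int dev_id, int memfd)
{
	struct sockaddr_un addr = { .sun_family = AF_UNIX };
	char c = 0;
	struct iovec iov = { .iov_base = &c, .iov_len = 1 };
	union {
		char cmsg_buf[CMSG_SPACE(sizeof(int))];
		struct cmsghdr align;
	} u;
	struct msghdr msg = {
		.msg_iov = &iov,
		.msg_iovlen = 1,
		.msg_control = u.cmsg_buf,
		.msg_controllen = sizeof(u.cmsg_buf),
	};
	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
	int32_t reply = -1;
	int fd;

	fd = socket(AF_UNIX, SOCK_STREAM, 0);
	if (fd < 0)
		return -1;

	ublk_shmem_sock_path(dev_id, addr.sun_path, sizeof(addr.sun_path));
	if (connect(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
		close(fd);
		return -1;
	}

	/* attach the memfd as SCM_RIGHTS ancillary data */
	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SCM_RIGHTS;
	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
	memcpy(CMSG_DATA(cmsg), &memfd, sizeof(int));

	if (sendmsg(fd, &msg, 0) < 0 ||
	    recv(fd, &reply, sizeof(reply), 0) != sizeof(reply))
		reply = -1;

	close(fd);
	/* server replies with the assigned shmem table index, or < 0 on error */
	return reply;
}
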
1178 /* Register a shared memory buffer: store fd, mmap it, return index */
1179 static int ublk_shmem_register(int shmem_fd)
1180 {
1181 off_t size;
1182 void *base;
1183 int idx;
1184
1185 if (shmem_count >= UBLK_BUF_MAX)
1186 return -1;
1187
1188 size = lseek(shmem_fd, 0, SEEK_END);
1189 if (size <= 0)
1190 return -1;
1191
1192 base = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED,
1193 shmem_fd, 0);
1194 if (base == MAP_FAILED)
1195 return -1;
1196
1197 idx = shmem_count++;
1198 shmem_table[idx].fd = shmem_fd;
1199 shmem_table[idx].mmap_base = base;
1200 shmem_table[idx].size = size;
1201
1202 ublk_dbg(UBLK_DBG_DEV, "shmem registered: index=%d fd=%d size=%zu\n",
1203 idx, shmem_fd, (size_t)size);
1204 return idx;
1205 }
1206
1207 static void ublk_shmem_unregister_all(void)
1208 {
1209 int i;
1210
1211 for (i = 0; i < shmem_count; i++) {
1212 if (shmem_table[i].mmap_base) {
1213 munmap(shmem_table[i].mmap_base,
1214 shmem_table[i].size);
1215 close(shmem_table[i].fd);
1216 shmem_table[i].mmap_base = NULL;
1217 }
1218 }
1219 shmem_count = 0;
1220 }
1221
1222 static int ublk_ctrl_reg_buf(struct ublk_dev *dev, void *addr, size_t size,
1223 __u32 flags)
1224 {
1225 struct ublk_shmem_buf_reg buf_reg = {
1226 .addr = (unsigned long)addr,
1227 .len = size,
1228 .flags = flags,
1229 };
1230 struct ublk_ctrl_cmd_data data = {
1231 .cmd_op = UBLK_U_CMD_REG_BUF,
1232 .flags = CTRL_CMD_HAS_BUF,
1233 .addr = (unsigned long)&buf_reg,
1234 .len = sizeof(buf_reg),
1235 };
1236
1237 return __ublk_ctrl_cmd(dev, &data);
1238 }
1239
1240 /*
1241 * Handle one client connection: receive memfd, mmap it, register
1242 * the VA range with kernel, send back the assigned index.
1243 */
1244 static void ublk_shmem_handle_client(int sock_fd, struct ublk_dev *dev)
1245 {
1246 int client_fd, memfd, idx, ret;
1247 int32_t reply;
1248 off_t size;
1249 void *base;
1250
1251 client_fd = accept(sock_fd, NULL, NULL);
1252 if (client_fd < 0)
1253 return;
1254
1255 memfd = ublk_shmem_recv_fd(client_fd);
1256 if (memfd < 0) {
1257 reply = -1;
1258 goto out;
1259 }
1260
1261 /* mmap the memfd in server address space */
1262 size = lseek(memfd, 0, SEEK_END);
1263 if (size <= 0) {
1264 reply = -1;
1265 close(memfd);
1266 goto out;
1267 }
1268 base = mmap(NULL, size, PROT_READ | PROT_WRITE,
1269 MAP_SHARED | MAP_POPULATE, memfd, 0);
1270 if (base == MAP_FAILED) {
1271 reply = -1;
1272 close(memfd);
1273 goto out;
1274 }
1275
1276 /* Register server's VA range with kernel for PFN matching */
1277 ret = ublk_ctrl_reg_buf(dev, base, size, 0);
1278 if (ret < 0) {
1279 ublk_dbg(UBLK_DBG_DEV,
1280 "shmem_zc: kernel reg failed %d\n", ret);
1281 munmap(base, size);
1282 close(memfd);
1283 reply = ret;
1284 goto out;
1285 }
1286
1287 /* Store in table for I/O handling */
1288 idx = ublk_shmem_register(memfd);
1289 if (idx >= 0) {
1290 shmem_table[idx].mmap_base = base;
1291 shmem_table[idx].size = size;
1292 }
1293 reply = idx;
1294 out:
1295 send(client_fd, &reply, sizeof(reply), 0);
1296 close(client_fd);
1297 }
1298
1299 struct shmem_listener_info {
1300 int dev_id;
1301 int stop_efd; /* eventfd to signal listener to stop */
1302 int sock_fd; /* listener socket fd (output) */
1303 struct ublk_dev *dev;
1304 };
1305
1306 /*
1307 * Socket listener thread: runs in the parent daemon context alongside
1308 * the I/O threads. Accepts shared memory registration requests from
1309 * clients via SCM_RIGHTS. Exits when stop_efd is signaled.
1310 */
1311 static void *ublk_shmem_listener_fn(void *data)
1312 {
1313 struct shmem_listener_info *info = data;
1314 struct pollfd pfds[2];
1315
1316 info->sock_fd = ublk_shmem_sock_create(info->dev_id);
1317 if (info->sock_fd < 0)
1318 return NULL;
1319
1320 pfds[0].fd = info->sock_fd;
1321 pfds[0].events = POLLIN;
1322 pfds[1].fd = info->stop_efd;
1323 pfds[1].events = POLLIN;
1324
1325 while (1) {
1326 int ret = poll(pfds, 2, -1);
1327
1328 if (ret < 0)
1329 break;
1330
1331 /* Stop signal from parent */
1332 if (pfds[1].revents & POLLIN)
1333 break;
1334
1335 /* Client connection */
1336 if (pfds[0].revents & POLLIN)
1337 ublk_shmem_handle_client(info->sock_fd, info->dev);
1338 }
1339
1340 return NULL;
1341 }
1342
1343 static int ublk_shmem_htlb_setup(const struct dev_ctx *ctx,
1344 struct ublk_dev *dev)
1345 {
1346 int fd, idx, ret;
1347 struct stat st;
1348 void *base;
1349
1350 fd = open(ctx->htlb_path, O_RDWR);
1351 if (fd < 0) {
1352 ublk_err("htlb: can't open %s\n", ctx->htlb_path);
1353 return -errno;
1354 }
1355
1356 if (fstat(fd, &st) < 0 || st.st_size <= 0) {
1357 ublk_err("htlb: invalid file size\n");
1358 close(fd);
1359 return -EINVAL;
1360 }
1361
1362 base = mmap(NULL, st.st_size,
1363 ctx->rdonly_shmem_buf ? PROT_READ : PROT_READ | PROT_WRITE,
1364 MAP_SHARED | MAP_POPULATE, fd, 0);
1365 if (base == MAP_FAILED) {
1366 ublk_err("htlb: mmap failed\n");
1367 close(fd);
1368 return -ENOMEM;
1369 }
1370
1371 ret = ublk_ctrl_reg_buf(dev, base, st.st_size,
1372 ctx->rdonly_shmem_buf ? UBLK_SHMEM_BUF_READ_ONLY : 0);
1373 if (ret < 0) {
1374 ublk_err("htlb: reg_buf failed: %d\n", ret);
1375 munmap(base, st.st_size);
1376 close(fd);
1377 return ret;
1378 }
1379
1380 if (shmem_count >= UBLK_BUF_MAX) {
1381 munmap(base, st.st_size);
1382 close(fd);
1383 return -ENOMEM;
1384 }
1385
1386 idx = shmem_count++;
1387 shmem_table[idx].fd = fd;
1388 shmem_table[idx].mmap_base = base;
1389 shmem_table[idx].size = st.st_size;
1390
1391 ublk_dbg(UBLK_DBG_DEV, "htlb registered: index=%d size=%zu\n",
1392 idx, (size_t)st.st_size);
1393 return 0;
1394 }
1395
1396 static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev)
1397 {
1398 const struct ublksrv_ctrl_dev_info *dinfo = &dev->dev_info;
1399 struct shmem_listener_info linfo = {};
1400 struct ublk_thread_info *tinfo;
1401 unsigned long long extra_flags = 0;
1402 cpu_set_t *affinity_buf;
1403 unsigned char (*q_thread_map)[UBLK_MAX_QUEUES] = NULL;
1404 uint64_t stop_val = 1;
1405 pthread_t listener;
1406 void *thread_ret;
1407 sem_t ready;
1408 int ret, i;
1409
1410 ublk_dbg(UBLK_DBG_DEV, "%s enter\n", __func__);
1411
1412 tinfo = calloc(sizeof(struct ublk_thread_info), dev->nthreads);
1413 if (!tinfo)
1414 return -ENOMEM;
1415
1416 sem_init(&ready, 0, 0);
1417 ret = ublk_dev_prep(ctx, dev);
1418 if (ret)
1419 return ret;
1420
1421 ret = ublk_ctrl_get_affinity(dev, &affinity_buf);
1422 if (ret)
1423 return ret;
1424
1425 if (ublk_dev_batch_io(dev)) {
1426 q_thread_map = calloc(dev->nthreads, sizeof(*q_thread_map));
1427 if (!q_thread_map) {
1428 ret = -ENOMEM;
1429 goto fail;
1430 }
1431 ublk_batch_setup_map(q_thread_map, dev->nthreads,
1432 dinfo->nr_hw_queues);
1433 }
1434
1435 if (ctx->auto_zc_fallback)
1436 extra_flags = UBLKS_Q_AUTO_BUF_REG_FALLBACK;
1437 if (ctx->no_ublk_fixed_fd)
1438 extra_flags |= UBLKS_Q_NO_UBLK_FIXED_FD;
1439
1440 for (i = 0; i < dinfo->nr_hw_queues; i++) {
1441 dev->q[i].dev = dev;
1442 dev->q[i].q_id = i;
1443
1444 ret = ublk_queue_init(&dev->q[i], extra_flags,
1445 ctx->metadata_size);
1446 if (ret) {
1447 ublk_err("ublk dev %d queue %d init queue failed\n",
1448 dinfo->dev_id, i);
1449 goto fail;
1450 }
1451 }
1452
1453 for (i = 0; i < dev->nthreads; i++) {
1454 tinfo[i].dev = dev;
1455 tinfo[i].idx = i;
1456 tinfo[i].ready = &ready;
1457 tinfo[i].extra_flags = extra_flags;
1458 tinfo[i].q_thread_map = q_thread_map;
1459
1460 /*
1461 * If threads are not tied 1:1 to queues, setting thread
1462 * affinity based on queue affinity makes little sense.
1463 * However, thread CPU affinity has significant impact
1464 * on performance, so to compare fairly, we'll still set
1465 * thread CPU affinity based on queue affinity where
1466 * possible.
1467 */
1468 if (dev->nthreads == dinfo->nr_hw_queues)
1469 tinfo[i].affinity = &affinity_buf[i];
1470 pthread_create(&tinfo[i].thread, NULL,
1471 ublk_io_handler_fn,
1472 &tinfo[i]);
1473 }
1474
1475 for (i = 0; i < dev->nthreads; i++)
1476 sem_wait(&ready);
1477 free(affinity_buf);
1478 free(q_thread_map);
1479
1480 /* everything is fine now, start us */
1481 if (ctx->recovery)
1482 ret = ublk_ctrl_end_user_recovery(dev, getpid());
1483 else {
1484 ublk_set_parameters(dev);
1485 ret = ublk_ctrl_start_dev(dev, getpid());
1486 }
1487 if (ret < 0) {
1488 ublk_err("%s: ublk_ctrl_start_dev failed: %d\n", __func__, ret);
1489 /* stop device so that inflight uring_cmd can be cancelled */
1490 ublk_ctrl_stop_dev(dev);
1491 goto fail_start;
1492 }
1493
1494 if (ctx->htlb_path) {
1495 ret = ublk_shmem_htlb_setup(ctx, dev);
1496 if (ret < 0) {
1497 ublk_err("htlb setup failed: %d\n", ret);
1498 ublk_ctrl_stop_dev(dev);
1499 goto fail_start;
1500 }
1501 }
1502
1503 ublk_ctrl_get_info(dev);
1504 if (ctx->fg)
1505 ublk_ctrl_dump(dev);
1506 else
1507 ublk_send_dev_event(ctx, dev, dev->dev_info.dev_id);
1508 fail_start:
1509 /*
1510 * Wait for I/O threads to exit. While waiting, a listener
1511 * thread accepts shared memory registration requests from
1512 * clients via a per-device unix socket (SCM_RIGHTS fd passing).
1513 */
1514 linfo.dev_id = dinfo->dev_id;
1515 linfo.dev = dev;
1516 linfo.stop_efd = eventfd(0, 0);
1517 if (linfo.stop_efd >= 0)
1518 pthread_create(&listener, NULL,
1519 ublk_shmem_listener_fn, &linfo);
1520
1521 for (i = 0; i < (int)dev->nthreads; i++)
1522 pthread_join(tinfo[i].thread, &thread_ret);
1523
1524 /* Signal listener thread to stop and wait for it */
1525 if (linfo.stop_efd >= 0) {
1526 write(linfo.stop_efd, &stop_val, sizeof(stop_val));
1527 pthread_join(listener, NULL);
1528 close(linfo.stop_efd);
1529 ublk_shmem_sock_destroy(dinfo->dev_id, linfo.sock_fd);
1530 }
1531 ublk_shmem_unregister_all();
1532 free(tinfo);
1533 fail:
1534 for (i = 0; i < dinfo->nr_hw_queues; i++)
1535 ublk_queue_deinit(&dev->q[i]);
1536 ublk_dev_unprep(dev);
1537 ublk_dbg(UBLK_DBG_DEV, "%s exit\n", __func__);
1538
1539 return ret;
1540 }
1541
1542 static int wait_ublk_dev(const char *path, int evt_mask, unsigned timeout)
1543 {
1544 #define EV_SIZE (sizeof(struct inotify_event))
1545 #define EV_BUF_LEN (128 * (EV_SIZE + 16))
1546 struct pollfd pfd;
1547 int fd, wd;
1548 int ret = -EINVAL;
1549 const char *dev_name = basename(path);
1550
1551 fd = inotify_init();
1552 if (fd < 0) {
1553 ublk_dbg(UBLK_DBG_DEV, "%s: inotify init failed\n", __func__);
1554 return fd;
1555 }
1556
1557 wd = inotify_add_watch(fd, "/dev", evt_mask);
1558 if (wd == -1) {
1559 ublk_dbg(UBLK_DBG_DEV, "%s: add watch for /dev failed\n", __func__);
1560 goto fail;
1561 }
1562
1563 pfd.fd = fd;
1564 pfd.events = POLLIN;
1565 while (1) {
1566 int i = 0;
1567 char buffer[EV_BUF_LEN];
1568 ret = poll(&pfd, 1, 1000 * timeout);
1569
1570 if (ret == -1) {
1571 ublk_err("%s: poll inotify failed: %d\n", __func__, ret);
1572 goto rm_watch;
1573 } else if (ret == 0) {
1574 ublk_err("%s: poll inotify timeout\n", __func__);
1575 ret = -ETIMEDOUT;
1576 goto rm_watch;
1577 }
1578
1579 ret = read(fd, buffer, EV_BUF_LEN);
1580 if (ret < 0) {
1581 ublk_err("%s: read inotify fd failed\n", __func__);
1582 goto rm_watch;
1583 }
1584
1585 while (i < ret) {
1586 struct inotify_event *event = (struct inotify_event *)&buffer[i];
1587
1588 ublk_dbg(UBLK_DBG_DEV, "%s: inotify event %x %s\n",
1589 __func__, event->mask, event->name);
1590 if (event->mask & evt_mask) {
1591 if (!strcmp(event->name, dev_name)) {
1592 ret = 0;
1593 goto rm_watch;
1594 }
1595 }
1596 i += EV_SIZE + event->len;
1597 }
1598 }
1599 rm_watch:
1600 inotify_rm_watch(fd, wd);
1601 fail:
1602 close(fd);
1603 return ret;
1604 }
1605
1606 static int ublk_stop_io_daemon(const struct ublk_dev *dev)
1607 {
1608 int daemon_pid = dev->dev_info.ublksrv_pid;
1609 int dev_id = dev->dev_info.dev_id;
1610 char ublkc[64];
1611 int ret = 0;
1612
1613 if (daemon_pid < 0)
1614 return 0;
1615
1616 /* daemon may be dead already */
1617 if (kill(daemon_pid, 0) < 0)
1618 goto wait;
1619
1620 snprintf(ublkc, sizeof(ublkc), "/dev/%s%d", "ublkc", dev_id);
1621
1622 /* ublk char device may be gone already */
1623 if (access(ublkc, F_OK) != 0)
1624 goto wait;
1625
1626 /* Wait until ublk char device is closed, when the daemon is shutdown */
1627 ret = wait_ublk_dev(ublkc, IN_CLOSE, 10);
1628 /* double check, since it may have been closed before inotify started */
1629 if (ret == -ETIMEDOUT)
1630 ret = kill(daemon_pid, 0) < 0;
1631 wait:
1632 waitpid(daemon_pid, NULL, 0);
1633 ublk_dbg(UBLK_DBG_DEV, "%s: pid %d dev_id %d ret %d\n",
1634 __func__, daemon_pid, dev_id, ret);
1635
1636 return ret;
1637 }
1638
1639 static int __cmd_dev_add(const struct dev_ctx *ctx)
1640 {
1641 unsigned nthreads = ctx->nthreads;
1642 unsigned nr_queues = ctx->nr_hw_queues;
1643 const char *tgt_type = ctx->tgt_type;
1644 unsigned depth = ctx->queue_depth;
1645 __u64 features;
1646 const struct ublk_tgt_ops *ops;
1647 struct ublksrv_ctrl_dev_info *info;
1648 struct ublk_dev *dev = NULL;
1649 int dev_id = ctx->dev_id;
1650 int ret, i;
1651
1652 ops = ublk_find_tgt(tgt_type);
1653 if (!ops) {
1654 ublk_err("%s: no such tgt type, type %s\n",
1655 __func__, tgt_type);
1656 ret = -ENODEV;
1657 goto fail;
1658 }
1659
1660 if (nr_queues > UBLK_MAX_QUEUES || depth > UBLK_QUEUE_DEPTH) {
1661 ublk_err("%s: invalid nr_queues or depth queues %u depth %u\n",
1662 __func__, nr_queues, depth);
1663 ret = -EINVAL;
1664 goto fail;
1665 }
1666
1667 /* default to 1:1 threads:queues if nthreads is unspecified */
1668 if (!nthreads)
1669 nthreads = nr_queues;
1670
1671 if (nthreads > UBLK_MAX_THREADS) {
1672 ublk_err("%s: %u is too many threads (max %u)\n",
1673 __func__, nthreads, UBLK_MAX_THREADS);
1674 ret = -EINVAL;
1675 goto fail;
1676 }
1677
1678 if (nthreads != nr_queues && (!ctx->per_io_tasks &&
1679 !(ctx->flags & UBLK_F_BATCH_IO))) {
1680 ublk_err("%s: threads %u must be same as queues %u if "
1681 "not using per_io_tasks\n",
1682 __func__, nthreads, nr_queues);
1683 ret = -EINVAL;
1684 goto fail;
1685 }
1686
1687 dev = ublk_ctrl_init();
1688 if (!dev) {
1689 ublk_err("%s: can't alloc dev id %d, type %s\n",
1690 __func__, dev_id, tgt_type);
1691 ret = -ENOMEM;
1692 goto fail;
1693 }
1694
1695 /* fail if the kernel doesn't support GET_FEATURES */
1696 ret = ublk_ctrl_get_features(dev, &features);
1697 if (ret < 0) {
1698 ret = -EINVAL;
1699 goto fail;
1700 }
1701
1702 if (!(features & UBLK_F_CMD_IOCTL_ENCODE)) {
1703 ret = -ENOTSUP;
1704 goto fail;
1705 }
1706
1707 info = &dev->dev_info;
1708 info->dev_id = ctx->dev_id;
1709 info->nr_hw_queues = nr_queues;
1710 info->queue_depth = depth;
1711 info->flags = ctx->flags;
1712 if ((features & UBLK_F_QUIESCE) &&
1713 (info->flags & UBLK_F_USER_RECOVERY))
1714 info->flags |= UBLK_F_QUIESCE;
1715 dev->nthreads = nthreads;
1716 dev->per_io_tasks = ctx->per_io_tasks;
1717 dev->tgt.ops = ops;
1718 dev->tgt.sq_depth = depth;
1719 dev->tgt.cq_depth = depth;
1720
1721 for (i = 0; i < MAX_BACK_FILES; i++) {
1722 if (ctx->files[i]) {
1723 strcpy(dev->tgt.backing_file[i], ctx->files[i]);
1724 dev->tgt.nr_backing_files++;
1725 }
1726 }
1727
1728 if (ctx->recovery)
1729 ret = ublk_ctrl_start_user_recovery(dev);
1730 else
1731 ret = ublk_ctrl_add_dev(dev);
1732 if (ret < 0) {
1733 ublk_err("%s: can't add dev id %d, type %s ret %d\n",
1734 __func__, dev_id, tgt_type, ret);
1735 goto fail;
1736 }
1737
1738 /*
1739 * The kernel may reduce nr_hw_queues (e.g. capped to nr_cpu_ids).
1740 * Cap nthreads to the actual queue count to avoid creating extra
1741 * handler threads that will hang during device removal.
1742 *
1743 * per_io_tasks mode is excluded: threads interleave across all
1744 * queues so nthreads > nr_hw_queues is valid and intentional.
1745 */
1746 if (!ctx->per_io_tasks && dev->nthreads > info->nr_hw_queues)
1747 dev->nthreads = info->nr_hw_queues;
1748
1749 ret = ublk_start_daemon(ctx, dev);
1750 ublk_dbg(UBLK_DBG_DEV, "%s: daemon exit %d\n", __func__, ret);
1751 if (ret < 0)
1752 ublk_ctrl_del_dev(dev);
1753
1754 fail:
1755 if (ret < 0)
1756 ublk_send_dev_event(ctx, dev, -1);
1757 if (dev)
1758 ublk_ctrl_deinit(dev);
1759 return ret;
1760 }
1761
1762 static int __cmd_dev_list(struct dev_ctx *ctx);
1763
1764 static int cmd_dev_add(struct dev_ctx *ctx)
1765 {
1766 int res;
1767
1768 if (ctx->fg)
1769 goto run;
1770
1771 ctx->_shmid = shmget(IPC_PRIVATE, sizeof(struct ublk_dev), IPC_CREAT | 0666);
1772 if (ctx->_shmid < 0) {
1773 ublk_err("%s: failed to shmget %s\n", __func__, strerror(errno));
1774 exit(-1);
1775 }
1776 ctx->shadow_dev = (struct ublk_dev *)shmat(ctx->_shmid, NULL, 0);
1777 if (ctx->shadow_dev == (struct ublk_dev *)-1) {
1778 ublk_err("%s: failed to shmat %s\n", __func__, strerror(errno));
1779 exit(-1);
1780 }
1781 ctx->_evtfd = eventfd(0, 0);
1782 if (ctx->_evtfd < 0) {
1783 ublk_err("%s: failed to create eventfd %s\n", __func__, strerror(errno));
1784 exit(-1);
1785 }
1786
1787 res = fork();
1788 if (res == 0) {
1789 int res2;
1790
1791 setsid();
1792 res2 = fork();
1793 if (res2 == 0) {
1794 /* prepare for detaching */
1795 close(STDIN_FILENO);
1796 close(STDOUT_FILENO);
1797 close(STDERR_FILENO);
1798 run:
1799 res = __cmd_dev_add(ctx);
1800 return res;
1801 } else {
1802 /* detached from the foreground task */
1803 exit(EXIT_SUCCESS);
1804 }
1805 } else if (res > 0) {
1806 uint64_t id;
1807 int exit_code = EXIT_FAILURE;
1808
1809 res = read(ctx->_evtfd, &id, sizeof(id));
1810 close(ctx->_evtfd);
1811 if (res == sizeof(id) && id != ERROR_EVTFD_DEVID) {
1812 ctx->dev_id = id - 1;
1813 if (__cmd_dev_list(ctx) >= 0)
1814 exit_code = EXIT_SUCCESS;
1815 }
1816 shmdt(ctx->shadow_dev);
1817 shmctl(ctx->_shmid, IPC_RMID, NULL);
1818 /* wait for child and detach from it */
1819 wait(NULL);
1820 if (exit_code == EXIT_FAILURE)
1821 ublk_err("%s: command failed\n", __func__);
1822 exit(exit_code);
1823 } else {
1824 exit(EXIT_FAILURE);
1825 }
1826 }
1827
1828 static int __cmd_dev_del(struct dev_ctx *ctx)
1829 {
1830 int number = ctx->dev_id;
1831 struct ublk_dev *dev;
1832 int ret;
1833
1834 dev = ublk_ctrl_init();
1835 dev->dev_info.dev_id = number;
1836
1837 ret = ublk_ctrl_get_info(dev);
1838 if (ret < 0)
1839 goto fail;
1840
1841 ret = ublk_ctrl_stop_dev(dev);
1842 if (ret < 0)
1843 ublk_err("%s: stop dev %d failed ret %d\n", __func__, number, ret);
1844
1845 ret = ublk_stop_io_daemon(dev);
1846 if (ret < 0)
1847 ublk_err("%s: stop daemon id %d dev %d, ret %d\n",
1848 __func__, dev->dev_info.ublksrv_pid, number, ret);
1849 ublk_ctrl_del_dev(dev);
1850 fail:
1851 ublk_ctrl_deinit(dev);
1852
1853 return (ret >= 0) ? 0 : ret;
1854 }
1855
1856 static int cmd_dev_del(struct dev_ctx *ctx)
1857 {
1858 int i;
1859
1860 if (ctx->dev_id >= 0 || !ctx->all)
1861 return __cmd_dev_del(ctx);
1862
1863 for (i = 0; i < 255; i++) {
1864 ctx->dev_id = i;
1865 __cmd_dev_del(ctx);
1866 }
1867 return 0;
1868 }
1869
1870 static int cmd_dev_stop(struct dev_ctx *ctx)
1871 {
1872 int number = ctx->dev_id;
1873 struct ublk_dev *dev;
1874 int ret;
1875
1876 if (number < 0) {
1877 ublk_err("%s: device id is required\n", __func__);
1878 return -EINVAL;
1879 }
1880
1881 dev = ublk_ctrl_init();
1882 dev->dev_info.dev_id = number;
1883
1884 ret = ublk_ctrl_get_info(dev);
1885 if (ret < 0)
1886 goto fail;
1887
1888 if (ctx->safe_stop) {
1889 ret = ublk_ctrl_try_stop_dev(dev);
1890 if (ret < 0)
1891 ublk_err("%s: try_stop dev %d failed ret %d\n",
1892 __func__, number, ret);
1893 } else {
1894 ret = ublk_ctrl_stop_dev(dev);
1895 if (ret < 0)
1896 ublk_err("%s: stop dev %d failed ret %d\n",
1897 __func__, number, ret);
1898 }
1899
1900 fail:
1901 ublk_ctrl_deinit(dev);
1902
1903 return ret;
1904 }
1905
1906 static int __cmd_dev_list(struct dev_ctx *ctx)
1907 {
1908 struct ublk_dev *dev = ublk_ctrl_init();
1909 int ret;
1910
1911 if (!dev)
1912 return -ENODEV;
1913
1914 dev->dev_info.dev_id = ctx->dev_id;
1915
1916 ret = ublk_ctrl_get_info(dev);
1917 if (ret < 0) {
1918 if (ctx->logging)
1919 ublk_err("%s: can't get dev info from %d: %d\n",
1920 __func__, ctx->dev_id, ret);
1921 } else {
1922 if (ctx->shadow_dev)
1923 memcpy(&dev->q, ctx->shadow_dev->q, sizeof(dev->q));
1924
1925 ublk_ctrl_dump(dev);
1926 }
1927
1928 ublk_ctrl_deinit(dev);
1929
1930 return ret;
1931 }
1932
1933 static int cmd_dev_list(struct dev_ctx *ctx)
1934 {
1935 int i;
1936
1937 if (ctx->dev_id >= 0 || !ctx->all)
1938 return __cmd_dev_list(ctx);
1939
1940 ctx->logging = false;
1941 for (i = 0; i < 255; i++) {
1942 ctx->dev_id = i;
1943 __cmd_dev_list(ctx);
1944 }
1945 return 0;
1946 }
1947
1948 static int cmd_dev_get_features(void)
1949 {
1950 #define const_ilog2(x) (63 - __builtin_clzll(x))
1951 #define FEAT_NAME(f) [const_ilog2(f)] = #f
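	/*
	 * FEAT_NAME(UBLK_F_FOO) expands to [bit] = "UBLK_F_FOO", so feat_map
	 * is indexed by feature bit number.
	 */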
1952 static const char *feat_map[] = {
1953 FEAT_NAME(UBLK_F_SUPPORT_ZERO_COPY),
1954 FEAT_NAME(UBLK_F_URING_CMD_COMP_IN_TASK),
1955 FEAT_NAME(UBLK_F_NEED_GET_DATA),
1956 FEAT_NAME(UBLK_F_USER_RECOVERY),
1957 FEAT_NAME(UBLK_F_USER_RECOVERY_REISSUE),
1958 FEAT_NAME(UBLK_F_UNPRIVILEGED_DEV),
1959 FEAT_NAME(UBLK_F_CMD_IOCTL_ENCODE),
1960 FEAT_NAME(UBLK_F_USER_COPY),
1961 FEAT_NAME(UBLK_F_ZONED),
1962 FEAT_NAME(UBLK_F_USER_RECOVERY_FAIL_IO),
1963 FEAT_NAME(UBLK_F_UPDATE_SIZE),
1964 FEAT_NAME(UBLK_F_AUTO_BUF_REG),
1965 FEAT_NAME(UBLK_F_QUIESCE),
1966 FEAT_NAME(UBLK_F_PER_IO_DAEMON),
1967 FEAT_NAME(UBLK_F_BUF_REG_OFF_DAEMON),
1968 FEAT_NAME(UBLK_F_INTEGRITY),
1969 FEAT_NAME(UBLK_F_SAFE_STOP_DEV),
1970 FEAT_NAME(UBLK_F_BATCH_IO),
1971 FEAT_NAME(UBLK_F_NO_AUTO_PART_SCAN),
1972 FEAT_NAME(UBLK_F_SHMEM_ZC),
1973 };
1974 struct ublk_dev *dev;
1975 __u64 features = 0;
1976 int ret;
1977
1978 dev = ublk_ctrl_init();
1979 if (!dev) {
1980 fprintf(stderr, "ublksrv_ctrl_init failed id\n");
1981 return -EOPNOTSUPP;
1982 }
1983
1984 ret = ublk_ctrl_get_features(dev, &features);
1985 if (!ret) {
1986 int i;
1987
1988 printf("ublk_drv features: 0x%llx\n", features);
1989
1990 for (i = 0; i < sizeof(features) * 8; i++) {
1991 const char *feat;
1992
1993 if (!((1ULL << i) & features))
1994 continue;
1995 if (i < ARRAY_SIZE(feat_map))
1996 feat = feat_map[i];
1997 else
1998 feat = "unknown";
1999 printf("0x%-16llx: %s\n", 1ULL << i, feat);
2000 }
2001 }
2002
2003 return ret;
2004 }
2005
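/*
 * 'update_size' command: the new size must be a multiple of the
 * device's logical block size, and is handed to the driver in 512-byte
 * sectors (hence the >> 9).
 */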
static int cmd_dev_update_size(struct dev_ctx *ctx)
{
	struct ublk_dev *dev = ublk_ctrl_init();
	struct ublk_params p;
	int ret = -EINVAL;

	if (!dev)
		return -ENODEV;

	if (ctx->dev_id < 0) {
		fprintf(stderr, "device id isn't provided\n");
		goto out;
	}

	dev->dev_info.dev_id = ctx->dev_id;
	ret = ublk_ctrl_get_params(dev, &p);
	if (ret < 0) {
		ublk_err("failed to get params %d %s\n", ret, strerror(-ret));
		goto out;
	}

	if (ctx->size & ((1 << p.basic.logical_bs_shift) - 1)) {
		ublk_err("size isn't aligned with logical block size\n");
		ret = -EINVAL;
		goto out;
	}

	ret = ublk_ctrl_update_size(dev, ctx->size >> 9);
out:
	ublk_ctrl_deinit(dev);
	return ret;
}

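/*
 * 'quiesce' command: the second argument to ublk_ctrl_quiesce_dev() is
 * a timeout (10000 here, presumably milliseconds).
 */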
static int cmd_dev_quiesce(struct dev_ctx *ctx)
{
	struct ublk_dev *dev = ublk_ctrl_init();
	int ret = -EINVAL;

	if (!dev)
		return -ENODEV;

	if (ctx->dev_id < 0) {
		fprintf(stderr, "device id isn't provided for quiesce\n");
		goto out;
	}
	dev->dev_info.dev_id = ctx->dev_id;
	ret = ublk_ctrl_quiesce_dev(dev, 10000);

out:
	ublk_ctrl_deinit(dev);
	return ret;
}

static void __cmd_create_help(char *exe, bool recovery)
{
	int i;

	printf("%s %s -t [null|loop|stripe|fault_inject] [-q nr_queues] [-d depth] [-n dev_id]\n",
			exe, recovery ? "recover" : "add");
	printf("\t[--foreground] [--quiet] [-z] [--auto_zc] [--auto_zc_fallback] [--debug_mask mask] [-r 0|1] [-g] [-u]\n");
	printf("\t[-e 0|1] [-i 0|1] [--no_ublk_fixed_fd]\n");
	printf("\t[--nthreads threads] [--per_io_tasks]\n");
	printf("\t[--integrity_capable] [--integrity_reftag] [--metadata_size SIZE] "
	       "[--pi_offset OFFSET] [--csum_type ip|t10dif|nvme] [--tag_size SIZE]\n");
	printf("\t[--batch|-b] [--no_auto_part_scan]\n");
	printf("\t[target options] [backfile1] [backfile2] ...\n");
	printf("\tdefault: nr_queues=2(max 32), depth=128(max 1024), dev_id=-1(auto allocation)\n");
	printf("\tdefault: nthreads=nr_queues\n");

	for (i = 0; i < ARRAY_SIZE(tgt_ops_list); i++) {
		const struct ublk_tgt_ops *ops = tgt_ops_list[i];

		if (ops->usage)
			ops->usage(ops);
	}
}

static void cmd_add_help(char *exe)
{
	__cmd_create_help(exe, false);
	printf("\n");
}

static void cmd_recover_help(char *exe)
{
	__cmd_create_help(exe, true);
	printf("\tPlease provide the exact command line used to create this device, with the real dev_id\n");
	printf("\n");
}

static int cmd_dev_help(char *exe)
{
	cmd_add_help(exe);
	cmd_recover_help(exe);

	printf("%s del [-n dev_id] -a\n", exe);
	printf("\t -a delete all devices, -n delete specified device\n\n");
	printf("%s stop -n dev_id [--safe]\n", exe);
	printf("\t --safe only stop if device has no active openers\n\n");
	printf("%s list [-n dev_id] -a\n", exe);
	printf("\t -a list all devices, -n list specified device, default -a\n\n");
	printf("%s features\n", exe);
	printf("%s update_size -n dev_id -s|--size size_in_bytes\n", exe);
	printf("%s quiesce -n dev_id\n", exe);
	return 0;
}

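/*
 * Example invocations, assembled from the help text above. The binary
 * name and the backing-file path are illustrative only:
 *
 *	kublk add -t null -q 2 -d 128
 *	kublk add -t loop -q 2 -d 128 /path/to/backing.img
 *	kublk list -a
 *	kublk stop -n 0 --safe
 *	kublk del -a
 */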
int main(int argc, char *argv[])
{
	static const struct option longopts[] = {
		{ "all", 0, NULL, 'a' },
		{ "type", 1, NULL, 't' },
		{ "number", 1, NULL, 'n' },
		{ "queues", 1, NULL, 'q' },
		{ "depth", 1, NULL, 'd' },
		{ "debug_mask", 1, NULL, 0 },
		{ "quiet", 0, NULL, 0 },
		{ "zero_copy", 0, NULL, 'z' },
		{ "foreground", 0, NULL, 0 },
		{ "recovery", 1, NULL, 'r' },
		{ "recovery_fail_io", 1, NULL, 'e'},
		{ "recovery_reissue", 1, NULL, 'i'},
		{ "get_data", 1, NULL, 'g'},
		{ "auto_zc", 0, NULL, 0 },
		{ "auto_zc_fallback", 0, NULL, 0 },
		{ "user_copy", 0, NULL, 'u'},
		{ "size", 1, NULL, 's'},
		{ "nthreads", 1, NULL, 0 },
		{ "per_io_tasks", 0, NULL, 0 },
		{ "no_ublk_fixed_fd", 0, NULL, 0 },
		{ "integrity_capable", 0, NULL, 0 },
		{ "integrity_reftag", 0, NULL, 0 },
		{ "metadata_size", 1, NULL, 0 },
		{ "pi_offset", 1, NULL, 0 },
		{ "csum_type", 1, NULL, 0 },
		{ "tag_size", 1, NULL, 0 },
		{ "safe", 0, NULL, 0 },
		{ "batch", 0, NULL, 'b'},
		{ "no_auto_part_scan", 0, NULL, 0 },
		{ "shmem_zc", 0, NULL, 0 },
		{ "htlb", 1, NULL, 0 },
		{ "rdonly_shmem_buf", 0, NULL, 0 },
		{ 0, 0, 0, 0 }
	};
	const struct ublk_tgt_ops *ops = NULL;
	int option_idx, opt;
	const char *cmd = argv[1];
	struct dev_ctx ctx = {
		._evtfd = -1,
		.queue_depth = 128,
		.nr_hw_queues = 2,
		.dev_id = -1,
		.tgt_type = "unknown",
		.csum_type = LBMD_PI_CSUM_NONE,
	};
	int ret = -EINVAL, i;
	int tgt_argc = 1;
	char *tgt_argv[MAX_NR_TGT_ARG] = { NULL };
	int value;

	if (argc == 1)
		return ret;

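	/*
	 * argv[1] is the subcommand, so option parsing starts at argv[2];
	 * opterr is cleared so unrecognized options can be collected for
	 * the target's own parser instead of triggering getopt errors.
	 */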
	opterr = 0;
	optind = 2;
	while ((opt = getopt_long(argc, argv, "t:n:d:q:r:e:i:s:gazub",
				  longopts, &option_idx)) != -1) {
		switch (opt) {
		case 'a':
			ctx.all = 1;
			break;
		case 'b':
			ctx.flags |= UBLK_F_BATCH_IO;
			break;
		case 'n':
			ctx.dev_id = strtol(optarg, NULL, 10);
			break;
		case 't':
			if (strlen(optarg) < sizeof(ctx.tgt_type))
				strcpy(ctx.tgt_type, optarg);
			break;
		case 'q':
			ctx.nr_hw_queues = strtol(optarg, NULL, 10);
			break;
		case 'd':
			ctx.queue_depth = strtol(optarg, NULL, 10);
			break;
		case 'z':
			ctx.flags |= UBLK_F_SUPPORT_ZERO_COPY;
			break;
		case 'r':
			value = strtol(optarg, NULL, 10);
			if (value)
				ctx.flags |= UBLK_F_USER_RECOVERY;
			break;
		case 'e':
			value = strtol(optarg, NULL, 10);
			if (value)
				ctx.flags |= UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_FAIL_IO;
			break;
		case 'i':
			value = strtol(optarg, NULL, 10);
			if (value)
				ctx.flags |= UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_REISSUE;
			break;
		case 'g':
			ctx.flags |= UBLK_F_NEED_GET_DATA;
			break;
		case 'u':
			ctx.flags |= UBLK_F_USER_COPY;
			break;
		case 's':
			ctx.size = strtoull(optarg, NULL, 10);
			break;
		case 0:
			if (!strcmp(longopts[option_idx].name, "debug_mask"))
				ublk_dbg_mask = strtol(optarg, NULL, 16);
			if (!strcmp(longopts[option_idx].name, "quiet"))
				ublk_dbg_mask = 0;
			if (!strcmp(longopts[option_idx].name, "foreground"))
				ctx.fg = 1;
			if (!strcmp(longopts[option_idx].name, "auto_zc"))
				ctx.flags |= UBLK_F_AUTO_BUF_REG;
			if (!strcmp(longopts[option_idx].name, "auto_zc_fallback"))
				ctx.auto_zc_fallback = 1;
			if (!strcmp(longopts[option_idx].name, "nthreads"))
				ctx.nthreads = strtol(optarg, NULL, 10);
			if (!strcmp(longopts[option_idx].name, "per_io_tasks"))
				ctx.per_io_tasks = 1;
			if (!strcmp(longopts[option_idx].name, "no_ublk_fixed_fd"))
				ctx.no_ublk_fixed_fd = 1;
			if (!strcmp(longopts[option_idx].name, "integrity_capable"))
				ctx.integrity_flags |= LBMD_PI_CAP_INTEGRITY;
			if (!strcmp(longopts[option_idx].name, "integrity_reftag"))
				ctx.integrity_flags |= LBMD_PI_CAP_REFTAG;
			if (!strcmp(longopts[option_idx].name, "metadata_size"))
				ctx.metadata_size = strtoul(optarg, NULL, 0);
			if (!strcmp(longopts[option_idx].name, "pi_offset"))
				ctx.pi_offset = strtoul(optarg, NULL, 0);
			if (!strcmp(longopts[option_idx].name, "csum_type")) {
				if (!strcmp(optarg, "ip")) {
					ctx.csum_type = LBMD_PI_CSUM_IP;
				} else if (!strcmp(optarg, "t10dif")) {
					ctx.csum_type = LBMD_PI_CSUM_CRC16_T10DIF;
				} else if (!strcmp(optarg, "nvme")) {
					ctx.csum_type = LBMD_PI_CSUM_CRC64_NVME;
				} else {
					ublk_err("invalid csum_type: %s\n", optarg);
					return -EINVAL;
				}
			}
			if (!strcmp(longopts[option_idx].name, "tag_size"))
				ctx.tag_size = strtoul(optarg, NULL, 0);
			if (!strcmp(longopts[option_idx].name, "safe"))
				ctx.safe_stop = 1;
			if (!strcmp(longopts[option_idx].name, "no_auto_part_scan"))
				ctx.flags |= UBLK_F_NO_AUTO_PART_SCAN;
			if (!strcmp(longopts[option_idx].name, "shmem_zc"))
				ctx.flags |= UBLK_F_SHMEM_ZC;
			if (!strcmp(longopts[option_idx].name, "htlb"))
				ctx.htlb_path = strdup(optarg);
			if (!strcmp(longopts[option_idx].name, "rdonly_shmem_buf"))
				ctx.rdonly_shmem_buf = 1;
			break;
		case '?':
			/*
			 * every target option must carry an argument
			 */
			if (!argv[optind] || argv[optind][0] == '-' ||
			    argv[optind - 1][0] != '-') {
				fprintf(stderr, "every target option requires argument: %s %s\n",
						argv[optind - 1], argv[optind]);
				exit(EXIT_FAILURE);
			}

			if (tgt_argc < (MAX_NR_TGT_ARG - 1) / 2) {
				tgt_argv[tgt_argc++] = argv[optind - 1];
				tgt_argv[tgt_argc++] = argv[optind];
			} else {
				fprintf(stderr, "too many target options\n");
				exit(EXIT_FAILURE);
			}
			optind += 1;
			break;
		}
	}


	if (ctx.per_io_tasks && (ctx.flags & UBLK_F_BATCH_IO)) {
		ublk_err("per_io_tasks and F_BATCH_IO conflict\n");
		return -EINVAL;
	}

	/* auto_zc_fallback depends on F_AUTO_BUF_REG & F_SUPPORT_ZERO_COPY */
	if (ctx.auto_zc_fallback &&
	    !((ctx.flags & UBLK_F_AUTO_BUF_REG) &&
	      (ctx.flags & UBLK_F_SUPPORT_ZERO_COPY))) {
		ublk_err("%s: auto_zc_fallback requires both "
			 "F_AUTO_BUF_REG and F_SUPPORT_ZERO_COPY to be enabled\n",
			 __func__);
		return -EINVAL;
	}

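	/*
	 * At most one data copy mode may be selected: NEED_GET_DATA,
	 * USER_COPY, plain SUPPORT_ZERO_COPY, plain AUTO_BUF_REG, or
	 * auto_zc_fallback. The booleans below sum to the number of modes
	 * requested.
	 */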
	if (!!(ctx.flags & UBLK_F_NEED_GET_DATA) +
	    !!(ctx.flags & UBLK_F_USER_COPY) +
	    (ctx.flags & UBLK_F_SUPPORT_ZERO_COPY && !ctx.auto_zc_fallback) +
	    (ctx.flags & UBLK_F_AUTO_BUF_REG && !ctx.auto_zc_fallback) +
	    ctx.auto_zc_fallback > 1) {
		fprintf(stderr, "too many data copy modes specified\n");
		return -EINVAL;
	}

	if (ctx.metadata_size) {
		if (!(ctx.flags & UBLK_F_USER_COPY)) {
			ublk_err("integrity requires user_copy\n");
			return -EINVAL;
		}

		ctx.flags |= UBLK_F_INTEGRITY;
	} else if (ctx.integrity_flags ||
		   ctx.pi_offset ||
		   ctx.csum_type != LBMD_PI_CSUM_NONE ||
		   ctx.tag_size) {
		ublk_err("integrity parameters require metadata_size\n");
		return -EINVAL;
	}

	if ((ctx.flags & UBLK_F_AUTO_BUF_REG) &&
	    (ctx.flags & UBLK_F_BATCH_IO) &&
	    (ctx.nthreads > ctx.nr_hw_queues)) {
		ublk_err("too many threads for F_AUTO_BUF_REG & F_BATCH_IO\n");
		return -EINVAL;
	}

	i = optind;
	while (i < argc && ctx.nr_files < MAX_BACK_FILES) {
		ctx.files[ctx.nr_files++] = argv[i++];
	}

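	/*
	 * Hand the collected unknown options to the target's own parser.
	 * optind is reset so the target may run getopt_long() again from
	 * scratch; tgt_argv[0] plays the role of the program name.
	 */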
	ops = ublk_find_tgt(ctx.tgt_type);
	if (ops && ops->parse_cmd_line) {
		optind = 0;

		tgt_argv[0] = ctx.tgt_type;
		ops->parse_cmd_line(&ctx, tgt_argc, tgt_argv);
	}

	if (!strcmp(cmd, "add"))
		ret = cmd_dev_add(&ctx);
	else if (!strcmp(cmd, "recover")) {
		if (ctx.dev_id < 0) {
			fprintf(stderr, "device id isn't provided for recovering\n");
			ret = -EINVAL;
		} else {
			ctx.recovery = 1;
			ret = cmd_dev_add(&ctx);
		}
	} else if (!strcmp(cmd, "del"))
		ret = cmd_dev_del(&ctx);
	else if (!strcmp(cmd, "stop"))
		ret = cmd_dev_stop(&ctx);
	else if (!strcmp(cmd, "list")) {
		ctx.all = 1;
		ret = cmd_dev_list(&ctx);
	} else if (!strcmp(cmd, "help"))
		ret = cmd_dev_help(argv[0]);
	else if (!strcmp(cmd, "features"))
		ret = cmd_dev_get_features();
	else if (!strcmp(cmd, "update_size"))
		ret = cmd_dev_update_size(&ctx);
	else if (!strcmp(cmd, "quiesce"))
		ret = cmd_dev_quiesce(&ctx);
	else
		cmd_dev_help(argv[0]);

	return ret;
}