xref: /linux/tools/testing/selftests/ublk/kublk.c (revision 15ecd83dc06277385ad71dc7ea26911d9a79acaf)
1 /* SPDX-License-Identifier: MIT */
2 /*
3  * Description: uring_cmd based ublk
4  */
5 
6 #include "kublk.h"
7 
8 #define MAX_NR_TGT_ARG 	64
9 
10 unsigned int ublk_dbg_mask = UBLK_LOG;
11 static const struct ublk_tgt_ops *tgt_ops_list[] = {
12 	&null_tgt_ops,
13 	&loop_tgt_ops,
14 	&stripe_tgt_ops,
15 	&fault_inject_tgt_ops,
16 };
17 
18 static const struct ublk_tgt_ops *ublk_find_tgt(const char *name)
19 {
20 	int i;
21 
22 	if (name == NULL)
23 		return NULL;
24 
25 	for (i = 0; i < ARRAY_SIZE(tgt_ops_list); i++)
26 		if (strcmp(tgt_ops_list[i]->name, name) == 0)
27 			return tgt_ops_list[i];
28 	return NULL;
29 }
30 
31 static inline int ublk_setup_ring(struct io_uring *r, int depth,
32 		int cq_depth, unsigned flags)
33 {
34 	struct io_uring_params p;
35 
36 	memset(&p, 0, sizeof(p));
37 	p.flags = flags | IORING_SETUP_CQSIZE;
38 	p.cq_entries = cq_depth;
39 
40 	return io_uring_queue_init_params(depth, r, &p);
41 }
42 
43 static void ublk_ctrl_init_cmd(struct ublk_dev *dev,
44 		struct io_uring_sqe *sqe,
45 		struct ublk_ctrl_cmd_data *data)
46 {
47 	struct ublksrv_ctrl_dev_info *info = &dev->dev_info;
48 	struct ublksrv_ctrl_cmd *cmd = (struct ublksrv_ctrl_cmd *)ublk_get_sqe_cmd(sqe);
49 
50 	sqe->fd = dev->ctrl_fd;
51 	sqe->opcode = IORING_OP_URING_CMD;
52 	sqe->ioprio = 0;
53 
54 	if (data->flags & CTRL_CMD_HAS_BUF) {
55 		cmd->addr = data->addr;
56 		cmd->len = data->len;
57 	}
58 
59 	if (data->flags & CTRL_CMD_HAS_DATA)
60 		cmd->data[0] = data->data[0];
61 
62 	cmd->dev_id = info->dev_id;
63 	cmd->queue_id = -1;
64 
65 	ublk_set_sqe_cmd_op(sqe, data->cmd_op);
66 
67 	io_uring_sqe_set_data(sqe, cmd);
68 }
69 
70 static int __ublk_ctrl_cmd(struct ublk_dev *dev,
71 		struct ublk_ctrl_cmd_data *data)
72 {
73 	struct io_uring_sqe *sqe;
74 	struct io_uring_cqe *cqe;
75 	int ret = -EINVAL;
76 
77 	sqe = io_uring_get_sqe(&dev->ring);
78 	if (!sqe) {
79 		ublk_err("%s: can't get sqe ret %d\n", __func__, ret);
80 		return ret;
81 	}
82 
83 	ublk_ctrl_init_cmd(dev, sqe, data);
84 
85 	ret = io_uring_submit(&dev->ring);
86 	if (ret < 0) {
87 		ublk_err("uring submit ret %d\n", ret);
88 		return ret;
89 	}
90 
91 	ret = io_uring_wait_cqe(&dev->ring, &cqe);
92 	if (ret < 0) {
93 		ublk_err("wait cqe: %s\n", strerror(-ret));
94 		return ret;
95 	}
96 	io_uring_cqe_seen(&dev->ring, cqe);
97 
98 	return cqe->res;
99 }
100 
101 static int ublk_ctrl_stop_dev(struct ublk_dev *dev)
102 {
103 	struct ublk_ctrl_cmd_data data = {
104 		.cmd_op	= UBLK_U_CMD_STOP_DEV,
105 	};
106 
107 	return __ublk_ctrl_cmd(dev, &data);
108 }
109 
110 static int ublk_ctrl_start_dev(struct ublk_dev *dev,
111 		int daemon_pid)
112 {
113 	struct ublk_ctrl_cmd_data data = {
114 		.cmd_op	= UBLK_U_CMD_START_DEV,
115 		.flags	= CTRL_CMD_HAS_DATA,
116 	};
117 
118 	dev->dev_info.ublksrv_pid = data.data[0] = daemon_pid;
119 
120 	return __ublk_ctrl_cmd(dev, &data);
121 }
122 
123 static int ublk_ctrl_start_user_recovery(struct ublk_dev *dev)
124 {
125 	struct ublk_ctrl_cmd_data data = {
126 		.cmd_op	= UBLK_U_CMD_START_USER_RECOVERY,
127 	};
128 
129 	return __ublk_ctrl_cmd(dev, &data);
130 }
131 
132 static int ublk_ctrl_end_user_recovery(struct ublk_dev *dev, int daemon_pid)
133 {
134 	struct ublk_ctrl_cmd_data data = {
135 		.cmd_op	= UBLK_U_CMD_END_USER_RECOVERY,
136 		.flags	= CTRL_CMD_HAS_DATA,
137 	};
138 
139 	dev->dev_info.ublksrv_pid = data.data[0] = daemon_pid;
140 
141 	return __ublk_ctrl_cmd(dev, &data);
142 }
143 
144 static int ublk_ctrl_add_dev(struct ublk_dev *dev)
145 {
146 	struct ublk_ctrl_cmd_data data = {
147 		.cmd_op	= UBLK_U_CMD_ADD_DEV,
148 		.flags	= CTRL_CMD_HAS_BUF,
149 		.addr = (__u64) (uintptr_t) &dev->dev_info,
150 		.len = sizeof(struct ublksrv_ctrl_dev_info),
151 	};
152 
153 	return __ublk_ctrl_cmd(dev, &data);
154 }
155 
156 static int ublk_ctrl_del_dev(struct ublk_dev *dev)
157 {
158 	struct ublk_ctrl_cmd_data data = {
159 		.cmd_op = UBLK_U_CMD_DEL_DEV,
160 		.flags = 0,
161 	};
162 
163 	return __ublk_ctrl_cmd(dev, &data);
164 }
165 
166 static int ublk_ctrl_get_info(struct ublk_dev *dev)
167 {
168 	struct ublk_ctrl_cmd_data data = {
169 		.cmd_op	= UBLK_U_CMD_GET_DEV_INFO,
170 		.flags	= CTRL_CMD_HAS_BUF,
171 		.addr = (__u64) (uintptr_t) &dev->dev_info,
172 		.len = sizeof(struct ublksrv_ctrl_dev_info),
173 	};
174 
175 	return __ublk_ctrl_cmd(dev, &data);
176 }
177 
178 static int ublk_ctrl_set_params(struct ublk_dev *dev,
179 		struct ublk_params *params)
180 {
181 	struct ublk_ctrl_cmd_data data = {
182 		.cmd_op	= UBLK_U_CMD_SET_PARAMS,
183 		.flags	= CTRL_CMD_HAS_BUF,
184 		.addr = (__u64) (uintptr_t) params,
185 		.len = sizeof(*params),
186 	};
187 	params->len = sizeof(*params);
188 	return __ublk_ctrl_cmd(dev, &data);
189 }
190 
191 static int ublk_ctrl_get_params(struct ublk_dev *dev,
192 		struct ublk_params *params)
193 {
194 	struct ublk_ctrl_cmd_data data = {
195 		.cmd_op	= UBLK_U_CMD_GET_PARAMS,
196 		.flags	= CTRL_CMD_HAS_BUF,
197 		.addr = (__u64)params,
198 		.len = sizeof(*params),
199 	};
200 
201 	params->len = sizeof(*params);
202 
203 	return __ublk_ctrl_cmd(dev, &data);
204 }
205 
206 static int ublk_ctrl_get_features(struct ublk_dev *dev,
207 		__u64 *features)
208 {
209 	struct ublk_ctrl_cmd_data data = {
210 		.cmd_op	= UBLK_U_CMD_GET_FEATURES,
211 		.flags	= CTRL_CMD_HAS_BUF,
212 		.addr = (__u64) (uintptr_t) features,
213 		.len = sizeof(*features),
214 	};
215 
216 	return __ublk_ctrl_cmd(dev, &data);
217 }
218 
219 static int ublk_ctrl_update_size(struct ublk_dev *dev,
220 		__u64 nr_sects)
221 {
222 	struct ublk_ctrl_cmd_data data = {
223 		.cmd_op	= UBLK_U_CMD_UPDATE_SIZE,
224 		.flags	= CTRL_CMD_HAS_DATA,
225 	};
226 
227 	data.data[0] = nr_sects;
228 	return __ublk_ctrl_cmd(dev, &data);
229 }
230 
231 static int ublk_ctrl_quiesce_dev(struct ublk_dev *dev,
232 				 unsigned int timeout_ms)
233 {
234 	struct ublk_ctrl_cmd_data data = {
235 		.cmd_op	= UBLK_U_CMD_QUIESCE_DEV,
236 		.flags	= CTRL_CMD_HAS_DATA,
237 	};
238 
239 	data.data[0] = timeout_ms;
240 	return __ublk_ctrl_cmd(dev, &data);
241 }
242 
243 static const char *ublk_dev_state_desc(struct ublk_dev *dev)
244 {
245 	switch (dev->dev_info.state) {
246 	case UBLK_S_DEV_DEAD:
247 		return "DEAD";
248 	case UBLK_S_DEV_LIVE:
249 		return "LIVE";
250 	case UBLK_S_DEV_QUIESCED:
251 		return "QUIESCED";
252 	default:
253 		return "UNKNOWN";
254 	};
255 }
256 
257 static void ublk_print_cpu_set(const cpu_set_t *set, char *buf, unsigned len)
258 {
259 	unsigned done = 0;
260 	int i;
261 
262 	for (i = 0; i < CPU_SETSIZE; i++) {
263 		if (CPU_ISSET(i, set))
264 			done += snprintf(&buf[done], len - done, "%d ", i);
265 	}
266 }
267 
268 static void ublk_adjust_affinity(cpu_set_t *set)
269 {
270 	int j, updated = 0;
271 
272 	/*
273 	 * Just keep the 1st CPU now.
274 	 *
275 	 * In future, auto affinity selection can be tried.
276 	 */
277 	for (j = 0; j < CPU_SETSIZE; j++) {
278 		if (CPU_ISSET(j, set)) {
279 			if (!updated) {
280 				updated = 1;
281 				continue;
282 			}
283 			CPU_CLR(j, set);
284 		}
285 	}
286 }
287 
288 /* Caller must free the allocated buffer */
289 static int ublk_ctrl_get_affinity(struct ublk_dev *ctrl_dev, cpu_set_t **ptr_buf)
290 {
291 	struct ublk_ctrl_cmd_data data = {
292 		.cmd_op	= UBLK_U_CMD_GET_QUEUE_AFFINITY,
293 		.flags	= CTRL_CMD_HAS_DATA | CTRL_CMD_HAS_BUF,
294 	};
295 	cpu_set_t *buf;
296 	int i, ret;
297 
298 	buf = malloc(sizeof(cpu_set_t) * ctrl_dev->dev_info.nr_hw_queues);
299 	if (!buf)
300 		return -ENOMEM;
301 
302 	for (i = 0; i < ctrl_dev->dev_info.nr_hw_queues; i++) {
303 		data.data[0] = i;
304 		data.len = sizeof(cpu_set_t);
305 		data.addr = (__u64)&buf[i];
306 
307 		ret = __ublk_ctrl_cmd(ctrl_dev, &data);
308 		if (ret < 0) {
309 			free(buf);
310 			return ret;
311 		}
312 		ublk_adjust_affinity(&buf[i]);
313 	}
314 
315 	*ptr_buf = buf;
316 	return 0;
317 }
318 
319 static void ublk_ctrl_dump(struct ublk_dev *dev)
320 {
321 	struct ublksrv_ctrl_dev_info *info = &dev->dev_info;
322 	struct ublk_params p;
323 	cpu_set_t *affinity;
324 	int ret;
325 
326 	ret = ublk_ctrl_get_params(dev, &p);
327 	if (ret < 0) {
328 		ublk_err("failed to get params %d %s\n", ret, strerror(-ret));
329 		return;
330 	}
331 
332 	ret = ublk_ctrl_get_affinity(dev, &affinity);
333 	if (ret < 0) {
334 		ublk_err("failed to get affinity %m\n");
335 		return;
336 	}
337 
338 	ublk_log("dev id %d: nr_hw_queues %d queue_depth %d block size %d dev_capacity %lld\n",
339 			info->dev_id, info->nr_hw_queues, info->queue_depth,
340 			1 << p.basic.logical_bs_shift, p.basic.dev_sectors);
341 	ublk_log("\tmax rq size %d daemon pid %d flags 0x%llx state %s\n",
342 			info->max_io_buf_bytes, info->ublksrv_pid, info->flags,
343 			ublk_dev_state_desc(dev));
344 
345 	if (affinity) {
346 		char buf[512];
347 		int i;
348 
349 		for (i = 0; i < info->nr_hw_queues; i++) {
350 			ublk_print_cpu_set(&affinity[i], buf, sizeof(buf));
351 			printf("\tqueue %u: tid %d affinity(%s)\n",
352 					i, dev->q[i].tid, buf);
353 		}
354 		free(affinity);
355 	}
356 
357 	fflush(stdout);
358 }
359 
360 static void ublk_ctrl_deinit(struct ublk_dev *dev)
361 {
362 	close(dev->ctrl_fd);
363 	free(dev);
364 }
365 
366 static struct ublk_dev *ublk_ctrl_init(void)
367 {
368 	struct ublk_dev *dev = (struct ublk_dev *)calloc(1, sizeof(*dev));
369 	struct ublksrv_ctrl_dev_info *info = &dev->dev_info;
370 	int ret;
371 
372 	dev->ctrl_fd = open(CTRL_DEV, O_RDWR);
373 	if (dev->ctrl_fd < 0) {
374 		free(dev);
375 		return NULL;
376 	}
377 
378 	info->max_io_buf_bytes = UBLK_IO_MAX_BYTES;
379 
380 	ret = ublk_setup_ring(&dev->ring, UBLK_CTRL_RING_DEPTH,
381 			UBLK_CTRL_RING_DEPTH, IORING_SETUP_SQE128);
382 	if (ret < 0) {
383 		ublk_err("queue_init: %s\n", strerror(-ret));
384 		free(dev);
385 		return NULL;
386 	}
387 	dev->nr_fds = 1;
388 
389 	return dev;
390 }
391 
392 static int __ublk_queue_cmd_buf_sz(unsigned depth)
393 {
394 	int size =  depth * sizeof(struct ublksrv_io_desc);
395 	unsigned int page_sz = getpagesize();
396 
397 	return round_up(size, page_sz);
398 }
399 
400 static int ublk_queue_max_cmd_buf_sz(void)
401 {
402 	return __ublk_queue_cmd_buf_sz(UBLK_MAX_QUEUE_DEPTH);
403 }
404 
405 static int ublk_queue_cmd_buf_sz(struct ublk_queue *q)
406 {
407 	return __ublk_queue_cmd_buf_sz(q->q_depth);
408 }
409 
410 static void ublk_queue_deinit(struct ublk_queue *q)
411 {
412 	int i;
413 	int nr_ios = q->q_depth;
414 
415 	io_uring_unregister_buffers(&q->ring);
416 
417 	io_uring_unregister_ring_fd(&q->ring);
418 
419 	if (q->ring.ring_fd > 0) {
420 		io_uring_unregister_files(&q->ring);
421 		close(q->ring.ring_fd);
422 		q->ring.ring_fd = -1;
423 	}
424 
425 	if (q->io_cmd_buf)
426 		munmap(q->io_cmd_buf, ublk_queue_cmd_buf_sz(q));
427 
428 	for (i = 0; i < nr_ios; i++)
429 		free(q->ios[i].buf_addr);
430 }
431 
432 static int ublk_queue_init(struct ublk_queue *q, unsigned extra_flags)
433 {
434 	struct ublk_dev *dev = q->dev;
435 	int depth = dev->dev_info.queue_depth;
436 	int i, ret = -1;
437 	int cmd_buf_size, io_buf_size;
438 	unsigned long off;
439 	int ring_depth = dev->tgt.sq_depth, cq_depth = dev->tgt.cq_depth;
440 
441 	q->tgt_ops = dev->tgt.ops;
442 	q->state = 0;
443 	q->q_depth = depth;
444 	q->cmd_inflight = 0;
445 	q->tid = gettid();
446 
447 	if (dev->dev_info.flags & (UBLK_F_SUPPORT_ZERO_COPY | UBLK_F_AUTO_BUF_REG)) {
448 		q->state |= UBLKSRV_NO_BUF;
449 		if (dev->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY)
450 			q->state |= UBLKSRV_ZC;
451 		if (dev->dev_info.flags & UBLK_F_AUTO_BUF_REG)
452 			q->state |= UBLKSRV_AUTO_BUF_REG;
453 	}
454 	q->state |= extra_flags;
455 
456 	cmd_buf_size = ublk_queue_cmd_buf_sz(q);
457 	off = UBLKSRV_CMD_BUF_OFFSET + q->q_id * ublk_queue_max_cmd_buf_sz();
458 	q->io_cmd_buf = mmap(0, cmd_buf_size, PROT_READ,
459 			MAP_SHARED | MAP_POPULATE, dev->fds[0], off);
460 	if (q->io_cmd_buf == MAP_FAILED) {
461 		ublk_err("ublk dev %d queue %d map io_cmd_buf failed %m\n",
462 				q->dev->dev_info.dev_id, q->q_id);
463 		goto fail;
464 	}
465 
466 	io_buf_size = dev->dev_info.max_io_buf_bytes;
467 	for (i = 0; i < q->q_depth; i++) {
468 		q->ios[i].buf_addr = NULL;
469 		q->ios[i].flags = UBLKSRV_NEED_FETCH_RQ | UBLKSRV_IO_FREE;
470 
471 		if (q->state & UBLKSRV_NO_BUF)
472 			continue;
473 
474 		if (posix_memalign((void **)&q->ios[i].buf_addr,
475 					getpagesize(), io_buf_size)) {
476 			ublk_err("ublk dev %d queue %d io %d posix_memalign failed %m\n",
477 					dev->dev_info.dev_id, q->q_id, i);
478 			goto fail;
479 		}
480 	}
481 
482 	ret = ublk_setup_ring(&q->ring, ring_depth, cq_depth,
483 			IORING_SETUP_COOP_TASKRUN |
484 			IORING_SETUP_SINGLE_ISSUER |
485 			IORING_SETUP_DEFER_TASKRUN);
486 	if (ret < 0) {
487 		ublk_err("ublk dev %d queue %d setup io_uring failed %d\n",
488 				q->dev->dev_info.dev_id, q->q_id, ret);
489 		goto fail;
490 	}
491 
492 	if (dev->dev_info.flags & (UBLK_F_SUPPORT_ZERO_COPY | UBLK_F_AUTO_BUF_REG)) {
493 		ret = io_uring_register_buffers_sparse(&q->ring, q->q_depth);
494 		if (ret) {
495 			ublk_err("ublk dev %d queue %d register spare buffers failed %d",
496 					dev->dev_info.dev_id, q->q_id, ret);
497 			goto fail;
498 		}
499 	}
500 
501 	io_uring_register_ring_fd(&q->ring);
502 
503 	ret = io_uring_register_files(&q->ring, dev->fds, dev->nr_fds);
504 	if (ret) {
505 		ublk_err("ublk dev %d queue %d register files failed %d\n",
506 				q->dev->dev_info.dev_id, q->q_id, ret);
507 		goto fail;
508 	}
509 
510 	return 0;
511  fail:
512 	ublk_queue_deinit(q);
513 	ublk_err("ublk dev %d queue %d failed\n",
514 			dev->dev_info.dev_id, q->q_id);
515 	return -ENOMEM;
516 }
517 
518 #define WAIT_USEC 	100000
519 #define MAX_WAIT_USEC 	(3 * 1000000)
520 static int ublk_dev_prep(const struct dev_ctx *ctx, struct ublk_dev *dev)
521 {
522 	int dev_id = dev->dev_info.dev_id;
523 	unsigned int wait_usec = 0;
524 	int ret = 0, fd = -1;
525 	char buf[64];
526 
527 	snprintf(buf, 64, "%s%d", UBLKC_DEV, dev_id);
528 
529 	while (wait_usec < MAX_WAIT_USEC) {
530 		fd = open(buf, O_RDWR);
531 		if (fd >= 0)
532 			break;
533 		usleep(WAIT_USEC);
534 		wait_usec += WAIT_USEC;
535 	}
536 	if (fd < 0) {
537 		ublk_err("can't open %s %s\n", buf, strerror(errno));
538 		return -1;
539 	}
540 
541 	dev->fds[0] = fd;
542 	if (dev->tgt.ops->init_tgt)
543 		ret = dev->tgt.ops->init_tgt(ctx, dev);
544 	if (ret)
545 		close(dev->fds[0]);
546 	return ret;
547 }
548 
549 static void ublk_dev_unprep(struct ublk_dev *dev)
550 {
551 	if (dev->tgt.ops->deinit_tgt)
552 		dev->tgt.ops->deinit_tgt(dev);
553 	close(dev->fds[0]);
554 }
555 
556 static void ublk_set_auto_buf_reg(const struct ublk_queue *q,
557 				  struct io_uring_sqe *sqe,
558 				  unsigned short tag)
559 {
560 	struct ublk_auto_buf_reg buf = {};
561 
562 	if (q->tgt_ops->buf_index)
563 		buf.index = q->tgt_ops->buf_index(q, tag);
564 	else
565 		buf.index = tag;
566 
567 	if (q->state & UBLKSRV_AUTO_BUF_REG_FALLBACK)
568 		buf.flags = UBLK_AUTO_BUF_REG_FALLBACK;
569 
570 	sqe->addr = ublk_auto_buf_reg_to_sqe_addr(&buf);
571 }
572 
573 int ublk_queue_io_cmd(struct ublk_queue *q, struct ublk_io *io, unsigned tag)
574 {
575 	struct ublksrv_io_cmd *cmd;
576 	struct io_uring_sqe *sqe[1];
577 	unsigned int cmd_op = 0;
578 	__u64 user_data;
579 
580 	/* only freed io can be issued */
581 	if (!(io->flags & UBLKSRV_IO_FREE))
582 		return 0;
583 
584 	/*
585 	 * we issue because we need either fetching or committing or
586 	 * getting data
587 	 */
588 	if (!(io->flags &
589 		(UBLKSRV_NEED_FETCH_RQ | UBLKSRV_NEED_COMMIT_RQ_COMP | UBLKSRV_NEED_GET_DATA)))
590 		return 0;
591 
592 	if (io->flags & UBLKSRV_NEED_GET_DATA)
593 		cmd_op = UBLK_U_IO_NEED_GET_DATA;
594 	else if (io->flags & UBLKSRV_NEED_COMMIT_RQ_COMP)
595 		cmd_op = UBLK_U_IO_COMMIT_AND_FETCH_REQ;
596 	else if (io->flags & UBLKSRV_NEED_FETCH_RQ)
597 		cmd_op = UBLK_U_IO_FETCH_REQ;
598 
599 	if (io_uring_sq_space_left(&q->ring) < 1)
600 		io_uring_submit(&q->ring);
601 
602 	ublk_queue_alloc_sqes(q, sqe, 1);
603 	if (!sqe[0]) {
604 		ublk_err("%s: run out of sqe %d, tag %d\n",
605 				__func__, q->q_id, tag);
606 		return -1;
607 	}
608 
609 	cmd = (struct ublksrv_io_cmd *)ublk_get_sqe_cmd(sqe[0]);
610 
611 	if (cmd_op == UBLK_U_IO_COMMIT_AND_FETCH_REQ)
612 		cmd->result = io->result;
613 
614 	/* These fields should be written once, never change */
615 	ublk_set_sqe_cmd_op(sqe[0], cmd_op);
616 	sqe[0]->fd		= 0;	/* dev->fds[0] */
617 	sqe[0]->opcode	= IORING_OP_URING_CMD;
618 	sqe[0]->flags	= IOSQE_FIXED_FILE;
619 	sqe[0]->rw_flags	= 0;
620 	cmd->tag	= tag;
621 	cmd->q_id	= q->q_id;
622 	if (!(q->state & UBLKSRV_NO_BUF))
623 		cmd->addr	= (__u64) (uintptr_t) io->buf_addr;
624 	else
625 		cmd->addr	= 0;
626 
627 	if (q->state & UBLKSRV_AUTO_BUF_REG)
628 		ublk_set_auto_buf_reg(q, sqe[0], tag);
629 
630 	user_data = build_user_data(tag, _IOC_NR(cmd_op), 0, 0);
631 	io_uring_sqe_set_data64(sqe[0], user_data);
632 
633 	io->flags = 0;
634 
635 	q->cmd_inflight += 1;
636 
637 	ublk_dbg(UBLK_DBG_IO_CMD, "%s: (qid %d tag %u cmd_op %u) iof %x stopping %d\n",
638 			__func__, q->q_id, tag, cmd_op,
639 			io->flags, !!(q->state & UBLKSRV_QUEUE_STOPPING));
640 	return 1;
641 }
642 
643 static void ublk_submit_fetch_commands(struct ublk_queue *q)
644 {
645 	int i = 0;
646 
647 	for (i = 0; i < q->q_depth; i++)
648 		ublk_queue_io_cmd(q, &q->ios[i], i);
649 }
650 
651 static int ublk_queue_is_idle(struct ublk_queue *q)
652 {
653 	return !io_uring_sq_ready(&q->ring) && !q->io_inflight;
654 }
655 
656 static int ublk_queue_is_done(struct ublk_queue *q)
657 {
658 	return (q->state & UBLKSRV_QUEUE_STOPPING) && ublk_queue_is_idle(q);
659 }
660 
661 static inline void ublksrv_handle_tgt_cqe(struct ublk_queue *q,
662 		struct io_uring_cqe *cqe)
663 {
664 	unsigned tag = user_data_to_tag(cqe->user_data);
665 
666 	if (cqe->res < 0 && cqe->res != -EAGAIN)
667 		ublk_err("%s: failed tgt io: res %d qid %u tag %u, cmd_op %u\n",
668 			__func__, cqe->res, q->q_id,
669 			user_data_to_tag(cqe->user_data),
670 			user_data_to_op(cqe->user_data));
671 
672 	if (q->tgt_ops->tgt_io_done)
673 		q->tgt_ops->tgt_io_done(q, tag, cqe);
674 }
675 
676 static void ublk_handle_cqe(struct io_uring *r,
677 		struct io_uring_cqe *cqe, void *data)
678 {
679 	struct ublk_queue *q = container_of(r, struct ublk_queue, ring);
680 	unsigned tag = user_data_to_tag(cqe->user_data);
681 	unsigned cmd_op = user_data_to_op(cqe->user_data);
682 	int fetch = (cqe->res != UBLK_IO_RES_ABORT) &&
683 		!(q->state & UBLKSRV_QUEUE_STOPPING);
684 	struct ublk_io *io;
685 
686 	if (cqe->res < 0 && cqe->res != -ENODEV)
687 		ublk_err("%s: res %d userdata %llx queue state %x\n", __func__,
688 				cqe->res, cqe->user_data, q->state);
689 
690 	ublk_dbg(UBLK_DBG_IO_CMD, "%s: res %d (qid %d tag %u cmd_op %u target %d/%d) stopping %d\n",
691 			__func__, cqe->res, q->q_id, tag, cmd_op,
692 			is_target_io(cqe->user_data),
693 			user_data_to_tgt_data(cqe->user_data),
694 			(q->state & UBLKSRV_QUEUE_STOPPING));
695 
696 	/* Don't retrieve io in case of target io */
697 	if (is_target_io(cqe->user_data)) {
698 		ublksrv_handle_tgt_cqe(q, cqe);
699 		return;
700 	}
701 
702 	io = &q->ios[tag];
703 	q->cmd_inflight--;
704 
705 	if (!fetch) {
706 		q->state |= UBLKSRV_QUEUE_STOPPING;
707 		io->flags &= ~UBLKSRV_NEED_FETCH_RQ;
708 	}
709 
710 	if (cqe->res == UBLK_IO_RES_OK) {
711 		assert(tag < q->q_depth);
712 		if (q->tgt_ops->queue_io)
713 			q->tgt_ops->queue_io(q, tag);
714 	} else if (cqe->res == UBLK_IO_RES_NEED_GET_DATA) {
715 		io->flags |= UBLKSRV_NEED_GET_DATA | UBLKSRV_IO_FREE;
716 		ublk_queue_io_cmd(q, io, tag);
717 	} else {
718 		/*
719 		 * COMMIT_REQ will be completed immediately since no fetching
720 		 * piggyback is required.
721 		 *
722 		 * Marking IO_FREE only, then this io won't be issued since
723 		 * we only issue io with (UBLKSRV_IO_FREE | UBLKSRV_NEED_*)
724 		 *
725 		 * */
726 		io->flags = UBLKSRV_IO_FREE;
727 	}
728 }
729 
730 static int ublk_reap_events_uring(struct io_uring *r)
731 {
732 	struct io_uring_cqe *cqe;
733 	unsigned head;
734 	int count = 0;
735 
736 	io_uring_for_each_cqe(r, head, cqe) {
737 		ublk_handle_cqe(r, cqe, NULL);
738 		count += 1;
739 	}
740 	io_uring_cq_advance(r, count);
741 
742 	return count;
743 }
744 
745 static int ublk_process_io(struct ublk_queue *q)
746 {
747 	int ret, reapped;
748 
749 	ublk_dbg(UBLK_DBG_QUEUE, "dev%d-q%d: to_submit %d inflight cmd %u stopping %d\n",
750 				q->dev->dev_info.dev_id,
751 				q->q_id, io_uring_sq_ready(&q->ring),
752 				q->cmd_inflight,
753 				(q->state & UBLKSRV_QUEUE_STOPPING));
754 
755 	if (ublk_queue_is_done(q))
756 		return -ENODEV;
757 
758 	ret = io_uring_submit_and_wait(&q->ring, 1);
759 	reapped = ublk_reap_events_uring(&q->ring);
760 
761 	ublk_dbg(UBLK_DBG_QUEUE, "submit result %d, reapped %d stop %d idle %d\n",
762 			ret, reapped, (q->state & UBLKSRV_QUEUE_STOPPING),
763 			(q->state & UBLKSRV_QUEUE_IDLE));
764 
765 	return reapped;
766 }
767 
768 static void ublk_queue_set_sched_affinity(const struct ublk_queue *q,
769 		cpu_set_t *cpuset)
770 {
771         if (sched_setaffinity(0, sizeof(*cpuset), cpuset) < 0)
772                 ublk_err("ublk dev %u queue %u set affinity failed",
773                                 q->dev->dev_info.dev_id, q->q_id);
774 }
775 
776 struct ublk_queue_info {
777 	struct ublk_queue 	*q;
778 	sem_t 			*queue_sem;
779 	cpu_set_t 		*affinity;
780 	unsigned char 		auto_zc_fallback;
781 };
782 
783 static void *ublk_io_handler_fn(void *data)
784 {
785 	struct ublk_queue_info *info = data;
786 	struct ublk_queue *q = info->q;
787 	int dev_id = q->dev->dev_info.dev_id;
788 	unsigned extra_flags = 0;
789 	int ret;
790 
791 	if (info->auto_zc_fallback)
792 		extra_flags = UBLKSRV_AUTO_BUF_REG_FALLBACK;
793 
794 	ret = ublk_queue_init(q, extra_flags);
795 	if (ret) {
796 		ublk_err("ublk dev %d queue %d init queue failed\n",
797 				dev_id, q->q_id);
798 		return NULL;
799 	}
800 	/* IO perf is sensitive with queue pthread affinity on NUMA machine*/
801 	ublk_queue_set_sched_affinity(q, info->affinity);
802 	sem_post(info->queue_sem);
803 
804 	ublk_dbg(UBLK_DBG_QUEUE, "tid %d: ublk dev %d queue %d started\n",
805 			q->tid, dev_id, q->q_id);
806 
807 	/* submit all io commands to ublk driver */
808 	ublk_submit_fetch_commands(q);
809 	do {
810 		if (ublk_process_io(q) < 0)
811 			break;
812 	} while (1);
813 
814 	ublk_dbg(UBLK_DBG_QUEUE, "ublk dev %d queue %d exited\n", dev_id, q->q_id);
815 	ublk_queue_deinit(q);
816 	return NULL;
817 }
818 
819 static void ublk_set_parameters(struct ublk_dev *dev)
820 {
821 	int ret;
822 
823 	ret = ublk_ctrl_set_params(dev, &dev->tgt.params);
824 	if (ret)
825 		ublk_err("dev %d set basic parameter failed %d\n",
826 				dev->dev_info.dev_id, ret);
827 }
828 
829 static int ublk_send_dev_event(const struct dev_ctx *ctx, struct ublk_dev *dev, int dev_id)
830 {
831 	uint64_t id;
832 	int evtfd = ctx->_evtfd;
833 
834 	if (evtfd < 0)
835 		return -EBADF;
836 
837 	if (dev_id >= 0)
838 		id = dev_id + 1;
839 	else
840 		id = ERROR_EVTFD_DEVID;
841 
842 	if (dev && ctx->shadow_dev)
843 		memcpy(&ctx->shadow_dev->q, &dev->q, sizeof(dev->q));
844 
845 	if (write(evtfd, &id, sizeof(id)) != sizeof(id))
846 		return -EINVAL;
847 
848 	close(evtfd);
849 	shmdt(ctx->shadow_dev);
850 
851 	return 0;
852 }
853 
854 
855 static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev)
856 {
857 	const struct ublksrv_ctrl_dev_info *dinfo = &dev->dev_info;
858 	struct ublk_queue_info *qinfo;
859 	cpu_set_t *affinity_buf;
860 	void *thread_ret;
861 	sem_t queue_sem;
862 	int ret, i;
863 
864 	ublk_dbg(UBLK_DBG_DEV, "%s enter\n", __func__);
865 
866 	qinfo = (struct ublk_queue_info *)calloc(sizeof(struct ublk_queue_info),
867 			dinfo->nr_hw_queues);
868 	if (!qinfo)
869 		return -ENOMEM;
870 
871 	sem_init(&queue_sem, 0, 0);
872 	ret = ublk_dev_prep(ctx, dev);
873 	if (ret)
874 		return ret;
875 
876 	ret = ublk_ctrl_get_affinity(dev, &affinity_buf);
877 	if (ret)
878 		return ret;
879 
880 	for (i = 0; i < dinfo->nr_hw_queues; i++) {
881 		dev->q[i].dev = dev;
882 		dev->q[i].q_id = i;
883 
884 		qinfo[i].q = &dev->q[i];
885 		qinfo[i].queue_sem = &queue_sem;
886 		qinfo[i].affinity = &affinity_buf[i];
887 		qinfo[i].auto_zc_fallback = ctx->auto_zc_fallback;
888 		pthread_create(&dev->q[i].thread, NULL,
889 				ublk_io_handler_fn,
890 				&qinfo[i]);
891 	}
892 
893 	for (i = 0; i < dinfo->nr_hw_queues; i++)
894 		sem_wait(&queue_sem);
895 	free(qinfo);
896 	free(affinity_buf);
897 
898 	/* everything is fine now, start us */
899 	if (ctx->recovery)
900 		ret = ublk_ctrl_end_user_recovery(dev, getpid());
901 	else {
902 		ublk_set_parameters(dev);
903 		ret = ublk_ctrl_start_dev(dev, getpid());
904 	}
905 	if (ret < 0) {
906 		ublk_err("%s: ublk_ctrl_start_dev failed: %d\n", __func__, ret);
907 		goto fail;
908 	}
909 
910 	ublk_ctrl_get_info(dev);
911 	if (ctx->fg)
912 		ublk_ctrl_dump(dev);
913 	else
914 		ublk_send_dev_event(ctx, dev, dev->dev_info.dev_id);
915 
916 	/* wait until we are terminated */
917 	for (i = 0; i < dinfo->nr_hw_queues; i++)
918 		pthread_join(dev->q[i].thread, &thread_ret);
919  fail:
920 	ublk_dev_unprep(dev);
921 	ublk_dbg(UBLK_DBG_DEV, "%s exit\n", __func__);
922 
923 	return ret;
924 }
925 
926 static int wait_ublk_dev(const char *path, int evt_mask, unsigned timeout)
927 {
928 #define EV_SIZE (sizeof(struct inotify_event))
929 #define EV_BUF_LEN (128 * (EV_SIZE + 16))
930 	struct pollfd pfd;
931 	int fd, wd;
932 	int ret = -EINVAL;
933 	const char *dev_name = basename(path);
934 
935 	fd = inotify_init();
936 	if (fd < 0) {
937 		ublk_dbg(UBLK_DBG_DEV, "%s: inotify init failed\n", __func__);
938 		return fd;
939 	}
940 
941 	wd = inotify_add_watch(fd, "/dev", evt_mask);
942 	if (wd == -1) {
943 		ublk_dbg(UBLK_DBG_DEV, "%s: add watch for /dev failed\n", __func__);
944 		goto fail;
945 	}
946 
947 	pfd.fd = fd;
948 	pfd.events = POLL_IN;
949 	while (1) {
950 		int i = 0;
951 		char buffer[EV_BUF_LEN];
952 		ret = poll(&pfd, 1, 1000 * timeout);
953 
954 		if (ret == -1) {
955 			ublk_err("%s: poll inotify failed: %d\n", __func__, ret);
956 			goto rm_watch;
957 		} else if (ret == 0) {
958 			ublk_err("%s: poll inotify timeout\n", __func__);
959 			ret = -ETIMEDOUT;
960 			goto rm_watch;
961 		}
962 
963 		ret = read(fd, buffer, EV_BUF_LEN);
964 		if (ret < 0) {
965 			ublk_err("%s: read inotify fd failed\n", __func__);
966 			goto rm_watch;
967 		}
968 
969 		while (i < ret) {
970 			struct inotify_event *event = (struct inotify_event *)&buffer[i];
971 
972 			ublk_dbg(UBLK_DBG_DEV, "%s: inotify event %x %s\n",
973 					__func__, event->mask, event->name);
974 			if (event->mask & evt_mask) {
975 				if (!strcmp(event->name, dev_name)) {
976 					ret = 0;
977 					goto rm_watch;
978 				}
979 			}
980 			i += EV_SIZE + event->len;
981 		}
982 	}
983 rm_watch:
984 	inotify_rm_watch(fd, wd);
985 fail:
986 	close(fd);
987 	return ret;
988 }
989 
990 static int ublk_stop_io_daemon(const struct ublk_dev *dev)
991 {
992 	int daemon_pid = dev->dev_info.ublksrv_pid;
993 	int dev_id = dev->dev_info.dev_id;
994 	char ublkc[64];
995 	int ret = 0;
996 
997 	if (daemon_pid < 0)
998 		return 0;
999 
1000 	/* daemon may be dead already */
1001 	if (kill(daemon_pid, 0) < 0)
1002 		goto wait;
1003 
1004 	snprintf(ublkc, sizeof(ublkc), "/dev/%s%d", "ublkc", dev_id);
1005 
1006 	/* ublk char device may be gone already */
1007 	if (access(ublkc, F_OK) != 0)
1008 		goto wait;
1009 
1010 	/* Wait until ublk char device is closed, when the daemon is shutdown */
1011 	ret = wait_ublk_dev(ublkc, IN_CLOSE, 10);
1012 	/* double check and since it may be closed before starting inotify */
1013 	if (ret == -ETIMEDOUT)
1014 		ret = kill(daemon_pid, 0) < 0;
1015 wait:
1016 	waitpid(daemon_pid, NULL, 0);
1017 	ublk_dbg(UBLK_DBG_DEV, "%s: pid %d dev_id %d ret %d\n",
1018 			__func__, daemon_pid, dev_id, ret);
1019 
1020 	return ret;
1021 }
1022 
1023 static int __cmd_dev_add(const struct dev_ctx *ctx)
1024 {
1025 	unsigned nr_queues = ctx->nr_hw_queues;
1026 	const char *tgt_type = ctx->tgt_type;
1027 	unsigned depth = ctx->queue_depth;
1028 	__u64 features;
1029 	const struct ublk_tgt_ops *ops;
1030 	struct ublksrv_ctrl_dev_info *info;
1031 	struct ublk_dev *dev;
1032 	int dev_id = ctx->dev_id;
1033 	int ret, i;
1034 
1035 	ops = ublk_find_tgt(tgt_type);
1036 	if (!ops) {
1037 		ublk_err("%s: no such tgt type, type %s\n",
1038 				__func__, tgt_type);
1039 		return -ENODEV;
1040 	}
1041 
1042 	if (nr_queues > UBLK_MAX_QUEUES || depth > UBLK_QUEUE_DEPTH) {
1043 		ublk_err("%s: invalid nr_queues or depth queues %u depth %u\n",
1044 				__func__, nr_queues, depth);
1045 		return -EINVAL;
1046 	}
1047 
1048 	dev = ublk_ctrl_init();
1049 	if (!dev) {
1050 		ublk_err("%s: can't alloc dev id %d, type %s\n",
1051 				__func__, dev_id, tgt_type);
1052 		return -ENOMEM;
1053 	}
1054 
1055 	/* kernel doesn't support get_features */
1056 	ret = ublk_ctrl_get_features(dev, &features);
1057 	if (ret < 0)
1058 		return -EINVAL;
1059 
1060 	if (!(features & UBLK_F_CMD_IOCTL_ENCODE))
1061 		return -ENOTSUP;
1062 
1063 	info = &dev->dev_info;
1064 	info->dev_id = ctx->dev_id;
1065 	info->nr_hw_queues = nr_queues;
1066 	info->queue_depth = depth;
1067 	info->flags = ctx->flags;
1068 	if ((features & UBLK_F_QUIESCE) &&
1069 			(info->flags & UBLK_F_USER_RECOVERY))
1070 		info->flags |= UBLK_F_QUIESCE;
1071 	dev->tgt.ops = ops;
1072 	dev->tgt.sq_depth = depth;
1073 	dev->tgt.cq_depth = depth;
1074 
1075 	for (i = 0; i < MAX_BACK_FILES; i++) {
1076 		if (ctx->files[i]) {
1077 			strcpy(dev->tgt.backing_file[i], ctx->files[i]);
1078 			dev->tgt.nr_backing_files++;
1079 		}
1080 	}
1081 
1082 	if (ctx->recovery)
1083 		ret = ublk_ctrl_start_user_recovery(dev);
1084 	else
1085 		ret = ublk_ctrl_add_dev(dev);
1086 	if (ret < 0) {
1087 		ublk_err("%s: can't add dev id %d, type %s ret %d\n",
1088 				__func__, dev_id, tgt_type, ret);
1089 		goto fail;
1090 	}
1091 
1092 	ret = ublk_start_daemon(ctx, dev);
1093 	ublk_dbg(UBLK_DBG_DEV, "%s: daemon exit %d\b", ret);
1094 	if (ret < 0)
1095 		ublk_ctrl_del_dev(dev);
1096 
1097 fail:
1098 	if (ret < 0)
1099 		ublk_send_dev_event(ctx, dev, -1);
1100 	ublk_ctrl_deinit(dev);
1101 	return ret;
1102 }
1103 
1104 static int __cmd_dev_list(struct dev_ctx *ctx);
1105 
1106 static int cmd_dev_add(struct dev_ctx *ctx)
1107 {
1108 	int res;
1109 
1110 	if (ctx->fg)
1111 		goto run;
1112 
1113 	ctx->_shmid = shmget(IPC_PRIVATE, sizeof(struct ublk_dev), IPC_CREAT | 0666);
1114 	if (ctx->_shmid < 0) {
1115 		ublk_err("%s: failed to shmget %s\n", __func__, strerror(errno));
1116 		exit(-1);
1117 	}
1118 	ctx->shadow_dev = (struct ublk_dev *)shmat(ctx->_shmid, NULL, 0);
1119 	if (ctx->shadow_dev == (struct ublk_dev *)-1) {
1120 		ublk_err("%s: failed to shmat %s\n", __func__, strerror(errno));
1121 		exit(-1);
1122 	}
1123 	ctx->_evtfd = eventfd(0, 0);
1124 	if (ctx->_evtfd < 0) {
1125 		ublk_err("%s: failed to create eventfd %s\n", __func__, strerror(errno));
1126 		exit(-1);
1127 	}
1128 
1129 	res = fork();
1130 	if (res == 0) {
1131 		int res2;
1132 
1133 		setsid();
1134 		res2 = fork();
1135 		if (res2 == 0) {
1136 			/* prepare for detaching */
1137 			close(STDIN_FILENO);
1138 			close(STDOUT_FILENO);
1139 			close(STDERR_FILENO);
1140 run:
1141 			res = __cmd_dev_add(ctx);
1142 			return res;
1143 		} else {
1144 			/* detached from the foreground task */
1145 			exit(EXIT_SUCCESS);
1146 		}
1147 	} else if (res > 0) {
1148 		uint64_t id;
1149 		int exit_code = EXIT_FAILURE;
1150 
1151 		res = read(ctx->_evtfd, &id, sizeof(id));
1152 		close(ctx->_evtfd);
1153 		if (res == sizeof(id) && id != ERROR_EVTFD_DEVID) {
1154 			ctx->dev_id = id - 1;
1155 			if (__cmd_dev_list(ctx) >= 0)
1156 				exit_code = EXIT_SUCCESS;
1157 		}
1158 		shmdt(ctx->shadow_dev);
1159 		shmctl(ctx->_shmid, IPC_RMID, NULL);
1160 		/* wait for child and detach from it */
1161 		wait(NULL);
1162 		exit(exit_code);
1163 	} else {
1164 		exit(EXIT_FAILURE);
1165 	}
1166 }
1167 
1168 static int __cmd_dev_del(struct dev_ctx *ctx)
1169 {
1170 	int number = ctx->dev_id;
1171 	struct ublk_dev *dev;
1172 	int ret;
1173 
1174 	dev = ublk_ctrl_init();
1175 	dev->dev_info.dev_id = number;
1176 
1177 	ret = ublk_ctrl_get_info(dev);
1178 	if (ret < 0)
1179 		goto fail;
1180 
1181 	ret = ublk_ctrl_stop_dev(dev);
1182 	if (ret < 0)
1183 		ublk_err("%s: stop dev %d failed ret %d\n", __func__, number, ret);
1184 
1185 	ret = ublk_stop_io_daemon(dev);
1186 	if (ret < 0)
1187 		ublk_err("%s: stop daemon id %d dev %d, ret %d\n",
1188 				__func__, dev->dev_info.ublksrv_pid, number, ret);
1189 	ublk_ctrl_del_dev(dev);
1190 fail:
1191 	ublk_ctrl_deinit(dev);
1192 
1193 	return (ret >= 0) ? 0 : ret;
1194 }
1195 
1196 static int cmd_dev_del(struct dev_ctx *ctx)
1197 {
1198 	int i;
1199 
1200 	if (ctx->dev_id >= 0 || !ctx->all)
1201 		return __cmd_dev_del(ctx);
1202 
1203 	for (i = 0; i < 255; i++) {
1204 		ctx->dev_id = i;
1205 		__cmd_dev_del(ctx);
1206 	}
1207 	return 0;
1208 }
1209 
1210 static int __cmd_dev_list(struct dev_ctx *ctx)
1211 {
1212 	struct ublk_dev *dev = ublk_ctrl_init();
1213 	int ret;
1214 
1215 	if (!dev)
1216 		return -ENODEV;
1217 
1218 	dev->dev_info.dev_id = ctx->dev_id;
1219 
1220 	ret = ublk_ctrl_get_info(dev);
1221 	if (ret < 0) {
1222 		if (ctx->logging)
1223 			ublk_err("%s: can't get dev info from %d: %d\n",
1224 					__func__, ctx->dev_id, ret);
1225 	} else {
1226 		if (ctx->shadow_dev)
1227 			memcpy(&dev->q, ctx->shadow_dev->q, sizeof(dev->q));
1228 
1229 		ublk_ctrl_dump(dev);
1230 	}
1231 
1232 	ublk_ctrl_deinit(dev);
1233 
1234 	return ret;
1235 }
1236 
1237 static int cmd_dev_list(struct dev_ctx *ctx)
1238 {
1239 	int i;
1240 
1241 	if (ctx->dev_id >= 0 || !ctx->all)
1242 		return __cmd_dev_list(ctx);
1243 
1244 	ctx->logging = false;
1245 	for (i = 0; i < 255; i++) {
1246 		ctx->dev_id = i;
1247 		__cmd_dev_list(ctx);
1248 	}
1249 	return 0;
1250 }
1251 
1252 static int cmd_dev_get_features(void)
1253 {
1254 #define const_ilog2(x) (63 - __builtin_clzll(x))
1255 	static const char *feat_map[] = {
1256 		[const_ilog2(UBLK_F_SUPPORT_ZERO_COPY)] = "ZERO_COPY",
1257 		[const_ilog2(UBLK_F_URING_CMD_COMP_IN_TASK)] = "COMP_IN_TASK",
1258 		[const_ilog2(UBLK_F_NEED_GET_DATA)] = "GET_DATA",
1259 		[const_ilog2(UBLK_F_USER_RECOVERY)] = "USER_RECOVERY",
1260 		[const_ilog2(UBLK_F_USER_RECOVERY_REISSUE)] = "RECOVERY_REISSUE",
1261 		[const_ilog2(UBLK_F_UNPRIVILEGED_DEV)] = "UNPRIVILEGED_DEV",
1262 		[const_ilog2(UBLK_F_CMD_IOCTL_ENCODE)] = "CMD_IOCTL_ENCODE",
1263 		[const_ilog2(UBLK_F_USER_COPY)] = "USER_COPY",
1264 		[const_ilog2(UBLK_F_ZONED)] = "ZONED",
1265 		[const_ilog2(UBLK_F_USER_RECOVERY_FAIL_IO)] = "RECOVERY_FAIL_IO",
1266 		[const_ilog2(UBLK_F_UPDATE_SIZE)] = "UPDATE_SIZE",
1267 		[const_ilog2(UBLK_F_AUTO_BUF_REG)] = "AUTO_BUF_REG",
1268 		[const_ilog2(UBLK_F_QUIESCE)] = "QUIESCE",
1269 	};
1270 	struct ublk_dev *dev;
1271 	__u64 features = 0;
1272 	int ret;
1273 
1274 	dev = ublk_ctrl_init();
1275 	if (!dev) {
1276 		fprintf(stderr, "ublksrv_ctrl_init failed id\n");
1277 		return -EOPNOTSUPP;
1278 	}
1279 
1280 	ret = ublk_ctrl_get_features(dev, &features);
1281 	if (!ret) {
1282 		int i;
1283 
1284 		printf("ublk_drv features: 0x%llx\n", features);
1285 
1286 		for (i = 0; i < sizeof(features) * 8; i++) {
1287 			const char *feat;
1288 
1289 			if (!((1ULL << i)  & features))
1290 				continue;
1291 			if (i < sizeof(feat_map) / sizeof(feat_map[0]))
1292 				feat = feat_map[i];
1293 			else
1294 				feat = "unknown";
1295 			printf("\t%-20s: 0x%llx\n", feat, 1ULL << i);
1296 		}
1297 	}
1298 
1299 	return ret;
1300 }
1301 
1302 static int cmd_dev_update_size(struct dev_ctx *ctx)
1303 {
1304 	struct ublk_dev *dev = ublk_ctrl_init();
1305 	struct ublk_params p;
1306 	int ret = -EINVAL;
1307 
1308 	if (!dev)
1309 		return -ENODEV;
1310 
1311 	if (ctx->dev_id < 0) {
1312 		fprintf(stderr, "device id isn't provided\n");
1313 		goto out;
1314 	}
1315 
1316 	dev->dev_info.dev_id = ctx->dev_id;
1317 	ret = ublk_ctrl_get_params(dev, &p);
1318 	if (ret < 0) {
1319 		ublk_err("failed to get params %d %s\n", ret, strerror(-ret));
1320 		goto out;
1321 	}
1322 
1323 	if (ctx->size & ((1 << p.basic.logical_bs_shift) - 1)) {
1324 		ublk_err("size isn't aligned with logical block size\n");
1325 		ret = -EINVAL;
1326 		goto out;
1327 	}
1328 
1329 	ret = ublk_ctrl_update_size(dev, ctx->size >> 9);
1330 out:
1331 	ublk_ctrl_deinit(dev);
1332 	return ret;
1333 }
1334 
1335 static int cmd_dev_quiesce(struct dev_ctx *ctx)
1336 {
1337 	struct ublk_dev *dev = ublk_ctrl_init();
1338 	int ret = -EINVAL;
1339 
1340 	if (!dev)
1341 		return -ENODEV;
1342 
1343 	if (ctx->dev_id < 0) {
1344 		fprintf(stderr, "device id isn't provided for quiesce\n");
1345 		goto out;
1346 	}
1347 	dev->dev_info.dev_id = ctx->dev_id;
1348 	ret = ublk_ctrl_quiesce_dev(dev, 10000);
1349 
1350 out:
1351 	ublk_ctrl_deinit(dev);
1352 	return ret;
1353 }
1354 
1355 static void __cmd_create_help(char *exe, bool recovery)
1356 {
1357 	int i;
1358 
1359 	printf("%s %s -t [null|loop|stripe|fault_inject] [-q nr_queues] [-d depth] [-n dev_id]\n",
1360 			exe, recovery ? "recover" : "add");
1361 	printf("\t[--foreground] [--quiet] [-z] [--auto_zc] [--auto_zc_fallback] [--debug_mask mask] [-r 0|1 ] [-g]\n");
1362 	printf("\t[-e 0|1 ] [-i 0|1]\n");
1363 	printf("\t[target options] [backfile1] [backfile2] ...\n");
1364 	printf("\tdefault: nr_queues=2(max 32), depth=128(max 1024), dev_id=-1(auto allocation)\n");
1365 
1366 	for (i = 0; i < sizeof(tgt_ops_list) / sizeof(tgt_ops_list[0]); i++) {
1367 		const struct ublk_tgt_ops *ops = tgt_ops_list[i];
1368 
1369 		if (ops->usage)
1370 			ops->usage(ops);
1371 	}
1372 }
1373 
1374 static void cmd_add_help(char *exe)
1375 {
1376 	__cmd_create_help(exe, false);
1377 	printf("\n");
1378 }
1379 
1380 static void cmd_recover_help(char *exe)
1381 {
1382 	__cmd_create_help(exe, true);
1383 	printf("\tPlease provide exact command line for creating this device with real dev_id\n");
1384 	printf("\n");
1385 }
1386 
1387 static int cmd_dev_help(char *exe)
1388 {
1389 	cmd_add_help(exe);
1390 	cmd_recover_help(exe);
1391 
1392 	printf("%s del [-n dev_id] -a \n", exe);
1393 	printf("\t -a delete all devices -n delete specified device\n\n");
1394 	printf("%s list [-n dev_id] -a \n", exe);
1395 	printf("\t -a list all devices, -n list specified device, default -a \n\n");
1396 	printf("%s features\n", exe);
1397 	printf("%s update_size -n dev_id -s|--size size_in_bytes \n", exe);
1398 	printf("%s quiesce -n dev_id\n", exe);
1399 	return 0;
1400 }
1401 
1402 int main(int argc, char *argv[])
1403 {
1404 	static const struct option longopts[] = {
1405 		{ "all",		0,	NULL, 'a' },
1406 		{ "type",		1,	NULL, 't' },
1407 		{ "number",		1,	NULL, 'n' },
1408 		{ "queues",		1,	NULL, 'q' },
1409 		{ "depth",		1,	NULL, 'd' },
1410 		{ "debug_mask",		1,	NULL,  0  },
1411 		{ "quiet",		0,	NULL,  0  },
1412 		{ "zero_copy",          0,      NULL, 'z' },
1413 		{ "foreground",		0,	NULL,  0  },
1414 		{ "recovery", 		1,      NULL, 'r' },
1415 		{ "recovery_fail_io",	1,	NULL, 'e'},
1416 		{ "recovery_reissue",	1,	NULL, 'i'},
1417 		{ "get_data",		1,	NULL, 'g'},
1418 		{ "auto_zc",		0,	NULL,  0 },
1419 		{ "auto_zc_fallback", 	0,	NULL,  0 },
1420 		{ "size",		1,	NULL, 's'},
1421 		{ 0, 0, 0, 0 }
1422 	};
1423 	const struct ublk_tgt_ops *ops = NULL;
1424 	int option_idx, opt;
1425 	const char *cmd = argv[1];
1426 	struct dev_ctx ctx = {
1427 		.queue_depth	=	128,
1428 		.nr_hw_queues	=	2,
1429 		.dev_id		=	-1,
1430 		.tgt_type	=	"unknown",
1431 	};
1432 	int ret = -EINVAL, i;
1433 	int tgt_argc = 1;
1434 	char *tgt_argv[MAX_NR_TGT_ARG] = { NULL };
1435 	int value;
1436 
1437 	if (argc == 1)
1438 		return ret;
1439 
1440 	opterr = 0;
1441 	optind = 2;
1442 	while ((opt = getopt_long(argc, argv, "t:n:d:q:r:e:i:s:gaz",
1443 				  longopts, &option_idx)) != -1) {
1444 		switch (opt) {
1445 		case 'a':
1446 			ctx.all = 1;
1447 			break;
1448 		case 'n':
1449 			ctx.dev_id = strtol(optarg, NULL, 10);
1450 			break;
1451 		case 't':
1452 			if (strlen(optarg) < sizeof(ctx.tgt_type))
1453 				strcpy(ctx.tgt_type, optarg);
1454 			break;
1455 		case 'q':
1456 			ctx.nr_hw_queues = strtol(optarg, NULL, 10);
1457 			break;
1458 		case 'd':
1459 			ctx.queue_depth = strtol(optarg, NULL, 10);
1460 			break;
1461 		case 'z':
1462 			ctx.flags |= UBLK_F_SUPPORT_ZERO_COPY | UBLK_F_USER_COPY;
1463 			break;
1464 		case 'r':
1465 			value = strtol(optarg, NULL, 10);
1466 			if (value)
1467 				ctx.flags |= UBLK_F_USER_RECOVERY;
1468 			break;
1469 		case 'e':
1470 			value = strtol(optarg, NULL, 10);
1471 			if (value)
1472 				ctx.flags |= UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_FAIL_IO;
1473 			break;
1474 		case 'i':
1475 			value = strtol(optarg, NULL, 10);
1476 			if (value)
1477 				ctx.flags |= UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_REISSUE;
1478 			break;
1479 		case 'g':
1480 			ctx.flags |= UBLK_F_NEED_GET_DATA;
1481 			break;
1482 		case 's':
1483 			ctx.size = strtoull(optarg, NULL, 10);
1484 			break;
1485 		case 0:
1486 			if (!strcmp(longopts[option_idx].name, "debug_mask"))
1487 				ublk_dbg_mask = strtol(optarg, NULL, 16);
1488 			if (!strcmp(longopts[option_idx].name, "quiet"))
1489 				ublk_dbg_mask = 0;
1490 			if (!strcmp(longopts[option_idx].name, "foreground"))
1491 				ctx.fg = 1;
1492 			if (!strcmp(longopts[option_idx].name, "auto_zc"))
1493 				ctx.flags |= UBLK_F_AUTO_BUF_REG;
1494 			if (!strcmp(longopts[option_idx].name, "auto_zc_fallback"))
1495 				ctx.auto_zc_fallback = 1;
1496 			break;
1497 		case '?':
1498 			/*
1499 			 * target requires every option must have argument
1500 			 */
1501 			if (argv[optind][0] == '-' || argv[optind - 1][0] != '-') {
1502 				fprintf(stderr, "every target option requires argument: %s %s\n",
1503 						argv[optind - 1], argv[optind]);
1504 				exit(EXIT_FAILURE);
1505 			}
1506 
1507 			if (tgt_argc < (MAX_NR_TGT_ARG - 1) / 2) {
1508 				tgt_argv[tgt_argc++] = argv[optind - 1];
1509 				tgt_argv[tgt_argc++] = argv[optind];
1510 			} else {
1511 				fprintf(stderr, "too many target options\n");
1512 				exit(EXIT_FAILURE);
1513 			}
1514 			optind += 1;
1515 			break;
1516 		}
1517 	}
1518 
1519 	/* auto_zc_fallback depends on F_AUTO_BUF_REG & F_SUPPORT_ZERO_COPY */
1520 	if (ctx.auto_zc_fallback &&
1521 	    !((ctx.flags & UBLK_F_AUTO_BUF_REG) &&
1522 		    (ctx.flags & UBLK_F_SUPPORT_ZERO_COPY))) {
1523 		ublk_err("%s: auto_zc_fallback is set but neither "
1524 				"F_AUTO_BUF_REG nor F_SUPPORT_ZERO_COPY is enabled\n",
1525 					__func__);
1526 		return -EINVAL;
1527 	}
1528 
1529 	i = optind;
1530 	while (i < argc && ctx.nr_files < MAX_BACK_FILES) {
1531 		ctx.files[ctx.nr_files++] = argv[i++];
1532 	}
1533 
1534 	ops = ublk_find_tgt(ctx.tgt_type);
1535 	if (ops && ops->parse_cmd_line) {
1536 		optind = 0;
1537 
1538 		tgt_argv[0] = ctx.tgt_type;
1539 		ops->parse_cmd_line(&ctx, tgt_argc, tgt_argv);
1540 	}
1541 
1542 	if (!strcmp(cmd, "add"))
1543 		ret = cmd_dev_add(&ctx);
1544 	else if (!strcmp(cmd, "recover")) {
1545 		if (ctx.dev_id < 0) {
1546 			fprintf(stderr, "device id isn't provided for recovering\n");
1547 			ret = -EINVAL;
1548 		} else {
1549 			ctx.recovery = 1;
1550 			ret = cmd_dev_add(&ctx);
1551 		}
1552 	} else if (!strcmp(cmd, "del"))
1553 		ret = cmd_dev_del(&ctx);
1554 	else if (!strcmp(cmd, "list")) {
1555 		ctx.all = 1;
1556 		ret = cmd_dev_list(&ctx);
1557 	} else if (!strcmp(cmd, "help"))
1558 		ret = cmd_dev_help(argv[0]);
1559 	else if (!strcmp(cmd, "features"))
1560 		ret = cmd_dev_get_features();
1561 	else if (!strcmp(cmd, "update_size"))
1562 		ret = cmd_dev_update_size(&ctx);
1563 	else if (!strcmp(cmd, "quiesce"))
1564 		ret = cmd_dev_quiesce(&ctx);
1565 	else
1566 		cmd_dev_help(argv[0]);
1567 
1568 	return ret;
1569 }
1570