xref: /linux/tools/testing/selftests/ublk/kublk.c (revision 6f7e6393d1ce636bb7ec77a7fe7b77458fddf701)
1 /* SPDX-License-Identifier: MIT */
2 /*
3  * Description: uring_cmd based ublk
4  */
5 
6 #include <linux/fs.h>
7 #include "kublk.h"
8 
9 #define MAX_NR_TGT_ARG 	64
10 
/* Global debug mask; UBLK_LOG enables informational logging by default. */
unsigned int ublk_dbg_mask = UBLK_LOG;
/* Built-in targets, looked up by name in ublk_find_tgt(). */
static const struct ublk_tgt_ops *tgt_ops_list[] = {
	&null_tgt_ops,
	&loop_tgt_ops,
	&stripe_tgt_ops,
	&fault_inject_tgt_ops,
};
18 
19 static const struct ublk_tgt_ops *ublk_find_tgt(const char *name)
20 {
21 	int i;
22 
23 	if (name == NULL)
24 		return NULL;
25 
26 	for (i = 0; i < ARRAY_SIZE(tgt_ops_list); i++)
27 		if (strcmp(tgt_ops_list[i]->name, name) == 0)
28 			return tgt_ops_list[i];
29 	return NULL;
30 }
31 
32 static inline int ublk_setup_ring(struct io_uring *r, int depth,
33 		int cq_depth, unsigned flags)
34 {
35 	struct io_uring_params p;
36 
37 	memset(&p, 0, sizeof(p));
38 	p.flags = flags | IORING_SETUP_CQSIZE;
39 	p.cq_entries = cq_depth;
40 
41 	return io_uring_queue_init_params(depth, r, &p);
42 }
43 
/*
 * Fill @sqe with a control uring_cmd described by @data. The command
 * payload lives in the SQE's inline command area (the control ring is
 * set up with IORING_SETUP_SQE128), so only the fields flagged in
 * @data are written.
 */
static void ublk_ctrl_init_cmd(struct ublk_dev *dev,
		struct io_uring_sqe *sqe,
		struct ublk_ctrl_cmd_data *data)
{
	struct ublksrv_ctrl_dev_info *info = &dev->dev_info;
	struct ublksrv_ctrl_cmd *cmd = (struct ublksrv_ctrl_cmd *)ublk_get_sqe_cmd(sqe);

	sqe->fd = dev->ctrl_fd;
	sqe->opcode = IORING_OP_URING_CMD;
	sqe->ioprio = 0;

	if (data->flags & CTRL_CMD_HAS_BUF) {
		cmd->addr = data->addr;
		cmd->len = data->len;
	}

	if (data->flags & CTRL_CMD_HAS_DATA)
		cmd->data[0] = data->data[0];

	cmd->dev_id = info->dev_id;
	/* control commands are not tied to any queue */
	cmd->queue_id = -1;

	ublk_set_sqe_cmd_op(sqe, data->cmd_op);

	io_uring_sqe_set_data(sqe, cmd);
}
70 
71 static int __ublk_ctrl_cmd(struct ublk_dev *dev,
72 		struct ublk_ctrl_cmd_data *data)
73 {
74 	struct io_uring_sqe *sqe;
75 	struct io_uring_cqe *cqe;
76 	int ret = -EINVAL;
77 
78 	sqe = io_uring_get_sqe(&dev->ring);
79 	if (!sqe) {
80 		ublk_err("%s: can't get sqe ret %d\n", __func__, ret);
81 		return ret;
82 	}
83 
84 	ublk_ctrl_init_cmd(dev, sqe, data);
85 
86 	ret = io_uring_submit(&dev->ring);
87 	if (ret < 0) {
88 		ublk_err("uring submit ret %d\n", ret);
89 		return ret;
90 	}
91 
92 	ret = io_uring_wait_cqe(&dev->ring, &cqe);
93 	if (ret < 0) {
94 		ublk_err("wait cqe: %s\n", strerror(-ret));
95 		return ret;
96 	}
97 	io_uring_cqe_seen(&dev->ring, cqe);
98 
99 	return cqe->res;
100 }
101 
102 static int ublk_ctrl_stop_dev(struct ublk_dev *dev)
103 {
104 	struct ublk_ctrl_cmd_data data = {
105 		.cmd_op	= UBLK_U_CMD_STOP_DEV,
106 	};
107 
108 	return __ublk_ctrl_cmd(dev, &data);
109 }
110 
111 static int ublk_ctrl_try_stop_dev(struct ublk_dev *dev)
112 {
113 	struct ublk_ctrl_cmd_data data = {
114 		.cmd_op	= UBLK_U_CMD_TRY_STOP_DEV,
115 	};
116 
117 	return __ublk_ctrl_cmd(dev, &data);
118 }
119 
120 static int ublk_ctrl_start_dev(struct ublk_dev *dev,
121 		int daemon_pid)
122 {
123 	struct ublk_ctrl_cmd_data data = {
124 		.cmd_op	= UBLK_U_CMD_START_DEV,
125 		.flags	= CTRL_CMD_HAS_DATA,
126 	};
127 
128 	dev->dev_info.ublksrv_pid = data.data[0] = daemon_pid;
129 
130 	return __ublk_ctrl_cmd(dev, &data);
131 }
132 
133 static int ublk_ctrl_start_user_recovery(struct ublk_dev *dev)
134 {
135 	struct ublk_ctrl_cmd_data data = {
136 		.cmd_op	= UBLK_U_CMD_START_USER_RECOVERY,
137 	};
138 
139 	return __ublk_ctrl_cmd(dev, &data);
140 }
141 
142 static int ublk_ctrl_end_user_recovery(struct ublk_dev *dev, int daemon_pid)
143 {
144 	struct ublk_ctrl_cmd_data data = {
145 		.cmd_op	= UBLK_U_CMD_END_USER_RECOVERY,
146 		.flags	= CTRL_CMD_HAS_DATA,
147 	};
148 
149 	dev->dev_info.ublksrv_pid = data.data[0] = daemon_pid;
150 
151 	return __ublk_ctrl_cmd(dev, &data);
152 }
153 
154 static int ublk_ctrl_add_dev(struct ublk_dev *dev)
155 {
156 	struct ublk_ctrl_cmd_data data = {
157 		.cmd_op	= UBLK_U_CMD_ADD_DEV,
158 		.flags	= CTRL_CMD_HAS_BUF,
159 		.addr = (__u64) (uintptr_t) &dev->dev_info,
160 		.len = sizeof(struct ublksrv_ctrl_dev_info),
161 	};
162 
163 	return __ublk_ctrl_cmd(dev, &data);
164 }
165 
166 static int ublk_ctrl_del_dev(struct ublk_dev *dev)
167 {
168 	struct ublk_ctrl_cmd_data data = {
169 		.cmd_op = UBLK_U_CMD_DEL_DEV,
170 		.flags = 0,
171 	};
172 
173 	return __ublk_ctrl_cmd(dev, &data);
174 }
175 
176 static int ublk_ctrl_get_info(struct ublk_dev *dev)
177 {
178 	struct ublk_ctrl_cmd_data data = {
179 		.cmd_op	= UBLK_U_CMD_GET_DEV_INFO,
180 		.flags	= CTRL_CMD_HAS_BUF,
181 		.addr = (__u64) (uintptr_t) &dev->dev_info,
182 		.len = sizeof(struct ublksrv_ctrl_dev_info),
183 	};
184 
185 	return __ublk_ctrl_cmd(dev, &data);
186 }
187 
188 static int ublk_ctrl_set_params(struct ublk_dev *dev,
189 		struct ublk_params *params)
190 {
191 	struct ublk_ctrl_cmd_data data = {
192 		.cmd_op	= UBLK_U_CMD_SET_PARAMS,
193 		.flags	= CTRL_CMD_HAS_BUF,
194 		.addr = (__u64) (uintptr_t) params,
195 		.len = sizeof(*params),
196 	};
197 	params->len = sizeof(*params);
198 	return __ublk_ctrl_cmd(dev, &data);
199 }
200 
201 static int ublk_ctrl_get_params(struct ublk_dev *dev,
202 		struct ublk_params *params)
203 {
204 	struct ublk_ctrl_cmd_data data = {
205 		.cmd_op	= UBLK_U_CMD_GET_PARAMS,
206 		.flags	= CTRL_CMD_HAS_BUF,
207 		.addr = (__u64)params,
208 		.len = sizeof(*params),
209 	};
210 
211 	params->len = sizeof(*params);
212 
213 	return __ublk_ctrl_cmd(dev, &data);
214 }
215 
216 static int ublk_ctrl_get_features(struct ublk_dev *dev,
217 		__u64 *features)
218 {
219 	struct ublk_ctrl_cmd_data data = {
220 		.cmd_op	= UBLK_U_CMD_GET_FEATURES,
221 		.flags	= CTRL_CMD_HAS_BUF,
222 		.addr = (__u64) (uintptr_t) features,
223 		.len = sizeof(*features),
224 	};
225 
226 	return __ublk_ctrl_cmd(dev, &data);
227 }
228 
229 static int ublk_ctrl_update_size(struct ublk_dev *dev,
230 		__u64 nr_sects)
231 {
232 	struct ublk_ctrl_cmd_data data = {
233 		.cmd_op	= UBLK_U_CMD_UPDATE_SIZE,
234 		.flags	= CTRL_CMD_HAS_DATA,
235 	};
236 
237 	data.data[0] = nr_sects;
238 	return __ublk_ctrl_cmd(dev, &data);
239 }
240 
241 static int ublk_ctrl_quiesce_dev(struct ublk_dev *dev,
242 				 unsigned int timeout_ms)
243 {
244 	struct ublk_ctrl_cmd_data data = {
245 		.cmd_op	= UBLK_U_CMD_QUIESCE_DEV,
246 		.flags	= CTRL_CMD_HAS_DATA,
247 	};
248 
249 	data.data[0] = timeout_ms;
250 	return __ublk_ctrl_cmd(dev, &data);
251 }
252 
253 static const char *ublk_dev_state_desc(struct ublk_dev *dev)
254 {
255 	switch (dev->dev_info.state) {
256 	case UBLK_S_DEV_DEAD:
257 		return "DEAD";
258 	case UBLK_S_DEV_LIVE:
259 		return "LIVE";
260 	case UBLK_S_DEV_QUIESCED:
261 		return "QUIESCED";
262 	default:
263 		return "UNKNOWN";
264 	};
265 }
266 
/*
 * Format the CPUs present in @set into @buf as a space-separated list.
 *
 * snprintf() returns the length that *would* have been written, so on
 * truncation the unchecked accumulation of @done could exceed @len;
 * `len - done` (unsigned) then wraps to a huge size and &buf[done]
 * points past the buffer. Stop as soon as the buffer is full.
 */
static void ublk_print_cpu_set(const cpu_set_t *set, char *buf, unsigned len)
{
	unsigned done = 0;
	int i;

	for (i = 0; i < CPU_SETSIZE; i++) {
		if (done >= len)
			break;	/* buffer full (possibly truncated) */
		if (CPU_ISSET(i, set))
			done += snprintf(&buf[done], len - done, "%d ", i);
	}
}
277 
/*
 * Reduce @set to a single CPU: keep the lowest-numbered CPU present and
 * clear all others.
 *
 * In future, auto affinity selection can be tried.
 */
static void ublk_adjust_affinity(cpu_set_t *set)
{
	int cpu, kept_first = 0;

	for (cpu = 0; cpu < CPU_SETSIZE; cpu++) {
		if (!CPU_ISSET(cpu, set))
			continue;
		if (kept_first)
			CPU_CLR(cpu, set);
		else
			kept_first = 1;
	}
}
297 
298 /* Caller must free the allocated buffer */
299 static int ublk_ctrl_get_affinity(struct ublk_dev *ctrl_dev, cpu_set_t **ptr_buf)
300 {
301 	struct ublk_ctrl_cmd_data data = {
302 		.cmd_op	= UBLK_U_CMD_GET_QUEUE_AFFINITY,
303 		.flags	= CTRL_CMD_HAS_DATA | CTRL_CMD_HAS_BUF,
304 	};
305 	cpu_set_t *buf;
306 	int i, ret;
307 
308 	buf = malloc(sizeof(cpu_set_t) * ctrl_dev->dev_info.nr_hw_queues);
309 	if (!buf)
310 		return -ENOMEM;
311 
312 	for (i = 0; i < ctrl_dev->dev_info.nr_hw_queues; i++) {
313 		data.data[0] = i;
314 		data.len = sizeof(cpu_set_t);
315 		data.addr = (__u64)&buf[i];
316 
317 		ret = __ublk_ctrl_cmd(ctrl_dev, &data);
318 		if (ret < 0) {
319 			free(buf);
320 			return ret;
321 		}
322 		ublk_adjust_affinity(&buf[i]);
323 	}
324 
325 	*ptr_buf = buf;
326 	return 0;
327 }
328 
/*
 * Print a human-readable summary of the device: geometry, daemon pid,
 * flags, state, and per-queue CPU affinity. Best-effort: bails out
 * silently (after logging) if params or affinity cannot be fetched.
 */
static void ublk_ctrl_dump(struct ublk_dev *dev)
{
	struct ublksrv_ctrl_dev_info *info = &dev->dev_info;
	struct ublk_params p;
	cpu_set_t *affinity;
	int ret;

	ret = ublk_ctrl_get_params(dev, &p);
	if (ret < 0) {
		ublk_err("failed to get params %d %s\n", ret, strerror(-ret));
		return;
	}

	ret = ublk_ctrl_get_affinity(dev, &affinity);
	if (ret < 0) {
		ublk_err("failed to get affinity %m\n");
		return;
	}

	ublk_log("dev id %d: nr_hw_queues %d queue_depth %d block size %d dev_capacity %lld\n",
			info->dev_id, info->nr_hw_queues, info->queue_depth,
			1 << p.basic.logical_bs_shift, p.basic.dev_sectors);
	ublk_log("\tmax rq size %d daemon pid %d flags 0x%llx state %s\n",
			info->max_io_buf_bytes, info->ublksrv_pid, info->flags,
			ublk_dev_state_desc(dev));

	/* affinity is always set when ublk_ctrl_get_affinity() returned 0 */
	if (affinity) {
		char buf[512];
		int i;

		for (i = 0; i < info->nr_hw_queues; i++) {
			ublk_print_cpu_set(&affinity[i], buf, sizeof(buf));
			printf("\tqueue %u: affinity(%s)\n",
					i, buf);
		}
		free(affinity);
	}

	fflush(stdout);
}
369 
/* Tear down the control device: close its fd and free the device object. */
static void ublk_ctrl_deinit(struct ublk_dev *dev)
{
	close(dev->ctrl_fd);
	free(dev);
}
375 
376 static struct ublk_dev *ublk_ctrl_init(void)
377 {
378 	struct ublk_dev *dev = (struct ublk_dev *)calloc(1, sizeof(*dev));
379 	struct ublksrv_ctrl_dev_info *info = &dev->dev_info;
380 	int ret;
381 
382 	dev->ctrl_fd = open(CTRL_DEV, O_RDWR);
383 	if (dev->ctrl_fd < 0) {
384 		free(dev);
385 		return NULL;
386 	}
387 
388 	info->max_io_buf_bytes = UBLK_IO_MAX_BYTES;
389 
390 	ret = ublk_setup_ring(&dev->ring, UBLK_CTRL_RING_DEPTH,
391 			UBLK_CTRL_RING_DEPTH, IORING_SETUP_SQE128);
392 	if (ret < 0) {
393 		ublk_err("queue_init: %s\n", strerror(-ret));
394 		free(dev);
395 		return NULL;
396 	}
397 	dev->nr_fds = 1;
398 
399 	return dev;
400 }
401 
402 static int __ublk_queue_cmd_buf_sz(unsigned depth)
403 {
404 	int size =  depth * sizeof(struct ublksrv_io_desc);
405 	unsigned int page_sz = getpagesize();
406 
407 	return round_up(size, page_sz);
408 }
409 
/* Command buffer size for the maximum supported queue depth; used as
 * the per-queue stride when computing mmap offsets. */
static int ublk_queue_max_cmd_buf_sz(void)
{
	return __ublk_queue_cmd_buf_sz(UBLK_MAX_QUEUE_DEPTH);
}
414 
/* Command buffer size for this queue's actual depth. */
static int ublk_queue_cmd_buf_sz(struct ublk_queue *q)
{
	return __ublk_queue_cmd_buf_sz(q->q_depth);
}
419 
420 static void ublk_queue_deinit(struct ublk_queue *q)
421 {
422 	int i;
423 	int nr_ios = q->q_depth;
424 
425 	if (q->io_cmd_buf)
426 		munmap(q->io_cmd_buf, ublk_queue_cmd_buf_sz(q));
427 
428 	for (i = 0; i < nr_ios; i++) {
429 		free(q->ios[i].buf_addr);
430 		free(q->ios[i].integrity_buf);
431 	}
432 }
433 
/*
 * Tear down a thread's io_uring state: unregister buffers, free batch
 * buffers, unregister the ring fd, then unregister files and close the
 * ring. The close path is guarded by ring_fd validity so this is safe
 * on a partially initialized thread.
 */
static void ublk_thread_deinit(struct ublk_thread *t)
{
	io_uring_unregister_buffers(&t->ring);

	ublk_batch_free_buf(t);

	io_uring_unregister_ring_fd(&t->ring);

	if (t->ring.ring_fd > 0) {
		io_uring_unregister_files(&t->ring);
		close(t->ring.ring_fd);
		t->ring.ring_fd = -1;
	}
}
448 
449 static int ublk_queue_init(struct ublk_queue *q, unsigned long long extra_flags,
450 			   __u8 metadata_size)
451 {
452 	struct ublk_dev *dev = q->dev;
453 	int depth = dev->dev_info.queue_depth;
454 	int i;
455 	int cmd_buf_size, io_buf_size, integrity_size;
456 	unsigned long off;
457 
458 	pthread_spin_init(&q->lock, PTHREAD_PROCESS_PRIVATE);
459 	q->tgt_ops = dev->tgt.ops;
460 	q->flags = 0;
461 	q->q_depth = depth;
462 	q->flags = dev->dev_info.flags;
463 	q->flags |= extra_flags;
464 	q->metadata_size = metadata_size;
465 
466 	/* Cache fd in queue for fast path access */
467 	q->ublk_fd = dev->fds[0];
468 
469 	cmd_buf_size = ublk_queue_cmd_buf_sz(q);
470 	off = UBLKSRV_CMD_BUF_OFFSET + q->q_id * ublk_queue_max_cmd_buf_sz();
471 	q->io_cmd_buf = mmap(0, cmd_buf_size, PROT_READ,
472 			MAP_SHARED | MAP_POPULATE, dev->fds[0], off);
473 	if (q->io_cmd_buf == MAP_FAILED) {
474 		ublk_err("ublk dev %d queue %d map io_cmd_buf failed %m\n",
475 				q->dev->dev_info.dev_id, q->q_id);
476 		goto fail;
477 	}
478 
479 	io_buf_size = dev->dev_info.max_io_buf_bytes;
480 	integrity_size = ublk_integrity_len(q, io_buf_size);
481 	for (i = 0; i < q->q_depth; i++) {
482 		q->ios[i].buf_addr = NULL;
483 		q->ios[i].flags = UBLKS_IO_NEED_FETCH_RQ | UBLKS_IO_FREE;
484 		q->ios[i].tag = i;
485 
486 		if (integrity_size) {
487 			q->ios[i].integrity_buf = malloc(integrity_size);
488 			if (!q->ios[i].integrity_buf) {
489 				ublk_err("ublk dev %d queue %d io %d malloc(%d) failed: %m\n",
490 					 dev->dev_info.dev_id, q->q_id, i,
491 					 integrity_size);
492 				goto fail;
493 			}
494 		}
495 
496 
497 		if (ublk_queue_no_buf(q))
498 			continue;
499 
500 		if (posix_memalign((void **)&q->ios[i].buf_addr,
501 					getpagesize(), io_buf_size)) {
502 			ublk_err("ublk dev %d queue %d io %d posix_memalign failed %m\n",
503 					dev->dev_info.dev_id, q->q_id, i);
504 			goto fail;
505 		}
506 	}
507 
508 	return 0;
509  fail:
510 	ublk_queue_deinit(q);
511 	ublk_err("ublk dev %d queue %d failed\n",
512 			dev->dev_info.dev_id, q->q_id);
513 	return -ENOMEM;
514 }
515 
/*
 * Initialize one server thread: create its io_uring, size and register
 * sparse buffers for zero-copy/auto-buf-reg modes, allocate batch
 * buffers when in batch-io mode, and register the device/backing fds.
 *
 * NOTE(review): all failure paths return -ENOMEM even when `ret`
 * carries a more specific error — callers only test for non-zero, but
 * the real code could be propagated; confirm before changing.
 */
static int ublk_thread_init(struct ublk_thread *t, unsigned long long extra_flags)
{
	struct ublk_dev *dev = t->dev;
	unsigned long long flags = dev->dev_info.flags | extra_flags;
	int ring_depth = dev->tgt.sq_depth, cq_depth = dev->tgt.cq_depth;
	int ret;

	/* FETCH_IO_CMDS is multishot, so increase cq depth for BATCH_IO */
	if (ublk_dev_batch_io(dev))
		cq_depth += dev->dev_info.queue_depth * 2;

	ret = ublk_setup_ring(&t->ring, ring_depth, cq_depth,
			IORING_SETUP_COOP_TASKRUN |
			IORING_SETUP_SINGLE_ISSUER |
			IORING_SETUP_DEFER_TASKRUN);
	if (ret < 0) {
		ublk_err("ublk dev %d thread %d setup io_uring failed %d\n",
				dev->dev_info.dev_id, t->idx, ret);
		goto fail;
	}

	if (dev->dev_info.flags & (UBLK_F_SUPPORT_ZERO_COPY | UBLK_F_AUTO_BUF_REG)) {
		/* split the io budget evenly across threads, rounding up */
		unsigned nr_ios = dev->dev_info.queue_depth * dev->dev_info.nr_hw_queues;
		unsigned max_nr_ios_per_thread = nr_ios / dev->nthreads;
		max_nr_ios_per_thread += !!(nr_ios % dev->nthreads);

		t->nr_bufs = max_nr_ios_per_thread;
	} else {
		t->nr_bufs = 0;
	}

	if (ublk_dev_batch_io(dev))
		 ublk_batch_prepare(t);

	if (t->nr_bufs) {
		ret = io_uring_register_buffers_sparse(&t->ring, t->nr_bufs);
		if (ret) {
			ublk_err("ublk dev %d thread %d register spare buffers failed %d\n",
					dev->dev_info.dev_id, t->idx, ret);
			goto fail;
		}
	}

	if (ublk_dev_batch_io(dev)) {
		ret = ublk_batch_alloc_buf(t);
		if (ret) {
			ublk_err("ublk dev %d thread %d alloc batch buf failed %d\n",
				dev->dev_info.dev_id, t->idx, ret);
			goto fail;
		}
	}

	io_uring_register_ring_fd(&t->ring);

	if (flags & UBLKS_Q_NO_UBLK_FIXED_FD) {
		/* Register only backing files starting from index 1, exclude ublk control device */
		if (dev->nr_fds > 1) {
			ret = io_uring_register_files(&t->ring, &dev->fds[1], dev->nr_fds - 1);
		} else {
			/* No backing files to register, skip file registration */
			ret = 0;
		}
	} else {
		ret = io_uring_register_files(&t->ring, dev->fds, dev->nr_fds);
	}
	if (ret) {
		ublk_err("ublk dev %d thread %d register files failed %d\n",
				t->dev->dev_info.dev_id, t->idx, ret);
		goto fail;
	}

	return 0;
fail:
	ublk_thread_deinit(t);
	ublk_err("ublk dev %d thread %d init failed\n",
			dev->dev_info.dev_id, t->idx);
	return -ENOMEM;
}
594 
595 #define WAIT_USEC 	100000
596 #define MAX_WAIT_USEC 	(3 * 1000000)
597 static int ublk_dev_prep(const struct dev_ctx *ctx, struct ublk_dev *dev)
598 {
599 	int dev_id = dev->dev_info.dev_id;
600 	unsigned int wait_usec = 0;
601 	int ret = 0, fd = -1;
602 	char buf[64];
603 
604 	snprintf(buf, 64, "%s%d", UBLKC_DEV, dev_id);
605 
606 	while (wait_usec < MAX_WAIT_USEC) {
607 		fd = open(buf, O_RDWR);
608 		if (fd >= 0)
609 			break;
610 		usleep(WAIT_USEC);
611 		wait_usec += WAIT_USEC;
612 	}
613 	if (fd < 0) {
614 		ublk_err("can't open %s %s\n", buf, strerror(errno));
615 		return -1;
616 	}
617 
618 	dev->fds[0] = fd;
619 	if (dev->tgt.ops->init_tgt)
620 		ret = dev->tgt.ops->init_tgt(ctx, dev);
621 	if (ret)
622 		close(dev->fds[0]);
623 	return ret;
624 }
625 
/* Undo ublk_dev_prep(): run target teardown, then close the char device. */
static void ublk_dev_unprep(struct ublk_dev *dev)
{
	if (dev->tgt.ops->deinit_tgt)
		dev->tgt.ops->deinit_tgt(dev);
	close(dev->fds[0]);
}
632 
633 static void ublk_set_auto_buf_reg(const struct ublk_thread *t,
634 				  const struct ublk_queue *q,
635 				  struct io_uring_sqe *sqe,
636 				  unsigned short tag)
637 {
638 	struct ublk_auto_buf_reg buf = {};
639 
640 	if (q->tgt_ops->buf_index)
641 		buf.index = q->tgt_ops->buf_index(t, q, tag);
642 	else
643 		buf.index = ublk_io_buf_idx(t, q, tag);
644 
645 	if (ublk_queue_auto_zc_fallback(q))
646 		buf.flags = UBLK_AUTO_BUF_REG_FALLBACK;
647 
648 	sqe->addr = ublk_auto_buf_reg_to_sqe_addr(&buf);
649 }
650 
651 /* Copy in pieces to test the buffer offset logic */
652 #define UBLK_USER_COPY_LEN 2048
653 
/*
 * Move io data between the io buffer and the char device via the
 * user-copy interface, in UBLK_USER_COPY_LEN chunks to exercise the
 * kernel's buffer offset handling. For a WRITE request the data is
 * pulled from the device (pread); for a READ request it is pushed to
 * the device (pwrite). Does nothing unless the io's op matches
 * @match_ublk_op. Integrity metadata, when present, is copied the same
 * way at the offset tagged with UBLKSRV_IO_INTEGRITY_FLAG.
 */
static void ublk_user_copy(const struct ublk_io *io, __u8 match_ublk_op)
{
	const struct ublk_queue *q = ublk_io_to_queue(io);
	const struct ublksrv_io_desc *iod = ublk_get_iod(q, io->tag);
	__u64 off = ublk_user_copy_offset(q->q_id, io->tag);
	__u8 ublk_op = ublksrv_get_op(iod);
	__u32 len = iod->nr_sectors << 9;
	void *addr = io->buf_addr;
	ssize_t copied;

	if (ublk_op != match_ublk_op)
		return;

	while (len) {
		__u32 copy_len = min(len, UBLK_USER_COPY_LEN);

		if (ublk_op == UBLK_IO_OP_WRITE)
			copied = pread(q->ublk_fd, addr, copy_len, off);
		else if (ublk_op == UBLK_IO_OP_READ)
			copied = pwrite(q->ublk_fd, addr, copy_len, off);
		else
			assert(0);
		assert(copied == (ssize_t)copy_len);
		addr += copy_len;
		off += copy_len;
		len -= copy_len;
	}

	if (!(iod->op_flags & UBLK_IO_F_INTEGRITY))
		return;

	/* integrity metadata lives at the same offset, flagged specially */
	len = ublk_integrity_len(q, iod->nr_sectors << 9);
	off = ublk_user_copy_offset(q->q_id, io->tag);
	off |= UBLKSRV_IO_INTEGRITY_FLAG;
	if (ublk_op == UBLK_IO_OP_WRITE)
		copied = pread(q->ublk_fd, io->integrity_buf, len, off);
	else if (ublk_op == UBLK_IO_OP_READ)
		copied = pwrite(q->ublk_fd, io->integrity_buf, len, off);
	else
		assert(0);
	assert(copied == (ssize_t)len);
}
696 
/*
 * Issue the next io command (FETCH / COMMIT_AND_FETCH / NEED_GET_DATA)
 * for @io on thread @t's ring.
 *
 * Returns 1 if a command was queued, 0 if the io is not in a state that
 * needs a command, or -1 if no SQE could be obtained.
 */
int ublk_queue_io_cmd(struct ublk_thread *t, struct ublk_io *io)
{
	struct ublk_queue *q = ublk_io_to_queue(io);
	struct ublksrv_io_cmd *cmd;
	struct io_uring_sqe *sqe[1];
	unsigned int cmd_op = 0;
	__u64 user_data;

	/* only freed io can be issued */
	if (!(io->flags & UBLKS_IO_FREE))
		return 0;

	/*
	 * we issue because we need either fetching or committing or
	 * getting data
	 */
	if (!(io->flags &
		(UBLKS_IO_NEED_FETCH_RQ | UBLKS_IO_NEED_COMMIT_RQ_COMP | UBLKS_IO_NEED_GET_DATA)))
		return 0;

	if (io->flags & UBLKS_IO_NEED_GET_DATA)
		cmd_op = UBLK_U_IO_NEED_GET_DATA;
	else if (io->flags & UBLKS_IO_NEED_COMMIT_RQ_COMP) {
		/* push READ results to the kernel before committing */
		if (ublk_queue_use_user_copy(q))
			ublk_user_copy(io, UBLK_IO_OP_READ);

		cmd_op = UBLK_U_IO_COMMIT_AND_FETCH_REQ;
	} else if (io->flags & UBLKS_IO_NEED_FETCH_RQ)
		cmd_op = UBLK_U_IO_FETCH_REQ;

	/* make room in the SQ if it is full */
	if (io_uring_sq_space_left(&t->ring) < 1)
		io_uring_submit(&t->ring);

	ublk_io_alloc_sqes(t, sqe, 1);
	if (!sqe[0]) {
		ublk_err("%s: run out of sqe. thread %u, tag %d\n",
				__func__, t->idx, io->tag);
		return -1;
	}

	cmd = (struct ublksrv_io_cmd *)ublk_get_sqe_cmd(sqe[0]);

	if (cmd_op == UBLK_U_IO_COMMIT_AND_FETCH_REQ)
		cmd->result = io->result;

	/* These fields should be written once, never change */
	ublk_set_sqe_cmd_op(sqe[0], cmd_op);
	sqe[0]->fd	= ublk_get_registered_fd(q, 0);	/* dev->fds[0] */
	sqe[0]->opcode	= IORING_OP_URING_CMD;
	if (q->flags & UBLKS_Q_NO_UBLK_FIXED_FD)
		sqe[0]->flags	= 0;  /* Use raw FD, not fixed file */
	else
		sqe[0]->flags	= IOSQE_FIXED_FILE;
	sqe[0]->rw_flags	= 0;
	cmd->tag	= io->tag;
	cmd->q_id	= q->q_id;
	if (!ublk_queue_no_buf(q) && !ublk_queue_use_user_copy(q))
		cmd->addr	= (__u64) (uintptr_t) io->buf_addr;
	else
		cmd->addr	= 0;

	if (ublk_queue_use_auto_zc(q))
		ublk_set_auto_buf_reg(t, q, sqe[0], io->tag);

	user_data = build_user_data(io->tag, _IOC_NR(cmd_op), 0, q->q_id, 0);
	io_uring_sqe_set_data64(sqe[0], user_data);

	/* command is in flight now; no NEED_* bits remain */
	io->flags = 0;

	t->cmd_inflight += 1;

	ublk_dbg(UBLK_DBG_IO_CMD, "%s: (thread %u qid %d tag %u cmd_op %u) iof %x stopping %d\n",
			__func__, t->idx, q->q_id, io->tag, cmd_op,
			io->flags, !!(t->state & UBLKS_T_STOPPING));
	return 1;
}
773 
/*
 * Queue the initial FETCH commands for every io this thread is the
 * daemon for, assigning each io its registered buffer index.
 */
static void ublk_submit_fetch_commands(struct ublk_thread *t)
{
	struct ublk_queue *q;
	struct ublk_io *io;
	int i = 0, j = 0;

	if (t->dev->per_io_tasks) {
		/*
		 * Lexicographically order all the (qid,tag) pairs, with
		 * qid taking priority (so (1,0) > (0,1)). Then make
		 * this thread the daemon for every Nth entry in this
		 * list (N is the number of threads), starting at this
		 * thread's index. This ensures that each queue is
		 * handled by as many ublk server threads as possible,
		 * so that load that is concentrated on one or a few
		 * queues can make use of all ublk server threads.
		 */
		const struct ublksrv_ctrl_dev_info *dinfo = &t->dev->dev_info;
		int nr_ios = dinfo->nr_hw_queues * dinfo->queue_depth;
		for (i = t->idx; i < nr_ios; i += t->dev->nthreads) {
			int q_id = i / dinfo->queue_depth;
			int tag = i % dinfo->queue_depth;
			q = &t->dev->q[q_id];
			io = &q->ios[tag];
			/* buffer indices are dense per thread in this mode */
			io->buf_index = j++;
			ublk_queue_io_cmd(t, io);
		}
	} else {
		/*
		 * Service exclusively the queue whose q_id matches our
		 * thread index.
		 */
		struct ublk_queue *q = &t->dev->q[t->idx];
		for (i = 0; i < q->q_depth; i++) {
			io = &q->ios[i];
			io->buf_index = i;
			ublk_queue_io_cmd(t, io);
		}
	}
}
814 
815 static int ublk_thread_is_idle(struct ublk_thread *t)
816 {
817 	return !io_uring_sq_ready(&t->ring) && !t->io_inflight;
818 }
819 
820 static int ublk_thread_is_done(struct ublk_thread *t)
821 {
822 	return (t->state & UBLKS_T_STOPPING) && ublk_thread_is_idle(t) && !t->cmd_inflight;
823 }
824 
825 static inline void ublksrv_handle_tgt_cqe(struct ublk_thread *t,
826 					  struct ublk_queue *q,
827 					  struct io_uring_cqe *cqe)
828 {
829 	if (cqe->res < 0 && cqe->res != -EAGAIN)
830 		ublk_err("%s: failed tgt io: res %d qid %u tag %u, cmd_op %u\n",
831 			__func__, cqe->res, q->q_id,
832 			user_data_to_tag(cqe->user_data),
833 			user_data_to_op(cqe->user_data));
834 
835 	if (q->tgt_ops->tgt_io_done)
836 		q->tgt_ops->tgt_io_done(t, q, cqe);
837 }
838 
/*
 * Handle the completion of a previously issued io uring_cmd: dispatch
 * new requests to the target, restart NEED_GET_DATA ios, or mark the
 * io free. An ABORT result (or an already-stopping thread) stops
 * further fetching.
 */
static void ublk_handle_uring_cmd(struct ublk_thread *t,
				  struct ublk_queue *q,
				  const struct io_uring_cqe *cqe)
{
	int fetch = (cqe->res != UBLK_IO_RES_ABORT) &&
		!(t->state & UBLKS_T_STOPPING);
	unsigned tag = user_data_to_tag(cqe->user_data);
	struct ublk_io *io = &q->ios[tag];

	t->cmd_inflight--;

	if (!fetch) {
		/* stop re-fetching: the device is going away */
		t->state |= UBLKS_T_STOPPING;
		io->flags &= ~UBLKS_IO_NEED_FETCH_RQ;
	}

	if (cqe->res == UBLK_IO_RES_OK) {
		ublk_assert(tag < q->q_depth);

		/* pull WRITE payload from the kernel before queueing it */
		if (ublk_queue_use_user_copy(q))
			ublk_user_copy(io, UBLK_IO_OP_WRITE);

		if (q->tgt_ops->queue_io)
			q->tgt_ops->queue_io(t, q, tag);
	} else if (cqe->res == UBLK_IO_RES_NEED_GET_DATA) {
		io->flags |= UBLKS_IO_NEED_GET_DATA | UBLKS_IO_FREE;
		ublk_queue_io_cmd(t, io);
	} else {
		/*
		 * COMMIT_REQ will be completed immediately since no fetching
		 * piggyback is required.
		 *
		 * Marking IO_FREE only, then this io won't be issued since
		 * we only issue io with (UBLKS_IO_FREE | UBLKSRV_NEED_*)
		 *
		 * */
		io->flags = UBLKS_IO_FREE;
	}
}
878 
/*
 * Demultiplex one CQE: target io goes to the target's completion hook;
 * io uring_cmds are handled in batch or classic mode depending on the
 * thread configuration.
 */
static void ublk_handle_cqe(struct ublk_thread *t,
		struct io_uring_cqe *cqe, void *data)
{
	struct ublk_dev *dev = t->dev;
	unsigned q_id = user_data_to_q_id(cqe->user_data);
	unsigned cmd_op = user_data_to_op(cqe->user_data);

	/* -ENODEV/-ENOBUFS are expected during teardown, don't spam the log */
	if (cqe->res < 0 && cqe->res != -ENODEV && cqe->res != -ENOBUFS)
		ublk_err("%s: res %d userdata %llx thread state %x\n", __func__,
				cqe->res, cqe->user_data, t->state);

	ublk_dbg(UBLK_DBG_IO_CMD, "%s: res %d (thread %d qid %d tag %u cmd_op %x "
			"data %lx target %d/%d) stopping %d\n",
			__func__, cqe->res, t->idx, q_id,
			user_data_to_tag(cqe->user_data),
			cmd_op, cqe->user_data, is_target_io(cqe->user_data),
			user_data_to_tgt_data(cqe->user_data),
			(t->state & UBLKS_T_STOPPING));

	/* Don't retrieve io in case of target io */
	if (is_target_io(cqe->user_data)) {
		ublksrv_handle_tgt_cqe(t, &dev->q[q_id], cqe);
		return;
	}

	if (ublk_thread_batch_io(t))
		ublk_batch_compl_cmd(t, cqe);
	else
		ublk_handle_uring_cmd(t, &dev->q[q_id], cqe);
}
909 
910 static int ublk_reap_events_uring(struct ublk_thread *t)
911 {
912 	struct io_uring_cqe *cqe;
913 	unsigned head;
914 	int count = 0;
915 
916 	io_uring_for_each_cqe(&t->ring, head, cqe) {
917 		ublk_handle_cqe(t, cqe, NULL);
918 		count += 1;
919 	}
920 	io_uring_cq_advance(&t->ring, count);
921 
922 	return count;
923 }
924 
/*
 * One iteration of the server loop: submit pending SQEs, wait for at
 * least one completion, and reap all available CQEs. In batch mode the
 * reaped commit payloads are flushed back as a batch afterwards.
 *
 * Returns the number of reaped CQEs, or -ENODEV once the thread is done
 * (stopping, idle, and nothing inflight).
 */
static int ublk_process_io(struct ublk_thread *t)
{
	int ret, reapped;

	ublk_dbg(UBLK_DBG_THREAD, "dev%d-t%u: to_submit %d inflight cmd %u stopping %d\n",
				t->dev->dev_info.dev_id,
				t->idx, io_uring_sq_ready(&t->ring),
				t->cmd_inflight,
				(t->state & UBLKS_T_STOPPING));

	if (ublk_thread_is_done(t))
		return -ENODEV;

	ret = io_uring_submit_and_wait(&t->ring, 1);
	if (ublk_thread_batch_io(t)) {
		ublk_batch_prep_commit(t);
		reapped = ublk_reap_events_uring(t);
		ublk_batch_commit_io_cmds(t);
	} else {
		reapped = ublk_reap_events_uring(t);
	}

	ublk_dbg(UBLK_DBG_THREAD, "submit result %d, reapped %d stop %d idle %d\n",
			ret, reapped, (t->state & UBLKS_T_STOPPING),
			(t->state & UBLKS_T_IDLE));

	return reapped;
}
953 
/* Per-thread bootstrap arguments handed to ublk_io_handler_fn(). */
struct ublk_thread_info {
	struct ublk_dev 	*dev;
	pthread_t		thread;
	unsigned		idx;	/* thread index within the device */
	/* posted once the thread has finished its io_uring setup */
	sem_t 			*ready;
	/* optional CPU set to pin the thread to; may be NULL */
	cpu_set_t 		*affinity;
	unsigned long long	extra_flags;
	/* per-thread queue ownership map; only set in batch io mode */
	unsigned char		(*q_thread_map)[UBLK_MAX_QUEUES];
};
963 
964 static void ublk_thread_set_sched_affinity(const struct ublk_thread_info *info)
965 {
966 	if (pthread_setaffinity_np(pthread_self(), sizeof(*info->affinity), info->affinity) < 0)
967 		ublk_err("ublk dev %u thread %u set affinity failed",
968 				info->dev->dev_info.dev_id, info->idx);
969 }
970 
/*
 * Prepare batch io commands for every queue mapped to this thread.
 * Queues not owned by this thread (q_map[i] == 0) are skipped.
 */
static void ublk_batch_setup_queues(struct ublk_thread *t)
{
	int i;

	for (i = 0; i < t->dev->dev_info.nr_hw_queues; i++) {
		struct ublk_queue *q = &t->dev->q[i];
		int ret;

		/*
		 * Only prepare io commands in the mapped thread context,
		 * otherwise io command buffer index may not work as expected
		 */
		if (t->q_map[i] == 0)
			continue;

		ret = ublk_batch_queue_prep_io_cmds(t, q);
		ublk_assert(ret >= 0);
	}
}
990 
/*
 * Body of a ublk server thread: initialize the per-thread state and
 * io_uring, signal readiness, issue the initial fetch commands (classic
 * or batch mode), then run the io loop until ublk_process_io() reports
 * the thread is done. Returns 0, or the thread-init error.
 */
static __attribute__((noinline)) int __ublk_io_handler_fn(struct ublk_thread_info *info)
{
	struct ublk_thread t = {
		.dev = info->dev,
		.idx = info->idx,
	};
	int dev_id = info->dev->dev_info.dev_id;
	int ret;

	/* Copy per-thread queue mapping into thread-local variable */
	if (info->q_thread_map)
		memcpy(t.q_map, info->q_thread_map[info->idx], sizeof(t.q_map));

	ret = ublk_thread_init(&t, info->extra_flags);
	if (ret) {
		ublk_err("ublk dev %d thread %u init failed\n",
				dev_id, t.idx);
		return ret;
	}
	/* unblock ublk_start_daemon(): this thread is fully set up */
	sem_post(info->ready);

	ublk_dbg(UBLK_DBG_THREAD, "tid %d: ublk dev %d thread %u started\n",
			gettid(), dev_id, t.idx);

	if (!ublk_thread_batch_io(&t)) {
		/* submit all io commands to ublk driver */
		ublk_submit_fetch_commands(&t);
	} else {
		ublk_batch_setup_queues(&t);
		ublk_batch_start_fetch(&t);
	}

	do {
		if (ublk_process_io(&t) < 0)
			break;
	} while (1);

	ublk_dbg(UBLK_DBG_THREAD, "tid %d: ublk dev %d thread %d exiting\n",
		 gettid(), dev_id, t.idx);
	ublk_thread_deinit(&t);
	return 0;
}
1033 
/* pthread entry point: apply CPU affinity, then run the io handler loop. */
static void *ublk_io_handler_fn(void *data)
{
	struct ublk_thread_info *info = data;

	/*
	 * IO perf is sensitive with queue pthread affinity on NUMA machine
	 *
	 * Set sched_affinity at beginning, so following allocated memory/pages
	 * could be CPU/NUMA aware.
	 */
	if (info->affinity)
		ublk_thread_set_sched_affinity(info);

	__ublk_io_handler_fn(info);

	return NULL;
}
1051 
1052 static void ublk_set_parameters(struct ublk_dev *dev)
1053 {
1054 	int ret;
1055 
1056 	ret = ublk_ctrl_set_params(dev, &dev->tgt.params);
1057 	if (ret)
1058 		ublk_err("dev %d set basic parameter failed %d\n",
1059 				dev->dev_info.dev_id, ret);
1060 }
1061 
/*
 * Notify the parent process (via the inherited eventfd) that the device
 * started (@dev_id >= 0) or failed (ERROR_EVTFD_DEVID). Also snapshots
 * the queue state into the shared-memory shadow device for the parent,
 * then closes the eventfd and detaches the shared segment.
 */
static int ublk_send_dev_event(const struct dev_ctx *ctx, struct ublk_dev *dev, int dev_id)
{
	uint64_t id;
	int evtfd = ctx->_evtfd;

	if (evtfd < 0)
		return -EBADF;

	/* ids are shifted by one so 0 is never a valid payload */
	if (dev_id >= 0)
		id = dev_id + 1;
	else
		id = ERROR_EVTFD_DEVID;

	if (dev && ctx->shadow_dev)
		memcpy(&ctx->shadow_dev->q, &dev->q, sizeof(dev->q));

	if (write(evtfd, &id, sizeof(id)) != sizeof(id))
		return -EINVAL;

	close(evtfd);
	shmdt(ctx->shadow_dev);

	return 0;
}
1086 
1087 
1088 static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev)
1089 {
1090 	const struct ublksrv_ctrl_dev_info *dinfo = &dev->dev_info;
1091 	struct ublk_thread_info *tinfo;
1092 	unsigned long long extra_flags = 0;
1093 	cpu_set_t *affinity_buf;
1094 	unsigned char (*q_thread_map)[UBLK_MAX_QUEUES] = NULL;
1095 	void *thread_ret;
1096 	sem_t ready;
1097 	int ret, i;
1098 
1099 	ublk_dbg(UBLK_DBG_DEV, "%s enter\n", __func__);
1100 
1101 	tinfo = calloc(sizeof(struct ublk_thread_info), dev->nthreads);
1102 	if (!tinfo)
1103 		return -ENOMEM;
1104 
1105 	sem_init(&ready, 0, 0);
1106 	ret = ublk_dev_prep(ctx, dev);
1107 	if (ret)
1108 		return ret;
1109 
1110 	ret = ublk_ctrl_get_affinity(dev, &affinity_buf);
1111 	if (ret)
1112 		return ret;
1113 
1114 	if (ublk_dev_batch_io(dev)) {
1115 		q_thread_map = calloc(dev->nthreads, sizeof(*q_thread_map));
1116 		if (!q_thread_map) {
1117 			ret = -ENOMEM;
1118 			goto fail;
1119 		}
1120 		ublk_batch_setup_map(q_thread_map, dev->nthreads,
1121 				     dinfo->nr_hw_queues);
1122 	}
1123 
1124 	if (ctx->auto_zc_fallback)
1125 		extra_flags = UBLKS_Q_AUTO_BUF_REG_FALLBACK;
1126 	if (ctx->no_ublk_fixed_fd)
1127 		extra_flags |= UBLKS_Q_NO_UBLK_FIXED_FD;
1128 
1129 	for (i = 0; i < dinfo->nr_hw_queues; i++) {
1130 		dev->q[i].dev = dev;
1131 		dev->q[i].q_id = i;
1132 
1133 		ret = ublk_queue_init(&dev->q[i], extra_flags,
1134 				      ctx->metadata_size);
1135 		if (ret) {
1136 			ublk_err("ublk dev %d queue %d init queue failed\n",
1137 				 dinfo->dev_id, i);
1138 			goto fail;
1139 		}
1140 	}
1141 
1142 	for (i = 0; i < dev->nthreads; i++) {
1143 		tinfo[i].dev = dev;
1144 		tinfo[i].idx = i;
1145 		tinfo[i].ready = &ready;
1146 		tinfo[i].extra_flags = extra_flags;
1147 		tinfo[i].q_thread_map = q_thread_map;
1148 
1149 		/*
1150 		 * If threads are not tied 1:1 to queues, setting thread
1151 		 * affinity based on queue affinity makes little sense.
1152 		 * However, thread CPU affinity has significant impact
1153 		 * on performance, so to compare fairly, we'll still set
1154 		 * thread CPU affinity based on queue affinity where
1155 		 * possible.
1156 		 */
1157 		if (dev->nthreads == dinfo->nr_hw_queues)
1158 			tinfo[i].affinity = &affinity_buf[i];
1159 		pthread_create(&tinfo[i].thread, NULL,
1160 				ublk_io_handler_fn,
1161 				&tinfo[i]);
1162 	}
1163 
1164 	for (i = 0; i < dev->nthreads; i++)
1165 		sem_wait(&ready);
1166 	free(affinity_buf);
1167 	free(q_thread_map);
1168 
1169 	/* everything is fine now, start us */
1170 	if (ctx->recovery)
1171 		ret = ublk_ctrl_end_user_recovery(dev, getpid());
1172 	else {
1173 		ublk_set_parameters(dev);
1174 		ret = ublk_ctrl_start_dev(dev, getpid());
1175 	}
1176 	if (ret < 0) {
1177 		ublk_err("%s: ublk_ctrl_start_dev failed: %d\n", __func__, ret);
1178 		/* stop device so that inflight uring_cmd can be cancelled */
1179 		ublk_ctrl_stop_dev(dev);
1180 		goto fail_start;
1181 	}
1182 
1183 	ublk_ctrl_get_info(dev);
1184 	if (ctx->fg)
1185 		ublk_ctrl_dump(dev);
1186 	else
1187 		ublk_send_dev_event(ctx, dev, dev->dev_info.dev_id);
1188 fail_start:
1189 	/* wait until we are terminated */
1190 	for (i = 0; i < dev->nthreads; i++)
1191 		pthread_join(tinfo[i].thread, &thread_ret);
1192 	free(tinfo);
1193  fail:
1194 	for (i = 0; i < dinfo->nr_hw_queues; i++)
1195 		ublk_queue_deinit(&dev->q[i]);
1196 	ublk_dev_unprep(dev);
1197 	ublk_dbg(UBLK_DBG_DEV, "%s exit\n", __func__);
1198 
1199 	return ret;
1200 }
1201 
1202 static int wait_ublk_dev(const char *path, int evt_mask, unsigned timeout)
1203 {
1204 #define EV_SIZE (sizeof(struct inotify_event))
1205 #define EV_BUF_LEN (128 * (EV_SIZE + 16))
1206 	struct pollfd pfd;
1207 	int fd, wd;
1208 	int ret = -EINVAL;
1209 	const char *dev_name = basename(path);
1210 
1211 	fd = inotify_init();
1212 	if (fd < 0) {
1213 		ublk_dbg(UBLK_DBG_DEV, "%s: inotify init failed\n", __func__);
1214 		return fd;
1215 	}
1216 
1217 	wd = inotify_add_watch(fd, "/dev", evt_mask);
1218 	if (wd == -1) {
1219 		ublk_dbg(UBLK_DBG_DEV, "%s: add watch for /dev failed\n", __func__);
1220 		goto fail;
1221 	}
1222 
1223 	pfd.fd = fd;
1224 	pfd.events = POLL_IN;
1225 	while (1) {
1226 		int i = 0;
1227 		char buffer[EV_BUF_LEN];
1228 		ret = poll(&pfd, 1, 1000 * timeout);
1229 
1230 		if (ret == -1) {
1231 			ublk_err("%s: poll inotify failed: %d\n", __func__, ret);
1232 			goto rm_watch;
1233 		} else if (ret == 0) {
1234 			ublk_err("%s: poll inotify timeout\n", __func__);
1235 			ret = -ETIMEDOUT;
1236 			goto rm_watch;
1237 		}
1238 
1239 		ret = read(fd, buffer, EV_BUF_LEN);
1240 		if (ret < 0) {
1241 			ublk_err("%s: read inotify fd failed\n", __func__);
1242 			goto rm_watch;
1243 		}
1244 
1245 		while (i < ret) {
1246 			struct inotify_event *event = (struct inotify_event *)&buffer[i];
1247 
1248 			ublk_dbg(UBLK_DBG_DEV, "%s: inotify event %x %s\n",
1249 					__func__, event->mask, event->name);
1250 			if (event->mask & evt_mask) {
1251 				if (!strcmp(event->name, dev_name)) {
1252 					ret = 0;
1253 					goto rm_watch;
1254 				}
1255 			}
1256 			i += EV_SIZE + event->len;
1257 		}
1258 	}
1259 rm_watch:
1260 	inotify_rm_watch(fd, wd);
1261 fail:
1262 	close(fd);
1263 	return ret;
1264 }
1265 
1266 static int ublk_stop_io_daemon(const struct ublk_dev *dev)
1267 {
1268 	int daemon_pid = dev->dev_info.ublksrv_pid;
1269 	int dev_id = dev->dev_info.dev_id;
1270 	char ublkc[64];
1271 	int ret = 0;
1272 
1273 	if (daemon_pid < 0)
1274 		return 0;
1275 
1276 	/* daemon may be dead already */
1277 	if (kill(daemon_pid, 0) < 0)
1278 		goto wait;
1279 
1280 	snprintf(ublkc, sizeof(ublkc), "/dev/%s%d", "ublkc", dev_id);
1281 
1282 	/* ublk char device may be gone already */
1283 	if (access(ublkc, F_OK) != 0)
1284 		goto wait;
1285 
1286 	/* Wait until ublk char device is closed, when the daemon is shutdown */
1287 	ret = wait_ublk_dev(ublkc, IN_CLOSE, 10);
1288 	/* double check and since it may be closed before starting inotify */
1289 	if (ret == -ETIMEDOUT)
1290 		ret = kill(daemon_pid, 0) < 0;
1291 wait:
1292 	waitpid(daemon_pid, NULL, 0);
1293 	ublk_dbg(UBLK_DBG_DEV, "%s: pid %d dev_id %d ret %d\n",
1294 			__func__, daemon_pid, dev_id, ret);
1295 
1296 	return ret;
1297 }
1298 
/*
 * Create (or recover) one ublk device and run its IO daemon to
 * completion.
 *
 * Validates the target type and the queue/depth/thread parameters,
 * allocates the control device, checks required kernel features, fills
 * in the device info from @ctx, registers the device with the kernel
 * (or starts user recovery), and finally runs ublk_start_daemon(),
 * which blocks until the daemon is terminated.
 *
 * Returns 0 on success or a negative errno; on failure the parent is
 * notified through the eventfd (dev_id -1 == error).
 */
static int __cmd_dev_add(const struct dev_ctx *ctx)
{
	unsigned nthreads = ctx->nthreads;
	unsigned nr_queues = ctx->nr_hw_queues;
	const char *tgt_type = ctx->tgt_type;
	unsigned depth = ctx->queue_depth;
	__u64 features;
	const struct ublk_tgt_ops *ops;
	struct ublksrv_ctrl_dev_info *info;
	struct ublk_dev *dev = NULL;
	int dev_id = ctx->dev_id;
	int ret, i;

	ops = ublk_find_tgt(tgt_type);
	if (!ops) {
		ublk_err("%s: no such tgt type, type %s\n",
				__func__, tgt_type);
		ret = -ENODEV;
		goto fail;
	}

	if (nr_queues > UBLK_MAX_QUEUES || depth > UBLK_QUEUE_DEPTH) {
		ublk_err("%s: invalid nr_queues or depth queues %u depth %u\n",
				__func__, nr_queues, depth);
		ret = -EINVAL;
		goto fail;
	}

	/* default to 1:1 threads:queues if nthreads is unspecified */
	if (!nthreads)
		nthreads = nr_queues;

	if (nthreads > UBLK_MAX_THREADS) {
		ublk_err("%s: %u is too many threads (max %u)\n",
				__func__, nthreads, UBLK_MAX_THREADS);
		ret = -EINVAL;
		goto fail;
	}

	/* threads may diverge from queues only with per_io_tasks or batch IO */
	if (nthreads != nr_queues && (!ctx->per_io_tasks &&
				!(ctx->flags & UBLK_F_BATCH_IO))) {
		ublk_err("%s: threads %u must be same as queues %u if "
			"not using per_io_tasks\n",
			__func__, nthreads, nr_queues);
		ret = -EINVAL;
		goto fail;
	}

	dev = ublk_ctrl_init();
	if (!dev) {
		ublk_err("%s: can't alloc dev id %d, type %s\n",
				__func__, dev_id, tgt_type);
		ret = -ENOMEM;
		goto fail;
	}

	/* kernel doesn't support get_features */
	ret = ublk_ctrl_get_features(dev, &features);
	if (ret < 0) {
		ret = -EINVAL;
		goto fail;
	}

	/* this tool always issues ioctl-encoded commands */
	if (!(features & UBLK_F_CMD_IOCTL_ENCODE)) {
		ret = -ENOTSUP;
		goto fail;
	}

	info = &dev->dev_info;
	info->dev_id = ctx->dev_id;
	info->nr_hw_queues = nr_queues;
	info->queue_depth = depth;
	info->flags = ctx->flags;
	/* piggyback quiesce support onto user recovery when available */
	if ((features & UBLK_F_QUIESCE) &&
			(info->flags & UBLK_F_USER_RECOVERY))
		info->flags |= UBLK_F_QUIESCE;
	dev->nthreads = nthreads;
	dev->per_io_tasks = ctx->per_io_tasks;
	dev->tgt.ops = ops;
	dev->tgt.sq_depth = depth;
	dev->tgt.cq_depth = depth;

	/*
	 * NOTE(review): unbounded strcpy — assumes backing_file[i] is
	 * large enough for any path in ctx->files[]; confirm the array
	 * size in the tgt struct
	 */
	for (i = 0; i < MAX_BACK_FILES; i++) {
		if (ctx->files[i]) {
			strcpy(dev->tgt.backing_file[i], ctx->files[i]);
			dev->tgt.nr_backing_files++;
		}
	}

	if (ctx->recovery)
		ret = ublk_ctrl_start_user_recovery(dev);
	else
		ret = ublk_ctrl_add_dev(dev);
	if (ret < 0) {
		ublk_err("%s: can't add dev id %d, type %s ret %d\n",
				__func__, dev_id, tgt_type, ret);
		goto fail;
	}

	/* blocks here for the whole lifetime of the daemon */
	ret = ublk_start_daemon(ctx, dev);
	ublk_dbg(UBLK_DBG_DEV, "%s: daemon exit %d\n", __func__, ret);
	if (ret < 0)
		ublk_ctrl_del_dev(dev);

fail:
	/* tell the waiting foreground parent that we failed (dev_id -1) */
	if (ret < 0)
		ublk_send_dev_event(ctx, dev, -1);
	if (dev)
		ublk_ctrl_deinit(dev);
	return ret;
}
1410 
1411 static int __cmd_dev_list(struct dev_ctx *ctx);
1412 
/*
 * Add (or recover) a device, optionally daemonizing.
 *
 * Foreground (--foreground): run __cmd_dev_add() directly in this
 * process via the "run" label.
 *
 * Background (default): create a shared-memory "shadow" device plus an
 * eventfd, then double-fork.  The grandchild becomes the detached daemon
 * and jumps to "run"; the intermediate child exits right away so the
 * daemon is re-parented; the parent blocks on the eventfd until the
 * daemon reports the allocated device id (or ERROR_EVTFD_DEVID on
 * failure), lists the device, tears down the IPC objects, reaps the
 * child and exits with the final status.
 */
static int cmd_dev_add(struct dev_ctx *ctx)
{
	int res;

	if (ctx->fg)
		goto run;

	/* shared memory lets the daemon publish queue state for the dump */
	ctx->_shmid = shmget(IPC_PRIVATE, sizeof(struct ublk_dev), IPC_CREAT | 0666);
	if (ctx->_shmid < 0) {
		ublk_err("%s: failed to shmget %s\n", __func__, strerror(errno));
		exit(-1);
	}
	ctx->shadow_dev = (struct ublk_dev *)shmat(ctx->_shmid, NULL, 0);
	if (ctx->shadow_dev == (struct ublk_dev *)-1) {
		ublk_err("%s: failed to shmat %s\n", __func__, strerror(errno));
		exit(-1);
	}
	ctx->_evtfd = eventfd(0, 0);
	if (ctx->_evtfd < 0) {
		ublk_err("%s: failed to create eventfd %s\n", __func__, strerror(errno));
		exit(-1);
	}

	res = fork();
	if (res == 0) {
		int res2;

		setsid();
		res2 = fork();
		if (res2 == 0) {
			/* prepare for detaching */
			close(STDIN_FILENO);
			close(STDOUT_FILENO);
			close(STDERR_FILENO);
/* NOTE: also the direct entry point for the --foreground case above */
run:
			res = __cmd_dev_add(ctx);
			return res;
		} else {
			/* detached from the foreground task */
			exit(EXIT_SUCCESS);
		}
	} else if (res > 0) {
		uint64_t id;
		int exit_code = EXIT_FAILURE;

		/* block until the daemon reports its device id or an error */
		res = read(ctx->_evtfd, &id, sizeof(id));
		close(ctx->_evtfd);
		if (res == sizeof(id) && id != ERROR_EVTFD_DEVID) {
			/* the daemon wrote dev_id + 1; undo the encoding */
			ctx->dev_id = id - 1;
			if (__cmd_dev_list(ctx) >= 0)
				exit_code = EXIT_SUCCESS;
		}
		shmdt(ctx->shadow_dev);
		shmctl(ctx->_shmid, IPC_RMID, NULL);
		/* wait for child and detach from it */
		wait(NULL);
		if (exit_code == EXIT_FAILURE)
			ublk_err("%s: command failed\n", __func__);
		exit(exit_code);
	} else {
		exit(EXIT_FAILURE);
	}
}
1476 
1477 static int __cmd_dev_del(struct dev_ctx *ctx)
1478 {
1479 	int number = ctx->dev_id;
1480 	struct ublk_dev *dev;
1481 	int ret;
1482 
1483 	dev = ublk_ctrl_init();
1484 	dev->dev_info.dev_id = number;
1485 
1486 	ret = ublk_ctrl_get_info(dev);
1487 	if (ret < 0)
1488 		goto fail;
1489 
1490 	ret = ublk_ctrl_stop_dev(dev);
1491 	if (ret < 0)
1492 		ublk_err("%s: stop dev %d failed ret %d\n", __func__, number, ret);
1493 
1494 	ret = ublk_stop_io_daemon(dev);
1495 	if (ret < 0)
1496 		ublk_err("%s: stop daemon id %d dev %d, ret %d\n",
1497 				__func__, dev->dev_info.ublksrv_pid, number, ret);
1498 	ublk_ctrl_del_dev(dev);
1499 fail:
1500 	ublk_ctrl_deinit(dev);
1501 
1502 	return (ret >= 0) ? 0 : ret;
1503 }
1504 
1505 static int cmd_dev_del(struct dev_ctx *ctx)
1506 {
1507 	int i;
1508 
1509 	if (ctx->dev_id >= 0 || !ctx->all)
1510 		return __cmd_dev_del(ctx);
1511 
1512 	for (i = 0; i < 255; i++) {
1513 		ctx->dev_id = i;
1514 		__cmd_dev_del(ctx);
1515 	}
1516 	return 0;
1517 }
1518 
1519 static int cmd_dev_stop(struct dev_ctx *ctx)
1520 {
1521 	int number = ctx->dev_id;
1522 	struct ublk_dev *dev;
1523 	int ret;
1524 
1525 	if (number < 0) {
1526 		ublk_err("%s: device id is required\n", __func__);
1527 		return -EINVAL;
1528 	}
1529 
1530 	dev = ublk_ctrl_init();
1531 	dev->dev_info.dev_id = number;
1532 
1533 	ret = ublk_ctrl_get_info(dev);
1534 	if (ret < 0)
1535 		goto fail;
1536 
1537 	if (ctx->safe_stop) {
1538 		ret = ublk_ctrl_try_stop_dev(dev);
1539 		if (ret < 0)
1540 			ublk_err("%s: try_stop dev %d failed ret %d\n",
1541 					__func__, number, ret);
1542 	} else {
1543 		ret = ublk_ctrl_stop_dev(dev);
1544 		if (ret < 0)
1545 			ublk_err("%s: stop dev %d failed ret %d\n",
1546 					__func__, number, ret);
1547 	}
1548 
1549 fail:
1550 	ublk_ctrl_deinit(dev);
1551 
1552 	return ret;
1553 }
1554 
1555 static int __cmd_dev_list(struct dev_ctx *ctx)
1556 {
1557 	struct ublk_dev *dev = ublk_ctrl_init();
1558 	int ret;
1559 
1560 	if (!dev)
1561 		return -ENODEV;
1562 
1563 	dev->dev_info.dev_id = ctx->dev_id;
1564 
1565 	ret = ublk_ctrl_get_info(dev);
1566 	if (ret < 0) {
1567 		if (ctx->logging)
1568 			ublk_err("%s: can't get dev info from %d: %d\n",
1569 					__func__, ctx->dev_id, ret);
1570 	} else {
1571 		if (ctx->shadow_dev)
1572 			memcpy(&dev->q, ctx->shadow_dev->q, sizeof(dev->q));
1573 
1574 		ublk_ctrl_dump(dev);
1575 	}
1576 
1577 	ublk_ctrl_deinit(dev);
1578 
1579 	return ret;
1580 }
1581 
1582 static int cmd_dev_list(struct dev_ctx *ctx)
1583 {
1584 	int i;
1585 
1586 	if (ctx->dev_id >= 0 || !ctx->all)
1587 		return __cmd_dev_list(ctx);
1588 
1589 	ctx->logging = false;
1590 	for (i = 0; i < 255; i++) {
1591 		ctx->dev_id = i;
1592 		__cmd_dev_list(ctx);
1593 	}
1594 	return 0;
1595 }
1596 
1597 static int cmd_dev_get_features(void)
1598 {
1599 #define const_ilog2(x) (63 - __builtin_clzll(x))
1600 #define FEAT_NAME(f) [const_ilog2(f)] = #f
1601 	static const char *feat_map[] = {
1602 		FEAT_NAME(UBLK_F_SUPPORT_ZERO_COPY),
1603 		FEAT_NAME(UBLK_F_URING_CMD_COMP_IN_TASK),
1604 		FEAT_NAME(UBLK_F_NEED_GET_DATA),
1605 		FEAT_NAME(UBLK_F_USER_RECOVERY),
1606 		FEAT_NAME(UBLK_F_USER_RECOVERY_REISSUE),
1607 		FEAT_NAME(UBLK_F_UNPRIVILEGED_DEV),
1608 		FEAT_NAME(UBLK_F_CMD_IOCTL_ENCODE),
1609 		FEAT_NAME(UBLK_F_USER_COPY),
1610 		FEAT_NAME(UBLK_F_ZONED),
1611 		FEAT_NAME(UBLK_F_USER_RECOVERY_FAIL_IO),
1612 		FEAT_NAME(UBLK_F_UPDATE_SIZE),
1613 		FEAT_NAME(UBLK_F_AUTO_BUF_REG),
1614 		FEAT_NAME(UBLK_F_QUIESCE),
1615 		FEAT_NAME(UBLK_F_PER_IO_DAEMON),
1616 		FEAT_NAME(UBLK_F_BUF_REG_OFF_DAEMON),
1617 		FEAT_NAME(UBLK_F_INTEGRITY),
1618 		FEAT_NAME(UBLK_F_SAFE_STOP_DEV),
1619 		FEAT_NAME(UBLK_F_BATCH_IO),
1620 		FEAT_NAME(UBLK_F_NO_AUTO_PART_SCAN),
1621 	};
1622 	struct ublk_dev *dev;
1623 	__u64 features = 0;
1624 	int ret;
1625 
1626 	dev = ublk_ctrl_init();
1627 	if (!dev) {
1628 		fprintf(stderr, "ublksrv_ctrl_init failed id\n");
1629 		return -EOPNOTSUPP;
1630 	}
1631 
1632 	ret = ublk_ctrl_get_features(dev, &features);
1633 	if (!ret) {
1634 		int i;
1635 
1636 		printf("ublk_drv features: 0x%llx\n", features);
1637 
1638 		for (i = 0; i < sizeof(features) * 8; i++) {
1639 			const char *feat;
1640 
1641 			if (!((1ULL << i)  & features))
1642 				continue;
1643 			if (i < ARRAY_SIZE(feat_map))
1644 				feat = feat_map[i];
1645 			else
1646 				feat = "unknown";
1647 			printf("0x%-16llx: %s\n", 1ULL << i, feat);
1648 		}
1649 	}
1650 
1651 	return ret;
1652 }
1653 
1654 static int cmd_dev_update_size(struct dev_ctx *ctx)
1655 {
1656 	struct ublk_dev *dev = ublk_ctrl_init();
1657 	struct ublk_params p;
1658 	int ret = -EINVAL;
1659 
1660 	if (!dev)
1661 		return -ENODEV;
1662 
1663 	if (ctx->dev_id < 0) {
1664 		fprintf(stderr, "device id isn't provided\n");
1665 		goto out;
1666 	}
1667 
1668 	dev->dev_info.dev_id = ctx->dev_id;
1669 	ret = ublk_ctrl_get_params(dev, &p);
1670 	if (ret < 0) {
1671 		ublk_err("failed to get params %d %s\n", ret, strerror(-ret));
1672 		goto out;
1673 	}
1674 
1675 	if (ctx->size & ((1 << p.basic.logical_bs_shift) - 1)) {
1676 		ublk_err("size isn't aligned with logical block size\n");
1677 		ret = -EINVAL;
1678 		goto out;
1679 	}
1680 
1681 	ret = ublk_ctrl_update_size(dev, ctx->size >> 9);
1682 out:
1683 	ublk_ctrl_deinit(dev);
1684 	return ret;
1685 }
1686 
1687 static int cmd_dev_quiesce(struct dev_ctx *ctx)
1688 {
1689 	struct ublk_dev *dev = ublk_ctrl_init();
1690 	int ret = -EINVAL;
1691 
1692 	if (!dev)
1693 		return -ENODEV;
1694 
1695 	if (ctx->dev_id < 0) {
1696 		fprintf(stderr, "device id isn't provided for quiesce\n");
1697 		goto out;
1698 	}
1699 	dev->dev_info.dev_id = ctx->dev_id;
1700 	ret = ublk_ctrl_quiesce_dev(dev, 10000);
1701 
1702 out:
1703 	ublk_ctrl_deinit(dev);
1704 	return ret;
1705 }
1706 
1707 static void __cmd_create_help(char *exe, bool recovery)
1708 {
1709 	int i;
1710 
1711 	printf("%s %s -t [null|loop|stripe|fault_inject] [-q nr_queues] [-d depth] [-n dev_id]\n",
1712 			exe, recovery ? "recover" : "add");
1713 	printf("\t[--foreground] [--quiet] [-z] [--auto_zc] [--auto_zc_fallback] [--debug_mask mask] [-r 0|1] [-g] [-u]\n");
1714 	printf("\t[-e 0|1 ] [-i 0|1] [--no_ublk_fixed_fd]\n");
1715 	printf("\t[--nthreads threads] [--per_io_tasks]\n");
1716 	printf("\t[--integrity_capable] [--integrity_reftag] [--metadata_size SIZE] "
1717 		 "[--pi_offset OFFSET] [--csum_type ip|t10dif|nvme] [--tag_size SIZE]\n");
1718 	printf("\t[--batch|-b] [--no_auto_part_scan]\n");
1719 	printf("\t[target options] [backfile1] [backfile2] ...\n");
1720 	printf("\tdefault: nr_queues=2(max 32), depth=128(max 1024), dev_id=-1(auto allocation)\n");
1721 	printf("\tdefault: nthreads=nr_queues");
1722 
1723 	for (i = 0; i < ARRAY_SIZE(tgt_ops_list); i++) {
1724 		const struct ublk_tgt_ops *ops = tgt_ops_list[i];
1725 
1726 		if (ops->usage)
1727 			ops->usage(ops);
1728 	}
1729 }
1730 
1731 static void cmd_add_help(char *exe)
1732 {
1733 	__cmd_create_help(exe, false);
1734 	printf("\n");
1735 }
1736 
1737 static void cmd_recover_help(char *exe)
1738 {
1739 	__cmd_create_help(exe, true);
1740 	printf("\tPlease provide exact command line for creating this device with real dev_id\n");
1741 	printf("\n");
1742 }
1743 
/* Print the full usage text for every subcommand; always returns 0. */
static int cmd_dev_help(char *exe)
{
	cmd_add_help(exe);
	cmd_recover_help(exe);

	printf("%s del [-n dev_id] -a \n"
	       "\t -a delete all devices -n delete specified device\n\n", exe);
	printf("%s stop -n dev_id [--safe]\n"
	       "\t --safe only stop if device has no active openers\n\n", exe);
	printf("%s list [-n dev_id] -a \n"
	       "\t -a list all devices, -n list specified device, default -a \n\n", exe);
	printf("%s features\n", exe);
	printf("%s update_size -n dev_id -s|--size size_in_bytes \n", exe);
	printf("%s quiesce -n dev_id\n", exe);
	return 0;
}
1760 
1761 int main(int argc, char *argv[])
1762 {
1763 	static const struct option longopts[] = {
1764 		{ "all",		0,	NULL, 'a' },
1765 		{ "type",		1,	NULL, 't' },
1766 		{ "number",		1,	NULL, 'n' },
1767 		{ "queues",		1,	NULL, 'q' },
1768 		{ "depth",		1,	NULL, 'd' },
1769 		{ "debug_mask",		1,	NULL,  0  },
1770 		{ "quiet",		0,	NULL,  0  },
1771 		{ "zero_copy",          0,      NULL, 'z' },
1772 		{ "foreground",		0,	NULL,  0  },
1773 		{ "recovery", 		1,      NULL, 'r' },
1774 		{ "recovery_fail_io",	1,	NULL, 'e'},
1775 		{ "recovery_reissue",	1,	NULL, 'i'},
1776 		{ "get_data",		1,	NULL, 'g'},
1777 		{ "auto_zc",		0,	NULL,  0 },
1778 		{ "auto_zc_fallback", 	0,	NULL,  0 },
1779 		{ "user_copy",		0,	NULL, 'u'},
1780 		{ "size",		1,	NULL, 's'},
1781 		{ "nthreads",		1,	NULL,  0 },
1782 		{ "per_io_tasks",	0,	NULL,  0 },
1783 		{ "no_ublk_fixed_fd",	0,	NULL,  0 },
1784 		{ "integrity_capable",	0,	NULL,  0 },
1785 		{ "integrity_reftag",	0,	NULL,  0 },
1786 		{ "metadata_size",	1,	NULL,  0 },
1787 		{ "pi_offset",		1,	NULL,  0 },
1788 		{ "csum_type",		1,	NULL,  0 },
1789 		{ "tag_size",		1,	NULL,  0 },
1790 		{ "safe",		0,	NULL,  0 },
1791 		{ "batch",              0,      NULL, 'b'},
1792 		{ "no_auto_part_scan",	0,	NULL,  0 },
1793 		{ 0, 0, 0, 0 }
1794 	};
1795 	const struct ublk_tgt_ops *ops = NULL;
1796 	int option_idx, opt;
1797 	const char *cmd = argv[1];
1798 	struct dev_ctx ctx = {
1799 		._evtfd         =       -1,
1800 		.queue_depth	=	128,
1801 		.nr_hw_queues	=	2,
1802 		.dev_id		=	-1,
1803 		.tgt_type	=	"unknown",
1804 		.csum_type	=	LBMD_PI_CSUM_NONE,
1805 	};
1806 	int ret = -EINVAL, i;
1807 	int tgt_argc = 1;
1808 	char *tgt_argv[MAX_NR_TGT_ARG] = { NULL };
1809 	int value;
1810 
1811 	if (argc == 1)
1812 		return ret;
1813 
1814 	opterr = 0;
1815 	optind = 2;
1816 	while ((opt = getopt_long(argc, argv, "t:n:d:q:r:e:i:s:gazub",
1817 				  longopts, &option_idx)) != -1) {
1818 		switch (opt) {
1819 		case 'a':
1820 			ctx.all = 1;
1821 			break;
1822 		case 'b':
1823 			ctx.flags |= UBLK_F_BATCH_IO;
1824 			break;
1825 		case 'n':
1826 			ctx.dev_id = strtol(optarg, NULL, 10);
1827 			break;
1828 		case 't':
1829 			if (strlen(optarg) < sizeof(ctx.tgt_type))
1830 				strcpy(ctx.tgt_type, optarg);
1831 			break;
1832 		case 'q':
1833 			ctx.nr_hw_queues = strtol(optarg, NULL, 10);
1834 			break;
1835 		case 'd':
1836 			ctx.queue_depth = strtol(optarg, NULL, 10);
1837 			break;
1838 		case 'z':
1839 			ctx.flags |= UBLK_F_SUPPORT_ZERO_COPY;
1840 			break;
1841 		case 'r':
1842 			value = strtol(optarg, NULL, 10);
1843 			if (value)
1844 				ctx.flags |= UBLK_F_USER_RECOVERY;
1845 			break;
1846 		case 'e':
1847 			value = strtol(optarg, NULL, 10);
1848 			if (value)
1849 				ctx.flags |= UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_FAIL_IO;
1850 			break;
1851 		case 'i':
1852 			value = strtol(optarg, NULL, 10);
1853 			if (value)
1854 				ctx.flags |= UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_REISSUE;
1855 			break;
1856 		case 'g':
1857 			ctx.flags |= UBLK_F_NEED_GET_DATA;
1858 			break;
1859 		case 'u':
1860 			ctx.flags |= UBLK_F_USER_COPY;
1861 			break;
1862 		case 's':
1863 			ctx.size = strtoull(optarg, NULL, 10);
1864 			break;
1865 		case 0:
1866 			if (!strcmp(longopts[option_idx].name, "debug_mask"))
1867 				ublk_dbg_mask = strtol(optarg, NULL, 16);
1868 			if (!strcmp(longopts[option_idx].name, "quiet"))
1869 				ublk_dbg_mask = 0;
1870 			if (!strcmp(longopts[option_idx].name, "foreground"))
1871 				ctx.fg = 1;
1872 			if (!strcmp(longopts[option_idx].name, "auto_zc"))
1873 				ctx.flags |= UBLK_F_AUTO_BUF_REG;
1874 			if (!strcmp(longopts[option_idx].name, "auto_zc_fallback"))
1875 				ctx.auto_zc_fallback = 1;
1876 			if (!strcmp(longopts[option_idx].name, "nthreads"))
1877 				ctx.nthreads = strtol(optarg, NULL, 10);
1878 			if (!strcmp(longopts[option_idx].name, "per_io_tasks"))
1879 				ctx.per_io_tasks = 1;
1880 			if (!strcmp(longopts[option_idx].name, "no_ublk_fixed_fd"))
1881 				ctx.no_ublk_fixed_fd = 1;
1882 			if (!strcmp(longopts[option_idx].name, "integrity_capable"))
1883 				ctx.integrity_flags |= LBMD_PI_CAP_INTEGRITY;
1884 			if (!strcmp(longopts[option_idx].name, "integrity_reftag"))
1885 				ctx.integrity_flags |= LBMD_PI_CAP_REFTAG;
1886 			if (!strcmp(longopts[option_idx].name, "metadata_size"))
1887 				ctx.metadata_size = strtoul(optarg, NULL, 0);
1888 			if (!strcmp(longopts[option_idx].name, "pi_offset"))
1889 				ctx.pi_offset = strtoul(optarg, NULL, 0);
1890 			if (!strcmp(longopts[option_idx].name, "csum_type")) {
1891 				if (!strcmp(optarg, "ip")) {
1892 					ctx.csum_type = LBMD_PI_CSUM_IP;
1893 				} else if (!strcmp(optarg, "t10dif")) {
1894 					ctx.csum_type = LBMD_PI_CSUM_CRC16_T10DIF;
1895 				} else if (!strcmp(optarg, "nvme")) {
1896 					ctx.csum_type = LBMD_PI_CSUM_CRC64_NVME;
1897 				} else {
1898 					ublk_err("invalid csum_type: %s\n", optarg);
1899 					return -EINVAL;
1900 				}
1901 			}
1902 			if (!strcmp(longopts[option_idx].name, "tag_size"))
1903 				ctx.tag_size = strtoul(optarg, NULL, 0);
1904 			if (!strcmp(longopts[option_idx].name, "safe"))
1905 				ctx.safe_stop = 1;
1906 			if (!strcmp(longopts[option_idx].name, "no_auto_part_scan"))
1907 				ctx.flags |= UBLK_F_NO_AUTO_PART_SCAN;
1908 			break;
1909 		case '?':
1910 			/*
1911 			 * target requires every option must have argument
1912 			 */
1913 			if (argv[optind][0] == '-' || argv[optind - 1][0] != '-') {
1914 				fprintf(stderr, "every target option requires argument: %s %s\n",
1915 						argv[optind - 1], argv[optind]);
1916 				exit(EXIT_FAILURE);
1917 			}
1918 
1919 			if (tgt_argc < (MAX_NR_TGT_ARG - 1) / 2) {
1920 				tgt_argv[tgt_argc++] = argv[optind - 1];
1921 				tgt_argv[tgt_argc++] = argv[optind];
1922 			} else {
1923 				fprintf(stderr, "too many target options\n");
1924 				exit(EXIT_FAILURE);
1925 			}
1926 			optind += 1;
1927 			break;
1928 		}
1929 	}
1930 
1931 	if (ctx.per_io_tasks && (ctx.flags & UBLK_F_BATCH_IO)) {
1932 		ublk_err("per_io_task and F_BATCH_IO conflict\n");
1933 		return -EINVAL;
1934 	}
1935 
1936 	/* auto_zc_fallback depends on F_AUTO_BUF_REG & F_SUPPORT_ZERO_COPY */
1937 	if (ctx.auto_zc_fallback &&
1938 	    !((ctx.flags & UBLK_F_AUTO_BUF_REG) &&
1939 		    (ctx.flags & UBLK_F_SUPPORT_ZERO_COPY))) {
1940 		ublk_err("%s: auto_zc_fallback is set but neither "
1941 				"F_AUTO_BUF_REG nor F_SUPPORT_ZERO_COPY is enabled\n",
1942 					__func__);
1943 		return -EINVAL;
1944 	}
1945 
1946 	if (!!(ctx.flags & UBLK_F_NEED_GET_DATA) +
1947 	    !!(ctx.flags & UBLK_F_USER_COPY) +
1948 	    (ctx.flags & UBLK_F_SUPPORT_ZERO_COPY && !ctx.auto_zc_fallback) +
1949 	    (ctx.flags & UBLK_F_AUTO_BUF_REG && !ctx.auto_zc_fallback) +
1950 	    ctx.auto_zc_fallback > 1) {
1951 		fprintf(stderr, "too many data copy modes specified\n");
1952 		return -EINVAL;
1953 	}
1954 
1955 	if (ctx.metadata_size) {
1956 		if (!(ctx.flags & UBLK_F_USER_COPY)) {
1957 			ublk_err("integrity requires user_copy\n");
1958 			return -EINVAL;
1959 		}
1960 
1961 		ctx.flags |= UBLK_F_INTEGRITY;
1962 	} else if (ctx.integrity_flags ||
1963 		   ctx.pi_offset ||
1964 		   ctx.csum_type != LBMD_PI_CSUM_NONE ||
1965 		   ctx.tag_size) {
1966 		ublk_err("integrity parameters require metadata_size\n");
1967 		return -EINVAL;
1968 	}
1969 
1970 	if ((ctx.flags & UBLK_F_AUTO_BUF_REG) &&
1971 			(ctx.flags & UBLK_F_BATCH_IO) &&
1972 			(ctx.nthreads > ctx.nr_hw_queues)) {
1973 		ublk_err("too many threads for F_AUTO_BUF_REG & F_BATCH_IO\n");
1974 		return -EINVAL;
1975 	}
1976 
1977 	i = optind;
1978 	while (i < argc && ctx.nr_files < MAX_BACK_FILES) {
1979 		ctx.files[ctx.nr_files++] = argv[i++];
1980 	}
1981 
1982 	ops = ublk_find_tgt(ctx.tgt_type);
1983 	if (ops && ops->parse_cmd_line) {
1984 		optind = 0;
1985 
1986 		tgt_argv[0] = ctx.tgt_type;
1987 		ops->parse_cmd_line(&ctx, tgt_argc, tgt_argv);
1988 	}
1989 
1990 	if (!strcmp(cmd, "add"))
1991 		ret = cmd_dev_add(&ctx);
1992 	else if (!strcmp(cmd, "recover")) {
1993 		if (ctx.dev_id < 0) {
1994 			fprintf(stderr, "device id isn't provided for recovering\n");
1995 			ret = -EINVAL;
1996 		} else {
1997 			ctx.recovery = 1;
1998 			ret = cmd_dev_add(&ctx);
1999 		}
2000 	} else if (!strcmp(cmd, "del"))
2001 		ret = cmd_dev_del(&ctx);
2002 	else if (!strcmp(cmd, "stop"))
2003 		ret = cmd_dev_stop(&ctx);
2004 	else if (!strcmp(cmd, "list")) {
2005 		ctx.all = 1;
2006 		ret = cmd_dev_list(&ctx);
2007 	} else if (!strcmp(cmd, "help"))
2008 		ret = cmd_dev_help(argv[0]);
2009 	else if (!strcmp(cmd, "features"))
2010 		ret = cmd_dev_get_features();
2011 	else if (!strcmp(cmd, "update_size"))
2012 		ret = cmd_dev_update_size(&ctx);
2013 	else if (!strcmp(cmd, "quiesce"))
2014 		ret = cmd_dev_quiesce(&ctx);
2015 	else
2016 		cmd_dev_help(argv[0]);
2017 
2018 	return ret;
2019 }
2020