xref: /linux/kernel/trace/blktrace.c (revision 23b0f90ba871f096474e1c27c3d14f455189d2d9)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2006 Jens Axboe <axboe@kernel.dk>
4  *
5  */
6 
7 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
8 
9 #include <linux/kernel.h>
10 #include <linux/blkdev.h>
11 #include <linux/blktrace_api.h>
12 #include <linux/percpu.h>
13 #include <linux/init.h>
14 #include <linux/mutex.h>
15 #include <linux/slab.h>
16 #include <linux/debugfs.h>
17 #include <linux/export.h>
18 #include <linux/time.h>
19 #include <linux/uaccess.h>
20 #include <linux/list.h>
21 #include <linux/blk-cgroup.h>
22 
23 #include "../../block/blk.h"
24 
25 #include <trace/events/block.h>
26 
27 #include "trace_output.h"
28 
29 #ifdef CONFIG_BLK_DEV_IO_TRACE
30 
31 static unsigned int blktrace_seq __read_mostly = 1;
32 
33 static struct trace_array *blk_tr;
34 static bool blk_tracer_enabled __read_mostly;
35 
36 static LIST_HEAD(running_trace_list);
37 static __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(running_trace_lock);
38 
39 /* Select an alternative, minimalistic output format instead of the original one */
40 #define TRACE_BLK_OPT_CLASSIC	0x1
41 #define TRACE_BLK_OPT_CGROUP	0x2
42 #define TRACE_BLK_OPT_CGNAME	0x4
43 
44 static struct tracer_opt blk_tracer_opts[] = {
45 	/* The minimalistic output is disabled by default */
46 	{ TRACER_OPT(blk_classic, TRACE_BLK_OPT_CLASSIC) },
47 #ifdef CONFIG_BLK_CGROUP
48 	{ TRACER_OPT(blk_cgroup, TRACE_BLK_OPT_CGROUP) },
49 	{ TRACER_OPT(blk_cgname, TRACE_BLK_OPT_CGNAME) },
50 #endif
51 	{ }
52 };
53 
54 static struct tracer_flags blk_tracer_flags = {
55 	.val  = 0,
56 	.opts = blk_tracer_opts,
57 };
58 
59 /* Global reference count of probes */
60 static DEFINE_MUTEX(blk_probe_mutex);
61 static int blk_probes_ref;
62 
63 static void blk_register_tracepoints(void);
64 static void blk_unregister_tracepoints(void);
65 
66 static void record_blktrace_event(struct blk_io_trace *t, pid_t pid, int cpu,
67 				  sector_t sector, int bytes, u64 what,
68 				  dev_t dev, int error, u64 cgid,
69 				  ssize_t cgid_len, void *pdu_data, int pdu_len)
71 {
72 	/*
73 	 * These two are not needed in ftrace, as they already live in the
74 	 * generic trace_entry and are filled by tracing_generic_entry_update,
75 	 * but we fill them here too for the benefit of the
76 	 * trace_event->bin() synthesizer.
77 	 */
78 	t->cpu = cpu;
79 	t->pid = pid;
80 
81 	t->sector = sector;
82 	t->bytes = bytes;
83 	t->action = lower_32_bits(what);
84 	t->device = dev;
85 	t->error = error;
86 	t->pdu_len = pdu_len + cgid_len;
87 
88 	if (cgid_len)
89 		memcpy((void *)t + sizeof(*t), &cgid, cgid_len);
90 	if (pdu_len)
91 		memcpy((void *)t + sizeof(*t) + cgid_len, pdu_data, pdu_len);
92 }
93 
94 static void record_blktrace_event2(struct blk_io_trace2 *t2, pid_t pid, int cpu,
95 				   sector_t sector, int bytes, u64 what,
96 				   dev_t dev, int error, u64 cgid,
97 				   ssize_t cgid_len, void *pdu_data,
98 				   int pdu_len)
99 {
100 	t2->pid = pid;
101 	t2->cpu = cpu;
102 
103 	t2->sector = sector;
104 	t2->bytes = bytes;
105 	t2->action = what;
106 	t2->device = dev;
107 	t2->error = error;
108 	t2->pdu_len = pdu_len + cgid_len;
109 
110 	if (cgid_len)
111 		memcpy((void *)t2 + sizeof(*t2), &cgid, cgid_len);
112 	if (pdu_len)
113 		memcpy((void *)t2 + sizeof(*t2) + cgid_len, pdu_data, pdu_len);
114 }
115 
116 static void relay_blktrace_event1(struct blk_trace *bt, unsigned long sequence,
117 				 pid_t pid, int cpu, sector_t sector, int bytes,
118 				 u64 what, int error, u64 cgid,
119 				 ssize_t cgid_len, void *pdu_data, int pdu_len)
120 {
121 	struct blk_io_trace *t;
122 	size_t trace_len = sizeof(*t) + pdu_len + cgid_len;
123 
124 	t = relay_reserve(bt->rchan, trace_len);
125 	if (!t)
126 		return;
127 
128 	t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
129 	t->sequence = sequence;
130 	t->time = ktime_to_ns(ktime_get());
131 
132 	record_blktrace_event(t, pid, cpu, sector, bytes, what, bt->dev, error,
133 			      cgid, cgid_len, pdu_data, pdu_len);
134 }
135 
136 static void relay_blktrace_event2(struct blk_trace *bt, unsigned long sequence,
137 				  pid_t pid, int cpu, sector_t sector,
138 				  int bytes, u64 what, int error, u64 cgid,
139 				  ssize_t cgid_len, void *pdu_data, int pdu_len)
140 {
141 	struct blk_io_trace2 *t;
142 	size_t trace_len = sizeof(struct blk_io_trace2) + pdu_len + cgid_len;
143 
144 	t = relay_reserve(bt->rchan, trace_len);
145 	if (!t)
146 		return;
147 
148 	t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE2_VERSION;
149 	t->sequence = sequence;
150 	t->time = ktime_to_ns(ktime_get());
151 
152 	record_blktrace_event2(t, pid, cpu, sector, bytes, what, bt->dev, error,
153 			       cgid, cgid_len, pdu_data, pdu_len);
154 }
155 
156 static void relay_blktrace_event(struct blk_trace *bt, unsigned long sequence,
157 				 pid_t pid, int cpu, sector_t sector, int bytes,
158 				 u64 what, int error, u64 cgid,
159 				 ssize_t cgid_len, void *pdu_data, int pdu_len)
160 {
161 	if (bt->version == 2)
162 		return relay_blktrace_event2(bt, sequence, pid, cpu, sector,
163 					     bytes, what, error, cgid, cgid_len,
164 					     pdu_data, pdu_len);
165 	return relay_blktrace_event1(bt, sequence, pid, cpu, sector, bytes,
166 				     what, error, cgid, cgid_len, pdu_data,
167 				     pdu_len);
168 }
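
/*
 * Illustrative record layout (a sketch, not compiled): every record the
 * helpers above emit is a fixed header immediately followed by the
 * optional cgid and then the PDU payload:
 *
 *	+---------------------------+  offset 0
 *	| blk_io_trace{,2} header   |  sizeof(*t) bytes
 *	+---------------------------+
 *	| cgid (u64, optional)      |  present iff cgid_len != 0
 *	+---------------------------+
 *	| pdu_data (pdu_len bytes)  |
 *	+---------------------------+
 *
 * t->pdu_len stores pdu_len + cgid_len, so consumers skip both at once.
 */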
169 
170 /*
171  * Send out a notify message.
172  */
173 static void trace_note(struct blk_trace *bt, pid_t pid, u64 action,
174 		       const void *data, size_t len, u64 cgid)
175 {
176 	struct ring_buffer_event *event = NULL;
177 	struct trace_buffer *buffer = NULL;
178 	unsigned int trace_ctx = 0;
179 	int cpu = smp_processor_id();
180 	bool blk_tracer = blk_tracer_enabled;
181 	ssize_t cgid_len = cgid ? sizeof(cgid) : 0;
182 
183 	action = lower_32_bits(action | (cgid ? __BLK_TN_CGROUP : 0));
184 	if (blk_tracer) {
185 		struct blk_io_trace2 *t;
186 		size_t trace_len = sizeof(*t) + cgid_len + len;
187 
188 		buffer = blk_tr->array_buffer.buffer;
189 		trace_ctx = tracing_gen_ctx_flags(0);
190 		event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
191 						  trace_len, trace_ctx);
192 		if (!event)
193 			return;
194 		t = ring_buffer_event_data(event);
195 		record_blktrace_event2(t, pid, cpu, 0, 0,
196 				       action, bt->dev, 0, cgid, cgid_len,
197 				       (void *)data, len);
198 		trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx);
199 		return;
200 	}
201 
202 	if (!bt->rchan)
203 		return;
204 
205 	relay_blktrace_event(bt, 0, pid, cpu, 0, 0, action, 0, cgid,
206 			     cgid_len, (void *)data, len);
207 }
208 
209 /*
210  * Send out a notify for this process, if we haven't done so since a trace
211  * started
212  */
213 static void trace_note_tsk(struct task_struct *tsk)
214 {
215 	unsigned long flags;
216 	struct blk_trace *bt;
217 
218 	tsk->btrace_seq = blktrace_seq;
219 	raw_spin_lock_irqsave(&running_trace_lock, flags);
220 	list_for_each_entry(bt, &running_trace_list, running_list) {
221 		trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm,
222 			   sizeof(tsk->comm), 0);
223 	}
224 	raw_spin_unlock_irqrestore(&running_trace_lock, flags);
225 }
226 
227 static void trace_note_time(struct blk_trace *bt)
228 {
229 	struct timespec64 now;
230 	unsigned long flags;
231 	u32 words[2];
232 
233 	/* need to check user space to see if this breaks in y2038 or y2106 */
234 	ktime_get_real_ts64(&now);
235 	words[0] = (u32)now.tv_sec;
236 	words[1] = now.tv_nsec;
237 
238 	local_irq_save(flags);
239 	trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words), 0);
240 	local_irq_restore(flags);
241 }
242 
243 void __blk_trace_note_message(struct blk_trace *bt,
244 		struct cgroup_subsys_state *css, const char *fmt, ...)
245 {
246 	int n;
247 	va_list args;
248 	unsigned long flags;
249 	char *buf;
250 	u64 cgid = 0;
251 
252 	if (unlikely(bt->trace_state != Blktrace_running &&
253 		     !blk_tracer_enabled))
254 		return;
255 
256 	/*
257 	 * If the BLK_TC_NOTIFY action mask isn't set, don't send any note
258 	 * message to the trace.
259 	 */
260 	if (!(bt->act_mask & BLK_TC_NOTIFY))
261 		return;
262 
263 	local_irq_save(flags);
264 	buf = this_cpu_ptr(bt->msg_data);
265 	va_start(args, fmt);
266 	n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args);
267 	va_end(args);
268 
269 #ifdef CONFIG_BLK_CGROUP
270 	if (css && (blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP))
271 		cgid = cgroup_id(css->cgroup);
272 	else
273 		cgid = 1;
274 #endif
275 	trace_note(bt, current->pid, BLK_TN_MESSAGE, buf, n, cgid);
276 	local_irq_restore(flags);
277 }
278 EXPORT_SYMBOL_GPL(__blk_trace_note_message);
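
/*
 * Example use (a sketch): callers normally go through the
 * blk_add_trace_msg() convenience macro from blktrace_api.h instead of
 * calling this directly, e.g.
 *
 *	blk_add_trace_msg(q, "queue depth now %d", depth);
 *
 * which resolves the queue's blk_trace under RCU and forwards here with
 * a NULL css.
 */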
279 
280 static int act_log_check(struct blk_trace *bt, u64 what, sector_t sector,
281 			 pid_t pid)
282 {
283 	if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0)
284 		return 1;
285 	if (sector && (sector < bt->start_lba || sector > bt->end_lba))
286 		return 1;
287 	if (bt->pid && pid != bt->pid)
288 		return 1;
289 
290 	return 0;
291 }
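
/*
 * Filtering example (values chosen for illustration): with
 * act_mask = BLK_TC_READ, start_lba = 0, end_lba = 2048 and pid = 0, a
 * write at sector 100 is suppressed by the mask check, a read at sector
 * 4096 by the LBA window, and a read at sector 100 from any task is
 * logged (pid == 0 disables the pid filter). Note that returning 1
 * means "suppress this event".
 */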
292 
293 /*
294  * Data direction bit lookup
295  */
296 static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),
297 				 BLK_TC_ACT(BLK_TC_WRITE) };
298 
299 #define BLK_TC_RAHEAD		BLK_TC_AHEAD
300 #define BLK_TC_PREFLUSH		BLK_TC_FLUSH
301 
302 /* The ilog2() calls are folded away at compile time because they're constant */
303 #define MASK_TC_BIT(rw, __name) ((__force u32)(rw & REQ_ ## __name) <<	\
304 	  (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - __REQ_ ## __name))
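
/*
 * Worked example of the shift arithmetic above (a sketch): for SYNC the
 * macro takes the REQ_SYNC bit, sitting at position __REQ_SYNC in the
 * request flags, and moves it up to position
 * ilog2(BLK_TC_SYNC) + BLK_TC_SHIFT in the action word:
 *
 *	(opf & REQ_SYNC) << (ilog2(BLK_TC_SYNC) + BLK_TC_SHIFT - __REQ_SYNC)
 *
 * so a set REQ_SYNC flag lands exactly on BLK_TC_ACT(BLK_TC_SYNC).
 */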
305 
306 /*
307  * The worker for the various blk_add_trace*() types. Fills out a
308  * blk_io_trace{,2} structure and places it in a ring buffer or relay subbuffer.
309  */
310 static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
311 			    const blk_opf_t opf, u64 what, int error,
312 			    int pdu_len, void *pdu_data, u64 cgid)
313 {
314 	struct task_struct *tsk = current;
315 	struct ring_buffer_event *event = NULL;
316 	struct trace_buffer *buffer = NULL;
317 	unsigned long flags = 0;
318 	unsigned long *sequence;
319 	unsigned int trace_ctx = 0;
320 	pid_t pid;
321 	int cpu;
322 	bool blk_tracer = blk_tracer_enabled;
323 	ssize_t cgid_len = cgid ? sizeof(cgid) : 0;
324 	const enum req_op op = opf & REQ_OP_MASK;
325 	size_t trace_len;
326 
327 	if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer))
328 		return;
329 
330 	what |= ddir_act[op_is_write(op) ? WRITE : READ];
331 	what |= MASK_TC_BIT(opf, SYNC);
332 	what |= MASK_TC_BIT(opf, RAHEAD);
333 	what |= MASK_TC_BIT(opf, META);
334 	what |= MASK_TC_BIT(opf, PREFLUSH);
335 	what |= MASK_TC_BIT(opf, FUA);
336 
337 	switch (op) {
338 	case REQ_OP_DISCARD:
339 	case REQ_OP_SECURE_ERASE:
340 		what |= BLK_TC_ACT(BLK_TC_DISCARD);
341 		break;
342 	case REQ_OP_FLUSH:
343 		what |= BLK_TC_ACT(BLK_TC_FLUSH);
344 		break;
345 	case REQ_OP_ZONE_APPEND:
346 		what |= BLK_TC_ACT(BLK_TC_ZONE_APPEND);
347 		break;
348 	case REQ_OP_ZONE_RESET:
349 		what |= BLK_TC_ACT(BLK_TC_ZONE_RESET);
350 		break;
351 	case REQ_OP_ZONE_RESET_ALL:
352 		what |= BLK_TC_ACT(BLK_TC_ZONE_RESET_ALL);
353 		break;
354 	case REQ_OP_ZONE_FINISH:
355 		what |= BLK_TC_ACT(BLK_TC_ZONE_FINISH);
356 		break;
357 	case REQ_OP_ZONE_OPEN:
358 		what |= BLK_TC_ACT(BLK_TC_ZONE_OPEN);
359 		break;
360 	case REQ_OP_ZONE_CLOSE:
361 		what |= BLK_TC_ACT(BLK_TC_ZONE_CLOSE);
362 		break;
363 	case REQ_OP_WRITE_ZEROES:
364 		what |= BLK_TC_ACT(BLK_TC_WRITE_ZEROES);
365 		break;
366 	default:
367 		break;
368 	}
369 
370 	/* Drop trace events for zone operations with blktrace v1 */
371 	if (bt->version == 1 && (what >> BLK_TC_SHIFT) > BLK_TC_END_V1) {
372 		pr_debug_ratelimited("blktrace v1 cannot trace zone operation 0x%llx\n",
373 				(unsigned long long)what);
374 		return;
375 	}
376 
377 	if (cgid)
378 		what |= __BLK_TA_CGROUP;
379 
380 	pid = tsk->pid;
381 	if (act_log_check(bt, what, sector, pid))
382 		return;
383 	cpu = raw_smp_processor_id();
384 
385 	if (blk_tracer) {
386 		tracing_record_cmdline(current);
387 
388 		buffer = blk_tr->array_buffer.buffer;
389 		trace_ctx = tracing_gen_ctx_flags(0);
390 		switch (bt->version) {
391 		case 1:
392 			trace_len = sizeof(struct blk_io_trace);
393 			break;
394 		case 2:
395 		default:
396 			/*
397 			 * ftrace always uses the v2 (blk_io_trace2) format.
398 			 *
399 			 * For the sysfs tracing path (enabled via
400 			 * /sys/block/DEV/trace/enable), blk_trace_setup_queue()
401 			 * never initializes bt->version, leaving it 0 from
402 			 * kzalloc(). version == 0 must be handled safely here.
403 			 *
404 			 * Sharing the default case ensures we never repeat the
405 			 * old bug where an unhandled version left trace_len at
406 			 * 0, causing a buffer underflow and memory corruption.
407 			 *
408 			 * Always use the v2 format for ftrace and normalize
409 			 * bt->version to 2 when it is uninitialized.
410 			 */
411 			trace_len = sizeof(struct blk_io_trace2);
412 			if (bt->version == 0)
413 				bt->version = 2;
414 			break;
415 		}
416 		trace_len += pdu_len + cgid_len;
417 		event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
418 						  trace_len, trace_ctx);
419 		if (!event)
420 			return;
421 
422 		switch (bt->version) {
423 		case 1:
424 			record_blktrace_event(ring_buffer_event_data(event),
425 					      pid, cpu, sector, bytes,
426 					      what, bt->dev, error, cgid, cgid_len,
427 					      pdu_data, pdu_len);
428 			break;
429 		case 2:
430 		default:
431 			/*
432 			 * Use the v2 recording function (record_blktrace_event2),
433 			 * which writes a blk_io_trace2 structure with the correct
434 			 * field layout:
435 			 *   - 32-bit pid at offset 28
436 			 *   - 64-bit action at offset 32
437 			 *
438 			 * Sharing the default case also handles version == 0
439 			 * (the sysfs path), ensuring that the correct v2
440 			 * recording function is always used to match the
441 			 * v2-sized buffer reserved above.
442 			 */
443 			record_blktrace_event2(ring_buffer_event_data(event),
444 					       pid, cpu, sector, bytes,
445 					       what, bt->dev, error, cgid, cgid_len,
446 					       pdu_data, pdu_len);
447 			break;
448 		}
449 
450 		trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx);
451 		return;
452 	}
453 
454 	if (unlikely(tsk->btrace_seq != blktrace_seq))
455 		trace_note_tsk(tsk);
456 
457 	/*
458 	 * A word about the locking here - we disable interrupts to reserve
459 	 * some space in the relay per-cpu buffer, to prevent an irq
460 	 * from coming in and stepping on our toes.
461 	 */
462 	local_irq_save(flags);
463 	sequence = per_cpu_ptr(bt->sequence, cpu);
464 	(*sequence)++;
465 	relay_blktrace_event(bt, *sequence, pid, cpu, sector, bytes,
466 			     what, error, cgid, cgid_len, pdu_data, pdu_len);
467 	local_irq_restore(flags);
468 }
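
/*
 * Data-flow summary (informational): with the "blk" ftrace tracer
 * enabled, the event above lands in the ftrace ring buffer as a
 * TRACE_BLK entry; otherwise it is written to the relay channel that
 * typically backs /sys/kernel/debug/block/<dev>/trace<cpu>, with a
 * per-cpu sequence number and interrupts disabled around the
 * reservation.
 */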
469 
470 static void blk_trace_free(struct request_queue *q, struct blk_trace *bt)
471 {
472 	relay_close(bt->rchan);
473 
474 	/*
475 	 * If 'bt->dir' is not set, then both 'dropped' and 'msg' are created
476 	 * under 'q->debugfs_dir', so look them up there and remove them.
477 	 */
478 	if (!bt->dir) {
479 		debugfs_lookup_and_remove("dropped", q->debugfs_dir);
480 		debugfs_lookup_and_remove("msg", q->debugfs_dir);
481 	} else {
482 		debugfs_remove(bt->dir);
483 	}
484 	free_percpu(bt->sequence);
485 	free_percpu(bt->msg_data);
486 	kfree(bt);
487 }
488 
489 static void get_probe_ref(void)
490 {
491 	mutex_lock(&blk_probe_mutex);
492 	if (++blk_probes_ref == 1)
493 		blk_register_tracepoints();
494 	mutex_unlock(&blk_probe_mutex);
495 }
496 
497 static void put_probe_ref(void)
498 {
499 	mutex_lock(&blk_probe_mutex);
500 	if (!--blk_probes_ref)
501 		blk_unregister_tracepoints();
502 	mutex_unlock(&blk_probe_mutex);
503 }
504 
505 static int blk_trace_start(struct blk_trace *bt)
506 {
507 	if (bt->trace_state != Blktrace_setup &&
508 	    bt->trace_state != Blktrace_stopped)
509 		return -EINVAL;
510 
511 	blktrace_seq++;
512 	smp_mb();
513 	bt->trace_state = Blktrace_running;
514 	raw_spin_lock_irq(&running_trace_lock);
515 	list_add(&bt->running_list, &running_trace_list);
516 	raw_spin_unlock_irq(&running_trace_lock);
517 	trace_note_time(bt);
518 
519 	return 0;
520 }
521 
522 static int blk_trace_stop(struct blk_trace *bt)
523 {
524 	if (bt->trace_state != Blktrace_running)
525 		return -EINVAL;
526 
527 	bt->trace_state = Blktrace_stopped;
528 	raw_spin_lock_irq(&running_trace_lock);
529 	list_del_init(&bt->running_list);
530 	raw_spin_unlock_irq(&running_trace_lock);
531 	relay_flush(bt->rchan);
532 
533 	return 0;
534 }
535 
536 static void blk_trace_cleanup(struct request_queue *q, struct blk_trace *bt)
537 {
538 	blk_trace_stop(bt);
539 	synchronize_rcu();
540 	blk_trace_free(q, bt);
541 	put_probe_ref();
542 }
543 
544 static int __blk_trace_remove(struct request_queue *q)
545 {
546 	struct blk_trace *bt;
547 
548 	bt = rcu_replace_pointer(q->blk_trace, NULL,
549 				 lockdep_is_held(&q->debugfs_mutex));
550 	if (!bt)
551 		return -EINVAL;
552 
553 	blk_trace_cleanup(q, bt);
554 
555 	return 0;
556 }
557 
558 int blk_trace_remove(struct request_queue *q)
559 {
560 	int ret;
561 
562 	blk_debugfs_lock_nomemsave(q);
563 	ret = __blk_trace_remove(q);
564 	blk_debugfs_unlock_nomemrestore(q);
565 
566 	return ret;
567 }
568 EXPORT_SYMBOL_GPL(blk_trace_remove);
569 
570 static ssize_t blk_dropped_read(struct file *filp, char __user *buffer,
571 				size_t count, loff_t *ppos)
572 {
573 	struct blk_trace *bt = filp->private_data;
574 	size_t dropped = relay_stats(bt->rchan, RELAY_STATS_BUF_FULL);
575 	char buf[16];
576 
577 	snprintf(buf, sizeof(buf), "%zu\n", dropped);
578 
579 	return simple_read_from_buffer(buffer, count, ppos, buf, strlen(buf));
580 }
581 
582 static const struct file_operations blk_dropped_fops = {
583 	.owner =	THIS_MODULE,
584 	.open =		simple_open,
585 	.read =		blk_dropped_read,
586 	.llseek =	default_llseek,
587 };
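
/*
 * Example (assuming debugfs is mounted at /sys/kernel/debug): the count
 * of events lost to full relay sub-buffers can be read with
 *
 *	$ cat /sys/kernel/debug/block/<dev>/dropped
 *	0
 */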
588 
589 static ssize_t blk_msg_write(struct file *filp, const char __user *buffer,
590 				size_t count, loff_t *ppos)
591 {
592 	char *msg;
593 	struct blk_trace *bt;
594 
595 	if (count >= BLK_TN_MAX_MSG)
596 		return -EINVAL;
597 
598 	msg = memdup_user_nul(buffer, count);
599 	if (IS_ERR(msg))
600 		return PTR_ERR(msg);
601 
602 	bt = filp->private_data;
603 	__blk_trace_note_message(bt, NULL, "%s", msg);
604 	kfree(msg);
605 
606 	return count;
607 }
608 
609 static const struct file_operations blk_msg_fops = {
610 	.owner =	THIS_MODULE,
611 	.open =		simple_open,
612 	.write =	blk_msg_write,
613 	.llseek =	noop_llseek,
614 };
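
/*
 * Example (illustrative): userspace can inject a marker into a running
 * trace, e.g.
 *
 *	$ echo "fio job 1 starts" > /sys/kernel/debug/block/<dev>/msg
 *
 * which shows up in the trace stream as a BLK_TN_MESSAGE note.
 */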
615 
616 static int blk_remove_buf_file_callback(struct dentry *dentry)
617 {
618 	debugfs_remove(dentry);
619 
620 	return 0;
621 }
622 
623 static struct dentry *blk_create_buf_file_callback(const char *filename,
624 						   struct dentry *parent,
625 						   umode_t mode,
626 						   struct rchan_buf *buf,
627 						   int *is_global)
628 {
629 	return debugfs_create_file(filename, mode, parent, buf,
630 					&relay_file_operations);
631 }
632 
633 static const struct rchan_callbacks blk_relay_callbacks = {
634 	.create_buf_file	= blk_create_buf_file_callback,
635 	.remove_buf_file	= blk_remove_buf_file_callback,
636 };
637 
638 static void blk_trace_setup_lba(struct blk_trace *bt,
639 				struct block_device *bdev)
640 {
641 	if (bdev) {
642 		bt->start_lba = bdev->bd_start_sect;
643 		bt->end_lba = bdev->bd_start_sect + bdev_nr_sectors(bdev);
644 	} else {
645 		bt->start_lba = 0;
646 		bt->end_lba = -1ULL;
647 	}
648 }
649 
650 /*
651  * Set up everything required to start tracing
652  */
653 static struct blk_trace *blk_trace_setup_prepare(struct request_queue *q,
654 						 char *name, dev_t dev,
655 						 u32 buf_size, u32 buf_nr,
656 						 struct block_device *bdev)
657 {
658 	struct blk_trace *bt = NULL;
659 	struct dentry *dir = NULL;
660 	int ret;
661 
662 	lockdep_assert_held(&q->debugfs_mutex);
663 
664 	/*
665 	 * bdev can be NULL, as with scsi-generic; here we are being as
666 	 * helpful as we can be.
667 	 */
668 	if (rcu_dereference_protected(q->blk_trace,
669 				      lockdep_is_held(&q->debugfs_mutex))) {
670 		pr_warn("Concurrent blktraces are not allowed on %s\n", name);
671 		return ERR_PTR(-EBUSY);
672 	}
673 
674 	bt = kzalloc(sizeof(*bt), GFP_KERNEL);
675 	if (!bt)
676 		return ERR_PTR(-ENOMEM);
677 
678 	ret = -ENOMEM;
679 	bt->sequence = alloc_percpu(unsigned long);
680 	if (!bt->sequence)
681 		goto err;
682 
683 	bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG, __alignof__(char));
684 	if (!bt->msg_data)
685 		goto err;
686 
687 	/*
688 	 * When tracing the whole disk, reuse the existing debugfs directory
689 	 * created by the block layer on init. For partitions and for
690 	 * scsi-generic block devices we create a temporary new debugfs
691 	 * directory that will be removed once the trace ends.
692 	 */
693 	if (bdev && !bdev_is_partition(bdev))
694 		dir = q->debugfs_dir;
695 	else
696 		bt->dir = dir = debugfs_create_dir(name, blk_debugfs_root);
697 
698 	/*
699 	 * As blktrace relies on debugfs for its interface, the debugfs directory
700 	 * is required, contrary to the usual mantra of not checking for debugfs
701 	 * files or directories.
702 	 */
703 	if (IS_ERR_OR_NULL(dir)) {
704 		pr_warn("debugfs_dir not present for %s so skipping\n", name);
705 		ret = -ENOENT;
706 		goto err;
707 	}
708 
709 	bt->dev = dev;
710 	INIT_LIST_HEAD(&bt->running_list);
711 
712 	ret = -EIO;
713 	debugfs_create_file("dropped", 0444, dir, bt, &blk_dropped_fops);
714 	debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops);
715 
716 	bt->rchan = relay_open("trace", dir, buf_size, buf_nr,
717 			       &blk_relay_callbacks, bt);
718 	if (!bt->rchan)
719 		goto err;
720 
721 	blk_trace_setup_lba(bt, bdev);
722 
723 	return bt;
724 
725 err:
726 	blk_trace_free(q, bt);
727 
728 	return ERR_PTR(ret);
729 }
730 
731 static void blk_trace_setup_finalize(struct request_queue *q,
732 				     char *name, int version,
733 				     struct blk_trace *bt,
734 				     struct blk_user_trace_setup2 *buts)
736 {
737 	strscpy_pad(buts->name, name, BLKTRACE_BDEV_SIZE2);
738 
739 	/*
740 	 * Some device names contain slashes in their paths - convert the
741 	 * slashes to underscores so this works as expected.
742 	 */
743 	strreplace(buts->name, '/', '_');
744 
745 	bt->version = version;
746 	bt->act_mask = buts->act_mask;
747 	if (!bt->act_mask)
748 		bt->act_mask = (u16) -1;
749 
750 	/* overwrite with user settings */
751 	if (buts->start_lba)
752 		bt->start_lba = buts->start_lba;
753 	if (buts->end_lba)
754 		bt->end_lba = buts->end_lba;
755 
756 	bt->pid = buts->pid;
757 	bt->trace_state = Blktrace_setup;
758 
759 	rcu_assign_pointer(q->blk_trace, bt);
760 	get_probe_ref();
761 }
762 
763 int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
764 		    struct block_device *bdev,
765 		    char __user *arg)
766 {
767 	struct blk_user_trace_setup2 buts2;
768 	struct blk_user_trace_setup buts;
769 	struct blk_trace *bt;
770 	unsigned int memflags;
771 	int ret;
772 
773 	ret = copy_from_user(&buts, arg, sizeof(buts));
774 	if (ret)
775 		return -EFAULT;
776 
777 	if (!buts.buf_size || !buts.buf_nr)
778 		return -EINVAL;
779 
780 	buts2 = (struct blk_user_trace_setup2) {
781 		.act_mask = buts.act_mask,
782 		.buf_size = buts.buf_size,
783 		.buf_nr = buts.buf_nr,
784 		.start_lba = buts.start_lba,
785 		.end_lba = buts.end_lba,
786 		.pid = buts.pid,
787 	};
788 
789 	memflags = blk_debugfs_lock(q);
790 	bt = blk_trace_setup_prepare(q, name, dev, buts.buf_size, buts.buf_nr,
791 				     bdev);
792 	if (IS_ERR(bt)) {
793 		blk_debugfs_unlock(q, memflags);
794 		return PTR_ERR(bt);
795 	}
796 	blk_trace_setup_finalize(q, name, 1, bt, &buts2);
797 	strscpy(buts.name, buts2.name, BLKTRACE_BDEV_SIZE);
798 	blk_debugfs_unlock(q, memflags);
799 
800 	if (copy_to_user(arg, &buts, sizeof(buts))) {
801 		blk_trace_remove(q);
802 		return -EFAULT;
803 	}
804 	return 0;
805 }
806 EXPORT_SYMBOL_GPL(blk_trace_setup);
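
/*
 * Userspace usage sketch (illustrative only, error handling omitted):
 * the classic blktrace(8) flow against the v1 ioctl looks roughly like
 *
 *	struct blk_user_trace_setup buts = {
 *		.buf_size = 512 * 1024,	// bytes per relay sub-buffer
 *		.buf_nr   = 4,		// number of sub-buffers
 *		.act_mask = 0,		// 0 is treated as "trace everything"
 *	};
 *	int fd = open("/dev/sda", O_RDONLY);
 *	ioctl(fd, BLKTRACESETUP, &buts);	// kernel fills in buts.name
 *	ioctl(fd, BLKTRACESTART);
 *	// ... consume /sys/kernel/debug/block/<buts.name>/trace<cpu> ...
 *	ioctl(fd, BLKTRACESTOP);
 *	ioctl(fd, BLKTRACETEARDOWN);
 */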
807 
808 static int blk_trace_setup2(struct request_queue *q, char *name, dev_t dev,
809 			    struct block_device *bdev, char __user *arg)
810 {
811 	struct blk_user_trace_setup2 buts2;
812 	struct blk_trace *bt;
813 	unsigned int memflags;
814 
815 	if (copy_from_user(&buts2, arg, sizeof(buts2)))
816 		return -EFAULT;
817 
818 	if (!buts2.buf_size || !buts2.buf_nr)
819 		return -EINVAL;
820 
821 	if (buts2.flags != 0)
822 		return -EINVAL;
823 
824 	memflags = blk_debugfs_lock(q);
825 	bt = blk_trace_setup_prepare(q, name, dev, buts2.buf_size, buts2.buf_nr,
826 				     bdev);
827 	if (IS_ERR(bt)) {
828 		blk_debugfs_unlock(q, memflags);
829 		return PTR_ERR(bt);
830 	}
831 	blk_trace_setup_finalize(q, name, 2, bt, &buts2);
832 	blk_debugfs_unlock(q, memflags);
833 
834 	if (copy_to_user(arg, &buts2, sizeof(buts2))) {
835 		blk_trace_remove(q);
836 		return -EFAULT;
837 	}
838 	return 0;
839 }
840 
841 #if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64)
842 static int compat_blk_trace_setup(struct request_queue *q, char *name,
843 				  dev_t dev, struct block_device *bdev,
844 				  char __user *arg)
845 {
846 	struct blk_user_trace_setup2 buts2;
847 	struct compat_blk_user_trace_setup cbuts;
848 	struct blk_trace *bt;
849 	unsigned int memflags;
850 
851 	if (copy_from_user(&cbuts, arg, sizeof(cbuts)))
852 		return -EFAULT;
853 
854 	if (!cbuts.buf_size || !cbuts.buf_nr)
855 		return -EINVAL;
856 
857 	buts2 = (struct blk_user_trace_setup2) {
858 		.act_mask = cbuts.act_mask,
859 		.buf_size = cbuts.buf_size,
860 		.buf_nr = cbuts.buf_nr,
861 		.start_lba = cbuts.start_lba,
862 		.end_lba = cbuts.end_lba,
863 		.pid = cbuts.pid,
864 	};
865 
866 	memflags = blk_debugfs_lock(q);
867 	bt = blk_trace_setup_prepare(q, name, dev, buts2.buf_size, buts2.buf_nr,
868 				     bdev);
869 	if (IS_ERR(bt)) {
870 		blk_debugfs_unlock(q, memflags);
871 		return PTR_ERR(bt);
872 	}
873 	blk_trace_setup_finalize(q, name, 1, bt, &buts2);
874 	blk_debugfs_unlock(q, memflags);
875 
876 	if (copy_to_user(arg, &buts2.name, ARRAY_SIZE(buts2.name))) {
877 		blk_trace_remove(q);
878 		return -EFAULT;
879 	}
880 
881 	return 0;
882 }
883 #endif
884 
885 static int __blk_trace_startstop(struct request_queue *q, int start)
886 {
887 	struct blk_trace *bt;
888 
889 	bt = rcu_dereference_protected(q->blk_trace,
890 				       lockdep_is_held(&q->debugfs_mutex));
891 	if (bt == NULL)
892 		return -EINVAL;
893 
894 	if (start)
895 		return blk_trace_start(bt);
896 	else
897 		return blk_trace_stop(bt);
898 }
899 
900 int blk_trace_startstop(struct request_queue *q, int start)
901 {
902 	int ret;
903 
904 	blk_debugfs_lock_nomemsave(q);
905 	ret = __blk_trace_startstop(q, start);
906 	blk_debugfs_unlock_nomemrestore(q);
907 
908 	return ret;
909 }
910 EXPORT_SYMBOL_GPL(blk_trace_startstop);
911 
912 /*
913  * When reading or writing the blktrace sysfs files, the references to the
914  * opened sysfs or device files should prevent the underlying block device
915  * from being removed. So no further delete protection is really needed.
916  */
917 
918 /**
919  * blk_trace_ioctl - handle the ioctls associated with tracing
920  * @bdev:	the block device
921  * @cmd:	the ioctl cmd
922  * @arg:	the argument data, if any
923  *
924  **/
925 int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
926 {
927 	struct request_queue *q = bdev_get_queue(bdev);
928 	int ret, start = 0;
929 	char b[BDEVNAME_SIZE];
930 
931 	switch (cmd) {
932 	case BLKTRACESETUP2:
933 		snprintf(b, sizeof(b), "%pg", bdev);
934 		ret = blk_trace_setup2(q, b, bdev->bd_dev, bdev, arg);
935 		break;
936 	case BLKTRACESETUP:
937 		snprintf(b, sizeof(b), "%pg", bdev);
938 		ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
939 		break;
940 #if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64)
941 	case BLKTRACESETUP32:
942 		snprintf(b, sizeof(b), "%pg", bdev);
943 		ret = compat_blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
944 		break;
945 #endif
946 	case BLKTRACESTART:
947 		start = 1;
948 		fallthrough;
949 	case BLKTRACESTOP:
950 		ret = blk_trace_startstop(q, start);
951 		break;
952 	case BLKTRACETEARDOWN:
953 		ret = blk_trace_remove(q);
954 		break;
955 	default:
956 		ret = -ENOTTY;
957 		break;
958 	}
959 	return ret;
960 }
961 
962 /**
963  * blk_trace_shutdown - stop and cleanup trace structures
964  * @q:    the request queue associated with the device
965  *
966  **/
967 void blk_trace_shutdown(struct request_queue *q)
968 {
969 	if (rcu_dereference_protected(q->blk_trace,
970 				      lockdep_is_held(&q->debugfs_mutex)))
971 		__blk_trace_remove(q);
972 }
973 
974 #ifdef CONFIG_BLK_CGROUP
975 static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio)
976 {
977 	struct cgroup_subsys_state *blkcg_css;
978 	struct blk_trace *bt;
979 
980 	/* We don't use the 'bt' value here except as an optimization... */
981 	bt = rcu_dereference_protected(q->blk_trace, 1);
982 	if (!bt || !(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP))
983 		return 0;
984 
985 	blkcg_css = bio_blkcg_css(bio);
986 	if (!blkcg_css)
987 		return 0;
988 	return cgroup_id(blkcg_css->cgroup);
989 }
990 #else
991 static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio)
992 {
993 	return 0;
994 }
995 #endif
996 
997 static u64
998 blk_trace_request_get_cgid(struct request *rq)
999 {
1000 	if (!rq->bio)
1001 		return 0;
1002 	/* Use the first bio */
1003 	return blk_trace_bio_get_cgid(rq->q, rq->bio);
1004 }
1005 
1006 /*
1007  * blktrace probes
1008  */
1009 
1010 /**
1011  * blk_add_trace_rq - Add a trace for a request oriented action
1012  * @rq:		the source request
1013  * @error:	return status to log
1014  * @nr_bytes:	number of completed bytes
1015  * @what:	the action
1016  * @cgid:	the cgroup info
1017  *
1018  * Description:
1019  *     Records an action against a request. Will log the bio offset + size.
1020  *
1021  **/
1022 static void blk_add_trace_rq(struct request *rq, blk_status_t error,
1023 			     unsigned int nr_bytes, u64 what, u64 cgid)
1024 {
1025 	struct blk_trace *bt;
1026 
1027 	rcu_read_lock();
1028 	bt = rcu_dereference(rq->q->blk_trace);
1029 	if (likely(!bt)) {
1030 		rcu_read_unlock();
1031 		return;
1032 	}
1033 
1034 	if (blk_rq_is_passthrough(rq))
1035 		what |= BLK_TC_ACT(BLK_TC_PC);
1036 	else
1037 		what |= BLK_TC_ACT(BLK_TC_FS);
1038 
1039 	__blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, rq->cmd_flags,
1040 			what, blk_status_to_errno(error), 0, NULL, cgid);
1041 	rcu_read_unlock();
1042 }
1043 
1044 static void blk_add_trace_rq_insert(void *ignore, struct request *rq)
1045 {
1046 	blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_INSERT,
1047 			 blk_trace_request_get_cgid(rq));
1048 }
1049 
1050 static void blk_add_trace_rq_issue(void *ignore, struct request *rq)
1051 {
1052 	blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_ISSUE,
1053 			 blk_trace_request_get_cgid(rq));
1054 }
1055 
1056 static void blk_add_trace_rq_merge(void *ignore, struct request *rq)
1057 {
1058 	blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_BACKMERGE,
1059 			 blk_trace_request_get_cgid(rq));
1060 }
1061 
1062 static void blk_add_trace_rq_requeue(void *ignore, struct request *rq)
1063 {
1064 	blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_REQUEUE,
1065 			 blk_trace_request_get_cgid(rq));
1066 }
1067 
1068 static void blk_add_trace_rq_complete(void *ignore, struct request *rq,
1069 			blk_status_t error, unsigned int nr_bytes)
1070 {
1071 	blk_add_trace_rq(rq, error, nr_bytes, BLK_TA_COMPLETE,
1072 			 blk_trace_request_get_cgid(rq));
1073 }
1074 
1075 static void blk_add_trace_zone_update_request(void *ignore, struct request *rq)
1076 {
1077 	struct blk_trace *bt;
1078 
1079 	rcu_read_lock();
1080 	bt = rcu_dereference(rq->q->blk_trace);
1081 	if (likely(!bt) || bt->version < 2) {
1082 		rcu_read_unlock();
1083 		return;
1084 	}
1085 	rcu_read_unlock();
1086 
1087 	blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_ZONE_APPEND,
1088 			 blk_trace_request_get_cgid(rq));
1089 }
1090 
1091 /**
1092  * blk_add_trace_bio - Add a trace for a bio oriented action
1093  * @q:		queue the io is for
1094  * @bio:	the source bio
1095  * @what:	the action
1096  * @error:	error, if any
1097  *
1098  * Description:
1099  *     Records an action against a bio. Will log the bio offset + size.
1100  *
1101  **/
1102 static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
1103 			      u64 what, int error)
1104 {
1105 	struct blk_trace *bt;
1106 
1107 	rcu_read_lock();
1108 	bt = rcu_dereference(q->blk_trace);
1109 	if (likely(!bt)) {
1110 		rcu_read_unlock();
1111 		return;
1112 	}
1113 
1114 	__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
1115 			bio->bi_opf, what, error, 0, NULL,
1116 			blk_trace_bio_get_cgid(q, bio));
1117 	rcu_read_unlock();
1118 }
1119 
1120 static void blk_add_trace_bio_complete(void *ignore,
1121 				       struct request_queue *q, struct bio *bio)
1122 {
1123 	blk_add_trace_bio(q, bio, BLK_TA_COMPLETE,
1124 			  blk_status_to_errno(bio->bi_status));
1125 }
1126 
1127 static void blk_add_trace_bio_backmerge(void *ignore, struct bio *bio)
1128 {
1129 	blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_BACKMERGE,
1130 			0);
1131 }
1132 
1133 static void blk_add_trace_bio_frontmerge(void *ignore, struct bio *bio)
1134 {
1135 	blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_FRONTMERGE,
1136 			0);
1137 }
1138 
1139 static void blk_add_trace_bio_queue(void *ignore, struct bio *bio)
1140 {
1141 	blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_QUEUE, 0);
1142 }
1143 
1144 static void blk_add_trace_getrq(void *ignore, struct bio *bio)
1145 {
1146 	blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_GETRQ, 0);
1147 }
1148 
1149 static void blk_add_trace_plug(void *ignore, struct request_queue *q)
1150 {
1151 	struct blk_trace *bt;
1152 
1153 	rcu_read_lock();
1154 	bt = rcu_dereference(q->blk_trace);
1155 	if (bt)
1156 		__blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL, 0);
1157 	rcu_read_unlock();
1158 }
1159 
1160 static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
1161 				    unsigned int depth, bool explicit)
1162 {
1163 	struct blk_trace *bt;
1164 
1165 	rcu_read_lock();
1166 	bt = rcu_dereference(q->blk_trace);
1167 	if (bt) {
1168 		__be64 rpdu = cpu_to_be64(depth);
1169 		u64 what;
1170 
1171 		if (explicit)
1172 			what = BLK_TA_UNPLUG_IO;
1173 		else
1174 			what = BLK_TA_UNPLUG_TIMER;
1175 
1176 		__blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu, 0);
1177 	}
1178 	rcu_read_unlock();
1179 }
1180 
1181 static void blk_add_trace_zone_plug(void *ignore, struct request_queue *q,
1182 				    unsigned int zno, sector_t sector,
1183 				    unsigned int sectors)
1184 {
1185 	struct blk_trace *bt;
1186 
1187 	rcu_read_lock();
1188 	bt = rcu_dereference(q->blk_trace);
1189 	if (bt && bt->version >= 2)
1190 		__blk_add_trace(bt, sector, sectors << SECTOR_SHIFT, 0,
1191 				BLK_TA_ZONE_PLUG, 0, 0, NULL, 0);
1192 	rcu_read_unlock();
1195 }
1196 
1197 static void blk_add_trace_zone_unplug(void *ignore, struct request_queue *q,
1198 				      unsigned int zno, sector_t sector,
1199 				      unsigned int sectors)
1200 {
1201 	struct blk_trace *bt;
1202 
1203 	rcu_read_lock();
1204 	bt = rcu_dereference(q->blk_trace);
1205 	if (bt && bt->version >= 2)
1206 		__blk_add_trace(bt, sector, sectors << SECTOR_SHIFT, 0,
1207 				BLK_TA_ZONE_UNPLUG, 0, 0, NULL, 0);
1208 	rcu_read_unlock();
1210 }
1211 
1212 static void blk_add_trace_split(void *ignore, struct bio *bio, unsigned int pdu)
1213 {
1214 	struct request_queue *q = bio->bi_bdev->bd_disk->queue;
1215 	struct blk_trace *bt;
1216 
1217 	rcu_read_lock();
1218 	bt = rcu_dereference(q->blk_trace);
1219 	if (bt) {
1220 		__be64 rpdu = cpu_to_be64(pdu);
1221 
1222 		__blk_add_trace(bt, bio->bi_iter.bi_sector,
1223 				bio->bi_iter.bi_size, bio->bi_opf, BLK_TA_SPLIT,
1224 				blk_status_to_errno(bio->bi_status),
1225 				sizeof(rpdu), &rpdu,
1226 				blk_trace_bio_get_cgid(q, bio));
1227 	}
1228 	rcu_read_unlock();
1229 }
1230 
1231 /**
1232  * blk_add_trace_bio_remap - Add a trace for a bio-remap operation
1233  * @ignore:	trace callback data parameter (not used)
1234  * @bio:	the source bio
1235  * @dev:	source device
1236  * @from:	source sector
1237  *
1238  * Called after a bio is remapped to a different device and/or sector.
1239  **/
1240 static void blk_add_trace_bio_remap(void *ignore, struct bio *bio, dev_t dev,
1241 				    sector_t from)
1242 {
1243 	struct request_queue *q = bio->bi_bdev->bd_disk->queue;
1244 	struct blk_trace *bt;
1245 	struct blk_io_trace_remap r;
1246 
1247 	rcu_read_lock();
1248 	bt = rcu_dereference(q->blk_trace);
1249 	if (likely(!bt)) {
1250 		rcu_read_unlock();
1251 		return;
1252 	}
1253 
1254 	r.device_from = cpu_to_be32(dev);
1255 	r.device_to   = cpu_to_be32(bio_dev(bio));
1256 	r.sector_from = cpu_to_be64(from);
1257 
1258 	__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
1259 			bio->bi_opf, BLK_TA_REMAP,
1260 			blk_status_to_errno(bio->bi_status),
1261 			sizeof(r), &r, blk_trace_bio_get_cgid(q, bio));
1262 	rcu_read_unlock();
1263 }
1264 
1265 /**
1266  * blk_add_trace_rq_remap - Add a trace for a request-remap operation
1267  * @ignore:	trace callback data parameter (not used)
1268  * @rq:		the source request
1269  * @dev:	source device (recorded as device_from in the trace)
1270  * @from:	source sector
1271  *
1272  * Description:
1273  *     Device mapper remaps request to other devices.
1274  *     Add a trace for that action.
1275  *
1276  **/
1277 static void blk_add_trace_rq_remap(void *ignore, struct request *rq, dev_t dev,
1278 				   sector_t from)
1279 {
1280 	struct blk_trace *bt;
1281 	struct blk_io_trace_remap r;
1282 
1283 	rcu_read_lock();
1284 	bt = rcu_dereference(rq->q->blk_trace);
1285 	if (likely(!bt)) {
1286 		rcu_read_unlock();
1287 		return;
1288 	}
1289 
1290 	r.device_from = cpu_to_be32(dev);
1291 	r.device_to   = cpu_to_be32(disk_devt(rq->q->disk));
1292 	r.sector_from = cpu_to_be64(from);
1293 
1294 	__blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
1295 			rq->cmd_flags, BLK_TA_REMAP, 0,
1296 			sizeof(r), &r, blk_trace_request_get_cgid(rq));
1297 	rcu_read_unlock();
1298 }
1299 
1300 /**
1301  * blk_add_driver_data - Add binary message with driver-specific data
1302  * @rq:		io request
1303  * @data:	driver-specific data
1304  * @len:	length of driver-specific data
1305  *
1306  * Description:
1307  *     Some drivers might want to write driver-specific data per request.
1308  *
1309  **/
1310 void blk_add_driver_data(struct request *rq, void *data, size_t len)
1311 {
1312 	struct blk_trace *bt;
1313 
1314 	rcu_read_lock();
1315 	bt = rcu_dereference(rq->q->blk_trace);
1316 	if (likely(!bt)) {
1317 		rcu_read_unlock();
1318 		return;
1319 	}
1320 
1321 	__blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0,
1322 				BLK_TA_DRV_DATA, 0, len, data,
1323 				blk_trace_request_get_cgid(rq));
1324 	rcu_read_unlock();
1325 }
1326 EXPORT_SYMBOL_GPL(blk_add_driver_data);
1327 
1328 static void blk_register_tracepoints(void)
1329 {
1330 	int ret;
1331 
1332 	ret = register_trace_block_rq_insert(blk_add_trace_rq_insert, NULL);
1333 	WARN_ON(ret);
1334 	ret = register_trace_block_rq_issue(blk_add_trace_rq_issue, NULL);
1335 	WARN_ON(ret);
1336 	ret = register_trace_block_rq_merge(blk_add_trace_rq_merge, NULL);
1337 	WARN_ON(ret);
1338 	ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL);
1339 	WARN_ON(ret);
1340 	ret = register_trace_block_rq_complete(blk_add_trace_rq_complete, NULL);
1341 	WARN_ON(ret);
1342 	ret = register_trace_block_bio_complete(blk_add_trace_bio_complete, NULL);
1343 	WARN_ON(ret);
1344 	ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL);
1345 	WARN_ON(ret);
1346 	ret = register_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL);
1347 	WARN_ON(ret);
1348 	ret = register_trace_block_bio_queue(blk_add_trace_bio_queue, NULL);
1349 	WARN_ON(ret);
1350 	ret = register_trace_block_getrq(blk_add_trace_getrq, NULL);
1351 	WARN_ON(ret);
1352 	ret = register_trace_blk_zone_append_update_request_bio(
1353 		blk_add_trace_zone_update_request, NULL);
1354 	WARN_ON(ret);
1355 	ret = register_trace_disk_zone_wplug_add_bio(blk_add_trace_zone_plug,
1356 						     NULL);
1357 	WARN_ON(ret);
1358 	ret = register_trace_blk_zone_wplug_bio(blk_add_trace_zone_unplug,
1359 						NULL);
1360 	WARN_ON(ret);
1361 	ret = register_trace_block_plug(blk_add_trace_plug, NULL);
1362 	WARN_ON(ret);
1363 	ret = register_trace_block_unplug(blk_add_trace_unplug, NULL);
1364 	WARN_ON(ret);
1365 	ret = register_trace_block_split(blk_add_trace_split, NULL);
1366 	WARN_ON(ret);
1367 	ret = register_trace_block_bio_remap(blk_add_trace_bio_remap, NULL);
1368 	WARN_ON(ret);
1369 	ret = register_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);
1370 	WARN_ON(ret);
1371 }
1372 
1373 static void blk_unregister_tracepoints(void)
1374 {
1375 	unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);
1376 	unregister_trace_block_bio_remap(blk_add_trace_bio_remap, NULL);
1377 	unregister_trace_block_split(blk_add_trace_split, NULL);
1378 	unregister_trace_block_unplug(blk_add_trace_unplug, NULL);
1379 	unregister_trace_block_plug(blk_add_trace_plug, NULL);
1380 	unregister_trace_blk_zone_wplug_bio(blk_add_trace_zone_unplug, NULL);
1381 	unregister_trace_disk_zone_wplug_add_bio(blk_add_trace_zone_plug, NULL);
1382 	unregister_trace_blk_zone_append_update_request_bio(
1383 		blk_add_trace_zone_update_request, NULL);
1384 	unregister_trace_block_getrq(blk_add_trace_getrq, NULL);
1385 	unregister_trace_block_bio_queue(blk_add_trace_bio_queue, NULL);
1386 	unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL);
1387 	unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL);
1388 	unregister_trace_block_bio_complete(blk_add_trace_bio_complete, NULL);
1389 	unregister_trace_block_rq_complete(blk_add_trace_rq_complete, NULL);
1390 	unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL);
1391 	unregister_trace_block_rq_merge(blk_add_trace_rq_merge, NULL);
1392 	unregister_trace_block_rq_issue(blk_add_trace_rq_issue, NULL);
1393 	unregister_trace_block_rq_insert(blk_add_trace_rq_insert, NULL);
1394 
1395 	tracepoint_synchronize_unregister();
1396 }
1397 
1398 /*
1399  * struct blk_io_trace2 formatting routines
1400  */
1401 
1402 static void fill_rwbs(char *rwbs, const struct blk_io_trace2 *t)
1403 {
1404 	int i = 0;
1405 	int tc = t->action >> BLK_TC_SHIFT;
1406 
1407 	if ((t->action & ~__BLK_TN_CGROUP) == BLK_TN_MESSAGE) {
1408 		rwbs[i++] = 'N';
1409 		goto out;
1410 	}
1411 
1412 	if (tc & BLK_TC_FLUSH)
1413 		rwbs[i++] = 'F';
1414 
1415 	if (tc & BLK_TC_DISCARD)
1416 		rwbs[i++] = 'D';
1417 	else if (tc & BLK_TC_WRITE_ZEROES) {
1418 		rwbs[i++] = 'W';
1419 		rwbs[i++] = 'Z';
1420 	} else if (tc & BLK_TC_WRITE)
1421 		rwbs[i++] = 'W';
1422 	else if (t->bytes)
1423 		rwbs[i++] = 'R';
1424 	else
1425 		rwbs[i++] = 'N';
1426 
1427 	if (tc & BLK_TC_FUA)
1428 		rwbs[i++] = 'F';
1429 	if (tc & BLK_TC_AHEAD)
1430 		rwbs[i++] = 'A';
1431 	if (tc & BLK_TC_SYNC)
1432 		rwbs[i++] = 'S';
1433 	if (tc & BLK_TC_META)
1434 		rwbs[i++] = 'M';
1435 out:
1436 	rwbs[i] = '\0';
1437 }
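
/*
 * Examples of the resulting rwbs strings (derived from the checks
 * above): a plain read is "R", a sync write "WS", a write-zeroes with
 * FUA "WZF", a flush-preceded metadata write "FWM", and a notify
 * message is always "N".
 */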
1438 
1439 static inline
1440 const struct blk_io_trace2 *te_blk_io_trace(const struct trace_entry *ent)
1441 {
1442 	return (const struct blk_io_trace2 *)ent;
1443 }
1444 
1445 static inline const void *pdu_start(const struct trace_entry *ent, bool has_cg)
1446 {
1447 	return (void *)(te_blk_io_trace(ent) + 1) + (has_cg ? sizeof(u64) : 0);
1448 }
1449 
1450 static inline u64 t_cgid(const struct trace_entry *ent)
1451 {
1452 	return *(u64 *)(te_blk_io_trace(ent) + 1);
1453 }
1454 
1455 static inline int pdu_real_len(const struct trace_entry *ent, bool has_cg)
1456 {
1457 	return te_blk_io_trace(ent)->pdu_len - (has_cg ? sizeof(u64) : 0);
1458 }
1459 
1460 static inline u32 t_action(const struct trace_entry *ent)
1461 {
1462 	return te_blk_io_trace(ent)->action;
1463 }
1464 
1465 static inline u32 t_bytes(const struct trace_entry *ent)
1466 {
1467 	return te_blk_io_trace(ent)->bytes;
1468 }
1469 
1470 static inline u32 t_sec(const struct trace_entry *ent)
1471 {
1472 	return te_blk_io_trace(ent)->bytes >> 9;
1473 }
1474 
1475 static inline unsigned long long t_sector(const struct trace_entry *ent)
1476 {
1477 	return te_blk_io_trace(ent)->sector;
1478 }
1479 
1480 static inline __u16 t_error(const struct trace_entry *ent)
1481 {
1482 	return te_blk_io_trace(ent)->error;
1483 }
1484 
1485 static __u64 get_pdu_int(const struct trace_entry *ent, bool has_cg)
1486 {
1487 	const __be64 *val = pdu_start(ent, has_cg);
1488 	return be64_to_cpu(*val);
1489 }
1490 
1491 typedef void (blk_log_action_t) (struct trace_iterator *iter, const char *act,
1492 	bool has_cg);
1493 
1494 static void blk_log_action_classic(struct trace_iterator *iter, const char *act,
1495 	bool has_cg)
1496 {
1497 	char rwbs[RWBS_LEN];
1498 	unsigned long long ts  = iter->ts;
1499 	unsigned long nsec_rem = do_div(ts, NSEC_PER_SEC);
1500 	unsigned secs	       = (unsigned long)ts;
1501 	const struct blk_io_trace2 *t = te_blk_io_trace(iter->ent);
1502 
1503 	fill_rwbs(rwbs, t);
1504 
1505 	trace_seq_printf(&iter->seq,
1506 			 "%3d,%-3d %2d %5d.%09lu %5u %2s %3s ",
1507 			 MAJOR(t->device), MINOR(t->device), iter->cpu,
1508 			 secs, nsec_rem, iter->ent->pid, act, rwbs);
1509 }
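
/*
 * A classic-format line produced above looks like (sample values, for
 * illustration only):
 *
 *	  8,0    1     2.134567890  4123  Q   R 1024 + 8 [dd]
 *
 * i.e. major,minor, cpu, sec.nsec timestamp, pid, action and rwbs,
 * followed by the per-action body appended by the blk_log_* helpers.
 */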
1510 
1511 static void blk_log_action(struct trace_iterator *iter, const char *act,
1512 	bool has_cg)
1513 {
1514 	char rwbs[RWBS_LEN];
1515 	const struct blk_io_trace2 *t = te_blk_io_trace(iter->ent);
1516 
1517 	fill_rwbs(rwbs, t);
1518 	if (has_cg) {
1519 		u64 id = t_cgid(iter->ent);
1520 
1521 		if (blk_tracer_flags.val & TRACE_BLK_OPT_CGNAME) {
1522 			char blkcg_name_buf[NAME_MAX + 1] = "<...>";
1523 
1524 			cgroup_path_from_kernfs_id(id, blkcg_name_buf,
1525 				sizeof(blkcg_name_buf));
1526 			trace_seq_printf(&iter->seq, "%3d,%-3d %s %2s %3s ",
1527 				 MAJOR(t->device), MINOR(t->device),
1528 				 blkcg_name_buf, act, rwbs);
1529 		} else {
1530 			/*
1531 			 * The cgid portion used to be "INO,GEN".  Userland
1532 			 * builds a FILEID_INO32_GEN fid out of them and
1533 			 * opens the cgroup using open_by_handle_at(2).
1534 			 * While 32bit ino setups are still the same, 64bit
1535 			 * ones now use the 64bit ino as the whole ID and
1536 			 * no longer use generation.
1537 			 *
1538 			 * Regardless of the content, always output
1539 			 * "LOW32,HIGH32" so that FILEID_INO32_GEN fid can
1540 			 * be mapped back to @id on both 64 and 32bit ino
1541 			 * setups.  See __kernfs_fh_to_dentry().
1542 			 */
1543 			trace_seq_printf(&iter->seq,
1544 				 "%3d,%-3d %llx,%-llx %2s %3s ",
1545 				 MAJOR(t->device), MINOR(t->device),
1546 				 id & U32_MAX, id >> 32, act, rwbs);
1547 		}
1548 	} else
1549 		trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ",
1550 				 MAJOR(t->device), MINOR(t->device), act, rwbs);
1551 }
1552 
1553 static void blk_log_dump_pdu(struct trace_seq *s,
1554 	const struct trace_entry *ent, bool has_cg)
1555 {
1556 	const unsigned char *pdu_buf;
1557 	int pdu_len;
1558 	int i, end;
1559 
1560 	pdu_buf = pdu_start(ent, has_cg);
1561 	pdu_len = pdu_real_len(ent, has_cg);
1562 
1563 	if (!pdu_len)
1564 		return;
1565 
1566 	/* find the last zero that needs to be printed: one past the last non-zero byte */
1567 	for (end = pdu_len - 1; end >= 0; end--)
1568 		if (pdu_buf[end])
1569 			break;
1570 	end++;
1571 
1572 	trace_seq_putc(s, '(');
1573 
1574 	for (i = 0; i < pdu_len; i++) {
1575 
1576 		trace_seq_printf(s, "%s%02x",
1577 				 i == 0 ? "" : " ", pdu_buf[i]);
1578 
1579 		/*
1580 		 * stop when the rest is just zeros and indicate so
1581 		 * with a ".." appended
1582 		 */
1583 		if (i == end && end != pdu_len - 1) {
1584 			trace_seq_puts(s, " ..) ");
1585 			return;
1586 		}
1587 	}
1588 
1589 	trace_seq_puts(s, ") ");
1590 }
1591 
1592 static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent, bool has_cg)
1593 {
1594 	char cmd[TASK_COMM_LEN];
1595 
1596 	trace_find_cmdline(ent->pid, cmd);
1597 
1598 	if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) {
1599 		trace_seq_printf(s, "%u ", t_bytes(ent));
1600 		blk_log_dump_pdu(s, ent, has_cg);
1601 		trace_seq_printf(s, "[%s]\n", cmd);
1602 	} else {
1603 		if (t_sec(ent))
1604 			trace_seq_printf(s, "%llu + %u [%s]\n",
1605 						t_sector(ent), t_sec(ent), cmd);
1606 		else
1607 			trace_seq_printf(s, "[%s]\n", cmd);
1608 	}
1609 }
1610 
1611 static void blk_log_with_error(struct trace_seq *s,
1612 			      const struct trace_entry *ent, bool has_cg)
1613 {
1614 	if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) {
1615 		blk_log_dump_pdu(s, ent, has_cg);
1616 		trace_seq_printf(s, "[%d]\n", t_error(ent));
1617 	} else {
1618 		if (t_sec(ent))
1619 			trace_seq_printf(s, "%llu + %u [%d]\n",
1620 					 t_sector(ent),
1621 					 t_sec(ent), t_error(ent));
1622 		else
1623 			trace_seq_printf(s, "%llu [%d]\n",
1624 					 t_sector(ent), t_error(ent));
1625 	}
1626 }
1627 
1628 static void blk_log_remap(struct trace_seq *s, const struct trace_entry *ent, bool has_cg)
1629 {
1630 	const struct blk_io_trace_remap *__r = pdu_start(ent, has_cg);
1631 
1632 	trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n",
1633 			 t_sector(ent), t_sec(ent),
1634 			 MAJOR(be32_to_cpu(__r->device_from)),
1635 			 MINOR(be32_to_cpu(__r->device_from)),
1636 			 be64_to_cpu(__r->sector_from));
1637 }
1638 
1639 static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent, bool has_cg)
1640 {
1641 	char cmd[TASK_COMM_LEN];
1642 
1643 	trace_find_cmdline(ent->pid, cmd);
1644 
1645 	trace_seq_printf(s, "[%s]\n", cmd);
1646 }
1647 
1648 static void blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent, bool has_cg)
1649 {
1650 	char cmd[TASK_COMM_LEN];
1651 
1652 	trace_find_cmdline(ent->pid, cmd);
1653 
1654 	trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent, has_cg));
1655 }
1656 
1657 static void blk_log_split(struct trace_seq *s, const struct trace_entry *ent, bool has_cg)
1658 {
1659 	char cmd[TASK_COMM_LEN];
1660 
1661 	trace_find_cmdline(ent->pid, cmd);
1662 
1663 	trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent),
1664 			 get_pdu_int(ent, has_cg), cmd);
1665 }
1666 
1667 static void blk_log_msg(struct trace_seq *s, const struct trace_entry *ent,
1668 			bool has_cg)
1669 {
1670 
1671 	trace_seq_putmem(s, pdu_start(ent, has_cg),
1672 		pdu_real_len(ent, has_cg));
1673 	trace_seq_putc(s, '\n');
1674 }
1675 
1676 /*
1677  * struct tracer operations
1678  */
1679 
1680 static void blk_tracer_print_header(struct seq_file *m)
1681 {
1682 	if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC))
1683 		return;
1684 	seq_puts(m, "# DEV   CPU TIMESTAMP     PID ACT FLG\n"
1685 		    "#  |     |     |           |   |   |\n");
1686 }
1687 
1688 static void blk_tracer_start(struct trace_array *tr)
1689 {
1690 	blk_tracer_enabled = true;
1691 }
1692 
1693 static int blk_tracer_init(struct trace_array *tr)
1694 {
1695 	blk_tr = tr;
1696 	blk_tracer_start(tr);
1697 	return 0;
1698 }
1699 
1700 static void blk_tracer_stop(struct trace_array *tr)
1701 {
1702 	blk_tracer_enabled = false;
1703 }
1704 
1705 static void blk_tracer_reset(struct trace_array *tr)
1706 {
1707 	blk_tracer_stop(tr);
1708 }
1709 
1710 static const struct {
1711 	const char *act[2];
1712 	void	   (*print)(struct trace_seq *s, const struct trace_entry *ent,
1713 			    bool has_cg);
1714 } what2act[] = {
1715 	[__BLK_TA_QUEUE]	= {{  "Q", "queue" },	   blk_log_generic },
1716 	[__BLK_TA_BACKMERGE]	= {{  "M", "backmerge" },  blk_log_generic },
1717 	[__BLK_TA_FRONTMERGE]	= {{  "F", "frontmerge" }, blk_log_generic },
1718 	[__BLK_TA_GETRQ]	= {{  "G", "getrq" },	   blk_log_generic },
1719 	[__BLK_TA_SLEEPRQ]	= {{  "S", "sleeprq" },	   blk_log_generic },
1720 	[__BLK_TA_REQUEUE]	= {{  "R", "requeue" },	   blk_log_with_error },
1721 	[__BLK_TA_ISSUE]	= {{  "D", "issue" },	   blk_log_generic },
1722 	[__BLK_TA_COMPLETE]	= {{  "C", "complete" },   blk_log_with_error },
1723 	[__BLK_TA_PLUG]		= {{  "P", "plug" },	   blk_log_plug },
1724 	[__BLK_TA_UNPLUG_IO]	= {{  "U", "unplug_io" },  blk_log_unplug },
1725 	[__BLK_TA_UNPLUG_TIMER]	= {{ "UT", "unplug_timer" }, blk_log_unplug },
1726 	[__BLK_TA_INSERT]	= {{  "I", "insert" },	   blk_log_generic },
1727 	[__BLK_TA_SPLIT]	= {{  "X", "split" },	   blk_log_split },
1728 	[__BLK_TA_REMAP]	= {{  "A", "remap" },	   blk_log_remap },
1729 };
1730 
1731 static enum print_line_t print_one_line(struct trace_iterator *iter,
1732 					bool classic)
1733 {
1734 	struct trace_array *tr = iter->tr;
1735 	struct trace_seq *s = &iter->seq;
1736 	const struct blk_io_trace2 *t;
1737 	u16 what;
1738 	bool long_act;
1739 	blk_log_action_t *log_action;
1740 	bool has_cg;
1741 
1742 	t	   = te_blk_io_trace(iter->ent);
1743 	what	   = (t->action & ((1 << BLK_TC_SHIFT) - 1)) & ~__BLK_TA_CGROUP;
1744 	long_act   = !!(tr->trace_flags & TRACE_ITER(VERBOSE));
1745 	log_action = classic ? &blk_log_action_classic : &blk_log_action;
1746 	has_cg	   = t->action & __BLK_TA_CGROUP;
1747 
1748 	if ((t->action & ~__BLK_TN_CGROUP) == BLK_TN_MESSAGE) {
1749 		log_action(iter, long_act ? "message" : "m", has_cg);
1750 		blk_log_msg(s, iter->ent, has_cg);
1751 		return trace_handle_return(s);
1752 	}
1753 
1754 	if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act)))
1755 		trace_seq_printf(s, "Unknown action %x\n", what);
1756 	else {
1757 		log_action(iter, what2act[what].act[long_act], has_cg);
1758 		what2act[what].print(s, iter->ent, has_cg);
1759 	}
1760 
1761 	return trace_handle_return(s);
1762 }
1763 
1764 static enum print_line_t blk_trace_event_print(struct trace_iterator *iter,
1765 					       int flags, struct trace_event *event)
1766 {
1767 	return print_one_line(iter, false);
1768 }
1769 
1770 static void blk_trace_synthesize_old_trace(struct trace_iterator *iter)
1771 {
1772 	struct trace_seq *s = &iter->seq;
1773 	struct blk_io_trace2 *t = (struct blk_io_trace2 *)iter->ent;
1774 	const int offset = offsetof(struct blk_io_trace2, sector);
1775 	struct blk_io_trace old = {
1776 		.magic	  = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION,
1777 		.time     = iter->ts,
1778 	};
1779 
1780 	trace_seq_putmem(s, &old, offset);
1781 	trace_seq_putmem(s, &t->sector,
1782 			 sizeof(old) - offset + t->pdu_len);
1783 }
1784 
1785 static enum print_line_t
1786 blk_trace_event_print_binary(struct trace_iterator *iter, int flags,
1787 			     struct trace_event *event)
1788 {
1789 	blk_trace_synthesize_old_trace(iter);
1790 
1791 	return trace_handle_return(&iter->seq);
1792 }
1793 
1794 static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter)
1795 {
1796 	if ((iter->ent->type != TRACE_BLK) ||
1797 	    !(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC))
1798 		return TRACE_TYPE_UNHANDLED;
1799 
1800 	return print_one_line(iter, true);
1801 }
1802 
1803 static int
1804 blk_tracer_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
1805 {
1806 	/* don't output context-info for blk_classic output */
1807 	if (bit == TRACE_BLK_OPT_CLASSIC) {
1808 		if (set)
1809 			tr->trace_flags &= ~TRACE_ITER(CONTEXT_INFO);
1810 		else
1811 			tr->trace_flags |= TRACE_ITER(CONTEXT_INFO);
1812 	}
1813 	return 0;
1814 }
1815 
1816 static struct tracer blk_tracer __read_mostly = {
1817 	.name		= "blk",
1818 	.init		= blk_tracer_init,
1819 	.reset		= blk_tracer_reset,
1820 	.start		= blk_tracer_start,
1821 	.stop		= blk_tracer_stop,
1822 	.print_header	= blk_tracer_print_header,
1823 	.print_line	= blk_tracer_print_line,
1824 	.flags		= &blk_tracer_flags,
1825 	.set_flag	= blk_tracer_set_flag,
1826 };
1827 
1828 static struct trace_event_functions trace_blk_event_funcs = {
1829 	.trace		= blk_trace_event_print,
1830 	.binary		= blk_trace_event_print_binary,
1831 };
1832 
1833 static struct trace_event trace_blk_event = {
1834 	.type		= TRACE_BLK,
1835 	.funcs		= &trace_blk_event_funcs,
1836 };
1837 
1838 static struct work_struct blktrace_works __initdata;
1839 
1840 static int __init __init_blk_tracer(void)
1841 {
1842 	if (!register_trace_event(&trace_blk_event)) {
1843 		pr_warn("Warning: could not register block events\n");
1844 		return 1;
1845 	}
1846 
1847 	if (register_tracer(&blk_tracer) != 0) {
1848 		pr_warn("Warning: could not register the block tracer\n");
1849 		unregister_trace_event(&trace_blk_event);
1850 		return 1;
1851 	}
1852 
1853 	BUILD_BUG_ON(__alignof__(struct blk_user_trace_setup2) %
1854 		     __alignof__(long));
1855 	BUILD_BUG_ON(__alignof__(struct blk_io_trace2) % __alignof__(long));
1856 
1857 	return 0;
1858 }
1859 
1860 static void __init blktrace_works_func(struct work_struct *work)
1861 {
1862 	__init_blk_tracer();
1863 }
1864 
1865 static int __init init_blk_tracer(void)
1866 {
1867 	int ret = 0;
1868 
1869 	if (trace_init_wq) {
1870 		INIT_WORK(&blktrace_works, blktrace_works_func);
1871 		queue_work(trace_init_wq, &blktrace_works);
1872 	} else {
1873 		ret = __init_blk_tracer();
1874 	}
1875 
1876 	return ret;
1877 }
1878 
1879 device_initcall(init_blk_tracer);
1880 
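/*
 * Tear-down for the sysfs path: the queue's pointer is cleared first
 * (under q->debugfs_mutex), the trace is stopped and the global probe
 * reference dropped, and only after synchronize_rcu() has waited out
 * concurrent rcu_dereference() users is the blk_trace freed.
 */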
1881 static int blk_trace_remove_queue(struct request_queue *q)
1882 {
1883 	struct blk_trace *bt;
1884 
1885 	bt = rcu_replace_pointer(q->blk_trace, NULL,
1886 				 lockdep_is_held(&q->debugfs_mutex));
1887 	if (bt == NULL)
1888 		return -EINVAL;
1889 
1890 	blk_trace_stop(bt);
1891 
1892 	put_probe_ref();
1893 	synchronize_rcu();
1894 	blk_trace_free(q, bt);
1895 	return 0;
1896 }
1897 
1898 /*
1899  * Set up everything required to start tracing
1900  */
1901 static int blk_trace_setup_queue(struct request_queue *q,
1902 				 struct block_device *bdev)
1903 {
1904 	struct blk_trace *bt = NULL;
1905 	int ret = -ENOMEM;
1906 
1907 	bt = kzalloc(sizeof(*bt), GFP_KERNEL);
1908 	if (!bt)
1909 		return -ENOMEM;
1910 
1911 	bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG, __alignof__(char));
1912 	if (!bt->msg_data)
1913 		goto free_bt;
1914 
1915 	bt->dev = bdev->bd_dev;
1916 	bt->act_mask = (u16)-1;
1917 
1918 	blk_trace_setup_lba(bt, bdev);
1919 
1920 	rcu_assign_pointer(q->blk_trace, bt);
1921 	get_probe_ref();
1922 	return 0;
1923 
1924 free_bt:
1925 	blk_trace_free(q, bt);
1926 	return ret;
1927 }
1928 
1929 /*
1930  * sysfs interface to enable and configure tracing
1931  */
1932 
1933 static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
1934 					 struct device_attribute *attr,
1935 					 char *buf);
1936 static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
1937 					  struct device_attribute *attr,
1938 					  const char *buf, size_t count);
1939 #define BLK_TRACE_DEVICE_ATTR(_name) \
1940 	DEVICE_ATTR(_name, S_IRUGO | S_IWUSR, \
1941 		    sysfs_blk_trace_attr_show, \
1942 		    sysfs_blk_trace_attr_store)
1943 
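/*
 * For reference, "static BLK_TRACE_DEVICE_ATTR(enable);" below expands
 * to roughly (0644 == S_IRUGO | S_IWUSR):
 *
 *	static struct device_attribute dev_attr_enable =
 *		__ATTR(enable, 0644, sysfs_blk_trace_attr_show,
 *		       sysfs_blk_trace_attr_store);
 */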
1944 static BLK_TRACE_DEVICE_ATTR(enable);
1945 static BLK_TRACE_DEVICE_ATTR(act_mask);
1946 static BLK_TRACE_DEVICE_ATTR(pid);
1947 static BLK_TRACE_DEVICE_ATTR(start_lba);
1948 static BLK_TRACE_DEVICE_ATTR(end_lba);
1949 
1950 static struct attribute *blk_trace_attrs[] = {
1951 	&dev_attr_enable.attr,
1952 	&dev_attr_act_mask.attr,
1953 	&dev_attr_pid.attr,
1954 	&dev_attr_start_lba.attr,
1955 	&dev_attr_end_lba.attr,
1956 	NULL
1957 };
1958 
1959 struct attribute_group blk_trace_attr_group = {
1960 	.name  = "trace",
1961 	.attrs = blk_trace_attrs,
1962 };
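/*
 * The group is named "trace", so these attributes appear as
 * /sys/block/<disk>/trace/{enable,act_mask,pid,start_lba,end_lba}.
 * A minimal shell sketch, with sda as an example device:
 *
 *	# echo read,write > /sys/block/sda/trace/act_mask
 *	# echo 1 > /sys/block/sda/trace/enable
 *	# cat /sys/block/sda/trace/act_mask
 *	read,write
 */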
1963 
1964 static const struct {
1965 	int mask;
1966 	const char *str;
1967 } mask_maps[] = {
1968 	{ BLK_TC_READ,		"read"		},
1969 	{ BLK_TC_WRITE,		"write"		},
1970 	{ BLK_TC_FLUSH,		"flush"		},
1971 	{ BLK_TC_SYNC,		"sync"		},
1972 	{ BLK_TC_QUEUE,		"queue"		},
1973 	{ BLK_TC_REQUEUE,	"requeue"	},
1974 	{ BLK_TC_ISSUE,		"issue"		},
1975 	{ BLK_TC_COMPLETE,	"complete"	},
1976 	{ BLK_TC_FS,		"fs"		},
1977 	{ BLK_TC_PC,		"pc"		},
1978 	{ BLK_TC_NOTIFY,	"notify"	},
1979 	{ BLK_TC_AHEAD,		"ahead"		},
1980 	{ BLK_TC_META,		"meta"		},
1981 	{ BLK_TC_DISCARD,	"discard"	},
1982 	{ BLK_TC_DRV_DATA,	"drv_data"	},
1983 	{ BLK_TC_FUA,		"fua"		},
1984 	{ BLK_TC_WRITE_ZEROES,	"write-zeroes"	},
1985 };
1986 
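/*
 * Parse a comma-separated, case-insensitive list of the category names
 * above (e.g. "read,write,complete") into a BLK_TC_* bitmask.  Empty
 * tokens are skipped, an unknown name yields -EINVAL, and -ENOMEM is
 * returned if the scratch copy of the string cannot be allocated.
 */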
1987 static int blk_trace_str2mask(const char *str)
1988 {
1989 	int i;
1990 	int mask = 0;
1991 	char *buf, *s, *token;
1992 
1993 	buf = kstrdup(str, GFP_KERNEL);
1994 	if (buf == NULL)
1995 		return -ENOMEM;
1996 	s = strstrip(buf);
1997 
1998 	while (1) {
1999 		token = strsep(&s, ",");
2000 		if (token == NULL)
2001 			break;
2002 
2003 		if (*token == '\0')
2004 			continue;
2005 
2006 		for (i = 0; i < ARRAY_SIZE(mask_maps); i++) {
2007 			if (strcasecmp(token, mask_maps[i].str) == 0) {
2008 				mask |= mask_maps[i].mask;
2009 				break;
2010 			}
2011 		}
2012 		if (i == ARRAY_SIZE(mask_maps)) {
2013 			mask = -EINVAL;
2014 			break;
2015 		}
2016 	}
2017 	kfree(buf);
2018 
2019 	return mask;
2020 }
2021 
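/*
 * Inverse of the above: format the set bits of @mask as a
 * comma-separated list of category names followed by a newline, and
 * return the number of bytes written.
 */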
2022 static ssize_t blk_trace_mask2str(char *buf, int mask)
2023 {
2024 	int i;
2025 	char *p = buf;
2026 
2027 	for (i = 0; i < ARRAY_SIZE(mask_maps); i++) {
2028 		if (mask & mask_maps[i].mask) {
2029 			p += sprintf(p, "%s%s",
2030 				    (p == buf) ? "" : ",", mask_maps[i].str);
2031 		}
2032 	}
2033 	*p++ = '\n';
2034 
2035 	return p - buf;
2036 }
2037 
2038 static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
2039 					 struct device_attribute *attr,
2040 					 char *buf)
2041 {
2042 	struct block_device *bdev = dev_to_bdev(dev);
2043 	struct request_queue *q = bdev_get_queue(bdev);
2044 	struct blk_trace *bt;
2045 	ssize_t ret = -ENXIO;
2046 
2047 	blk_debugfs_lock_nomemsave(q);
2048 
2049 	bt = rcu_dereference_protected(q->blk_trace,
2050 				       lockdep_is_held(&q->debugfs_mutex));
2051 	if (attr == &dev_attr_enable) {
2052 		ret = sprintf(buf, "%u\n", !!bt);
2053 		goto out_unlock_bdev;
2054 	}
2055 
2056 	if (bt == NULL)
2057 		ret = sprintf(buf, "disabled\n");
2058 	else if (attr == &dev_attr_act_mask)
2059 		ret = blk_trace_mask2str(buf, bt->act_mask);
2060 	else if (attr == &dev_attr_pid)
2061 		ret = sprintf(buf, "%u\n", bt->pid);
2062 	else if (attr == &dev_attr_start_lba)
2063 		ret = sprintf(buf, "%llu\n", bt->start_lba);
2064 	else if (attr == &dev_attr_end_lba)
2065 		ret = sprintf(buf, "%llu\n", bt->end_lba);
2066 
2067 out_unlock_bdev:
2068 	blk_debugfs_unlock_nomemrestore(q);
2069 	return ret;
2070 }
2071 
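/*
 * Writing "enable" switches tracing on or off.  Writing any other
 * attribute implicitly sets tracing up on the queue first (if it is not
 * already) and then updates just that field; act_mask accepts either a
 * numeric mask or a name list parsed by blk_trace_str2mask().
 */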
2072 static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
2073 					  struct device_attribute *attr,
2074 					  const char *buf, size_t count)
2075 {
2076 	struct block_device *bdev = dev_to_bdev(dev);
2077 	struct request_queue *q = bdev_get_queue(bdev);
2078 	struct blk_trace *bt;
2079 	unsigned int memflags;
2080 	u64 value;
2081 	ssize_t ret = -EINVAL;
2082 
2083 	if (count == 0)
2084 		goto out;
2085 
2086 	if (attr == &dev_attr_act_mask) {
2087 		if (kstrtoull(buf, 0, &value)) {
2088 			/* Assume it is a list of trace category names */
2089 			ret = blk_trace_str2mask(buf);
2090 			if (ret < 0)
2091 				goto out;
2092 			value = ret;
2093 		}
2094 	} else {
2095 		if (kstrtoull(buf, 0, &value))
2096 			goto out;
2097 	}
2098 
2099 	memflags = blk_debugfs_lock(q);
2100 
2101 	bt = rcu_dereference_protected(q->blk_trace,
2102 				       lockdep_is_held(&q->debugfs_mutex));
2103 	if (attr == &dev_attr_enable) {
2104 		if (!!value == !!bt) {
2105 			ret = 0;
2106 			goto out_unlock_bdev;
2107 		}
2108 		if (value)
2109 			ret = blk_trace_setup_queue(q, bdev);
2110 		else
2111 			ret = blk_trace_remove_queue(q);
2112 		goto out_unlock_bdev;
2113 	}
2114 
2115 	ret = 0;
2116 	if (bt == NULL) {
2117 		ret = blk_trace_setup_queue(q, bdev);
2118 		bt = rcu_dereference_protected(q->blk_trace,
2119 				lockdep_is_held(&q->debugfs_mutex));
2120 	}
2121 
2122 	if (ret == 0) {
2123 		if (attr == &dev_attr_act_mask)
2124 			bt->act_mask = value;
2125 		else if (attr == &dev_attr_pid)
2126 			bt->pid = value;
2127 		else if (attr == &dev_attr_start_lba)
2128 			bt->start_lba = value;
2129 		else if (attr == &dev_attr_end_lba)
2130 			bt->end_lba = value;
2131 	}
2132 
2133 out_unlock_bdev:
2134 	blk_debugfs_unlock(q, memflags);
2135 out:
2136 	return ret ? ret : count;
2137 }
2138 #endif /* CONFIG_BLK_DEV_IO_TRACE */
2139 
2140 #ifdef CONFIG_EVENT_TRACING
2141 
2142 /**
2143  * blk_fill_rwbs - Fill @rwbs by mapping the request op and flags to a character string.
2144  * @rwbs:	buffer to be filled
2145  * @opf:	request operation type (REQ_OP_XXX) and flags for the tracepoint
2146  *
2147  * Description:
2148  *     Maps the request operation and each set flag to a single character
2149  *     and fills the caller-provided buffer with the resulting string.
2150  *
2151  **/
2152 void blk_fill_rwbs(char *rwbs, blk_opf_t opf)
2153 {
2154 	int i = 0;
2155 
2156 	if (opf & REQ_PREFLUSH)
2157 		rwbs[i++] = 'F';
2158 
2159 	switch (opf & REQ_OP_MASK) {
2160 	case REQ_OP_WRITE:
2161 		rwbs[i++] = 'W';
2162 		break;
2163 	case REQ_OP_DISCARD:
2164 		rwbs[i++] = 'D';
2165 		break;
2166 	case REQ_OP_SECURE_ERASE:
2167 		rwbs[i++] = 'D';
2168 		rwbs[i++] = 'E';
2169 		break;
2170 	case REQ_OP_FLUSH:
2171 		rwbs[i++] = 'F';
2172 		break;
2173 	case REQ_OP_READ:
2174 		rwbs[i++] = 'R';
2175 		break;
2176 	case REQ_OP_ZONE_APPEND:
2177 		rwbs[i++] = 'Z';
2178 		rwbs[i++] = 'A';
2179 		break;
2180 	case REQ_OP_ZONE_RESET:
2181 	case REQ_OP_ZONE_RESET_ALL:
2182 		rwbs[i++] = 'Z';
2183 		rwbs[i++] = 'R';
2184 		if ((opf & REQ_OP_MASK) == REQ_OP_ZONE_RESET_ALL)
2185 			rwbs[i++] = 'A';
2186 		break;
2187 	case REQ_OP_ZONE_FINISH:
2188 		rwbs[i++] = 'Z';
2189 		rwbs[i++] = 'F';
2190 		break;
2191 	case REQ_OP_ZONE_OPEN:
2192 		rwbs[i++] = 'Z';
2193 		rwbs[i++] = 'O';
2194 		break;
2195 	case REQ_OP_ZONE_CLOSE:
2196 		rwbs[i++] = 'Z';
2197 		rwbs[i++] = 'C';
2198 		break;
2199 	case REQ_OP_WRITE_ZEROES:
2200 		rwbs[i++] = 'W';
2201 		rwbs[i++] = 'Z';
2202 		break;
2203 	default:
2204 		rwbs[i++] = 'N';
2205 	}
2206 
2207 	if (opf & REQ_FUA)
2208 		rwbs[i++] = 'F';
2209 	if (opf & REQ_RAHEAD)
2210 		rwbs[i++] = 'A';
2211 	if (opf & REQ_SYNC)
2212 		rwbs[i++] = 'S';
2213 	if (opf & REQ_META)
2214 		rwbs[i++] = 'M';
2215 	if (opf & REQ_ATOMIC)
2216 		rwbs[i++] = 'U';
2217 
2218 	WARN_ON_ONCE(i >= RWBS_LEN);
2219 
2220 	rwbs[i] = '\0';
2221 }
2222 EXPORT_SYMBOL_GPL(blk_fill_rwbs);
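/*
 * Worked examples of the mapping above (illustrative):
 *
 *	blk_fill_rwbs(rwbs, REQ_OP_READ | REQ_RAHEAD);           -> "RA"
 *	blk_fill_rwbs(rwbs, REQ_OP_WRITE | REQ_SYNC | REQ_FUA);  -> "WFS"
 *	blk_fill_rwbs(rwbs, REQ_PREFLUSH | REQ_OP_WRITE);        -> "FW"
 *	blk_fill_rwbs(rwbs, REQ_OP_DISCARD);                     -> "D"
 */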
2223 
2224 #endif /* CONFIG_EVENT_TRACING */
2225 
2226