xref: /linux/tools/perf/util/s390-cpumsf.c (revision 3f2a5ba784b808109cac0aac921213e43143a216)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright IBM Corp. 2018
4  * Auxtrace support for s390 CPU-Measurement Sampling Facility
5  *
6  * Author(s):  Thomas Richter <tmricht@linux.ibm.com>
7  *
8  * Auxiliary traces are collected during 'perf record' using rbd000 event.
9  * Several PERF_RECORD_XXX are generated during recording:
10  *
11  * PERF_RECORD_AUX:
12  *	Records that new data landed in the AUX buffer part.
13  * PERF_RECORD_AUXTRACE:
14  *	Defines auxtrace data. Followed by the actual data. The contents of
15  *	the auxtrace data is dependent on the event and the CPU.
16  *	This record is generated by perf record command. For details
17  *	see Documentation/perf.data-file-format.txt.
18  * PERF_RECORD_AUXTRACE_INFO:
19  *	Defines a table of contains for PERF_RECORD_AUXTRACE records. This
20  *	record is generated during 'perf record' command. Each record contains
21  *	up to 256 entries describing offset and size of the AUXTRACE data in the
22  *	perf.data file.
23  * PERF_RECORD_AUXTRACE_ERROR:
24  *	Indicates an error during AUXTRACE collection such as buffer overflow.
25  * PERF_RECORD_FINISHED_ROUND:
26  *	Perf events are not necessarily in time stamp order, as they can be
27  *	collected in parallel on different CPUs. If the events should be
28  *	processed in time order they need to be sorted first.
29  *	Perf report guarantees that there is no reordering over a
30  *	PERF_RECORD_FINISHED_ROUND boundary event. All perf records with a
31  *	time stamp lower than this record are processed (and displayed) before
32  *	the succeeding perf record are processed.
33  *
34  * These records are evaluated during perf report command.
35  *
36  * 1. PERF_RECORD_AUXTRACE_INFO is used to set up the infrastructure for
37  * auxiliary trace data processing. See s390_cpumsf_process_auxtrace_info()
38  * below.
39  * Auxiliary trace data is collected per CPU. To merge the data into the report
40  * an auxtrace_queue is created for each CPU. It is assumed that the auxtrace
41  * data is in ascending order.
42  *
43  * Each queue has a double linked list of auxtrace_buffers. This list contains
44  * the offset and size of a CPU's auxtrace data. During auxtrace processing
45  * the data portion is mmap()'ed.
46  *
47  * To sort the queues in chronological order, all queue access is controlled
48  * by the auxtrace_heap. This is basically a stack, each stack element has two
49  * entries, the queue number and a time stamp. However the stack is sorted by
50  * the time stamps. The highest time stamp is at the bottom the lowest
51  * (nearest) time stamp is at the top. That sort order is maintained at all
52  * times!
53  *
54  * After the auxtrace infrastructure has been setup, the auxtrace queues are
55  * filled with data (offset/size pairs) and the auxtrace_heap is populated.
56  *
57  * 2. PERF_RECORD_XXX processing triggers access to the auxtrace_queues.
58  * Each record is handled by s390_cpumsf_process_event(). The time stamp of
59  * the perf record is compared with the time stamp located on the auxtrace_heap
60  * top element. If that time stamp is lower than the time stamp from the
61  * record sample, the auxtrace queues will be processed. As auxtrace queues
62  * control many auxtrace_buffers and each buffer can be quite large, the
63  * auxtrace buffer might be processed only partially. In this case the
64  * position in the auxtrace_buffer of that queue is remembered and the time
65  * stamp of the last processed entry of the auxtrace_buffer replaces the
66  * current auxtrace_heap top.
67  *
68  * 3. Auxtrace_queues might run of out data and are fed by the
69  * PERF_RECORD_AUXTRACE handling, see s390_cpumsf_process_auxtrace_event().
70  *
71  * Event Generation
72  * Each sampling-data entry in the auxiliary trace data generates a perf sample.
73  * This sample is filled
74  * with data from the auxtrace such as PID/TID, instruction address, CPU state,
75  * etc. This sample is processed with perf_session__deliver_synth_event() to
76  * be included into the GUI.
77  *
78  * 4. PERF_RECORD_FINISHED_ROUND event is used to process all the remaining
79  * auxiliary traces entries until the time stamp of this record is reached
80  * auxtrace_heap top. This is triggered by ordered_event->deliver().
81  *
82  *
83  * Perf event processing.
84  * Event processing of PERF_RECORD_XXX entries relies on time stamp entries.
85  * This is the function call sequence:
86  *
87  * __cmd_report()
88  * |
89  * perf_session__process_events()
90  * |
91  * __perf_session__process_events()
92  * |
93  * perf_session__process_event()
94  * |  This functions splits the PERF_RECORD_XXX records.
95  * |  - Those generated by perf record command (type number equal or higher
96  * |    than PERF_RECORD_USER_TYPE_START) are handled by
97  * |    perf_session__process_user_event(see below)
98  * |  - Those generated by the kernel are handled by
99  * |    evlist__parse_sample_timestamp()
100  * |
101  * evlist__parse_sample_timestamp()
102  * |  Extract time stamp from sample data.
103  * |
104  * perf_session__queue_event()
105  * |  If timestamp is positive the sample is entered into an ordered_event
106  * |  list, sort order is the timestamp. The event processing is deferred until
107  * |  later (see perf_session__process_user_event()).
108  * |  Other timestamps (0 or -1) are handled immediately by
109  * |  perf_session__deliver_event(). These are events generated at start up
110  * |  of command perf record. They create PERF_RECORD_COMM and PERF_RECORD_MMAP*
111  * |  records. They are needed to create a list of running processes and its
112  * |  memory mappings and layout. They are needed at the beginning to enable
113  * |  command perf report to create process trees and memory mappings.
114  * |
115  * perf_session__deliver_event()
116  * |  Delivers a PERF_RECORD_XXX entry for handling.
117  * |
118  * auxtrace__process_event()
119  * |  The timestamp of the PERF_RECORD_XXX entry is taken to correlate with
120  * |  time stamps from the auxiliary trace buffers. This enables
121  * |  synchronization between auxiliary trace data and the events on the
122  * |  perf.data file.
123  * |
124  * machine__deliver_event()
125  * |  Handles the PERF_RECORD_XXX event. This depends on the record type.
126  *    It might update the process tree, update a process memory map or enter
127  *    a sample with IP and call back chain data into GUI data pool.
128  *
129  *
130  * Deferred processing determined by perf_session__process_user_event() is
131  * finally processed when a PERF_RECORD_FINISHED_ROUND is encountered. These
132  * are generated during command perf record.
133  * The timestamp of PERF_RECORD_FINISHED_ROUND event is taken to process all
134  * PERF_RECORD_XXX entries stored in the ordered_event list. This list was
135  * built up while reading the perf.data file.
136  * Each event is now processed by calling perf_session__deliver_event().
137  * This enables time synchronization between the data in the perf.data file and
138  * the data in the auxiliary trace buffers.
139  */
140 
141 #include <endian.h>
142 #include <errno.h>
143 #include <byteswap.h>
144 #include <inttypes.h>
145 #include <linux/kernel.h>
146 #include <linux/types.h>
147 #include <linux/bitops.h>
148 #include <linux/log2.h>
149 #include <linux/zalloc.h>
150 
151 #include <sys/stat.h>
152 #include <sys/types.h>
153 
154 #include "color.h"
155 #include "evsel.h"
156 #include "evlist.h"
157 #include "machine.h"
158 #include "session.h"
159 #include "tool.h"
160 #include "debug.h"
161 #include "auxtrace.h"
162 #include "s390-cpumsf.h"
163 #include "s390-cpumsf-kernel.h"
164 #include "s390-cpumcf-kernel.h"
165 #include "config.h"
166 #include "util/sample.h"
167 
168 struct s390_cpumsf {
169 	struct auxtrace		auxtrace;
170 	struct auxtrace_queues	queues;
171 	struct auxtrace_heap	heap;
172 	struct perf_session	*session;
173 	struct machine		*machine;
174 	u32			auxtrace_type;
175 	u32			pmu_type;
176 	u16			machine_type;
177 	bool			data_queued;
178 	bool			use_logfile;
179 	char			*logdir;
180 };
181 
182 struct s390_cpumsf_queue {
183 	struct s390_cpumsf	*sf;
184 	unsigned int		queue_nr;
185 	struct auxtrace_buffer	*buffer;
186 	int			cpu;
187 	FILE			*logfile;
188 	FILE			*logfile_ctr;
189 };
190 
191 /* Check if the raw data should be dumped to file. If this is the case and
192  * the file to dump to has not been opened for writing, do so.
193  *
194  * Return 0 on success and greater zero on error so processing continues.
195  */
196 static int s390_cpumcf_dumpctr(struct s390_cpumsf *sf,
197 			       struct perf_sample *sample)
198 {
199 	struct s390_cpumsf_queue *sfq;
200 	struct auxtrace_queue *q;
201 	int rc = 0;
202 
203 	if (!sf->use_logfile || sf->queues.nr_queues <= sample->cpu)
204 		return rc;
205 
206 	q = &sf->queues.queue_array[sample->cpu];
207 	sfq = q->priv;
208 	if (!sfq)		/* Queue not yet allocated */
209 		return rc;
210 
211 	if (!sfq->logfile_ctr) {
212 		char *name;
213 
214 		rc = (sf->logdir)
215 			? asprintf(&name, "%s/aux.ctr.%02x",
216 				 sf->logdir, sample->cpu)
217 			: asprintf(&name, "aux.ctr.%02x", sample->cpu);
218 		if (rc > 0)
219 			sfq->logfile_ctr = fopen(name, "w");
220 		if (sfq->logfile_ctr == NULL) {
221 			pr_err("Failed to open counter set log file %s, "
222 			       "continue...\n", name);
223 			rc = 1;
224 		}
225 		free(name);
226 	}
227 
228 	if (sfq->logfile_ctr) {
229 		/* See comment above for -4 */
230 		size_t n = fwrite(sample->raw_data, sample->raw_size - 4, 1,
231 				  sfq->logfile_ctr);
232 		if (n != 1) {
233 			pr_err("Failed to write counter set data\n");
234 			rc = 1;
235 		}
236 	}
237 	return rc;
238 }
239 
240 /* Display s390 CPU measurement facility basic-sampling data entry
241  * Data written on s390 in big endian byte order and contains bit
242  * fields across byte boundaries.
243  */
244 static bool s390_cpumsf_basic_show(const char *color, size_t pos,
245 				   struct hws_basic_entry *basicp)
246 {
247 	struct hws_basic_entry *basic = basicp;
248 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
249 	struct hws_basic_entry local;
250 	unsigned long long word = be64toh(*(unsigned long long *)basicp);
251 
252 	memset(&local, 0, sizeof(local));
253 	local.def = be16toh(basicp->def);
254 	local.prim_asn = word & 0xffff;
255 	local.CL = word >> 30 & 0x3;
256 	local.I = word >> 32 & 0x1;
257 	local.AS = word >> 33 & 0x3;
258 	local.P = word >> 35 & 0x1;
259 	local.W = word >> 36 & 0x1;
260 	local.T = word >> 37 & 0x1;
261 	local.U = word >> 40 & 0xf;
262 	local.ia = be64toh(basicp->ia);
263 	local.gpp = be64toh(basicp->gpp);
264 	local.hpp = be64toh(basicp->hpp);
265 	basic = &local;
266 #endif
267 	if (basic->def != 1) {
268 		pr_err("Invalid AUX trace basic entry [%#08zx]\n", pos);
269 		return false;
270 	}
271 	color_fprintf(stdout, color, "    [%#08zx] Basic   Def:%04x Inst:%#04x"
272 		      " %c%c%c%c AS:%d ASN:%#04x IA:%#018llx\n"
273 		      "\t\tCL:%d HPP:%#018llx GPP:%#018llx\n",
274 		      pos, basic->def, basic->U,
275 		      basic->T ? 'T' : ' ',
276 		      basic->W ? 'W' : ' ',
277 		      basic->P ? 'P' : ' ',
278 		      basic->I ? 'I' : ' ',
279 		      basic->AS, basic->prim_asn, basic->ia, basic->CL,
280 		      basic->hpp, basic->gpp);
281 	return true;
282 }
283 
284 /* Display s390 CPU measurement facility diagnostic-sampling data entry.
285  * Data written on s390 in big endian byte order and contains bit
286  * fields across byte boundaries.
287  */
288 static bool s390_cpumsf_diag_show(const char *color, size_t pos,
289 				  struct hws_diag_entry *diagp)
290 {
291 	struct hws_diag_entry *diag = diagp;
292 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
293 	struct hws_diag_entry local;
294 	unsigned long long word = be64toh(*(unsigned long long *)diagp);
295 
296 	local.def = be16toh(diagp->def);
297 	local.I = word >> 32 & 0x1;
298 	diag = &local;
299 #endif
300 	if (diag->def < S390_CPUMSF_DIAG_DEF_FIRST) {
301 		pr_err("Invalid AUX trace diagnostic entry [%#08zx]\n", pos);
302 		return false;
303 	}
304 	color_fprintf(stdout, color, "    [%#08zx] Diag    Def:%04x %c\n",
305 		      pos, diag->def, diag->I ? 'I' : ' ');
306 	return true;
307 }
308 
309 /* Return TOD timestamp contained in an trailer entry */
310 static unsigned long long trailer_timestamp(struct hws_trailer_entry *te,
311 					    int idx)
312 {
313 	/* te->t set: TOD in STCKE format, bytes 8-15
314 	 * to->t not set: TOD in STCK format, bytes 0-7
315 	 */
316 	unsigned long long ts;
317 
318 	memcpy(&ts, &te->timestamp[idx], sizeof(ts));
319 	return be64toh(ts);
320 }
321 
322 /* Display s390 CPU measurement facility trailer entry */
323 static bool s390_cpumsf_trailer_show(const char *color, size_t pos,
324 				     struct hws_trailer_entry *te)
325 {
326 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
327 	struct hws_trailer_entry local;
328 	const unsigned long long flags = be64toh(te->flags);
329 
330 	memset(&local, 0, sizeof(local));
331 	local.f = flags >> 63 & 0x1;
332 	local.a = flags >> 62 & 0x1;
333 	local.t = flags >> 61 & 0x1;
334 	local.bsdes = be16toh((flags >> 16 & 0xffff));
335 	local.dsdes = be16toh((flags & 0xffff));
336 	memcpy(&local.timestamp, te->timestamp, sizeof(te->timestamp));
337 	local.overflow = be64toh(te->overflow);
338 	local.clock_base = be64toh(te->progusage[0]) >> 63 & 1;
339 	local.progusage2 = be64toh(te->progusage2);
340 	te = &local;
341 #endif
342 	if (te->bsdes != sizeof(struct hws_basic_entry)) {
343 		pr_err("Invalid AUX trace trailer entry [%#08zx]\n", pos);
344 		return false;
345 	}
346 	color_fprintf(stdout, color, "    [%#08zx] Trailer %c%c%c bsdes:%d"
347 		      " dsdes:%d Overflow:%lld Time:%#llx\n"
348 		      "\t\tC:%d TOD:%#llx\n",
349 		      pos,
350 		      te->f ? 'F' : ' ',
351 		      te->a ? 'A' : ' ',
352 		      te->t ? 'T' : ' ',
353 		      te->bsdes, te->dsdes, te->overflow,
354 		      trailer_timestamp(te, te->clock_base),
355 		      te->clock_base, te->progusage2);
356 	return true;
357 }
358 
359 /* Test a sample data block. It must be 4KB or a multiple thereof in size and
360  * 4KB page aligned. Each sample data page has a trailer entry at the
361  * end which contains the sample entry data sizes.
362  *
363  * Return true if the sample data block passes the checks and set the
364  * basic set entry size and diagnostic set entry size.
365  *
366  * Return false on failure.
367  *
368  * Note: Old hardware does not set the basic or diagnostic entry sizes
369  * in the trailer entry. Use the type number instead.
370  */
371 static bool s390_cpumsf_validate(int machine_type,
372 				 unsigned char *buf, size_t len,
373 				 unsigned short *bsdes,
374 				 unsigned short *dsdes)
375 {
376 	struct hws_basic_entry *basic = (struct hws_basic_entry *)buf;
377 	struct hws_trailer_entry *te;
378 
379 	*dsdes = *bsdes = 0;
380 	if (len & (S390_CPUMSF_PAGESZ - 1))	/* Illegal size */
381 		return false;
382 	if (be16toh(basic->def) != 1)	/* No basic set entry, must be first */
383 		return false;
384 	/* Check for trailer entry at end of SDB */
385 	te = (struct hws_trailer_entry *)(buf + S390_CPUMSF_PAGESZ
386 					      - sizeof(*te));
387 	*bsdes = be16toh(te->bsdes);
388 	*dsdes = be16toh(te->dsdes);
389 	if (!te->bsdes && !te->dsdes) {
390 		/* Very old hardware, use CPUID */
391 		switch (machine_type) {
392 		case 2097:
393 		case 2098:
394 			*dsdes = 64;
395 			*bsdes = 32;
396 			break;
397 		case 2817:
398 		case 2818:
399 			*dsdes = 74;
400 			*bsdes = 32;
401 			break;
402 		case 2827:
403 		case 2828:
404 			*dsdes = 85;
405 			*bsdes = 32;
406 			break;
407 		case 2964:
408 		case 2965:
409 			*dsdes = 112;
410 			*bsdes = 32;
411 			break;
412 		default:
413 			/* Illegal trailer entry */
414 			return false;
415 		}
416 	}
417 	return true;
418 }
419 
420 /* Return true if there is room for another entry */
421 static bool s390_cpumsf_reached_trailer(size_t entry_sz, size_t pos)
422 {
423 	size_t payload = S390_CPUMSF_PAGESZ - sizeof(struct hws_trailer_entry);
424 
425 	if (payload - (pos & (S390_CPUMSF_PAGESZ - 1)) < entry_sz)
426 		return false;
427 	return true;
428 }
429 
430 /* Dump an auxiliary buffer. These buffers are multiple of
431  * 4KB SDB pages.
432  */
433 static void s390_cpumsf_dump(struct s390_cpumsf *sf,
434 			     unsigned char *buf, size_t len)
435 {
436 	const char *color = PERF_COLOR_BLUE;
437 	struct hws_basic_entry *basic;
438 	struct hws_diag_entry *diag;
439 	unsigned short bsdes, dsdes;
440 	size_t pos = 0;
441 
442 	color_fprintf(stdout, color,
443 		      ". ... s390 AUX data: size %zu bytes\n",
444 		      len);
445 
446 	if (!s390_cpumsf_validate(sf->machine_type, buf, len, &bsdes,
447 				  &dsdes)) {
448 		pr_err("Invalid AUX trace data block size:%zu"
449 		       " (type:%d bsdes:%hd dsdes:%hd)\n",
450 		       len, sf->machine_type, bsdes, dsdes);
451 		return;
452 	}
453 
454 	/* s390 kernel always returns 4KB blocks fully occupied,
455 	 * no partially filled SDBs.
456 	 */
457 	while (pos < len) {
458 		/* Handle Basic entry */
459 		basic = (struct hws_basic_entry *)(buf + pos);
460 		if (s390_cpumsf_basic_show(color, pos, basic))
461 			pos += bsdes;
462 		else
463 			return;
464 
465 		/* Handle Diagnostic entry */
466 		diag = (struct hws_diag_entry *)(buf + pos);
467 		if (s390_cpumsf_diag_show(color, pos, diag))
468 			pos += dsdes;
469 		else
470 			return;
471 
472 		/* Check for trailer entry */
473 		if (!s390_cpumsf_reached_trailer(bsdes + dsdes, pos)) {
474 			/* Show trailer entry */
475 			struct hws_trailer_entry te;
476 
477 			pos = (pos + S390_CPUMSF_PAGESZ)
478 			       & ~(S390_CPUMSF_PAGESZ - 1);
479 			pos -= sizeof(te);
480 			memcpy(&te, buf + pos, sizeof(te));
481 			/* Set descriptor sizes in case of old hardware
482 			 * where these values are not set.
483 			 */
484 			te.bsdes = bsdes;
485 			te.dsdes = dsdes;
486 			if (s390_cpumsf_trailer_show(color, pos, &te))
487 				pos += sizeof(te);
488 			else
489 				return;
490 		}
491 	}
492 }
493 
494 static void s390_cpumsf_dump_event(struct s390_cpumsf *sf, unsigned char *buf,
495 				   size_t len)
496 {
497 	printf(".\n");
498 	s390_cpumsf_dump(sf, buf, len);
499 }
500 
501 #define	S390_LPP_PID_MASK	0xffffffff
502 
503 static bool s390_cpumsf_make_event(size_t pos,
504 				   struct hws_basic_entry *basic,
505 				   struct s390_cpumsf_queue *sfq)
506 {
507 	struct perf_sample sample = {
508 				.ip = basic->ia,
509 				.pid = basic->hpp & S390_LPP_PID_MASK,
510 				.tid = basic->hpp & S390_LPP_PID_MASK,
511 				.cpumode = PERF_RECORD_MISC_CPUMODE_UNKNOWN,
512 				.cpu = sfq->cpu,
513 				.period = 1
514 			    };
515 	union perf_event event;
516 	int ret;
517 
518 	memset(&event, 0, sizeof(event));
519 	if (basic->CL == 1)	/* Native LPAR mode */
520 		sample.cpumode = basic->P ? PERF_RECORD_MISC_USER
521 					  : PERF_RECORD_MISC_KERNEL;
522 	else if (basic->CL == 2)	/* Guest kernel/user space */
523 		sample.cpumode = basic->P ? PERF_RECORD_MISC_GUEST_USER
524 					  : PERF_RECORD_MISC_GUEST_KERNEL;
525 	else if (basic->gpp || basic->prim_asn != 0xffff)
526 		/* Use heuristics on old hardware */
527 		sample.cpumode = basic->P ? PERF_RECORD_MISC_GUEST_USER
528 					  : PERF_RECORD_MISC_GUEST_KERNEL;
529 	else
530 		sample.cpumode = basic->P ? PERF_RECORD_MISC_USER
531 					  : PERF_RECORD_MISC_KERNEL;
532 
533 	event.sample.header.type = PERF_RECORD_SAMPLE;
534 	event.sample.header.misc = sample.cpumode;
535 	event.sample.header.size = sizeof(struct perf_event_header);
536 
537 	pr_debug4("%s pos:%#zx ip:%#" PRIx64 " P:%d CL:%d pid:%d.%d cpumode:%d cpu:%d\n",
538 		 __func__, pos, sample.ip, basic->P, basic->CL, sample.pid,
539 		 sample.tid, sample.cpumode, sample.cpu);
540 	ret = perf_session__deliver_synth_event(sfq->sf->session, &event, &sample);
541 	perf_sample__exit(&sample);
542 	if (ret) {
543 		pr_err("s390 Auxiliary Trace: failed to deliver event\n");
544 		return false;
545 	}
546 	return true;
547 }
548 
549 static unsigned long long get_trailer_time(const unsigned char *buf)
550 {
551 	struct hws_trailer_entry *te;
552 	unsigned long long aux_time, progusage2;
553 	bool clock_base;
554 
555 	te = (struct hws_trailer_entry *)(buf + S390_CPUMSF_PAGESZ
556 					      - sizeof(*te));
557 
558 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
559 	clock_base = be64toh(te->progusage[0]) >> 63 & 0x1;
560 	progusage2 = be64toh(te->progusage[1]);
561 #else
562 	clock_base = te->clock_base;
563 	progusage2 = te->progusage2;
564 #endif
565 	if (!clock_base)	/* TOD_CLOCK_BASE value missing */
566 		return 0;
567 
568 	/* Correct calculation to convert time stamp in trailer entry to
569 	 * nano seconds (taken from arch/s390 function tod_to_ns()).
570 	 * TOD_CLOCK_BASE is stored in trailer entry member progusage2.
571 	 */
572 	aux_time = trailer_timestamp(te, clock_base) - progusage2;
573 	aux_time = (aux_time >> 9) * 125 + (((aux_time & 0x1ff) * 125) >> 9);
574 	return aux_time;
575 }
576 
577 /* Process the data samples of a single queue. The first parameter is a
578  * pointer to the queue, the second parameter is the time stamp. This
579  * is the time stamp:
580  * - of the event that triggered this processing.
581  * - or the time stamp when the last processing of this queue stopped.
582  *   In this case it stopped at a 4KB page boundary and record the
583  *   position on where to continue processing on the next invocation
584  *   (see buffer->use_data and buffer->use_size).
585  *
586  * When this function returns the second parameter is updated to
587  * reflect the time stamp of the last processed auxiliary data entry
588  * (taken from the trailer entry of that page). The caller uses this
589  * returned time stamp to record the last processed entry in this
590  * queue.
591  *
592  * The function returns:
593  * 0:  Processing successful. The second parameter returns the
594  *     time stamp from the trailer entry until which position
595  *     processing took place. Subsequent calls resume from this
596  *     position.
597  * <0: An error occurred during processing. The second parameter
598  *     returns the maximum time stamp.
599  * >0: Done on this queue. The second parameter returns the
600  *     maximum time stamp.
601  */
602 static int s390_cpumsf_samples(struct s390_cpumsf_queue *sfq, u64 *ts)
603 {
604 	struct s390_cpumsf *sf = sfq->sf;
605 	unsigned char *buf = sfq->buffer->use_data;
606 	size_t len = sfq->buffer->use_size;
607 	struct hws_basic_entry *basic;
608 	unsigned short bsdes, dsdes;
609 	size_t pos = 0;
610 	int err = 1;
611 	u64 aux_ts;
612 
613 	if (!s390_cpumsf_validate(sf->machine_type, buf, len, &bsdes,
614 				  &dsdes)) {
615 		*ts = ~0ULL;
616 		return -1;
617 	}
618 
619 	/* Get trailer entry time stamp and check if entries in
620 	 * this auxiliary page are ready for processing. If the
621 	 * time stamp of the first entry is too high, whole buffer
622 	 * can be skipped. In this case return time stamp.
623 	 */
624 	aux_ts = get_trailer_time(buf);
625 	if (!aux_ts) {
626 		pr_err("[%#08" PRIx64 "] Invalid AUX trailer entry TOD clock base\n",
627 		       (s64)sfq->buffer->data_offset);
628 		aux_ts = ~0ULL;
629 		goto out;
630 	}
631 	if (aux_ts > *ts) {
632 		*ts = aux_ts;
633 		return 0;
634 	}
635 
636 	while (pos < len) {
637 		/* Handle Basic entry */
638 		basic = (struct hws_basic_entry *)(buf + pos);
639 		if (s390_cpumsf_make_event(pos, basic, sfq))
640 			pos += bsdes;
641 		else {
642 			err = -EBADF;
643 			goto out;
644 		}
645 
646 		pos += dsdes;	/* Skip diagnostic entry */
647 
648 		/* Check for trailer entry */
649 		if (!s390_cpumsf_reached_trailer(bsdes + dsdes, pos)) {
650 			pos = (pos + S390_CPUMSF_PAGESZ)
651 			       & ~(S390_CPUMSF_PAGESZ - 1);
652 			/* Check existence of next page */
653 			if (pos >= len)
654 				break;
655 			aux_ts = get_trailer_time(buf + pos);
656 			if (!aux_ts) {
657 				aux_ts = ~0ULL;
658 				goto out;
659 			}
660 			if (aux_ts > *ts) {
661 				*ts = aux_ts;
662 				sfq->buffer->use_data += pos;
663 				sfq->buffer->use_size -= pos;
664 				return 0;
665 			}
666 		}
667 	}
668 out:
669 	*ts = aux_ts;
670 	sfq->buffer->use_size = 0;
671 	sfq->buffer->use_data = NULL;
672 	return err;	/* Buffer completely scanned or error */
673 }
674 
675 /* Run the s390 auxiliary trace decoder.
676  * Select the queue buffer to operate on, the caller already selected
677  * the proper queue, depending on second parameter 'ts'.
678  * This is the time stamp until which the auxiliary entries should
679  * be processed. This value is updated by called functions and
680  * returned to the caller.
681  *
682  * Resume processing in the current buffer. If there is no buffer
683  * get a new buffer from the queue and setup start position for
684  * processing.
685  * When a buffer is completely processed remove it from the queue
686  * before returning.
687  *
688  * This function returns
689  * 1: When the queue is empty. Second parameter will be set to
690  *    maximum time stamp.
691  * 0: Normal processing done.
692  * <0: Error during queue buffer setup. This causes the caller
693  *     to stop processing completely.
694  */
695 static int s390_cpumsf_run_decoder(struct s390_cpumsf_queue *sfq,
696 				   u64 *ts)
697 {
698 
699 	struct auxtrace_buffer *buffer;
700 	struct auxtrace_queue *queue;
701 	int err;
702 
703 	queue = &sfq->sf->queues.queue_array[sfq->queue_nr];
704 
705 	/* Get buffer and last position in buffer to resume
706 	 * decoding the auxiliary entries. One buffer might be large
707 	 * and decoding might stop in between. This depends on the time
708 	 * stamp of the trailer entry in each page of the auxiliary
709 	 * data and the time stamp of the event triggering the decoding.
710 	 */
711 	if (sfq->buffer == NULL) {
712 		sfq->buffer = buffer = auxtrace_buffer__next(queue,
713 							     sfq->buffer);
714 		if (!buffer) {
715 			*ts = ~0ULL;
716 			return 1;	/* Processing done on this queue */
717 		}
718 		/* Start with a new buffer on this queue */
719 		if (buffer->data) {
720 			buffer->use_size = buffer->size;
721 			buffer->use_data = buffer->data;
722 		}
723 		if (sfq->logfile) {	/* Write into log file */
724 			size_t rc = fwrite(buffer->data, buffer->size, 1,
725 					   sfq->logfile);
726 			if (rc != 1)
727 				pr_err("Failed to write auxiliary data\n");
728 		}
729 	} else
730 		buffer = sfq->buffer;
731 
732 	if (!buffer->data) {
733 		int fd = perf_data__fd(sfq->sf->session->data);
734 
735 		buffer->data = auxtrace_buffer__get_data(buffer, fd);
736 		if (!buffer->data)
737 			return -ENOMEM;
738 		buffer->use_size = buffer->size;
739 		buffer->use_data = buffer->data;
740 
741 		if (sfq->logfile) {	/* Write into log file */
742 			size_t rc = fwrite(buffer->data, buffer->size, 1,
743 					   sfq->logfile);
744 			if (rc != 1)
745 				pr_err("Failed to write auxiliary data\n");
746 		}
747 	}
748 	pr_debug4("%s queue_nr:%d buffer:%" PRId64 " offset:%#" PRIx64 " size:%#zx rest:%#zx\n",
749 		  __func__, sfq->queue_nr, buffer->buffer_nr, buffer->offset,
750 		  buffer->size, buffer->use_size);
751 	err = s390_cpumsf_samples(sfq, ts);
752 
753 	/* If non-zero, there is either an error (err < 0) or the buffer is
754 	 * completely done (err > 0). The error is unrecoverable, usually
755 	 * some descriptors could not be read successfully, so continue with
756 	 * the next buffer.
757 	 * In both cases the parameter 'ts' has been updated.
758 	 */
759 	if (err) {
760 		sfq->buffer = NULL;
761 		list_del_init(&buffer->list);
762 		auxtrace_buffer__free(buffer);
763 		if (err > 0)		/* Buffer done, no error */
764 			err = 0;
765 	}
766 	return err;
767 }
768 
769 static struct s390_cpumsf_queue *
770 s390_cpumsf_alloc_queue(struct s390_cpumsf *sf, unsigned int queue_nr)
771 {
772 	struct s390_cpumsf_queue *sfq;
773 
774 	sfq = zalloc(sizeof(struct s390_cpumsf_queue));
775 	if (sfq == NULL)
776 		return NULL;
777 
778 	sfq->sf = sf;
779 	sfq->queue_nr = queue_nr;
780 	sfq->cpu = -1;
781 	if (sf->use_logfile) {
782 		char *name;
783 		int rc;
784 
785 		rc = (sf->logdir)
786 			? asprintf(&name, "%s/aux.smp.%02x",
787 				 sf->logdir, queue_nr)
788 			: asprintf(&name, "aux.smp.%02x", queue_nr);
789 		if (rc > 0)
790 			sfq->logfile = fopen(name, "w");
791 		if (sfq->logfile == NULL) {
792 			pr_err("Failed to open auxiliary log file %s,"
793 			       "continue...\n", name);
794 			sf->use_logfile = false;
795 		}
796 		free(name);
797 	}
798 	return sfq;
799 }
800 
801 static int s390_cpumsf_setup_queue(struct s390_cpumsf *sf,
802 				   struct auxtrace_queue *queue,
803 				   unsigned int queue_nr, u64 ts)
804 {
805 	struct s390_cpumsf_queue *sfq = queue->priv;
806 
807 	if (list_empty(&queue->head))
808 		return 0;
809 
810 	if (sfq == NULL) {
811 		sfq = s390_cpumsf_alloc_queue(sf, queue_nr);
812 		if (!sfq)
813 			return -ENOMEM;
814 		queue->priv = sfq;
815 
816 		if (queue->cpu != -1)
817 			sfq->cpu = queue->cpu;
818 	}
819 	return auxtrace_heap__add(&sf->heap, queue_nr, ts);
820 }
821 
822 static int s390_cpumsf_setup_queues(struct s390_cpumsf *sf, u64 ts)
823 {
824 	unsigned int i;
825 	int ret = 0;
826 
827 	for (i = 0; i < sf->queues.nr_queues; i++) {
828 		ret = s390_cpumsf_setup_queue(sf, &sf->queues.queue_array[i],
829 					      i, ts);
830 		if (ret)
831 			break;
832 	}
833 	return ret;
834 }
835 
836 static int s390_cpumsf_update_queues(struct s390_cpumsf *sf, u64 ts)
837 {
838 	if (!sf->queues.new_data)
839 		return 0;
840 
841 	sf->queues.new_data = false;
842 	return s390_cpumsf_setup_queues(sf, ts);
843 }
844 
845 static int s390_cpumsf_process_queues(struct s390_cpumsf *sf, u64 timestamp)
846 {
847 	unsigned int queue_nr;
848 	u64 ts;
849 	int ret;
850 
851 	while (1) {
852 		struct auxtrace_queue *queue;
853 		struct s390_cpumsf_queue *sfq;
854 
855 		if (!sf->heap.heap_cnt)
856 			return 0;
857 
858 		if (sf->heap.heap_array[0].ordinal >= timestamp)
859 			return 0;
860 
861 		queue_nr = sf->heap.heap_array[0].queue_nr;
862 		queue = &sf->queues.queue_array[queue_nr];
863 		sfq = queue->priv;
864 
865 		auxtrace_heap__pop(&sf->heap);
866 		if (sf->heap.heap_cnt) {
867 			ts = sf->heap.heap_array[0].ordinal + 1;
868 			if (ts > timestamp)
869 				ts = timestamp;
870 		} else {
871 			ts = timestamp;
872 		}
873 
874 		ret = s390_cpumsf_run_decoder(sfq, &ts);
875 		if (ret < 0) {
876 			auxtrace_heap__add(&sf->heap, queue_nr, ts);
877 			return ret;
878 		}
879 		if (!ret) {
880 			ret = auxtrace_heap__add(&sf->heap, queue_nr, ts);
881 			if (ret < 0)
882 				return ret;
883 		}
884 	}
885 	return 0;
886 }
887 
888 static int s390_cpumsf_synth_error(struct s390_cpumsf *sf, int code, int cpu,
889 				   pid_t pid, pid_t tid, u64 ip, u64 timestamp)
890 {
891 	char msg[MAX_AUXTRACE_ERROR_MSG];
892 	union perf_event event;
893 	int err;
894 
895 	strncpy(msg, "Lost Auxiliary Trace Buffer", sizeof(msg) - 1);
896 	auxtrace_synth_error(&event.auxtrace_error, PERF_AUXTRACE_ERROR_ITRACE,
897 			     code, cpu, pid, tid, ip, msg, timestamp);
898 
899 	err = perf_session__deliver_synth_event(sf->session, &event, NULL);
900 	if (err)
901 		pr_err("s390 Auxiliary Trace: failed to deliver error event,"
902 			"error %d\n", err);
903 	return err;
904 }
905 
906 static int s390_cpumsf_lost(struct s390_cpumsf *sf, struct perf_sample *sample)
907 {
908 	return s390_cpumsf_synth_error(sf, 1, sample->cpu,
909 				       sample->pid, sample->tid, 0,
910 				       sample->time);
911 }
912 
913 static int
914 s390_cpumsf_process_event(struct perf_session *session,
915 			  union perf_event *event,
916 			  struct perf_sample *sample,
917 			  const struct perf_tool *tool)
918 {
919 	struct s390_cpumsf *sf = container_of(session->auxtrace,
920 					      struct s390_cpumsf,
921 					      auxtrace);
922 	u64 timestamp = sample->time;
923 	struct evsel *ev_bc000;
924 
925 	int err = 0;
926 
927 	if (dump_trace)
928 		return 0;
929 
930 	if (!tool->ordered_events) {
931 		pr_err("s390 Auxiliary Trace requires ordered events\n");
932 		return -EINVAL;
933 	}
934 
935 	if (event->header.type == PERF_RECORD_SAMPLE &&
936 	    sample->raw_size) {
937 		/* Handle event with raw data */
938 		ev_bc000 = evlist__event2evsel(session->evlist, event);
939 		if (ev_bc000 &&
940 		    ev_bc000->core.attr.config == PERF_EVENT_CPUM_CF_DIAG)
941 			err = s390_cpumcf_dumpctr(sf, sample);
942 		return err;
943 	}
944 
945 	if (event->header.type == PERF_RECORD_AUX &&
946 	    event->aux.flags & PERF_AUX_FLAG_TRUNCATED)
947 		return s390_cpumsf_lost(sf, sample);
948 
949 	if (timestamp) {
950 		err = s390_cpumsf_update_queues(sf, timestamp);
951 		if (!err)
952 			err = s390_cpumsf_process_queues(sf, timestamp);
953 	}
954 	return err;
955 }
956 
957 static int
958 s390_cpumsf_process_auxtrace_event(struct perf_session *session,
959 				   union perf_event *event __maybe_unused,
960 				   const struct perf_tool *tool __maybe_unused)
961 {
962 	struct s390_cpumsf *sf = container_of(session->auxtrace,
963 					      struct s390_cpumsf,
964 					      auxtrace);
965 
966 	int fd = perf_data__fd(session->data);
967 	struct auxtrace_buffer *buffer;
968 	off_t data_offset;
969 	int err;
970 
971 	if (sf->data_queued)
972 		return 0;
973 
974 	if (perf_data__is_pipe(session->data)) {
975 		data_offset = 0;
976 	} else {
977 		data_offset = lseek(fd, 0, SEEK_CUR);
978 		if (data_offset == -1)
979 			return -errno;
980 	}
981 
982 	err = auxtrace_queues__add_event(&sf->queues, session, event,
983 					 data_offset, &buffer);
984 	if (err)
985 		return err;
986 
987 	/* Dump here after copying piped trace out of the pipe */
988 	if (dump_trace) {
989 		if (auxtrace_buffer__get_data(buffer, fd)) {
990 			s390_cpumsf_dump_event(sf, buffer->data,
991 					       buffer->size);
992 			auxtrace_buffer__put_data(buffer);
993 		}
994 	}
995 	return 0;
996 }
997 
998 static void s390_cpumsf_free_events(struct perf_session *session __maybe_unused)
999 {
1000 }
1001 
1002 static int s390_cpumsf_flush(struct perf_session *session __maybe_unused,
1003 			     const struct perf_tool *tool __maybe_unused)
1004 {
1005 	return 0;
1006 }
1007 
1008 static void s390_cpumsf_free_queues(struct perf_session *session)
1009 {
1010 	struct s390_cpumsf *sf = container_of(session->auxtrace,
1011 					      struct s390_cpumsf,
1012 					      auxtrace);
1013 	struct auxtrace_queues *queues = &sf->queues;
1014 	unsigned int i;
1015 
1016 	for (i = 0; i < queues->nr_queues; i++) {
1017 		struct s390_cpumsf_queue *sfq = (struct s390_cpumsf_queue *)
1018 						queues->queue_array[i].priv;
1019 
1020 		if (sfq != NULL) {
1021 			if (sfq->logfile) {
1022 				fclose(sfq->logfile);
1023 				sfq->logfile = NULL;
1024 			}
1025 			if (sfq->logfile_ctr) {
1026 				fclose(sfq->logfile_ctr);
1027 				sfq->logfile_ctr = NULL;
1028 			}
1029 		}
1030 		zfree(&queues->queue_array[i].priv);
1031 	}
1032 	auxtrace_queues__free(queues);
1033 }
1034 
1035 static void s390_cpumsf_free(struct perf_session *session)
1036 {
1037 	struct s390_cpumsf *sf = container_of(session->auxtrace,
1038 					      struct s390_cpumsf,
1039 					      auxtrace);
1040 
1041 	auxtrace_heap__free(&sf->heap);
1042 	s390_cpumsf_free_queues(session);
1043 	session->auxtrace = NULL;
1044 	zfree(&sf->logdir);
1045 	free(sf);
1046 }
1047 
1048 static bool
1049 s390_cpumsf_evsel_is_auxtrace(struct perf_session *session __maybe_unused,
1050 			      struct evsel *evsel)
1051 {
1052 	return evsel->core.attr.type == PERF_TYPE_RAW &&
1053 	       evsel->core.attr.config == PERF_EVENT_CPUM_SF_DIAG;
1054 }
1055 
1056 static int s390_cpumsf_get_type(const char *cpuid)
1057 {
1058 	int ret, family = 0;
1059 
1060 	ret = sscanf(cpuid, "%*[^,],%u", &family);
1061 	return (ret == 1) ? family : 0;
1062 }
1063 
1064 /* Check itrace options set on perf report command.
1065  * Return true, if none are set or all options specified can be
1066  * handled on s390 (currently only option 'd' for logging.
1067  * Return false otherwise.
1068  */
1069 static bool check_auxtrace_itrace(struct itrace_synth_opts *itops)
1070 {
1071 	bool ison = false;
1072 
1073 	if (!itops || !itops->set)
1074 		return true;
1075 	ison = itops->inject || itops->instructions || itops->branches ||
1076 		itops->transactions || itops->ptwrites ||
1077 		itops->pwr_events || itops->errors ||
1078 		itops->dont_decode || itops->calls || itops->returns ||
1079 		itops->callchain || itops->thread_stack ||
1080 		itops->last_branch || itops->add_callchain ||
1081 		itops->add_last_branch;
1082 	if (!ison)
1083 		return true;
1084 	pr_err("Unsupported --itrace options specified\n");
1085 	return false;
1086 }
1087 
1088 /* Check for AUXTRACE dump directory if it is needed.
1089  * On failure print an error message but continue.
1090  * Return 0 on wrong keyword in config file and 1 otherwise.
1091  */
1092 static int s390_cpumsf__config(const char *var, const char *value, void *cb)
1093 {
1094 	struct s390_cpumsf *sf = cb;
1095 	struct stat stbuf;
1096 	int rc;
1097 
1098 	if (strcmp(var, "auxtrace.dumpdir"))
1099 		return 0;
1100 	sf->logdir = strdup(value);
1101 	if (sf->logdir == NULL) {
1102 		pr_err("Failed to find auxtrace log directory %s,"
1103 		       " continue with current directory...\n", value);
1104 		return 1;
1105 	}
1106 	rc = stat(sf->logdir, &stbuf);
1107 	if (rc == -1 || !S_ISDIR(stbuf.st_mode)) {
1108 		pr_err("Missing auxtrace log directory %s,"
1109 		       " continue with current directory...\n", value);
1110 		zfree(&sf->logdir);
1111 	}
1112 	return 1;
1113 }
1114 
1115 int s390_cpumsf_process_auxtrace_info(union perf_event *event,
1116 				      struct perf_session *session)
1117 {
1118 	struct perf_record_auxtrace_info *auxtrace_info = &event->auxtrace_info;
1119 	struct s390_cpumsf *sf;
1120 	int err;
1121 
1122 	if (auxtrace_info->header.size < sizeof(struct perf_record_auxtrace_info))
1123 		return -EINVAL;
1124 
1125 	sf = zalloc(sizeof(struct s390_cpumsf));
1126 	if (sf == NULL)
1127 		return -ENOMEM;
1128 
1129 	if (!check_auxtrace_itrace(session->itrace_synth_opts)) {
1130 		err = -EINVAL;
1131 		goto err_free;
1132 	}
1133 	sf->use_logfile = session->itrace_synth_opts->log;
1134 	if (sf->use_logfile)
1135 		perf_config(s390_cpumsf__config, sf);
1136 
1137 	err = auxtrace_queues__init(&sf->queues);
1138 	if (err)
1139 		goto err_free;
1140 
1141 	sf->session = session;
1142 	sf->machine = &session->machines.host; /* No kvm support */
1143 	sf->auxtrace_type = auxtrace_info->type;
1144 	sf->pmu_type = PERF_TYPE_RAW;
1145 	sf->machine_type = s390_cpumsf_get_type(perf_session__env(session)->cpuid);
1146 
1147 	sf->auxtrace.process_event = s390_cpumsf_process_event;
1148 	sf->auxtrace.process_auxtrace_event = s390_cpumsf_process_auxtrace_event;
1149 	sf->auxtrace.flush_events = s390_cpumsf_flush;
1150 	sf->auxtrace.free_events = s390_cpumsf_free_events;
1151 	sf->auxtrace.free = s390_cpumsf_free;
1152 	sf->auxtrace.evsel_is_auxtrace = s390_cpumsf_evsel_is_auxtrace;
1153 	session->auxtrace = &sf->auxtrace;
1154 
1155 	if (dump_trace)
1156 		return 0;
1157 
1158 	err = auxtrace_queues__process_index(&sf->queues, session);
1159 	if (err)
1160 		goto err_free_queues;
1161 
1162 	if (sf->queues.populated)
1163 		sf->data_queued = true;
1164 
1165 	return 0;
1166 
1167 err_free_queues:
1168 	auxtrace_queues__free(&sf->queues);
1169 	session->auxtrace = NULL;
1170 err_free:
1171 	zfree(&sf->logdir);
1172 	free(sf);
1173 	return err;
1174 }
1175