// SPDX-License-Identifier: GPL-2.0-only
/*
 * Benchmarking code execution time inside the kernel
 *
 * Copyright (C) 2014, Red Hat, Inc., Jesper Dangaard Brouer
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/time.h>

#include <linux/perf_event.h> /* perf_event_create_kernel_counter() */

/* For concurrency testing */
#include <linux/completion.h>
#include <linux/sched.h>
#include <linux/workqueue.h>
#include <linux/kthread.h>

#include "time_bench.h"

static int verbose = 1;

/** TSC (Time-Stamp Counter) based **
 * See: linux/time_bench.h
 *  tsc_start_clock() and tsc_stop_clock()
 */
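
/* Example: a minimal sketch of how a TSC read is typically implemented on
 * x86 (the real helpers named above live in time_bench.h; the function
 * name below is only illustrative and the serialization details, e.g.
 * lfence or rdtscp, may differ):
 *
 *	static __always_inline uint64_t example_rdtsc(void)
 *	{
 *		uint32_t low, high;
 *
 *		// RDTSC returns the 64-bit counter split across EDX:EAX
 *		asm volatile("rdtsc" : "=a" (low), "=d" (high));
 *		return low | ((uint64_t)high << 32);
 *	}
 */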

/** Wall-clock based **
 */

/** PMU (Performance Monitor Unit) based **
 */
#define PERF_FORMAT                                                            \
	(PERF_FORMAT_GROUP | PERF_FORMAT_ID | PERF_FORMAT_TOTAL_TIME_ENABLED | \
	 PERF_FORMAT_TOTAL_TIME_RUNNING)

struct raw_perf_event {
	uint64_t config; /* event */
	uint64_t config1; /* umask */
	struct perf_event *save;
	char *desc;
};

/* If HT is enabled, a maximum of 4 events (5 if one is instructions
 * retired) can be specified; if HT is disabled, a maximum of 8 (9 if
 * one is instructions retired) can be specified.
 *
 * From Table 19-1. Architectural Performance Events, Intel 64 and IA-32
 * Architectures Software Developer's Manual, Volume 3: System Programming
 * Guide
 */
struct raw_perf_event perf_events[] = {
	{ 0x3c, 0x00, NULL, "Unhalted CPU Cycles" },
	{ 0xc0, 0x00, NULL, "Instruction Retired" }
};

#define NUM_EVTS (ARRAY_SIZE(perf_events))
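
/* Example: further architectural events from the same SDM table could be
 * added as extra rows; the encodings below are believed to match the
 * architectural "Branch Instruction Retired" and "Branch Misses Retired"
 * events (verify against the SDM before relying on them).  NUM_EVTS
 * follows automatically via ARRAY_SIZE():
 *
 *	{ 0xc4, 0x00, NULL, "Branch Instruction Retired" },
 *	{ 0xc5, 0x00, NULL, "Branch Misses Retired" },
 */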

/* WARNING: PMU config is currently broken!
 */
bool time_bench_PMU_config(bool enable)
{
	int i;
	struct perf_event_attr perf_conf;
	struct perf_event *perf_event;
	int cpu;

	preempt_disable();
	cpu = smp_processor_id();
	pr_info("DEBUG: cpu:%d\n", cpu);
	preempt_enable();

	memset(&perf_conf, 0, sizeof(struct perf_event_attr));
	perf_conf.type           = PERF_TYPE_RAW;
	perf_conf.size           = sizeof(struct perf_event_attr);
	perf_conf.read_format    = PERF_FORMAT;
	perf_conf.pinned         = 1;
	perf_conf.exclude_user   = 1; /* No userspace events */
	perf_conf.exclude_kernel = 0; /* Only kernel events */

	for (i = 0; i < NUM_EVTS; i++) {
		perf_conf.disabled = enable;
		//perf_conf.disabled = (i == 0) ? 1 : 0;
		perf_conf.config   = perf_events[i].config;
		perf_conf.config1  = perf_events[i].config1;
		if (verbose)
			pr_info("%s() enable PMU counter: %s\n",
				__func__, perf_events[i].desc);
		perf_event = perf_event_create_kernel_counter(&perf_conf, cpu,
							      NULL /* task */,
							      NULL /* overflow_handler */,
							      NULL /* context */);
		if (perf_event) {
			perf_events[i].save = perf_event;
			pr_info("%s():DEBUG perf_event success\n", __func__);

			perf_event_enable(perf_event);
		} else {
			pr_info("%s():DEBUG perf_event is NULL\n", __func__);
		}
	}

	return true;
}
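
/* Sketch: with the counters created above saved in perf_events[i].save, a
 * start/stop pair could sample them via perf_event_read_value() and fill
 * the pmc_* start/stop fields consumed by time_bench_calc_stats().  Only
 * an outline (the PMU path is marked broken above); the helper name is
 * illustrative and error/overflow handling is omitted:
 *
 *	static void example_pmu_sample(u64 *clk, u64 *inst)
 *	{
 *		u64 enabled, running;
 *
 *		*clk  = perf_event_read_value(perf_events[0].save,
 *					      &enabled, &running);
 *		*inst = perf_event_read_value(perf_events[1].save,
 *					      &enabled, &running);
 *	}
 */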

/** Generic functions **
 */

/* Calculate stats, store results in record */
bool time_bench_calc_stats(struct time_bench_record *rec)
{
#define NANOSEC_PER_SEC 1000000000 /* 10^9 */
	uint64_t ns_per_call_tmp_rem = 0;
	uint32_t ns_per_call_remainder = 0;
	uint64_t pmc_ipc_tmp_rem = 0;
	uint32_t pmc_ipc_remainder = 0;
	uint32_t pmc_ipc_div = 0;
	uint32_t invoked_cnt_precision = 0;
	uint32_t invoked_cnt = 0; /* 32-bit due to div_u64_rem() */

	if (rec->flags & TIME_BENCH_LOOP) {
		if (rec->invoked_cnt < 1000) {
			pr_err("ERR: need more(>1000) loops(%llu) for timing\n",
			       rec->invoked_cnt);
			return false;
		}
		if (rec->invoked_cnt > ((1ULL << 32) - 1)) {
			/* div_u64_rem() only supports a 32-bit divisor */
			pr_err("ERR: Invoke cnt(%llu) too big overflow 32bit\n",
			       rec->invoked_cnt);
			return false;
		}
		invoked_cnt = (uint32_t)rec->invoked_cnt;
	}

	/* TSC (Time-Stamp Counter) records */
	if (rec->flags & TIME_BENCH_TSC) {
		rec->tsc_interval = rec->tsc_stop - rec->tsc_start;
		if (rec->tsc_interval == 0) {
			pr_err("ABORT: timing took ZERO TSC time\n");
			return false;
		}
		/* Calculate stats */
		if (rec->flags & TIME_BENCH_LOOP)
			rec->tsc_cycles = rec->tsc_interval / invoked_cnt;
		else
			rec->tsc_cycles = rec->tsc_interval;
	}

	/* Wall-clock time calc */
	if (rec->flags & TIME_BENCH_WALLCLOCK) {
		rec->time_start = rec->ts_start.tv_nsec +
				  (NANOSEC_PER_SEC * rec->ts_start.tv_sec);
		rec->time_stop = rec->ts_stop.tv_nsec +
				 (NANOSEC_PER_SEC * rec->ts_stop.tv_sec);
		rec->time_interval = rec->time_stop - rec->time_start;
		if (rec->time_interval == 0) {
			pr_err("ABORT: timing took ZERO wallclock time\n");
			return false;
		}
		/* Calculate stats */
		/*** Division in the kernel is tricky ***/
		/* Orig: time_sec = (time_interval / NANOSEC_PER_SEC); */
		/* remainder only correct because NANOSEC_PER_SEC is 10^9 */
		rec->time_sec = div_u64_rem(rec->time_interval, NANOSEC_PER_SEC,
					    &rec->time_sec_remainder);
		//TODO: use existing struct timespec records instead of div?

		if (rec->flags & TIME_BENCH_LOOP) {
			/*** Division in the kernel is tricky ***/
			/* Orig: ns = ((double)time_interval / invoked_cnt); */
			/* First get quotient */
			rec->ns_per_call_quotient =
				div_u64_rem(rec->time_interval, invoked_cnt,
					    &ns_per_call_remainder);
			/* Now get the .xxx decimal precision (truncated, no round-up) */
			ns_per_call_tmp_rem = ns_per_call_remainder;
			invoked_cnt_precision = invoked_cnt / 1000;
			if (invoked_cnt_precision > 0) {
				rec->ns_per_call_decimal =
					div_u64_rem(ns_per_call_tmp_rem,
						    invoked_cnt_precision,
						    &ns_per_call_remainder);
			}
		}
	}

	/* Performance Monitor Unit (PMU) counters */
	if (rec->flags & TIME_BENCH_PMU) {
		//FIXME: Overflow handling???
		rec->pmc_inst = rec->pmc_inst_stop - rec->pmc_inst_start;
		rec->pmc_clk = rec->pmc_clk_stop - rec->pmc_clk_start;

		/* Calc Instruction Per Cycle (IPC) */
		/* First get quotient */
		rec->pmc_ipc_quotient = div_u64_rem(rec->pmc_inst, rec->pmc_clk,
						    &pmc_ipc_remainder);
		/* Now get the .xxx decimal precision (truncated, no round-up) */
		pmc_ipc_tmp_rem = pmc_ipc_remainder;
		pmc_ipc_div = rec->pmc_clk / 1000;
		if (pmc_ipc_div > 0) {
			rec->pmc_ipc_decimal = div_u64_rem(pmc_ipc_tmp_rem,
							   pmc_ipc_div,
							   &pmc_ipc_remainder);
		}
	}

	return true;
}
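
/* Worked example of the .xxx precision scheme above (made-up numbers):
 * time_interval = 3141592653 ns and invoked_cnt = 1000000 gives
 * ns_per_call_quotient = 3141 with remainder 592653.  Dividing that
 * remainder by invoked_cnt / 1000 = 1000 gives ns_per_call_decimal = 592,
 * printed as "3141.592 ns" (truncated rather than rounded, as the comments
 * above note).
 */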

/* Generic function for invoking a loop function and calculating
 * execution time stats.  The function being called/timed is assumed
 * to perform a tight loop, and update the timing record struct.
 */
bool time_bench_loop(uint32_t loops, int step, char *txt, void *data,
		     int (*func)(struct time_bench_record *record, void *data))
{
	struct time_bench_record rec;

	/* Setup record */
	memset(&rec, 0, sizeof(rec)); /* zero it; func might not update all fields */
	rec.version_abi = 1;
	rec.loops       = loops;
	rec.step        = step;
	rec.flags       = (TIME_BENCH_LOOP | TIME_BENCH_TSC | TIME_BENCH_WALLCLOCK);

	/*** Loop function being timed ***/
	if (!func(&rec, data)) {
		pr_err("ABORT: function being timed failed\n");
		return false;
	}

	if (rec.invoked_cnt < loops)
		pr_warn("WARNING: Invoke count(%llu) smaller than loops(%d)\n",
			rec.invoked_cnt, loops);

	/* Calculate stats */
	time_bench_calc_stats(&rec);

	pr_info("Type:%s Per elem: %llu cycles(tsc) %llu.%03llu ns (step:%d) - (measurement period time:%llu.%09u sec time_interval:%llu) - (invoke count:%llu tsc_interval:%llu)\n",
		txt, rec.tsc_cycles, rec.ns_per_call_quotient,
		rec.ns_per_call_decimal, rec.step, rec.time_sec,
		rec.time_sec_remainder, rec.time_interval, rec.invoked_cnt,
		rec.tsc_interval);
	if (rec.flags & TIME_BENCH_PMU)
		pr_info("Type:%s PMU inst/clock: %llu/%llu = %llu.%03llu IPC (inst per cycle)\n",
			txt, rec.pmc_inst, rec.pmc_clk, rec.pmc_ipc_quotient,
			rec.pmc_ipc_decimal);
	return true;
}
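
/* Example: a minimal sketch of a caller and loop function (the names are
 * illustrative, not part of this file).  The loop function is expected to
 * bracket its tight loop with the start/stop helpers from time_bench.h
 * (see tsc_start_clock()/tsc_stop_clock() above) and to set
 * rec->invoked_cnt before returning:
 *
 *	static int example_bench_nop(struct time_bench_record *rec, void *data)
 *	{
 *		uint64_t i;
 *
 *		// record start timestamps here (helpers from time_bench.h)
 *		for (i = 0; i < rec->loops; i++)
 *			barrier(); // work under test goes here
 *		// record stop timestamps here
 *		rec->invoked_cnt = i;
 *		return 1; // non-zero means success, see the !func() check
 *	}
 *
 *	// e.g. from a module init function:
 *	time_bench_loop(10000000, 0, "example-nop", NULL, example_bench_nop);
 */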

/* Function getting invoked by kthread */
static int invoke_test_on_cpu_func(void *private)
{
	struct time_bench_cpu *cpu = private;
	struct time_bench_sync *sync = cpu->sync;
	cpumask_t newmask = CPU_MASK_NONE;
	void *data = cpu->data;

	/* Restrict CPU */
	cpumask_set_cpu(cpu->rec.cpu, &newmask);
	set_cpus_allowed_ptr(current, &newmask);

	/* Synchronize start of concurrency test */
	atomic_inc(&sync->nr_tests_running);
	wait_for_completion(&sync->start_event);

	/* Start benchmark function */
	if (!cpu->bench_func(&cpu->rec, data)) {
		pr_err("ERROR: function being timed failed on CPU:%d(%d)\n",
		       cpu->rec.cpu, smp_processor_id());
	} else {
		if (verbose)
			pr_info("SUCCESS: ran on CPU:%d(%d)\n", cpu->rec.cpu,
				smp_processor_id());
	}
	cpu->did_bench_run = true;

	/* End test */
	atomic_dec(&sync->nr_tests_running);
	/* Wait for kthread_stop() telling us to stop */
	while (!kthread_should_stop()) {
		set_current_state(TASK_INTERRUPTIBLE);
		schedule();
	}
	__set_current_state(TASK_RUNNING);
	return 0;
}

void time_bench_print_stats_cpumask(const char *desc,
				    struct time_bench_cpu *cpu_tasks,
				    const struct cpumask *mask)
{
	uint64_t average = 0;
	int cpu;
	int step = 0;
	struct sum {
		uint64_t tsc_cycles;
		int records;
	} sum = { 0 };

	/* Get stats */
	for_each_cpu(cpu, mask) {
		struct time_bench_cpu *c = &cpu_tasks[cpu];
		struct time_bench_record *rec = &c->rec;

		/* Calculate stats */
		time_bench_calc_stats(rec);

		pr_info("Type:%s CPU(%d) %llu cycles(tsc) %llu.%03llu ns (step:%d) - (measurement period time:%llu.%09u sec time_interval:%llu) - (invoke count:%llu tsc_interval:%llu)\n",
			desc, cpu, rec->tsc_cycles, rec->ns_per_call_quotient,
			rec->ns_per_call_decimal, rec->step, rec->time_sec,
			rec->time_sec_remainder, rec->time_interval,
			rec->invoked_cnt, rec->tsc_interval);

		/* Collect average */
		sum.records++;
		sum.tsc_cycles += rec->tsc_cycles;
		step = rec->step;
	}

	if (sum.records) /* avoid div-by-zero */
		average = sum.tsc_cycles / sum.records;
	pr_info("Sum Type:%s Average: %llu cycles(tsc) CPUs:%d step:%d\n", desc,
		average, sum.records, step);
}

void time_bench_run_concurrent(uint32_t loops, int step, void *data,
			       const struct cpumask *mask, /* Support masking out some CPUs */
			       struct time_bench_sync *sync,
			       struct time_bench_cpu *cpu_tasks,
			       int (*func)(struct time_bench_record *record, void *data))
{
	int cpu, running = 0;

	if (verbose) // DEBUG
		pr_warn("%s() Started on CPU:%d\n", __func__,
			smp_processor_id());

	/* Reset sync conditions */
	atomic_set(&sync->nr_tests_running, 0);
	init_completion(&sync->start_event);

	/* Spawn off jobs on all CPUs */
	for_each_cpu(cpu, mask) {
		struct time_bench_cpu *c = &cpu_tasks[cpu];

		running++;
		c->sync = sync; /* Send sync variable along */
		c->data = data; /* Send opaque along */

		/* Init benchmark record */
		memset(&c->rec, 0, sizeof(struct time_bench_record));
		c->rec.version_abi = 1;
		c->rec.loops       = loops;
		c->rec.step        = step;
		c->rec.flags       = (TIME_BENCH_LOOP | TIME_BENCH_TSC |
				      TIME_BENCH_WALLCLOCK);
		c->rec.cpu = cpu;
		c->bench_func = func;
		c->task = kthread_run(invoke_test_on_cpu_func, c,
				      "time_bench%d", cpu);
		if (IS_ERR(c->task)) {
			pr_err("%s(): Failed to start test func\n", __func__);
			return; /* Argh, what about cleanup?! */
		}
	}

	/* Wait until all processes are running */
	while (atomic_read(&sync->nr_tests_running) < running) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		schedule_timeout(10);
	}
	/* Kick off all CPUs concurrently on the completion event */
	complete_all(&sync->start_event);

	/* Wait for CPUs to finish */
	while (atomic_read(&sync->nr_tests_running)) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		schedule_timeout(10);
	}

	/* Stop the kthreads */
	for_each_cpu(cpu, mask) {
		struct time_bench_cpu *c = &cpu_tasks[cpu];

		kthread_stop(c->task);
	}

	if (verbose) // DEBUG - happens often, finishes on another CPU
		pr_warn("%s() Finished on CPU:%d\n", __func__,
			smp_processor_id());
}
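
/* Example: a minimal sketch of running one loop function concurrently on
 * all online CPUs and summarizing the per-CPU results.  The allocation
 * scheme and example_bench_nop() are illustrative, not part of this file:
 *
 *	static struct time_bench_sync sync;
 *
 *	// e.g. from a module init function:
 *	struct time_bench_cpu *cpu_tasks;
 *
 *	cpu_tasks = kcalloc(nr_cpu_ids, sizeof(*cpu_tasks), GFP_KERNEL);
 *	if (!cpu_tasks)
 *		return -ENOMEM;
 *	time_bench_run_concurrent(10000000, 0, NULL, cpu_online_mask,
 *				  &sync, cpu_tasks, example_bench_nop);
 *	time_bench_print_stats_cpumask("example-nop", cpu_tasks,
 *				       cpu_online_mask);
 *	kfree(cpu_tasks);
 */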
395