xref: /linux/arch/s390/kernel/hiperdispatch.c (revision 5afca7e996c42aed1b4a42d4712817601ba42aff)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright IBM Corp. 2024
4  */
5 
6 #define KMSG_COMPONENT "hd"
7 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
8 
9 /*
10  * Hiperdispatch:
11  * Dynamically calculates the optimum number of high capacity COREs
12  * by considering the state the system is in. When hiperdispatch decides
13  * that a capacity update is necessary, it schedules a topology update.
14  * During topology updates the CPU capacities are always re-adjusted.
15  *
16  * There are two places where CPU capacities are being accessed within
17  * hiperdispatch.
18  * -> hiperdispatch's recurring work function reads CPU capacities to
19  *    determine high capacity CPU count.
20  * -> during a topology update hiperdispatch's adjustment function
21  *    updates CPU capacities.
22  * These two can run on different CPUs in parallel which can cause
23  * hiperdispatch to make wrong decisions. This can potentially cause
24  * some overhead by leading to extra rebuild_sched_domains() calls
25  * for correction. Access to capacities within hiperdispatch has to be
26  * serialized to prevent the overhead.
27  *
28  * Hiperdispatch decision making revolves around steal time.
29  * HD_STEAL_THRESHOLD value is taken as reference. Whenever steal time
30  * crosses the threshold value hiperdispatch falls back to giving high
31  * capacities to entitled CPUs. When steal time drops below the
32  * threshold boundary, hiperdispatch utilizes all CPUs by giving all
33  * of them high capacity.
34  *
35  * The theory behind HD_STEAL_THRESHOLD is related to the SMP thread
36  * performance. Comparing the throughput of:
37  * - single CORE, with N threads, running N tasks
38  * - N separate COREs running N tasks,
39  * using individual COREs for individual tasks yields better
40  * performance. This performance difference is roughly 30% (can change
41  * between machine generations).
42  *
43  * Hiperdispatch tries to hint the scheduler to use individual COREs for
44  * each task, as long as steal time on those COREs is less than 30%,
45  * therefore delaying the throughput loss caused by using SMP threads.
46  */
47 
48 #include <linux/cpumask.h>
49 #include <linux/debugfs.h>
50 #include <linux/device.h>
51 #include <linux/kernel_stat.h>
52 #include <linux/kstrtox.h>
53 #include <linux/ktime.h>
54 #include <linux/sysctl.h>
55 #include <linux/types.h>
56 #include <linux/workqueue.h>
57 #include <asm/hiperdispatch.h>
58 #include <asm/setup.h>
59 #include <asm/smp.h>
60 #include <asm/topology.h>
61 
62 #define CREATE_TRACE_POINTS
63 #include <asm/trace/hiperdispatch.h>
64 
/*
 * Defaults and timing constants:
 * HD_DELAY_FACTOR     - default for hd_delay_factor, the multiplier applied
 *                       to HD_DELAY_INTERVAL for the first run of the
 *                       capacity work after hiperdispatch got enabled
 * HD_DELAY_INTERVAL   - period of the capacity work in jiffies (HZ / 4)
 * HD_STEAL_THRESHOLD  - default for hd_steal_threshold, in percent
 * HD_STEAL_AVG_WEIGHT - weight used by the steal time running average
 */
#define HD_DELAY_FACTOR			(4)
#define HD_DELAY_INTERVAL		(HZ / 4)
#define HD_STEAL_THRESHOLD		30
#define HD_STEAL_AVG_WEIGHT		16
69 
static cpumask_t hd_vl_coremask;	/* Mask containing all vertical low COREs */
static cpumask_t hd_vmvl_cpumask;	/* Mask containing vertical medium and low CPUs */
static int hd_high_capacity_cores;	/* Current CORE count with high capacity */
static int hd_entitled_cores;		/* Total vertical high and medium CORE count */
static int hd_online_cores;		/* Current online CORE count */

static unsigned long hd_previous_steal;	/* Previous iteration's CPU steal timer total */
static unsigned long hd_high_time;	/* Total time spent while all cpus have high capacity */
static unsigned long hd_low_time;	/* Total time spent while vl cpus have low capacity */
static atomic64_t hd_adjustments;	/* Total occurrence count of hiperdispatch adjustments */

/* Steal time percentage at which the work function falls back to entitled COREs only */
static unsigned int hd_steal_threshold = HD_STEAL_THRESHOLD;
/* Multiplier of HD_DELAY_INTERVAL used for the first work run after enabling */
static unsigned int hd_delay_factor = HD_DELAY_FACTOR;
/* Non-zero while hiperdispatch is requested; changed via hd_set_hiperdispatch_mode() */
static int hd_enabled;

/* Recurring work that re-evaluates the high capacity CORE count */
static void hd_capacity_work_fn(struct work_struct *work);
static DECLARE_DELAYED_WORK(hd_capacity_work, hd_capacity_work_fn);
87 
88 static int hd_set_hiperdispatch_mode(int enable)
89 {
90 	if (!MACHINE_HAS_TOPOLOGY)
91 		enable = 0;
92 	if (hd_enabled == enable)
93 		return 0;
94 	hd_enabled = enable;
95 	return 1;
96 }
97 
98 void hd_reset_state(void)
99 {
100 	cpumask_clear(&hd_vl_coremask);
101 	cpumask_clear(&hd_vmvl_cpumask);
102 	hd_entitled_cores = 0;
103 	hd_online_cores = 0;
104 }
105 
106 void hd_add_core(int cpu)
107 {
108 	const struct cpumask *siblings;
109 	int polarization;
110 
111 	hd_online_cores++;
112 	polarization = smp_cpu_get_polarization(cpu);
113 	siblings = topology_sibling_cpumask(cpu);
114 	switch (polarization) {
115 	case POLARIZATION_VH:
116 		hd_entitled_cores++;
117 		break;
118 	case POLARIZATION_VM:
119 		hd_entitled_cores++;
120 		cpumask_or(&hd_vmvl_cpumask, &hd_vmvl_cpumask, siblings);
121 		break;
122 	case POLARIZATION_VL:
123 		cpumask_set_cpu(cpu, &hd_vl_coremask);
124 		cpumask_or(&hd_vmvl_cpumask, &hd_vmvl_cpumask, siblings);
125 		break;
126 	}
127 }
128 
129 /* Serialize update and read operations of debug counters. */
130 static DEFINE_MUTEX(hd_counter_mutex);
131 
132 static void hd_update_times(void)
133 {
134 	static ktime_t prev;
135 	ktime_t now;
136 
137 	/*
138 	 * Check if hiperdispatch is active, if not set the prev to 0.
139 	 * This way it is possible to differentiate the first update iteration after
140 	 * enabling hiperdispatch.
141 	 */
142 	if (hd_entitled_cores == 0 || hd_enabled == 0) {
143 		prev = ktime_set(0, 0);
144 		return;
145 	}
146 	now = ktime_get();
147 	if (ktime_after(prev, 0)) {
148 		if (hd_high_capacity_cores == hd_online_cores)
149 			hd_high_time += ktime_ms_delta(now, prev);
150 		else
151 			hd_low_time += ktime_ms_delta(now, prev);
152 	}
153 	prev = now;
154 }
155 
156 static void hd_update_capacities(void)
157 {
158 	int cpu, upscaling_cores;
159 	unsigned long capacity;
160 
161 	upscaling_cores = hd_high_capacity_cores - hd_entitled_cores;
162 	capacity = upscaling_cores > 0 ? CPU_CAPACITY_HIGH : CPU_CAPACITY_LOW;
163 	hd_high_capacity_cores = hd_entitled_cores;
164 	for_each_cpu(cpu, &hd_vl_coremask) {
165 		smp_set_core_capacity(cpu, capacity);
166 		if (capacity != CPU_CAPACITY_HIGH)
167 			continue;
168 		hd_high_capacity_cores++;
169 		upscaling_cores--;
170 		if (upscaling_cores == 0)
171 			capacity = CPU_CAPACITY_LOW;
172 	}
173 }
174 
175 void hd_disable_hiperdispatch(void)
176 {
177 	cancel_delayed_work_sync(&hd_capacity_work);
178 	hd_high_capacity_cores = hd_online_cores;
179 	hd_previous_steal = 0;
180 }
181 
182 int hd_enable_hiperdispatch(void)
183 {
184 	mutex_lock(&hd_counter_mutex);
185 	hd_update_times();
186 	mutex_unlock(&hd_counter_mutex);
187 	if (hd_enabled == 0)
188 		return 0;
189 	if (hd_entitled_cores == 0)
190 		return 0;
191 	if (hd_online_cores <= hd_entitled_cores)
192 		return 0;
193 	mod_delayed_work(system_wq, &hd_capacity_work, HD_DELAY_INTERVAL * hd_delay_factor);
194 	hd_update_capacities();
195 	return 1;
196 }
197 
198 static unsigned long hd_steal_avg(unsigned long new)
199 {
200 	static unsigned long steal;
201 
202 	steal = (steal * (HD_STEAL_AVG_WEIGHT - 1) + new) / HD_STEAL_AVG_WEIGHT;
203 	return steal;
204 }
205 
206 static unsigned long hd_calculate_steal_percentage(void)
207 {
208 	unsigned long time_delta, steal_delta, steal, percentage;
209 	static ktime_t prev;
210 	int cpus, cpu;
211 	ktime_t now;
212 
213 	cpus = 0;
214 	steal = 0;
215 	percentage = 0;
216 	for_each_cpu(cpu, &hd_vmvl_cpumask) {
217 		steal += kcpustat_cpu(cpu).cpustat[CPUTIME_STEAL];
218 		cpus++;
219 	}
220 	/*
221 	 * If there is no vertical medium and low CPUs steal time
222 	 * is 0 as vertical high CPUs shouldn't experience steal time.
223 	 */
224 	if (cpus == 0)
225 		return percentage;
226 	now = ktime_get();
227 	time_delta = ktime_to_ns(ktime_sub(now, prev));
228 	if (steal > hd_previous_steal && hd_previous_steal != 0) {
229 		steal_delta = (steal - hd_previous_steal) * 100 / time_delta;
230 		percentage = steal_delta / cpus;
231 	}
232 	hd_previous_steal = steal;
233 	prev = now;
234 	return percentage;
235 }
236 
237 static void hd_capacity_work_fn(struct work_struct *work)
238 {
239 	unsigned long steal_percentage, new_cores;
240 
241 	mutex_lock(&smp_cpu_state_mutex);
242 	/*
243 	 * If online cores are less or equal to entitled cores hiperdispatch
244 	 * does not need to make any adjustments, call a topology update to
245 	 * disable hiperdispatch.
246 	 * Normally this check is handled on topology update, but during cpu
247 	 * unhotplug, topology and cpu mask updates are done in reverse
248 	 * order, causing hd_enable_hiperdispatch() to get stale data.
249 	 */
250 	if (hd_online_cores <= hd_entitled_cores) {
251 		topology_schedule_update();
252 		mutex_unlock(&smp_cpu_state_mutex);
253 		return;
254 	}
255 	steal_percentage = hd_steal_avg(hd_calculate_steal_percentage());
256 	if (steal_percentage < hd_steal_threshold)
257 		new_cores = hd_online_cores;
258 	else
259 		new_cores = hd_entitled_cores;
260 	if (hd_high_capacity_cores != new_cores) {
261 		trace_s390_hd_rebuild_domains(hd_high_capacity_cores, new_cores);
262 		hd_high_capacity_cores = new_cores;
263 		atomic64_inc(&hd_adjustments);
264 		topology_schedule_update();
265 	}
266 	trace_s390_hd_work_fn(steal_percentage, hd_entitled_cores, hd_high_capacity_cores);
267 	mutex_unlock(&smp_cpu_state_mutex);
268 	schedule_delayed_work(&hd_capacity_work, HD_DELAY_INTERVAL);
269 }
270 
271 static int hiperdispatch_ctl_handler(const struct ctl_table *ctl, int write,
272 				     void *buffer, size_t *lenp, loff_t *ppos)
273 {
274 	int hiperdispatch;
275 	int rc;
276 	struct ctl_table ctl_entry = {
277 		.procname	= ctl->procname,
278 		.data		= &hiperdispatch,
279 		.maxlen		= sizeof(int),
280 		.extra1		= SYSCTL_ZERO,
281 		.extra2		= SYSCTL_ONE,
282 	};
283 
284 	hiperdispatch = hd_enabled;
285 	rc = proc_douintvec_minmax(&ctl_entry, write, buffer, lenp, ppos);
286 	if (rc < 0 || !write)
287 		return rc;
288 	mutex_lock(&smp_cpu_state_mutex);
289 	if (hd_set_hiperdispatch_mode(hiperdispatch))
290 		topology_schedule_update();
291 	mutex_unlock(&smp_cpu_state_mutex);
292 	return 0;
293 }
294 
/*
 * sysctl table with the single "hiperdispatch" entry. It carries no data
 * pointer; hiperdispatch_ctl_handler() supplies its own storage.
 */
static struct ctl_table hiperdispatch_ctl_table[] = {
	{
		.procname	= "hiperdispatch",
		.mode		= 0644,
		.proc_handler	= hiperdispatch_ctl_handler,
	},
};
302 
303 static ssize_t hd_steal_threshold_show(struct device *dev,
304 				       struct device_attribute *attr,
305 				       char *buf)
306 {
307 	return sysfs_emit(buf, "%u\n", hd_steal_threshold);
308 }
309 
310 static ssize_t hd_steal_threshold_store(struct device *dev,
311 					struct device_attribute *attr,
312 					const char *buf,
313 					size_t count)
314 {
315 	unsigned int val;
316 	int rc;
317 
318 	rc = kstrtouint(buf, 0, &val);
319 	if (rc)
320 		return rc;
321 	if (val > 100)
322 		return -ERANGE;
323 	hd_steal_threshold = val;
324 	return count;
325 }
326 
327 static DEVICE_ATTR_RW(hd_steal_threshold);
328 
329 static ssize_t hd_delay_factor_show(struct device *dev,
330 				    struct device_attribute *attr,
331 				    char *buf)
332 {
333 	return sysfs_emit(buf, "%u\n", hd_delay_factor);
334 }
335 
336 static ssize_t hd_delay_factor_store(struct device *dev,
337 				     struct device_attribute *attr,
338 				     const char *buf,
339 				     size_t count)
340 {
341 	unsigned int val;
342 	int rc;
343 
344 	rc = kstrtouint(buf, 0, &val);
345 	if (rc)
346 		return rc;
347 	if (!val)
348 		return -ERANGE;
349 	hd_delay_factor = val;
350 	return count;
351 }
352 
353 static DEVICE_ATTR_RW(hd_delay_factor);
354 
/* Tunables exposed through sysfs; the list is NULL terminated. */
static struct attribute *hd_attrs[] = {
	&dev_attr_hd_steal_threshold.attr,
	&dev_attr_hd_delay_factor.attr,
	NULL,
};

/* Attribute group that places the tunables in a "hiperdispatch" subdirectory. */
static const struct attribute_group hd_attr_group = {
	.name  = "hiperdispatch",
	.attrs = hd_attrs,
};
365 
366 static int hd_greedy_time_get(void *unused, u64 *val)
367 {
368 	mutex_lock(&hd_counter_mutex);
369 	hd_update_times();
370 	*val = hd_high_time;
371 	mutex_unlock(&hd_counter_mutex);
372 	return 0;
373 }
374 
375 DEFINE_SIMPLE_ATTRIBUTE(hd_greedy_time_fops, hd_greedy_time_get, NULL, "%llu\n");
376 
377 static int hd_conservative_time_get(void *unused, u64 *val)
378 {
379 	mutex_lock(&hd_counter_mutex);
380 	hd_update_times();
381 	*val = hd_low_time;
382 	mutex_unlock(&hd_counter_mutex);
383 	return 0;
384 }
385 
386 DEFINE_SIMPLE_ATTRIBUTE(hd_conservative_time_fops, hd_conservative_time_get, NULL, "%llu\n");
387 
388 static int hd_adjustment_count_get(void *unused, u64 *val)
389 {
390 	*val = atomic64_read(&hd_adjustments);
391 	return 0;
392 }
393 
394 DEFINE_SIMPLE_ATTRIBUTE(hd_adjustments_fops, hd_adjustment_count_get, NULL, "%llu\n");
395 
396 static void __init hd_create_debugfs_counters(void)
397 {
398 	struct dentry *dir;
399 
400 	dir = debugfs_create_dir("hiperdispatch", arch_debugfs_dir);
401 	debugfs_create_file("conservative_time_ms", 0400, dir, NULL, &hd_conservative_time_fops);
402 	debugfs_create_file("greedy_time_ms", 0400, dir, NULL, &hd_greedy_time_fops);
403 	debugfs_create_file("adjustment_count", 0400, dir, NULL, &hd_adjustments_fops);
404 }
405 
406 static void __init hd_create_attributes(void)
407 {
408 	struct device *dev;
409 
410 	dev = bus_get_dev_root(&cpu_subsys);
411 	if (!dev)
412 		return;
413 	if (sysfs_create_group(&dev->kobj, &hd_attr_group))
414 		pr_warn("Unable to create hiperdispatch attribute group\n");
415 	put_device(dev);
416 }
417 
418 static int __init hd_init(void)
419 {
420 	if (IS_ENABLED(CONFIG_HIPERDISPATCH_ON)) {
421 		hd_set_hiperdispatch_mode(1);
422 		topology_schedule_update();
423 	}
424 	if (!register_sysctl("s390", hiperdispatch_ctl_table))
425 		pr_warn("Failed to register s390.hiperdispatch sysctl attribute\n");
426 	hd_create_debugfs_counters();
427 	hd_create_attributes();
428 	return 0;
429 }
430 late_initcall(hd_init);
431