// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright IBM Corp. 2024
 */

#define KMSG_COMPONENT "hd"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

/*
 * Hiperdispatch:
 * Dynamically calculates the optimum number of high capacity COREs
 * by considering the state the system is in. When hiperdispatch decides
 * that a capacity update is necessary, it schedules a topology update.
 * During topology updates the CPU capacities are always re-adjusted.
 *
 * There are two places where CPU capacities are accessed within
 * hiperdispatch:
 * -> hiperdispatch's recurring work function reads CPU capacities to
 *    determine the high capacity CPU count.
 * -> during a topology update hiperdispatch's adjustment function
 *    updates CPU capacities.
 * These two can run on different CPUs in parallel, which can cause
 * hiperdispatch to make wrong decisions. This can potentially cause
 * some overhead by leading to extra rebuild_sched_domains() calls
 * for correction. Access to capacities within hiperdispatch has to be
 * serialized to prevent this overhead.
 *
 * Hiperdispatch decision making revolves around steal time.
 * The HD_STEAL_THRESHOLD value is taken as reference. Whenever steal
 * time crosses the threshold value, hiperdispatch falls back to giving
 * high capacities to entitled CPUs only. When steal time drops below
 * the threshold boundary, hiperdispatch utilizes all CPUs by giving
 * all of them high capacity.
 *
 * The theory behind HD_STEAL_THRESHOLD is related to SMP thread
 * performance. Comparing the throughput of:
 * - a single CORE, with N threads, running N tasks
 * - N separate COREs running N tasks,
 * using individual COREs for individual tasks yields better
 * performance. This performance difference is roughly ~30% (it can
 * vary between machine generations).
 *
 * Hiperdispatch tries to hint the scheduler to use individual COREs
 * for each task, as long as the steal time on those COREs is less
 * than 30%, therefore delaying the throughput loss caused by using
 * SMP threads.
 */
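
/*
 * Worked example of the threshold decision (hypothetical numbers;
 * actual values depend on machine and workload): with
 * HD_STEAL_THRESHOLD = 30, two vertical medium/low CPUs accumulating
 * a combined 600ms of steal time over a 1s measurement window give
 * (600ms * 100 / 1000ms) / 2 CPUs = 30% average steal, so the fallback
 * to entitled capacity triggers; at a combined 400ms (20%) all CPUs
 * keep high capacity.
 */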

#include <linux/cpumask.h>
#include <linux/debugfs.h>
#include <linux/device.h>
#include <linux/kernel_stat.h>
#include <linux/kstrtox.h>
#include <linux/ktime.h>
#include <linux/sysctl.h>
#include <linux/types.h>
#include <linux/workqueue.h>
#include <asm/hiperdispatch.h>
#include <asm/setup.h>
#include <asm/smp.h>
#include <asm/topology.h>

#define CREATE_TRACE_POINTS
#include <asm/trace/hiperdispatch.h>

#define HD_DELAY_FACTOR		(4)
#define HD_DELAY_INTERVAL	(HZ / 4)
#define HD_STEAL_THRESHOLD	30
#define HD_STEAL_AVG_WEIGHT	16

static cpumask_t hd_vl_coremask;	/* Mask containing all vertical low COREs */
static cpumask_t hd_vmvl_cpumask;	/* Mask containing vertical medium and low CPUs */
static int hd_high_capacity_cores;	/* Current CORE count with high capacity */
static int hd_entitled_cores;		/* Total vertical high and medium CORE count */
static int hd_online_cores;		/* Current online CORE count */

static unsigned long hd_previous_steal;	/* Previous iteration's CPU steal timer total */
static unsigned long hd_high_time;	/* Total time spent while all cpus have high capacity */
static unsigned long hd_low_time;	/* Total time spent while vl cpus have low capacity */
static atomic64_t hd_adjustments;	/* Total occurrence count of hiperdispatch adjustments */

static unsigned int hd_steal_threshold = HD_STEAL_THRESHOLD;
static unsigned int hd_delay_factor = HD_DELAY_FACTOR;
static int hd_enabled;

static void hd_capacity_work_fn(struct work_struct *work);
static DECLARE_DELAYED_WORK(hd_capacity_work, hd_capacity_work_fn);

static int hd_set_hiperdispatch_mode(int enable)
{
	if (!MACHINE_HAS_TOPOLOGY)
		enable = 0;
	if (hd_enabled == enable)
		return 0;
	hd_enabled = enable;
	return 1;
}

void hd_reset_state(void)
{
	cpumask_clear(&hd_vl_coremask);
	cpumask_clear(&hd_vmvl_cpumask);
	hd_entitled_cores = 0;
	hd_online_cores = 0;
}

void hd_add_core(int cpu)
{
	const struct cpumask *siblings;
	int polarization;

	hd_online_cores++;
	polarization = smp_cpu_get_polarization(cpu);
	siblings = topology_sibling_cpumask(cpu);
	switch (polarization) {
	case POLARIZATION_VH:
		hd_entitled_cores++;
		break;
	case POLARIZATION_VM:
		hd_entitled_cores++;
		cpumask_or(&hd_vmvl_cpumask, &hd_vmvl_cpumask, siblings);
		break;
	case POLARIZATION_VL:
		cpumask_set_cpu(cpu, &hd_vl_coremask);
		cpumask_or(&hd_vmvl_cpumask, &hd_vmvl_cpumask, siblings);
		break;
	}
}
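
/*
 * Illustration of the bookkeeping above (hypothetical topology): on a
 * system with two vertical high COREs, one vertical medium CORE and
 * one vertical low CORE, repeated hd_add_core() calls leave
 * hd_online_cores = 4, hd_entitled_cores = 3, hd_vl_coremask
 * containing the low CORE, and hd_vmvl_cpumask containing the SMP
 * sibling CPUs of both the medium and the low CORE.
 */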

/* Serialize update and read operations of debug counters. */
static DEFINE_MUTEX(hd_counter_mutex);

static void hd_update_times(void)
{
	static ktime_t prev;
	ktime_t now;

	/*
	 * Check if hiperdispatch is active; if not, set prev to 0.
	 * This makes it possible to detect the first update iteration
	 * after hiperdispatch has been enabled.
	 */
	if (hd_entitled_cores == 0 || hd_enabled == 0) {
		prev = ktime_set(0, 0);
		return;
	}
	now = ktime_get();
	if (ktime_after(prev, 0)) {
		if (hd_high_capacity_cores == hd_online_cores)
			hd_high_time += ktime_ms_delta(now, prev);
		else
			hd_low_time += ktime_ms_delta(now, prev);
	}
	prev = now;
}

static void hd_update_capacities(void)
{
	int cpu, upscaling_cores;
	unsigned long capacity;

	upscaling_cores = hd_high_capacity_cores - hd_entitled_cores;
	capacity = upscaling_cores > 0 ? CPU_CAPACITY_HIGH : CPU_CAPACITY_LOW;
	hd_high_capacity_cores = hd_entitled_cores;
	for_each_cpu(cpu, &hd_vl_coremask) {
		smp_set_core_capacity(cpu, capacity);
		if (capacity != CPU_CAPACITY_HIGH)
			continue;
		hd_high_capacity_cores++;
		upscaling_cores--;
		if (upscaling_cores == 0)
			capacity = CPU_CAPACITY_LOW;
	}
}

void hd_disable_hiperdispatch(void)
{
	cancel_delayed_work_sync(&hd_capacity_work);
	hd_high_capacity_cores = hd_online_cores;
	hd_previous_steal = 0;
}

int hd_enable_hiperdispatch(void)
{
	mutex_lock(&hd_counter_mutex);
	hd_update_times();
	mutex_unlock(&hd_counter_mutex);
	if (hd_enabled == 0)
		return 0;
	if (hd_entitled_cores == 0)
		return 0;
	if (hd_online_cores <= hd_entitled_cores)
		return 0;
	mod_delayed_work(system_wq, &hd_capacity_work, HD_DELAY_INTERVAL * hd_delay_factor);
	hd_update_capacities();
	return 1;
}

static unsigned long hd_steal_avg(unsigned long new)
{
	static unsigned long steal;

	steal = (steal * (HD_STEAL_AVG_WEIGHT - 1) + new) / HD_STEAL_AVG_WEIGHT;
	return steal;
}

static unsigned long hd_calculate_steal_percentage(void)
{
	unsigned long time_delta, steal_delta, steal, percentage;
	static ktime_t prev;
	int cpus, cpu;
	ktime_t now;

	cpus = 0;
	steal = 0;
	percentage = 0;
	for_each_cpu(cpu, &hd_vmvl_cpumask) {
		steal += kcpustat_cpu(cpu).cpustat[CPUTIME_STEAL];
		cpus++;
	}
	/*
	 * If there are no vertical medium or low CPUs, steal time
	 * is 0, as vertical high CPUs shouldn't experience steal time.
	 */
	if (cpus == 0)
		return percentage;
	now = ktime_get();
	time_delta = ktime_to_ns(ktime_sub(now, prev));
	if (steal > hd_previous_steal && hd_previous_steal != 0) {
		steal_delta = (steal - hd_previous_steal) * 100 / time_delta;
		percentage = steal_delta / cpus;
	}
	hd_previous_steal = steal;
	prev = now;
	return percentage;
}
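
/*
 * hd_steal_avg() is an exponentially weighted moving average,
 * steal = (15 * steal + new) / 16 with HD_STEAL_AVG_WEIGHT = 16.
 * As a rough illustration (ignoring integer truncation): after a jump
 * from 0% to a constant 32% steal, the average needs about 43 samples
 * to cross the 30% threshold, i.e. roughly 11 seconds at the
 * quarter-second HD_DELAY_INTERVAL, so short steal spikes are
 * smoothed out instead of causing immediate capacity changes.
 */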

static void hd_capacity_work_fn(struct work_struct *work)
{
	unsigned long steal_percentage, new_cores;

	mutex_lock(&smp_cpu_state_mutex);
	/*
	 * If the online core count is less than or equal to the entitled
	 * core count, hiperdispatch does not need to make any adjustments;
	 * schedule a topology update to disable hiperdispatch.
	 * Normally this check is handled during topology updates, but
	 * during CPU hot unplug, topology and CPU mask updates are done
	 * in reverse order, causing hd_enable_hiperdispatch() to see
	 * stale data.
	 */
	if (hd_online_cores <= hd_entitled_cores) {
		topology_schedule_update();
		mutex_unlock(&smp_cpu_state_mutex);
		return;
	}
	steal_percentage = hd_steal_avg(hd_calculate_steal_percentage());
	if (steal_percentage < hd_steal_threshold)
		new_cores = hd_online_cores;
	else
		new_cores = hd_entitled_cores;
	if (hd_high_capacity_cores != new_cores) {
		trace_s390_hd_rebuild_domains(hd_high_capacity_cores, new_cores);
		hd_high_capacity_cores = new_cores;
		atomic64_inc(&hd_adjustments);
		topology_schedule_update();
	}
	trace_s390_hd_work_fn(steal_percentage, hd_entitled_cores, hd_high_capacity_cores);
	mutex_unlock(&smp_cpu_state_mutex);
	schedule_delayed_work(&hd_capacity_work, HD_DELAY_INTERVAL);
}

static int hiperdispatch_ctl_handler(const struct ctl_table *ctl, int write,
				     void *buffer, size_t *lenp, loff_t *ppos)
{
	int hiperdispatch;
	int rc;
	struct ctl_table ctl_entry = {
		.procname	= ctl->procname,
		.data		= &hiperdispatch,
		.maxlen		= sizeof(int),
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	};

	hiperdispatch = hd_enabled;
	rc = proc_douintvec_minmax(&ctl_entry, write, buffer, lenp, ppos);
	if (rc < 0 || !write)
		return rc;
	mutex_lock(&smp_cpu_state_mutex);
	if (hd_set_hiperdispatch_mode(hiperdispatch))
		topology_schedule_update();
	mutex_unlock(&smp_cpu_state_mutex);
	return 0;
}

static struct ctl_table hiperdispatch_ctl_table[] = {
	{
		.procname	= "hiperdispatch",
		.mode		= 0644,
		.proc_handler	= hiperdispatch_ctl_handler,
	},
};

static ssize_t hd_steal_threshold_show(struct device *dev,
				       struct device_attribute *attr,
				       char *buf)
{
	return sysfs_emit(buf, "%u\n", hd_steal_threshold);
}

static ssize_t hd_steal_threshold_store(struct device *dev,
					struct device_attribute *attr,
					const char *buf,
					size_t count)
{
	unsigned int val;
	int rc;

	rc = kstrtouint(buf, 0, &val);
	if (rc)
		return rc;
	if (val > 100)
		return -ERANGE;
	hd_steal_threshold = val;
	return count;
}

static DEVICE_ATTR_RW(hd_steal_threshold);

static ssize_t hd_delay_factor_show(struct device *dev,
				    struct device_attribute *attr,
				    char *buf)
{
	return sysfs_emit(buf, "%u\n", hd_delay_factor);
}

static ssize_t hd_delay_factor_store(struct device *dev,
				     struct device_attribute *attr,
				     const char *buf,
				     size_t count)
{
	unsigned int val;
	int rc;

	rc = kstrtouint(buf, 0, &val);
	if (rc)
		return rc;
	if (!val)
		return -ERANGE;
	hd_delay_factor = val;
	return count;
}

static DEVICE_ATTR_RW(hd_delay_factor);

static struct attribute *hd_attrs[] = {
	&dev_attr_hd_steal_threshold.attr,
	&dev_attr_hd_delay_factor.attr,
	NULL,
};

static const struct attribute_group hd_attr_group = {
	.name = "hiperdispatch",
	.attrs = hd_attrs,
};
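
/*
 * With the attribute group registered on the cpu subsystem root
 * device, the tunables above are expected to appear as
 * /sys/devices/system/cpu/hiperdispatch/hd_steal_threshold and
 * /sys/devices/system/cpu/hiperdispatch/hd_delay_factor (assuming the
 * usual sysfs mount point); hd_steal_threshold accepts 0..100,
 * hd_delay_factor any non-zero value.
 */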

static int hd_greedy_time_get(void *unused, u64 *val)
{
	mutex_lock(&hd_counter_mutex);
	hd_update_times();
	*val = hd_high_time;
	mutex_unlock(&hd_counter_mutex);
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(hd_greedy_time_fops, hd_greedy_time_get, NULL, "%llu\n");

static int hd_conservative_time_get(void *unused, u64 *val)
{
	mutex_lock(&hd_counter_mutex);
	hd_update_times();
	*val = hd_low_time;
	mutex_unlock(&hd_counter_mutex);
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(hd_conservative_time_fops, hd_conservative_time_get, NULL, "%llu\n");

static int hd_adjustment_count_get(void *unused, u64 *val)
{
	*val = atomic64_read(&hd_adjustments);
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(hd_adjustments_fops, hd_adjustment_count_get, NULL, "%llu\n");

static void __init hd_create_debugfs_counters(void)
{
	struct dentry *dir;

	dir = debugfs_create_dir("hiperdispatch", arch_debugfs_dir);
	debugfs_create_file("conservative_time_ms", 0400, dir, NULL, &hd_conservative_time_fops);
	debugfs_create_file("greedy_time_ms", 0400, dir, NULL, &hd_greedy_time_fops);
	debugfs_create_file("adjustment_count", 0400, dir, NULL, &hd_adjustments_fops);
}

static void __init hd_create_attributes(void)
{
	struct device *dev;

	dev = bus_get_dev_root(&cpu_subsys);
	if (!dev)
		return;
	if (sysfs_create_group(&dev->kobj, &hd_attr_group))
		pr_warn("Unable to create hiperdispatch attribute group\n");
	put_device(dev);
}

static int __init hd_init(void)
{
	if (IS_ENABLED(CONFIG_HIPERDISPATCH_ON)) {
		hd_set_hiperdispatch_mode(1);
		topology_schedule_update();
	}
	if (!register_sysctl("s390", hiperdispatch_ctl_table))
		pr_warn("Failed to register s390.hiperdispatch sysctl attribute\n");
	hd_create_debugfs_counters();
	hd_create_attributes();
	return 0;
}
late_initcall(hd_init);
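
/*
 * Usage note (paths assume the usual procfs/debugfs mount points):
 * writing 1 or 0 to /proc/sys/s390/hiperdispatch enables or disables
 * hiperdispatch at runtime, and the counters created above should be
 * readable under /sys/kernel/debug/s390/hiperdispatch/.
 */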