// SPDX-License-Identifier: GPL-2.0-only
/*
 * Housekeeping management. Manage the targets for routine code that can run on
 * any CPU: unbound workqueues, timers, kthreads and any offloadable work.
 *
 * Copyright (C) 2017 Red Hat, Inc., Frederic Weisbecker
 * Copyright (C) 2017-2018 SUSE, Frederic Weisbecker
 *
 */
10 #include <linux/sched/isolation.h>
11 #include <linux/pci.h>
12 #include "sched.h"
13
/* One flag bit per housekeeping type, mirroring enum hk_type as a bitmask. */
enum hk_flags {
	HK_FLAG_DOMAIN_BOOT = BIT(HK_TYPE_DOMAIN_BOOT),
	HK_FLAG_DOMAIN = BIT(HK_TYPE_DOMAIN),
	HK_FLAG_MANAGED_IRQ = BIT(HK_TYPE_MANAGED_IRQ),
	HK_FLAG_KERNEL_NOISE = BIT(HK_TYPE_KERNEL_NOISE),
};

/* Fast-path gate: false (default) means no isolation is configured at all. */
DEFINE_STATIC_KEY_FALSE(housekeeping_overridden);
EXPORT_SYMBOL_GPL(housekeeping_overridden);

struct housekeeping {
	/* Per-type housekeeping CPU set; HK_TYPE_DOMAIN is RCU-swapped at runtime */
	struct cpumask __rcu *cpumasks[HK_TYPE_MAX];
	/* hk_flags bits telling which types above are actually populated */
	unsigned long flags;
};

static struct housekeeping housekeeping;
30
housekeeping_enabled(enum hk_type type)31 bool housekeeping_enabled(enum hk_type type)
32 {
33 return !!(READ_ONCE(housekeeping.flags) & BIT(type));
34 }
35 EXPORT_SYMBOL_GPL(housekeeping_enabled);
36
housekeeping_dereference_check(enum hk_type type)37 static bool housekeeping_dereference_check(enum hk_type type)
38 {
39 if (IS_ENABLED(CONFIG_LOCKDEP) && type == HK_TYPE_DOMAIN) {
40 /* Cpuset isn't even writable yet? */
41 if (system_state <= SYSTEM_SCHEDULING)
42 return true;
43
44 /* CPU hotplug write locked, so cpuset partition can't be overwritten */
45 if (IS_ENABLED(CONFIG_HOTPLUG_CPU) && lockdep_is_cpus_write_held())
46 return true;
47
48 /* Cpuset lock held, partitions not writable */
49 if (IS_ENABLED(CONFIG_CPUSETS) && lockdep_is_cpuset_held())
50 return true;
51
52 return false;
53 }
54
55 return true;
56 }
57
/*
 * Fetch the cpumask for @type under RCU, with the lockdep justification
 * delegated to housekeeping_dereference_check().
 */
static inline struct cpumask *housekeeping_cpumask_dereference(enum hk_type type)
{
	return rcu_dereference_all_check(housekeeping.cpumasks[type],
					 housekeeping_dereference_check(type));
}
63
housekeeping_cpumask(enum hk_type type)64 const struct cpumask *housekeeping_cpumask(enum hk_type type)
65 {
66 const struct cpumask *mask = NULL;
67
68 if (static_branch_unlikely(&housekeeping_overridden)) {
69 if (READ_ONCE(housekeeping.flags) & BIT(type))
70 mask = housekeeping_cpumask_dereference(type);
71 }
72 if (!mask)
73 mask = cpu_possible_mask;
74 return mask;
75 }
76 EXPORT_SYMBOL_GPL(housekeeping_cpumask);
77
housekeeping_any_cpu(enum hk_type type)78 int housekeeping_any_cpu(enum hk_type type)
79 {
80 int cpu;
81
82 if (static_branch_unlikely(&housekeeping_overridden)) {
83 if (housekeeping.flags & BIT(type)) {
84 cpu = sched_numa_find_closest(housekeeping_cpumask(type), smp_processor_id());
85 if (cpu < nr_cpu_ids)
86 return cpu;
87
88 cpu = cpumask_any_and_distribute(housekeeping_cpumask(type), cpu_online_mask);
89 if (likely(cpu < nr_cpu_ids))
90 return cpu;
91 /*
92 * Unless we have another problem this can only happen
93 * at boot time before start_secondary() brings the 1st
94 * housekeeping CPU up.
95 */
96 WARN_ON_ONCE(system_state == SYSTEM_RUNNING ||
97 type != HK_TYPE_TIMER);
98 }
99 }
100 return smp_processor_id();
101 }
102 EXPORT_SYMBOL_GPL(housekeeping_any_cpu);
103
housekeeping_affine(struct task_struct * t,enum hk_type type)104 void housekeeping_affine(struct task_struct *t, enum hk_type type)
105 {
106 if (static_branch_unlikely(&housekeeping_overridden))
107 if (housekeeping.flags & BIT(type))
108 set_cpus_allowed_ptr(t, housekeeping_cpumask(type));
109 }
110 EXPORT_SYMBOL_GPL(housekeeping_affine);
111
housekeeping_test_cpu(int cpu,enum hk_type type)112 bool housekeeping_test_cpu(int cpu, enum hk_type type)
113 {
114 if (static_branch_unlikely(&housekeeping_overridden) &&
115 READ_ONCE(housekeeping.flags) & BIT(type))
116 return cpumask_test_cpu(cpu, housekeeping_cpumask(type));
117 return true;
118 }
119 EXPORT_SYMBOL_GPL(housekeeping_test_cpu);
120
/*
 * housekeeping_update - replace the runtime HK_TYPE_DOMAIN housekeeping mask
 * @isol_mask: CPUs to remove from domain housekeeping duty
 *
 * The new mask is the boot-time domain housekeeping set minus @isol_mask.
 * Publishes it via RCU, then flushes/re-homes work that may already be
 * queued on the newly isolated CPUs.
 *
 * Returns 0 on success, -ENOMEM on allocation failure, or -EINVAL if the
 * resulting housekeeping set would contain no online CPU.
 */
int housekeeping_update(struct cpumask *isol_mask)
{
	struct cpumask *trial, *old = NULL;
	int err;

	trial = kmalloc(cpumask_size(), GFP_KERNEL);
	if (!trial)
		return -ENOMEM;

	cpumask_andnot(trial, housekeeping_cpumask(HK_TYPE_DOMAIN_BOOT), isol_mask);
	/* At least one online CPU must remain to carry housekeeping work */
	if (!cpumask_intersects(trial, cpu_online_mask)) {
		kfree(trial);
		return -EINVAL;
	}

	/* First override ever: open the fast-path static branch */
	if (!housekeeping.flags)
		static_branch_enable(&housekeeping_overridden);

	if (housekeeping.flags & HK_FLAG_DOMAIN)
		old = housekeeping_cpumask_dereference(HK_TYPE_DOMAIN);
	else
		/* Pairs with the READ_ONCE() of flags in the fast paths */
		WRITE_ONCE(housekeeping.flags, housekeeping.flags | HK_FLAG_DOMAIN);
	rcu_assign_pointer(housekeeping.cpumasks[HK_TYPE_DOMAIN], trial);

	/* Wait out readers of the old mask before freeing or flushing */
	synchronize_rcu();

	/* Flush work that may have been queued against the old mask */
	pci_probe_flush_workqueue();
	mem_cgroup_flush_workqueue();
	vmstat_flush_workqueue();

	err = workqueue_unbound_housekeeping_update(housekeeping_cpumask(HK_TYPE_DOMAIN));
	WARN_ON_ONCE(err < 0);

	err = tmigr_isolated_exclude_cpumask(isol_mask);
	WARN_ON_ONCE(err < 0);

	err = kthreads_update_housekeeping();
	WARN_ON_ONCE(err < 0);

	kfree(old);

	return 0;
}
164
/*
 * Finalize boot-time housekeeping setup once allocators are available.
 * No-op unless "isolcpus="/"nohz_full=" populated housekeeping.flags.
 */
void __init housekeeping_init(void)
{
	enum hk_type type;

	if (!housekeeping.flags)
		return;

	static_branch_enable(&housekeeping_overridden);

	/* Full-dynticks isolation requested: set up tick offloading */
	if (housekeeping.flags & HK_FLAG_KERNEL_NOISE)
		sched_tick_offload_init();
	/*
	 * Realloc with a proper allocator so that any cpumask update
	 * can indifferently free the old version with kfree().
	 */
	for_each_set_bit(type, &housekeeping.flags, HK_TYPE_MAX) {
		struct cpumask *omask, *nmask = kmalloc(cpumask_size(), GFP_KERNEL);

		if (WARN_ON_ONCE(!nmask))
			return;

		/* Single-threaded boot context: plain RCU accessors suffice */
		omask = rcu_dereference(housekeeping.cpumasks[type]);

		/* We need at least one CPU to handle housekeeping work */
		WARN_ON_ONCE(cpumask_empty(omask));
		cpumask_copy(nmask, omask);
		RCU_INIT_POINTER(housekeeping.cpumasks[type], nmask);
		/* The old copy came from memblock in housekeeping_setup_type() */
		memblock_free(omask, cpumask_size());
	}
}
195
/*
 * Publish a boot-time housekeeping mask for @type, copied from the staging
 * mask. Allocated from memblock since this runs before the slab allocator;
 * housekeeping_init() later migrates it to a kmalloc()ed copy.
 */
static void __init housekeeping_setup_type(enum hk_type type,
					   cpumask_var_t housekeeping_staging)
{
	struct cpumask *mask = memblock_alloc_or_panic(cpumask_size(), SMP_CACHE_BYTES);

	cpumask_copy(mask, housekeeping_staging);
	RCU_INIT_POINTER(housekeeping.cpumasks[type], mask);
}
204
/*
 * Common backend for the "nohz_full=" and "isolcpus=" boot parameters.
 * @str is a CPU list of NON-housekeeping (isolated) CPUs; @flags tells which
 * housekeeping types this parameter configures. May be called twice (once
 * per parameter), in which case the CPU lists must agree for shared types.
 *
 * Returns 1 when the parameter was consumed successfully, 0 otherwise
 * (__setup() convention).
 */
static int __init housekeeping_setup(char *str, unsigned long flags)
{
	cpumask_var_t non_housekeeping_mask, housekeeping_staging;
	unsigned int first_cpu;
	int err = 0;

	if ((flags & HK_FLAG_KERNEL_NOISE) && !(housekeeping.flags & HK_FLAG_KERNEL_NOISE)) {
		if (!IS_ENABLED(CONFIG_NO_HZ_FULL)) {
			pr_warn("Housekeeping: nohz unsupported."
				" Build with CONFIG_NO_HZ_FULL\n");
			return 0;
		}
	}

	alloc_bootmem_cpumask_var(&non_housekeeping_mask);
	if (cpulist_parse(str, non_housekeeping_mask) < 0) {
		pr_warn("Housekeeping: nohz_full= or isolcpus= incorrect CPU range\n");
		goto free_non_housekeeping_mask;
	}

	/* Housekeeping CPUs = possible CPUs not listed as isolated */
	alloc_bootmem_cpumask_var(&housekeeping_staging);
	cpumask_andnot(housekeeping_staging,
		       cpu_possible_mask, non_housekeeping_mask);

	/* Force the boot CPU back in if no present CPU would remain */
	first_cpu = cpumask_first_and(cpu_present_mask, housekeeping_staging);
	if (first_cpu >= nr_cpu_ids || first_cpu >= setup_max_cpus) {
		__cpumask_set_cpu(smp_processor_id(), housekeeping_staging);
		__cpumask_clear_cpu(smp_processor_id(), non_housekeeping_mask);
		if (!housekeeping.flags) {
			pr_warn("Housekeeping: must include one present CPU, "
				"using boot CPU:%d\n", smp_processor_id());
		}
	}

	/* Empty isolation list: nothing to configure (err stays 0) */
	if (cpumask_empty(non_housekeeping_mask))
		goto free_housekeeping_staging;

	if (!housekeeping.flags) {
		/* First setup call ("nohz_full=" or "isolcpus=") */
		enum hk_type type;

		for_each_set_bit(type, &flags, HK_TYPE_MAX)
			housekeeping_setup_type(type, housekeeping_staging);
	} else {
		/* Second setup call ("nohz_full=" after "isolcpus=" or the reverse) */
		enum hk_type type;
		unsigned long iter_flags = flags & housekeeping.flags;

		/* Types configured by both parameters must use the same CPU list */
		for_each_set_bit(type, &iter_flags, HK_TYPE_MAX) {
			if (!cpumask_equal(housekeeping_staging,
					   housekeeping_cpumask(type))) {
				pr_warn("Housekeeping: nohz_full= must match isolcpus=\n");
				goto free_housekeeping_staging;
			}
		}

		/*
		 * Check the combination of nohz_full and isolcpus=domain,
		 * necessary to avoid problems with the timer migration
		 * hierarchy. managed_irq is ignored by this check since it
		 * isn't considered in the timer migration logic.
		 */
		iter_flags = housekeeping.flags & (HK_FLAG_KERNEL_NOISE | HK_FLAG_DOMAIN);
		type = find_first_bit(&iter_flags, HK_TYPE_MAX);
		/*
		 * Pass the check if none of these flags were previously set or
		 * are not in the current selection.
		 */
		iter_flags = flags & (HK_FLAG_KERNEL_NOISE | HK_FLAG_DOMAIN);
		first_cpu = (type == HK_TYPE_MAX || !iter_flags) ? 0 :
			cpumask_first_and_and(cpu_present_mask,
					      housekeeping_staging, housekeeping_cpumask(type));
		if (first_cpu >= min(nr_cpu_ids, setup_max_cpus)) {
			pr_warn("Housekeeping: must include one present CPU "
				"neither in nohz_full= nor in isolcpus=domain, "
				"ignoring setting %s\n", str);
			goto free_housekeeping_staging;
		}

		/* Only types not configured by the first call remain to set up */
		iter_flags = flags & ~housekeeping.flags;

		for_each_set_bit(type, &iter_flags, HK_TYPE_MAX)
			housekeeping_setup_type(type, housekeeping_staging);
	}

	/* Enable full dynticks the first time KERNEL_NOISE is requested */
	if ((flags & HK_FLAG_KERNEL_NOISE) && !(housekeeping.flags & HK_FLAG_KERNEL_NOISE))
		tick_nohz_full_setup(non_housekeeping_mask);

	housekeeping.flags |= flags;
	err = 1;

free_housekeeping_staging:
	free_bootmem_cpumask_var(housekeeping_staging);
free_non_housekeeping_mask:
	free_bootmem_cpumask_var(non_housekeeping_mask);

	return err;
}
303
housekeeping_nohz_full_setup(char * str)304 static int __init housekeeping_nohz_full_setup(char *str)
305 {
306 unsigned long flags;
307
308 flags = HK_FLAG_KERNEL_NOISE;
309
310 return housekeeping_setup(str, flags);
311 }
312 __setup("nohz_full=", housekeeping_nohz_full_setup);
313
housekeeping_isolcpus_setup(char * str)314 static int __init housekeeping_isolcpus_setup(char *str)
315 {
316 unsigned long flags = 0;
317 bool illegal = false;
318 char *par;
319 int len;
320
321 while (isalpha(*str)) {
322 /*
323 * isolcpus=nohz is equivalent to nohz_full.
324 */
325 if (!strncmp(str, "nohz,", 5)) {
326 str += 5;
327 flags |= HK_FLAG_KERNEL_NOISE;
328 continue;
329 }
330
331 if (!strncmp(str, "domain,", 7)) {
332 str += 7;
333 flags |= HK_FLAG_DOMAIN | HK_FLAG_DOMAIN_BOOT;
334 continue;
335 }
336
337 if (!strncmp(str, "managed_irq,", 12)) {
338 str += 12;
339 flags |= HK_FLAG_MANAGED_IRQ;
340 continue;
341 }
342
343 /*
344 * Skip unknown sub-parameter and validate that it is not
345 * containing an invalid character.
346 */
347 for (par = str, len = 0; *str && *str != ','; str++, len++) {
348 if (!isalpha(*str) && *str != '_')
349 illegal = true;
350 }
351
352 if (illegal) {
353 pr_warn("isolcpus: Invalid flag %.*s\n", len, par);
354 return 0;
355 }
356
357 pr_info("isolcpus: Skipped unknown flag %.*s\n", len, par);
358 str++;
359 }
360
361 /* Default behaviour for isolcpus without flags */
362 if (!flags)
363 flags |= HK_FLAG_DOMAIN | HK_FLAG_DOMAIN_BOOT;
364
365 return housekeeping_setup(str, flags);
366 }
367 __setup("isolcpus=", housekeeping_isolcpus_setup);
368