xref: /linux/kernel/sched/isolation.c (revision 37a93dd5c49b5fda807fd204edf2547c3493319c)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  *  Housekeeping management. Manage the targets for routine code that can run on
4  *  any CPU: unbound workqueues, timers, kthreads and any offloadable work.
5  *
6  * Copyright (C) 2017 Red Hat, Inc., Frederic Weisbecker
7  * Copyright (C) 2017-2018 SUSE, Frederic Weisbecker
8  *
9  */
10 #include <linux/sched/isolation.h>
11 #include <linux/pci.h>
12 #include "sched.h"
13 
14 enum hk_flags {
15 	HK_FLAG_DOMAIN_BOOT	= BIT(HK_TYPE_DOMAIN_BOOT),
16 	HK_FLAG_DOMAIN		= BIT(HK_TYPE_DOMAIN),
17 	HK_FLAG_MANAGED_IRQ	= BIT(HK_TYPE_MANAGED_IRQ),
18 	HK_FLAG_KERNEL_NOISE	= BIT(HK_TYPE_KERNEL_NOISE),
19 };
20 
21 DEFINE_STATIC_KEY_FALSE(housekeeping_overridden);
22 EXPORT_SYMBOL_GPL(housekeeping_overridden);
23 
24 struct housekeeping {
25 	struct cpumask __rcu *cpumasks[HK_TYPE_MAX];
26 	unsigned long flags;
27 };
28 
29 static struct housekeeping housekeeping;
30 
31 bool housekeeping_enabled(enum hk_type type)
32 {
33 	return !!(READ_ONCE(housekeeping.flags) & BIT(type));
34 }
35 EXPORT_SYMBOL_GPL(housekeeping_enabled);
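/*
 * Illustrative pattern, not code from this file: housekeeping_enabled() is a
 * cheap static-branch guard, so callers typically test it before doing any
 * cpumask work.  "requested" and "effective" are hypothetical masks.
 *
 *	if (housekeeping_enabled(HK_TYPE_MANAGED_IRQ))
 *		cpumask_and(effective, requested,
 *			    housekeeping_cpumask(HK_TYPE_MANAGED_IRQ));
 *	else
 *		cpumask_copy(effective, requested);
 */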
36 
37 static bool housekeeping_dereference_check(enum hk_type type)
38 {
39 	if (IS_ENABLED(CONFIG_LOCKDEP) && type == HK_TYPE_DOMAIN) {
40 		/* Early boot: cpuset isn't even writable yet, nothing to race with */
41 		if (system_state <= SYSTEM_SCHEDULING)
42 			return true;
43 
44 		/* CPU hotplug write locked, so cpuset partition can't be overwritten */
45 		if (IS_ENABLED(CONFIG_HOTPLUG_CPU) && lockdep_is_cpus_write_held())
46 			return true;
47 
48 		/* Cpuset lock held, partitions not writable */
49 		if (IS_ENABLED(CONFIG_CPUSETS) && lockdep_is_cpuset_held())
50 			return true;
51 
52 		return false;
53 	}
54 
55 	return true;
56 }
57 
58 static inline struct cpumask *housekeeping_cpumask_dereference(enum hk_type type)
59 {
60 	return rcu_dereference_all_check(housekeeping.cpumasks[type],
61 					 housekeeping_dereference_check(type));
62 }
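/*
 * Illustrative reader pattern, not code from this file: the HK_TYPE_DOMAIN
 * mask can be replaced at runtime by housekeeping_update(), so a reader
 * either sits in an RCU read-side section (the old mask is only freed after
 * a grace period) or holds one of the locks checked above, which exclude a
 * concurrent update.
 *
 *	const struct cpumask *mask;
 *
 *	rcu_read_lock();
 *	mask = housekeeping_cpumask(HK_TYPE_DOMAIN);
 *	... use mask; don't cache the pointer past rcu_read_unlock() ...
 *	rcu_read_unlock();
 */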
63 
64 const struct cpumask *housekeeping_cpumask(enum hk_type type)
65 {
66 	const struct cpumask *mask = NULL;
67 
68 	if (static_branch_unlikely(&housekeeping_overridden)) {
69 		if (READ_ONCE(housekeeping.flags) & BIT(type))
70 			mask = housekeeping_cpumask_dereference(type);
71 	}
72 	if (!mask)
73 		mask = cpu_possible_mask;
74 	return mask;
75 }
76 EXPORT_SYMBOL_GPL(housekeeping_cpumask);
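/*
 * Illustrative sketch, not code from this file: a subsystem that spawns
 * helper threads can keep them off isolated CPUs by intersecting its
 * preferred mask with the housekeeping one.  "my_pref_mask" and "my_task"
 * are hypothetical.  When isolation is off this degrades gracefully, since
 * housekeeping_cpumask() then returns cpu_possible_mask.
 *
 *	cpumask_var_t effective;
 *
 *	if (alloc_cpumask_var(&effective, GFP_KERNEL)) {
 *		cpumask_and(effective, my_pref_mask,
 *			    housekeeping_cpumask(HK_TYPE_KERNEL_NOISE));
 *		if (!cpumask_empty(effective))
 *			set_cpus_allowed_ptr(my_task, effective);
 *		free_cpumask_var(effective);
 *	}
 */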
77 
78 int housekeeping_any_cpu(enum hk_type type)
79 {
80 	int cpu;
81 
82 	if (static_branch_unlikely(&housekeeping_overridden)) {
83 		if (housekeeping.flags & BIT(type)) {
84 			cpu = sched_numa_find_closest(housekeeping_cpumask(type), smp_processor_id());
85 			if (cpu < nr_cpu_ids)
86 				return cpu;
87 
88 			cpu = cpumask_any_and_distribute(housekeeping_cpumask(type), cpu_online_mask);
89 			if (likely(cpu < nr_cpu_ids))
90 				return cpu;
91 			/*
92 			 * Barring some other bug, this can only happen at boot
93 			 * time, before start_secondary() has brought the first
94 			 * housekeeping CPU up.
95 			 */
96 			WARN_ON_ONCE(system_state == SYSTEM_RUNNING ||
97 				     type != HK_TYPE_TIMER);
98 		}
99 	}
100 	return smp_processor_id();
101 }
102 EXPORT_SYMBOL_GPL(housekeeping_any_cpu);
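/*
 * Illustrative sketch, not code from this file: push deferrable work to a
 * housekeeping CPU near the caller.  "my_work" is a hypothetical, already
 * initialized work_struct.  Note that housekeeping_any_cpu() consults
 * smp_processor_id(), so it is meant to be called from a context that
 * cannot migrate.
 *
 *	int cpu;
 *
 *	preempt_disable();
 *	cpu = housekeeping_any_cpu(HK_TYPE_KERNEL_NOISE);
 *	preempt_enable();
 *	queue_work_on(cpu, system_wq, &my_work);
 */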
103 
104 void housekeeping_affine(struct task_struct *t, enum hk_type type)
105 {
106 	if (static_branch_unlikely(&housekeeping_overridden))
107 		if (housekeeping.flags & BIT(type))
108 			set_cpus_allowed_ptr(t, housekeeping_cpumask(type));
109 }
110 EXPORT_SYMBOL_GPL(housekeeping_affine);
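/*
 * Illustrative sketch, not code from this file: a kthread that should stay
 * clear of nohz_full CPUs can be affined right after creation.  This is a
 * no-op unless isolation was set up on the command line.  "my_thread_fn"
 * and "my_data" are hypothetical.
 *
 *	struct task_struct *tsk = kthread_run(my_thread_fn, my_data, "my_helper");
 *
 *	if (!IS_ERR(tsk))
 *		housekeeping_affine(tsk, HK_TYPE_KERNEL_NOISE);
 */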
111 
112 bool housekeeping_test_cpu(int cpu, enum hk_type type)
113 {
114 	if (static_branch_unlikely(&housekeeping_overridden) &&
115 	    READ_ONCE(housekeeping.flags) & BIT(type))
116 		return cpumask_test_cpu(cpu, housekeeping_cpumask(type));
117 	return true;
118 }
119 EXPORT_SYMBOL_GPL(housekeeping_test_cpu);
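/*
 * Illustrative sketch, not code from this file: skip isolated CPUs when
 * spreading periodic maintenance.  Without isolation configured, every CPU
 * passes the test.
 *
 *	int cpu;
 *
 *	for_each_online_cpu(cpu) {
 *		if (!housekeeping_test_cpu(cpu, HK_TYPE_KERNEL_NOISE))
 *			continue;
 *		... kick per-CPU maintenance on housekeeping CPUs only ...
 *	}
 */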
120 
121 int housekeeping_update(struct cpumask *isol_mask)
122 {
123 	struct cpumask *trial, *old = NULL;
124 	int err;
125 
126 	lockdep_assert_cpus_held();
127 
128 	trial = kmalloc(cpumask_size(), GFP_KERNEL);
129 	if (!trial)
130 		return -ENOMEM;
131 
132 	cpumask_andnot(trial, housekeeping_cpumask(HK_TYPE_DOMAIN_BOOT), isol_mask);
133 	if (!cpumask_intersects(trial, cpu_online_mask)) {
134 		kfree(trial);
135 		return -EINVAL;
136 	}
137 
138 	if (!housekeeping.flags)
139 		static_branch_enable_cpuslocked(&housekeeping_overridden);
140 
141 	if (housekeeping.flags & HK_FLAG_DOMAIN)
142 		old = housekeeping_cpumask_dereference(HK_TYPE_DOMAIN);
143 	else
144 		WRITE_ONCE(housekeeping.flags, housekeeping.flags | HK_FLAG_DOMAIN);
145 	rcu_assign_pointer(housekeeping.cpumasks[HK_TYPE_DOMAIN], trial);
146 
147 	synchronize_rcu();
148 
149 	pci_probe_flush_workqueue();
150 	mem_cgroup_flush_workqueue();
151 	vmstat_flush_workqueue();
152 
153 	err = workqueue_unbound_housekeeping_update(housekeeping_cpumask(HK_TYPE_DOMAIN));
154 	WARN_ON_ONCE(err < 0);
155 
156 	err = tmigr_isolated_exclude_cpumask(isol_mask);
157 	WARN_ON_ONCE(err < 0);
158 
159 	err = kthreads_update_housekeeping();
160 	WARN_ON_ONCE(err < 0);
161 
162 	kfree(old);
163 
164 	return 0;
165 }
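/*
 * Caller-context sketch, hedged: the actual caller lives outside this file
 * (runtime CPU isolation, e.g. via cpuset isolated partitions, is an
 * assumption here).  "new_isolated" is a hypothetical mask of CPUs to
 * isolate.
 *
 *	int ret;
 *
 *	cpus_read_lock();
 *	ret = housekeeping_update(new_isolated);
 *	cpus_read_unlock();
 *
 * Holding the hotplug lock satisfies lockdep_assert_cpus_held() above and
 * keeps cpu_online_mask stable while the new HK_TYPE_DOMAIN mask is
 * validated and published.
 */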
166 
167 void __init housekeeping_init(void)
168 {
169 	enum hk_type type;
170 
171 	if (!housekeeping.flags)
172 		return;
173 
174 	static_branch_enable(&housekeeping_overridden);
175 
176 	if (housekeeping.flags & HK_FLAG_KERNEL_NOISE)
177 		sched_tick_offload_init();
178 	/*
179 	 * Reallocate with the slab allocator so that any later cpumask
180 	 * update can unconditionally free the old version with kfree().
181 	 */
182 	for_each_set_bit(type, &housekeeping.flags, HK_TYPE_MAX) {
183 		struct cpumask *omask, *nmask = kmalloc(cpumask_size(), GFP_KERNEL);
184 
185 		if (WARN_ON_ONCE(!nmask))
186 			return;
187 
188 		omask = rcu_dereference(housekeeping.cpumasks[type]);
189 
190 		/* We need at least one CPU to handle housekeeping work */
191 		WARN_ON_ONCE(cpumask_empty(omask));
192 		cpumask_copy(nmask, omask);
193 		RCU_INIT_POINTER(housekeeping.cpumasks[type], nmask);
194 		memblock_free(omask, cpumask_size());
195 	}
196 }
197 
198 static void __init housekeeping_setup_type(enum hk_type type,
199 					   cpumask_var_t housekeeping_staging)
200 {
201 	struct cpumask *mask = memblock_alloc_or_panic(cpumask_size(), SMP_CACHE_BYTES);
202 
203 	cpumask_copy(mask, housekeeping_staging);
204 	RCU_INIT_POINTER(housekeeping.cpumasks[type], mask);
205 }
206 
207 static int __init housekeeping_setup(char *str, unsigned long flags)
208 {
209 	cpumask_var_t non_housekeeping_mask, housekeeping_staging;
210 	unsigned int first_cpu;
211 	int err = 0;
212 
213 	if ((flags & HK_FLAG_KERNEL_NOISE) && !(housekeeping.flags & HK_FLAG_KERNEL_NOISE)) {
214 		if (!IS_ENABLED(CONFIG_NO_HZ_FULL)) {
215 			pr_warn("Housekeeping: nohz unsupported."
216 				" Build with CONFIG_NO_HZ_FULL\n");
217 			return 0;
218 		}
219 	}
220 
221 	alloc_bootmem_cpumask_var(&non_housekeeping_mask);
222 	if (cpulist_parse(str, non_housekeeping_mask) < 0) {
223 		pr_warn("Housekeeping: nohz_full= or isolcpus= incorrect CPU range\n");
224 		goto free_non_housekeeping_mask;
225 	}
226 
227 	alloc_bootmem_cpumask_var(&housekeeping_staging);
228 	cpumask_andnot(housekeeping_staging,
229 		       cpu_possible_mask, non_housekeeping_mask);
230 
231 	first_cpu = cpumask_first_and(cpu_present_mask, housekeeping_staging);
232 	if (first_cpu >= nr_cpu_ids || first_cpu >= setup_max_cpus) {
233 		__cpumask_set_cpu(smp_processor_id(), housekeeping_staging);
234 		__cpumask_clear_cpu(smp_processor_id(), non_housekeeping_mask);
235 		if (!housekeeping.flags) {
236 			pr_warn("Housekeeping: must include one present CPU, "
237 				"using boot CPU:%d\n", smp_processor_id());
238 		}
239 	}
240 
241 	if (cpumask_empty(non_housekeeping_mask))
242 		goto free_housekeeping_staging;
243 
244 	if (!housekeeping.flags) {
245 		/* First setup call ("nohz_full=" or "isolcpus=") */
246 		enum hk_type type;
247 
248 		for_each_set_bit(type, &flags, HK_TYPE_MAX)
249 			housekeeping_setup_type(type, housekeeping_staging);
250 	} else {
251 		/* Second setup call ("nohz_full=" after "isolcpus=" or the reverse) */
252 		enum hk_type type;
253 		unsigned long iter_flags = flags & housekeeping.flags;
254 
255 		for_each_set_bit(type, &iter_flags, HK_TYPE_MAX) {
256 			if (!cpumask_equal(housekeeping_staging,
257 					   housekeeping_cpumask(type))) {
258 				pr_warn("Housekeeping: nohz_full= must match isolcpus=\n");
259 				goto free_housekeeping_staging;
260 			}
261 		}
262 
263 		/*
264 		 * Check the combination of nohz_full= and isolcpus=domain:
265 		 * at least one present CPU must remain housekeeping for both,
266 		 * otherwise the timer migration hierarchy has no CPU left to run
267 		 * on. managed_irq is ignored since timer migration ignores it.
268 		 */
269 		iter_flags = housekeeping.flags & (HK_FLAG_KERNEL_NOISE | HK_FLAG_DOMAIN);
270 		type = find_first_bit(&iter_flags, HK_TYPE_MAX);
271 		/*
272 		 * The check trivially passes if none of these flags was
273 		 * previously set or none of them is in the current selection.
274 		 */
275 		iter_flags = flags & (HK_FLAG_KERNEL_NOISE | HK_FLAG_DOMAIN);
276 		first_cpu = (type == HK_TYPE_MAX || !iter_flags) ? 0 :
277 			    cpumask_first_and_and(cpu_present_mask,
278 						  housekeeping_staging, housekeeping_cpumask(type));
279 		if (first_cpu >= min(nr_cpu_ids, setup_max_cpus)) {
280 			pr_warn("Housekeeping: must include one present CPU "
281 				"neither in nohz_full= nor in isolcpus=domain, "
282 				"ignoring setting %s\n", str);
283 			goto free_housekeeping_staging;
284 		}
285 
286 		iter_flags = flags & ~housekeeping.flags;
287 
288 		for_each_set_bit(type, &iter_flags, HK_TYPE_MAX)
289 			housekeeping_setup_type(type, housekeeping_staging);
290 	}
291 
292 	if ((flags & HK_FLAG_KERNEL_NOISE) && !(housekeeping.flags & HK_FLAG_KERNEL_NOISE))
293 		tick_nohz_full_setup(non_housekeeping_mask);
294 
295 	housekeeping.flags |= flags;
296 	err = 1;
297 
298 free_housekeeping_staging:
299 	free_bootmem_cpumask_var(housekeeping_staging);
300 free_non_housekeeping_mask:
301 	free_bootmem_cpumask_var(non_housekeeping_mask);
302 
303 	return err;
304 }
305 
306 static int __init housekeeping_nohz_full_setup(char *str)
307 {
308 	unsigned long flags;
309 
310 	flags = HK_FLAG_KERNEL_NOISE;
311 
312 	return housekeeping_setup(str, flags);
313 }
314 __setup("nohz_full=", housekeeping_nohz_full_setup);
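/*
 * Example boot usage (illustrative): "nohz_full=2-7" marks CPUs 2-7 as
 * kernel-noise isolated (tick, unbound timers and other offloadable work
 * are kept away), leaving the remaining CPUs, here 0-1, as the
 * HK_TYPE_KERNEL_NOISE housekeeping set.
 */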
315 
316 static int __init housekeeping_isolcpus_setup(char *str)
317 {
318 	unsigned long flags = 0;
319 	bool illegal = false;
320 	char *par;
321 	int len;
322 
323 	while (isalpha(*str)) {
324 		/*
325 		 * isolcpus=nohz is equivalent to nohz_full.
326 		 */
327 		if (!strncmp(str, "nohz,", 5)) {
328 			str += 5;
329 			flags |= HK_FLAG_KERNEL_NOISE;
330 			continue;
331 		}
332 
333 		if (!strncmp(str, "domain,", 7)) {
334 			str += 7;
335 			flags |= HK_FLAG_DOMAIN | HK_FLAG_DOMAIN_BOOT;
336 			continue;
337 		}
338 
339 		if (!strncmp(str, "managed_irq,", 12)) {
340 			str += 12;
341 			flags |= HK_FLAG_MANAGED_IRQ;
342 			continue;
343 		}
344 
345 		/*
346 		 * Skip an unknown sub-parameter and validate that it does
347 		 * not contain an invalid character.
348 		 */
349 		for (par = str, len = 0; *str && *str != ','; str++, len++) {
350 			if (!isalpha(*str) && *str != '_')
351 				illegal = true;
352 		}
353 
354 		if (illegal) {
355 			pr_warn("isolcpus: Invalid flag %.*s\n", len, par);
356 			return 0;
357 		}
358 
359 		pr_info("isolcpus: Skipped unknown flag %.*s\n", len, par);
360 		str++;
361 	}
362 
363 	/* Default behaviour for isolcpus without flags */
364 	if (!flags)
365 		flags |= HK_FLAG_DOMAIN | HK_FLAG_DOMAIN_BOOT;
366 
367 	return housekeeping_setup(str, flags);
368 }
369 __setup("isolcpus=", housekeeping_isolcpus_setup);
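/*
 * Example boot usage (illustrative):
 *
 *	isolcpus=3,5-7			 default: domain isolation only
 *	isolcpus=nohz,domain,3-7	 domain isolation plus kernel-noise
 *					 avoidance; the nohz flag is
 *					 equivalent to "nohz_full=3-7"
 *	isolcpus=managed_irq,domain,2-5	 additionally keep managed IRQ
 *					 affinities away from CPUs 2-5
 *					 where possible
 */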
370