xref: /linux/kernel/sched/isolation.c (revision 57885276cc16a2e2b76282c808a4e84cbecb3aae)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  *  Housekeeping management. Manage the targets for routine code that can run on
4  *  any CPU: unbound workqueues, timers, kthreads and any offloadable work.
5  *
6  * Copyright (C) 2017 Red Hat, Inc., Frederic Weisbecker
7  * Copyright (C) 2017-2018 SUSE, Frederic Weisbecker
8  *
9  */
10 #include <linux/sched/isolation.h>
11 #include <linux/pci.h>
12 #include "sched.h"
13 
/*
 * Bit flags derived from the housekeeping types, tracking which isolation
 * features have been requested (boot parameters or runtime updates).
 */
enum hk_flags {
	HK_FLAG_DOMAIN_BOOT	= BIT(HK_TYPE_DOMAIN_BOOT),	/* "isolcpus=domain" mask as set at boot; base for runtime updates */
	HK_FLAG_DOMAIN		= BIT(HK_TYPE_DOMAIN),		/* current domain isolation mask (may be rewritten at runtime) */
	HK_FLAG_MANAGED_IRQ	= BIT(HK_TYPE_MANAGED_IRQ),	/* "isolcpus=managed_irq" */
	HK_FLAG_KERNEL_NOISE	= BIT(HK_TYPE_KERNEL_NOISE),	/* "nohz_full=" or "isolcpus=nohz" (needs CONFIG_NO_HZ_FULL) */
};
20 
/* Enabled once any housekeeping cpumask overrides the default (all possible CPUs). */
DEFINE_STATIC_KEY_FALSE(housekeeping_overridden);
EXPORT_SYMBOL_GPL(housekeeping_overridden);
23 
struct housekeeping {
	/* Per-type housekeeping CPU masks; RCU-protected (HK_TYPE_DOMAIN is updated at runtime) */
	struct cpumask __rcu *cpumasks[HK_TYPE_MAX];
	/* HK_FLAG_* bits for the types that have a mask installed */
	unsigned long flags;
};

static struct housekeeping housekeeping;
30 
31 bool housekeeping_enabled(enum hk_type type)
32 {
33 	return !!(READ_ONCE(housekeeping.flags) & BIT(type));
34 }
35 EXPORT_SYMBOL_GPL(housekeeping_enabled);
36 
37 static bool housekeeping_dereference_check(enum hk_type type)
38 {
39 	if (IS_ENABLED(CONFIG_LOCKDEP) && type == HK_TYPE_DOMAIN) {
40 		/* Cpuset isn't even writable yet? */
41 		if (system_state <= SYSTEM_SCHEDULING)
42 			return true;
43 
44 		/* CPU hotplug write locked, so cpuset partition can't be overwritten */
45 		if (IS_ENABLED(CONFIG_HOTPLUG_CPU) && lockdep_is_cpus_write_held())
46 			return true;
47 
48 		/* Cpuset lock held, partitions not writable */
49 		if (IS_ENABLED(CONFIG_CPUSETS) && lockdep_is_cpuset_held())
50 			return true;
51 
52 		return false;
53 	}
54 
55 	return true;
56 }
57 
/*
 * Fetch the housekeeping cpumask for @type. Legal outside an RCU read-side
 * critical section only in the contexts accepted by
 * housekeeping_dereference_check().
 */
static inline struct cpumask *housekeeping_cpumask_dereference(enum hk_type type)
{
	return rcu_dereference_all_check(housekeeping.cpumasks[type],
					 housekeeping_dereference_check(type));
}
63 
64 const struct cpumask *housekeeping_cpumask(enum hk_type type)
65 {
66 	const struct cpumask *mask = NULL;
67 
68 	if (static_branch_unlikely(&housekeeping_overridden)) {
69 		if (READ_ONCE(housekeeping.flags) & BIT(type))
70 			mask = housekeeping_cpumask_dereference(type);
71 	}
72 	if (!mask)
73 		mask = cpu_possible_mask;
74 	return mask;
75 }
76 EXPORT_SYMBOL_GPL(housekeeping_cpumask);
77 
78 int housekeeping_any_cpu(enum hk_type type)
79 {
80 	int cpu;
81 
82 	if (static_branch_unlikely(&housekeeping_overridden)) {
83 		if (housekeeping.flags & BIT(type)) {
84 			cpu = sched_numa_find_closest(housekeeping_cpumask(type), smp_processor_id());
85 			if (cpu < nr_cpu_ids)
86 				return cpu;
87 
88 			cpu = cpumask_any_and_distribute(housekeeping_cpumask(type), cpu_online_mask);
89 			if (likely(cpu < nr_cpu_ids))
90 				return cpu;
91 			/*
92 			 * Unless we have another problem this can only happen
93 			 * at boot time before start_secondary() brings the 1st
94 			 * housekeeping CPU up.
95 			 */
96 			WARN_ON_ONCE(system_state == SYSTEM_RUNNING ||
97 				     type != HK_TYPE_TIMER);
98 		}
99 	}
100 	return smp_processor_id();
101 }
102 EXPORT_SYMBOL_GPL(housekeeping_any_cpu);
103 
104 void housekeeping_affine(struct task_struct *t, enum hk_type type)
105 {
106 	if (static_branch_unlikely(&housekeeping_overridden))
107 		if (housekeeping.flags & BIT(type))
108 			set_cpus_allowed_ptr(t, housekeeping_cpumask(type));
109 }
110 EXPORT_SYMBOL_GPL(housekeeping_affine);
111 
112 bool housekeeping_test_cpu(int cpu, enum hk_type type)
113 {
114 	if (static_branch_unlikely(&housekeeping_overridden) &&
115 	    READ_ONCE(housekeeping.flags) & BIT(type))
116 		return cpumask_test_cpu(cpu, housekeeping_cpumask(type));
117 	return true;
118 }
119 EXPORT_SYMBOL_GPL(housekeeping_test_cpu);
120 
/*
 * housekeeping_update - Install a new HK_TYPE_DOMAIN housekeeping cpumask
 * @isol_mask: CPUs to isolate, subtracted from the boot-time domain mask
 *
 * The new domain housekeeping mask is HK_TYPE_DOMAIN_BOOT minus @isol_mask;
 * it must intersect the online CPUs so at least one CPU remains available
 * for housekeeping work. The new mask is then propagated to unbound
 * workqueues, the timer migration hierarchy and kthreads.
 *
 * NOTE(review): presumably called under one of the locks accepted by
 * housekeeping_dereference_check() (cpus write lock or cpuset lock) so
 * that concurrent updates can't race — confirm at call sites.
 *
 * Returns: 0 on success, -ENOMEM on allocation failure, -EINVAL if no
 * online housekeeping CPU would remain.
 */
int housekeeping_update(struct cpumask *isol_mask)
{
	struct cpumask *trial, *old = NULL;
	int err;

	trial = kmalloc(cpumask_size(), GFP_KERNEL);
	if (!trial)
		return -ENOMEM;

	/* New domain mask = boot-time domain mask minus the isolated CPUs */
	cpumask_andnot(trial, housekeeping_cpumask(HK_TYPE_DOMAIN_BOOT), isol_mask);
	if (!cpumask_intersects(trial, cpu_online_mask)) {
		kfree(trial);
		return -EINVAL;
	}

	/* First override ever: activate the fast-path static key */
	if (!housekeeping.flags)
		static_branch_enable(&housekeeping_overridden);

	/* Keep a reference to the replaced mask (if any) so it can be freed */
	if (housekeeping.flags & HK_FLAG_DOMAIN)
		old = housekeeping_cpumask_dereference(HK_TYPE_DOMAIN);
	else
		WRITE_ONCE(housekeeping.flags, housekeeping.flags | HK_FLAG_DOMAIN);
	rcu_assign_pointer(housekeeping.cpumasks[HK_TYPE_DOMAIN], trial);

	/* Make sure no reader can still see (and use) the old mask */
	synchronize_rcu();

	/* Drain pending work before the mask change takes effect downstream */
	pci_probe_flush_workqueue();
	mem_cgroup_flush_workqueue();
	vmstat_flush_workqueue();

	/* Propagate the new housekeeping mask to the affected subsystems */
	err = workqueue_unbound_housekeeping_update(housekeeping_cpumask(HK_TYPE_DOMAIN));
	WARN_ON_ONCE(err < 0);

	err = tmigr_isolated_exclude_cpumask(isol_mask);
	WARN_ON_ONCE(err < 0);

	err = kthreads_update_housekeeping();
	WARN_ON_ONCE(err < 0);

	kfree(old);

	return 0;
}
164 
/*
 * Finalize boot-time housekeeping setup: enable the override static key,
 * initialize tick offload (sched_tick_offload_init()) when kernel noise
 * isolation was requested, and migrate the memblock-allocated boot
 * cpumasks to kmalloc()ed copies.
 */
void __init housekeeping_init(void)
{
	enum hk_type type;

	/* Nothing was requested on the command line */
	if (!housekeeping.flags)
		return;

	static_branch_enable(&housekeeping_overridden);

	if (housekeeping.flags & HK_FLAG_KERNEL_NOISE)
		sched_tick_offload_init();
	/*
	 * Realloc with a proper allocator so that any cpumask update
	 * can indifferently free the old version with kfree().
	 */
	for_each_set_bit(type, &housekeeping.flags, HK_TYPE_MAX) {
		struct cpumask *omask, *nmask = kmalloc(cpumask_size(), GFP_KERNEL);

		if (WARN_ON_ONCE(!nmask))
			return;

		omask = rcu_dereference(housekeeping.cpumasks[type]);

		/* We need at least one CPU to handle housekeeping work */
		WARN_ON_ONCE(cpumask_empty(omask));
		cpumask_copy(nmask, omask);
		RCU_INIT_POINTER(housekeeping.cpumasks[type], nmask);
		memblock_free(omask, cpumask_size());
	}
}
195 
/*
 * Install the boot-time housekeeping cpumask for @type from the staging
 * mask. Allocated with memblock because this runs before the slab
 * allocator is up; housekeeping_init() later migrates it to kmalloc().
 */
static void __init housekeeping_setup_type(enum hk_type type,
					   cpumask_var_t housekeeping_staging)
{
	struct cpumask *mask = memblock_alloc_or_panic(cpumask_size(), SMP_CACHE_BYTES);

	cpumask_copy(mask, housekeeping_staging);
	RCU_INIT_POINTER(housekeeping.cpumasks[type], mask);
}
204 
/*
 * housekeeping_setup - Common handler for "nohz_full=" and "isolcpus="
 * @str: CPU list of the CPUs to isolate
 * @flags: HK_FLAG_* types this boot parameter applies to
 *
 * May run twice (once per parameter). When both parameters cover the same
 * type, their CPU lists must match exactly.
 *
 * Returns 1 when the parameter was consumed, 0 on error (per the
 * __setup() handler convention).
 */
static int __init housekeeping_setup(char *str, unsigned long flags)
{
	cpumask_var_t non_housekeeping_mask, housekeeping_staging;
	unsigned int first_cpu;
	int err = 0;

	/* Kernel noise isolation depends on the full dynticks infrastructure */
	if ((flags & HK_FLAG_KERNEL_NOISE) && !(housekeeping.flags & HK_FLAG_KERNEL_NOISE)) {
		if (!IS_ENABLED(CONFIG_NO_HZ_FULL)) {
			pr_warn("Housekeeping: nohz unsupported."
				" Build with CONFIG_NO_HZ_FULL\n");
			return 0;
		}
	}

	alloc_bootmem_cpumask_var(&non_housekeeping_mask);
	if (cpulist_parse(str, non_housekeeping_mask) < 0) {
		pr_warn("Housekeeping: nohz_full= or isolcpus= incorrect CPU range\n");
		goto free_non_housekeeping_mask;
	}

	/* Housekeeping CPUs = all possible CPUs minus the isolated ones */
	alloc_bootmem_cpumask_var(&housekeeping_staging);
	cpumask_andnot(housekeeping_staging,
		       cpu_possible_mask, non_housekeeping_mask);

	/* Force the boot CPU in when no present CPU would remain for housekeeping */
	first_cpu = cpumask_first_and(cpu_present_mask, housekeeping_staging);
	if (first_cpu >= nr_cpu_ids || first_cpu >= setup_max_cpus) {
		__cpumask_set_cpu(smp_processor_id(), housekeeping_staging);
		__cpumask_clear_cpu(smp_processor_id(), non_housekeeping_mask);
		if (!housekeeping.flags) {
			pr_warn("Housekeeping: must include one present CPU, "
				"using boot CPU:%d\n", smp_processor_id());
		}
	}

	/* Nothing to isolate: keep the defaults (err stays 0 -> rejected) */
	if (cpumask_empty(non_housekeeping_mask))
		goto free_housekeeping_staging;

	if (!housekeeping.flags) {
		/* First setup call ("nohz_full=" or "isolcpus=") */
		enum hk_type type;

		for_each_set_bit(type, &flags, HK_TYPE_MAX)
			housekeeping_setup_type(type, housekeeping_staging);
	} else {
		/* Second setup call ("nohz_full=" after "isolcpus=" or the reverse) */
		enum hk_type type;
		unsigned long iter_flags = flags & housekeeping.flags;

		/* Types requested twice must carry identical CPU lists */
		for_each_set_bit(type, &iter_flags, HK_TYPE_MAX) {
			if (!cpumask_equal(housekeeping_staging,
					   housekeeping_cpumask(type))) {
				pr_warn("Housekeeping: nohz_full= must match isolcpus=\n");
				goto free_housekeeping_staging;
			}
		}

		/*
		 * Check the combination of nohz_full and isolcpus=domain,
		 * necessary to avoid problems with the timer migration
		 * hierarchy. managed_irq is ignored by this check since it
		 * isn't considered in the timer migration logic.
		 */
		iter_flags = housekeeping.flags & (HK_FLAG_KERNEL_NOISE | HK_FLAG_DOMAIN);
		type = find_first_bit(&iter_flags, HK_TYPE_MAX);
		/*
		 * Pass the check if none of these flags were previously set or
		 * are not in the current selection.
		 */
		iter_flags = flags & (HK_FLAG_KERNEL_NOISE | HK_FLAG_DOMAIN);
		first_cpu = (type == HK_TYPE_MAX || !iter_flags) ? 0 :
			    cpumask_first_and_and(cpu_present_mask,
						  housekeeping_staging, housekeeping_cpumask(type));
		if (first_cpu >= min(nr_cpu_ids, setup_max_cpus)) {
			pr_warn("Housekeeping: must include one present CPU "
				"neither in nohz_full= nor in isolcpus=domain, "
				"ignoring setting %s\n", str);
			goto free_housekeeping_staging;
		}

		/* Install masks only for the types not yet configured */
		iter_flags = flags & ~housekeeping.flags;

		for_each_set_bit(type, &iter_flags, HK_TYPE_MAX)
			housekeeping_setup_type(type, housekeeping_staging);
	}

	/* Hand the isolated set over to dynticks, once only */
	if ((flags & HK_FLAG_KERNEL_NOISE) && !(housekeeping.flags & HK_FLAG_KERNEL_NOISE))
		tick_nohz_full_setup(non_housekeeping_mask);

	housekeeping.flags |= flags;
	err = 1;

free_housekeeping_staging:
	free_bootmem_cpumask_var(housekeeping_staging);
free_non_housekeeping_mask:
	free_bootmem_cpumask_var(non_housekeeping_mask);

	return err;
}
303 
304 static int __init housekeeping_nohz_full_setup(char *str)
305 {
306 	unsigned long flags;
307 
308 	flags = HK_FLAG_KERNEL_NOISE;
309 
310 	return housekeeping_setup(str, flags);
311 }
312 __setup("nohz_full=", housekeeping_nohz_full_setup);
313 
314 static int __init housekeeping_isolcpus_setup(char *str)
315 {
316 	unsigned long flags = 0;
317 	bool illegal = false;
318 	char *par;
319 	int len;
320 
321 	while (isalpha(*str)) {
322 		/*
323 		 * isolcpus=nohz is equivalent to nohz_full.
324 		 */
325 		if (!strncmp(str, "nohz,", 5)) {
326 			str += 5;
327 			flags |= HK_FLAG_KERNEL_NOISE;
328 			continue;
329 		}
330 
331 		if (!strncmp(str, "domain,", 7)) {
332 			str += 7;
333 			flags |= HK_FLAG_DOMAIN | HK_FLAG_DOMAIN_BOOT;
334 			continue;
335 		}
336 
337 		if (!strncmp(str, "managed_irq,", 12)) {
338 			str += 12;
339 			flags |= HK_FLAG_MANAGED_IRQ;
340 			continue;
341 		}
342 
343 		/*
344 		 * Skip unknown sub-parameter and validate that it is not
345 		 * containing an invalid character.
346 		 */
347 		for (par = str, len = 0; *str && *str != ','; str++, len++) {
348 			if (!isalpha(*str) && *str != '_')
349 				illegal = true;
350 		}
351 
352 		if (illegal) {
353 			pr_warn("isolcpus: Invalid flag %.*s\n", len, par);
354 			return 0;
355 		}
356 
357 		pr_info("isolcpus: Skipped unknown flag %.*s\n", len, par);
358 		str++;
359 	}
360 
361 	/* Default behaviour for isolcpus without flags */
362 	if (!flags)
363 		flags |= HK_FLAG_DOMAIN | HK_FLAG_DOMAIN_BOOT;
364 
365 	return housekeeping_setup(str, flags);
366 }
367 __setup("isolcpus=", housekeeping_isolcpus_setup);
368