// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2021 VMware Inc, Steven Rostedt <rostedt@goodmis.org>
 */
#include <linux/spinlock.h>
#include <linux/seqlock.h>
#include <linux/irq_work.h>
#include <linux/slab.h>
#include "trace.h"

/* See pid_list.h for details */

static inline union lower_chunk *get_lower_chunk(struct trace_pid_list *pid_list)
{
	union lower_chunk *chunk;

	lockdep_assert_held(&pid_list->lock);

	if (!pid_list->lower_list)
		return NULL;

	chunk = pid_list->lower_list;
	pid_list->lower_list = chunk->next;
	pid_list->free_lower_chunks--;
	WARN_ON_ONCE(pid_list->free_lower_chunks < 0);
	chunk->next = NULL;
	/*
	 * If a refill needs to happen, it cannot happen here
	 * as the scheduler run queue locks are held.
	 */
	if (pid_list->free_lower_chunks <= CHUNK_REALLOC)
		irq_work_queue(&pid_list->refill_irqwork);

	return chunk;
}

static inline union upper_chunk *get_upper_chunk(struct trace_pid_list *pid_list)
{
	union upper_chunk *chunk;

	lockdep_assert_held(&pid_list->lock);

	if (!pid_list->upper_list)
		return NULL;

	chunk = pid_list->upper_list;
	pid_list->upper_list = chunk->next;
	pid_list->free_upper_chunks--;
	WARN_ON_ONCE(pid_list->free_upper_chunks < 0);
	chunk->next = NULL;
	/*
	 * If a refill needs to happen, it cannot happen here
	 * as the scheduler run queue locks are held.
	 */
	if (pid_list->free_upper_chunks <= CHUNK_REALLOC)
		irq_work_queue(&pid_list->refill_irqwork);

	return chunk;
}

static inline void put_lower_chunk(struct trace_pid_list *pid_list,
				   union lower_chunk *chunk)
{
	lockdep_assert_held(&pid_list->lock);

	chunk->next = pid_list->lower_list;
	pid_list->lower_list = chunk;
	pid_list->free_lower_chunks++;
}

static inline void put_upper_chunk(struct trace_pid_list *pid_list,
				   union upper_chunk *chunk)
{
	lockdep_assert_held(&pid_list->lock);

	chunk->next = pid_list->upper_list;
	pid_list->upper_list = chunk;
	pid_list->free_upper_chunks++;
}

static inline bool upper_empty(union upper_chunk *chunk)
{
	/*
	 * If chunk->data has no lower chunks, it will be the same
	 * as a zeroed bitmask.
	 */
	return bitmap_empty((unsigned long *)chunk->data, BITS_PER_TYPE(chunk->data));
}

static inline int pid_split(unsigned int pid, unsigned int *upper1,
			     unsigned int *upper2, unsigned int *lower)
{
	/* MAX_PID should cover all pids */
	BUILD_BUG_ON(MAX_PID < PID_MAX_LIMIT);

	/* In case a bad pid is passed in, fail */
	if (unlikely(pid >= MAX_PID))
		return -1;

	*upper1 = (pid >> UPPER1_SHIFT) & UPPER_MASK;
	*upper2 = (pid >> UPPER2_SHIFT) & UPPER_MASK;
	*lower = pid & LOWER_MASK;

	return 0;
}

static inline unsigned int pid_join(unsigned int upper1,
				    unsigned int upper2, unsigned int lower)
{
	return ((upper1 & UPPER_MASK) << UPPER1_SHIFT) |
		((upper2 & UPPER_MASK) << UPPER2_SHIFT) |
		(lower & LOWER_MASK);
}
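
/*
 * Illustrative sketch (not part of the original file): pid_split() and
 * pid_join() are exact inverses for any pid below MAX_PID, which is
 * what lets trace_pid_list_next() rebuild a pid from the three indexes
 * it walks. The pid value below is arbitrary.
 */
static inline void pid_split_join_example(void)
{
	unsigned int upper1, upper2, lower;
	const unsigned int pid = 4097;

	if (!pid_split(pid, &upper1, &upper2, &lower))
		WARN_ON_ONCE(pid_join(upper1, upper2, lower) != pid);
}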

/**
 * trace_pid_list_is_set - test if the pid is set in the list
 * @pid_list: The pid list to test
 * @pid: The pid to see if set in the list.
 *
 * Tests if @pid is set in the @pid_list. This is usually called
 * from the scheduler when a task is scheduled. The task's pid is
 * checked to see whether it should be traced or not.
 *
 * Return true if the pid is in the list, false otherwise.
 */
bool trace_pid_list_is_set(struct trace_pid_list *pid_list, unsigned int pid)
{
	union upper_chunk *upper_chunk;
	union lower_chunk *lower_chunk;
	unsigned int seq;
	unsigned int upper1;
	unsigned int upper2;
	unsigned int lower;
	bool ret = false;

	if (!pid_list)
		return false;

	if (pid_split(pid, &upper1, &upper2, &lower) < 0)
		return false;

	do {
		seq = read_seqcount_begin(&pid_list->seqcount);
		ret = false;
		upper_chunk = pid_list->upper[upper1];
		if (upper_chunk) {
			lower_chunk = upper_chunk->data[upper2];
			if (lower_chunk)
				ret = test_bit(lower, lower_chunk->data);
		}
	} while (read_seqcount_retry(&pid_list->seqcount, seq));

	return ret;
}
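
/*
 * Illustrative sketch (hypothetical helper, not part of the original
 * file): because the read side above takes no lock, a caller can test
 * many pids cheaply from any context. Each call is internally
 * consistent via the seqcount, but the list may change between calls.
 */
static inline int pid_list_example_count_set(struct trace_pid_list *pid_list,
					     const unsigned int *pids, int cnt)
{
	int i, set = 0;

	for (i = 0; i < cnt; i++) {
		if (trace_pid_list_is_set(pid_list, pids[i]))
			set++;
	}
	return set;
}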

/**
 * trace_pid_list_set - add a pid to the list
 * @pid_list: The pid list to add the @pid to.
 * @pid: The pid to add.
 *
 * Adds @pid to @pid_list. This is usually done explicitly by a user
 * adding a task to be traced, or indirectly by the fork function
 * when children should be traced and a task's pid is in the list.
 *
 * Return 0 on success, negative otherwise.
 */
int trace_pid_list_set(struct trace_pid_list *pid_list, unsigned int pid)
{
	union upper_chunk *upper_chunk;
	union lower_chunk *lower_chunk;
	unsigned long flags;
	unsigned int upper1;
	unsigned int upper2;
	unsigned int lower;
	int ret;

	if (!pid_list)
		return -ENODEV;

	if (pid_split(pid, &upper1, &upper2, &lower) < 0)
		return -EINVAL;

	raw_spin_lock_irqsave(&pid_list->lock, flags);
	write_seqcount_begin(&pid_list->seqcount);
	upper_chunk = pid_list->upper[upper1];
	if (!upper_chunk) {
		upper_chunk = get_upper_chunk(pid_list);
		if (!upper_chunk) {
			ret = -ENOMEM;
			goto out;
		}
		pid_list->upper[upper1] = upper_chunk;
	}
	lower_chunk = upper_chunk->data[upper2];
	if (!lower_chunk) {
		lower_chunk = get_lower_chunk(pid_list);
		if (!lower_chunk) {
			ret = -ENOMEM;
			goto out;
		}
		upper_chunk->data[upper2] = lower_chunk;
	}
	set_bit(lower, lower_chunk->data);
	ret = 0;
 out:
	write_seqcount_end(&pid_list->seqcount);
	raw_spin_unlock_irqrestore(&pid_list->lock, flags);
	return ret;
}
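
/*
 * Illustrative sketch (hypothetical helper, not part of the original
 * file): how a fork hook could propagate tracing from parent to child,
 * as described in the comment above. Both calls take the list's lock
 * internally, so the caller needs no locking of its own.
 */
static inline void pid_list_example_fork(struct trace_pid_list *pid_list,
					 unsigned int parent_pid,
					 unsigned int child_pid)
{
	if (trace_pid_list_is_set(pid_list, parent_pid))
		trace_pid_list_set(pid_list, child_pid); /* -ENOMEM ignored for brevity */
}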

/**
 * trace_pid_list_clear - remove a pid from the list
 * @pid_list: The pid list to remove the @pid from.
 * @pid: The pid to remove.
 *
 * Removes @pid from @pid_list. This is usually done explicitly by a user
 * removing tasks from tracing, or indirectly by the exit function
 * when a task that is set to be traced exits.
 *
 * Return 0 on success, negative otherwise.
 */
int trace_pid_list_clear(struct trace_pid_list *pid_list, unsigned int pid)
{
	union upper_chunk *upper_chunk;
	union lower_chunk *lower_chunk;
	unsigned long flags;
	unsigned int upper1;
	unsigned int upper2;
	unsigned int lower;

	if (!pid_list)
		return -ENODEV;

	if (pid_split(pid, &upper1, &upper2, &lower) < 0)
		return -EINVAL;

	raw_spin_lock_irqsave(&pid_list->lock, flags);
	write_seqcount_begin(&pid_list->seqcount);
	upper_chunk = pid_list->upper[upper1];
	if (!upper_chunk)
		goto out;

	lower_chunk = upper_chunk->data[upper2];
	if (!lower_chunk)
		goto out;

	clear_bit(lower, lower_chunk->data);

	/* if there are no more bits set, add it to the free list */
	if (find_first_bit(lower_chunk->data, LOWER_MAX) >= LOWER_MAX) {
		put_lower_chunk(pid_list, lower_chunk);
		upper_chunk->data[upper2] = NULL;
		if (upper_empty(upper_chunk)) {
			put_upper_chunk(pid_list, upper_chunk);
			pid_list->upper[upper1] = NULL;
		}
	}
 out:
	write_seqcount_end(&pid_list->seqcount);
	raw_spin_unlock_irqrestore(&pid_list->lock, flags);
	return 0;
}
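
/*
 * Illustrative sketch (hypothetical helper, not part of the original
 * file): an exit hook would simply drop the departing task's pid.
 * When the last bit in a lower chunk clears, the chunk migrates back
 * to the free list inside trace_pid_list_clear() itself.
 */
static inline void pid_list_example_exit(struct trace_pid_list *pid_list,
					 unsigned int pid)
{
	trace_pid_list_clear(pid_list, pid);
}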

/**
 * trace_pid_list_next - return the next pid in the list
 * @pid_list: The pid list to examine.
 * @pid: The pid to start from
 * @next: The pointer to place the next pid that is set, starting from @pid.
 *
 * Looks for the next consecutive pid that is in @pid_list starting
 * at the pid specified by @pid. If one is set (including @pid), then
 * that pid is placed into @next.
 *
 * Return 0 when a pid is found, -1 if there are no more pids included.
 */
int trace_pid_list_next(struct trace_pid_list *pid_list, unsigned int pid,
			unsigned int *next)
{
	union upper_chunk *upper_chunk;
	union lower_chunk *lower_chunk;
	unsigned long flags;
	unsigned int upper1;
	unsigned int upper2;
	unsigned int lower;

	if (!pid_list)
		return -ENODEV;

	if (pid_split(pid, &upper1, &upper2, &lower) < 0)
		return -EINVAL;

	raw_spin_lock_irqsave(&pid_list->lock, flags);
	for (; upper1 <= UPPER_MASK; upper1++, upper2 = 0) {
		upper_chunk = pid_list->upper[upper1];

		if (!upper_chunk)
			continue;

		for (; upper2 <= UPPER_MASK; upper2++, lower = 0) {
			lower_chunk = upper_chunk->data[upper2];
			if (!lower_chunk)
				continue;

			lower = find_next_bit(lower_chunk->data, LOWER_MAX,
					    lower);
			if (lower < LOWER_MAX)
				goto found;
		}
	}

 found:
	raw_spin_unlock_irqrestore(&pid_list->lock, flags);
	if (upper1 > UPPER_MASK)
		return -1;

	*next = pid_join(upper1, upper2, lower);
	return 0;
}

/**
 * trace_pid_list_first - return the first pid in the list
 * @pid_list: The pid list to examine.
 * @pid: The pointer to place the first found pid that is set.
 *
 * Looks for the first pid that is set in @pid_list, and places it
 * into @pid if found.
 *
 * Return 0 when a pid is found, -1 if there are no pids set.
 */
int trace_pid_list_first(struct trace_pid_list *pid_list, unsigned int *pid)
{
	return trace_pid_list_next(pid_list, 0, pid);
}
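
/*
 * Illustrative sketch (hypothetical helper, not part of the original
 * file): walking every pid in the list with the first/next iterators.
 * trace_pid_list_next() includes @pid itself, so the walk advances by
 * asking for pid + 1 on each step.
 */
static inline void pid_list_example_walk(struct trace_pid_list *pid_list)
{
	unsigned int pid;
	int ret;

	for (ret = trace_pid_list_first(pid_list, &pid);
	     !ret;
	     ret = trace_pid_list_next(pid_list, pid + 1, &pid))
		pr_info("pid %u is set\n", pid);
}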

static void pid_list_refill_irq(struct irq_work *iwork)
{
	struct trace_pid_list *pid_list = container_of(iwork, struct trace_pid_list,
						       refill_irqwork);
	union upper_chunk *upper = NULL;
	union lower_chunk *lower = NULL;
	union upper_chunk **upper_next = &upper;
	union lower_chunk **lower_next = &lower;
	int upper_count;
	int lower_count;
	int ucnt = 0;
	int lcnt = 0;

 again:
	raw_spin_lock(&pid_list->lock);
	write_seqcount_begin(&pid_list->seqcount);
	upper_count = CHUNK_ALLOC - pid_list->free_upper_chunks;
	lower_count = CHUNK_ALLOC - pid_list->free_lower_chunks;
	write_seqcount_end(&pid_list->seqcount);
	raw_spin_unlock(&pid_list->lock);

	if (upper_count <= 0 && lower_count <= 0)
		return;

	while (upper_count-- > 0) {
		union upper_chunk *chunk;

		chunk = kzalloc(sizeof(*chunk), GFP_NOWAIT);
		if (!chunk)
			break;
		*upper_next = chunk;
		upper_next = &chunk->next;
		ucnt++;
	}

	while (lower_count-- > 0) {
		union lower_chunk *chunk;

		chunk = kzalloc(sizeof(*chunk), GFP_NOWAIT);
		if (!chunk)
			break;
		*lower_next = chunk;
		lower_next = &chunk->next;
		lcnt++;
	}

	raw_spin_lock(&pid_list->lock);
	write_seqcount_begin(&pid_list->seqcount);
	if (upper) {
		*upper_next = pid_list->upper_list;
		pid_list->upper_list = upper;
		pid_list->free_upper_chunks += ucnt;
	}
	if (lower) {
		*lower_next = pid_list->lower_list;
		pid_list->lower_list = lower;
		pid_list->free_lower_chunks += lcnt;
	}
	write_seqcount_end(&pid_list->seqcount);
	raw_spin_unlock(&pid_list->lock);

	/*
	 * If all the chunks were allocated successfully, both counters
	 * will have gone below zero. If either did not, an allocation
	 * failed, and we should not try again.
	 */
	if (upper_count >= 0 || lower_count >= 0)
		return;
	/*
	 * While the locks were released, free chunks could have been
	 * consumed, so more may need to be allocated. Might as well
	 * allocate them now.
	 */
	goto again;
}

/**
 * trace_pid_list_alloc - create a new pid_list
 *
 * Allocates a new pid_list to store pids into.
 *
 * Returns the pid_list on success, NULL otherwise.
 */
struct trace_pid_list *trace_pid_list_alloc(void)
{
	struct trace_pid_list *pid_list;
	int i;

	/* According to linux/threads.h, pids can be no bigger than 30 bits */
	WARN_ON_ONCE(init_pid_ns.pid_max > (1 << 30));

	pid_list = kzalloc(sizeof(*pid_list), GFP_KERNEL);
	if (!pid_list)
		return NULL;

	init_irq_work(&pid_list->refill_irqwork, pid_list_refill_irq);

	raw_spin_lock_init(&pid_list->lock);
	seqcount_raw_spinlock_init(&pid_list->seqcount, &pid_list->lock);

	for (i = 0; i < CHUNK_ALLOC; i++) {
		union upper_chunk *chunk;

		chunk = kzalloc(sizeof(*chunk), GFP_KERNEL);
		if (!chunk)
			break;
		chunk->next = pid_list->upper_list;
		pid_list->upper_list = chunk;
		pid_list->free_upper_chunks++;
	}

	for (i = 0; i < CHUNK_ALLOC; i++) {
		union lower_chunk *chunk;

		chunk = kzalloc(sizeof(*chunk), GFP_KERNEL);
		if (!chunk)
			break;
		chunk->next = pid_list->lower_list;
		pid_list->lower_list = chunk;
		pid_list->free_lower_chunks++;
	}

	return pid_list;
}

/**
 * trace_pid_list_free - Frees an allocated pid_list.
 * @pid_list: The pid list to free.
 *
 * Frees the memory for a pid_list that was allocated.
 */
void trace_pid_list_free(struct trace_pid_list *pid_list)
{
	union upper_chunk *upper;
	union lower_chunk *lower;
	int i, j;

	if (!pid_list)
		return;

	irq_work_sync(&pid_list->refill_irqwork);

	while (pid_list->lower_list) {
		union lower_chunk *chunk;

		chunk = pid_list->lower_list;
		pid_list->lower_list = pid_list->lower_list->next;
		kfree(chunk);
	}

	while (pid_list->upper_list) {
		union upper_chunk *chunk;

		chunk = pid_list->upper_list;
		pid_list->upper_list = pid_list->upper_list->next;
		kfree(chunk);
	}

	for (i = 0; i < UPPER1_SIZE; i++) {
		upper = pid_list->upper[i];
		if (upper) {
			for (j = 0; j < UPPER2_SIZE; j++) {
				lower = upper->data[j];
				kfree(lower);
			}
			kfree(upper);
		}
	}
	kfree(pid_list);
}
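
/*
 * Illustrative sketch (hypothetical usage, not part of the original
 * file): the full lifetime of a pid_list. Allocation can fail, and
 * trace_pid_list_set() reports errors, so both results are checked.
 */
static inline int pid_list_example_lifecycle(void)
{
	struct trace_pid_list *pid_list;
	int ret;

	pid_list = trace_pid_list_alloc();
	if (!pid_list)
		return -ENOMEM;

	ret = trace_pid_list_set(pid_list, 1);
	if (!ret)
		WARN_ON_ONCE(!trace_pid_list_is_set(pid_list, 1));

	trace_pid_list_free(pid_list);
	return ret;
}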