xref: /linux/kernel/trace/pid_list.c (revision 2dcb8e8782d8e4c38903bf37b1a24d3ffd193da7)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2021 VMware Inc, Steven Rostedt <rostedt@goodmis.org>
4  */
5 #include <linux/spinlock.h>
6 #include <linux/irq_work.h>
7 #include <linux/slab.h>
8 #include "trace.h"
9 
10 /* See pid_list.h for details */
11 
12 static inline union lower_chunk *get_lower_chunk(struct trace_pid_list *pid_list)
13 {
14 	union lower_chunk *chunk;
15 
16 	lockdep_assert_held(&pid_list->lock);
17 
18 	if (!pid_list->lower_list)
19 		return NULL;
20 
21 	chunk = pid_list->lower_list;
22 	pid_list->lower_list = chunk->next;
23 	pid_list->free_lower_chunks--;
24 	WARN_ON_ONCE(pid_list->free_lower_chunks < 0);
25 	chunk->next = NULL;
26 	/*
27 	 * If a refill needs to happen, it can not happen here
28 	 * as the scheduler run queue locks are held.
29 	 */
30 	if (pid_list->free_lower_chunks <= CHUNK_REALLOC)
31 		irq_work_queue(&pid_list->refill_irqwork);
32 
33 	return chunk;
34 }
35 
36 static inline union upper_chunk *get_upper_chunk(struct trace_pid_list *pid_list)
37 {
38 	union upper_chunk *chunk;
39 
40 	lockdep_assert_held(&pid_list->lock);
41 
42 	if (!pid_list->upper_list)
43 		return NULL;
44 
45 	chunk = pid_list->upper_list;
46 	pid_list->upper_list = chunk->next;
47 	pid_list->free_upper_chunks--;
48 	WARN_ON_ONCE(pid_list->free_upper_chunks < 0);
49 	chunk->next = NULL;
50 	/*
51 	 * If a refill needs to happen, it can not happen here
52 	 * as the scheduler run queue locks are held.
53 	 */
54 	if (pid_list->free_upper_chunks <= CHUNK_REALLOC)
55 		irq_work_queue(&pid_list->refill_irqwork);
56 
57 	return chunk;
58 }
59 
60 static inline void put_lower_chunk(struct trace_pid_list *pid_list,
61 				   union lower_chunk *chunk)
62 {
63 	lockdep_assert_held(&pid_list->lock);
64 
65 	chunk->next = pid_list->lower_list;
66 	pid_list->lower_list = chunk;
67 	pid_list->free_lower_chunks++;
68 }
69 
70 static inline void put_upper_chunk(struct trace_pid_list *pid_list,
71 				   union upper_chunk *chunk)
72 {
73 	lockdep_assert_held(&pid_list->lock);
74 
75 	chunk->next = pid_list->upper_list;
76 	pid_list->upper_list = chunk;
77 	pid_list->free_upper_chunks++;
78 }
79 
80 static inline bool upper_empty(union upper_chunk *chunk)
81 {
82 	/*
83 	 * If chunk->data has no lower chunks, it will be the same
84 	 * as a zeroed bitmask. Use find_first_bit() to test it
85 	 * and if it doesn't find any bits set, then the array
86 	 * is empty.
87 	 */
88 	int bit = find_first_bit((unsigned long *)chunk->data,
89 				 sizeof(chunk->data) * 8);
90 	return bit >= sizeof(chunk->data) * 8;
91 }
92 
93 static inline int pid_split(unsigned int pid, unsigned int *upper1,
94 			     unsigned int *upper2, unsigned int *lower)
95 {
96 	/* MAX_PID should cover all pids */
97 	BUILD_BUG_ON(MAX_PID < PID_MAX_LIMIT);
98 
99 	/* In case a bad pid is passed in, then fail */
100 	if (unlikely(pid >= MAX_PID))
101 		return -1;
102 
103 	*upper1 = (pid >> UPPER1_SHIFT) & UPPER_MASK;
104 	*upper2 = (pid >> UPPER2_SHIFT) & UPPER_MASK;
105 	*lower = pid & LOWER_MASK;
106 
107 	return 0;
108 }
109 
110 static inline unsigned int pid_join(unsigned int upper1,
111 				    unsigned int upper2, unsigned int lower)
112 {
113 	return ((upper1 & UPPER_MASK) << UPPER1_SHIFT) |
114 		((upper2 & UPPER_MASK) << UPPER2_SHIFT) |
115 		(lower & LOWER_MASK);
116 }
117 
118 /**
119  * trace_pid_list_is_set - test if the pid is set in the list
120  * @pid_list: The pid list to test
121  * @pid: The pid to to see if set in the list.
122  *
123  * Tests if @pid is is set in the @pid_list. This is usually called
124  * from the scheduler when a task is scheduled. Its pid is checked
125  * if it should be traced or not.
126  *
127  * Return true if the pid is in the list, false otherwise.
128  */
129 bool trace_pid_list_is_set(struct trace_pid_list *pid_list, unsigned int pid)
130 {
131 	union upper_chunk *upper_chunk;
132 	union lower_chunk *lower_chunk;
133 	unsigned long flags;
134 	unsigned int upper1;
135 	unsigned int upper2;
136 	unsigned int lower;
137 	bool ret = false;
138 
139 	if (!pid_list)
140 		return false;
141 
142 	if (pid_split(pid, &upper1, &upper2, &lower) < 0)
143 		return false;
144 
145 	raw_spin_lock_irqsave(&pid_list->lock, flags);
146 	upper_chunk = pid_list->upper[upper1];
147 	if (upper_chunk) {
148 		lower_chunk = upper_chunk->data[upper2];
149 		if (lower_chunk)
150 			ret = test_bit(lower, lower_chunk->data);
151 	}
152 	raw_spin_unlock_irqrestore(&pid_list->lock, flags);
153 
154 	return ret;
155 }
156 
157 /**
158  * trace_pid_list_set - add a pid to the list
159  * @pid_list: The pid list to add the @pid to.
160  * @pid: The pid to add.
161  *
162  * Adds @pid to @pid_list. This is usually done explicitly by a user
163  * adding a task to be traced, or indirectly by the fork function
164  * when children should be traced and a task's pid is in the list.
165  *
166  * Return 0 on success, negative otherwise.
167  */
168 int trace_pid_list_set(struct trace_pid_list *pid_list, unsigned int pid)
169 {
170 	union upper_chunk *upper_chunk;
171 	union lower_chunk *lower_chunk;
172 	unsigned long flags;
173 	unsigned int upper1;
174 	unsigned int upper2;
175 	unsigned int lower;
176 	int ret;
177 
178 	if (!pid_list)
179 		return -ENODEV;
180 
181 	if (pid_split(pid, &upper1, &upper2, &lower) < 0)
182 		return -EINVAL;
183 
184 	raw_spin_lock_irqsave(&pid_list->lock, flags);
185 	upper_chunk = pid_list->upper[upper1];
186 	if (!upper_chunk) {
187 		upper_chunk = get_upper_chunk(pid_list);
188 		if (!upper_chunk) {
189 			ret = -ENOMEM;
190 			goto out;
191 		}
192 		pid_list->upper[upper1] = upper_chunk;
193 	}
194 	lower_chunk = upper_chunk->data[upper2];
195 	if (!lower_chunk) {
196 		lower_chunk = get_lower_chunk(pid_list);
197 		if (!lower_chunk) {
198 			ret = -ENOMEM;
199 			goto out;
200 		}
201 		upper_chunk->data[upper2] = lower_chunk;
202 	}
203 	set_bit(lower, lower_chunk->data);
204 	ret = 0;
205  out:
206 	raw_spin_unlock_irqrestore(&pid_list->lock, flags);
207 	return ret;
208 }
209 
210 /**
211  * trace_pid_list_clear - remove a pid from the list
212  * @pid_list: The pid list to remove the @pid from.
213  * @pid: The pid to remove.
214  *
215  * Removes @pid from @pid_list. This is usually done explicitly by a user
216  * removing tasks from tracing, or indirectly by the exit function
217  * when a task that is set to be traced exits.
218  *
219  * Return 0 on success, negative otherwise.
220  */
221 int trace_pid_list_clear(struct trace_pid_list *pid_list, unsigned int pid)
222 {
223 	union upper_chunk *upper_chunk;
224 	union lower_chunk *lower_chunk;
225 	unsigned long flags;
226 	unsigned int upper1;
227 	unsigned int upper2;
228 	unsigned int lower;
229 
230 	if (!pid_list)
231 		return -ENODEV;
232 
233 	if (pid_split(pid, &upper1, &upper2, &lower) < 0)
234 		return -EINVAL;
235 
236 	raw_spin_lock_irqsave(&pid_list->lock, flags);
237 	upper_chunk = pid_list->upper[upper1];
238 	if (!upper_chunk)
239 		goto out;
240 
241 	lower_chunk = upper_chunk->data[upper2];
242 	if (!lower_chunk)
243 		goto out;
244 
245 	clear_bit(lower, lower_chunk->data);
246 
247 	/* if there's no more bits set, add it to the free list */
248 	if (find_first_bit(lower_chunk->data, LOWER_MAX) >= LOWER_MAX) {
249 		put_lower_chunk(pid_list, lower_chunk);
250 		upper_chunk->data[upper2] = NULL;
251 		if (upper_empty(upper_chunk)) {
252 			put_upper_chunk(pid_list, upper_chunk);
253 			pid_list->upper[upper1] = NULL;
254 		}
255 	}
256  out:
257 	raw_spin_unlock_irqrestore(&pid_list->lock, flags);
258 	return 0;
259 }
260 
261 /**
262  * trace_pid_list_next - return the next pid in the list
263  * @pid_list: The pid list to examine.
264  * @pid: The pid to start from
265  * @next: The pointer to place the pid that is set starting from @pid.
266  *
267  * Looks for the next consecutive pid that is in @pid_list starting
268  * at the pid specified by @pid. If one is set (including @pid), then
269  * that pid is placed into @next.
270  *
271  * Return 0 when a pid is found, -1 if there are no more pids included.
272  */
273 int trace_pid_list_next(struct trace_pid_list *pid_list, unsigned int pid,
274 			unsigned int *next)
275 {
276 	union upper_chunk *upper_chunk;
277 	union lower_chunk *lower_chunk;
278 	unsigned long flags;
279 	unsigned int upper1;
280 	unsigned int upper2;
281 	unsigned int lower;
282 
283 	if (!pid_list)
284 		return -ENODEV;
285 
286 	if (pid_split(pid, &upper1, &upper2, &lower) < 0)
287 		return -EINVAL;
288 
289 	raw_spin_lock_irqsave(&pid_list->lock, flags);
290 	for (; upper1 <= UPPER_MASK; upper1++, upper2 = 0) {
291 		upper_chunk = pid_list->upper[upper1];
292 
293 		if (!upper_chunk)
294 			continue;
295 
296 		for (; upper2 <= UPPER_MASK; upper2++, lower = 0) {
297 			lower_chunk = upper_chunk->data[upper2];
298 			if (!lower_chunk)
299 				continue;
300 
301 			lower = find_next_bit(lower_chunk->data, LOWER_MAX,
302 					    lower);
303 			if (lower < LOWER_MAX)
304 				goto found;
305 		}
306 	}
307 
308  found:
309 	raw_spin_unlock_irqrestore(&pid_list->lock, flags);
310 	if (upper1 > UPPER_MASK)
311 		return -1;
312 
313 	*next = pid_join(upper1, upper2, lower);
314 	return 0;
315 }
316 
317 /**
318  * trace_pid_list_first - return the first pid in the list
319  * @pid_list: The pid list to examine.
320  * @pid: The pointer to place the pid first found pid that is set.
321  *
322  * Looks for the first pid that is set in @pid_list, and places it
323  * into @pid if found.
324  *
325  * Return 0 when a pid is found, -1 if there are no pids set.
326  */
327 int trace_pid_list_first(struct trace_pid_list *pid_list, unsigned int *pid)
328 {
329 	return trace_pid_list_next(pid_list, 0, pid);
330 }
331 
332 static void pid_list_refill_irq(struct irq_work *iwork)
333 {
334 	struct trace_pid_list *pid_list = container_of(iwork, struct trace_pid_list,
335 						       refill_irqwork);
336 	union upper_chunk *upper = NULL;
337 	union lower_chunk *lower = NULL;
338 	union upper_chunk **upper_next = &upper;
339 	union lower_chunk **lower_next = &lower;
340 	int upper_count;
341 	int lower_count;
342 	int ucnt = 0;
343 	int lcnt = 0;
344 
345  again:
346 	raw_spin_lock(&pid_list->lock);
347 	upper_count = CHUNK_ALLOC - pid_list->free_upper_chunks;
348 	lower_count = CHUNK_ALLOC - pid_list->free_lower_chunks;
349 	raw_spin_unlock(&pid_list->lock);
350 
351 	if (upper_count <= 0 && lower_count <= 0)
352 		return;
353 
354 	while (upper_count-- > 0) {
355 		union upper_chunk *chunk;
356 
357 		chunk = kzalloc(sizeof(*chunk), GFP_KERNEL);
358 		if (!chunk)
359 			break;
360 		*upper_next = chunk;
361 		upper_next = &chunk->next;
362 		ucnt++;
363 	}
364 
365 	while (lower_count-- > 0) {
366 		union lower_chunk *chunk;
367 
368 		chunk = kzalloc(sizeof(*chunk), GFP_KERNEL);
369 		if (!chunk)
370 			break;
371 		*lower_next = chunk;
372 		lower_next = &chunk->next;
373 		lcnt++;
374 	}
375 
376 	raw_spin_lock(&pid_list->lock);
377 	if (upper) {
378 		*upper_next = pid_list->upper_list;
379 		pid_list->upper_list = upper;
380 		pid_list->free_upper_chunks += ucnt;
381 	}
382 	if (lower) {
383 		*lower_next = pid_list->lower_list;
384 		pid_list->lower_list = lower;
385 		pid_list->free_lower_chunks += lcnt;
386 	}
387 	raw_spin_unlock(&pid_list->lock);
388 
389 	/*
390 	 * On success of allocating all the chunks, both counters
391 	 * will be less than zero. If they are not, then an allocation
392 	 * failed, and we should not try again.
393 	 */
394 	if (upper_count >= 0 || lower_count >= 0)
395 		return;
396 	/*
397 	 * When the locks were released, free chunks could have
398 	 * been used and allocation needs to be done again. Might as
399 	 * well allocate it now.
400 	 */
401 	goto again;
402 }
403 
404 /**
405  * trace_pid_list_alloc - create a new pid_list
406  *
407  * Allocates a new pid_list to store pids into.
408  *
409  * Returns the pid_list on success, NULL otherwise.
410  */
411 struct trace_pid_list *trace_pid_list_alloc(void)
412 {
413 	struct trace_pid_list *pid_list;
414 	int i;
415 
416 	/* According to linux/thread.h, pids can be no bigger that 30 bits */
417 	WARN_ON_ONCE(pid_max > (1 << 30));
418 
419 	pid_list = kzalloc(sizeof(*pid_list), GFP_KERNEL);
420 	if (!pid_list)
421 		return NULL;
422 
423 	init_irq_work(&pid_list->refill_irqwork, pid_list_refill_irq);
424 
425 	raw_spin_lock_init(&pid_list->lock);
426 
427 	for (i = 0; i < CHUNK_ALLOC; i++) {
428 		union upper_chunk *chunk;
429 
430 		chunk = kzalloc(sizeof(*chunk), GFP_KERNEL);
431 		if (!chunk)
432 			break;
433 		chunk->next = pid_list->upper_list;
434 		pid_list->upper_list = chunk;
435 		pid_list->free_upper_chunks++;
436 	}
437 
438 	for (i = 0; i < CHUNK_ALLOC; i++) {
439 		union lower_chunk *chunk;
440 
441 		chunk = kzalloc(sizeof(*chunk), GFP_KERNEL);
442 		if (!chunk)
443 			break;
444 		chunk->next = pid_list->lower_list;
445 		pid_list->lower_list = chunk;
446 		pid_list->free_lower_chunks++;
447 	}
448 
449 	return pid_list;
450 }
451 
452 /**
453  * trace_pid_list_free - Frees an allocated pid_list.
454  *
455  * Frees the memory for a pid_list that was allocated.
456  */
457 void trace_pid_list_free(struct trace_pid_list *pid_list)
458 {
459 	union upper_chunk *upper;
460 	union lower_chunk *lower;
461 	int i, j;
462 
463 	if (!pid_list)
464 		return;
465 
466 	irq_work_sync(&pid_list->refill_irqwork);
467 
468 	while (pid_list->lower_list) {
469 		union lower_chunk *chunk;
470 
471 		chunk = pid_list->lower_list;
472 		pid_list->lower_list = pid_list->lower_list->next;
473 		kfree(chunk);
474 	}
475 
476 	while (pid_list->upper_list) {
477 		union upper_chunk *chunk;
478 
479 		chunk = pid_list->upper_list;
480 		pid_list->upper_list = pid_list->upper_list->next;
481 		kfree(chunk);
482 	}
483 
484 	for (i = 0; i < UPPER1_SIZE; i++) {
485 		upper = pid_list->upper[i];
486 		if (upper) {
487 			for (j = 0; j < UPPER2_SIZE; j++) {
488 				lower = upper->data[j];
489 				kfree(lower);
490 			}
491 			kfree(upper);
492 		}
493 	}
494 	kfree(pid_list);
495 }
496