xref: /linux/tools/workqueue/wq_monitor.py (revision 3c999d1ae3c75991902a1a7dad0cb62c2a3008b4)
1725e8ec5STejun Heo#!/usr/bin/env drgn
2725e8ec5STejun Heo#
3725e8ec5STejun Heo# Copyright (C) 2023 Tejun Heo <tj@kernel.org>
4725e8ec5STejun Heo# Copyright (C) 2023 Meta Platforms, Inc. and affiliates.
5725e8ec5STejun Heo
# Help text passed to argparse as the program description (see the
# ArgumentParser construction below). It documents each column printed in
# table mode, so keep it in sync with WqStats.table_header_str().
desc = """
This is a drgn script to monitor workqueues. For more info on drgn, visit
https://github.com/osandov/drgn.

  total    Total number of work items executed by the workqueue.

  infl     The number of currently in-flight work items.

  CPUtime  Total CPU time consumed by the workqueue in seconds. This is
           sampled from scheduler ticks and only provides ballpark
           measurement. "nohz_full=" CPUs are excluded from measurement.

  CPUitsv  The number of times a concurrency-managed work item hogged CPU
           longer than the threshold (workqueue.cpu_intensive_thresh_us)
           and got excluded from concurrency management to avoid stalling
           other work items.

  CMW/RPR  For per-cpu workqueues, the number of concurrency-management
           wake-ups while executing a work item of the workqueue. For
           unbound workqueues, the number of times a worker was repatriated
           to its affinity scope after being migrated to an off-scope CPU by
           the scheduler.

  mayday   The number of times the rescuer was requested while waiting for
           new worker creation.

  rescued  The number of work items executed by the rescuer.
"""
34725e8ec5STejun Heo
35725e8ec5STejun Heoimport signal
36725e8ec5STejun Heoimport re
37725e8ec5STejun Heoimport time
38725e8ec5STejun Heoimport json
39725e8ec5STejun Heo
40725e8ec5STejun Heoimport drgn
41*8034b314SKemeng Shifrom drgn.helpers.linux.list import list_for_each_entry
42725e8ec5STejun Heo
43725e8ec5STejun Heoimport argparse
# Command-line interface. RawTextHelpFormatter is used so the hand-aligned
# column descriptions in `desc` are shown verbatim in --help output.
parser = argparse.ArgumentParser(description=desc,
                                 formatter_class=argparse.RawTextHelpFormatter)
# Zero or more regexes; main() ORs them together and matches them against
# workqueue names with re.search().
parser.add_argument('workqueue', metavar='REGEX', nargs='*',
                    help='Target workqueue name patterns (all if empty)')
# Seconds slept between samples; main() exits after one pass when this is 0.
parser.add_argument('-i', '--interval', metavar='SECS', type=float, default=1,
                    help='Monitoring interval (0 to print once and exit)')
# Selects the per-workqueue dict output instead of the aligned table.
parser.add_argument('-j', '--json', action='store_true',
                    help='Output in json')
# Parsed at module load; main() reads the result from this module-level name.
args = parser.parse_args()
53725e8ec5STejun Heo
# Kernel symbols and constants looked up by name through `prog`.
# NOTE(review): `prog` is not defined in this file; presumably it is injected
# by the drgn runner named in the shebang line - confirm.

# List head walked in main() to visit every `struct workqueue_struct`.
workqueues              = prog['workqueues']

# Flag bits tested against wq.flags in WqStats.__init__().
WQ_UNBOUND              = prog['WQ_UNBOUND']
WQ_MEM_RECLAIM          = prog['WQ_MEM_RECLAIM']

# Indices into pool_workqueue.stats[]. PWQ_NR_STATS is used by WqStats both to
# size its totals list and as the loop bound when summing the counters.
PWQ_STAT_STARTED        = prog['PWQ_STAT_STARTED']      # work items started execution
PWQ_STAT_COMPLETED      = prog['PWQ_STAT_COMPLETED']    # work items completed execution
PWQ_STAT_CPU_TIME       = prog['PWQ_STAT_CPU_TIME']     # total CPU time consumed
PWQ_STAT_CPU_INTENSIVE  = prog['PWQ_STAT_CPU_INTENSIVE'] # wq_cpu_intensive_thresh_us violations
PWQ_STAT_CM_WAKEUP      = prog['PWQ_STAT_CM_WAKEUP']    # concurrency-management worker wakeups
PWQ_STAT_REPATRIATED    = prog['PWQ_STAT_REPATRIATED']  # unbound workers brought back into scope
PWQ_STAT_MAYDAY         = prog['PWQ_STAT_MAYDAY']       # maydays to rescuer
PWQ_STAT_RESCUED        = prog['PWQ_STAT_RESCUED']      # linked work items executed by rescuer
PWQ_NR_STATS            = prog['PWQ_NR_STATS']
68725e8ec5STejun Heo
class WqStats:
    """Counters of one workqueue, summed over all of its pool_workqueues.

    Attributes:
        name:        workqueue name decoded from ``wq.name``.
        unbound:     whether WQ_UNBOUND is set in ``wq.flags``.
        mem_reclaim: whether WQ_MEM_RECLAIM is set in ``wq.flags``.
        stats:       list of PWQ_NR_STATS Python ints, indexed by the
                     PWQ_STAT_* constants.
    """

    def __init__(self, wq):
        """Take a snapshot of ``wq`` (a ``struct workqueue_struct`` object)."""
        self.name = wq.name.string_().decode()
        # Python's `&` binds tighter than `!=` (unlike C); the parentheses
        # only make that explicit.
        self.unbound = (wq.flags & WQ_UNBOUND) != 0
        self.mem_reclaim = (wq.flags & WQ_MEM_RECLAIM) != 0
        self.stats = [0] * PWQ_NR_STATS
        # Accumulate every counter across the pool_workqueues linked on
        # wq.pwqs, converting each value to a plain Python int.
        for pwq in list_for_each_entry('struct pool_workqueue',
                                       wq.pwqs.address_of_(), 'pwqs_node'):
            for i in range(PWQ_NR_STATS):
                self.stats[i] += int(pwq.stats[i])

    def dict(self, now):
        """Return the snapshot as a flat dict of raw counters.

        ``now`` is stored unchanged under the 'timestamp' key. Unlike
        table_row_str(), every counter is reported regardless of the
        workqueue's flags and 'cpu_time' is not rescaled.
        """
        return { 'timestamp'            : now,
                 'name'                 : self.name,
                 'unbound'              : self.unbound,
                 'mem_reclaim'          : self.mem_reclaim,
                 'started'              : self.stats[PWQ_STAT_STARTED],
                 'completed'            : self.stats[PWQ_STAT_COMPLETED],
                 'cpu_time'             : self.stats[PWQ_STAT_CPU_TIME],
                 'cpu_intensive'        : self.stats[PWQ_STAT_CPU_INTENSIVE],
                 'cm_wakeup'            : self.stats[PWQ_STAT_CM_WAKEUP],
                 'repatriated'          : self.stats[PWQ_STAT_REPATRIATED],
                 'mayday'               : self.stats[PWQ_STAT_MAYDAY],
                 'rescued'              : self.stats[PWQ_STAT_RESCUED], }

    @staticmethod
    def table_header_str():
        """Return the column header line; widths match table_row_str().

        Declared as a staticmethod so it can be called on the class or on an
        instance alike.
        """
        return f'{"":>24} {"total":>8} {"infl":>5} {"CPUtime":>8} '\
            f'{"CPUitsv":>7} {"CMW/RPR":>7} {"mayday":>7} {"rescued":>7}'

    def table_row_str(self):
        """Return one fixed-width table row for this workqueue.

        Columns that do not apply are shown as '-': CPUitsv for unbound
        workqueues, and mayday/rescued for workqueues without
        WQ_MEM_RECLAIM. The CMW/RPR column shows repatriations for unbound
        workqueues and concurrency-management wake-ups otherwise.
        """
        cpu_intensive = '-'
        cmw_rpr = '-'
        mayday = '-'
        rescued = '-'

        if self.unbound:
            cmw_rpr = str(self.stats[PWQ_STAT_REPATRIATED])
        else:
            cpu_intensive = str(self.stats[PWQ_STAT_CPU_INTENSIVE])
            cmw_rpr = str(self.stats[PWQ_STAT_CM_WAKEUP])

        if self.mem_reclaim:
            mayday = str(self.stats[PWQ_STAT_MAYDAY])
            rescued = str(self.stats[PWQ_STAT_RESCUED])

        started = self.stats[PWQ_STAT_STARTED]
        # Clamp at zero so a snapshot in which completed exceeds started
        # never prints a negative in-flight count.
        in_flight = max(started - self.stats[PWQ_STAT_COMPLETED], 0)
        # The help text documents this column in seconds; the raw counter is
        # presumably in microseconds - confirm against the kernel side.
        cpu_secs = self.stats[PWQ_STAT_CPU_TIME] / 1000000

        # Only the last 24 characters of long names are kept so the columns
        # stay aligned. The row ends with a single trailing space.
        return f'{self.name[-24:]:24} ' \
               f'{started:8} ' \
               f'{in_flight:5} ' \
               f'{cpu_secs:8.1f} ' \
               f'{cpu_intensive:>7} ' \
               f'{cmw_rpr:>7} ' \
               f'{mayday:>7} ' \
               f'{rescued:>7} '
122725e8ec5STejun Heo
# Set to True by sigint_handler(); main() re-checks it at the top of every
# monitoring pass to decide whether to keep going.
exit_req = False


def sigint_handler(signr, frame):
    """Signal handler that only records the exit request.

    Both arguments (signal number and current stack frame) are ignored; the
    handler just flips the module-level ``exit_req`` flag and returns.
    """
    global exit_req
    exit_req = True
128725e8ec5STejun Heo
def main():
    """Print workqueue statistics once, or repeatedly until SIGINT.

    Reads the module-level ``args``: positional regexes select workqueues by
    name (all when none are given), ``--interval`` is the sleep between
    passes (0 prints a single pass), and ``--json`` switches from the aligned
    table to one JSON object per workqueue per line.
    """
    # handle args
    table_fmt = not args.json
    interval = args.interval

    # Several patterns are OR-ed into a single regex. An empty result (no
    # patterns given) disables filtering.
    re_str = '|'.join(args.workqueue or [])
    filter_re = re.compile(re_str) if re_str else None

    # monitoring loop
    signal.signal(signal.SIGINT, sigint_handler)

    while not exit_req:
        # One timestamp per pass so all rows of a pass share it.
        now = time.time()

        if table_fmt:
            print()
            print(WqStats.table_header_str())

        for wq in list_for_each_entry('struct workqueue_struct', workqueues.address_of_(), 'list'):
            stats = WqStats(wq)
            if filter_re and not filter_re.search(stats.name):
                continue
            if table_fmt:
                print(stats.table_row_str())
            else:
                # Serialise with json.dumps so the output really is JSON;
                # printing the dict directly would emit Python repr syntax
                # (single quotes, True/False) that JSON parsers reject.
                print(json.dumps(stats.dict(now)))

        if interval == 0:
            break
        time.sleep(interval)
166725e8ec5STejun Heo
# Start the monitor when this file is executed as the main script.
if __name__ == "__main__":
    main()
169