/*
 * Monitoring code for network dropped packet alerts
 *
 * Copyright (C) 2009 Neil Horman <nhorman@tuxdriver.com>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/string.h>
#include <linux/if_arp.h>
#include <linux/inetdevice.h>
#include <linux/inet.h>
#include <linux/interrupt.h>
#include <linux/netpoll.h>
#include <linux/sched.h>
#include <linux/delay.h>
#include <linux/types.h>
#include <linux/workqueue.h>
#include <linux/netlink.h>
#include <linux/net_dropmon.h>
#include <linux/percpu.h>
#include <linux/timer.h>
#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <net/genetlink.h>
#include <net/netevent.h>

#include <trace/events/skb.h>
#include <trace/events/napi.h>

#include <asm/unaligned.h>

#define TRACE_ON 1
#define TRACE_OFF 0

static void send_dm_alert(struct work_struct *unused);

/*
 * Globals: the current trace state and the mutex
 * that protects transitions between states
 */
static int trace_state = TRACE_OFF;
static DEFINE_MUTEX(trace_state_mutex);

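/*
 * Per-cpu state: the alert message under construction, a hit counter
 * that limits how many drop points one alert may record, and the
 * timer/work pair that defers and then sends the finished alert.
 */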
struct per_cpu_dm_data {
	struct work_struct dm_alert_work;
	struct sk_buff __rcu *skb;
	atomic_t dm_hit_count;
	struct timer_list send_timer;
	int cpu;
};

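/*
 * One entry per monitored net_device, remembering the rx_dropped
 * count last seen so that hardware drops can be inferred from the
 * napi_poll tracepoint.
 */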
struct dm_hw_stat_delta {
	struct net_device *dev;
	unsigned long last_rx;
	struct list_head list;
	struct rcu_head rcu;
	unsigned long last_drop_val;
};

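/*
 * The generic netlink family through which alerts are multicast
 * to listeners in userspace.
 */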
static struct genl_family net_drop_monitor_family = {
	.id             = GENL_ID_GENERATE,
	.hdrsize        = 0,
	.name           = "NET_DM",
	.version        = 2,
	.maxattr        = NET_DM_CMD_MAX,
};

static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data);

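/*
 * Tunables: each alert holds at most dm_hit_limit drop points and is
 * sent after a dm_delay second hysteresis; hardware stats are checked
 * at most once per dm_hw_check_delta jiffies per device.
 */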
static int dm_hit_limit = 64;
static int dm_delay = 1;
static unsigned long dm_hw_check_delta = 2*HZ;
static LIST_HEAD(hw_stats_list);

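/*
 * Build a fresh alert message sized for dm_hit_limit drop points and
 * publish it as this cpu's pending skb; on allocation failure the
 * alert work is rescheduled so the allocation can be retried.
 */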
static void reset_per_cpu_data(struct per_cpu_dm_data *data)
{
	size_t al;
	struct net_dm_alert_msg *msg;
	struct nlattr *nla;
	struct sk_buff *skb;
	struct sk_buff *oskb = rcu_dereference_protected(data->skb, 1);

	al = sizeof(struct net_dm_alert_msg);
	al += dm_hit_limit * sizeof(struct net_dm_drop_point);
	al += sizeof(struct nlattr);

	skb = genlmsg_new(al, GFP_KERNEL);

	if (skb) {
		genlmsg_put(skb, 0, 0, &net_drop_monitor_family,
				0, NET_DM_CMD_ALERT);
		nla = nla_reserve(skb, NLA_UNSPEC,
				  sizeof(struct net_dm_alert_msg));
		msg = nla_data(nla);
		memset(msg, 0, al);
	} else {
		schedule_work_on(data->cpu, &data->dm_alert_work);
	}

	/*
	 * Don't need to lock this, since we are guaranteed to only
	 * run this on a single cpu at a time.
	 * Note also that we only update data->skb if the old and new skb
	 * pointers don't match.  This ensures that we don't continually call
	 * synchronize_rcu() if we repeatedly fail to alloc a new netlink message.
	 */
	if (skb != oskb) {
		rcu_assign_pointer(data->skb, skb);

		synchronize_rcu();

		atomic_set(&data->dm_hit_count, dm_hit_limit);
	}
}

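/*
 * Work handler: detach this cpu's pending alert skb, install a fresh
 * one in its place, and multicast the old message to the
 * NET_DM_GRP_ALERT group.
 */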
static void send_dm_alert(struct work_struct *unused)
{
	struct sk_buff *skb;
	struct per_cpu_dm_data *data = &get_cpu_var(dm_cpu_data);

	WARN_ON_ONCE(data->cpu != smp_processor_id());

	/*
	 * Grab the skb we're about to send
	 */
	skb = rcu_dereference_protected(data->skb, 1);

	/*
	 * Replace it with a new one
	 */
	reset_per_cpu_data(data);

	/*
	 * Ship it!
	 */
	if (skb)
		genlmsg_multicast(skb, 0, NET_DM_GRP_ALERT, GFP_KERNEL);

	put_cpu_var(dm_cpu_data);
}

/*
 * This is the timer function to delay the sending of an alert
 * in the event that more drops will arrive during the
 * hysteresis period.  Note that it operates under the timer interrupt
 * so we don't need to disable preemption here.
 */
static void sched_send_work(unsigned long unused)
{
	struct per_cpu_dm_data *data = &get_cpu_var(dm_cpu_data);

	schedule_work_on(smp_processor_id(), &data->dm_alert_work);

	put_cpu_var(dm_cpu_data);
}

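/*
 * Record one drop: bump the count on an existing drop point whose
 * program counter matches this location, or append a new point to
 * the pending alert and arm the hysteresis timer.
 */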
static void trace_drop_common(struct sk_buff *skb, void *location)
{
	struct net_dm_alert_msg *msg;
	struct nlmsghdr *nlh;
	struct nlattr *nla;
	int i;
	struct sk_buff *dskb;
	struct per_cpu_dm_data *data = &get_cpu_var(dm_cpu_data);

	rcu_read_lock();
	dskb = rcu_dereference(data->skb);

	if (!dskb)
		goto out;

	if (!atomic_add_unless(&data->dm_hit_count, -1, 0)) {
		/*
		 * we're already at zero, discard this hit
		 */
		goto out;
	}

	nlh = (struct nlmsghdr *)dskb->data;
	nla = genlmsg_data(nlmsg_data(nlh));
	msg = nla_data(nla);
	for (i = 0; i < msg->entries; i++) {
		if (!memcmp(&location, msg->points[i].pc, sizeof(void *))) {
			msg->points[i].count++;
			atomic_inc(&data->dm_hit_count);
			goto out;
		}
	}

	/*
	 * We need to create a new entry
	 */
	__nla_reserve_nohdr(dskb, sizeof(struct net_dm_drop_point));
	nla->nla_len += NLA_ALIGN(sizeof(struct net_dm_drop_point));
	memcpy(msg->points[msg->entries].pc, &location, sizeof(void *));
	msg->points[msg->entries].count = 1;
	msg->entries++;

	if (!timer_pending(&data->send_timer)) {
		data->send_timer.expires = jiffies + dm_delay * HZ;
		add_timer_on(&data->send_timer, smp_processor_id());
	}

out:
	rcu_read_unlock();
	put_cpu_var(dm_cpu_data);
}

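/*
 * kfree_skb tracepoint hook: every software packet drop lands here.
 */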
static void trace_kfree_skb_hit(void *ignore, struct sk_buff *skb, void *location)
{
	trace_drop_common(skb, location);
}

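/*
 * napi_poll tracepoint hook: infer hardware drops by watching for
 * increases in a device's rx_dropped counter.
 */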
static void trace_napi_poll_hit(void *ignore, struct napi_struct *napi)
{
	struct dm_hw_stat_delta *new_stat;

	/*
	 * Don't check napi structures with no associated device
	 */
	if (!napi->dev)
		return;

	rcu_read_lock();
	list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
		/*
		 * only add a note to our monitor buffer if:
		 * 1) this is the dev we received on
		 * 2) it's after the last_rx delta
		 * 3) our rx_dropped count has gone up
		 */
		if ((new_stat->dev == napi->dev) &&
		    (time_after(jiffies, new_stat->last_rx + dm_hw_check_delta)) &&
		    (napi->dev->stats.rx_dropped != new_stat->last_drop_val)) {
			trace_drop_common(NULL, NULL);
			new_stat->last_drop_val = napi->dev->stats.rx_dropped;
			new_stat->last_rx = jiffies;
			break;
		}
	}
	rcu_read_unlock();
}

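/*
 * Attach or detach the kfree_skb and napi_poll tracepoints.  A module
 * reference is held while tracing is on so we can't be unloaded with
 * the tracepoints still registered.
 */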
static int set_all_monitor_traces(int state)
{
	int rc = 0;
	struct dm_hw_stat_delta *new_stat = NULL;
	struct dm_hw_stat_delta *temp;

	mutex_lock(&trace_state_mutex);

	if (state == trace_state) {
		rc = -EAGAIN;
		goto out_unlock;
	}

	switch (state) {
	case TRACE_ON:
		if (!try_module_get(THIS_MODULE)) {
			rc = -ENODEV;
			break;
		}

		rc |= register_trace_kfree_skb(trace_kfree_skb_hit, NULL);
		rc |= register_trace_napi_poll(trace_napi_poll_hit, NULL);
		break;

	case TRACE_OFF:
		rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit, NULL);
		rc |= unregister_trace_napi_poll(trace_napi_poll_hit, NULL);

		tracepoint_synchronize_unregister();

		/*
		 * Clean the device list
		 */
		list_for_each_entry_safe(new_stat, temp, &hw_stats_list, list) {
			if (new_stat->dev == NULL) {
				list_del_rcu(&new_stat->list);
				kfree_rcu(new_stat, rcu);
			}
		}

		module_put(THIS_MODULE);

		break;
	default:
		rc = 1;
		break;
	}

	if (!rc)
		trace_state = state;
	else
		rc = -EINPROGRESS;

out_unlock:
	mutex_unlock(&trace_state_mutex);

	return rc;
}

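/*
 * Netlink command handlers.  Runtime configuration is not yet
 * implemented; only starting and stopping the trace is supported.
 */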
static int net_dm_cmd_config(struct sk_buff *skb,
			struct genl_info *info)
{
	return -ENOTSUPP;
}

static int net_dm_cmd_trace(struct sk_buff *skb,
			struct genl_info *info)
{
	switch (info->genlhdr->cmd) {
	case NET_DM_CMD_START:
		return set_all_monitor_traces(TRACE_ON);
	case NET_DM_CMD_STOP:
		return set_all_monitor_traces(TRACE_OFF);
	}

	return -ENOTSUPP;
}

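/*
 * Netdevice notifier: allocate a stat-delta entry when a device
 * registers; on unregister, clear its dev pointer, or free the entry
 * outright if tracing is off.
 */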
static int dropmon_net_event(struct notifier_block *ev_block,
			unsigned long event, void *ptr)
{
	struct net_device *dev = ptr;
	struct dm_hw_stat_delta *new_stat = NULL;
	struct dm_hw_stat_delta *tmp;

	switch (event) {
	case NETDEV_REGISTER:
		new_stat = kzalloc(sizeof(struct dm_hw_stat_delta), GFP_KERNEL);

		if (!new_stat)
			goto out;

		new_stat->dev = dev;
		new_stat->last_rx = jiffies;
		mutex_lock(&trace_state_mutex);
		list_add_rcu(&new_stat->list, &hw_stats_list);
		mutex_unlock(&trace_state_mutex);
		break;
	case NETDEV_UNREGISTER:
		mutex_lock(&trace_state_mutex);
		list_for_each_entry_safe(new_stat, tmp, &hw_stats_list, list) {
			if (new_stat->dev == dev) {
				new_stat->dev = NULL;
				if (trace_state == TRACE_OFF) {
					list_del_rcu(&new_stat->list);
					kfree_rcu(new_stat, rcu);
					break;
				}
			}
		}
		mutex_unlock(&trace_state_mutex);
		break;
	}
out:
	return NOTIFY_DONE;
}

static struct genl_ops dropmon_ops[] = {
	{
		.cmd = NET_DM_CMD_CONFIG,
		.doit = net_dm_cmd_config,
	},
	{
		.cmd = NET_DM_CMD_START,
		.doit = net_dm_cmd_trace,
	},
	{
		.cmd = NET_DM_CMD_STOP,
		.doit = net_dm_cmd_trace,
	},
};

static struct notifier_block dropmon_net_notifier = {
	.notifier_call = dropmon_net_event
};

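/*
 * Register the netlink family and the netdevice notifier, then set
 * up each cpu's alert work, send timer, and initial alert skb.
 */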
static int __init init_net_drop_monitor(void)
{
	struct per_cpu_dm_data *data;
	int cpu, rc;

	pr_info("Initializing network drop monitor service\n");

	if (sizeof(void *) > 8) {
		pr_err("Unable to store program counters on this arch, Drop monitor failed\n");
		return -ENOSPC;
	}

	rc = genl_register_family_with_ops(&net_drop_monitor_family,
					   dropmon_ops,
					   ARRAY_SIZE(dropmon_ops));
	if (rc) {
		pr_err("Could not create drop monitor netlink family\n");
		return rc;
	}

	rc = register_netdevice_notifier(&dropmon_net_notifier);
	if (rc < 0) {
		pr_crit("Failed to register netdevice notifier\n");
		goto out_unreg;
	}

	rc = 0;

	for_each_possible_cpu(cpu) {
		data = &per_cpu(dm_cpu_data, cpu);
		data->cpu = cpu;
		INIT_WORK(&data->dm_alert_work, send_dm_alert);
		init_timer(&data->send_timer);
		data->send_timer.data = cpu;
		data->send_timer.function = sched_send_work;
		reset_per_cpu_data(data);
	}

	goto out;

out_unreg:
	genl_unregister_family(&net_drop_monitor_family);
out:
	return rc;
}

static void exit_net_drop_monitor(void)
{
	struct per_cpu_dm_data *data;
	int cpu;

	BUG_ON(unregister_netdevice_notifier(&dropmon_net_notifier));

	/*
	 * Because of the module_get/put we do in the trace state change path,
	 * we are guaranteed not to have any current users when we get here;
	 * all we need to do is make sure that we don't have any running timers
	 * or pending schedule calls
	 */

	for_each_possible_cpu(cpu) {
		data = &per_cpu(dm_cpu_data, cpu);
		del_timer_sync(&data->send_timer);
		cancel_work_sync(&data->dm_alert_work);
		/*
		 * At this point, we should have exclusive access
		 * to this struct and can free the skb inside it
		 */
		kfree_skb(data->skb);
	}

	BUG_ON(genl_unregister_family(&net_drop_monitor_family));
}

module_init(init_net_drop_monitor);
module_exit(exit_net_drop_monitor);

MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Neil Horman <nhorman@tuxdriver.com>");