/*
 * Monitoring code for network dropped packet alerts
 *
 * Copyright (C) 2009 Neil Horman <nhorman@tuxdriver.com>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/string.h>
#include <linux/if_arp.h>
#include <linux/inetdevice.h>
#include <linux/inet.h>
#include <linux/interrupt.h>
#include <linux/netpoll.h>
#include <linux/sched.h>
#include <linux/delay.h>
#include <linux/types.h>
#include <linux/workqueue.h>
#include <linux/netlink.h>
#include <linux/net_dropmon.h>
#include <linux/percpu.h>
#include <linux/timer.h>
#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <net/genetlink.h>
#include <net/netevent.h>

#include <trace/events/skb.h>
#include <trace/events/napi.h>

#include <asm/unaligned.h>

#define TRACE_ON 1
#define TRACE_OFF 0

static void send_dm_alert(struct work_struct *unused);


/*
 * Globals: the trace state, and the per-cpu work handles
 * that will send up netlink alerts
 */
static int trace_state = TRACE_OFF;
static DEFINE_MUTEX(trace_state_mutex);

struct per_cpu_dm_data {
	struct work_struct dm_alert_work;
	struct sk_buff __rcu *skb;
	atomic_t dm_hit_count;
	struct timer_list send_timer;
	int cpu;
};

struct dm_hw_stat_delta {
	struct net_device *dev;
	unsigned long last_rx;
	struct list_head list;
	struct rcu_head rcu;
	unsigned long last_drop_val;
};

static struct genl_family net_drop_monitor_family = {
	.id = GENL_ID_GENERATE,
	.hdrsize = 0,
	.name = "NET_DM",
	.version = 2,
	.maxattr = NET_DM_CMD_MAX,
};

static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data);

static int dm_hit_limit = 64;
static int dm_delay = 1;
static unsigned long dm_hw_check_delta = 2*HZ;
static LIST_HEAD(hw_stats_list);

static void reset_per_cpu_data(struct per_cpu_dm_data *data)
{
	size_t al;
	struct net_dm_alert_msg *msg;
	struct nlattr *nla;
	struct sk_buff *skb;
	struct sk_buff *oskb = rcu_dereference_protected(data->skb, 1);

	al = sizeof(struct net_dm_alert_msg);
	al += dm_hit_limit * sizeof(struct net_dm_drop_point);
	al += sizeof(struct nlattr);

	skb = genlmsg_new(al, GFP_KERNEL);

	if (skb) {
		genlmsg_put(skb, 0, 0, &net_drop_monitor_family,
				0, NET_DM_CMD_ALERT);
		nla = nla_reserve(skb, NLA_UNSPEC,
				  sizeof(struct net_dm_alert_msg));
		msg = nla_data(nla);
		memset(msg, 0, al);
	} else
		schedule_work_on(data->cpu, &data->dm_alert_work);

	/*
	 * Don't need to lock this, since we are guaranteed to only
	 * run this on a single cpu at a time.
	 * Note also that we only update data->skb if the old and new skb
	 * pointers don't match.  This ensures that we don't continually call
	 * synchronize_rcu if we repeatedly fail to alloc a new netlink message.
	 */
	if (skb != oskb) {
		rcu_assign_pointer(data->skb, skb);

		synchronize_rcu();

		atomic_set(&data->dm_hit_count, dm_hit_limit);
	}

}
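/*
 * For reference, the alert skb that reset_per_cpu_data() pre-allocates
 * is laid out roughly as follows (a sketch derived from the sizing
 * arithmetic above, not an authoritative wire-format description):
 *
 *   genlmsghdr | nlattr | net_dm_alert_msg | net_dm_drop_point[dm_hit_limit]
 *
 * trace_drop_common() fills drop points into this buffer in place, and
 * send_dm_alert() multicasts the whole skb before swapping in a fresh one.
 */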
static void send_dm_alert(struct work_struct *unused)
{
	struct sk_buff *skb;
	struct per_cpu_dm_data *data = &get_cpu_var(dm_cpu_data);

	WARN_ON_ONCE(data->cpu != smp_processor_id());

	/*
	 * Grab the skb we're about to send
	 */
	skb = rcu_dereference_protected(data->skb, 1);

	/*
	 * Replace it with a new one
	 */
	reset_per_cpu_data(data);

	/*
	 * Ship it!
	 */
	if (skb)
		genlmsg_multicast(skb, 0, NET_DM_GRP_ALERT, GFP_KERNEL);

	put_cpu_var(dm_cpu_data);
}

/*
 * This is the timer function to delay the sending of an alert
 * in the event that more drops will arrive during the
 * hysteresis period.  Note that it operates under the timer interrupt
 * so we don't need to disable preemption here
 */
static void sched_send_work(unsigned long unused)
{
	struct per_cpu_dm_data *data = &get_cpu_var(dm_cpu_data);

	schedule_work_on(smp_processor_id(), &data->dm_alert_work);

	put_cpu_var(dm_cpu_data);
}

static void trace_drop_common(struct sk_buff *skb, void *location)
{
	struct net_dm_alert_msg *msg;
	struct nlmsghdr *nlh;
	struct nlattr *nla;
	int i;
	struct sk_buff *dskb;
	struct per_cpu_dm_data *data = &get_cpu_var(dm_cpu_data);


	rcu_read_lock();
	dskb = rcu_dereference(data->skb);

	if (!dskb)
		goto out;

	if (!atomic_add_unless(&data->dm_hit_count, -1, 0)) {
		/*
		 * we're already at zero, discard this hit
		 */
		goto out;
	}

	nlh = (struct nlmsghdr *)dskb->data;
	nla = genlmsg_data(nlmsg_data(nlh));
	msg = nla_data(nla);
	for (i = 0; i < msg->entries; i++) {
		if (!memcmp(&location, msg->points[i].pc, sizeof(void *))) {
			msg->points[i].count++;
			atomic_inc(&data->dm_hit_count);
			goto out;
		}
	}

	/*
	 * We need to create a new entry
	 */
	__nla_reserve_nohdr(dskb, sizeof(struct net_dm_drop_point));
	nla->nla_len += NLA_ALIGN(sizeof(struct net_dm_drop_point));
	memcpy(msg->points[msg->entries].pc, &location, sizeof(void *));
	msg->points[msg->entries].count = 1;
	msg->entries++;

	if (!timer_pending(&data->send_timer)) {
		data->send_timer.expires = jiffies + dm_delay * HZ;
		add_timer_on(&data->send_timer, smp_processor_id());
	}

out:
	rcu_read_unlock();
	put_cpu_var(dm_cpu_data);
	return;
}

static void trace_kfree_skb_hit(void *ignore, struct sk_buff *skb, void *location)
{
	trace_drop_common(skb, location);
}

static void trace_napi_poll_hit(void *ignore, struct napi_struct *napi)
{
	struct dm_hw_stat_delta *new_stat;

	/*
	 * Don't check napi structures with no associated device
	 */
	if (!napi->dev)
		return;

	rcu_read_lock();
	list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
		/*
		 * only add a note to our monitor buffer if:
		 * 1) this is the dev we received on
		 * 2) it's after the last_rx delta
		 * 3) our rx_dropped count has gone up
		 */
		if ((new_stat->dev == napi->dev) &&
		    (time_after(jiffies, new_stat->last_rx + dm_hw_check_delta)) &&
		    (napi->dev->stats.rx_dropped != new_stat->last_drop_val)) {
			trace_drop_common(NULL, NULL);
			new_stat->last_drop_val = napi->dev->stats.rx_dropped;
			new_stat->last_rx = jiffies;
			break;
		}
	}
	rcu_read_unlock();
}
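/*
 * To illustrate what the tracepoint handlers above accumulate: each
 * NET_DM_CMD_ALERT message carries a net_dm_alert_msg followed by
 * msg->entries drop points.  A userspace receiver might walk them
 * like this (a hedged sketch; the surrounding netlink receive
 * plumbing is assumed, not shown):
 *
 *	struct net_dm_alert_msg *alert = ...; // nla_data() of the payload
 *	int i;
 *
 *	for (i = 0; i < alert->entries; i++) {
 *		void *pc;
 *
 *		memcpy(&pc, alert->points[i].pc, sizeof(pc));
 *		printf("drop at %p, seen %u times\n",
 *		       pc, alert->points[i].count);
 *	}
 */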
static int set_all_monitor_traces(int state)
{
	int rc = 0;
	struct dm_hw_stat_delta *new_stat = NULL;
	struct dm_hw_stat_delta *temp;

	mutex_lock(&trace_state_mutex);

	if (state == trace_state) {
		rc = -EAGAIN;
		goto out_unlock;
	}

	switch (state) {
	case TRACE_ON:
		if (!try_module_get(THIS_MODULE)) {
			rc = -ENODEV;
			break;
		}

		rc |= register_trace_kfree_skb(trace_kfree_skb_hit, NULL);
		rc |= register_trace_napi_poll(trace_napi_poll_hit, NULL);
		break;

	case TRACE_OFF:
		rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit, NULL);
		rc |= unregister_trace_napi_poll(trace_napi_poll_hit, NULL);

		tracepoint_synchronize_unregister();

		/*
		 * Clean the device list
		 */
		list_for_each_entry_safe(new_stat, temp, &hw_stats_list, list) {
			if (new_stat->dev == NULL) {
				list_del_rcu(&new_stat->list);
				kfree_rcu(new_stat, rcu);
			}
		}

		module_put(THIS_MODULE);

		break;
	default:
		rc = 1;
		break;
	}

	if (!rc)
		trace_state = state;
	else
		rc = -EINPROGRESS;

out_unlock:
	mutex_unlock(&trace_state_mutex);

	return rc;
}


static int net_dm_cmd_config(struct sk_buff *skb,
			struct genl_info *info)
{
	return -ENOTSUPP;
}

static int net_dm_cmd_trace(struct sk_buff *skb,
			struct genl_info *info)
{
	switch (info->genlhdr->cmd) {
	case NET_DM_CMD_START:
		return set_all_monitor_traces(TRACE_ON);
	case NET_DM_CMD_STOP:
		return set_all_monitor_traces(TRACE_OFF);
	}

	return -ENOTSUPP;
}

static int dropmon_net_event(struct notifier_block *ev_block,
			unsigned long event, void *ptr)
{
	struct net_device *dev = ptr;
	struct dm_hw_stat_delta *new_stat = NULL;
	struct dm_hw_stat_delta *tmp;

	switch (event) {
	case NETDEV_REGISTER:
		new_stat = kzalloc(sizeof(struct dm_hw_stat_delta), GFP_KERNEL);

		if (!new_stat)
			goto out;

		new_stat->dev = dev;
		new_stat->last_rx = jiffies;
		mutex_lock(&trace_state_mutex);
		list_add_rcu(&new_stat->list, &hw_stats_list);
		mutex_unlock(&trace_state_mutex);
		break;
	case NETDEV_UNREGISTER:
		mutex_lock(&trace_state_mutex);
		list_for_each_entry_safe(new_stat, tmp, &hw_stats_list, list) {
			if (new_stat->dev == dev) {
				new_stat->dev = NULL;
				if (trace_state == TRACE_OFF) {
					list_del_rcu(&new_stat->list);
					kfree_rcu(new_stat, rcu);
					break;
				}
			}
		}
		mutex_unlock(&trace_state_mutex);
		break;
	}
out:
	return NOTIFY_DONE;
}

static struct genl_ops dropmon_ops[] = {
	{
		.cmd = NET_DM_CMD_CONFIG,
		.doit = net_dm_cmd_config,
	},
	{
		.cmd = NET_DM_CMD_START,
		.doit = net_dm_cmd_trace,
	},
	{
		.cmd = NET_DM_CMD_STOP,
		.doit = net_dm_cmd_trace,
	},
};

static struct notifier_block dropmon_net_notifier = {
	.notifier_call = dropmon_net_event
};
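/*
 * Example of driving the ops table above from userspace (a hypothetical
 * libnl-3 sketch; nothing below is part of this module):
 *
 *	struct nl_sock *sk = nl_socket_alloc();
 *	struct nl_msg *msg;
 *	int fam;
 *
 *	genl_connect(sk);
 *	fam = genl_ctrl_resolve(sk, "NET_DM");
 *	msg = nlmsg_alloc();
 *	genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, fam, 0, 0,
 *		    NET_DM_CMD_START, 2);
 *	nl_send_auto(sk, msg);
 *
 * NET_DM_CMD_START/STOP land in net_dm_cmd_trace(), which toggles the
 * tracepoints via set_all_monitor_traces().
 */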
static int __init init_net_drop_monitor(void)
{
	struct per_cpu_dm_data *data;
	int cpu, rc;

	pr_info("Initializing network drop monitor service\n");

	if (sizeof(void *) > 8) {
		pr_err("Unable to store program counters on this arch, Drop monitor failed\n");
		return -ENOSPC;
	}

	rc = genl_register_family_with_ops(&net_drop_monitor_family,
					   dropmon_ops,
					   ARRAY_SIZE(dropmon_ops));
	if (rc) {
		pr_err("Could not create drop monitor netlink family\n");
		return rc;
	}

	rc = register_netdevice_notifier(&dropmon_net_notifier);
	if (rc < 0) {
		pr_crit("Failed to register netdevice notifier\n");
		goto out_unreg;
	}

	rc = 0;

	for_each_possible_cpu(cpu) {
		data = &per_cpu(dm_cpu_data, cpu);
		data->cpu = cpu;
		INIT_WORK(&data->dm_alert_work, send_dm_alert);
		init_timer(&data->send_timer);
		data->send_timer.data = cpu;
		data->send_timer.function = sched_send_work;
		reset_per_cpu_data(data);
	}


	goto out;

out_unreg:
	genl_unregister_family(&net_drop_monitor_family);
out:
	return rc;
}

static void exit_net_drop_monitor(void)
{
	struct per_cpu_dm_data *data;
	int cpu;

	BUG_ON(unregister_netdevice_notifier(&dropmon_net_notifier));

	/*
	 * Because of the module_get/put we do in the trace state change path
	 * we are guaranteed not to have any current users when we get here;
	 * all we need to do is make sure that we don't have any running timers
	 * or pending schedule calls
	 */

	for_each_possible_cpu(cpu) {
		data = &per_cpu(dm_cpu_data, cpu);
		del_timer_sync(&data->send_timer);
		cancel_work_sync(&data->dm_alert_work);
		/*
		 * At this point, we should have exclusive access
		 * to this struct and can free the skb inside it
		 */
		kfree_skb(data->skb);
	}

	BUG_ON(genl_unregister_family(&net_drop_monitor_family));
}

module_init(init_net_drop_monitor);
module_exit(exit_net_drop_monitor);

MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Neil Horman <nhorman@tuxdriver.com>");
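/*
 * Receiving the alerts sent by send_dm_alert() (again a hypothetical
 * libnl-3 sketch): subscribe to the hardcoded multicast group used by
 * genlmsg_multicast() above and read messages off the socket:
 *
 *	nl_socket_disable_seq_check(sk);
 *	nl_socket_add_membership(sk, NET_DM_GRP_ALERT);
 *	nl_socket_modify_cb(sk, NL_CB_VALID, NL_CB_CUSTOM, parse_alert, NULL);
 *	nl_recvmsgs_default(sk);
 *
 * where parse_alert() is a user-supplied callback that walks the drop
 * points as sketched near trace_napi_poll_hit() above.
 */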