xref: /linux/net/core/dev.c (revision a0f97e06a43cf524e616f09e6af3398e1e9c1c5b)
1 /*
2  * 	NET3	Protocol independent device support routines.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  *	Derived from the non IP parts of dev.c 1.0.19
10  * 		Authors:	Ross Biro
11  *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *				Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *
14  *	Additional Authors:
15  *		Florian la Roche <rzsfl@rz.uni-sb.de>
16  *		Alan Cox <gw4pts@gw4pts.ampr.org>
17  *		David Hinds <dahinds@users.sourceforge.net>
18  *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19  *		Adam Sulmicki <adam@cfar.umd.edu>
20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
21  *
22  *	Changes:
23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
24  *              			to 2 if register_netdev gets called
25  *              			before net_dev_init & also removed a
26  *              			few lines of code in the process.
27  *		Alan Cox	:	device private ioctl copies fields back.
28  *		Alan Cox	:	Transmit queue code does relevant
29  *					stunts to keep the queue safe.
30  *		Alan Cox	:	Fixed double lock.
31  *		Alan Cox	:	Fixed promisc NULL pointer trap
32  *		????????	:	Support the full private ioctl range
33  *		Alan Cox	:	Moved ioctl permission check into
34  *					drivers
35  *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
36  *		Alan Cox	:	100 backlog just doesn't cut it when
37  *					you start doing multicast video 8)
38  *		Alan Cox	:	Rewrote net_bh and list manager.
39  *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
40  *		Alan Cox	:	Took out transmit every packet pass
41  *					Saved a few bytes in the ioctl handler
42  *		Alan Cox	:	Network driver sets packet type before
43  *					calling netif_rx. Saves a function
44  *					call a packet.
45  *		Alan Cox	:	Hashed net_bh()
46  *		Richard Kooijman:	Timestamp fixes.
47  *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
48  *		Alan Cox	:	Device lock protection.
49  *		Alan Cox	: 	Fixed nasty side effect of device close
50  *					changes.
51  *		Rudi Cilibrasi	:	Pass the right thing to
52  *					set_mac_address()
53  *		Dave Miller	:	32bit quantity for the device lock to
54  *					make it work out on a Sparc.
55  *		Bjorn Ekwall	:	Added KERNELD hack.
56  *		Alan Cox	:	Cleaned up the backlog initialise.
57  *		Craig Metz	:	SIOCGIFCONF fix if space for under
58  *					1 device.
59  *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
60  *					is no device open function.
61  *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
62  *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
63  *		Cyrus Durgin	:	Cleaned for KMOD
64  *		Adam Sulmicki   :	Bug Fix : Network Device Unload
65  *					A network device unload needs to purge
66  *					the backlog queue.
67  *	Paul Rusty Russell	:	SIOCSIFNAME
68  *              Pekka Riikonen  :	Netdev boot-time settings code
69  *              Andrew Morton   :       Make unregister_netdevice wait
70  *              			indefinitely on dev->refcnt
71  * 		J Hadi Salim	:	- Backlog queue sampling
72  *				        - netif_rx() feedback
73  */
74 
75 #include <asm/uaccess.h>
76 #include <asm/system.h>
77 #include <linux/bitops.h>
78 #include <linux/capability.h>
79 #include <linux/cpu.h>
80 #include <linux/types.h>
81 #include <linux/kernel.h>
82 #include <linux/sched.h>
83 #include <linux/mutex.h>
84 #include <linux/string.h>
85 #include <linux/mm.h>
86 #include <linux/socket.h>
87 #include <linux/sockios.h>
88 #include <linux/errno.h>
89 #include <linux/interrupt.h>
90 #include <linux/if_ether.h>
91 #include <linux/netdevice.h>
92 #include <linux/etherdevice.h>
93 #include <linux/notifier.h>
94 #include <linux/skbuff.h>
95 #include <net/net_namespace.h>
96 #include <net/sock.h>
97 #include <linux/rtnetlink.h>
98 #include <linux/proc_fs.h>
99 #include <linux/seq_file.h>
100 #include <linux/stat.h>
101 #include <linux/if_bridge.h>
102 #include <linux/if_macvlan.h>
103 #include <net/dst.h>
104 #include <net/pkt_sched.h>
105 #include <net/checksum.h>
106 #include <linux/highmem.h>
107 #include <linux/init.h>
108 #include <linux/kmod.h>
109 #include <linux/module.h>
110 #include <linux/kallsyms.h>
111 #include <linux/netpoll.h>
112 #include <linux/rcupdate.h>
113 #include <linux/delay.h>
114 #include <net/wext.h>
115 #include <net/iw_handler.h>
116 #include <asm/current.h>
117 #include <linux/audit.h>
118 #include <linux/dmaengine.h>
119 #include <linux/err.h>
120 #include <linux/ctype.h>
121 #include <linux/if_arp.h>
122 
123 /*
124  *	The list of packet types we will receive (as opposed to discard)
125  *	and the routines to invoke.
126  *
127  *	Why 16? Because with 16 the only overlap we get on a hash of the
128  *	low nibble of the protocol value is RARP/SNAP/X.25.
129  *
130  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
131  *             sure which should go first, but I bet it won't make much
132  *             difference if we are running VLANs.  The good news is that
133  *             this protocol won't be in the list unless compiled in, so
134  *             the average user (w/out VLANs) will not be adversely affected.
135  *             --BLG
136  *
137  *		0800	IP
138  *		8100    802.1Q VLAN
139  *		0001	802.3
140  *		0002	AX.25
141  *		0004	802.2
142  *		8035	RARP
143  *		0005	SNAP
144  *		0805	X.25
145  *		0806	ARP
146  *		8137	IPX
147  *		0009	Localtalk
148  *		86DD	IPv6
149  */
150 
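/*
 * Worked illustration of the overlap mentioned above (added here for
 * clarity, not part of the original comment): hashing on the low nibble
 * of the protocol value puts RARP, SNAP and X.25 into the same bucket:
 *
 *	ETH_P_RARP & 15	= 0x8035 & 15 = 5
 *	ETH_P_SNAP & 15	= 0x0005 & 15 = 5
 *	ETH_P_X25  & 15	= 0x0805 & 15 = 5
 */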
151 static DEFINE_SPINLOCK(ptype_lock);
152 static struct list_head ptype_base[16] __read_mostly;	/* 16 way hashed list */
153 static struct list_head ptype_all __read_mostly;	/* Taps */
154 
155 #ifdef CONFIG_NET_DMA
156 struct net_dma {
157 	struct dma_client client;
158 	spinlock_t lock;
159 	cpumask_t channel_mask;
160 	struct dma_chan *channels[NR_CPUS];
161 };
162 
163 static enum dma_state_client
164 netdev_dma_event(struct dma_client *client, struct dma_chan *chan,
165 	enum dma_state state);
166 
167 static struct net_dma net_dma = {
168 	.client = {
169 		.event_callback = netdev_dma_event,
170 	},
171 };
172 #endif
173 
174 /*
175  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
176  * semaphore.
177  *
178  * Pure readers hold dev_base_lock for reading.
179  *
180  * Writers must hold the rtnl semaphore while they loop through the
181  * dev_base_head list, and hold dev_base_lock for writing when they do the
182  * actual updates.  This allows pure readers to access the list even
183  * while a writer is preparing to update it.
184  *
185  * To put it another way, dev_base_lock is held for writing only to
186  * protect against pure readers; the rtnl semaphore provides the
187  * protection against other writers.
188  *
189  * For example usage, see register_netdevice() and
190  * unregister_netdevice(), which must be called with the rtnl
191  * semaphore held.
192  */
193 DEFINE_RWLOCK(dev_base_lock);
194 
195 EXPORT_SYMBOL(dev_base_lock);
196 
197 #define NETDEV_HASHBITS	8
198 #define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS)
199 
200 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
201 {
202 	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
203 	return &net->dev_name_head[hash & ((1 << NETDEV_HASHBITS) - 1)];
204 }
205 
206 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
207 {
208 	return &net->dev_index_head[ifindex & ((1 << NETDEV_HASHBITS) - 1)];
209 }
210 
211 /* Device list insertion */
212 static int list_netdevice(struct net_device *dev)
213 {
214 	struct net *net = dev->nd_net;
215 
216 	ASSERT_RTNL();
217 
218 	write_lock_bh(&dev_base_lock);
219 	list_add_tail(&dev->dev_list, &net->dev_base_head);
220 	hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name));
221 	hlist_add_head(&dev->index_hlist, dev_index_hash(net, dev->ifindex));
222 	write_unlock_bh(&dev_base_lock);
223 	return 0;
224 }
225 
226 /* Device list removal */
227 static void unlist_netdevice(struct net_device *dev)
228 {
229 	ASSERT_RTNL();
230 
231 	/* Unlink dev from the device chain */
232 	write_lock_bh(&dev_base_lock);
233 	list_del(&dev->dev_list);
234 	hlist_del(&dev->name_hlist);
235 	hlist_del(&dev->index_hlist);
236 	write_unlock_bh(&dev_base_lock);
237 }
238 
239 /*
240  *	Our notifier list
241  */
242 
243 static RAW_NOTIFIER_HEAD(netdev_chain);
244 
245 /*
246  *	Device drivers call our routines to queue packets here. We empty the
247  *	queue in the local softnet handler.
248  */
249 
250 DEFINE_PER_CPU(struct softnet_data, softnet_data);
251 
252 extern int netdev_kobject_init(void);
253 extern int netdev_register_kobject(struct net_device *);
254 extern void netdev_unregister_kobject(struct net_device *);
255 
256 #ifdef CONFIG_DEBUG_LOCK_ALLOC
257 /*
258  * register_netdevice() inits dev->_xmit_lock and sets lockdep class
259  * according to dev->type
260  */
261 static const unsigned short netdev_lock_type[] =
262 	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
263 	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
264 	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
265 	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
266 	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
267 	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
268 	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
269 	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
270 	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
271 	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
272 	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
273 	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
274 	 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
275 	 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_VOID,
276 	 ARPHRD_NONE};
277 
278 static const char *netdev_lock_name[] =
279 	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
280 	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
281 	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
282 	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
283 	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
284 	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
285 	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
286 	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
287 	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
288 	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
289 	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
290 	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
291 	 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
292 	 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_VOID",
293 	 "_xmit_NONE"};
294 
295 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
296 
297 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
298 {
299 	int i;
300 
301 	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
302 		if (netdev_lock_type[i] == dev_type)
303 			return i;
304 	/* the last key is used by default */
305 	return ARRAY_SIZE(netdev_lock_type) - 1;
306 }
307 
308 static inline void netdev_set_lockdep_class(spinlock_t *lock,
309 					    unsigned short dev_type)
310 {
311 	int i;
312 
313 	i = netdev_lock_pos(dev_type);
314 	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
315 				   netdev_lock_name[i]);
316 }
317 #else
318 static inline void netdev_set_lockdep_class(spinlock_t *lock,
319 					    unsigned short dev_type)
320 {
321 }
322 #endif
323 
324 /*******************************************************************************
325 
326 		Protocol management and registration routines
327 
328 *******************************************************************************/
329 
330 /*
331  *	Add a protocol ID to the list. Now that the input handler is
332  *	smarter we can dispense with all the messy stuff that used to be
333  *	here.
334  *
335  *	BEWARE!!! Protocol handlers, mangling input packets,
336  *	MUST BE last in hash buckets and checking protocol handlers
337  *	MUST start from promiscuous ptype_all chain in net_bh.
338  *	It is true now, do not change it.
339  *	Explanation follows: if protocol handler, mangling packet, will
340  *	be the first on list, it is not able to sense, that packet
341  *	is cloned and should be copied-on-write, so that it will
342  *	change it and subsequent readers will get broken packet.
343  *							--ANK (980803)
344  */
345 
346 /**
347  *	dev_add_pack - add packet handler
348  *	@pt: packet type declaration
349  *
350  *	Add a protocol handler to the networking stack. The passed &packet_type
351  *	is linked into kernel lists and may not be freed until it has been
352  *	removed from the kernel lists.
353  *
354  *	This call does not sleep, therefore it cannot guarantee
355  *	that all CPUs that are in the middle of receiving packets
356  *	will see the new packet type (until the next received packet).
357  */
358 
359 void dev_add_pack(struct packet_type *pt)
360 {
361 	int hash;
362 
363 	spin_lock_bh(&ptype_lock);
364 	if (pt->type == htons(ETH_P_ALL))
365 		list_add_rcu(&pt->list, &ptype_all);
366 	else {
367 		hash = ntohs(pt->type) & 15;
368 		list_add_rcu(&pt->list, &ptype_base[hash]);
369 	}
370 	spin_unlock_bh(&ptype_lock);
371 }
372 
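/*
 * Illustrative sketch (not part of the original file): a hypothetical
 * module that taps IPv4 frames by registering a &packet_type with
 * dev_add_pack() and later removing it with dev_remove_pack().  The
 * handler and variable names are made up for the example.
 *
 *	static int my_ipv4_rcv(struct sk_buff *skb, struct net_device *dev,
 *			       struct packet_type *pt,
 *			       struct net_device *orig_dev)
 *	{
 *		// inspect the packet, then drop our reference to it
 *		kfree_skb(skb);
 *		return NET_RX_SUCCESS;
 *	}
 *
 *	static struct packet_type my_ipv4_ptype = {
 *		.type	= __constant_htons(ETH_P_IP),
 *		.func	= my_ipv4_rcv,
 *	};
 *
 *	dev_add_pack(&my_ipv4_ptype);		// in module init
 *	dev_remove_pack(&my_ipv4_ptype);	// in module exit
 */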
373 /**
374  *	__dev_remove_pack	 - remove packet handler
375  *	@pt: packet type declaration
376  *
377  *	Remove a protocol handler that was previously added to the kernel
378  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
379  *	from the kernel lists and can be freed or reused once this function
380  *	returns.
381  *
382  *      The packet type might still be in use by receivers
383  *	and must not be freed until after all the CPUs have gone
384  *	through a quiescent state.
385  */
386 void __dev_remove_pack(struct packet_type *pt)
387 {
388 	struct list_head *head;
389 	struct packet_type *pt1;
390 
391 	spin_lock_bh(&ptype_lock);
392 
393 	if (pt->type == htons(ETH_P_ALL))
394 		head = &ptype_all;
395 	else
396 		head = &ptype_base[ntohs(pt->type) & 15];
397 
398 	list_for_each_entry(pt1, head, list) {
399 		if (pt == pt1) {
400 			list_del_rcu(&pt->list);
401 			goto out;
402 		}
403 	}
404 
405 	printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
406 out:
407 	spin_unlock_bh(&ptype_lock);
408 }
409 /**
410  *	dev_remove_pack	 - remove packet handler
411  *	@pt: packet type declaration
412  *
413  *	Remove a protocol handler that was previously added to the kernel
414  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
415  *	from the kernel lists and can be freed or reused once this function
416  *	returns.
417  *
418  *	This call sleeps to guarantee that no CPU is looking at the packet
419  *	type after return.
420  */
421 void dev_remove_pack(struct packet_type *pt)
422 {
423 	__dev_remove_pack(pt);
424 
425 	synchronize_net();
426 }
427 
428 /******************************************************************************
429 
430 		      Device Boot-time Settings Routines
431 
432 *******************************************************************************/
433 
434 /* Boot time configuration table */
435 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
436 
437 /**
438  *	netdev_boot_setup_add	- add new setup entry
439  *	@name: name of the device
440  *	@map: configured settings for the device
441  *
442  *	Adds a new setup entry to the dev_boot_setup list.  The function
443  *	returns 0 on error and 1 on success.  This is a generic routine for
444  *	all netdevices.
445  */
446 static int netdev_boot_setup_add(char *name, struct ifmap *map)
447 {
448 	struct netdev_boot_setup *s;
449 	int i;
450 
451 	s = dev_boot_setup;
452 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
453 		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
454 			memset(s[i].name, 0, sizeof(s[i].name));
455 			strcpy(s[i].name, name);
456 			memcpy(&s[i].map, map, sizeof(s[i].map));
457 			break;
458 		}
459 	}
460 
461 	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
462 }
463 
464 /**
465  *	netdev_boot_setup_check	- check boot time settings
466  *	@dev: the netdevice
467  *
468  * 	Check boot time settings for the device.
469  *	Any settings found are copied into the device for use
470  *	later during device probing.
471  *	Returns 0 if no settings were found, 1 if they were.
472  */
473 int netdev_boot_setup_check(struct net_device *dev)
474 {
475 	struct netdev_boot_setup *s = dev_boot_setup;
476 	int i;
477 
478 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
479 		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
480 		    !strncmp(dev->name, s[i].name, strlen(s[i].name))) {
481 			dev->irq 	= s[i].map.irq;
482 			dev->base_addr 	= s[i].map.base_addr;
483 			dev->mem_start 	= s[i].map.mem_start;
484 			dev->mem_end 	= s[i].map.mem_end;
485 			return 1;
486 		}
487 	}
488 	return 0;
489 }
490 
491 
492 /**
493  *	netdev_boot_base	- get address from boot time settings
494  *	@prefix: prefix for network device
495  *	@unit: id for network device
496  *
497  * 	Check boot time settings for the base address of the device.
498  *	The base address found is returned so it can be used
499  *	later during device probing.
500  *	Returns 0 if no settings were found.
501  */
502 unsigned long netdev_boot_base(const char *prefix, int unit)
503 {
504 	const struct netdev_boot_setup *s = dev_boot_setup;
505 	char name[IFNAMSIZ];
506 	int i;
507 
508 	sprintf(name, "%s%d", prefix, unit);
509 
510 	/*
511 	 * If the device is already registered then return a base of 1
512 	 * to indicate that this interface should not be probed
513 	 */
514 	if (__dev_get_by_name(&init_net, name))
515 		return 1;
516 
517 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
518 		if (!strcmp(name, s[i].name))
519 			return s[i].map.base_addr;
520 	return 0;
521 }
522 
523 /*
524  * Saves the settings configured at boot time for any netdevice.
525  */
526 int __init netdev_boot_setup(char *str)
527 {
528 	int ints[5];
529 	struct ifmap map;
530 
531 	str = get_options(str, ARRAY_SIZE(ints), ints);
532 	if (!str || !*str)
533 		return 0;
534 
535 	/* Save settings */
536 	memset(&map, 0, sizeof(map));
537 	if (ints[0] > 0)
538 		map.irq = ints[1];
539 	if (ints[0] > 1)
540 		map.base_addr = ints[2];
541 	if (ints[0] > 2)
542 		map.mem_start = ints[3];
543 	if (ints[0] > 3)
544 		map.mem_end = ints[4];
545 
546 	/* Add new entry to the list */
547 	return netdev_boot_setup_add(str, &map);
548 }
549 
550 __setup("netdev=", netdev_boot_setup);
551 
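/*
 * Example (illustrative, not from the original file): a boot line such as
 *
 *	netdev=5,0x340,0,0,eth0
 *
 * is split by get_options() into the integers { 5, 0x340, 0, 0 } and the
 * remaining string "eth0", so the handler above records irq=5 and
 * base_addr=0x340 for a device that will later be named eth0.
 */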
552 /*******************************************************************************
553 
554 			    Device Interface Subroutines
555 
556 *******************************************************************************/
557 
558 /**
559  *	__dev_get_by_name	- find a device by its name
560  *	@name: name to find
561  *
562  *	Find an interface by name. Must be called under RTNL semaphore
563  *	or @dev_base_lock. If the name is found a pointer to the device
564  *	is returned. If the name is not found then %NULL is returned. The
565  *	reference counters are not incremented so the caller must be
566  *	careful with locks.
567  */
568 
569 struct net_device *__dev_get_by_name(struct net *net, const char *name)
570 {
571 	struct hlist_node *p;
572 
573 	hlist_for_each(p, dev_name_hash(net, name)) {
574 		struct net_device *dev
575 			= hlist_entry(p, struct net_device, name_hlist);
576 		if (!strncmp(dev->name, name, IFNAMSIZ))
577 			return dev;
578 	}
579 	return NULL;
580 }
581 
582 /**
583  *	dev_get_by_name		- find a device by its name
584  *	@name: name to find
585  *
586  *	Find an interface by name. This can be called from any
587  *	context and does its own locking. The returned handle has
588  *	the usage count incremented and the caller must use dev_put() to
589  *	release it when it is no longer needed. %NULL is returned if no
590  *	matching device is found.
591  */
592 
593 struct net_device *dev_get_by_name(struct net *net, const char *name)
594 {
595 	struct net_device *dev;
596 
597 	read_lock(&dev_base_lock);
598 	dev = __dev_get_by_name(net, name);
599 	if (dev)
600 		dev_hold(dev);
601 	read_unlock(&dev_base_lock);
602 	return dev;
603 }
604 
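/*
 * Illustrative use (not part of the original file); the name "eth0" is
 * just an example:
 *
 *	struct net_device *dev = dev_get_by_name(net, "eth0");
 *	if (dev) {
 *		// ... use dev ...
 *		dev_put(dev);		// drop the reference taken above
 *	}
 */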
605 /**
606  *	__dev_get_by_index - find a device by its ifindex
607  *	@ifindex: index of device
608  *
609  *	Search for an interface by index. Returns a pointer to the device,
610  *	or %NULL if the device is not found. The device has not
611  *	had its reference counter increased so the caller must be careful
612  *	about locking. The caller must hold either the RTNL semaphore
613  *	or @dev_base_lock.
614  */
615 
616 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
617 {
618 	struct hlist_node *p;
619 
620 	hlist_for_each(p, dev_index_hash(net, ifindex)) {
621 		struct net_device *dev
622 			= hlist_entry(p, struct net_device, index_hlist);
623 		if (dev->ifindex == ifindex)
624 			return dev;
625 	}
626 	return NULL;
627 }
628 
629 
630 /**
631  *	dev_get_by_index - find a device by its ifindex
632  *	@ifindex: index of device
633  *
634  *	Search for an interface by index. Returns a pointer to the device,
635  *	or NULL if the device is not found. The device returned has
636  *	had a reference added and the pointer is safe until the user calls
637  *	dev_put to indicate they have finished with it.
638  */
639 
640 struct net_device *dev_get_by_index(struct net *net, int ifindex)
641 {
642 	struct net_device *dev;
643 
644 	read_lock(&dev_base_lock);
645 	dev = __dev_get_by_index(net, ifindex);
646 	if (dev)
647 		dev_hold(dev);
648 	read_unlock(&dev_base_lock);
649 	return dev;
650 }
651 
652 /**
653  *	dev_getbyhwaddr - find a device by its hardware address
654  *	@type: media type of device
655  *	@ha: hardware address
656  *
657  *	Search for an interface by MAC address. Returns a pointer to the
658  *	device, or NULL if the device is not found. The caller must hold the
659  *	rtnl semaphore. The returned device has not had its ref count increased
660  *	and the caller must therefore be careful about locking
661  *
662  *	BUGS:
663  *	If the API was consistent this would be __dev_get_by_hwaddr
664  */
665 
666 struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *ha)
667 {
668 	struct net_device *dev;
669 
670 	ASSERT_RTNL();
671 
672 	for_each_netdev(&init_net, dev)
673 		if (dev->type == type &&
674 		    !memcmp(dev->dev_addr, ha, dev->addr_len))
675 			return dev;
676 
677 	return NULL;
678 }
679 
680 EXPORT_SYMBOL(dev_getbyhwaddr);
681 
682 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
683 {
684 	struct net_device *dev;
685 
686 	ASSERT_RTNL();
687 	for_each_netdev(net, dev)
688 		if (dev->type == type)
689 			return dev;
690 
691 	return NULL;
692 }
693 
694 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
695 
696 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
697 {
698 	struct net_device *dev;
699 
700 	rtnl_lock();
701 	dev = __dev_getfirstbyhwtype(net, type);
702 	if (dev)
703 		dev_hold(dev);
704 	rtnl_unlock();
705 	return dev;
706 }
707 
708 EXPORT_SYMBOL(dev_getfirstbyhwtype);
709 
710 /**
711  *	dev_get_by_flags - find any device with given flags
712  *	@if_flags: IFF_* values
713  *	@mask: bitmask of bits in if_flags to check
714  *
715  *	Search for any interface with the given flags. Returns a pointer to the
716  *	device, or NULL if no matching device is found. The device returned has
717  *	had a reference added and the pointer is safe until the user calls
718  *	dev_put to indicate they have finished with it.
719  */
720 
721 struct net_device * dev_get_by_flags(struct net *net, unsigned short if_flags, unsigned short mask)
722 {
723 	struct net_device *dev, *ret;
724 
725 	ret = NULL;
726 	read_lock(&dev_base_lock);
727 	for_each_netdev(net, dev) {
728 		if (((dev->flags ^ if_flags) & mask) == 0) {
729 			dev_hold(dev);
730 			ret = dev;
731 			break;
732 		}
733 	}
734 	read_unlock(&dev_base_lock);
735 	return ret;
736 }
737 
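/*
 * Illustrative use (not part of the original file): find any interface
 * that is administratively up by asking for IFF_UP in both value and mask.
 *
 *	struct net_device *dev = dev_get_by_flags(net, IFF_UP, IFF_UP);
 *	if (dev) {
 *		// ... use dev ...
 *		dev_put(dev);
 *	}
 */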
738 /**
739  *	dev_valid_name - check if name is okay for network device
740  *	@name: name string
741  *
742  *	Network device names need to be valid file names
743  *	to allow sysfs to work.  We also disallow any kind of
744  *	whitespace.
745  */
746 int dev_valid_name(const char *name)
747 {
748 	if (*name == '\0')
749 		return 0;
750 	if (strlen(name) >= IFNAMSIZ)
751 		return 0;
752 	if (!strcmp(name, ".") || !strcmp(name, ".."))
753 		return 0;
754 
755 	while (*name) {
756 		if (*name == '/' || isspace(*name))
757 			return 0;
758 		name++;
759 	}
760 	return 1;
761 }
762 
763 /**
764  *	__dev_alloc_name - allocate a name for a device
765  *	@net: network namespace to allocate the device name in
766  *	@name: name format string
767  *	@buf:  scratch buffer and result name string
768  *
769  *	Passed a format string - e.g. "lt%d" - it will try to find a suitable
770  *	id. It scans the list of devices to build up a free map, then chooses
771  *	the first empty slot. The caller must hold the dev_base or rtnl lock
772  *	while allocating the name and adding the device in order to avoid
773  *	duplicates.
774  *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
775  *	Returns the number of the unit assigned or a negative errno code.
776  */
777 
778 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
779 {
780 	int i = 0;
781 	const char *p;
782 	const int max_netdevices = 8*PAGE_SIZE;
783 	unsigned long *inuse;
784 	struct net_device *d;
785 
786 	p = strnchr(name, IFNAMSIZ-1, '%');
787 	if (p) {
788 		/*
789 		 * Verify the string as this thing may have come from
790 		 * the user.  There must be either one "%d" and no other "%"
791 		 * characters.
792 		 */
793 		if (p[1] != 'd' || strchr(p + 2, '%'))
794 			return -EINVAL;
795 
796 		/* Use one page as a bit array of possible slots */
797 		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
798 		if (!inuse)
799 			return -ENOMEM;
800 
801 		for_each_netdev(net, d) {
802 			if (!sscanf(d->name, name, &i))
803 				continue;
804 			if (i < 0 || i >= max_netdevices)
805 				continue;
806 
807 			/*  avoid cases where sscanf is not exact inverse of printf */
808 			snprintf(buf, IFNAMSIZ, name, i);
809 			if (!strncmp(buf, d->name, IFNAMSIZ))
810 				set_bit(i, inuse);
811 		}
812 
813 		i = find_first_zero_bit(inuse, max_netdevices);
814 		free_page((unsigned long) inuse);
815 	}
816 
817 	snprintf(buf, IFNAMSIZ, name, i);
818 	if (!__dev_get_by_name(net, buf))
819 		return i;
820 
821 	/* It is possible to run out of possible slots
822 	 * when the name is long and there isn't enough space left
823 	 * for the digits, or if all bits are used.
824 	 */
825 	return -ENFILE;
826 }
827 
828 /**
829  *	dev_alloc_name - allocate a name for a device
830  *	@dev: device
831  *	@name: name format string
832  *
833  *	Passed a format string - e.g. "lt%d" - it will try to find a suitable
834  *	id. It scans the list of devices to build up a free map, then chooses
835  *	the first empty slot. The caller must hold the dev_base or rtnl lock
836  *	while allocating the name and adding the device in order to avoid
837  *	duplicates.
838  *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
839  *	Returns the number of the unit assigned or a negative errno code.
840  */
841 
842 int dev_alloc_name(struct net_device *dev, const char *name)
843 {
844 	char buf[IFNAMSIZ];
845 	struct net *net;
846 	int ret;
847 
848 	BUG_ON(!dev->nd_net);
849 	net = dev->nd_net;
850 	ret = __dev_alloc_name(net, name, buf);
851 	if (ret >= 0)
852 		strlcpy(dev->name, buf, IFNAMSIZ);
853 	return ret;
854 }
855 
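/*
 * Illustrative use (not part of the original file): a driver that wants
 * the next free "eth%d" slot before registering its device.
 *
 *	err = dev_alloc_name(dev, "eth%d");
 *	if (err < 0)
 *		goto out;	// no free slot, or invalid format string
 *	// dev->name now holds e.g. "eth2"; err is the unit number assigned
 */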
856 
857 /**
858  *	dev_change_name - change name of a device
859  *	@dev: device
860  *	@newname: name (or format string) must be at least IFNAMSIZ
861  *
862  *	Change the name of a device. A format string such as "eth%d"
863  *	can be passed for wildcarding.
864  */
865 int dev_change_name(struct net_device *dev, char *newname)
866 {
867 	char oldname[IFNAMSIZ];
868 	int err = 0;
869 	int ret;
870 	struct net *net;
871 
872 	ASSERT_RTNL();
873 	BUG_ON(!dev->nd_net);
874 
875 	net = dev->nd_net;
876 	if (dev->flags & IFF_UP)
877 		return -EBUSY;
878 
879 	if (!dev_valid_name(newname))
880 		return -EINVAL;
881 
882 	memcpy(oldname, dev->name, IFNAMSIZ);
883 
884 	if (strchr(newname, '%')) {
885 		err = dev_alloc_name(dev, newname);
886 		if (err < 0)
887 			return err;
888 		strcpy(newname, dev->name);
889 	}
890 	else if (__dev_get_by_name(net, newname))
891 		return -EEXIST;
892 	else
893 		strlcpy(dev->name, newname, IFNAMSIZ);
894 
895 rollback:
896 	device_rename(&dev->dev, dev->name);
897 
898 	write_lock_bh(&dev_base_lock);
899 	hlist_del(&dev->name_hlist);
900 	hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name));
901 	write_unlock_bh(&dev_base_lock);
902 
903 	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
904 	ret = notifier_to_errno(ret);
905 
906 	if (ret) {
907 		if (err) {
908 			printk(KERN_ERR
909 			       "%s: name change rollback failed: %d.\n",
910 			       dev->name, ret);
911 		} else {
912 			err = ret;
913 			memcpy(dev->name, oldname, IFNAMSIZ);
914 			goto rollback;
915 		}
916 	}
917 
918 	return err;
919 }
920 
921 /**
922  *	netdev_features_change - device changes features
923  *	@dev: device to cause notification
924  *
925  *	Called to indicate a device has changed features.
926  */
927 void netdev_features_change(struct net_device *dev)
928 {
929 	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
930 }
931 EXPORT_SYMBOL(netdev_features_change);
932 
933 /**
934  *	netdev_state_change - device changes state
935  *	@dev: device to cause notification
936  *
937  *	Called to indicate a device has changed state. This function calls
938  *	the notifier chains for netdev_chain and sends a NEWLINK message
939  *	to the routing socket.
940  */
941 void netdev_state_change(struct net_device *dev)
942 {
943 	if (dev->flags & IFF_UP) {
944 		call_netdevice_notifiers(NETDEV_CHANGE, dev);
945 		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
946 	}
947 }
948 
949 /**
950  *	dev_load 	- load a network module
951  *	@name: name of interface
952  *
953  *	If a network interface is not present and the process has suitable
954  *	privileges, this function loads the module. If module loading is not
955  *	available in this kernel then it becomes a nop.
956  */
957 
958 void dev_load(struct net *net, const char *name)
959 {
960 	struct net_device *dev;
961 
962 	read_lock(&dev_base_lock);
963 	dev = __dev_get_by_name(net, name);
964 	read_unlock(&dev_base_lock);
965 
966 	if (!dev && capable(CAP_SYS_MODULE))
967 		request_module("%s", name);
968 }
969 
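/*
 * Illustrative use (not part of the original file): an ioctl path that
 * wants "ppp0" to exist can try to autoload its driver before looking
 * the device up (the name is just an example).
 *
 *	dev_load(net, "ppp0");
 *	dev = dev_get_by_name(net, "ppp0");
 */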
970 /**
971  *	dev_open	- prepare an interface for use.
972  *	@dev:	device to open
973  *
974  *	Takes a device from down to up state. The device's private open
975  *	function is invoked and then the multicast lists are loaded. Finally
976  *	the device is moved into the up state and a %NETDEV_UP message is
977  *	sent to the netdev notifier chain.
978  *
979  *	Calling this function on an active interface is a nop. On a failure
980  *	a negative errno code is returned.
981  */
982 int dev_open(struct net_device *dev)
983 {
984 	int ret = 0;
985 
986 	/*
987 	 *	Is it already up?
988 	 */
989 
990 	if (dev->flags & IFF_UP)
991 		return 0;
992 
993 	/*
994 	 *	Is it even present?
995 	 */
996 	if (!netif_device_present(dev))
997 		return -ENODEV;
998 
999 	/*
1000 	 *	Call device private open method
1001 	 */
1002 	set_bit(__LINK_STATE_START, &dev->state);
1003 	if (dev->open) {
1004 		ret = dev->open(dev);
1005 		if (ret)
1006 			clear_bit(__LINK_STATE_START, &dev->state);
1007 	}
1008 
1009 	/*
1010 	 *	If it went open OK then:
1011 	 */
1012 
1013 	if (!ret) {
1014 		/*
1015 		 *	Set the flags.
1016 		 */
1017 		dev->flags |= IFF_UP;
1018 
1019 		/*
1020 		 *	Initialize multicasting status
1021 		 */
1022 		dev_set_rx_mode(dev);
1023 
1024 		/*
1025 		 *	Wakeup transmit queue engine
1026 		 */
1027 		dev_activate(dev);
1028 
1029 		/*
1030 		 *	... and announce new interface.
1031 		 */
1032 		call_netdevice_notifiers(NETDEV_UP, dev);
1033 	}
1034 	return ret;
1035 }
1036 
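/*
 * Illustrative use (not part of the original file): kernel code normally
 * brings an interface up through the flags interface while holding the
 * RTNL semaphore, which in turn calls dev_open():
 *
 *	rtnl_lock();
 *	err = dev_change_flags(dev, dev->flags | IFF_UP);
 *	rtnl_unlock();
 */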
1037 /**
1038  *	dev_close - shutdown an interface.
1039  *	@dev: device to shutdown
1040  *
1041  *	This function moves an active device into down state. A
1042  *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1043  *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1044  *	chain.
1045  */
1046 int dev_close(struct net_device *dev)
1047 {
1048 	might_sleep();
1049 
1050 	if (!(dev->flags & IFF_UP))
1051 		return 0;
1052 
1053 	/*
1054 	 *	Tell people we are going down, so that they can
1055 	 *	prepare for the shutdown while the device is still operating.
1056 	 */
1057 	call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1058 
1059 	dev_deactivate(dev);
1060 
1061 	clear_bit(__LINK_STATE_START, &dev->state);
1062 
1063 	/* Synchronize to scheduled poll. We cannot touch poll list,
1064 	 * it can be even on different cpu. So just clear netif_running().
1065 	 *
1066 	 * dev->stop() will invoke napi_disable() on all of its
1067 	 * napi_struct instances on this device.
1068 	 */
1069 	smp_mb__after_clear_bit(); /* Commit netif_running(). */
1070 
1071 	/*
1072 	 *	Call the device specific close. This cannot fail.
1073 	 *	Only if device is UP
1074 	 *
1075 	 *	We allow it to be called even after a DETACH hot-plug
1076 	 *	event.
1077 	 */
1078 	if (dev->stop)
1079 		dev->stop(dev);
1080 
1081 	/*
1082 	 *	Device is now down.
1083 	 */
1084 
1085 	dev->flags &= ~IFF_UP;
1086 
1087 	/*
1088 	 * Tell people we are down
1089 	 */
1090 	call_netdevice_notifiers(NETDEV_DOWN, dev);
1091 
1092 	return 0;
1093 }
1094 
1095 
1096 static int dev_boot_phase = 1;
1097 
1098 /*
1099  *	Device change register/unregister. These are not inline or static
1100  *	as we export them to the world.
1101  */
1102 
1103 /**
1104  *	register_netdevice_notifier - register a network notifier block
1105  *	@nb: notifier
1106  *
1107  *	Register a notifier to be called when network device events occur.
1108  *	The notifier passed is linked into the kernel structures and must
1109  *	not be reused until it has been unregistered. A negative errno code
1110  *	is returned on a failure.
1111  *
1112  * 	When registered, all registration and up events are replayed
1113  *	to the new notifier so that it gets a race-free view of the
1114  *	network device list.
1115  */
1116 
1117 int register_netdevice_notifier(struct notifier_block *nb)
1118 {
1119 	struct net_device *dev;
1120 	struct net_device *last;
1121 	struct net *net;
1122 	int err;
1123 
1124 	rtnl_lock();
1125 	err = raw_notifier_chain_register(&netdev_chain, nb);
1126 	if (err)
1127 		goto unlock;
1128 	if (dev_boot_phase)
1129 		goto unlock;
1130 	for_each_net(net) {
1131 		for_each_netdev(net, dev) {
1132 			err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1133 			err = notifier_to_errno(err);
1134 			if (err)
1135 				goto rollback;
1136 
1137 			if (!(dev->flags & IFF_UP))
1138 				continue;
1139 
1140 			nb->notifier_call(nb, NETDEV_UP, dev);
1141 		}
1142 	}
1143 
1144 unlock:
1145 	rtnl_unlock();
1146 	return err;
1147 
1148 rollback:
1149 	last = dev;
1150 	for_each_net(net) {
1151 		for_each_netdev(net, dev) {
1152 			if (dev == last)
1153 				break;
1154 
1155 			if (dev->flags & IFF_UP) {
1156 				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1157 				nb->notifier_call(nb, NETDEV_DOWN, dev);
1158 			}
1159 			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1160 		}
1161 	}
1162 	goto unlock;
1163 }
1164 
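/*
 * Illustrative sketch (not part of the original file): a hypothetical
 * notifier that logs interfaces coming up.  All names are made up.
 *
 *	static int my_netdev_event(struct notifier_block *nb,
 *				   unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = ptr;
 *
 *		if (event == NETDEV_UP)
 *			printk(KERN_INFO "%s is up\n", dev->name);
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block my_netdev_nb = {
 *		.notifier_call = my_netdev_event,
 *	};
 *
 *	register_netdevice_notifier(&my_netdev_nb);
 */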
1165 /**
1166  *	unregister_netdevice_notifier - unregister a network notifier block
1167  *	@nb: notifier
1168  *
1169  *	Unregister a notifier previously registered by
1170  *	register_netdevice_notifier(). The notifier is unlinked from the
1171  *	kernel structures and may then be reused. A negative errno code
1172  *	is returned on a failure.
1173  */
1174 
1175 int unregister_netdevice_notifier(struct notifier_block *nb)
1176 {
1177 	int err;
1178 
1179 	rtnl_lock();
1180 	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1181 	rtnl_unlock();
1182 	return err;
1183 }
1184 
1185 /**
1186  *	call_netdevice_notifiers - call all network notifier blocks
1187  *      @val: value passed unmodified to notifier function
1188  *      @v:   pointer passed unmodified to notifier function
1189  *
1190  *	Call all network notifier blocks.  Parameters and return value
1191  *	are as for raw_notifier_call_chain().
1192  */
1193 
1194 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1195 {
1196 	return raw_notifier_call_chain(&netdev_chain, val, dev);
1197 }
1198 
1199 /* When > 0 there are consumers of rx skb time stamps */
1200 static atomic_t netstamp_needed = ATOMIC_INIT(0);
1201 
1202 void net_enable_timestamp(void)
1203 {
1204 	atomic_inc(&netstamp_needed);
1205 }
1206 
1207 void net_disable_timestamp(void)
1208 {
1209 	atomic_dec(&netstamp_needed);
1210 }
1211 
1212 static inline void net_timestamp(struct sk_buff *skb)
1213 {
1214 	if (atomic_read(&netstamp_needed))
1215 		__net_timestamp(skb);
1216 	else
1217 		skb->tstamp.tv64 = 0;
1218 }
1219 
1220 /*
1221  *	Support routine. Sends outgoing frames to any network
1222  *	taps currently in use.
1223  */
1224 
1225 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1226 {
1227 	struct packet_type *ptype;
1228 
1229 	net_timestamp(skb);
1230 
1231 	rcu_read_lock();
1232 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
1233 		/* Never send packets back to the socket
1234 		 * they originated from - MvS (miquels@drinkel.ow.org)
1235 		 */
1236 		if ((ptype->dev == dev || !ptype->dev) &&
1237 		    (ptype->af_packet_priv == NULL ||
1238 		     (struct sock *)ptype->af_packet_priv != skb->sk)) {
1239 			struct sk_buff *skb2= skb_clone(skb, GFP_ATOMIC);
1240 			if (!skb2)
1241 				break;
1242 
1243 			/* skb->nh should be correctly
1244 			   set by the sender, so that the second statement is
1245 			   just protection against buggy protocols.
1246 			 */
1247 			skb_reset_mac_header(skb2);
1248 
1249 			if (skb_network_header(skb2) < skb2->data ||
1250 			    skb2->network_header > skb2->tail) {
1251 				if (net_ratelimit())
1252 					printk(KERN_CRIT "protocol %04x is "
1253 					       "buggy, dev %s\n",
1254 					       skb2->protocol, dev->name);
1255 				skb_reset_network_header(skb2);
1256 			}
1257 
1258 			skb2->transport_header = skb2->network_header;
1259 			skb2->pkt_type = PACKET_OUTGOING;
1260 			ptype->func(skb2, skb->dev, ptype, skb->dev);
1261 		}
1262 	}
1263 	rcu_read_unlock();
1264 }
1265 
1266 
1267 void __netif_schedule(struct net_device *dev)
1268 {
1269 	if (!test_and_set_bit(__LINK_STATE_SCHED, &dev->state)) {
1270 		unsigned long flags;
1271 		struct softnet_data *sd;
1272 
1273 		local_irq_save(flags);
1274 		sd = &__get_cpu_var(softnet_data);
1275 		dev->next_sched = sd->output_queue;
1276 		sd->output_queue = dev;
1277 		raise_softirq_irqoff(NET_TX_SOFTIRQ);
1278 		local_irq_restore(flags);
1279 	}
1280 }
1281 EXPORT_SYMBOL(__netif_schedule);
1282 
1283 void dev_kfree_skb_irq(struct sk_buff *skb)
1284 {
1285 	if (atomic_dec_and_test(&skb->users)) {
1286 		struct softnet_data *sd;
1287 		unsigned long flags;
1288 
1289 		local_irq_save(flags);
1290 		sd = &__get_cpu_var(softnet_data);
1291 		skb->next = sd->completion_queue;
1292 		sd->completion_queue = skb;
1293 		raise_softirq_irqoff(NET_TX_SOFTIRQ);
1294 		local_irq_restore(flags);
1295 	}
1296 }
1297 EXPORT_SYMBOL(dev_kfree_skb_irq);
1298 
1299 void dev_kfree_skb_any(struct sk_buff *skb)
1300 {
1301 	if (in_irq() || irqs_disabled())
1302 		dev_kfree_skb_irq(skb);
1303 	else
1304 		dev_kfree_skb(skb);
1305 }
1306 EXPORT_SYMBOL(dev_kfree_skb_any);
1307 
1308 
1309 /**
1310  * netif_device_detach - mark device as removed
1311  * @dev: network device
1312  *
1313  * Mark device as removed from the system and therefore no longer available.
1314  */
1315 void netif_device_detach(struct net_device *dev)
1316 {
1317 	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1318 	    netif_running(dev)) {
1319 		netif_stop_queue(dev);
1320 	}
1321 }
1322 EXPORT_SYMBOL(netif_device_detach);
1323 
1324 /**
1325  * netif_device_attach - mark device as attached
1326  * @dev: network device
1327  *
1328  * Mark device as attached to the system and restart if needed.
1329  */
1330 void netif_device_attach(struct net_device *dev)
1331 {
1332 	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1333 	    netif_running(dev)) {
1334 		netif_wake_queue(dev);
1335 		__netdev_watchdog_up(dev);
1336 	}
1337 }
1338 EXPORT_SYMBOL(netif_device_attach);
1339 
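/*
 * Illustrative sketch (not part of the original file): a typical PCI
 * network driver pairs these calls in its suspend/resume handlers
 * (the function names here are hypothetical).
 *
 *	static int my_suspend(struct pci_dev *pdev, pm_message_t state)
 *	{
 *		struct net_device *dev = pci_get_drvdata(pdev);
 *
 *		netif_device_detach(dev);	// stop queue, mark absent
 *		// ... save state and power down the hardware ...
 *		return 0;
 *	}
 *
 *	static int my_resume(struct pci_dev *pdev)
 *	{
 *		struct net_device *dev = pci_get_drvdata(pdev);
 *
 *		// ... power up and reinitialise the hardware ...
 *		netif_device_attach(dev);	// mark present, wake queue
 *		return 0;
 *	}
 */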
1340 
1341 /*
1342  * Invalidate hardware checksum when packet is to be mangled, and
1343  * complete checksum manually on outgoing path.
1344  */
1345 int skb_checksum_help(struct sk_buff *skb)
1346 {
1347 	__wsum csum;
1348 	int ret = 0, offset;
1349 
1350 	if (skb->ip_summed == CHECKSUM_COMPLETE)
1351 		goto out_set_summed;
1352 
1353 	if (unlikely(skb_shinfo(skb)->gso_size)) {
1354 		/* Let GSO fix up the checksum. */
1355 		goto out_set_summed;
1356 	}
1357 
1358 	if (skb_cloned(skb)) {
1359 		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1360 		if (ret)
1361 			goto out;
1362 	}
1363 
1364 	offset = skb->csum_start - skb_headroom(skb);
1365 	BUG_ON(offset > (int)skb->len);
1366 	csum = skb_checksum(skb, offset, skb->len-offset, 0);
1367 
1368 	offset = skb_headlen(skb) - offset;
1369 	BUG_ON(offset <= 0);
1370 	BUG_ON(skb->csum_offset + 2 > offset);
1371 
1372 	*(__sum16 *)(skb->head + skb->csum_start + skb->csum_offset) =
1373 		csum_fold(csum);
1374 out_set_summed:
1375 	skb->ip_summed = CHECKSUM_NONE;
1376 out:
1377 	return ret;
1378 }
1379 
1380 /**
1381  *	skb_gso_segment - Perform segmentation on skb.
1382  *	@skb: buffer to segment
1383  *	@features: features for the output path (see dev->features)
1384  *
1385  *	This function segments the given skb and returns a list of segments.
1386  *
1387  *	It may return NULL if the skb requires no segmentation.  This is
1388  *	only possible when GSO is used for verifying header integrity.
1389  */
1390 struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
1391 {
1392 	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1393 	struct packet_type *ptype;
1394 	__be16 type = skb->protocol;
1395 	int err;
1396 
1397 	BUG_ON(skb_shinfo(skb)->frag_list);
1398 
1399 	skb_reset_mac_header(skb);
1400 	skb->mac_len = skb->network_header - skb->mac_header;
1401 	__skb_pull(skb, skb->mac_len);
1402 
1403 	if (WARN_ON(skb->ip_summed != CHECKSUM_PARTIAL)) {
1404 		if (skb_header_cloned(skb) &&
1405 		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1406 			return ERR_PTR(err);
1407 	}
1408 
1409 	rcu_read_lock();
1410 	list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type) & 15], list) {
1411 		if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1412 			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1413 				err = ptype->gso_send_check(skb);
1414 				segs = ERR_PTR(err);
1415 				if (err || skb_gso_ok(skb, features))
1416 					break;
1417 				__skb_push(skb, (skb->data -
1418 						 skb_network_header(skb)));
1419 			}
1420 			segs = ptype->gso_segment(skb, features);
1421 			break;
1422 		}
1423 	}
1424 	rcu_read_unlock();
1425 
1426 	__skb_push(skb, skb->data - skb_mac_header(skb));
1427 
1428 	return segs;
1429 }
1430 
1431 EXPORT_SYMBOL(skb_gso_segment);
1432 
1433 /* Take action when hardware reception checksum errors are detected. */
1434 #ifdef CONFIG_BUG
1435 void netdev_rx_csum_fault(struct net_device *dev)
1436 {
1437 	if (net_ratelimit()) {
1438 		printk(KERN_ERR "%s: hw csum failure.\n",
1439 			dev ? dev->name : "<unknown>");
1440 		dump_stack();
1441 	}
1442 }
1443 EXPORT_SYMBOL(netdev_rx_csum_fault);
1444 #endif
1445 
1446 /* Actually, we should eliminate this check as soon as we know that:
1447  * 1. An IOMMU is present and can map all of the memory.
1448  * 2. No high memory really exists on this machine.
1449  */
1450 
1451 static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
1452 {
1453 #ifdef CONFIG_HIGHMEM
1454 	int i;
1455 
1456 	if (dev->features & NETIF_F_HIGHDMA)
1457 		return 0;
1458 
1459 	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
1460 		if (PageHighMem(skb_shinfo(skb)->frags[i].page))
1461 			return 1;
1462 
1463 #endif
1464 	return 0;
1465 }
1466 
1467 struct dev_gso_cb {
1468 	void (*destructor)(struct sk_buff *skb);
1469 };
1470 
1471 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1472 
1473 static void dev_gso_skb_destructor(struct sk_buff *skb)
1474 {
1475 	struct dev_gso_cb *cb;
1476 
1477 	do {
1478 		struct sk_buff *nskb = skb->next;
1479 
1480 		skb->next = nskb->next;
1481 		nskb->next = NULL;
1482 		kfree_skb(nskb);
1483 	} while (skb->next);
1484 
1485 	cb = DEV_GSO_CB(skb);
1486 	if (cb->destructor)
1487 		cb->destructor(skb);
1488 }
1489 
1490 /**
1491  *	dev_gso_segment - Perform emulated hardware segmentation on skb.
1492  *	@skb: buffer to segment
1493  *
1494  *	This function segments the given skb and stores the list of segments
1495  *	in skb->next.
1496  */
1497 static int dev_gso_segment(struct sk_buff *skb)
1498 {
1499 	struct net_device *dev = skb->dev;
1500 	struct sk_buff *segs;
1501 	int features = dev->features & ~(illegal_highdma(dev, skb) ?
1502 					 NETIF_F_SG : 0);
1503 
1504 	segs = skb_gso_segment(skb, features);
1505 
1506 	/* Verifying header integrity only. */
1507 	if (!segs)
1508 		return 0;
1509 
1510 	if (unlikely(IS_ERR(segs)))
1511 		return PTR_ERR(segs);
1512 
1513 	skb->next = segs;
1514 	DEV_GSO_CB(skb)->destructor = skb->destructor;
1515 	skb->destructor = dev_gso_skb_destructor;
1516 
1517 	return 0;
1518 }
1519 
1520 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
1521 {
1522 	if (likely(!skb->next)) {
1523 		if (!list_empty(&ptype_all))
1524 			dev_queue_xmit_nit(skb, dev);
1525 
1526 		if (netif_needs_gso(dev, skb)) {
1527 			if (unlikely(dev_gso_segment(skb)))
1528 				goto out_kfree_skb;
1529 			if (skb->next)
1530 				goto gso;
1531 		}
1532 
1533 		return dev->hard_start_xmit(skb, dev);
1534 	}
1535 
1536 gso:
1537 	do {
1538 		struct sk_buff *nskb = skb->next;
1539 		int rc;
1540 
1541 		skb->next = nskb->next;
1542 		nskb->next = NULL;
1543 		rc = dev->hard_start_xmit(nskb, dev);
1544 		if (unlikely(rc)) {
1545 			nskb->next = skb->next;
1546 			skb->next = nskb;
1547 			return rc;
1548 		}
1549 		if (unlikely((netif_queue_stopped(dev) ||
1550 			     netif_subqueue_stopped(dev, skb->queue_mapping)) &&
1551 			     skb->next))
1552 			return NETDEV_TX_BUSY;
1553 	} while (skb->next);
1554 
1555 	skb->destructor = DEV_GSO_CB(skb)->destructor;
1556 
1557 out_kfree_skb:
1558 	kfree_skb(skb);
1559 	return 0;
1560 }
1561 
1562 /**
1563  *	dev_queue_xmit - transmit a buffer
1564  *	@skb: buffer to transmit
1565  *
1566  *	Queue a buffer for transmission to a network device. The caller must
1567  *	have set the device and priority and built the buffer before calling
1568  *	this function. The function can be called from an interrupt.
1569  *
1570  *	A negative errno code is returned on a failure. A success does not
1571  *	guarantee the frame will be transmitted as it may be dropped due
1572  *	to congestion or traffic shaping.
1573  *
1574  * -----------------------------------------------------------------------------------
1575  *      I notice this method can also return errors from the queue disciplines,
1576  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
1577  *      be positive.
1578  *
1579  *      Regardless of the return value, the skb is consumed, so it is currently
1580  *      difficult to retry a send to this method.  (You can bump the ref count
1581  *      before sending to hold a reference for retry if you are careful.)
1582  *
1583  *      When calling this method, interrupts MUST be enabled.  This is because
1584  *      the BH enable code must have IRQs enabled so that it will not deadlock.
1585  *          --BLG
1586  */
1587 
1588 int dev_queue_xmit(struct sk_buff *skb)
1589 {
1590 	struct net_device *dev = skb->dev;
1591 	struct Qdisc *q;
1592 	int rc = -ENOMEM;
1593 
1594 	/* GSO will handle the following emulations directly. */
1595 	if (netif_needs_gso(dev, skb))
1596 		goto gso;
1597 
1598 	if (skb_shinfo(skb)->frag_list &&
1599 	    !(dev->features & NETIF_F_FRAGLIST) &&
1600 	    __skb_linearize(skb))
1601 		goto out_kfree_skb;
1602 
1603 	/* A fragmented skb is linearized if the device does not support SG,
1604 	 * or if at least one of the fragments is in highmem and the device
1605 	 * does not support DMA from it.
1606 	 */
1607 	if (skb_shinfo(skb)->nr_frags &&
1608 	    (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) &&
1609 	    __skb_linearize(skb))
1610 		goto out_kfree_skb;
1611 
1612 	/* If packet is not checksummed and device does not support
1613 	 * checksumming for this protocol, complete checksumming here.
1614 	 */
1615 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
1616 		skb_set_transport_header(skb, skb->csum_start -
1617 					      skb_headroom(skb));
1618 
1619 		if (!(dev->features & NETIF_F_GEN_CSUM) &&
1620 		    !((dev->features & NETIF_F_IP_CSUM) &&
1621 		      skb->protocol == htons(ETH_P_IP)) &&
1622 		    !((dev->features & NETIF_F_IPV6_CSUM) &&
1623 		      skb->protocol == htons(ETH_P_IPV6)))
1624 			if (skb_checksum_help(skb))
1625 				goto out_kfree_skb;
1626 	}
1627 
1628 gso:
1629 	spin_lock_prefetch(&dev->queue_lock);
1630 
1631 	/* Disable soft irqs for various locks below. Also
1632 	 * stops preemption for RCU.
1633 	 */
1634 	rcu_read_lock_bh();
1635 
1636 	/* Updates of qdisc are serialized by queue_lock.
1637 	 * The struct Qdisc which is pointed to by qdisc is now a
1638 	 * rcu structure - it may be accessed without acquiring
1639 	 * a lock (but the structure may be stale.) The freeing of the
1640 	 * qdisc will be deferred until it's known that there are no
1641 	 * more references to it.
1642 	 *
1643 	 * If the qdisc has an enqueue function, we still need to
1644 	 * hold the queue_lock before calling it, since queue_lock
1645 	 * also serializes access to the device queue.
1646 	 */
1647 
1648 	q = rcu_dereference(dev->qdisc);
1649 #ifdef CONFIG_NET_CLS_ACT
1650 	skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_EGRESS);
1651 #endif
1652 	if (q->enqueue) {
1653 		/* Grab device queue */
1654 		spin_lock(&dev->queue_lock);
1655 		q = dev->qdisc;
1656 		if (q->enqueue) {
1657 			/* reset queue_mapping to zero */
1658 			skb->queue_mapping = 0;
1659 			rc = q->enqueue(skb, q);
1660 			qdisc_run(dev);
1661 			spin_unlock(&dev->queue_lock);
1662 
1663 			rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc;
1664 			goto out;
1665 		}
1666 		spin_unlock(&dev->queue_lock);
1667 	}
1668 
1669 	/* The device has no queue. Common case for software devices:
1670 	   loopback, all the sorts of tunnels...
1671 
1672 	   Really, it is unlikely that netif_tx_lock protection is necessary
1673 	   here.  (e.g. loopback and IP tunnels are clean, ignoring statistics
1674 	   counters.)
1675 	   However, it is possible that they rely on the protection
1676 	   we provide here.
1677 
1678 	   Check this and shoot the lock. It is not prone to deadlocks.
1679 	   Or shoot the noqueue qdisc instead - that is even simpler 8)
1680 	 */
1681 	if (dev->flags & IFF_UP) {
1682 		int cpu = smp_processor_id(); /* ok because BHs are off */
1683 
1684 		if (dev->xmit_lock_owner != cpu) {
1685 
1686 			HARD_TX_LOCK(dev, cpu);
1687 
1688 			if (!netif_queue_stopped(dev) &&
1689 			    !netif_subqueue_stopped(dev, skb->queue_mapping)) {
1690 				rc = 0;
1691 				if (!dev_hard_start_xmit(skb, dev)) {
1692 					HARD_TX_UNLOCK(dev);
1693 					goto out;
1694 				}
1695 			}
1696 			HARD_TX_UNLOCK(dev);
1697 			if (net_ratelimit())
1698 				printk(KERN_CRIT "Virtual device %s asks to "
1699 				       "queue packet!\n", dev->name);
1700 		} else {
1701 			/* Recursion is detected! It is possible,
1702 			 * unfortunately */
1703 			if (net_ratelimit())
1704 				printk(KERN_CRIT "Dead loop on virtual device "
1705 				       "%s, fix it urgently!\n", dev->name);
1706 		}
1707 	}
1708 
1709 	rc = -ENETDOWN;
1710 	rcu_read_unlock_bh();
1711 
1712 out_kfree_skb:
1713 	kfree_skb(skb);
1714 	return rc;
1715 out:
1716 	rcu_read_unlock_bh();
1717 	return rc;
1718 }
1719 
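/*
 * Illustrative use (not part of the original file): a caller that has
 * built an skb, resolved the output device and filled in the link-layer
 * header simply hands it to dev_queue_xmit(); the skb is consumed either
 * way and must not be touched afterwards.
 *
 *	skb->dev = dev;
 *	rc = dev_queue_xmit(skb);	// may return negative or NET_XMIT_* codes
 */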
1720 
1721 /*=======================================================================
1722 			Receiver routines
1723   =======================================================================*/
1724 
1725 int netdev_max_backlog __read_mostly = 1000;
1726 int netdev_budget __read_mostly = 300;
1727 int weight_p __read_mostly = 64;            /* old backlog weight */
1728 
1729 DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
1730 
1731 
1732 /**
1733  *	netif_rx	-	post buffer to the network code
1734  *	@skb: buffer to post
1735  *
1736  *	This function receives a packet from a device driver and queues it for
1737  *	the upper (protocol) levels to process.  It always succeeds. The buffer
1738  *	may be dropped during processing for congestion control or by the
1739  *	protocol layers.
1740  *
1741  *	return values:
1742  *	NET_RX_SUCCESS	(no congestion)
1743  *	NET_RX_CN_LOW   (low congestion)
1744  *	NET_RX_CN_MOD   (moderate congestion)
1745  *	NET_RX_CN_HIGH  (high congestion)
1746  *	NET_RX_DROP     (packet was dropped)
1747  *
1748  */
1749 
1750 int netif_rx(struct sk_buff *skb)
1751 {
1752 	struct softnet_data *queue;
1753 	unsigned long flags;
1754 
1755 	/* if netpoll wants it, pretend we never saw it */
1756 	if (netpoll_rx(skb))
1757 		return NET_RX_DROP;
1758 
1759 	if (!skb->tstamp.tv64)
1760 		net_timestamp(skb);
1761 
1762 	/*
1763 	 * The code is rearranged so that the path is the shortest
1764 	 * when the CPU is congested, but still operating.
1765 	 */
1766 	local_irq_save(flags);
1767 	queue = &__get_cpu_var(softnet_data);
1768 
1769 	__get_cpu_var(netdev_rx_stat).total++;
1770 	if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
1771 		if (queue->input_pkt_queue.qlen) {
1772 enqueue:
1773 			dev_hold(skb->dev);
1774 			__skb_queue_tail(&queue->input_pkt_queue, skb);
1775 			local_irq_restore(flags);
1776 			return NET_RX_SUCCESS;
1777 		}
1778 
1779 		napi_schedule(&queue->backlog);
1780 		goto enqueue;
1781 	}
1782 
1783 	__get_cpu_var(netdev_rx_stat).dropped++;
1784 	local_irq_restore(flags);
1785 
1786 	kfree_skb(skb);
1787 	return NET_RX_DROP;
1788 }
1789 
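/*
 * Illustrative use (not part of the original file): a non-NAPI driver's
 * receive interrupt typically builds an skb around the received frame and
 * feeds it to netif_rx(); the details below are driver specific and made
 * up for the example.
 *
 *	skb = dev_alloc_skb(len + 2);
 *	if (!skb)
 *		return;				// frame is dropped
 *	skb_reserve(skb, 2);			// align the IP header
 *	memcpy(skb_put(skb, len), data, len);
 *	skb->protocol = eth_type_trans(skb, dev);
 *	netif_rx(skb);
 */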
1790 int netif_rx_ni(struct sk_buff *skb)
1791 {
1792 	int err;
1793 
1794 	preempt_disable();
1795 	err = netif_rx(skb);
1796 	if (local_softirq_pending())
1797 		do_softirq();
1798 	preempt_enable();
1799 
1800 	return err;
1801 }
1802 
1803 EXPORT_SYMBOL(netif_rx_ni);
1804 
1805 static inline struct net_device *skb_bond(struct sk_buff *skb)
1806 {
1807 	struct net_device *dev = skb->dev;
1808 
1809 	if (dev->master) {
1810 		if (skb_bond_should_drop(skb)) {
1811 			kfree_skb(skb);
1812 			return NULL;
1813 		}
1814 		skb->dev = dev->master;
1815 	}
1816 
1817 	return dev;
1818 }
1819 
1820 
1821 static void net_tx_action(struct softirq_action *h)
1822 {
1823 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
1824 
1825 	if (sd->completion_queue) {
1826 		struct sk_buff *clist;
1827 
1828 		local_irq_disable();
1829 		clist = sd->completion_queue;
1830 		sd->completion_queue = NULL;
1831 		local_irq_enable();
1832 
1833 		while (clist) {
1834 			struct sk_buff *skb = clist;
1835 			clist = clist->next;
1836 
1837 			BUG_TRAP(!atomic_read(&skb->users));
1838 			__kfree_skb(skb);
1839 		}
1840 	}
1841 
1842 	if (sd->output_queue) {
1843 		struct net_device *head;
1844 
1845 		local_irq_disable();
1846 		head = sd->output_queue;
1847 		sd->output_queue = NULL;
1848 		local_irq_enable();
1849 
1850 		while (head) {
1851 			struct net_device *dev = head;
1852 			head = head->next_sched;
1853 
1854 			smp_mb__before_clear_bit();
1855 			clear_bit(__LINK_STATE_SCHED, &dev->state);
1856 
1857 			if (spin_trylock(&dev->queue_lock)) {
1858 				qdisc_run(dev);
1859 				spin_unlock(&dev->queue_lock);
1860 			} else {
1861 				netif_schedule(dev);
1862 			}
1863 		}
1864 	}
1865 }
1866 
1867 static inline int deliver_skb(struct sk_buff *skb,
1868 			      struct packet_type *pt_prev,
1869 			      struct net_device *orig_dev)
1870 {
1871 	atomic_inc(&skb->users);
1872 	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1873 }
1874 
1875 #if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
1876 /* These hooks are defined here for ATM */
1877 struct net_bridge;
1878 struct net_bridge_fdb_entry *(*br_fdb_get_hook)(struct net_bridge *br,
1879 						unsigned char *addr);
1880 void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent) __read_mostly;
1881 
1882 /*
1883  * If the bridge module is loaded, call the bridging hook.
1884  * Returns NULL if the packet was consumed.
1885  */
1886 struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p,
1887 					struct sk_buff *skb) __read_mostly;
1888 static inline struct sk_buff *handle_bridge(struct sk_buff *skb,
1889 					    struct packet_type **pt_prev, int *ret,
1890 					    struct net_device *orig_dev)
1891 {
1892 	struct net_bridge_port *port;
1893 
1894 	if (skb->pkt_type == PACKET_LOOPBACK ||
1895 	    (port = rcu_dereference(skb->dev->br_port)) == NULL)
1896 		return skb;
1897 
1898 	if (*pt_prev) {
1899 		*ret = deliver_skb(skb, *pt_prev, orig_dev);
1900 		*pt_prev = NULL;
1901 	}
1902 
1903 	return br_handle_frame_hook(port, skb);
1904 }
1905 #else
1906 #define handle_bridge(skb, pt_prev, ret, orig_dev)	(skb)
1907 #endif
1908 
1909 #if defined(CONFIG_MACVLAN) || defined(CONFIG_MACVLAN_MODULE)
1910 struct sk_buff *(*macvlan_handle_frame_hook)(struct sk_buff *skb) __read_mostly;
1911 EXPORT_SYMBOL_GPL(macvlan_handle_frame_hook);
1912 
1913 static inline struct sk_buff *handle_macvlan(struct sk_buff *skb,
1914 					     struct packet_type **pt_prev,
1915 					     int *ret,
1916 					     struct net_device *orig_dev)
1917 {
1918 	if (skb->dev->macvlan_port == NULL)
1919 		return skb;
1920 
1921 	if (*pt_prev) {
1922 		*ret = deliver_skb(skb, *pt_prev, orig_dev);
1923 		*pt_prev = NULL;
1924 	}
1925 	return macvlan_handle_frame_hook(skb);
1926 }
1927 #else
1928 #define handle_macvlan(skb, pt_prev, ret, orig_dev)	(skb)
1929 #endif
1930 
1931 #ifdef CONFIG_NET_CLS_ACT
1932 /* TODO: Maybe we should just force sch_ingress to be compiled in
1933  * when CONFIG_NET_CLS_ACT is? Otherwise we pay for a useless compare
1934  * and two extra stores on every packet when the ingress scheduler is
1935  * not configured but CONFIG_NET_CLS_ACT is.
1936  * NOTE: This doesn't stop any functionality; if you don't have
1937  * the ingress scheduler, you just can't add policies on ingress.
1938  *
1939  */
1940 static int ing_filter(struct sk_buff *skb)
1941 {
1942 	struct Qdisc *q;
1943 	struct net_device *dev = skb->dev;
1944 	int result = TC_ACT_OK;
1945 
1946 	if (dev->qdisc_ingress) {
1947 		__u32 ttl = (__u32) G_TC_RTTL(skb->tc_verd);
1948 		if (MAX_RED_LOOP < ttl++) {
1949 			printk(KERN_WARNING "Redir loop detected Dropping packet (%d->%d)\n",
1950 				skb->iif, skb->dev->ifindex);
1951 			return TC_ACT_SHOT;
1952 		}
1953 
1954 		skb->tc_verd = SET_TC_RTTL(skb->tc_verd,ttl);
1955 
1956 		skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_INGRESS);
1957 
1958 		spin_lock(&dev->ingress_lock);
1959 		if ((q = dev->qdisc_ingress) != NULL)
1960 			result = q->enqueue(skb, q);
1961 		spin_unlock(&dev->ingress_lock);
1962 
1963 	}
1964 
1965 	return result;
1966 }
1967 #endif
1968 
1969 int netif_receive_skb(struct sk_buff *skb)
1970 {
1971 	struct packet_type *ptype, *pt_prev;
1972 	struct net_device *orig_dev;
1973 	int ret = NET_RX_DROP;
1974 	__be16 type;
1975 
1976 	/* if we've gotten here through NAPI, check netpoll */
1977 	if (netpoll_receive_skb(skb))
1978 		return NET_RX_DROP;
1979 
1980 	if (!skb->tstamp.tv64)
1981 		net_timestamp(skb);
1982 
1983 	if (!skb->iif)
1984 		skb->iif = skb->dev->ifindex;
1985 
1986 	orig_dev = skb_bond(skb);
1987 
1988 	if (!orig_dev)
1989 		return NET_RX_DROP;
1990 
1991 	__get_cpu_var(netdev_rx_stat).total++;
1992 
1993 	skb_reset_network_header(skb);
1994 	skb_reset_transport_header(skb);
1995 	skb->mac_len = skb->network_header - skb->mac_header;
1996 
1997 	pt_prev = NULL;
1998 
1999 	rcu_read_lock();
2000 
2001 #ifdef CONFIG_NET_CLS_ACT
2002 	if (skb->tc_verd & TC_NCLS) {
2003 		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
2004 		goto ncls;
2005 	}
2006 #endif
2007 
2008 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
2009 		if (!ptype->dev || ptype->dev == skb->dev) {
2010 			if (pt_prev)
2011 				ret = deliver_skb(skb, pt_prev, orig_dev);
2012 			pt_prev = ptype;
2013 		}
2014 	}
2015 
2016 #ifdef CONFIG_NET_CLS_ACT
2017 	if (pt_prev) {
2018 		ret = deliver_skb(skb, pt_prev, orig_dev);
2019 		pt_prev = NULL; /* no one else should process this after */
2020 	} else {
2021 		skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
2022 	}
2023 
2024 	ret = ing_filter(skb);
2025 
2026 	if (ret == TC_ACT_SHOT || (ret == TC_ACT_STOLEN)) {
2027 		kfree_skb(skb);
2028 		goto out;
2029 	}
2030 
2031 	skb->tc_verd = 0;
2032 ncls:
2033 #endif
2034 
2035 	skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
2036 	if (!skb)
2037 		goto out;
2038 	skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);
2039 	if (!skb)
2040 		goto out;
2041 
2042 	type = skb->protocol;
2043 	list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type)&15], list) {
2044 		if (ptype->type == type &&
2045 		    (!ptype->dev || ptype->dev == skb->dev)) {
2046 			if (pt_prev)
2047 				ret = deliver_skb(skb, pt_prev, orig_dev);
2048 			pt_prev = ptype;
2049 		}
2050 	}
2051 
2052 	if (pt_prev) {
2053 		ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2054 	} else {
2055 		kfree_skb(skb);
2056 		/* Jamal, now you will not be able to escape explaining
2057 		 * to me how you were going to use this. :-)
2058 		 */
2059 		ret = NET_RX_DROP;
2060 	}
2061 
2062 out:
2063 	rcu_read_unlock();
2064 	return ret;
2065 }
2066 
2067 static int process_backlog(struct napi_struct *napi, int quota)
2068 {
2069 	int work = 0;
2070 	struct softnet_data *queue = &__get_cpu_var(softnet_data);
2071 	unsigned long start_time = jiffies;
2072 
2073 	napi->weight = weight_p;
2074 	do {
2075 		struct sk_buff *skb;
2076 		struct net_device *dev;
2077 
2078 		local_irq_disable();
2079 		skb = __skb_dequeue(&queue->input_pkt_queue);
2080 		if (!skb) {
2081 			__napi_complete(napi);
2082 			local_irq_enable();
2083 			break;
2084 		}
2085 
2086 		local_irq_enable();
2087 
2088 		dev = skb->dev;
2089 
2090 		netif_receive_skb(skb);
2091 
2092 		dev_put(dev);
2093 	} while (++work < quota && jiffies == start_time);
2094 
2095 	return work;
2096 }
2097 
2098 /**
2099  * __napi_schedule - schedule for receive
2100  * @n: entry to schedule
2101  *
2102  * The entry's receive function will be scheduled to run.
2103  */
2104 void fastcall __napi_schedule(struct napi_struct *n)
2105 {
2106 	unsigned long flags;
2107 
2108 	local_irq_save(flags);
2109 	list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list);
2110 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2111 	local_irq_restore(flags);
2112 }
2113 EXPORT_SYMBOL(__napi_schedule);
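
/*
 * Illustrative sketch (not part of the original file): the usual NAPI
 * pattern built on the scheduling primitives above.  The example_*
 * names, the IRQ masking and the RX ring access are hypothetical;
 * napi_schedule_prep(), __napi_schedule(), napi_complete() (assumed to
 * be available alongside the __napi_complete() used in this file) and
 * netif_receive_skb() are the interfaces being demonstrated.
 */
struct example_dev {
	struct net_device	*netdev;
	struct napi_struct	napi;	/* set up with netif_napi_add() */
};

/* Called from the driver's RX interrupt handler. */
static void example_rx_irq(struct example_dev *edev)
{
	if (napi_schedule_prep(&edev->napi)) {
		/* mask further RX interrupts here (device specific) ... */
		__napi_schedule(&edev->napi);
	}
}

static int example_poll(struct napi_struct *napi, int budget)
{
	struct example_dev *edev = container_of(napi, struct example_dev, napi);
	int work_done = 0;

	while (work_done < budget) {
		/* A real driver would pull the next frame off its RX ring. */
		struct sk_buff *skb = NULL;

		if (!skb)
			break;
		skb->protocol = eth_type_trans(skb, edev->netdev);
		netif_receive_skb(skb);
		work_done++;
	}

	/* Ring drained: leave polled mode and unmask the RX interrupt. */
	if (work_done < budget)
		napi_complete(napi);

	return work_done;
}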
2114 
2115 
2116 static void net_rx_action(struct softirq_action *h)
2117 {
2118 	struct list_head *list = &__get_cpu_var(softnet_data).poll_list;
2119 	unsigned long start_time = jiffies;
2120 	int budget = netdev_budget;
2121 	void *have;
2122 
2123 	local_irq_disable();
2124 
2125 	while (!list_empty(list)) {
2126 		struct napi_struct *n;
2127 		int work, weight;
2128 
2129 		/* If the softirq window is exhausted then punt.
2130 		 *
2131 		 * Note that this is a slight policy change from the
2132 		 * previous NAPI code, which would allow up to 2
2133 		 * jiffies to pass before breaking out.  The test
2134 		 * used to be "jiffies - start_time > 1".
2135 		 */
2136 		if (unlikely(budget <= 0 || jiffies != start_time))
2137 			goto softnet_break;
2138 
2139 		local_irq_enable();
2140 
2141 		/* Even though interrupts have been re-enabled, this
2142 		 * access is safe because interrupts can only add new
2143 		 * entries to the tail of this list, and only ->poll()
2144 		 * calls can remove this head entry from the list.
2145 		 */
2146 		n = list_entry(list->next, struct napi_struct, poll_list);
2147 
2148 		have = netpoll_poll_lock(n);
2149 
2150 		weight = n->weight;
2151 
2152 		work = n->poll(n, weight);
2153 
2154 		WARN_ON_ONCE(work > weight);
2155 
2156 		budget -= work;
2157 
2158 		local_irq_disable();
2159 
2160 		/* Drivers must not modify the NAPI state if they
2161 		 * consume the entire weight.  In such cases this code
2162 		 * still "owns" the NAPI instance and therefore can
2163 		 * move the instance around on the list at will.
2164 		 */
2165 		if (unlikely(work == weight))
2166 			list_move_tail(&n->poll_list, list);
2167 
2168 		netpoll_poll_unlock(have);
2169 	}
2170 out:
2171 	local_irq_enable();
2172 
2173 #ifdef CONFIG_NET_DMA
2174 	/*
2175 	 * There may not be any more sk_buffs coming right now, so push
2176 	 * any pending DMA copies to hardware
2177 	 */
2178 	if (!cpus_empty(net_dma.channel_mask)) {
2179 		int chan_idx;
2180 		for_each_cpu_mask(chan_idx, net_dma.channel_mask) {
2181 			struct dma_chan *chan = net_dma.channels[chan_idx];
2182 			if (chan)
2183 				dma_async_memcpy_issue_pending(chan);
2184 		}
2185 	}
2186 #endif
2187 
2188 	return;
2189 
2190 softnet_break:
2191 	__get_cpu_var(netdev_rx_stat).time_squeeze++;
2192 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2193 	goto out;
2194 }
2195 
2196 static gifconf_func_t * gifconf_list [NPROTO];
2197 
2198 /**
2199  *	register_gifconf	-	register a SIOCGIFCONF handler
2200  *	@family: Address family
2201  *	@gifconf: Function handler
2202  *
2203  *	Register protocol dependent address dumping routines. The handler
2204  *	that is passed must not be freed or reused until it has been replaced
2205  *	by another handler.
2206  */
2207 int register_gifconf(unsigned int family, gifconf_func_t * gifconf)
2208 {
2209 	if (family >= NPROTO)
2210 		return -EINVAL;
2211 	gifconf_list[family] = gifconf;
2212 	return 0;
2213 }
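
/*
 * Illustrative sketch (not part of the original file): how an address
 * family registers its SIOCGIFCONF helper.  IPv4 does this with
 * inet_gifconf() in net/ipv4/devinet.c; the handler and the PF_UNSPEC
 * placeholder below are hypothetical, showing only the contract used by
 * dev_ifconf(): return the number of bytes written (or, when bufptr is
 * NULL, the number of bytes that would be needed), or a negative errno.
 */
static int example_gifconf(struct net_device *dev, char __user *bufptr,
			   int len)
{
	struct ifreq ifr;

	if (!bufptr)
		return sizeof(ifr);	/* sizing pass, nothing copied */
	if (len < (int)sizeof(ifr))
		return 0;		/* no room left for this device */

	memset(&ifr, 0, sizeof(ifr));
	strcpy(ifr.ifr_name, dev->name);
	/* A real handler would also fill in a protocol address here. */

	if (copy_to_user(bufptr, &ifr, sizeof(ifr)))
		return -EFAULT;
	return sizeof(ifr);
}

static int __init example_gifconf_init(void)
{
	return register_gifconf(PF_UNSPEC, example_gifconf);
}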
2214 
2215 
2216 /*
2217  *	Map an interface index to its name (SIOCGIFNAME)
2218  */
2219 
2220 /*
2221  *	We need this ioctl for efficient implementation of the
2222  *	if_indextoname() function required by the IPv6 API.  Without
2223  *	it, we would have to search all the interfaces to find a
2224  *	match.  --pb
2225  */
2226 
2227 static int dev_ifname(struct net *net, struct ifreq __user *arg)
2228 {
2229 	struct net_device *dev;
2230 	struct ifreq ifr;
2231 
2232 	/*
2233 	 *	Fetch the caller's info block.
2234 	 */
2235 
2236 	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
2237 		return -EFAULT;
2238 
2239 	read_lock(&dev_base_lock);
2240 	dev = __dev_get_by_index(net, ifr.ifr_ifindex);
2241 	if (!dev) {
2242 		read_unlock(&dev_base_lock);
2243 		return -ENODEV;
2244 	}
2245 
2246 	strcpy(ifr.ifr_name, dev->name);
2247 	read_unlock(&dev_base_lock);
2248 
2249 	if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
2250 		return -EFAULT;
2251 	return 0;
2252 }
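
/*
 * Illustrative sketch (not part of the original file): the user-space
 * side of SIOCGIFNAME, roughly what if_indextoname() boils down to.  It
 * is shown as a comment because it is not kernel code; error handling
 * is omitted:
 *
 *	struct ifreq ifr;
 *	int fd = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	memset(&ifr, 0, sizeof(ifr));
 *	ifr.ifr_ifindex = idx;
 *	if (ioctl(fd, SIOCGIFNAME, &ifr) == 0)
 *		printf("index %d is %s\n", idx, ifr.ifr_name);
 *	close(fd);
 */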
2253 
2254 /*
2255  *	Perform a SIOCGIFCONF call. This structure will change
2256  *	size eventually, and there is nothing I can do about it.
2257  *	Thus we will need a 'compatibility mode'.
2258  */
2259 
2260 static int dev_ifconf(struct net *net, char __user *arg)
2261 {
2262 	struct ifconf ifc;
2263 	struct net_device *dev;
2264 	char __user *pos;
2265 	int len;
2266 	int total;
2267 	int i;
2268 
2269 	/*
2270 	 *	Fetch the caller's info block.
2271 	 */
2272 
2273 	if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
2274 		return -EFAULT;
2275 
2276 	pos = ifc.ifc_buf;
2277 	len = ifc.ifc_len;
2278 
2279 	/*
2280 	 *	Loop over the interfaces, and write an info block for each.
2281 	 */
2282 
2283 	total = 0;
2284 	for_each_netdev(net, dev) {
2285 		for (i = 0; i < NPROTO; i++) {
2286 			if (gifconf_list[i]) {
2287 				int done;
2288 				if (!pos)
2289 					done = gifconf_list[i](dev, NULL, 0);
2290 				else
2291 					done = gifconf_list[i](dev, pos + total,
2292 							       len - total);
2293 				if (done < 0)
2294 					return -EFAULT;
2295 				total += done;
2296 			}
2297 		}
2298 	}
2299 
2300 	/*
2301 	 *	All done.  Write the updated control block back to the caller.
2302 	 */
2303 	ifc.ifc_len = total;
2304 
2305 	/*
2306 	 * 	Both BSD and Solaris return 0 here, so we do too.
2307 	 */
2308 	return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
2309 }
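
/*
 * Illustrative sketch (not part of the original file): a user-space
 * SIOCGIFCONF caller, shown as a comment because it is not kernel code.
 * Passing a NULL buffer first makes dev_ifconf() above report only the
 * length that would be needed; error handling is omitted:
 *
 *	struct ifconf ifc;
 *	struct ifreq *reqs;
 *	int i, fd = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	ifc.ifc_len = 0;
 *	ifc.ifc_req = NULL;
 *	ioctl(fd, SIOCGIFCONF, &ifc);		// sizing pass
 *	reqs = malloc(ifc.ifc_len);
 *	ifc.ifc_req = reqs;
 *	ioctl(fd, SIOCGIFCONF, &ifc);		// real pass
 *	for (i = 0; i < ifc.ifc_len / sizeof(struct ifreq); i++)
 *		puts(reqs[i].ifr_name);
 */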
2310 
2311 #ifdef CONFIG_PROC_FS
2312 /*
2313  *	This is invoked by the /proc filesystem handler to display a device
2314  *	in detail.
2315  */
2316 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
2317 {
2318 	struct net *net = seq->private;
2319 	loff_t off;
2320 	struct net_device *dev;
2321 
2322 	read_lock(&dev_base_lock);
2323 	if (!*pos)
2324 		return SEQ_START_TOKEN;
2325 
2326 	off = 1;
2327 	for_each_netdev(net, dev)
2328 		if (off++ == *pos)
2329 			return dev;
2330 
2331 	return NULL;
2332 }
2333 
2334 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2335 {
2336 	struct net *net = seq->private;
2337 	++*pos;
2338 	return v == SEQ_START_TOKEN ?
2339 		first_net_device(net) : next_net_device((struct net_device *)v);
2340 }
2341 
2342 void dev_seq_stop(struct seq_file *seq, void *v)
2343 {
2344 	read_unlock(&dev_base_lock);
2345 }
2346 
2347 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
2348 {
2349 	struct net_device_stats *stats = dev->get_stats(dev);
2350 
2351 	seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu "
2352 		   "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
2353 		   dev->name, stats->rx_bytes, stats->rx_packets,
2354 		   stats->rx_errors,
2355 		   stats->rx_dropped + stats->rx_missed_errors,
2356 		   stats->rx_fifo_errors,
2357 		   stats->rx_length_errors + stats->rx_over_errors +
2358 		    stats->rx_crc_errors + stats->rx_frame_errors,
2359 		   stats->rx_compressed, stats->multicast,
2360 		   stats->tx_bytes, stats->tx_packets,
2361 		   stats->tx_errors, stats->tx_dropped,
2362 		   stats->tx_fifo_errors, stats->collisions,
2363 		   stats->tx_carrier_errors +
2364 		    stats->tx_aborted_errors +
2365 		    stats->tx_window_errors +
2366 		    stats->tx_heartbeat_errors,
2367 		   stats->tx_compressed);
2368 }
2369 
2370 /*
2371  *	Called from the PROCfs module. This now uses the new arbitrary sized
2372  *	/proc/net interface to create /proc/net/dev
2373  */
2374 static int dev_seq_show(struct seq_file *seq, void *v)
2375 {
2376 	if (v == SEQ_START_TOKEN)
2377 		seq_puts(seq, "Inter-|   Receive                            "
2378 			      "                    |  Transmit\n"
2379 			      " face |bytes    packets errs drop fifo frame "
2380 			      "compressed multicast|bytes    packets errs "
2381 			      "drop fifo colls carrier compressed\n");
2382 	else
2383 		dev_seq_printf_stats(seq, v);
2384 	return 0;
2385 }
2386 
2387 static struct netif_rx_stats *softnet_get_online(loff_t *pos)
2388 {
2389 	struct netif_rx_stats *rc = NULL;
2390 
2391 	while (*pos < NR_CPUS)
2392 		if (cpu_online(*pos)) {
2393 			rc = &per_cpu(netdev_rx_stat, *pos);
2394 			break;
2395 		} else
2396 			++*pos;
2397 	return rc;
2398 }
2399 
2400 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
2401 {
2402 	return softnet_get_online(pos);
2403 }
2404 
2405 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2406 {
2407 	++*pos;
2408 	return softnet_get_online(pos);
2409 }
2410 
2411 static void softnet_seq_stop(struct seq_file *seq, void *v)
2412 {
2413 }
2414 
2415 static int softnet_seq_show(struct seq_file *seq, void *v)
2416 {
2417 	struct netif_rx_stats *s = v;
2418 
2419 	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
2420 		   s->total, s->dropped, s->time_squeeze, 0,
2421 		   0, 0, 0, 0, /* was fastroute */
2422 		   s->cpu_collision );
2423 	return 0;
2424 }
2425 
2426 static const struct seq_operations dev_seq_ops = {
2427 	.start = dev_seq_start,
2428 	.next  = dev_seq_next,
2429 	.stop  = dev_seq_stop,
2430 	.show  = dev_seq_show,
2431 };
2432 
2433 static int dev_seq_open(struct inode *inode, struct file *file)
2434 {
2435 	struct seq_file *seq;
2436 	int res;
2437 	res =  seq_open(file, &dev_seq_ops);
2438 	if (!res) {
2439 		seq = file->private_data;
2440 		seq->private = get_proc_net(inode);
2441 		if (!seq->private) {
2442 			seq_release(inode, file);
2443 			res = -ENXIO;
2444 		}
2445 	}
2446 	return res;
2447 }
2448 
2449 static int dev_seq_release(struct inode *inode, struct file *file)
2450 {
2451 	struct seq_file *seq = file->private_data;
2452 	struct net *net = seq->private;
2453 	put_net(net);
2454 	return seq_release(inode, file);
2455 }
2456 
2457 static const struct file_operations dev_seq_fops = {
2458 	.owner	 = THIS_MODULE,
2459 	.open    = dev_seq_open,
2460 	.read    = seq_read,
2461 	.llseek  = seq_lseek,
2462 	.release = dev_seq_release,
2463 };
2464 
2465 static const struct seq_operations softnet_seq_ops = {
2466 	.start = softnet_seq_start,
2467 	.next  = softnet_seq_next,
2468 	.stop  = softnet_seq_stop,
2469 	.show  = softnet_seq_show,
2470 };
2471 
2472 static int softnet_seq_open(struct inode *inode, struct file *file)
2473 {
2474 	return seq_open(file, &softnet_seq_ops);
2475 }
2476 
2477 static const struct file_operations softnet_seq_fops = {
2478 	.owner	 = THIS_MODULE,
2479 	.open    = softnet_seq_open,
2480 	.read    = seq_read,
2481 	.llseek  = seq_lseek,
2482 	.release = seq_release,
2483 };
2484 
2485 static void *ptype_get_idx(loff_t pos)
2486 {
2487 	struct packet_type *pt = NULL;
2488 	loff_t i = 0;
2489 	int t;
2490 
2491 	list_for_each_entry_rcu(pt, &ptype_all, list) {
2492 		if (i == pos)
2493 			return pt;
2494 		++i;
2495 	}
2496 
2497 	for (t = 0; t < 16; t++) {
2498 		list_for_each_entry_rcu(pt, &ptype_base[t], list) {
2499 			if (i == pos)
2500 				return pt;
2501 			++i;
2502 		}
2503 	}
2504 	return NULL;
2505 }
2506 
2507 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
2508 {
2509 	rcu_read_lock();
2510 	return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
2511 }
2512 
2513 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2514 {
2515 	struct packet_type *pt;
2516 	struct list_head *nxt;
2517 	int hash;
2518 
2519 	++*pos;
2520 	if (v == SEQ_START_TOKEN)
2521 		return ptype_get_idx(0);
2522 
2523 	pt = v;
2524 	nxt = pt->list.next;
2525 	if (pt->type == htons(ETH_P_ALL)) {
2526 		if (nxt != &ptype_all)
2527 			goto found;
2528 		hash = 0;
2529 		nxt = ptype_base[0].next;
2530 	} else
2531 		hash = ntohs(pt->type) & 15;
2532 
2533 	while (nxt == &ptype_base[hash]) {
2534 		if (++hash >= 16)
2535 			return NULL;
2536 		nxt = ptype_base[hash].next;
2537 	}
2538 found:
2539 	return list_entry(nxt, struct packet_type, list);
2540 }
2541 
2542 static void ptype_seq_stop(struct seq_file *seq, void *v)
2543 {
2544 	rcu_read_unlock();
2545 }
2546 
2547 static void ptype_seq_decode(struct seq_file *seq, void *sym)
2548 {
2549 #ifdef CONFIG_KALLSYMS
2550 	unsigned long offset = 0, symsize;
2551 	const char *symname;
2552 	char *modname;
2553 	char namebuf[128];
2554 
2555 	symname = kallsyms_lookup((unsigned long)sym, &symsize, &offset,
2556 				  &modname, namebuf);
2557 
2558 	if (symname) {
2559 		char *delim = ":";
2560 
2561 		if (!modname)
2562 			modname = delim = "";
2563 		seq_printf(seq, "%s%s%s%s+0x%lx", delim, modname, delim,
2564 			   symname, offset);
2565 		return;
2566 	}
2567 #endif
2568 
2569 	seq_printf(seq, "[%p]", sym);
2570 }
2571 
2572 static int ptype_seq_show(struct seq_file *seq, void *v)
2573 {
2574 	struct packet_type *pt = v;
2575 
2576 	if (v == SEQ_START_TOKEN)
2577 		seq_puts(seq, "Type Device      Function\n");
2578 	else {
2579 		if (pt->type == htons(ETH_P_ALL))
2580 			seq_puts(seq, "ALL ");
2581 		else
2582 			seq_printf(seq, "%04x", ntohs(pt->type));
2583 
2584 		seq_printf(seq, " %-8s ",
2585 			   pt->dev ? pt->dev->name : "");
2586 		ptype_seq_decode(seq,  pt->func);
2587 		seq_putc(seq, '\n');
2588 	}
2589 
2590 	return 0;
2591 }
2592 
2593 static const struct seq_operations ptype_seq_ops = {
2594 	.start = ptype_seq_start,
2595 	.next  = ptype_seq_next,
2596 	.stop  = ptype_seq_stop,
2597 	.show  = ptype_seq_show,
2598 };
2599 
2600 static int ptype_seq_open(struct inode *inode, struct file *file)
2601 {
2602 	return seq_open(file, &ptype_seq_ops);
2603 }
2604 
2605 static const struct file_operations ptype_seq_fops = {
2606 	.owner	 = THIS_MODULE,
2607 	.open    = ptype_seq_open,
2608 	.read    = seq_read,
2609 	.llseek  = seq_lseek,
2610 	.release = seq_release,
2611 };
2612 
2613 
2614 static int __net_init dev_proc_net_init(struct net *net)
2615 {
2616 	int rc = -ENOMEM;
2617 
2618 	if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
2619 		goto out;
2620 	if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
2621 		goto out_dev;
2622 	if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
2623 		goto out_softnet;
2624 
2625 	if (wext_proc_init(net))
2626 		goto out_ptype;
2627 	rc = 0;
2628 out:
2629 	return rc;
2630 out_ptype:
2631 	proc_net_remove(net, "ptype");
2632 out_softnet:
2633 	proc_net_remove(net, "softnet_stat");
2634 out_dev:
2635 	proc_net_remove(net, "dev");
2636 	goto out;
2637 }
2638 
2639 static void __net_exit dev_proc_net_exit(struct net *net)
2640 {
2641 	wext_proc_exit(net);
2642 
2643 	proc_net_remove(net, "ptype");
2644 	proc_net_remove(net, "softnet_stat");
2645 	proc_net_remove(net, "dev");
2646 }
2647 
2648 static struct pernet_operations __net_initdata dev_proc_ops = {
2649 	.init = dev_proc_net_init,
2650 	.exit = dev_proc_net_exit,
2651 };
2652 
2653 static int __init dev_proc_init(void)
2654 {
2655 	return register_pernet_subsys(&dev_proc_ops);
2656 }
2657 #else
2658 #define dev_proc_init() 0
2659 #endif	/* CONFIG_PROC_FS */
2660 
2661 
2662 /**
2663  *	netdev_set_master	-	set up master/slave pair
2664  *	@slave: slave device
2665  *	@master: new master device
2666  *
2667  *	Changes the master device of the slave. Pass %NULL to break the
2668  *	bonding. The caller must hold the RTNL semaphore. On a failure
2669  *	a negative errno code is returned. On success the reference counts
2670  *	are adjusted, %RTM_NEWLINK is sent to the routing socket and the
2671  *	function returns zero.
2672  */
2673 int netdev_set_master(struct net_device *slave, struct net_device *master)
2674 {
2675 	struct net_device *old = slave->master;
2676 
2677 	ASSERT_RTNL();
2678 
2679 	if (master) {
2680 		if (old)
2681 			return -EBUSY;
2682 		dev_hold(master);
2683 	}
2684 
2685 	slave->master = master;
2686 
2687 	synchronize_net();
2688 
2689 	if (old)
2690 		dev_put(old);
2691 
2692 	if (master)
2693 		slave->flags |= IFF_SLAVE;
2694 	else
2695 		slave->flags &= ~IFF_SLAVE;
2696 
2697 	rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
2698 	return 0;
2699 }
2700 
2701 static void __dev_set_promiscuity(struct net_device *dev, int inc)
2702 {
2703 	unsigned short old_flags = dev->flags;
2704 
2705 	ASSERT_RTNL();
2706 
2707 	if ((dev->promiscuity += inc) == 0)
2708 		dev->flags &= ~IFF_PROMISC;
2709 	else
2710 		dev->flags |= IFF_PROMISC;
2711 	if (dev->flags != old_flags) {
2712 		printk(KERN_INFO "device %s %s promiscuous mode\n",
2713 		       dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
2714 							       "left");
2715 		audit_log(current->audit_context, GFP_ATOMIC,
2716 			AUDIT_ANOM_PROMISCUOUS,
2717 			"dev=%s prom=%d old_prom=%d auid=%u",
2718 			dev->name, (dev->flags & IFF_PROMISC),
2719 			(old_flags & IFF_PROMISC),
2720 			audit_get_loginuid(current->audit_context));
2721 
2722 		if (dev->change_rx_flags)
2723 			dev->change_rx_flags(dev, IFF_PROMISC);
2724 	}
2725 }
2726 
2727 /**
2728  *	dev_set_promiscuity	- update promiscuity count on a device
2729  *	@dev: device
2730  *	@inc: modifier
2731  *
2732  *	Add or remove promiscuity from a device. While the count in the device
2733  *	remains above zero the interface remains promiscuous. Once it hits zero
2734  *	the device reverts back to normal filtering operation. A negative inc
2735  *	value is used to drop promiscuity on the device.
2736  */
2737 void dev_set_promiscuity(struct net_device *dev, int inc)
2738 {
2739 	unsigned short old_flags = dev->flags;
2740 
2741 	__dev_set_promiscuity(dev, inc);
2742 	if (dev->flags != old_flags)
2743 		dev_set_rx_mode(dev);
2744 }
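
/*
 * Illustrative sketch (not part of the original file): a component that
 * needs to see every frame on a port (a hypothetical in-kernel sniffer,
 * say) bumps the promiscuity count while attached and drops it again on
 * detach, always under the RTNL as __dev_set_promiscuity() requires.
 */
static void example_sniffer_attach(struct net_device *dev)
{
	rtnl_lock();
	dev_set_promiscuity(dev, 1);	/* reference counted, may nest */
	rtnl_unlock();
}

static void example_sniffer_detach(struct net_device *dev)
{
	rtnl_lock();
	dev_set_promiscuity(dev, -1);	/* drop our reference */
	rtnl_unlock();
}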
2745 
2746 /**
2747  *	dev_set_allmulti	- update allmulti count on a device
2748  *	@dev: device
2749  *	@inc: modifier
2750  *
2751  *	Add or remove reception of all multicast frames to a device. While the
2752  *	count in the device remains above zero the interface remains listening
2753  *	to all multicast frames. Once it hits zero the device reverts to normal
2754  *	filtering operation. A negative @inc value is used to drop the counter
2755  *	when releasing a resource needing all multicasts.
2756  */
2757 
2758 void dev_set_allmulti(struct net_device *dev, int inc)
2759 {
2760 	unsigned short old_flags = dev->flags;
2761 
2762 	ASSERT_RTNL();
2763 
2764 	dev->flags |= IFF_ALLMULTI;
2765 	if ((dev->allmulti += inc) == 0)
2766 		dev->flags &= ~IFF_ALLMULTI;
2767 	if (dev->flags ^ old_flags) {
2768 		if (dev->change_rx_flags)
2769 			dev->change_rx_flags(dev, IFF_ALLMULTI);
2770 		dev_set_rx_mode(dev);
2771 	}
2772 }
2773 
2774 /*
2775  *	Upload unicast and multicast address lists to device and
2776  *	configure RX filtering. When the device doesn't support unicast
2777  *	filtering it is put in promiscuous mode while unicast addresses
2778  *	are present.
2779  */
2780 void __dev_set_rx_mode(struct net_device *dev)
2781 {
2782 	/* dev_open will call this function so the list will stay sane. */
2783 	if (!(dev->flags&IFF_UP))
2784 		return;
2785 
2786 	if (!netif_device_present(dev))
2787 		return;
2788 
2789 	if (dev->set_rx_mode)
2790 		dev->set_rx_mode(dev);
2791 	else {
2792 		/* Unicast address changes may only happen under the rtnl,
2793 		 * therefore calling __dev_set_promiscuity here is safe.
2794 		 */
2795 		if (dev->uc_count > 0 && !dev->uc_promisc) {
2796 			__dev_set_promiscuity(dev, 1);
2797 			dev->uc_promisc = 1;
2798 		} else if (dev->uc_count == 0 && dev->uc_promisc) {
2799 			__dev_set_promiscuity(dev, -1);
2800 			dev->uc_promisc = 0;
2801 		}
2802 
2803 		if (dev->set_multicast_list)
2804 			dev->set_multicast_list(dev);
2805 	}
2806 }
2807 
2808 void dev_set_rx_mode(struct net_device *dev)
2809 {
2810 	netif_tx_lock_bh(dev);
2811 	__dev_set_rx_mode(dev);
2812 	netif_tx_unlock_bh(dev);
2813 }
2814 
2815 int __dev_addr_delete(struct dev_addr_list **list, int *count,
2816 		      void *addr, int alen, int glbl)
2817 {
2818 	struct dev_addr_list *da;
2819 
2820 	for (; (da = *list) != NULL; list = &da->next) {
2821 		if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
2822 		    alen == da->da_addrlen) {
2823 			if (glbl) {
2824 				int old_glbl = da->da_gusers;
2825 				da->da_gusers = 0;
2826 				if (old_glbl == 0)
2827 					break;
2828 			}
2829 			if (--da->da_users)
2830 				return 0;
2831 
2832 			*list = da->next;
2833 			kfree(da);
2834 			(*count)--;
2835 			return 0;
2836 		}
2837 	}
2838 	return -ENOENT;
2839 }
2840 
2841 int __dev_addr_add(struct dev_addr_list **list, int *count,
2842 		   void *addr, int alen, int glbl)
2843 {
2844 	struct dev_addr_list *da;
2845 
2846 	for (da = *list; da != NULL; da = da->next) {
2847 		if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
2848 		    da->da_addrlen == alen) {
2849 			if (glbl) {
2850 				int old_glbl = da->da_gusers;
2851 				da->da_gusers = 1;
2852 				if (old_glbl)
2853 					return 0;
2854 			}
2855 			da->da_users++;
2856 			return 0;
2857 		}
2858 	}
2859 
2860 	da = kmalloc(sizeof(*da), GFP_ATOMIC);
2861 	if (da == NULL)
2862 		return -ENOMEM;
2863 	memcpy(da->da_addr, addr, alen);
2864 	da->da_addrlen = alen;
2865 	da->da_users = 1;
2866 	da->da_gusers = glbl ? 1 : 0;
2867 	da->next = *list;
2868 	*list = da;
2869 	(*count)++;
2870 	return 0;
2871 }
2872 
2873 /**
2874  *	dev_unicast_delete	- Release secondary unicast address.
2875  *	@dev: device
2876  *	@addr: address to delete
2877  *	@alen: length of @addr
2878  *
2879  *	Release reference to a secondary unicast address and remove it
2880  *	from the device if the reference count drops to zero.
2881  *
2882  * 	The caller must hold the rtnl_mutex.
2883  */
2884 int dev_unicast_delete(struct net_device *dev, void *addr, int alen)
2885 {
2886 	int err;
2887 
2888 	ASSERT_RTNL();
2889 
2890 	netif_tx_lock_bh(dev);
2891 	err = __dev_addr_delete(&dev->uc_list, &dev->uc_count, addr, alen, 0);
2892 	if (!err)
2893 		__dev_set_rx_mode(dev);
2894 	netif_tx_unlock_bh(dev);
2895 	return err;
2896 }
2897 EXPORT_SYMBOL(dev_unicast_delete);
2898 
2899 /**
2900  *	dev_unicast_add		- add a secondary unicast address
2901  *	@dev: device
2902  *	@addr: address to add
2903  *	@alen: length of @addr
2904  *
2905  *	Add a secondary unicast address to the device or increase
2906  *	the reference count if it already exists.
2907  *
2908  *	The caller must hold the rtnl_mutex.
2909  */
2910 int dev_unicast_add(struct net_device *dev, void *addr, int alen)
2911 {
2912 	int err;
2913 
2914 	ASSERT_RTNL();
2915 
2916 	netif_tx_lock_bh(dev);
2917 	err = __dev_addr_add(&dev->uc_list, &dev->uc_count, addr, alen, 0);
2918 	if (!err)
2919 		__dev_set_rx_mode(dev);
2920 	netif_tx_unlock_bh(dev);
2921 	return err;
2922 }
2923 EXPORT_SYMBOL(dev_unicast_add);
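
/*
 * Illustrative sketch (not part of the original file): how a virtual
 * device stacked on a real one (macvlan does something similar) asks
 * the lower device to accept one extra unicast MAC address, and later
 * releases it again.  The example_* names are hypothetical.
 */
static int example_listen_on_extra_mac(struct net_device *lower, u8 *mac)
{
	int err;

	rtnl_lock();
	err = dev_unicast_add(lower, mac, ETH_ALEN);
	rtnl_unlock();
	return err;
}

static void example_stop_listening(struct net_device *lower, u8 *mac)
{
	rtnl_lock();
	dev_unicast_delete(lower, mac, ETH_ALEN);
	rtnl_unlock();
}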
2924 
2925 static void __dev_addr_discard(struct dev_addr_list **list)
2926 {
2927 	struct dev_addr_list *tmp;
2928 
2929 	while (*list != NULL) {
2930 		tmp = *list;
2931 		*list = tmp->next;
2932 		if (tmp->da_users > tmp->da_gusers)
2933 			printk("__dev_addr_discard: address leakage! "
2934 			       "da_users=%d\n", tmp->da_users);
2935 		kfree(tmp);
2936 	}
2937 }
2938 
2939 static void dev_addr_discard(struct net_device *dev)
2940 {
2941 	netif_tx_lock_bh(dev);
2942 
2943 	__dev_addr_discard(&dev->uc_list);
2944 	dev->uc_count = 0;
2945 
2946 	__dev_addr_discard(&dev->mc_list);
2947 	dev->mc_count = 0;
2948 
2949 	netif_tx_unlock_bh(dev);
2950 }
2951 
2952 unsigned dev_get_flags(const struct net_device *dev)
2953 {
2954 	unsigned flags;
2955 
2956 	flags = (dev->flags & ~(IFF_PROMISC |
2957 				IFF_ALLMULTI |
2958 				IFF_RUNNING |
2959 				IFF_LOWER_UP |
2960 				IFF_DORMANT)) |
2961 		(dev->gflags & (IFF_PROMISC |
2962 				IFF_ALLMULTI));
2963 
2964 	if (netif_running(dev)) {
2965 		if (netif_oper_up(dev))
2966 			flags |= IFF_RUNNING;
2967 		if (netif_carrier_ok(dev))
2968 			flags |= IFF_LOWER_UP;
2969 		if (netif_dormant(dev))
2970 			flags |= IFF_DORMANT;
2971 	}
2972 
2973 	return flags;
2974 }
2975 
2976 int dev_change_flags(struct net_device *dev, unsigned flags)
2977 {
2978 	int ret, changes;
2979 	int old_flags = dev->flags;
2980 
2981 	ASSERT_RTNL();
2982 
2983 	/*
2984 	 *	Set the flags on our device.
2985 	 */
2986 
2987 	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
2988 			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
2989 			       IFF_AUTOMEDIA)) |
2990 		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
2991 				    IFF_ALLMULTI));
2992 
2993 	/*
2994 	 *	Load in the correct multicast list now the flags have changed.
2995 	 */
2996 
2997 	if (dev->change_rx_flags && (dev->flags ^ flags) & IFF_MULTICAST)
2998 		dev->change_rx_flags(dev, IFF_MULTICAST);
2999 
3000 	dev_set_rx_mode(dev);
3001 
3002 	/*
3003 	 *	Have we downed the interface? We handle IFF_UP ourselves
3004 	 *	according to user attempts to set it, rather than blindly
3005 	 *	setting it.
3006 	 */
3007 
3008 	ret = 0;
3009 	if ((old_flags ^ flags) & IFF_UP) {	/* Bit is different  ? */
3010 		ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev);
3011 
3012 		if (!ret)
3013 			dev_set_rx_mode(dev);
3014 	}
3015 
3016 	if (dev->flags & IFF_UP &&
3017 	    ((old_flags ^ dev->flags) &~ (IFF_UP | IFF_PROMISC | IFF_ALLMULTI |
3018 					  IFF_VOLATILE)))
3019 		call_netdevice_notifiers(NETDEV_CHANGE, dev);
3020 
3021 	if ((flags ^ dev->gflags) & IFF_PROMISC) {
3022 		int inc = (flags & IFF_PROMISC) ? +1 : -1;
3023 		dev->gflags ^= IFF_PROMISC;
3024 		dev_set_promiscuity(dev, inc);
3025 	}
3026 
3027 	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
3028 	   is important. Some (broken) drivers set IFF_PROMISC when
3029 	   IFF_ALLMULTI is requested, without asking us and without reporting it.
3030 	 */
3031 	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
3032 		int inc = (flags & IFF_ALLMULTI) ? +1 : -1;
3033 		dev->gflags ^= IFF_ALLMULTI;
3034 		dev_set_allmulti(dev, inc);
3035 	}
3036 
3037 	/* Exclude state transition flags, already notified */
3038 	changes = (old_flags ^ dev->flags) & ~(IFF_UP | IFF_RUNNING);
3039 	if (changes)
3040 		rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
3041 
3042 	return ret;
3043 }
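
/*
 * Illustrative sketch (not part of the original file): bringing an
 * interface administratively up from inside the kernel, the same path
 * "ifconfig eth0 up" reaches through SIOCSIFFLAGS.  The example_* name
 * is hypothetical and error handling is minimal.
 */
static int example_bring_up(struct net *net, const char *name)
{
	struct net_device *dev;
	int err = -ENODEV;

	rtnl_lock();
	dev = __dev_get_by_name(net, name);
	if (dev)
		err = dev_change_flags(dev, dev_get_flags(dev) | IFF_UP);
	rtnl_unlock();
	return err;
}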
3044 
3045 int dev_set_mtu(struct net_device *dev, int new_mtu)
3046 {
3047 	int err;
3048 
3049 	if (new_mtu == dev->mtu)
3050 		return 0;
3051 
3052 	/*	MTU must be positive.	 */
3053 	if (new_mtu < 0)
3054 		return -EINVAL;
3055 
3056 	if (!netif_device_present(dev))
3057 		return -ENODEV;
3058 
3059 	err = 0;
3060 	if (dev->change_mtu)
3061 		err = dev->change_mtu(dev, new_mtu);
3062 	else
3063 		dev->mtu = new_mtu;
3064 	if (!err && dev->flags & IFF_UP)
3065 		call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
3066 	return err;
3067 }
3068 
3069 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
3070 {
3071 	int err;
3072 
3073 	if (!dev->set_mac_address)
3074 		return -EOPNOTSUPP;
3075 	if (sa->sa_family != dev->type)
3076 		return -EINVAL;
3077 	if (!netif_device_present(dev))
3078 		return -ENODEV;
3079 	err = dev->set_mac_address(dev, sa);
3080 	if (!err)
3081 		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
3082 	return err;
3083 }
3084 
3085 /*
3086  *	Perform the SIOCxIFxxx calls, inside read_lock(dev_base_lock)
3087  */
3088 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
3089 {
3090 	int err;
3091 	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
3092 
3093 	if (!dev)
3094 		return -ENODEV;
3095 
3096 	switch (cmd) {
3097 		case SIOCGIFFLAGS:	/* Get interface flags */
3098 			ifr->ifr_flags = dev_get_flags(dev);
3099 			return 0;
3100 
3101 		case SIOCGIFMETRIC:	/* Get the metric on the interface
3102 					   (currently unused) */
3103 			ifr->ifr_metric = 0;
3104 			return 0;
3105 
3106 		case SIOCGIFMTU:	/* Get the MTU of a device */
3107 			ifr->ifr_mtu = dev->mtu;
3108 			return 0;
3109 
3110 		case SIOCGIFHWADDR:
3111 			if (!dev->addr_len)
3112 				memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
3113 			else
3114 				memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
3115 				       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
3116 			ifr->ifr_hwaddr.sa_family = dev->type;
3117 			return 0;
3118 
3119 		case SIOCGIFSLAVE:
3120 			err = -EINVAL;
3121 			break;
3122 
3123 		case SIOCGIFMAP:
3124 			ifr->ifr_map.mem_start = dev->mem_start;
3125 			ifr->ifr_map.mem_end   = dev->mem_end;
3126 			ifr->ifr_map.base_addr = dev->base_addr;
3127 			ifr->ifr_map.irq       = dev->irq;
3128 			ifr->ifr_map.dma       = dev->dma;
3129 			ifr->ifr_map.port      = dev->if_port;
3130 			return 0;
3131 
3132 		case SIOCGIFINDEX:
3133 			ifr->ifr_ifindex = dev->ifindex;
3134 			return 0;
3135 
3136 		case SIOCGIFTXQLEN:
3137 			ifr->ifr_qlen = dev->tx_queue_len;
3138 			return 0;
3139 
3140 		default:
3141 			/* dev_ioctl() should ensure this case
3142 			 * is never reached
3143 			 */
3144 			WARN_ON(1);
3145 			err = -EINVAL;
3146 			break;
3147 
3148 	}
3149 	return err;
3150 }
3151 
3152 /*
3153  *	Perform the SIOCxIFxxx calls, inside rtnl_lock()
3154  */
3155 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
3156 {
3157 	int err;
3158 	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
3159 
3160 	if (!dev)
3161 		return -ENODEV;
3162 
3163 	switch (cmd) {
3164 		case SIOCSIFFLAGS:	/* Set interface flags */
3165 			return dev_change_flags(dev, ifr->ifr_flags);
3166 
3167 		case SIOCSIFMETRIC:	/* Set the metric on the interface
3168 					   (currently unused) */
3169 			return -EOPNOTSUPP;
3170 
3171 		case SIOCSIFMTU:	/* Set the MTU of a device */
3172 			return dev_set_mtu(dev, ifr->ifr_mtu);
3173 
3174 		case SIOCSIFHWADDR:
3175 			return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
3176 
3177 		case SIOCSIFHWBROADCAST:
3178 			if (ifr->ifr_hwaddr.sa_family != dev->type)
3179 				return -EINVAL;
3180 			memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
3181 			       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
3182 			call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
3183 			return 0;
3184 
3185 		case SIOCSIFMAP:
3186 			if (dev->set_config) {
3187 				if (!netif_device_present(dev))
3188 					return -ENODEV;
3189 				return dev->set_config(dev, &ifr->ifr_map);
3190 			}
3191 			return -EOPNOTSUPP;
3192 
3193 		case SIOCADDMULTI:
3194 			if (!dev->set_multicast_list ||
3195 			    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
3196 				return -EINVAL;
3197 			if (!netif_device_present(dev))
3198 				return -ENODEV;
3199 			return dev_mc_add(dev, ifr->ifr_hwaddr.sa_data,
3200 					  dev->addr_len, 1);
3201 
3202 		case SIOCDELMULTI:
3203 			if (!dev->set_multicast_list ||
3204 			    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
3205 				return -EINVAL;
3206 			if (!netif_device_present(dev))
3207 				return -ENODEV;
3208 			return dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data,
3209 					     dev->addr_len, 1);
3210 
3211 		case SIOCSIFTXQLEN:
3212 			if (ifr->ifr_qlen < 0)
3213 				return -EINVAL;
3214 			dev->tx_queue_len = ifr->ifr_qlen;
3215 			return 0;
3216 
3217 		case SIOCSIFNAME:
3218 			ifr->ifr_newname[IFNAMSIZ-1] = '\0';
3219 			return dev_change_name(dev, ifr->ifr_newname);
3220 
3221 		/*
3222 		 *	Unknown or private ioctl
3223 		 */
3224 
3225 		default:
3226 			if ((cmd >= SIOCDEVPRIVATE &&
3227 			    cmd <= SIOCDEVPRIVATE + 15) ||
3228 			    cmd == SIOCBONDENSLAVE ||
3229 			    cmd == SIOCBONDRELEASE ||
3230 			    cmd == SIOCBONDSETHWADDR ||
3231 			    cmd == SIOCBONDSLAVEINFOQUERY ||
3232 			    cmd == SIOCBONDINFOQUERY ||
3233 			    cmd == SIOCBONDCHANGEACTIVE ||
3234 			    cmd == SIOCGMIIPHY ||
3235 			    cmd == SIOCGMIIREG ||
3236 			    cmd == SIOCSMIIREG ||
3237 			    cmd == SIOCBRADDIF ||
3238 			    cmd == SIOCBRDELIF ||
3239 			    cmd == SIOCWANDEV) {
3240 				err = -EOPNOTSUPP;
3241 				if (dev->do_ioctl) {
3242 					if (netif_device_present(dev))
3243 						err = dev->do_ioctl(dev, ifr,
3244 								    cmd);
3245 					else
3246 						err = -ENODEV;
3247 				}
3248 			} else
3249 				err = -EINVAL;
3250 
3251 	}
3252 	return err;
3253 }
3254 
3255 /*
3256  *	This function handles all "interface"-type I/O control requests. The actual
3257  *	'doing' part of this is dev_ifsioc above.
3258  */
3259 
3260 /**
3261  *	dev_ioctl	-	network device ioctl
3262  *	@cmd: command to issue
3263  *	@arg: pointer to a struct ifreq in user space
3264  *
3265  *	Issue ioctl functions to devices. This is normally called by the
3266  *	user space syscall interfaces but can sometimes be useful for
3267  *	other purposes. The return value is the return from the syscall if
3268  *	positive or a negative errno code on error.
3269  */
3270 
3271 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3272 {
3273 	struct ifreq ifr;
3274 	int ret;
3275 	char *colon;
3276 
3277 	/* One special case: SIOCGIFCONF takes ifconf argument
3278 	   and requires shared lock, because it sleeps writing
3279 	   to user space.
3280 	 */
3281 
3282 	if (cmd == SIOCGIFCONF) {
3283 		rtnl_lock();
3284 		ret = dev_ifconf(net, (char __user *) arg);
3285 		rtnl_unlock();
3286 		return ret;
3287 	}
3288 	if (cmd == SIOCGIFNAME)
3289 		return dev_ifname(net, (struct ifreq __user *)arg);
3290 
3291 	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3292 		return -EFAULT;
3293 
3294 	ifr.ifr_name[IFNAMSIZ-1] = 0;
3295 
3296 	colon = strchr(ifr.ifr_name, ':');
3297 	if (colon)
3298 		*colon = 0;
3299 
3300 	/*
3301 	 *	See which interface the caller is talking about.
3302 	 */
3303 
3304 	switch (cmd) {
3305 		/*
3306 		 *	These ioctl calls:
3307 		 *	- can be done by all.
3308 		 *	- atomic and do not require locking.
3309 		 *	- return a value
3310 		 */
3311 		case SIOCGIFFLAGS:
3312 		case SIOCGIFMETRIC:
3313 		case SIOCGIFMTU:
3314 		case SIOCGIFHWADDR:
3315 		case SIOCGIFSLAVE:
3316 		case SIOCGIFMAP:
3317 		case SIOCGIFINDEX:
3318 		case SIOCGIFTXQLEN:
3319 			dev_load(net, ifr.ifr_name);
3320 			read_lock(&dev_base_lock);
3321 			ret = dev_ifsioc_locked(net, &ifr, cmd);
3322 			read_unlock(&dev_base_lock);
3323 			if (!ret) {
3324 				if (colon)
3325 					*colon = ':';
3326 				if (copy_to_user(arg, &ifr,
3327 						 sizeof(struct ifreq)))
3328 					ret = -EFAULT;
3329 			}
3330 			return ret;
3331 
3332 		case SIOCETHTOOL:
3333 			dev_load(net, ifr.ifr_name);
3334 			rtnl_lock();
3335 			ret = dev_ethtool(net, &ifr);
3336 			rtnl_unlock();
3337 			if (!ret) {
3338 				if (colon)
3339 					*colon = ':';
3340 				if (copy_to_user(arg, &ifr,
3341 						 sizeof(struct ifreq)))
3342 					ret = -EFAULT;
3343 			}
3344 			return ret;
3345 
3346 		/*
3347 		 *	These ioctl calls:
3348 		 *	- require superuser power.
3349 		 *	- require strict serialization.
3350 		 *	- return a value
3351 		 */
3352 		case SIOCGMIIPHY:
3353 		case SIOCGMIIREG:
3354 		case SIOCSIFNAME:
3355 			if (!capable(CAP_NET_ADMIN))
3356 				return -EPERM;
3357 			dev_load(net, ifr.ifr_name);
3358 			rtnl_lock();
3359 			ret = dev_ifsioc(net, &ifr, cmd);
3360 			rtnl_unlock();
3361 			if (!ret) {
3362 				if (colon)
3363 					*colon = ':';
3364 				if (copy_to_user(arg, &ifr,
3365 						 sizeof(struct ifreq)))
3366 					ret = -EFAULT;
3367 			}
3368 			return ret;
3369 
3370 		/*
3371 		 *	These ioctl calls:
3372 		 *	- require superuser power.
3373 		 *	- require strict serialization.
3374 		 *	- do not return a value
3375 		 */
3376 		case SIOCSIFFLAGS:
3377 		case SIOCSIFMETRIC:
3378 		case SIOCSIFMTU:
3379 		case SIOCSIFMAP:
3380 		case SIOCSIFHWADDR:
3381 		case SIOCSIFSLAVE:
3382 		case SIOCADDMULTI:
3383 		case SIOCDELMULTI:
3384 		case SIOCSIFHWBROADCAST:
3385 		case SIOCSIFTXQLEN:
3386 		case SIOCSMIIREG:
3387 		case SIOCBONDENSLAVE:
3388 		case SIOCBONDRELEASE:
3389 		case SIOCBONDSETHWADDR:
3390 		case SIOCBONDCHANGEACTIVE:
3391 		case SIOCBRADDIF:
3392 		case SIOCBRDELIF:
3393 			if (!capable(CAP_NET_ADMIN))
3394 				return -EPERM;
3395 			/* fall through */
3396 		case SIOCBONDSLAVEINFOQUERY:
3397 		case SIOCBONDINFOQUERY:
3398 			dev_load(net, ifr.ifr_name);
3399 			rtnl_lock();
3400 			ret = dev_ifsioc(net, &ifr, cmd);
3401 			rtnl_unlock();
3402 			return ret;
3403 
3404 		case SIOCGIFMEM:
3405 			/* Get the per device memory space. We can add this but
3406 			 * currently do not support it */
3407 		case SIOCSIFMEM:
3408 			/* Set the per device memory buffer space.
3409 			 * Not applicable in our case */
3410 		case SIOCSIFLINK:
3411 			return -EINVAL;
3412 
3413 		/*
3414 		 *	Unknown or private ioctl.
3415 		 */
3416 		default:
3417 			if (cmd == SIOCWANDEV ||
3418 			    (cmd >= SIOCDEVPRIVATE &&
3419 			     cmd <= SIOCDEVPRIVATE + 15)) {
3420 				dev_load(net, ifr.ifr_name);
3421 				rtnl_lock();
3422 				ret = dev_ifsioc(net, &ifr, cmd);
3423 				rtnl_unlock();
3424 				if (!ret && copy_to_user(arg, &ifr,
3425 							 sizeof(struct ifreq)))
3426 					ret = -EFAULT;
3427 				return ret;
3428 			}
3429 			/* Take care of Wireless Extensions */
3430 			if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
3431 				return wext_handle_ioctl(net, &ifr, cmd, arg);
3432 			return -EINVAL;
3433 	}
3434 }
3435 
3436 
3437 /**
3438  *	dev_new_index	-	allocate an ifindex
3439  *
3440  *	Returns a suitable unique value for a new device interface
3441  *	number.  The caller must hold the rtnl semaphore or the
3442  *	dev_base_lock to be sure it remains unique.
3443  */
3444 static int dev_new_index(struct net *net)
3445 {
3446 	static int ifindex;
3447 	for (;;) {
3448 		if (++ifindex <= 0)
3449 			ifindex = 1;
3450 		if (!__dev_get_by_index(net, ifindex))
3451 			return ifindex;
3452 	}
3453 }
3454 
3455 /* Delayed registration/unregistration */
3456 static DEFINE_SPINLOCK(net_todo_list_lock);
3457 static struct list_head net_todo_list = LIST_HEAD_INIT(net_todo_list);
3458 
3459 static void net_set_todo(struct net_device *dev)
3460 {
3461 	spin_lock(&net_todo_list_lock);
3462 	list_add_tail(&dev->todo_list, &net_todo_list);
3463 	spin_unlock(&net_todo_list_lock);
3464 }
3465 
3466 /**
3467  *	register_netdevice	- register a network device
3468  *	@dev: device to register
3469  *
3470  *	Take a completed network device structure and add it to the kernel
3471  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
3472  *	chain. 0 is returned on success. A negative errno code is returned
3473  *	on a failure to set up the device, or if the name is a duplicate.
3474  *
3475  *	Callers must hold the rtnl semaphore. You may want
3476  *	register_netdev() instead of this.
3477  *
3478  *	BUGS:
3479  *	The locking appears insufficient to guarantee two parallel registers
3480  *	will not get the same name.
3481  */
3482 
3483 int register_netdevice(struct net_device *dev)
3484 {
3485 	struct hlist_head *head;
3486 	struct hlist_node *p;
3487 	int ret;
3488 	struct net *net;
3489 
3490 	BUG_ON(dev_boot_phase);
3491 	ASSERT_RTNL();
3492 
3493 	might_sleep();
3494 
3495 	/* When net_device's are persistent, this will be fatal. */
3496 	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
3497 	BUG_ON(!dev->nd_net);
3498 	net = dev->nd_net;
3499 
3500 	spin_lock_init(&dev->queue_lock);
3501 	spin_lock_init(&dev->_xmit_lock);
3502 	netdev_set_lockdep_class(&dev->_xmit_lock, dev->type);
3503 	dev->xmit_lock_owner = -1;
3504 	spin_lock_init(&dev->ingress_lock);
3505 
3506 	dev->iflink = -1;
3507 
3508 	/* Init, if this function is available */
3509 	if (dev->init) {
3510 		ret = dev->init(dev);
3511 		if (ret) {
3512 			if (ret > 0)
3513 				ret = -EIO;
3514 			goto out;
3515 		}
3516 	}
3517 
3518 	if (!dev_valid_name(dev->name)) {
3519 		ret = -EINVAL;
3520 		goto err_uninit;
3521 	}
3522 
3523 	dev->ifindex = dev_new_index(net);
3524 	if (dev->iflink == -1)
3525 		dev->iflink = dev->ifindex;
3526 
3527 	/* Check for existence of name */
3528 	head = dev_name_hash(net, dev->name);
3529 	hlist_for_each(p, head) {
3530 		struct net_device *d
3531 			= hlist_entry(p, struct net_device, name_hlist);
3532 		if (!strncmp(d->name, dev->name, IFNAMSIZ)) {
3533 			ret = -EEXIST;
3534 			goto err_uninit;
3535 		}
3536 	}
3537 
3538 	/* Fix illegal checksum combinations */
3539 	if ((dev->features & NETIF_F_HW_CSUM) &&
3540 	    (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
3541 		printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
3542 		       dev->name);
3543 		dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
3544 	}
3545 
3546 	if ((dev->features & NETIF_F_NO_CSUM) &&
3547 	    (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
3548 		printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
3549 		       dev->name);
3550 		dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
3551 	}
3552 
3553 
3554 	/* Fix illegal SG+CSUM combinations. */
3555 	if ((dev->features & NETIF_F_SG) &&
3556 	    !(dev->features & NETIF_F_ALL_CSUM)) {
3557 		printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no checksum feature.\n",
3558 		       dev->name);
3559 		dev->features &= ~NETIF_F_SG;
3560 	}
3561 
3562 	/* TSO requires that SG is present as well. */
3563 	if ((dev->features & NETIF_F_TSO) &&
3564 	    !(dev->features & NETIF_F_SG)) {
3565 		printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no SG feature.\n",
3566 		       dev->name);
3567 		dev->features &= ~NETIF_F_TSO;
3568 	}
3569 	if (dev->features & NETIF_F_UFO) {
3570 		if (!(dev->features & NETIF_F_HW_CSUM)) {
3571 			printk(KERN_ERR "%s: Dropping NETIF_F_UFO since no "
3572 					"NETIF_F_HW_CSUM feature.\n",
3573 							dev->name);
3574 			dev->features &= ~NETIF_F_UFO;
3575 		}
3576 		if (!(dev->features & NETIF_F_SG)) {
3577 			printk(KERN_ERR "%s: Dropping NETIF_F_UFO since no "
3578 					"NETIF_F_SG feature.\n",
3579 					dev->name);
3580 			dev->features &= ~NETIF_F_UFO;
3581 		}
3582 	}
3583 
3584 	ret = netdev_register_kobject(dev);
3585 	if (ret)
3586 		goto err_uninit;
3587 	dev->reg_state = NETREG_REGISTERED;
3588 
3589 	/*
3590 	 *	Default initial state at registry is that the
3591 	 *	device is present.
3592 	 */
3593 
3594 	set_bit(__LINK_STATE_PRESENT, &dev->state);
3595 
3596 	dev_init_scheduler(dev);
3597 	dev_hold(dev);
3598 	list_netdevice(dev);
3599 
3600 	/* Notify protocols that a new device appeared. */
3601 	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
3602 	ret = notifier_to_errno(ret);
3603 	if (ret)
3604 		unregister_netdevice(dev);
3605 
3606 out:
3607 	return ret;
3608 
3609 err_uninit:
3610 	if (dev->uninit)
3611 		dev->uninit(dev);
3612 	goto out;
3613 }
3614 
3615 /**
3616  *	register_netdev	- register a network device
3617  *	@dev: device to register
3618  *
3619  *	Take a completed network device structure and add it to the kernel
3620  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
3621  *	chain. 0 is returned on success. A negative errno code is returned
3622  *	on a failure to set up the device, or if the name is a duplicate.
3623  *
3624  *	This is a wrapper around register_netdevice that takes the rtnl semaphore
3625  *	and expands the device name if you passed a format string to
3626  *	alloc_netdev.
3627  */
3628 int register_netdev(struct net_device *dev)
3629 {
3630 	int err;
3631 
3632 	rtnl_lock();
3633 
3634 	/*
3635 	 * If the name is a format string the caller wants us to do a
3636 	 * name allocation.
3637 	 */
3638 	if (strchr(dev->name, '%')) {
3639 		err = dev_alloc_name(dev, dev->name);
3640 		if (err < 0)
3641 			goto out;
3642 	}
3643 
3644 	err = register_netdevice(dev);
3645 out:
3646 	rtnl_unlock();
3647 	return err;
3648 }
3649 EXPORT_SYMBOL(register_netdev);
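
/*
 * Illustrative sketch (not part of the original file): the minimal
 * register/unregister life cycle of an Ethernet-like driver.  All
 * example_* names and the no-op transmit routine are hypothetical, and
 * alloc_etherdev()/random_ether_addr() assume <linux/etherdevice.h>;
 * the alloc/register/unregister/free sequence is the contract
 * documented above.
 */
struct example_priv {
	unsigned long	tx_count;	/* driver private state lives here */
};

static int example_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct example_priv *priv = netdev_priv(dev);

	priv->tx_count++;
	dev->stats.tx_packets++;
	dev->stats.tx_bytes += skb->len;
	dev_kfree_skb(skb);		/* pretend the hardware sent it */
	return NETDEV_TX_OK;
}

static struct net_device *example_netdev;

static int __init example_register(void)
{
	struct net_device *dev;
	int err;

	dev = alloc_etherdev(sizeof(struct example_priv));
	if (!dev)
		return -ENOMEM;

	dev->hard_start_xmit = example_hard_start_xmit;
	random_ether_addr(dev->dev_addr);

	err = register_netdev(dev);	/* takes the RTNL itself */
	if (err) {
		free_netdev(dev);
		return err;
	}
	example_netdev = dev;
	return 0;
}

static void __exit example_unregister(void)
{
	unregister_netdev(example_netdev);
	free_netdev(example_netdev);
}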
3650 
3651 /*
3652  * netdev_wait_allrefs - wait until all references are gone.
3653  *
3654  * This is called when unregistering network devices.
3655  *
3656  * Any protocol or device that holds a reference should register
3657  * for netdevice notification, and cleanup and put back the
3658  * reference if they receive an UNREGISTER event.
3659  * We can get stuck here if buggy protocols don't correctly
3660  * call dev_put.
3661  */
3662 static void netdev_wait_allrefs(struct net_device *dev)
3663 {
3664 	unsigned long rebroadcast_time, warning_time;
3665 
3666 	rebroadcast_time = warning_time = jiffies;
3667 	while (atomic_read(&dev->refcnt) != 0) {
3668 		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
3669 			rtnl_lock();
3670 
3671 			/* Rebroadcast unregister notification */
3672 			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
3673 
3674 			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
3675 				     &dev->state)) {
3676 				/* We must not have linkwatch events
3677 				 * pending on unregister. If this
3678 				 * happens, we simply run the queue
3679 				 * unscheduled, resulting in a noop
3680 				 * for this device.
3681 				 */
3682 				linkwatch_run_queue();
3683 			}
3684 
3685 			__rtnl_unlock();
3686 
3687 			rebroadcast_time = jiffies;
3688 		}
3689 
3690 		msleep(250);
3691 
3692 		if (time_after(jiffies, warning_time + 10 * HZ)) {
3693 			printk(KERN_EMERG "unregister_netdevice: "
3694 			       "waiting for %s to become free. Usage "
3695 			       "count = %d\n",
3696 			       dev->name, atomic_read(&dev->refcnt));
3697 			warning_time = jiffies;
3698 		}
3699 	}
3700 }
3701 
3702 /* The sequence is:
3703  *
3704  *	rtnl_lock();
3705  *	...
3706  *	register_netdevice(x1);
3707  *	register_netdevice(x2);
3708  *	...
3709  *	unregister_netdevice(y1);
3710  *	unregister_netdevice(y2);
3711  *      ...
3712  *	rtnl_unlock();
3713  *	free_netdev(y1);
3714  *	free_netdev(y2);
3715  *
3716  * We are invoked by rtnl_unlock() after it drops the semaphore.
3717  * This allows us to deal with problems:
3718  * 1) We can delete sysfs objects which invoke hotplug
3719  *    without deadlocking with linkwatch via keventd.
3720  * 2) Since we run with the RTNL semaphore not held, we can sleep
3721  *    safely in order to wait for the netdev refcnt to drop to zero.
3722  */
3723 static DEFINE_MUTEX(net_todo_run_mutex);
3724 void netdev_run_todo(void)
3725 {
3726 	struct list_head list;
3727 
3728 	/* Need to guard against multiple CPUs getting out of order. */
3729 	mutex_lock(&net_todo_run_mutex);
3730 
3731 	/* Not safe to do outside the semaphore.  We must not return
3732 	 * until all unregister events invoked by the local processor
3733 	 * have been completed (either by this todo run, or one on
3734 	 * another cpu).
3735 	 */
3736 	if (list_empty(&net_todo_list))
3737 		goto out;
3738 
3739 	/* Snapshot list, allow later requests */
3740 	spin_lock(&net_todo_list_lock);
3741 	list_replace_init(&net_todo_list, &list);
3742 	spin_unlock(&net_todo_list_lock);
3743 
3744 	while (!list_empty(&list)) {
3745 		struct net_device *dev
3746 			= list_entry(list.next, struct net_device, todo_list);
3747 		list_del(&dev->todo_list);
3748 
3749 		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
3750 			printk(KERN_ERR "network todo '%s' but state %d\n",
3751 			       dev->name, dev->reg_state);
3752 			dump_stack();
3753 			continue;
3754 		}
3755 
3756 		dev->reg_state = NETREG_UNREGISTERED;
3757 
3758 		netdev_wait_allrefs(dev);
3759 
3760 		/* paranoia */
3761 		BUG_ON(atomic_read(&dev->refcnt));
3762 		BUG_TRAP(!dev->ip_ptr);
3763 		BUG_TRAP(!dev->ip6_ptr);
3764 		BUG_TRAP(!dev->dn_ptr);
3765 
3766 		if (dev->destructor)
3767 			dev->destructor(dev);
3768 
3769 		/* Free network device */
3770 		kobject_put(&dev->dev.kobj);
3771 	}
3772 
3773 out:
3774 	mutex_unlock(&net_todo_run_mutex);
3775 }
3776 
3777 static struct net_device_stats *internal_stats(struct net_device *dev)
3778 {
3779 	return &dev->stats;
3780 }
3781 
3782 /**
3783  *	alloc_netdev_mq - allocate network device
3784  *	@sizeof_priv:	size of private data to allocate space for
3785  *	@name:		device name format string
3786  *	@setup:		callback to initialize device
3787  *	@queue_count:	the number of subqueues to allocate
3788  *
3789  *	Allocates a struct net_device with private data area for driver use
3790  *	and performs basic initialization.  Also allocates subqueue structs
3791  *	for each queue on the device at the end of the netdevice.
3792  */
3793 struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
3794 		void (*setup)(struct net_device *), unsigned int queue_count)
3795 {
3796 	void *p;
3797 	struct net_device *dev;
3798 	int alloc_size;
3799 
3800 	BUG_ON(strlen(name) >= sizeof(dev->name));
3801 
3802 	/* ensure 32-byte alignment of both the device and private area */
3803 	alloc_size = (sizeof(*dev) + NETDEV_ALIGN_CONST +
3804 		     (sizeof(struct net_device_subqueue) * (queue_count - 1))) &
3805 		     ~NETDEV_ALIGN_CONST;
3806 	alloc_size += sizeof_priv + NETDEV_ALIGN_CONST;
3807 
3808 	p = kzalloc(alloc_size, GFP_KERNEL);
3809 	if (!p) {
3810 		printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
3811 		return NULL;
3812 	}
3813 
3814 	dev = (struct net_device *)
3815 		(((long)p + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST);
3816 	dev->padded = (char *)dev - (char *)p;
3817 	dev->nd_net = &init_net;
3818 
3819 	if (sizeof_priv) {
3820 		dev->priv = ((char *)dev +
3821 			     ((sizeof(struct net_device) +
3822 			       (sizeof(struct net_device_subqueue) *
3823 				(queue_count - 1)) + NETDEV_ALIGN_CONST)
3824 			      & ~NETDEV_ALIGN_CONST));
3825 	}
3826 
3827 	dev->egress_subqueue_count = queue_count;
3828 
3829 	dev->get_stats = internal_stats;
3830 	netpoll_netdev_init(dev);
3831 	setup(dev);
3832 	strcpy(dev->name, name);
3833 	return dev;
3834 }
3835 EXPORT_SYMBOL(alloc_netdev_mq);
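
/*
 * Illustrative sketch (not part of the original file): a multiqueue
 * capable driver asking for a private area and four subqueues.  The
 * example_* names are hypothetical and ether_setup() assumes
 * <linux/etherdevice.h>; the point is that the area returned by
 * netdev_priv() lives in the same allocation as the net_device and its
 * subqueue array, exactly as laid out by alloc_netdev_mq() above.
 */
struct example_mq_priv {
	int	num_tx_rings;
};

static struct net_device *example_alloc_mq(void)
{
	struct net_device *dev;
	struct example_mq_priv *priv;

	dev = alloc_netdev_mq(sizeof(*priv), "examq%d", ether_setup, 4);
	if (!dev)
		return NULL;

	priv = netdev_priv(dev);
	priv->num_tx_rings = 4;
	dev->features |= NETIF_F_MULTI_QUEUE;	/* advertise the subqueues */
	return dev;
}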
3836 
3837 /**
3838  *	free_netdev - free network device
3839  *	@dev: device
3840  *
3841  *	This function does the last stage of destroying an allocated device
3842  * 	interface. The reference to the device object is released.
3843  *	If this is the last reference then it will be freed.
3844  */
3845 void free_netdev(struct net_device *dev)
3846 {
3847 	/*  Compatibility with error handling in drivers */
3848 	if (dev->reg_state == NETREG_UNINITIALIZED) {
3849 		kfree((char *)dev - dev->padded);
3850 		return;
3851 	}
3852 
3853 	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
3854 	dev->reg_state = NETREG_RELEASED;
3855 
3856 	/* will free via device release */
3857 	put_device(&dev->dev);
3858 }
3859 
3860 /* Synchronize with packet receive processing. */
3861 void synchronize_net(void)
3862 {
3863 	might_sleep();
3864 	synchronize_rcu();
3865 }
3866 
3867 /**
3868  *	unregister_netdevice - remove device from the kernel
3869  *	@dev: device
3870  *
3871  *	This function shuts down a device interface and removes it
3872  *	from the kernel tables.
3874  *
3875  *	Callers must hold the rtnl semaphore.  You may want
3876  *	unregister_netdev() instead of this.
3877  */
3878 
3879 void unregister_netdevice(struct net_device *dev)
3880 {
3881 	BUG_ON(dev_boot_phase);
3882 	ASSERT_RTNL();
3883 
3884 	/* Some devices call this without having registered, to unwind a failed init. */
3885 	if (dev->reg_state == NETREG_UNINITIALIZED) {
3886 		printk(KERN_DEBUG "unregister_netdevice: device %s/%p never "
3887 				  "was registered\n", dev->name, dev);
3888 
3889 		WARN_ON(1);
3890 		return;
3891 	}
3892 
3893 	BUG_ON(dev->reg_state != NETREG_REGISTERED);
3894 
3895 	/* If device is running, close it first. */
3896 	dev_close(dev);
3897 
3898 	/* And unlink it from device chain. */
3899 	unlist_netdevice(dev);
3900 
3901 	dev->reg_state = NETREG_UNREGISTERING;
3902 
3903 	synchronize_net();
3904 
3905 	/* Shutdown queueing discipline. */
3906 	dev_shutdown(dev);
3907 
3908 
3909 	/* Notify protocols that we are about to destroy
3910 	   this device, so they can clean up all their state.
3911 	*/
3912 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
3913 
3914 	/*
3915 	 *	Flush the unicast and multicast chains
3916 	 */
3917 	dev_addr_discard(dev);
3918 
3919 	if (dev->uninit)
3920 		dev->uninit(dev);
3921 
3922 	/* Notifier chain MUST detach us from master device. */
3923 	BUG_TRAP(!dev->master);
3924 
3925 	/* Remove entries from kobject tree */
3926 	netdev_unregister_kobject(dev);
3927 
3928 	/* Finish processing unregister after unlock */
3929 	net_set_todo(dev);
3930 
3931 	synchronize_net();
3932 
3933 	dev_put(dev);
3934 }
3935 
3936 /**
3937  *	unregister_netdev - remove device from the kernel
3938  *	@dev: device
3939  *
3940  *	This function shuts down a device interface and removes it
3941  *	from the kernel tables.
3943  *
3944  *	This is just a wrapper for unregister_netdevice that takes
3945  *	the rtnl semaphore.  In general you want to use this and not
3946  *	unregister_netdevice.
3947  */
3948 void unregister_netdev(struct net_device *dev)
3949 {
3950 	rtnl_lock();
3951 	unregister_netdevice(dev);
3952 	rtnl_unlock();
3953 }
3954 
3955 EXPORT_SYMBOL(unregister_netdev);
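
/*
 * Illustrative teardown order only, assuming the device was registered
 * with register_netdev():
 *
 *	unregister_netdev(dev);
 *	free_netdev(dev);
 */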
3956 
3957 /**
3958  *	dev_change_net_namespace - move device to a different network namespace
3959  *	@dev: device
3960  *	@net: network namespace
3961  *	@pat: if not NULL, name pattern to try if the current device name
3962  *	      is already taken in the destination network namespace.
3963  *
3964  *	This function shuts down a device interface and moves it
3965  *	to a new network namespace. On success 0 is returned, on
3966  *	a failure a negative errno code is returned.
3967  *
3968  *	Callers must hold the rtnl semaphore.
3969  */
3970 
3971 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
3972 {
3973 	char buf[IFNAMSIZ];
3974 	const char *destname;
3975 	int err;
3976 
3977 	ASSERT_RTNL();
3978 
3979 	/* Don't allow namespace local devices to be moved. */
3980 	err = -EINVAL;
3981 	if (dev->features & NETIF_F_NETNS_LOCAL)
3982 		goto out;
3983 
3984 	/* Ensure the device has been registered */
3985 	err = -EINVAL;
3986 	if (dev->reg_state != NETREG_REGISTERED)
3987 		goto out;
3988 
3989 	/* Get out if there is nothing to do */
3990 	err = 0;
3991 	if (dev->nd_net == net)
3992 		goto out;
3993 
3994 	/* Pick the destination device name, and ensure
3995 	 * we can use it in the destination network namespace.
3996 	 */
3997 	err = -EEXIST;
3998 	destname = dev->name;
3999 	if (__dev_get_by_name(net, destname)) {
4000 		/* We get here if we can't use the current device name */
4001 		if (!pat)
4002 			goto out;
4003 		if (!dev_valid_name(pat))
4004 			goto out;
4005 		if (strchr(pat, '%')) {
4006 			if (__dev_alloc_name(net, pat, buf) < 0)
4007 				goto out;
4008 			destname = buf;
4009 		} else
4010 			destname = pat;
4011 		if (__dev_get_by_name(net, destname))
4012 			goto out;
4013 	}
4014 
4015 	/*
4016 	 * And now a mini version of register_netdevice and unregister_netdevice.
4017 	 */
4018 
4019 	/* If device is running close it first. */
4020 	dev_close(dev);
4021 
4022 	/* And unlink it from device chain */
4023 	err = -ENODEV;
4024 	unlist_netdevice(dev);
4025 
4026 	synchronize_net();
4027 
4028 	/* Shutdown queueing discipline. */
4029 	dev_shutdown(dev);
4030 
4031 	/* Notify protocols that we are about to destroy
4032 	   this device, so they can clean up all their state.
4033 	*/
4034 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4035 
4036 	/*
4037 	 *	Flush the unicast and multicast chains
4038 	 */
4039 	dev_addr_discard(dev);
4040 
4041 	/* Actually switch the network namespace */
4042 	dev->nd_net = net;
4043 
4044 	/* Assign the new device name */
4045 	if (destname != dev->name)
4046 		strcpy(dev->name, destname);
4047 
4048 	/* If there is an ifindex conflict assign a new one */
4049 	if (__dev_get_by_index(net, dev->ifindex)) {
4050 		int iflink = (dev->iflink == dev->ifindex);
4051 		dev->ifindex = dev_new_index(net);
4052 		if (iflink)
4053 			dev->iflink = dev->ifindex;
4054 	}
4055 
4056 	/* Fixup kobjects */
4057 	err = device_rename(&dev->dev, dev->name);
4058 	WARN_ON(err);
4059 
4060 	/* Add the device back in the hashes */
4061 	list_netdevice(dev);
4062 
4063 	/* Notify protocols, that a new device appeared. */
4064 	call_netdevice_notifiers(NETDEV_REGISTER, dev);
4065 
4066 	synchronize_net();
4067 	err = 0;
4068 out:
4069 	return err;
4070 }
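
/*
 * Illustrative only: with the rtnl semaphore held, a caller moving "dev"
 * into the namespace "newnet" while keeping its current name would do
 *
 *	err = dev_change_net_namespace(dev, newnet, NULL);
 */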
4071 
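/*
 * CPU hotplug callback: when a CPU goes offline, splice its pending
 * completion and output queues onto the current CPU and re-inject any
 * packets left in its input_pkt_queue so nothing is lost.
 */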
4072 static int dev_cpu_callback(struct notifier_block *nfb,
4073 			    unsigned long action,
4074 			    void *ocpu)
4075 {
4076 	struct sk_buff **list_skb;
4077 	struct net_device **list_net;
4078 	struct sk_buff *skb;
4079 	unsigned int cpu, oldcpu = (unsigned long)ocpu;
4080 	struct softnet_data *sd, *oldsd;
4081 
4082 	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
4083 		return NOTIFY_OK;
4084 
4085 	local_irq_disable();
4086 	cpu = smp_processor_id();
4087 	sd = &per_cpu(softnet_data, cpu);
4088 	oldsd = &per_cpu(softnet_data, oldcpu);
4089 
4090 	/* Find end of our completion_queue. */
4091 	list_skb = &sd->completion_queue;
4092 	while (*list_skb)
4093 		list_skb = &(*list_skb)->next;
4094 	/* Append completion queue from offline CPU. */
4095 	*list_skb = oldsd->completion_queue;
4096 	oldsd->completion_queue = NULL;
4097 
4098 	/* Find end of our output_queue. */
4099 	list_net = &sd->output_queue;
4100 	while (*list_net)
4101 		list_net = &(*list_net)->next_sched;
4102 	/* Append output queue from offline CPU. */
4103 	*list_net = oldsd->output_queue;
4104 	oldsd->output_queue = NULL;
4105 
4106 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
4107 	local_irq_enable();
4108 
4109 	/* Process offline CPU's input_pkt_queue */
4110 	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue)))
4111 		netif_rx(skb);
4112 
4113 	return NOTIFY_OK;
4114 }
4115 
4116 #ifdef CONFIG_NET_DMA
4117 /**
4118  * net_dma_rebalance - try to maintain one DMA channel per CPU
4119  * @net_dma: DMA client and associated data (lock, channels, channel_mask)
4120  *
4121  * This is called when the number of channels allocated to the net_dma client
4122  * changes.  The net_dma client tries to have one DMA channel per CPU.
4123  */
4124 
4125 static void net_dma_rebalance(struct net_dma *net_dma)
4126 {
4127 	unsigned int cpu, i, n, chan_idx;
4128 	struct dma_chan *chan;
4129 
4130 	if (cpus_empty(net_dma->channel_mask)) {
4131 		for_each_online_cpu(cpu)
4132 			rcu_assign_pointer(per_cpu(softnet_data, cpu).net_dma, NULL);
4133 		return;
4134 	}
4135 
4136 	i = 0;
4137 	cpu = first_cpu(cpu_online_map);
4138 
4139 	for_each_cpu_mask(chan_idx, net_dma->channel_mask) {
4140 		chan = net_dma->channels[chan_idx];
4141 
4142 		n = ((num_online_cpus() / cpus_weight(net_dma->channel_mask))
4143 		   + (i < (num_online_cpus() %
4144 			cpus_weight(net_dma->channel_mask)) ? 1 : 0));
4145 
4146 		while (n) {
4147 			per_cpu(softnet_data, cpu).net_dma = chan;
4148 			cpu = next_cpu(cpu, cpu_online_map);
4149 			n--;
4150 		}
4151 		i++;
4152 	}
4153 }
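
/*
 * Example of the distribution above: with 8 online CPUs and 3 channels,
 * 8 / 3 = 2 with a remainder of 2, so the first two channels each serve
 * three CPUs and the third serves two.
 */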
4154 
4155 /**
4156  * netdev_dma_event - event callback for the net_dma_client
4157  * @client: should always be net_dma_client
4158  * @chan: DMA channel for the event
4159  * @state: DMA state to be handled
4160  */
4161 static enum dma_state_client
4162 netdev_dma_event(struct dma_client *client, struct dma_chan *chan,
4163 	enum dma_state state)
4164 {
4165 	int i, found = 0, pos = -1;
4166 	struct net_dma *net_dma =
4167 		container_of(client, struct net_dma, client);
4168 	enum dma_state_client ack = DMA_DUP; /* default: take no action */
4169 
4170 	spin_lock(&net_dma->lock);
4171 	switch (state) {
4172 	case DMA_RESOURCE_AVAILABLE:
4173 		for (i = 0; i < NR_CPUS; i++)
4174 			if (net_dma->channels[i] == chan) {
4175 				found = 1;
4176 				break;
4177 			} else if (net_dma->channels[i] == NULL && pos < 0)
4178 				pos = i;
4179 
4180 		if (!found && pos >= 0) {
4181 			ack = DMA_ACK;
4182 			net_dma->channels[pos] = chan;
4183 			cpu_set(pos, net_dma->channel_mask);
4184 			net_dma_rebalance(net_dma);
4185 		}
4186 		break;
4187 	case DMA_RESOURCE_REMOVED:
4188 		for (i = 0; i < NR_CPUS; i++)
4189 			if (net_dma->channels[i] == chan) {
4190 				found = 1;
4191 				pos = i;
4192 				break;
4193 			}
4194 
4195 		if (found) {
4196 			ack = DMA_ACK;
4197 			cpu_clear(pos, net_dma->channel_mask);
4198 			net_dma->channels[i] = NULL;
4199 			net_dma_rebalance(net_dma);
4200 		}
4201 		break;
4202 	default:
4203 		break;
4204 	}
4205 	spin_unlock(&net_dma->lock);
4206 
4207 	return ack;
4208 }
4209 
4210 /**
4211  * netdev_dma_register - register the networking subsystem as a DMA client
4212  */
4213 static int __init netdev_dma_register(void)
4214 {
4215 	spin_lock_init(&net_dma.lock);
4216 	dma_cap_set(DMA_MEMCPY, net_dma.client.cap_mask);
4217 	dma_async_client_register(&net_dma.client);
4218 	dma_async_client_chan_request(&net_dma.client);
4219 	return 0;
4220 }
4221 
4222 #else
4223 static int __init netdev_dma_register(void) { return -ENODEV; }
4224 #endif /* CONFIG_NET_DMA */
4225 
4226 /**
4227  *	netdev_compute_features - compute conjunction of two feature sets
4228  *	@all: current feature set of the master device
4229  *	@one: feature set of the device being added
4230  *
4231  *	Computes a new feature set after adding a device with feature set
4232  *	@one to the master device with current feature set @all.  Returns
4233  *	the new feature set.
4234  */
4235 int netdev_compute_features(unsigned long all, unsigned long one)
4236 {
4237 	/* if device needs checksumming, downgrade to hw checksumming */
4238 	if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
4239 		all ^= NETIF_F_NO_CSUM | NETIF_F_HW_CSUM;
4240 
4241 	/* if device can't do all checksum, downgrade to ipv4/ipv6 */
4242 	if (all & NETIF_F_HW_CSUM && !(one & NETIF_F_HW_CSUM))
4243 		all ^= NETIF_F_HW_CSUM
4244 			| NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM;
4245 
4246 	if (one & NETIF_F_GSO)
4247 		one |= NETIF_F_GSO_SOFTWARE;
4248 	one |= NETIF_F_GSO;
4249 
4250 	/* If even one device supports robust GSO, enable it for all. */
4251 	if (one & NETIF_F_GSO_ROBUST)
4252 		all |= NETIF_F_GSO_ROBUST;
4253 
4254 	all &= one | NETIF_F_LLTX;
4255 
4256 	if (!(all & NETIF_F_ALL_CSUM))
4257 		all &= ~NETIF_F_SG;
4258 	if (!(all & NETIF_F_SG))
4259 		all &= ~NETIF_F_GSO_MASK;
4260 
4261 	return all;
4262 }
4263 EXPORT_SYMBOL(netdev_compute_features);
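
/*
 * Illustrative only: a master driver (e.g. bonding or bridging) folding a
 * new slave into its feature set might do something like
 *
 *	master->features = netdev_compute_features(master->features,
 *						   slave->features);
 */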
4264 
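/*
 * Allocate a table of NETDEV_HASHENTRIES empty hlist heads, used below
 * for the per-namespace name and ifindex hash tables.
 */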
4265 static struct hlist_head *netdev_create_hash(void)
4266 {
4267 	int i;
4268 	struct hlist_head *hash;
4269 
4270 	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
4271 	if (hash != NULL)
4272 		for (i = 0; i < NETDEV_HASHENTRIES; i++)
4273 			INIT_HLIST_HEAD(&hash[i]);
4274 
4275 	return hash;
4276 }
4277 
4278 /* Initialize per network namespace state */
4279 static int __net_init netdev_init(struct net *net)
4280 {
4281 	INIT_LIST_HEAD(&net->dev_base_head);
4282 	rwlock_init(&dev_base_lock);
4283 
4284 	net->dev_name_head = netdev_create_hash();
4285 	if (net->dev_name_head == NULL)
4286 		goto err_name;
4287 
4288 	net->dev_index_head = netdev_create_hash();
4289 	if (net->dev_index_head == NULL)
4290 		goto err_idx;
4291 
4292 	return 0;
4293 
4294 err_idx:
4295 	kfree(net->dev_name_head);
4296 err_name:
4297 	return -ENOMEM;
4298 }
4299 
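/* Free the per-namespace name and ifindex hash tables. */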
4300 static void __net_exit netdev_exit(struct net *net)
4301 {
4302 	kfree(net->dev_name_head);
4303 	kfree(net->dev_index_head);
4304 }
4305 
4306 static struct pernet_operations __net_initdata netdev_net_ops = {
4307 	.init = netdev_init,
4308 	.exit = netdev_exit,
4309 };
4310 
4311 static void __net_exit default_device_exit(struct net *net)
4312 {
4313 	struct net_device *dev, *next;
4314 	/*
4315 	 * Push all migratable network devices back to the
4316 	 * initial network namespace
4317 	 */
4318 	rtnl_lock();
4319 	for_each_netdev_safe(net, dev, next) {
4320 		int err;
4321 
4322 		/* Ignore unmovable devices (e.g. loopback) */
4323 		if (dev->features & NETIF_F_NETNS_LOCAL)
4324 			continue;
4325 
4326 		/* Push remaining network devices to init_net */
4327 		err = dev_change_net_namespace(dev, &init_net, "dev%d");
4328 		if (err) {
4329 			printk(KERN_WARNING "%s: failed to move %s to init_net: %d\n",
4330 				__func__, dev->name, err);
4331 			unregister_netdevice(dev);
4332 		}
4333 	}
4334 	rtnl_unlock();
4335 }
4336 
4337 static struct pernet_operations __net_initdata default_device_ops = {
4338 	.exit = default_device_exit,
4339 };
4340 
4341 /*
4342  *	Initialize the DEV module. At boot time this walks the device list and
4343  *	unhooks any devices that fail to initialise (normally hardware not
4344  *	present) and leaves us with a valid list of present and active devices.
4345  *
4346  */
4347 
4348 /*
4349  *       This is called single-threaded during boot, so no need
4350  *       to take the rtnl semaphore.
4351  */
4352 static int __init net_dev_init(void)
4353 {
4354 	int i, rc = -ENOMEM;
4355 
4356 	BUG_ON(!dev_boot_phase);
4357 
4358 	if (dev_proc_init())
4359 		goto out;
4360 
4361 	if (netdev_kobject_init())
4362 		goto out;
4363 
4364 	INIT_LIST_HEAD(&ptype_all);
4365 	for (i = 0; i < 16; i++)
4366 		INIT_LIST_HEAD(&ptype_base[i]);
4367 
4368 	if (register_pernet_subsys(&netdev_net_ops))
4369 		goto out;
4370 
4371 	if (register_pernet_device(&default_device_ops))
4372 		goto out;
4373 
4374 	/*
4375 	 *	Initialise the packet receive queues.
4376 	 */
4377 
4378 	for_each_possible_cpu(i) {
4379 		struct softnet_data *queue;
4380 
4381 		queue = &per_cpu(softnet_data, i);
4382 		skb_queue_head_init(&queue->input_pkt_queue);
4383 		queue->completion_queue = NULL;
4384 		INIT_LIST_HEAD(&queue->poll_list);
4385 
4386 		queue->backlog.poll = process_backlog;
4387 		queue->backlog.weight = weight_p;
4388 	}
4389 
4390 	netdev_dma_register();
4391 
4392 	dev_boot_phase = 0;
4393 
4394 	open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL);
4395 	open_softirq(NET_RX_SOFTIRQ, net_rx_action, NULL);
4396 
4397 	hotcpu_notifier(dev_cpu_callback, 0);
4398 	dst_init();
4399 	dev_mcast_init();
4400 	rc = 0;
4401 out:
4402 	return rc;
4403 }
4404 
4405 subsys_initcall(net_dev_init);
4406 
4407 EXPORT_SYMBOL(__dev_get_by_index);
4408 EXPORT_SYMBOL(__dev_get_by_name);
4409 EXPORT_SYMBOL(__dev_remove_pack);
4410 EXPORT_SYMBOL(dev_valid_name);
4411 EXPORT_SYMBOL(dev_add_pack);
4412 EXPORT_SYMBOL(dev_alloc_name);
4413 EXPORT_SYMBOL(dev_close);
4414 EXPORT_SYMBOL(dev_get_by_flags);
4415 EXPORT_SYMBOL(dev_get_by_index);
4416 EXPORT_SYMBOL(dev_get_by_name);
4417 EXPORT_SYMBOL(dev_open);
4418 EXPORT_SYMBOL(dev_queue_xmit);
4419 EXPORT_SYMBOL(dev_remove_pack);
4420 EXPORT_SYMBOL(dev_set_allmulti);
4421 EXPORT_SYMBOL(dev_set_promiscuity);
4422 EXPORT_SYMBOL(dev_change_flags);
4423 EXPORT_SYMBOL(dev_set_mtu);
4424 EXPORT_SYMBOL(dev_set_mac_address);
4425 EXPORT_SYMBOL(free_netdev);
4426 EXPORT_SYMBOL(netdev_boot_setup_check);
4427 EXPORT_SYMBOL(netdev_set_master);
4428 EXPORT_SYMBOL(netdev_state_change);
4429 EXPORT_SYMBOL(netif_receive_skb);
4430 EXPORT_SYMBOL(netif_rx);
4431 EXPORT_SYMBOL(register_gifconf);
4432 EXPORT_SYMBOL(register_netdevice);
4433 EXPORT_SYMBOL(register_netdevice_notifier);
4434 EXPORT_SYMBOL(skb_checksum_help);
4435 EXPORT_SYMBOL(synchronize_net);
4436 EXPORT_SYMBOL(unregister_netdevice);
4437 EXPORT_SYMBOL(unregister_netdevice_notifier);
4438 EXPORT_SYMBOL(net_enable_timestamp);
4439 EXPORT_SYMBOL(net_disable_timestamp);
4440 EXPORT_SYMBOL(dev_get_flags);
4441 
4442 #if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
4443 EXPORT_SYMBOL(br_handle_frame_hook);
4444 EXPORT_SYMBOL(br_fdb_get_hook);
4445 EXPORT_SYMBOL(br_fdb_put_hook);
4446 #endif
4447 
4448 #ifdef CONFIG_KMOD
4449 EXPORT_SYMBOL(dev_load);
4450 #endif
4451 
4452 EXPORT_PER_CPU_SYMBOL(softnet_data);
4453