xref: /linux/net/core/dev.c (revision 2b8232ce512105e28453f301d1510de8363bccd1)
1 /*
2  * 	NET3	Protocol independent device support routines.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  *	Derived from the non IP parts of dev.c 1.0.19
10  * 		Authors:	Ross Biro
11  *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *				Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *
14  *	Additional Authors:
15  *		Florian la Roche <rzsfl@rz.uni-sb.de>
16  *		Alan Cox <gw4pts@gw4pts.ampr.org>
17  *		David Hinds <dahinds@users.sourceforge.net>
18  *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19  *		Adam Sulmicki <adam@cfar.umd.edu>
20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
21  *
22  *	Changes:
23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
24  *              			to 2 if register_netdev gets called
25  *              			before net_dev_init & also removed a
26  *              			few lines of code in the process.
27  *		Alan Cox	:	device private ioctl copies fields back.
28  *		Alan Cox	:	Transmit queue code does relevant
29  *					stunts to keep the queue safe.
30  *		Alan Cox	:	Fixed double lock.
31  *		Alan Cox	:	Fixed promisc NULL pointer trap
32  *		????????	:	Support the full private ioctl range
33  *		Alan Cox	:	Moved ioctl permission check into
34  *					drivers
35  *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
36  *		Alan Cox	:	100 backlog just doesn't cut it when
37  *					you start doing multicast video 8)
38  *		Alan Cox	:	Rewrote net_bh and list manager.
39  *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
40  *		Alan Cox	:	Took out transmit every packet pass
41  *					Saved a few bytes in the ioctl handler
42  *		Alan Cox	:	Network driver sets packet type before
43  *					calling netif_rx. Saves a function
44  *					call a packet.
45  *		Alan Cox	:	Hashed net_bh()
46  *		Richard Kooijman:	Timestamp fixes.
47  *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
48  *		Alan Cox	:	Device lock protection.
49  *		Alan Cox	: 	Fixed nasty side effect of device close
50  *					changes.
51  *		Rudi Cilibrasi	:	Pass the right thing to
52  *					set_mac_address()
53  *		Dave Miller	:	32bit quantity for the device lock to
54  *					make it work out on a Sparc.
55  *		Bjorn Ekwall	:	Added KERNELD hack.
56  *		Alan Cox	:	Cleaned up the backlog initialise.
57  *		Craig Metz	:	SIOCGIFCONF fix if space for under
58  *					1 device.
59  *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
60  *					is no device open function.
61  *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
62  *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
63  *		Cyrus Durgin	:	Cleaned for KMOD
64  *		Adam Sulmicki   :	Bug Fix : Network Device Unload
65  *					A network device unload needs to purge
66  *					the backlog queue.
67  *	Paul Rusty Russell	:	SIOCSIFNAME
68  *              Pekka Riikonen  :	Netdev boot-time settings code
69  *              Andrew Morton   :       Make unregister_netdevice wait
70  *              			indefinitely on dev->refcnt
71  * 		J Hadi Salim	:	- Backlog queue sampling
72  *				        - netif_rx() feedback
73  */
74 
75 #include <asm/uaccess.h>
76 #include <asm/system.h>
77 #include <linux/bitops.h>
78 #include <linux/capability.h>
79 #include <linux/cpu.h>
80 #include <linux/types.h>
81 #include <linux/kernel.h>
82 #include <linux/sched.h>
83 #include <linux/mutex.h>
84 #include <linux/string.h>
85 #include <linux/mm.h>
86 #include <linux/socket.h>
87 #include <linux/sockios.h>
88 #include <linux/errno.h>
89 #include <linux/interrupt.h>
90 #include <linux/if_ether.h>
91 #include <linux/netdevice.h>
92 #include <linux/etherdevice.h>
93 #include <linux/notifier.h>
94 #include <linux/skbuff.h>
95 #include <net/net_namespace.h>
96 #include <net/sock.h>
97 #include <linux/rtnetlink.h>
98 #include <linux/proc_fs.h>
99 #include <linux/seq_file.h>
100 #include <linux/stat.h>
101 #include <linux/if_bridge.h>
102 #include <linux/if_macvlan.h>
103 #include <net/dst.h>
104 #include <net/pkt_sched.h>
105 #include <net/checksum.h>
106 #include <linux/highmem.h>
107 #include <linux/init.h>
108 #include <linux/kmod.h>
109 #include <linux/module.h>
110 #include <linux/kallsyms.h>
111 #include <linux/netpoll.h>
112 #include <linux/rcupdate.h>
113 #include <linux/delay.h>
114 #include <net/wext.h>
115 #include <net/iw_handler.h>
116 #include <asm/current.h>
117 #include <linux/audit.h>
118 #include <linux/dmaengine.h>
119 #include <linux/err.h>
120 #include <linux/ctype.h>
121 #include <linux/if_arp.h>
122 
123 /*
124  *	The list of packet types we will receive (as opposed to discard)
125  *	and the routines to invoke.
126  *
127  *	Why 16? Because with 16 the only overlap we get on a hash of the
128  *	low nibble of the protocol value is RARP/SNAP/X.25.
129  *
130  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
131  *             sure which should go first, but I bet it won't make much
132  *             difference if we are running VLANs.  The good news is that
133  *             this protocol won't be in the list unless compiled in, so
134  *             the average user (w/out VLANs) will not be adversely affected.
135  *             --BLG
136  *
137  *		0800	IP
138  *		8100    802.1Q VLAN
139  *		0001	802.3
140  *		0002	AX.25
141  *		0004	802.2
142  *		8035	RARP
143  *		0005	SNAP
144  *		0805	X.25
145  *		0806	ARP
146  *		8137	IPX
147  *		0009	Localtalk
148  *		86DD	IPv6
149  */
150 
151 static DEFINE_SPINLOCK(ptype_lock);
152 static struct list_head ptype_base[16] __read_mostly;	/* 16 way hashed list */
153 static struct list_head ptype_all __read_mostly;	/* Taps */
154 
155 #ifdef CONFIG_NET_DMA
156 struct net_dma {
157 	struct dma_client client;
158 	spinlock_t lock;
159 	cpumask_t channel_mask;
160 	struct dma_chan *channels[NR_CPUS];
161 };
162 
163 static enum dma_state_client
164 netdev_dma_event(struct dma_client *client, struct dma_chan *chan,
165 	enum dma_state state);
166 
167 static struct net_dma net_dma = {
168 	.client = {
169 		.event_callback = netdev_dma_event,
170 	},
171 };
172 #endif
173 
174 /*
175  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
176  * semaphore.
177  *
178  * Pure readers hold dev_base_lock for reading.
179  *
180  * Writers must hold the rtnl semaphore while they loop through the
181  * dev_base_head list, and hold dev_base_lock for writing when they do the
182  * actual updates.  This allows pure readers to access the list even
183  * while a writer is preparing to update it.
184  *
185  * To put it another way, dev_base_lock is held for writing only to
186  * protect against pure readers; the rtnl semaphore provides the
187  * protection against other writers.
188  *
189  * For example usages, see register_netdevice() and
190  * unregister_netdevice(), which must be called with the rtnl
191  * semaphore held.
192  */
193 DEFINE_RWLOCK(dev_base_lock);
194 
195 EXPORT_SYMBOL(dev_base_lock);
196 
197 #define NETDEV_HASHBITS	8
198 #define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS)
199 
200 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
201 {
202 	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
203 	return &net->dev_name_head[hash & ((1 << NETDEV_HASHBITS) - 1)];
204 }
205 
206 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
207 {
208 	return &net->dev_index_head[ifindex & ((1 << NETDEV_HASHBITS) - 1)];
209 }
210 
211 /* Device list insertion */
212 static int list_netdevice(struct net_device *dev)
213 {
214 	struct net *net = dev->nd_net;
215 
216 	ASSERT_RTNL();
217 
218 	write_lock_bh(&dev_base_lock);
219 	list_add_tail(&dev->dev_list, &net->dev_base_head);
220 	hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name));
221 	hlist_add_head(&dev->index_hlist, dev_index_hash(net, dev->ifindex));
222 	write_unlock_bh(&dev_base_lock);
223 	return 0;
224 }
225 
226 /* Device list removal */
227 static void unlist_netdevice(struct net_device *dev)
228 {
229 	ASSERT_RTNL();
230 
231 	/* Unlink dev from the device chain */
232 	write_lock_bh(&dev_base_lock);
233 	list_del(&dev->dev_list);
234 	hlist_del(&dev->name_hlist);
235 	hlist_del(&dev->index_hlist);
236 	write_unlock_bh(&dev_base_lock);
237 }
238 
239 /*
240  *	Our notifier list
241  */
242 
243 static RAW_NOTIFIER_HEAD(netdev_chain);
244 
245 /*
246  *	Device drivers call our routines to queue packets here. We empty the
247  *	queue in the local softnet handler.
248  */
249 
250 DEFINE_PER_CPU(struct softnet_data, softnet_data);
251 
252 extern int netdev_kobject_init(void);
253 extern int netdev_register_kobject(struct net_device *);
254 extern void netdev_unregister_kobject(struct net_device *);
255 
256 #ifdef CONFIG_DEBUG_LOCK_ALLOC
257 /*
258  * register_netdevice() inits dev->_xmit_lock and sets lockdep class
259  * according to dev->type
260  */
261 static const unsigned short netdev_lock_type[] =
262 	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
263 	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
264 	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
265 	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
266 	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
267 	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
268 	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
269 	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
270 	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
271 	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
272 	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
273 	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
274 	 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
275 	 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_VOID,
276 	 ARPHRD_NONE};
277 
278 static const char *netdev_lock_name[] =
279 	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
280 	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
281 	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
282 	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
283 	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
284 	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
285 	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
286 	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
287 	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
288 	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
289 	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
290 	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
291 	 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
292 	 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_VOID",
293 	 "_xmit_NONE"};
294 
295 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
296 
297 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
298 {
299 	int i;
300 
301 	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
302 		if (netdev_lock_type[i] == dev_type)
303 			return i;
304 	/* the last key is used by default */
305 	return ARRAY_SIZE(netdev_lock_type) - 1;
306 }
307 
308 static inline void netdev_set_lockdep_class(spinlock_t *lock,
309 					    unsigned short dev_type)
310 {
311 	int i;
312 
313 	i = netdev_lock_pos(dev_type);
314 	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
315 				   netdev_lock_name[i]);
316 }
317 #else
318 static inline void netdev_set_lockdep_class(spinlock_t *lock,
319 					    unsigned short dev_type)
320 {
321 }
322 #endif
323 
324 /*******************************************************************************
325 
326 		Protocol management and registration routines
327 
328 *******************************************************************************/
329 
330 /*
331  *	Add a protocol ID to the list. Now that the input handler is
332  *	smarter we can dispense with all the messy stuff that used to be
333  *	here.
334  *
335  *	BEWARE!!! Protocol handlers that mangle input packets
336  *	MUST BE last in the hash buckets, and checking of protocol handlers
337  *	MUST start from the promiscuous ptype_all chain in net_bh.
338  *	This is true now; do not change it.
339  *	Explanation: if a mangling protocol handler were
340  *	first on the list, it could not tell that the packet
341  *	is cloned and should be copied-on-write, so it would
342  *	change it and subsequent readers would get a broken packet.
343  *							--ANK (980803)
344  */
345 
346 /**
347  *	dev_add_pack - add packet handler
348  *	@pt: packet type declaration
349  *
350  *	Add a protocol handler to the networking stack. The passed &packet_type
351  *	is linked into kernel lists and may not be freed until it has been
352  *	removed from the kernel lists.
353  *
354  *	This call does not sleep, therefore it cannot
355  *	guarantee that all CPUs in the middle of receiving packets
356  *	will see the new packet type (until the next received packet).
357  */
358 
359 void dev_add_pack(struct packet_type *pt)
360 {
361 	int hash;
362 
363 	spin_lock_bh(&ptype_lock);
364 	if (pt->type == htons(ETH_P_ALL))
365 		list_add_rcu(&pt->list, &ptype_all);
366 	else {
367 		hash = ntohs(pt->type) & 15;
368 		list_add_rcu(&pt->list, &ptype_base[hash]);
369 	}
370 	spin_unlock_bh(&ptype_lock);
371 }
372 
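/*
 * Example usage (illustrative sketch; the handler and variable names below
 * are hypothetical): a module that wants to tap every received frame
 * registers a &packet_type of type %ETH_P_ALL and passes it to
 * dev_add_pack():
 *
 *	static int example_rcv(struct sk_buff *skb, struct net_device *dev,
 *			       struct packet_type *pt,
 *			       struct net_device *orig_dev)
 *	{
 *		kfree_skb(skb);
 *		return NET_RX_SUCCESS;
 *	}
 *
 *	static struct packet_type example_pt __read_mostly = {
 *		.type	= htons(ETH_P_ALL),
 *		.func	= example_rcv,
 *	};
 *
 *	dev_add_pack(&example_pt);
 *
 * The handler must be removed with dev_remove_pack(&example_pt) before the
 * structure (or the module containing it) is freed.
 */
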
373 /**
374  *	__dev_remove_pack	 - remove packet handler
375  *	@pt: packet type declaration
376  *
377  *	Remove a protocol handler that was previously added to the kernel
378  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
379  *	from the kernel lists and can be freed or reused once this function
380  *	returns.
381  *
382  *      The packet type might still be in use by receivers
383  *	and must not be freed until after all CPUs have gone
384  *	through a quiescent state.
385  */
386 void __dev_remove_pack(struct packet_type *pt)
387 {
388 	struct list_head *head;
389 	struct packet_type *pt1;
390 
391 	spin_lock_bh(&ptype_lock);
392 
393 	if (pt->type == htons(ETH_P_ALL))
394 		head = &ptype_all;
395 	else
396 		head = &ptype_base[ntohs(pt->type) & 15];
397 
398 	list_for_each_entry(pt1, head, list) {
399 		if (pt == pt1) {
400 			list_del_rcu(&pt->list);
401 			goto out;
402 		}
403 	}
404 
405 	printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
406 out:
407 	spin_unlock_bh(&ptype_lock);
408 }
409 /**
410  *	dev_remove_pack	 - remove packet handler
411  *	@pt: packet type declaration
412  *
413  *	Remove a protocol handler that was previously added to the kernel
414  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
415  *	from the kernel lists and can be freed or reused once this function
416  *	returns.
417  *
418  *	This call sleeps to guarantee that no CPU is looking at the packet
419  *	type after return.
420  */
421 void dev_remove_pack(struct packet_type *pt)
422 {
423 	__dev_remove_pack(pt);
424 
425 	synchronize_net();
426 }
427 
428 /******************************************************************************
429 
430 		      Device Boot-time Settings Routines
431 
432 *******************************************************************************/
433 
434 /* Boot time configuration table */
435 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
436 
437 /**
438  *	netdev_boot_setup_add	- add new setup entry
439  *	@name: name of the device
440  *	@map: configured settings for the device
441  *
442  *	Adds a new setup entry to the dev_boot_setup list.  The function
443  *	returns 0 on error and 1 on success.  This is a generic routine for
444  *	all netdevices.
445  */
446 static int netdev_boot_setup_add(char *name, struct ifmap *map)
447 {
448 	struct netdev_boot_setup *s;
449 	int i;
450 
451 	s = dev_boot_setup;
452 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
453 		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
454 			memset(s[i].name, 0, sizeof(s[i].name));
455 			strcpy(s[i].name, name);
456 			memcpy(&s[i].map, map, sizeof(s[i].map));
457 			break;
458 		}
459 	}
460 
461 	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
462 }
463 
464 /**
465  *	netdev_boot_setup_check	- check boot time settings
466  *	@dev: the netdevice
467  *
468  * 	Check boot time settings for the device.
469  *	Any settings found are stored in the device so they can be used
470  *	later during device probing.
471  *	Returns 0 if no settings are found, 1 if they are.
472  */
473 int netdev_boot_setup_check(struct net_device *dev)
474 {
475 	struct netdev_boot_setup *s = dev_boot_setup;
476 	int i;
477 
478 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
479 		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
480 		    !strncmp(dev->name, s[i].name, strlen(s[i].name))) {
481 			dev->irq 	= s[i].map.irq;
482 			dev->base_addr 	= s[i].map.base_addr;
483 			dev->mem_start 	= s[i].map.mem_start;
484 			dev->mem_end 	= s[i].map.mem_end;
485 			return 1;
486 		}
487 	}
488 	return 0;
489 }
490 
491 
492 /**
493  *	netdev_boot_base	- get address from boot time settings
494  *	@prefix: prefix for network device
495  *	@unit: id for network device
496  *
497  * 	Check boot time settings for the base address of the device.
498  *	The found base address is returned for use
499  *	later during device probing.
500  *	Returns 0 if no settings are found.
501  */
502 unsigned long netdev_boot_base(const char *prefix, int unit)
503 {
504 	const struct netdev_boot_setup *s = dev_boot_setup;
505 	char name[IFNAMSIZ];
506 	int i;
507 
508 	sprintf(name, "%s%d", prefix, unit);
509 
510 	/*
511 	 * If device already registered then return base of 1
512 	 * to indicate not to probe for this interface
513 	 */
514 	if (__dev_get_by_name(&init_net, name))
515 		return 1;
516 
517 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
518 		if (!strcmp(name, s[i].name))
519 			return s[i].map.base_addr;
520 	return 0;
521 }
522 
523 /*
524  * Saves at boot time configured settings for any netdevice.
525  */
526 int __init netdev_boot_setup(char *str)
527 {
528 	int ints[5];
529 	struct ifmap map;
530 
531 	str = get_options(str, ARRAY_SIZE(ints), ints);
532 	if (!str || !*str)
533 		return 0;
534 
535 	/* Save settings */
536 	memset(&map, 0, sizeof(map));
537 	if (ints[0] > 0)
538 		map.irq = ints[1];
539 	if (ints[0] > 1)
540 		map.base_addr = ints[2];
541 	if (ints[0] > 2)
542 		map.mem_start = ints[3];
543 	if (ints[0] > 3)
544 		map.mem_end = ints[4];
545 
546 	/* Add new entry to the list */
547 	return netdev_boot_setup_add(str, &map);
548 }
549 
550 __setup("netdev=", netdev_boot_setup);
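
/*
 * Example (illustrative only): with the parser above, a kernel command line
 * of
 *
 *	netdev=9,0x300,0,0,eth0
 *
 * records irq 9 and I/O base 0x300 for the device that later registers as
 * "eth0"; netdev_boot_setup_check() applies the values when the driver
 * probes.  Trailing integers may be omitted, but the name is required.
 */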
551 
552 /*******************************************************************************
553 
554 			    Device Interface Subroutines
555 
556 *******************************************************************************/
557 
558 /**
559  *	__dev_get_by_name	- find a device by its name
560  *	@net: the applicable net namespace
561  *	@name: name to find
562  *
563  *	Find an interface by name. Must be called under RTNL semaphore
564  *	or @dev_base_lock. If the name is found a pointer to the device
565  *	is returned. If the name is not found then %NULL is returned. The
566  *	reference counters are not incremented so the caller must be
567  *	careful with locks.
568  */
569 
570 struct net_device *__dev_get_by_name(struct net *net, const char *name)
571 {
572 	struct hlist_node *p;
573 
574 	hlist_for_each(p, dev_name_hash(net, name)) {
575 		struct net_device *dev
576 			= hlist_entry(p, struct net_device, name_hlist);
577 		if (!strncmp(dev->name, name, IFNAMSIZ))
578 			return dev;
579 	}
580 	return NULL;
581 }
582 
583 /**
584  *	dev_get_by_name		- find a device by its name
585  *	@net: the applicable net namespace
586  *	@name: name to find
587  *
588  *	Find an interface by name. This can be called from any
589  *	context and does its own locking. The returned handle has
590  *	the usage count incremented and the caller must use dev_put() to
591  *	release it when it is no longer needed. %NULL is returned if no
592  *	matching device is found.
593  */
594 
595 struct net_device *dev_get_by_name(struct net *net, const char *name)
596 {
597 	struct net_device *dev;
598 
599 	read_lock(&dev_base_lock);
600 	dev = __dev_get_by_name(net, name);
601 	if (dev)
602 		dev_hold(dev);
603 	read_unlock(&dev_base_lock);
604 	return dev;
605 }
606 
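/*
 * Example usage (illustrative; "net" stands for the caller's namespace,
 * typically &init_net):
 *
 *	struct net_device *dev = dev_get_by_name(net, "eth0");
 *
 *	if (dev) {
 *		... use dev ...
 *		dev_put(dev);
 *	}
 */
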
607 /**
608  *	__dev_get_by_index - find a device by its ifindex
609  *	@net: the applicable net namespace
610  *	@ifindex: index of device
611  *
612  *	Search for an interface by index. Returns a pointer to the device,
613  *	or %NULL if it is not found. The device has not
614  *	had its reference counter increased so the caller must be careful
615  *	about locking. The caller must hold either the RTNL semaphore
616  *	or @dev_base_lock.
617  */
618 
619 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
620 {
621 	struct hlist_node *p;
622 
623 	hlist_for_each(p, dev_index_hash(net, ifindex)) {
624 		struct net_device *dev
625 			= hlist_entry(p, struct net_device, index_hlist);
626 		if (dev->ifindex == ifindex)
627 			return dev;
628 	}
629 	return NULL;
630 }
631 
632 
633 /**
634  *	dev_get_by_index - find a device by its ifindex
635  *	@net: the applicable net namespace
636  *	@ifindex: index of device
637  *
638  *	Search for an interface by index. Returns a pointer to the device,
639  *	or %NULL if it is not found. The device returned has
640  *	had a reference added and the pointer is safe until the user calls
641  *	dev_put to indicate they have finished with it.
642  */
643 
644 struct net_device *dev_get_by_index(struct net *net, int ifindex)
645 {
646 	struct net_device *dev;
647 
648 	read_lock(&dev_base_lock);
649 	dev = __dev_get_by_index(net, ifindex);
650 	if (dev)
651 		dev_hold(dev);
652 	read_unlock(&dev_base_lock);
653 	return dev;
654 }
655 
656 /**
657  *	dev_getbyhwaddr - find a device by its hardware address
658  *	@net: the applicable net namespace
659  *	@type: media type of device
660  *	@ha: hardware address
661  *
662  *	Search for an interface by MAC address. Returns a pointer to the
663  *	device, or %NULL if it is not found. The caller must hold the
664  *	rtnl semaphore. The returned device has not had its ref count increased
665  *	and the caller must therefore be careful about locking.
666  *
667  *	BUGS:
668  *	If the API was consistent this would be __dev_get_by_hwaddr
669  */
670 
671 struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *ha)
672 {
673 	struct net_device *dev;
674 
675 	ASSERT_RTNL();
676 
677 	for_each_netdev(net, dev)
678 		if (dev->type == type &&
679 		    !memcmp(dev->dev_addr, ha, dev->addr_len))
680 			return dev;
681 
682 	return NULL;
683 }
684 
685 EXPORT_SYMBOL(dev_getbyhwaddr);
686 
687 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
688 {
689 	struct net_device *dev;
690 
691 	ASSERT_RTNL();
692 	for_each_netdev(net, dev)
693 		if (dev->type == type)
694 			return dev;
695 
696 	return NULL;
697 }
698 
699 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
700 
701 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
702 {
703 	struct net_device *dev;
704 
705 	rtnl_lock();
706 	dev = __dev_getfirstbyhwtype(net, type);
707 	if (dev)
708 		dev_hold(dev);
709 	rtnl_unlock();
710 	return dev;
711 }
712 
713 EXPORT_SYMBOL(dev_getfirstbyhwtype);
714 
715 /**
716  *	dev_get_by_flags - find any device with given flags
717  *	@net: the applicable net namespace
718  *	@if_flags: IFF_* values
719  *	@mask: bitmask of bits in if_flags to check
720  *
721  *	Search for any interface with the given flags. Returns a pointer to
722  *	the device if one is found, otherwise %NULL. The device returned has
723  *	had a reference added and the pointer is safe until the user calls
724  *	dev_put to indicate they have finished with it.
725  */
726 
727 struct net_device * dev_get_by_flags(struct net *net, unsigned short if_flags, unsigned short mask)
728 {
729 	struct net_device *dev, *ret;
730 
731 	ret = NULL;
732 	read_lock(&dev_base_lock);
733 	for_each_netdev(net, dev) {
734 		if (((dev->flags ^ if_flags) & mask) == 0) {
735 			dev_hold(dev);
736 			ret = dev;
737 			break;
738 		}
739 	}
740 	read_unlock(&dev_base_lock);
741 	return ret;
742 }
743 
744 /**
745  *	dev_valid_name - check if name is okay for network device
746  *	@name: name string
747  *
748  *	Network device names need to be valid file names to
749  *	allow sysfs to work.  We also disallow any kind of
750  *	whitespace.
751  */
752 int dev_valid_name(const char *name)
753 {
754 	if (*name == '\0')
755 		return 0;
756 	if (strlen(name) >= IFNAMSIZ)
757 		return 0;
758 	if (!strcmp(name, ".") || !strcmp(name, ".."))
759 		return 0;
760 
761 	while (*name) {
762 		if (*name == '/' || isspace(*name))
763 			return 0;
764 		name++;
765 	}
766 	return 1;
767 }
768 
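/*
 * Example (illustrative): "eth0" and "wlan%d" are accepted by
 * dev_valid_name(), while "", ".", "..", "a/b", names containing
 * whitespace, and names of IFNAMSIZ or more characters are rejected.
 */
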
769 /**
770  *	__dev_alloc_name - allocate a name for a device
771  *	@net: network namespace to allocate the device name in
772  *	@name: name format string
773  *	@buf:  scratch buffer and result name string
774  *
775  *	Passed a format string - eg "lt%d" it will try and find a suitable
776  *	id. It scans list of devices to build up a free map, then chooses
777  *	the first empty slot. The caller must hold the dev_base or rtnl lock
778  *	while allocating the name and adding the device in order to avoid
779  *	duplicates.
780  *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
781  *	Returns the number of the unit assigned or a negative errno code.
782  */
783 
784 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
785 {
786 	int i = 0;
787 	const char *p;
788 	const int max_netdevices = 8*PAGE_SIZE;
789 	unsigned long *inuse;
790 	struct net_device *d;
791 
792 	p = strnchr(name, IFNAMSIZ-1, '%');
793 	if (p) {
794 		/*
795 		 * Verify the string as this thing may have come from
796 		 * the user.  There must be either one "%d" and no other "%"
797 		 * characters.
798 		 */
799 		if (p[1] != 'd' || strchr(p + 2, '%'))
800 			return -EINVAL;
801 
802 		/* Use one page as a bit array of possible slots */
803 		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
804 		if (!inuse)
805 			return -ENOMEM;
806 
807 		for_each_netdev(net, d) {
808 			if (!sscanf(d->name, name, &i))
809 				continue;
810 			if (i < 0 || i >= max_netdevices)
811 				continue;
812 
813 			/*  avoid cases where sscanf is not exact inverse of printf */
814 			snprintf(buf, IFNAMSIZ, name, i);
815 			if (!strncmp(buf, d->name, IFNAMSIZ))
816 				set_bit(i, inuse);
817 		}
818 
819 		i = find_first_zero_bit(inuse, max_netdevices);
820 		free_page((unsigned long) inuse);
821 	}
822 
823 	snprintf(buf, IFNAMSIZ, name, i);
824 	if (!__dev_get_by_name(net, buf))
825 		return i;
826 
827 	/* It is possible to run out of possible slots
828 	 * when the name is long and there isn't enough space left
829 	 * for the digits, or if all bits are used.
830 	 */
831 	return -ENFILE;
832 }
833 
834 /**
835  *	dev_alloc_name - allocate a name for a device
836  *	@dev: device
837  *	@name: name format string
838  *
839  *	Passed a format string - eg "lt%d" it will try and find a suitable
840  *	id. It scans list of devices to build up a free map, then chooses
841  *	the first empty slot. The caller must hold the dev_base or rtnl lock
842  *	while allocating the name and adding the device in order to avoid
843  *	duplicates.
844  *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
845  *	Returns the number of the unit assigned or a negative errno code.
846  */
847 
848 int dev_alloc_name(struct net_device *dev, const char *name)
849 {
850 	char buf[IFNAMSIZ];
851 	struct net *net;
852 	int ret;
853 
854 	BUG_ON(!dev->nd_net);
855 	net = dev->nd_net;
856 	ret = __dev_alloc_name(net, name, buf);
857 	if (ret >= 0)
858 		strlcpy(dev->name, buf, IFNAMSIZ);
859 	return ret;
860 }
861 
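/*
 * Example usage (illustrative): a driver that does not care about the unit
 * number lets the core pick the first free one:
 *
 *	err = dev_alloc_name(dev, "eth%d");
 *	if (err < 0)
 *		goto fail;
 *
 * On success dev->name holds the chosen name (e.g. "eth2") and the unit
 * number is returned.
 */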
862 
863 /**
864  *	dev_change_name - change name of a device
865  *	@dev: device
866  *	@newname: name (or format string) must be at least IFNAMSIZ
867  *
868  *	Change the name of a device. A format string such as "eth%d"
869  *	may be passed for wildcarding.
870  */
871 int dev_change_name(struct net_device *dev, char *newname)
872 {
873 	char oldname[IFNAMSIZ];
874 	int err = 0;
875 	int ret;
876 	struct net *net;
877 
878 	ASSERT_RTNL();
879 	BUG_ON(!dev->nd_net);
880 
881 	net = dev->nd_net;
882 	if (dev->flags & IFF_UP)
883 		return -EBUSY;
884 
885 	if (!dev_valid_name(newname))
886 		return -EINVAL;
887 
888 	memcpy(oldname, dev->name, IFNAMSIZ);
889 
890 	if (strchr(newname, '%')) {
891 		err = dev_alloc_name(dev, newname);
892 		if (err < 0)
893 			return err;
894 		strcpy(newname, dev->name);
895 	}
896 	else if (__dev_get_by_name(net, newname))
897 		return -EEXIST;
898 	else
899 		strlcpy(dev->name, newname, IFNAMSIZ);
900 
901 rollback:
902 	device_rename(&dev->dev, dev->name);
903 
904 	write_lock_bh(&dev_base_lock);
905 	hlist_del(&dev->name_hlist);
906 	hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name));
907 	write_unlock_bh(&dev_base_lock);
908 
909 	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
910 	ret = notifier_to_errno(ret);
911 
912 	if (ret) {
913 		if (err) {
914 			printk(KERN_ERR
915 			       "%s: name change rollback failed: %d.\n",
916 			       dev->name, ret);
917 		} else {
918 			err = ret;
919 			memcpy(dev->name, oldname, IFNAMSIZ);
920 			goto rollback;
921 		}
922 	}
923 
924 	return err;
925 }
926 
927 /**
928  *	netdev_features_change - device changes features
929  *	@dev: device to cause notification
930  *
931  *	Called to indicate a device has changed features.
932  */
933 void netdev_features_change(struct net_device *dev)
934 {
935 	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
936 }
937 EXPORT_SYMBOL(netdev_features_change);
938 
939 /**
940  *	netdev_state_change - device changes state
941  *	@dev: device to cause notification
942  *
943  *	Called to indicate a device has changed state. This function calls
944  *	the netdev_chain notifier chain and sends a NEWLINK message
945  *	to the routing socket.
946  */
947 void netdev_state_change(struct net_device *dev)
948 {
949 	if (dev->flags & IFF_UP) {
950 		call_netdevice_notifiers(NETDEV_CHANGE, dev);
951 		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
952 	}
953 }
954 
955 /**
956  *	dev_load 	- load a network module
957  *	@net: the applicable net namespace
958  *	@name: name of interface
959  *
960  *	If a network interface is not present and the process has suitable
961  *	privileges, this function loads the module. If module loading is not
962  *	available in this kernel then it becomes a nop.
963  */
964 
965 void dev_load(struct net *net, const char *name)
966 {
967 	struct net_device *dev;
968 
969 	read_lock(&dev_base_lock);
970 	dev = __dev_get_by_name(net, name);
971 	read_unlock(&dev_base_lock);
972 
973 	if (!dev && capable(CAP_SYS_MODULE))
974 		request_module("%s", name);
975 }
976 
977 /**
978  *	dev_open	- prepare an interface for use.
979  *	@dev:	device to open
980  *
981  *	Takes a device from down to up state. The device's private open
982  *	function is invoked and then the multicast lists are loaded. Finally
983  *	the device is moved into the up state and a %NETDEV_UP message is
984  *	sent to the netdev notifier chain.
985  *
986  *	Calling this function on an active interface is a nop. On a failure
987  *	a negative errno code is returned.
988  */
989 int dev_open(struct net_device *dev)
990 {
991 	int ret = 0;
992 
993 	/*
994 	 *	Is it already up?
995 	 */
996 
997 	if (dev->flags & IFF_UP)
998 		return 0;
999 
1000 	/*
1001 	 *	Is it even present?
1002 	 */
1003 	if (!netif_device_present(dev))
1004 		return -ENODEV;
1005 
1006 	/*
1007 	 *	Call device private open method
1008 	 */
1009 	set_bit(__LINK_STATE_START, &dev->state);
1010 	if (dev->open) {
1011 		ret = dev->open(dev);
1012 		if (ret)
1013 			clear_bit(__LINK_STATE_START, &dev->state);
1014 	}
1015 
1016 	/*
1017 	 *	If it went open OK then:
1018 	 */
1019 
1020 	if (!ret) {
1021 		/*
1022 		 *	Set the flags.
1023 		 */
1024 		dev->flags |= IFF_UP;
1025 
1026 		/*
1027 		 *	Initialize multicasting status
1028 		 */
1029 		dev_set_rx_mode(dev);
1030 
1031 		/*
1032 		 *	Wakeup transmit queue engine
1033 		 */
1034 		dev_activate(dev);
1035 
1036 		/*
1037 		 *	... and announce new interface.
1038 		 */
1039 		call_netdevice_notifiers(NETDEV_UP, dev);
1040 	}
1041 	return ret;
1042 }
1043 
1044 /**
1045  *	dev_close - shutdown an interface.
1046  *	@dev: device to shutdown
1047  *
1048  *	This function moves an active device into down state. A
1049  *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1050  *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1051  *	chain.
1052  */
1053 int dev_close(struct net_device *dev)
1054 {
1055 	might_sleep();
1056 
1057 	if (!(dev->flags & IFF_UP))
1058 		return 0;
1059 
1060 	/*
1061 	 *	Tell people we are going down, so that they can
1062 	 *	prepare for it while the device is still operating.
1063 	 */
1064 	call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1065 
1066 	dev_deactivate(dev);
1067 
1068 	clear_bit(__LINK_STATE_START, &dev->state);
1069 
1070 	/* Synchronize to scheduled poll. We cannot touch poll list,
1071 	 * it can be even on different cpu. So just clear netif_running().
1072 	 *
1073 	 * dev->stop() will invoke napi_disable() on all of its
1074 	 * napi_struct instances on this device.
1075 	 */
1076 	smp_mb__after_clear_bit(); /* Commit netif_running(). */
1077 
1078 	/*
1079 	 *	Call the device specific close. This cannot fail.
1080 	 *	Only if device is UP
1081 	 *
1082 	 *	We allow it to be called even after a DETACH hot-plug
1083 	 *	event.
1084 	 */
1085 	if (dev->stop)
1086 		dev->stop(dev);
1087 
1088 	/*
1089 	 *	Device is now down.
1090 	 */
1091 
1092 	dev->flags &= ~IFF_UP;
1093 
1094 	/*
1095 	 * Tell people we are down
1096 	 */
1097 	call_netdevice_notifiers(NETDEV_DOWN, dev);
1098 
1099 	return 0;
1100 }
1101 
1102 
1103 static int dev_boot_phase = 1;
1104 
1105 /*
1106  *	Device change register/unregister. These are not inline or static
1107  *	as we export them to the world.
1108  */
1109 
1110 /**
1111  *	register_netdevice_notifier - register a network notifier block
1112  *	@nb: notifier
1113  *
1114  *	Register a notifier to be called when network device events occur.
1115  *	The notifier passed is linked into the kernel structures and must
1116  *	not be reused until it has been unregistered. A negative errno code
1117  *	is returned on a failure.
1118  *
1119  * 	When registered, all registration and up events are replayed
1120  *	to the new notifier so that it has a race-free
1121  *	view of the network device list.
1122  */
1123 
1124 int register_netdevice_notifier(struct notifier_block *nb)
1125 {
1126 	struct net_device *dev;
1127 	struct net_device *last;
1128 	struct net *net;
1129 	int err;
1130 
1131 	rtnl_lock();
1132 	err = raw_notifier_chain_register(&netdev_chain, nb);
1133 	if (err)
1134 		goto unlock;
1135 	if (dev_boot_phase)
1136 		goto unlock;
1137 	for_each_net(net) {
1138 		for_each_netdev(net, dev) {
1139 			err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1140 			err = notifier_to_errno(err);
1141 			if (err)
1142 				goto rollback;
1143 
1144 			if (!(dev->flags & IFF_UP))
1145 				continue;
1146 
1147 			nb->notifier_call(nb, NETDEV_UP, dev);
1148 		}
1149 	}
1150 
1151 unlock:
1152 	rtnl_unlock();
1153 	return err;
1154 
1155 rollback:
1156 	last = dev;
1157 	for_each_net(net) {
1158 		for_each_netdev(net, dev) {
1159 			if (dev == last)
1160 				break;
1161 
1162 			if (dev->flags & IFF_UP) {
1163 				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1164 				nb->notifier_call(nb, NETDEV_DOWN, dev);
1165 			}
1166 			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1167 		}
1168 	}
1169 	goto unlock;
1170 }
1171 
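/*
 * Example usage (illustrative sketch; the callback and block names are
 * hypothetical):
 *
 *	static int example_netdev_event(struct notifier_block *nb,
 *					unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = ptr;
 *
 *		switch (event) {
 *		case NETDEV_UP:
 *			printk(KERN_INFO "%s is up\n", dev->name);
 *			break;
 *		case NETDEV_DOWN:
 *			printk(KERN_INFO "%s is down\n", dev->name);
 *			break;
 *		}
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block example_netdev_notifier = {
 *		.notifier_call = example_netdev_event,
 *	};
 *
 *	register_netdevice_notifier(&example_netdev_notifier);
 */
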
1172 /**
1173  *	unregister_netdevice_notifier - unregister a network notifier block
1174  *	@nb: notifier
1175  *
1176  *	Unregister a notifier previously registered by
1177  *	register_netdevice_notifier(). The notifier is unlinked from the
1178  *	kernel structures and may then be reused. A negative errno code
1179  *	is returned on a failure.
1180  */
1181 
1182 int unregister_netdevice_notifier(struct notifier_block *nb)
1183 {
1184 	int err;
1185 
1186 	rtnl_lock();
1187 	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1188 	rtnl_unlock();
1189 	return err;
1190 }
1191 
1192 /**
1193  *	call_netdevice_notifiers - call all network notifier blocks
1194  *      @val: value passed unmodified to notifier function
1195  *      @dev: net_device pointer passed unmodified to notifier function
1196  *
1197  *	Call all network notifier blocks.  Parameters and return value
1198  *	are as for raw_notifier_call_chain().
1199  */
1200 
1201 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1202 {
1203 	return raw_notifier_call_chain(&netdev_chain, val, dev);
1204 }
1205 
1206 /* When > 0 there are consumers of rx skb time stamps */
1207 static atomic_t netstamp_needed = ATOMIC_INIT(0);
1208 
1209 void net_enable_timestamp(void)
1210 {
1211 	atomic_inc(&netstamp_needed);
1212 }
1213 
1214 void net_disable_timestamp(void)
1215 {
1216 	atomic_dec(&netstamp_needed);
1217 }
1218 
1219 static inline void net_timestamp(struct sk_buff *skb)
1220 {
1221 	if (atomic_read(&netstamp_needed))
1222 		__net_timestamp(skb);
1223 	else
1224 		skb->tstamp.tv64 = 0;
1225 }
1226 
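/*
 * Example (illustrative): a consumer of receive timestamps, such as a
 * packet socket, brackets its lifetime with
 *
 *	net_enable_timestamp();
 *	...
 *	net_disable_timestamp();
 *
 * so that net_timestamp() only pays for the real-time clock read while
 * someone actually needs the stamps.
 */
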
1227 /*
1228  *	Support routine. Sends outgoing frames to any network
1229  *	taps currently in use.
1230  */
1231 
1232 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1233 {
1234 	struct packet_type *ptype;
1235 
1236 	net_timestamp(skb);
1237 
1238 	rcu_read_lock();
1239 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
1240 		/* Never send packets back to the socket
1241 		 * they originated from - MvS (miquels@drinkel.ow.org)
1242 		 */
1243 		if ((ptype->dev == dev || !ptype->dev) &&
1244 		    (ptype->af_packet_priv == NULL ||
1245 		     (struct sock *)ptype->af_packet_priv != skb->sk)) {
1246 			struct sk_buff *skb2= skb_clone(skb, GFP_ATOMIC);
1247 			if (!skb2)
1248 				break;
1249 
1250 			/* The network header should be correctly
1251 			   set by the sender, so the check below is
1252 			   just protection against buggy protocols.
1253 			 */
1254 			skb_reset_mac_header(skb2);
1255 
1256 			if (skb_network_header(skb2) < skb2->data ||
1257 			    skb2->network_header > skb2->tail) {
1258 				if (net_ratelimit())
1259 					printk(KERN_CRIT "protocol %04x is "
1260 					       "buggy, dev %s\n",
1261 					       skb2->protocol, dev->name);
1262 				skb_reset_network_header(skb2);
1263 			}
1264 
1265 			skb2->transport_header = skb2->network_header;
1266 			skb2->pkt_type = PACKET_OUTGOING;
1267 			ptype->func(skb2, skb->dev, ptype, skb->dev);
1268 		}
1269 	}
1270 	rcu_read_unlock();
1271 }
1272 
1273 
1274 void __netif_schedule(struct net_device *dev)
1275 {
1276 	if (!test_and_set_bit(__LINK_STATE_SCHED, &dev->state)) {
1277 		unsigned long flags;
1278 		struct softnet_data *sd;
1279 
1280 		local_irq_save(flags);
1281 		sd = &__get_cpu_var(softnet_data);
1282 		dev->next_sched = sd->output_queue;
1283 		sd->output_queue = dev;
1284 		raise_softirq_irqoff(NET_TX_SOFTIRQ);
1285 		local_irq_restore(flags);
1286 	}
1287 }
1288 EXPORT_SYMBOL(__netif_schedule);
1289 
1290 void dev_kfree_skb_irq(struct sk_buff *skb)
1291 {
1292 	if (atomic_dec_and_test(&skb->users)) {
1293 		struct softnet_data *sd;
1294 		unsigned long flags;
1295 
1296 		local_irq_save(flags);
1297 		sd = &__get_cpu_var(softnet_data);
1298 		skb->next = sd->completion_queue;
1299 		sd->completion_queue = skb;
1300 		raise_softirq_irqoff(NET_TX_SOFTIRQ);
1301 		local_irq_restore(flags);
1302 	}
1303 }
1304 EXPORT_SYMBOL(dev_kfree_skb_irq);
1305 
1306 void dev_kfree_skb_any(struct sk_buff *skb)
1307 {
1308 	if (in_irq() || irqs_disabled())
1309 		dev_kfree_skb_irq(skb);
1310 	else
1311 		dev_kfree_skb(skb);
1312 }
1313 EXPORT_SYMBOL(dev_kfree_skb_any);
1314 
1315 
1316 /**
1317  * netif_device_detach - mark device as removed
1318  * @dev: network device
1319  *
1320  * Mark device as removed from system and therefore no longer available.
1321  * Mark device as removed from the system and therefore no longer available.
1322 void netif_device_detach(struct net_device *dev)
1323 {
1324 	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1325 	    netif_running(dev)) {
1326 		netif_stop_queue(dev);
1327 	}
1328 }
1329 EXPORT_SYMBOL(netif_device_detach);
1330 
1331 /**
1332  * netif_device_attach - mark device as attached
1333  * @dev: network device
1334  *
1335  * Mark device as attached to the system and restart the queue if needed.
1336  */
1337 void netif_device_attach(struct net_device *dev)
1338 {
1339 	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1340 	    netif_running(dev)) {
1341 		netif_wake_queue(dev);
1342 		__netdev_watchdog_up(dev);
1343 	}
1344 }
1345 EXPORT_SYMBOL(netif_device_attach);
1346 
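/*
 * Example (illustrative): a PCI driver typically calls netif_device_detach()
 * from its ->suspend() hook before powering the hardware down, and
 * netif_device_attach() from ->resume() once the hardware is usable again,
 * so that the core stops and restarts the transmit queue around the power
 * transition.
 */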
1347 
1348 /*
1349  * Invalidate hardware checksum when packet is to be mangled, and
1350  * complete checksum manually on outgoing path.
1351  */
1352 int skb_checksum_help(struct sk_buff *skb)
1353 {
1354 	__wsum csum;
1355 	int ret = 0, offset;
1356 
1357 	if (skb->ip_summed == CHECKSUM_COMPLETE)
1358 		goto out_set_summed;
1359 
1360 	if (unlikely(skb_shinfo(skb)->gso_size)) {
1361 		/* Let GSO fix up the checksum. */
1362 		goto out_set_summed;
1363 	}
1364 
1365 	if (skb_cloned(skb)) {
1366 		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1367 		if (ret)
1368 			goto out;
1369 	}
1370 
1371 	offset = skb->csum_start - skb_headroom(skb);
1372 	BUG_ON(offset > (int)skb->len);
1373 	csum = skb_checksum(skb, offset, skb->len-offset, 0);
1374 
1375 	offset = skb_headlen(skb) - offset;
1376 	BUG_ON(offset <= 0);
1377 	BUG_ON(skb->csum_offset + 2 > offset);
1378 
1379 	*(__sum16 *)(skb->head + skb->csum_start + skb->csum_offset) =
1380 		csum_fold(csum);
1381 out_set_summed:
1382 	skb->ip_summed = CHECKSUM_NONE;
1383 out:
1384 	return ret;
1385 }
1386 
1387 /**
1388  *	skb_gso_segment - Perform segmentation on skb.
1389  *	@skb: buffer to segment
1390  *	@features: features for the output path (see dev->features)
1391  *
1392  *	This function segments the given skb and returns a list of segments.
1393  *
1394  *	It may return NULL if the skb requires no segmentation.  This is
1395  *	only possible when GSO is used for verifying header integrity.
1396  */
1397 struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
1398 {
1399 	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1400 	struct packet_type *ptype;
1401 	__be16 type = skb->protocol;
1402 	int err;
1403 
1404 	BUG_ON(skb_shinfo(skb)->frag_list);
1405 
1406 	skb_reset_mac_header(skb);
1407 	skb->mac_len = skb->network_header - skb->mac_header;
1408 	__skb_pull(skb, skb->mac_len);
1409 
1410 	if (WARN_ON(skb->ip_summed != CHECKSUM_PARTIAL)) {
1411 		if (skb_header_cloned(skb) &&
1412 		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1413 			return ERR_PTR(err);
1414 	}
1415 
1416 	rcu_read_lock();
1417 	list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type) & 15], list) {
1418 		if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1419 			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1420 				err = ptype->gso_send_check(skb);
1421 				segs = ERR_PTR(err);
1422 				if (err || skb_gso_ok(skb, features))
1423 					break;
1424 				__skb_push(skb, (skb->data -
1425 						 skb_network_header(skb)));
1426 			}
1427 			segs = ptype->gso_segment(skb, features);
1428 			break;
1429 		}
1430 	}
1431 	rcu_read_unlock();
1432 
1433 	__skb_push(skb, skb->data - skb_mac_header(skb));
1434 
1435 	return segs;
1436 }
1437 
1438 EXPORT_SYMBOL(skb_gso_segment);
1439 
1440 /* Take action when hardware reception checksum errors are detected. */
1441 #ifdef CONFIG_BUG
1442 void netdev_rx_csum_fault(struct net_device *dev)
1443 {
1444 	if (net_ratelimit()) {
1445 		printk(KERN_ERR "%s: hw csum failure.\n",
1446 			dev ? dev->name : "<unknown>");
1447 		dump_stack();
1448 	}
1449 }
1450 EXPORT_SYMBOL(netdev_rx_csum_fault);
1451 #endif
1452 
1453 /* Actually, we should eliminate this check as soon as we know that:
1454  * 1. An IOMMU is present and allows mapping of all the memory.
1455  * 2. No high memory really exists on this machine.
1456  */
1457 
1458 static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
1459 {
1460 #ifdef CONFIG_HIGHMEM
1461 	int i;
1462 
1463 	if (dev->features & NETIF_F_HIGHDMA)
1464 		return 0;
1465 
1466 	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
1467 		if (PageHighMem(skb_shinfo(skb)->frags[i].page))
1468 			return 1;
1469 
1470 #endif
1471 	return 0;
1472 }
1473 
1474 struct dev_gso_cb {
1475 	void (*destructor)(struct sk_buff *skb);
1476 };
1477 
1478 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1479 
1480 static void dev_gso_skb_destructor(struct sk_buff *skb)
1481 {
1482 	struct dev_gso_cb *cb;
1483 
1484 	do {
1485 		struct sk_buff *nskb = skb->next;
1486 
1487 		skb->next = nskb->next;
1488 		nskb->next = NULL;
1489 		kfree_skb(nskb);
1490 	} while (skb->next);
1491 
1492 	cb = DEV_GSO_CB(skb);
1493 	if (cb->destructor)
1494 		cb->destructor(skb);
1495 }
1496 
1497 /**
1498  *	dev_gso_segment - Perform emulated hardware segmentation on skb.
1499  *	@skb: buffer to segment
1500  *
1501  *	This function segments the given skb and stores the list of segments
1502  *	in skb->next.
1503  */
1504 static int dev_gso_segment(struct sk_buff *skb)
1505 {
1506 	struct net_device *dev = skb->dev;
1507 	struct sk_buff *segs;
1508 	int features = dev->features & ~(illegal_highdma(dev, skb) ?
1509 					 NETIF_F_SG : 0);
1510 
1511 	segs = skb_gso_segment(skb, features);
1512 
1513 	/* Verifying header integrity only. */
1514 	if (!segs)
1515 		return 0;
1516 
1517 	if (unlikely(IS_ERR(segs)))
1518 		return PTR_ERR(segs);
1519 
1520 	skb->next = segs;
1521 	DEV_GSO_CB(skb)->destructor = skb->destructor;
1522 	skb->destructor = dev_gso_skb_destructor;
1523 
1524 	return 0;
1525 }
1526 
1527 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
1528 {
1529 	if (likely(!skb->next)) {
1530 		if (!list_empty(&ptype_all))
1531 			dev_queue_xmit_nit(skb, dev);
1532 
1533 		if (netif_needs_gso(dev, skb)) {
1534 			if (unlikely(dev_gso_segment(skb)))
1535 				goto out_kfree_skb;
1536 			if (skb->next)
1537 				goto gso;
1538 		}
1539 
1540 		return dev->hard_start_xmit(skb, dev);
1541 	}
1542 
1543 gso:
1544 	do {
1545 		struct sk_buff *nskb = skb->next;
1546 		int rc;
1547 
1548 		skb->next = nskb->next;
1549 		nskb->next = NULL;
1550 		rc = dev->hard_start_xmit(nskb, dev);
1551 		if (unlikely(rc)) {
1552 			nskb->next = skb->next;
1553 			skb->next = nskb;
1554 			return rc;
1555 		}
1556 		if (unlikely((netif_queue_stopped(dev) ||
1557 			     netif_subqueue_stopped(dev, skb->queue_mapping)) &&
1558 			     skb->next))
1559 			return NETDEV_TX_BUSY;
1560 	} while (skb->next);
1561 
1562 	skb->destructor = DEV_GSO_CB(skb)->destructor;
1563 
1564 out_kfree_skb:
1565 	kfree_skb(skb);
1566 	return 0;
1567 }
1568 
1569 /**
1570  *	dev_queue_xmit - transmit a buffer
1571  *	@skb: buffer to transmit
1572  *
1573  *	Queue a buffer for transmission to a network device. The caller must
1574  *	have set the device and priority and built the buffer before calling
1575  *	this function. The function can be called from an interrupt.
1576  *
1577  *	A negative errno code is returned on a failure. A success does not
1578  *	guarantee the frame will be transmitted as it may be dropped due
1579  *	to congestion or traffic shaping.
1580  *
1581  * -----------------------------------------------------------------------------------
1582  *      I notice this method can also return errors from the queue disciplines,
1583  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
1584  *      be positive.
1585  *
1586  *      Regardless of the return value, the skb is consumed, so it is currently
1587  *      difficult to retry a send to this method.  (You can bump the ref count
1588  *      before sending to hold a reference for retry if you are careful.)
1589  *
1590  *      When calling this method, interrupts MUST be enabled.  This is because
1591  *      the BH enable code must have IRQs enabled so that it will not deadlock.
1592  *          --BLG
1593  */
1594 
1595 int dev_queue_xmit(struct sk_buff *skb)
1596 {
1597 	struct net_device *dev = skb->dev;
1598 	struct Qdisc *q;
1599 	int rc = -ENOMEM;
1600 
1601 	/* GSO will handle the following emulations directly. */
1602 	if (netif_needs_gso(dev, skb))
1603 		goto gso;
1604 
1605 	if (skb_shinfo(skb)->frag_list &&
1606 	    !(dev->features & NETIF_F_FRAGLIST) &&
1607 	    __skb_linearize(skb))
1608 		goto out_kfree_skb;
1609 
1610 	/* Fragmented skb is linearized if device does not support SG,
1611 	 * or if at least one of fragments is in highmem and device
1612 	 * does not support DMA from it.
1613 	 */
1614 	if (skb_shinfo(skb)->nr_frags &&
1615 	    (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) &&
1616 	    __skb_linearize(skb))
1617 		goto out_kfree_skb;
1618 
1619 	/* If packet is not checksummed and device does not support
1620 	 * checksumming for this protocol, complete checksumming here.
1621 	 */
1622 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
1623 		skb_set_transport_header(skb, skb->csum_start -
1624 					      skb_headroom(skb));
1625 
1626 		if (!(dev->features & NETIF_F_GEN_CSUM) &&
1627 		    !((dev->features & NETIF_F_IP_CSUM) &&
1628 		      skb->protocol == htons(ETH_P_IP)) &&
1629 		    !((dev->features & NETIF_F_IPV6_CSUM) &&
1630 		      skb->protocol == htons(ETH_P_IPV6)))
1631 			if (skb_checksum_help(skb))
1632 				goto out_kfree_skb;
1633 	}
1634 
1635 gso:
1636 	spin_lock_prefetch(&dev->queue_lock);
1637 
1638 	/* Disable soft irqs for various locks below. Also
1639 	 * stops preemption for RCU.
1640 	 */
1641 	rcu_read_lock_bh();
1642 
1643 	/* Updates of qdisc are serialized by queue_lock.
1644 	 * The struct Qdisc which is pointed to by qdisc is now a
1645 	 * rcu structure - it may be accessed without acquiring
1646 	 * a lock (but the structure may be stale.) The freeing of the
1647 	 * qdisc will be deferred until it's known that there are no
1648 	 * more references to it.
1649 	 *
1650 	 * If the qdisc has an enqueue function, we still need to
1651 	 * hold the queue_lock before calling it, since queue_lock
1652 	 * also serializes access to the device queue.
1653 	 */
1654 
1655 	q = rcu_dereference(dev->qdisc);
1656 #ifdef CONFIG_NET_CLS_ACT
1657 	skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_EGRESS);
1658 #endif
1659 	if (q->enqueue) {
1660 		/* Grab device queue */
1661 		spin_lock(&dev->queue_lock);
1662 		q = dev->qdisc;
1663 		if (q->enqueue) {
1664 			/* reset queue_mapping to zero */
1665 			skb->queue_mapping = 0;
1666 			rc = q->enqueue(skb, q);
1667 			qdisc_run(dev);
1668 			spin_unlock(&dev->queue_lock);
1669 
1670 			rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc;
1671 			goto out;
1672 		}
1673 		spin_unlock(&dev->queue_lock);
1674 	}
1675 
1676 	/* The device has no queue. This is the common case for software
1677 	   devices: loopback, all sorts of tunnels...
1678 
1679 	   Really, it is unlikely that netif_tx_lock protection is necessary
1680 	   here.  (e.g. loopback and IP tunnels are clean, ignoring the
1681 	   statistics counters.)
1682 	   However, it is possible that they rely on the protection
1683 	   we provide here.
1684 
1685 	   Check this and take the lock anyway; it is not prone to deadlocks.
1686 	   Alternatively, use the noqueue qdisc, which is even simpler 8)
1687 	 */
1688 	if (dev->flags & IFF_UP) {
1689 		int cpu = smp_processor_id(); /* ok because BHs are off */
1690 
1691 		if (dev->xmit_lock_owner != cpu) {
1692 
1693 			HARD_TX_LOCK(dev, cpu);
1694 
1695 			if (!netif_queue_stopped(dev) &&
1696 			    !netif_subqueue_stopped(dev, skb->queue_mapping)) {
1697 				rc = 0;
1698 				if (!dev_hard_start_xmit(skb, dev)) {
1699 					HARD_TX_UNLOCK(dev);
1700 					goto out;
1701 				}
1702 			}
1703 			HARD_TX_UNLOCK(dev);
1704 			if (net_ratelimit())
1705 				printk(KERN_CRIT "Virtual device %s asks to "
1706 				       "queue packet!\n", dev->name);
1707 		} else {
1708 			/* Recursion is detected! It is possible,
1709 			 * unfortunately */
1710 			if (net_ratelimit())
1711 				printk(KERN_CRIT "Dead loop on virtual device "
1712 				       "%s, fix it urgently!\n", dev->name);
1713 		}
1714 	}
1715 
1716 	rc = -ENETDOWN;
1717 	rcu_read_unlock_bh();
1718 
1719 out_kfree_skb:
1720 	kfree_skb(skb);
1721 	return rc;
1722 out:
1723 	rcu_read_unlock_bh();
1724 	return rc;
1725 }
1726 
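/*
 * Example usage (illustrative sketch; error handling and link-layer header
 * construction are elided): a caller that builds a raw frame itself sets
 * skb->dev and the protocol before handing the buffer to dev_queue_xmit():
 *
 *	struct sk_buff *skb = alloc_skb(len + LL_RESERVED_SPACE(dev),
 *					GFP_ATOMIC);
 *
 *	skb_reserve(skb, LL_RESERVED_SPACE(dev));
 *	skb->dev = dev;
 *	skb->protocol = htons(ETH_P_IP);
 *	... fill in the link-layer header and payload ...
 *	dev_queue_xmit(skb);
 *
 * Ownership of the skb passes to the stack regardless of the return value.
 */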
1727 
1728 /*=======================================================================
1729 			Receiver routines
1730   =======================================================================*/
1731 
1732 int netdev_max_backlog __read_mostly = 1000;
1733 int netdev_budget __read_mostly = 300;
1734 int weight_p __read_mostly = 64;            /* old backlog weight */
1735 
1736 DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
1737 
1738 
1739 /**
1740  *	netif_rx	-	post buffer to the network code
1741  *	@skb: buffer to post
1742  *
1743  *	This function receives a packet from a device driver and queues it for
1744  *	the upper (protocol) levels to process.  It always succeeds. The buffer
1745  *	may be dropped during processing for congestion control or by the
1746  *	protocol layers.
1747  *
1748  *	return values:
1749  *	NET_RX_SUCCESS	(no congestion)
1750  *	NET_RX_CN_LOW   (low congestion)
1751  *	NET_RX_CN_MOD   (moderate congestion)
1752  *	NET_RX_CN_HIGH  (high congestion)
1753  *	NET_RX_DROP     (packet was dropped)
1754  *
1755  */
1756 
1757 int netif_rx(struct sk_buff *skb)
1758 {
1759 	struct softnet_data *queue;
1760 	unsigned long flags;
1761 
1762 	/* if netpoll wants it, pretend we never saw it */
1763 	if (netpoll_rx(skb))
1764 		return NET_RX_DROP;
1765 
1766 	if (!skb->tstamp.tv64)
1767 		net_timestamp(skb);
1768 
1769 	/*
1770 	 * The code is rearranged so that the path is the shortest
1771 	 * when the CPU is congested but still operating.
1772 	 */
1773 	local_irq_save(flags);
1774 	queue = &__get_cpu_var(softnet_data);
1775 
1776 	__get_cpu_var(netdev_rx_stat).total++;
1777 	if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
1778 		if (queue->input_pkt_queue.qlen) {
1779 enqueue:
1780 			dev_hold(skb->dev);
1781 			__skb_queue_tail(&queue->input_pkt_queue, skb);
1782 			local_irq_restore(flags);
1783 			return NET_RX_SUCCESS;
1784 		}
1785 
1786 		napi_schedule(&queue->backlog);
1787 		goto enqueue;
1788 	}
1789 
1790 	__get_cpu_var(netdev_rx_stat).dropped++;
1791 	local_irq_restore(flags);
1792 
1793 	kfree_skb(skb);
1794 	return NET_RX_DROP;
1795 }
1796 
1797 int netif_rx_ni(struct sk_buff *skb)
1798 {
1799 	int err;
1800 
1801 	preempt_disable();
1802 	err = netif_rx(skb);
1803 	if (local_softirq_pending())
1804 		do_softirq();
1805 	preempt_enable();
1806 
1807 	return err;
1808 }
1809 
1810 EXPORT_SYMBOL(netif_rx_ni);
1811 
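/*
 * Example (illustrative): a non-NAPI driver's receive interrupt handler
 * typically builds an skb from the hardware buffer and feeds it to the
 * stack like this (the buffer name is hypothetical and checks are elided):
 *
 *	skb = dev_alloc_skb(pkt_len + 2);
 *	skb_reserve(skb, 2);			(aligns the IP header)
 *	memcpy(skb_put(skb, pkt_len), rx_buf, pkt_len);
 *	skb->protocol = eth_type_trans(skb, dev);
 *	netif_rx(skb);
 *
 * netif_rx_ni() is used instead when the caller runs in process context.
 */
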
1812 static inline struct net_device *skb_bond(struct sk_buff *skb)
1813 {
1814 	struct net_device *dev = skb->dev;
1815 
1816 	if (dev->master) {
1817 		if (skb_bond_should_drop(skb)) {
1818 			kfree_skb(skb);
1819 			return NULL;
1820 		}
1821 		skb->dev = dev->master;
1822 	}
1823 
1824 	return dev;
1825 }
1826 
1827 
1828 static void net_tx_action(struct softirq_action *h)
1829 {
1830 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
1831 
1832 	if (sd->completion_queue) {
1833 		struct sk_buff *clist;
1834 
1835 		local_irq_disable();
1836 		clist = sd->completion_queue;
1837 		sd->completion_queue = NULL;
1838 		local_irq_enable();
1839 
1840 		while (clist) {
1841 			struct sk_buff *skb = clist;
1842 			clist = clist->next;
1843 
1844 			BUG_TRAP(!atomic_read(&skb->users));
1845 			__kfree_skb(skb);
1846 		}
1847 	}
1848 
1849 	if (sd->output_queue) {
1850 		struct net_device *head;
1851 
1852 		local_irq_disable();
1853 		head = sd->output_queue;
1854 		sd->output_queue = NULL;
1855 		local_irq_enable();
1856 
1857 		while (head) {
1858 			struct net_device *dev = head;
1859 			head = head->next_sched;
1860 
1861 			smp_mb__before_clear_bit();
1862 			clear_bit(__LINK_STATE_SCHED, &dev->state);
1863 
1864 			if (spin_trylock(&dev->queue_lock)) {
1865 				qdisc_run(dev);
1866 				spin_unlock(&dev->queue_lock);
1867 			} else {
1868 				netif_schedule(dev);
1869 			}
1870 		}
1871 	}
1872 }
1873 
1874 static inline int deliver_skb(struct sk_buff *skb,
1875 			      struct packet_type *pt_prev,
1876 			      struct net_device *orig_dev)
1877 {
1878 	atomic_inc(&skb->users);
1879 	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1880 }
1881 
1882 #if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
1883 /* These hooks are defined here for ATM */
1884 struct net_bridge;
1885 struct net_bridge_fdb_entry *(*br_fdb_get_hook)(struct net_bridge *br,
1886 						unsigned char *addr);
1887 void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent) __read_mostly;
1888 
1889 /*
1890  * If the bridge module is loaded, call the bridging hook.
1891  * Returns NULL if the packet was consumed.
1892  */
1893 struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p,
1894 					struct sk_buff *skb) __read_mostly;
1895 static inline struct sk_buff *handle_bridge(struct sk_buff *skb,
1896 					    struct packet_type **pt_prev, int *ret,
1897 					    struct net_device *orig_dev)
1898 {
1899 	struct net_bridge_port *port;
1900 
1901 	if (skb->pkt_type == PACKET_LOOPBACK ||
1902 	    (port = rcu_dereference(skb->dev->br_port)) == NULL)
1903 		return skb;
1904 
1905 	if (*pt_prev) {
1906 		*ret = deliver_skb(skb, *pt_prev, orig_dev);
1907 		*pt_prev = NULL;
1908 	}
1909 
1910 	return br_handle_frame_hook(port, skb);
1911 }
1912 #else
1913 #define handle_bridge(skb, pt_prev, ret, orig_dev)	(skb)
1914 #endif
1915 
1916 #if defined(CONFIG_MACVLAN) || defined(CONFIG_MACVLAN_MODULE)
1917 struct sk_buff *(*macvlan_handle_frame_hook)(struct sk_buff *skb) __read_mostly;
1918 EXPORT_SYMBOL_GPL(macvlan_handle_frame_hook);
1919 
1920 static inline struct sk_buff *handle_macvlan(struct sk_buff *skb,
1921 					     struct packet_type **pt_prev,
1922 					     int *ret,
1923 					     struct net_device *orig_dev)
1924 {
1925 	if (skb->dev->macvlan_port == NULL)
1926 		return skb;
1927 
1928 	if (*pt_prev) {
1929 		*ret = deliver_skb(skb, *pt_prev, orig_dev);
1930 		*pt_prev = NULL;
1931 	}
1932 	return macvlan_handle_frame_hook(skb);
1933 }
1934 #else
1935 #define handle_macvlan(skb, pt_prev, ret, orig_dev)	(skb)
1936 #endif
1937 
1938 #ifdef CONFIG_NET_CLS_ACT
1939 /* TODO: Maybe we should just force sch_ingress to be compiled in
1940  * when CONFIG_NET_CLS_ACT is.  Otherwise we pay for a compare and
1941  * two extra stores for nothing when CONFIG_NET_CLS_ACT is enabled
1942  * but the ingress scheduler is not used.
1943  * NOTE: This doesn't remove any functionality; without the ingress
1944  * scheduler you just can't add policies on ingress.
1945  *
1946  */
1947 static int ing_filter(struct sk_buff *skb)
1948 {
1949 	struct Qdisc *q;
1950 	struct net_device *dev = skb->dev;
1951 	int result = TC_ACT_OK;
1952 
1953 	if (dev->qdisc_ingress) {
1954 		__u32 ttl = (__u32) G_TC_RTTL(skb->tc_verd);
1955 		if (MAX_RED_LOOP < ttl++) {
1956 			printk(KERN_WARNING "Redir loop detected, dropping packet (%d->%d)\n",
1957 				skb->iif, skb->dev->ifindex);
1958 			return TC_ACT_SHOT;
1959 		}
1960 
1961 		skb->tc_verd = SET_TC_RTTL(skb->tc_verd,ttl);
1962 
1963 		skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_INGRESS);
1964 
1965 		spin_lock(&dev->ingress_lock);
1966 		if ((q = dev->qdisc_ingress) != NULL)
1967 			result = q->enqueue(skb, q);
1968 		spin_unlock(&dev->ingress_lock);
1969 
1970 	}
1971 
1972 	return result;
1973 }
1974 #endif
1975 
1976 int netif_receive_skb(struct sk_buff *skb)
1977 {
1978 	struct packet_type *ptype, *pt_prev;
1979 	struct net_device *orig_dev;
1980 	int ret = NET_RX_DROP;
1981 	__be16 type;
1982 
1983 	/* if we've gotten here through NAPI, check netpoll */
1984 	if (netpoll_receive_skb(skb))
1985 		return NET_RX_DROP;
1986 
1987 	if (!skb->tstamp.tv64)
1988 		net_timestamp(skb);
1989 
1990 	if (!skb->iif)
1991 		skb->iif = skb->dev->ifindex;
1992 
1993 	orig_dev = skb_bond(skb);
1994 
1995 	if (!orig_dev)
1996 		return NET_RX_DROP;
1997 
1998 	__get_cpu_var(netdev_rx_stat).total++;
1999 
2000 	skb_reset_network_header(skb);
2001 	skb_reset_transport_header(skb);
2002 	skb->mac_len = skb->network_header - skb->mac_header;
2003 
2004 	pt_prev = NULL;
2005 
2006 	rcu_read_lock();
2007 
2008 #ifdef CONFIG_NET_CLS_ACT
2009 	if (skb->tc_verd & TC_NCLS) {
2010 		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
2011 		goto ncls;
2012 	}
2013 #endif
2014 
2015 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
2016 		if (!ptype->dev || ptype->dev == skb->dev) {
2017 			if (pt_prev)
2018 				ret = deliver_skb(skb, pt_prev, orig_dev);
2019 			pt_prev = ptype;
2020 		}
2021 	}
2022 
2023 #ifdef CONFIG_NET_CLS_ACT
2024 	if (pt_prev) {
2025 		ret = deliver_skb(skb, pt_prev, orig_dev);
2026 		pt_prev = NULL; /* no one else should process this after */
2027 	} else {
2028 		skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
2029 	}
2030 
2031 	ret = ing_filter(skb);
2032 
2033 	if (ret == TC_ACT_SHOT || (ret == TC_ACT_STOLEN)) {
2034 		kfree_skb(skb);
2035 		goto out;
2036 	}
2037 
2038 	skb->tc_verd = 0;
2039 ncls:
2040 #endif
2041 
2042 	skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
2043 	if (!skb)
2044 		goto out;
2045 	skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);
2046 	if (!skb)
2047 		goto out;
2048 
2049 	type = skb->protocol;
2050 	list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type)&15], list) {
2051 		if (ptype->type == type &&
2052 		    (!ptype->dev || ptype->dev == skb->dev)) {
2053 			if (pt_prev)
2054 				ret = deliver_skb(skb, pt_prev, orig_dev);
2055 			pt_prev = ptype;
2056 		}
2057 	}
2058 
2059 	if (pt_prev) {
2060 		ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2061 	} else {
2062 		kfree_skb(skb);
2063 		/* Jamal, now you will not be able to escape explaining
2064 		 * to me how you were going to use this. :-)
2065 		 */
2066 		ret = NET_RX_DROP;
2067 	}
2068 
2069 out:
2070 	rcu_read_unlock();
2071 	return ret;
2072 }
2073 
2074 static int process_backlog(struct napi_struct *napi, int quota)
2075 {
2076 	int work = 0;
2077 	struct softnet_data *queue = &__get_cpu_var(softnet_data);
2078 	unsigned long start_time = jiffies;
2079 
2080 	napi->weight = weight_p;
2081 	do {
2082 		struct sk_buff *skb;
2083 		struct net_device *dev;
2084 
2085 		local_irq_disable();
2086 		skb = __skb_dequeue(&queue->input_pkt_queue);
2087 		if (!skb) {
2088 			__napi_complete(napi);
2089 			local_irq_enable();
2090 			break;
2091 		}
2092 
2093 		local_irq_enable();
2094 
2095 		dev = skb->dev;
2096 
2097 		netif_receive_skb(skb);
2098 
2099 		dev_put(dev);
2100 	} while (++work < quota && jiffies == start_time);
2101 
2102 	return work;
2103 }
2104 
2105 /**
2106  * __napi_schedule - schedule for receive
2107  * @n: entry to schedule
2108  *
2109  * The entry's receive function will be scheduled to run
2110  */
2111 void fastcall __napi_schedule(struct napi_struct *n)
2112 {
2113 	unsigned long flags;
2114 
2115 	local_irq_save(flags);
2116 	list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list);
2117 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2118 	local_irq_restore(flags);
2119 }
2120 EXPORT_SYMBOL(__napi_schedule);
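
/*
 * Illustrative sketch (not part of the code above): a NAPI driver
 * schedules its napi_struct from the RX interrupt and then feeds frames
 * to netif_receive_skb() from its ->poll() callback, which net_rx_action()
 * below runs in softirq context.  "priv", "disable_rx_irq", "enable_rx_irq"
 * and "fetch_frame" are hypothetical driver helpers:
 *
 *	In the interrupt handler:
 *		if (napi_schedule_prep(&priv->napi)) {
 *			disable_rx_irq(priv);
 *			__napi_schedule(&priv->napi);
 *		}
 *
 *	In the ->poll() callback:
 *		while (work < budget && (skb = fetch_frame(priv)) != NULL) {
 *			netif_receive_skb(skb);
 *			work++;
 *		}
 *		if (work < budget) {
 *			napi_complete(napi);
 *			enable_rx_irq(priv);
 *		}
 *		return work;
 */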
2121 
2122 
2123 static void net_rx_action(struct softirq_action *h)
2124 {
2125 	struct list_head *list = &__get_cpu_var(softnet_data).poll_list;
2126 	unsigned long start_time = jiffies;
2127 	int budget = netdev_budget;
2128 	void *have;
2129 
2130 	local_irq_disable();
2131 
2132 	while (!list_empty(list)) {
2133 		struct napi_struct *n;
2134 		int work, weight;
2135 
2136 		/* If softirq window is exhausted then punt.
2137 		 *
2138 		 * Note that this is a slight policy change from the
2139 		 * previous NAPI code, which would allow up to 2
2140 		 * jiffies to pass before breaking out.  The test
2141 		 * used to be "jiffies - start_time > 1".
2142 		 */
2143 		if (unlikely(budget <= 0 || jiffies != start_time))
2144 			goto softnet_break;
2145 
2146 		local_irq_enable();
2147 
2148 		/* Even though interrupts have been re-enabled, this
2149 		 * access is safe because interrupts can only add new
2150 		 * entries to the tail of this list, and only ->poll()
2151 		 * calls can remove this head entry from the list.
2152 		 */
2153 		n = list_entry(list->next, struct napi_struct, poll_list);
2154 
2155 		have = netpoll_poll_lock(n);
2156 
2157 		weight = n->weight;
2158 
2159 		work = n->poll(n, weight);
2160 
2161 		WARN_ON_ONCE(work > weight);
2162 
2163 		budget -= work;
2164 
2165 		local_irq_disable();
2166 
2167 		/* Drivers must not modify the NAPI state if they
2168 		 * consume the entire weight.  In such cases this code
2169 		 * still "owns" the NAPI instance and therefore can
2170 		 * move the instance around on the list at-will.
2171 		 */
2172 		if (unlikely(work == weight))
2173 			list_move_tail(&n->poll_list, list);
2174 
2175 		netpoll_poll_unlock(have);
2176 	}
2177 out:
2178 	local_irq_enable();
2179 
2180 #ifdef CONFIG_NET_DMA
2181 	/*
2182 	 * There may not be any more sk_buffs coming right now, so push
2183 	 * any pending DMA copies to hardware
2184 	 */
2185 	if (!cpus_empty(net_dma.channel_mask)) {
2186 		int chan_idx;
2187 		for_each_cpu_mask(chan_idx, net_dma.channel_mask) {
2188 			struct dma_chan *chan = net_dma.channels[chan_idx];
2189 			if (chan)
2190 				dma_async_memcpy_issue_pending(chan);
2191 		}
2192 	}
2193 #endif
2194 
2195 	return;
2196 
2197 softnet_break:
2198 	__get_cpu_var(netdev_rx_stat).time_squeeze++;
2199 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2200 	goto out;
2201 }
2202 
2203 static gifconf_func_t * gifconf_list [NPROTO];
2204 
2205 /**
2206  *	register_gifconf	-	register a SIOCGIF handler
2207  *	@family: Address family
2208  *	@gifconf: Function handler
2209  *
2210  *	Register protocol dependent address dumping routines. The handler
2211  *	that is passed must not be freed or reused until it has been replaced
2212  *	by another handler.
2213  */
2214 int register_gifconf(unsigned int family, gifconf_func_t * gifconf)
2215 {
2216 	if (family >= NPROTO)
2217 		return -EINVAL;
2218 	gifconf_list[family] = gifconf;
2219 	return 0;
2220 }
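
/*
 * Example of the expected usage (hedged, not enforced here): address
 * families register their handler once at init time, e.g. IPv4 does
 * register_gifconf(PF_INET, inet_gifconf) from net/ipv4/devinet.c, so
 * that dev_ifconf() below can emit one ifreq per configured address.
 */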
2221 
2222 
2223 /*
2224  *	Map an interface index to its name (SIOCGIFNAME)
2225  */
2226 
2227 /*
2228  *	We need this ioctl for efficient implementation of the
2229  *	if_indextoname() function required by the IPv6 API.  Without
2230  *	it, we would have to search all the interfaces to find a
2231  *	match.  --pb
2232  */
2233 
2234 static int dev_ifname(struct net *net, struct ifreq __user *arg)
2235 {
2236 	struct net_device *dev;
2237 	struct ifreq ifr;
2238 
2239 	/*
2240 	 *	Fetch the caller's info block.
2241 	 */
2242 
2243 	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
2244 		return -EFAULT;
2245 
2246 	read_lock(&dev_base_lock);
2247 	dev = __dev_get_by_index(net, ifr.ifr_ifindex);
2248 	if (!dev) {
2249 		read_unlock(&dev_base_lock);
2250 		return -ENODEV;
2251 	}
2252 
2253 	strcpy(ifr.ifr_name, dev->name);
2254 	read_unlock(&dev_base_lock);
2255 
2256 	if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
2257 		return -EFAULT;
2258 	return 0;
2259 }
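
/*
 * Illustrative userspace sketch of the SIOCGIFNAME path handled above;
 * this is essentially what if_indextoname() boils down to ("fd" is any
 * socket, "idx" the interface index being looked up):
 *
 *	struct ifreq ifr;
 *
 *	memset(&ifr, 0, sizeof(ifr));
 *	ifr.ifr_ifindex = idx;
 *	if (ioctl(fd, SIOCGIFNAME, &ifr) == 0)
 *		printf("index %d is %s\n", idx, ifr.ifr_name);
 */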
2260 
2261 /*
2262  *	Perform a SIOCGIFCONF call. This structure will change
2263  *	size eventually, and there is nothing I can do about it.
2264  *	Thus we will need a 'compatibility mode'.
2265  */
2266 
2267 static int dev_ifconf(struct net *net, char __user *arg)
2268 {
2269 	struct ifconf ifc;
2270 	struct net_device *dev;
2271 	char __user *pos;
2272 	int len;
2273 	int total;
2274 	int i;
2275 
2276 	/*
2277 	 *	Fetch the caller's info block.
2278 	 */
2279 
2280 	if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
2281 		return -EFAULT;
2282 
2283 	pos = ifc.ifc_buf;
2284 	len = ifc.ifc_len;
2285 
2286 	/*
2287 	 *	Loop over the interfaces, and write an info block for each.
2288 	 */
2289 
2290 	total = 0;
2291 	for_each_netdev(net, dev) {
2292 		for (i = 0; i < NPROTO; i++) {
2293 			if (gifconf_list[i]) {
2294 				int done;
2295 				if (!pos)
2296 					done = gifconf_list[i](dev, NULL, 0);
2297 				else
2298 					done = gifconf_list[i](dev, pos + total,
2299 							       len - total);
2300 				if (done < 0)
2301 					return -EFAULT;
2302 				total += done;
2303 			}
2304 		}
2305 	}
2306 
2307 	/*
2308 	 *	All done.  Write the updated control block back to the caller.
2309 	 */
2310 	ifc.ifc_len = total;
2311 
2312 	/*
2313 	 * 	Both BSD and Solaris return 0 here, so we do too.
2314 	 */
2315 	return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
2316 }
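
/*
 * Illustrative userspace sketch: callers commonly issue SIOCGIFCONF twice,
 * first with ifc_buf == NULL so only the required length is computed (the
 * !pos branch above), then again with a buffer of that size:
 *
 *	struct ifconf ifc;
 *
 *	memset(&ifc, 0, sizeof(ifc));
 *	ioctl(fd, SIOCGIFCONF, &ifc);		(first pass: length only)
 *	ifc.ifc_buf = malloc(ifc.ifc_len);
 *	ioctl(fd, SIOCGIFCONF, &ifc);		(second pass: fills ifreqs)
 */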
2317 
2318 #ifdef CONFIG_PROC_FS
2319 /*
2320  *	This is invoked by the /proc filesystem handler to display a device
2321  *	in detail.
2322  */
2323 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
2324 {
2325 	struct net *net = seq->private;
2326 	loff_t off;
2327 	struct net_device *dev;
2328 
2329 	read_lock(&dev_base_lock);
2330 	if (!*pos)
2331 		return SEQ_START_TOKEN;
2332 
2333 	off = 1;
2334 	for_each_netdev(net, dev)
2335 		if (off++ == *pos)
2336 			return dev;
2337 
2338 	return NULL;
2339 }
2340 
2341 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2342 {
2343 	struct net *net = seq->private;
2344 	++*pos;
2345 	return v == SEQ_START_TOKEN ?
2346 		first_net_device(net) : next_net_device((struct net_device *)v);
2347 }
2348 
2349 void dev_seq_stop(struct seq_file *seq, void *v)
2350 {
2351 	read_unlock(&dev_base_lock);
2352 }
2353 
2354 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
2355 {
2356 	struct net_device_stats *stats = dev->get_stats(dev);
2357 
2358 	seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu "
2359 		   "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
2360 		   dev->name, stats->rx_bytes, stats->rx_packets,
2361 		   stats->rx_errors,
2362 		   stats->rx_dropped + stats->rx_missed_errors,
2363 		   stats->rx_fifo_errors,
2364 		   stats->rx_length_errors + stats->rx_over_errors +
2365 		    stats->rx_crc_errors + stats->rx_frame_errors,
2366 		   stats->rx_compressed, stats->multicast,
2367 		   stats->tx_bytes, stats->tx_packets,
2368 		   stats->tx_errors, stats->tx_dropped,
2369 		   stats->tx_fifo_errors, stats->collisions,
2370 		   stats->tx_carrier_errors +
2371 		    stats->tx_aborted_errors +
2372 		    stats->tx_window_errors +
2373 		    stats->tx_heartbeat_errors,
2374 		   stats->tx_compressed);
2375 }
2376 
2377 /*
2378  *	Called from the PROCfs module. This now uses the new arbitrary sized
2379  *	/proc/net interface to create /proc/net/dev
2380  */
2381 static int dev_seq_show(struct seq_file *seq, void *v)
2382 {
2383 	if (v == SEQ_START_TOKEN)
2384 		seq_puts(seq, "Inter-|   Receive                            "
2385 			      "                    |  Transmit\n"
2386 			      " face |bytes    packets errs drop fifo frame "
2387 			      "compressed multicast|bytes    packets errs "
2388 			      "drop fifo colls carrier compressed\n");
2389 	else
2390 		dev_seq_printf_stats(seq, v);
2391 	return 0;
2392 }
2393 
2394 static struct netif_rx_stats *softnet_get_online(loff_t *pos)
2395 {
2396 	struct netif_rx_stats *rc = NULL;
2397 
2398 	while (*pos < NR_CPUS)
2399 		if (cpu_online(*pos)) {
2400 			rc = &per_cpu(netdev_rx_stat, *pos);
2401 			break;
2402 		} else
2403 			++*pos;
2404 	return rc;
2405 }
2406 
2407 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
2408 {
2409 	return softnet_get_online(pos);
2410 }
2411 
2412 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2413 {
2414 	++*pos;
2415 	return softnet_get_online(pos);
2416 }
2417 
2418 static void softnet_seq_stop(struct seq_file *seq, void *v)
2419 {
2420 }
2421 
2422 static int softnet_seq_show(struct seq_file *seq, void *v)
2423 {
2424 	struct netif_rx_stats *s = v;
2425 
2426 	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
2427 		   s->total, s->dropped, s->time_squeeze, 0,
2428 		   0, 0, 0, 0, /* was fastroute */
2429 		   s->cpu_collision );
2430 	return 0;
2431 }
2432 
2433 static const struct seq_operations dev_seq_ops = {
2434 	.start = dev_seq_start,
2435 	.next  = dev_seq_next,
2436 	.stop  = dev_seq_stop,
2437 	.show  = dev_seq_show,
2438 };
2439 
2440 static int dev_seq_open(struct inode *inode, struct file *file)
2441 {
2442 	struct seq_file *seq;
2443 	int res;
2444 	res =  seq_open(file, &dev_seq_ops);
2445 	if (!res) {
2446 		seq = file->private_data;
2447 		seq->private = get_proc_net(inode);
2448 		if (!seq->private) {
2449 			seq_release(inode, file);
2450 			res = -ENXIO;
2451 		}
2452 	}
2453 	return res;
2454 }
2455 
2456 static int dev_seq_release(struct inode *inode, struct file *file)
2457 {
2458 	struct seq_file *seq = file->private_data;
2459 	struct net *net = seq->private;
2460 	put_net(net);
2461 	return seq_release(inode, file);
2462 }
2463 
2464 static const struct file_operations dev_seq_fops = {
2465 	.owner	 = THIS_MODULE,
2466 	.open    = dev_seq_open,
2467 	.read    = seq_read,
2468 	.llseek  = seq_lseek,
2469 	.release = dev_seq_release,
2470 };
2471 
2472 static const struct seq_operations softnet_seq_ops = {
2473 	.start = softnet_seq_start,
2474 	.next  = softnet_seq_next,
2475 	.stop  = softnet_seq_stop,
2476 	.show  = softnet_seq_show,
2477 };
2478 
2479 static int softnet_seq_open(struct inode *inode, struct file *file)
2480 {
2481 	return seq_open(file, &softnet_seq_ops);
2482 }
2483 
2484 static const struct file_operations softnet_seq_fops = {
2485 	.owner	 = THIS_MODULE,
2486 	.open    = softnet_seq_open,
2487 	.read    = seq_read,
2488 	.llseek  = seq_lseek,
2489 	.release = seq_release,
2490 };
2491 
2492 static void *ptype_get_idx(loff_t pos)
2493 {
2494 	struct packet_type *pt = NULL;
2495 	loff_t i = 0;
2496 	int t;
2497 
2498 	list_for_each_entry_rcu(pt, &ptype_all, list) {
2499 		if (i == pos)
2500 			return pt;
2501 		++i;
2502 	}
2503 
2504 	for (t = 0; t < 16; t++) {
2505 		list_for_each_entry_rcu(pt, &ptype_base[t], list) {
2506 			if (i == pos)
2507 				return pt;
2508 			++i;
2509 		}
2510 	}
2511 	return NULL;
2512 }
2513 
2514 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
2515 {
2516 	rcu_read_lock();
2517 	return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
2518 }
2519 
2520 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2521 {
2522 	struct packet_type *pt;
2523 	struct list_head *nxt;
2524 	int hash;
2525 
2526 	++*pos;
2527 	if (v == SEQ_START_TOKEN)
2528 		return ptype_get_idx(0);
2529 
2530 	pt = v;
2531 	nxt = pt->list.next;
2532 	if (pt->type == htons(ETH_P_ALL)) {
2533 		if (nxt != &ptype_all)
2534 			goto found;
2535 		hash = 0;
2536 		nxt = ptype_base[0].next;
2537 	} else
2538 		hash = ntohs(pt->type) & 15;
2539 
2540 	while (nxt == &ptype_base[hash]) {
2541 		if (++hash >= 16)
2542 			return NULL;
2543 		nxt = ptype_base[hash].next;
2544 	}
2545 found:
2546 	return list_entry(nxt, struct packet_type, list);
2547 }
2548 
2549 static void ptype_seq_stop(struct seq_file *seq, void *v)
2550 {
2551 	rcu_read_unlock();
2552 }
2553 
2554 static void ptype_seq_decode(struct seq_file *seq, void *sym)
2555 {
2556 #ifdef CONFIG_KALLSYMS
2557 	unsigned long offset = 0, symsize;
2558 	const char *symname;
2559 	char *modname;
2560 	char namebuf[128];
2561 
2562 	symname = kallsyms_lookup((unsigned long)sym, &symsize, &offset,
2563 				  &modname, namebuf);
2564 
2565 	if (symname) {
2566 		char *delim = ":";
2567 
2568 		if (!modname)
2569 			modname = delim = "";
2570 		seq_printf(seq, "%s%s%s%s+0x%lx", delim, modname, delim,
2571 			   symname, offset);
2572 		return;
2573 	}
2574 #endif
2575 
2576 	seq_printf(seq, "[%p]", sym);
2577 }
2578 
2579 static int ptype_seq_show(struct seq_file *seq, void *v)
2580 {
2581 	struct packet_type *pt = v;
2582 
2583 	if (v == SEQ_START_TOKEN)
2584 		seq_puts(seq, "Type Device      Function\n");
2585 	else {
2586 		if (pt->type == htons(ETH_P_ALL))
2587 			seq_puts(seq, "ALL ");
2588 		else
2589 			seq_printf(seq, "%04x", ntohs(pt->type));
2590 
2591 		seq_printf(seq, " %-8s ",
2592 			   pt->dev ? pt->dev->name : "");
2593 		ptype_seq_decode(seq,  pt->func);
2594 		seq_putc(seq, '\n');
2595 	}
2596 
2597 	return 0;
2598 }
2599 
2600 static const struct seq_operations ptype_seq_ops = {
2601 	.start = ptype_seq_start,
2602 	.next  = ptype_seq_next,
2603 	.stop  = ptype_seq_stop,
2604 	.show  = ptype_seq_show,
2605 };
2606 
2607 static int ptype_seq_open(struct inode *inode, struct file *file)
2608 {
2609 	return seq_open(file, &ptype_seq_ops);
2610 }
2611 
2612 static const struct file_operations ptype_seq_fops = {
2613 	.owner	 = THIS_MODULE,
2614 	.open    = ptype_seq_open,
2615 	.read    = seq_read,
2616 	.llseek  = seq_lseek,
2617 	.release = seq_release,
2618 };
2619 
2620 
2621 static int __net_init dev_proc_net_init(struct net *net)
2622 {
2623 	int rc = -ENOMEM;
2624 
2625 	if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
2626 		goto out;
2627 	if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
2628 		goto out_dev;
2629 	if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
2630 		goto out_softnet;
2631 
2632 	if (wext_proc_init(net))
2633 		goto out_ptype;
2634 	rc = 0;
2635 out:
2636 	return rc;
2637 out_ptype:
2638 	proc_net_remove(net, "ptype");
2639 out_softnet:
2640 	proc_net_remove(net, "softnet_stat");
2641 out_dev:
2642 	proc_net_remove(net, "dev");
2643 	goto out;
2644 }
2645 
2646 static void __net_exit dev_proc_net_exit(struct net *net)
2647 {
2648 	wext_proc_exit(net);
2649 
2650 	proc_net_remove(net, "ptype");
2651 	proc_net_remove(net, "softnet_stat");
2652 	proc_net_remove(net, "dev");
2653 }
2654 
2655 static struct pernet_operations __net_initdata dev_proc_ops = {
2656 	.init = dev_proc_net_init,
2657 	.exit = dev_proc_net_exit,
2658 };
2659 
2660 static int __init dev_proc_init(void)
2661 {
2662 	return register_pernet_subsys(&dev_proc_ops);
2663 }
2664 #else
2665 #define dev_proc_init() 0
2666 #endif	/* CONFIG_PROC_FS */
2667 
2668 
2669 /**
2670  *	netdev_set_master	-	set up master/slave pair
2671  *	@slave: slave device
2672  *	@master: new master device
2673  *
2674  *	Changes the master device of the slave. Pass %NULL to break the
2675  *	bonding. The caller must hold the RTNL semaphore. On a failure
2676  *	a negative errno code is returned. On success the reference counts
2677  *	are adjusted, %RTM_NEWLINK is sent to the routing socket and the
2678  *	function returns zero.
2679  */
2680 int netdev_set_master(struct net_device *slave, struct net_device *master)
2681 {
2682 	struct net_device *old = slave->master;
2683 
2684 	ASSERT_RTNL();
2685 
2686 	if (master) {
2687 		if (old)
2688 			return -EBUSY;
2689 		dev_hold(master);
2690 	}
2691 
2692 	slave->master = master;
2693 
2694 	synchronize_net();
2695 
2696 	if (old)
2697 		dev_put(old);
2698 
2699 	if (master)
2700 		slave->flags |= IFF_SLAVE;
2701 	else
2702 		slave->flags &= ~IFF_SLAVE;
2703 
2704 	rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
2705 	return 0;
2706 }
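
/*
 * Example of the intended caller (hedged): the bonding driver enslaves a
 * device with netdev_set_master(slave_dev, bond_dev) from bond_enslave()
 * and breaks the pairing with netdev_set_master(slave_dev, NULL) on
 * release, both under rtnl_lock() as required above.
 */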
2707 
2708 static void __dev_set_promiscuity(struct net_device *dev, int inc)
2709 {
2710 	unsigned short old_flags = dev->flags;
2711 
2712 	ASSERT_RTNL();
2713 
2714 	if ((dev->promiscuity += inc) == 0)
2715 		dev->flags &= ~IFF_PROMISC;
2716 	else
2717 		dev->flags |= IFF_PROMISC;
2718 	if (dev->flags != old_flags) {
2719 		printk(KERN_INFO "device %s %s promiscuous mode\n",
2720 		       dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
2721 							       "left");
2722 		audit_log(current->audit_context, GFP_ATOMIC,
2723 			AUDIT_ANOM_PROMISCUOUS,
2724 			"dev=%s prom=%d old_prom=%d auid=%u",
2725 			dev->name, (dev->flags & IFF_PROMISC),
2726 			(old_flags & IFF_PROMISC),
2727 			audit_get_loginuid(current->audit_context));
2728 
2729 		if (dev->change_rx_flags)
2730 			dev->change_rx_flags(dev, IFF_PROMISC);
2731 	}
2732 }
2733 
2734 /**
2735  *	dev_set_promiscuity	- update promiscuity count on a device
2736  *	@dev: device
2737  *	@inc: modifier
2738  *
2739  *	Add or remove promiscuity from a device. While the count in the device
2740  *	remains above zero the interface remains promiscuous. Once it hits zero
2741  *	the device reverts back to normal filtering operation. A negative inc
2742  *	value is used to drop promiscuity on the device.
2743  */
2744 void dev_set_promiscuity(struct net_device *dev, int inc)
2745 {
2746 	unsigned short old_flags = dev->flags;
2747 
2748 	__dev_set_promiscuity(dev, inc);
2749 	if (dev->flags != old_flags)
2750 		dev_set_rx_mode(dev);
2751 }
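
/*
 * Example usage (hedged): packet capture and bridging keep an interface
 * promiscuous for as long as they need it by pairing calls, e.g.
 *
 *	dev_set_promiscuity(dev, 1);	(on attach)
 *	...
 *	dev_set_promiscuity(dev, -1);	(on detach)
 *
 * so several independent users can share promiscuous mode via the counter.
 */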
2752 
2753 /**
2754  *	dev_set_allmulti	- update allmulti count on a device
2755  *	@dev: device
2756  *	@inc: modifier
2757  *
2758  *	Add or remove reception of all multicast frames to a device. While the
2759  *	count in the device remains above zero the interface remains listening
2760  *	to all multicast frames. Once it hits zero the device reverts back to normal
2761  *	filtering operation. A negative @inc value is used to drop the counter
2762  *	when releasing a resource needing all multicasts.
2763  */
2764 
2765 void dev_set_allmulti(struct net_device *dev, int inc)
2766 {
2767 	unsigned short old_flags = dev->flags;
2768 
2769 	ASSERT_RTNL();
2770 
2771 	dev->flags |= IFF_ALLMULTI;
2772 	if ((dev->allmulti += inc) == 0)
2773 		dev->flags &= ~IFF_ALLMULTI;
2774 	if (dev->flags ^ old_flags) {
2775 		if (dev->change_rx_flags)
2776 			dev->change_rx_flags(dev, IFF_ALLMULTI);
2777 		dev_set_rx_mode(dev);
2778 	}
2779 }
2780 
2781 /*
2782  *	Upload unicast and multicast address lists to device and
2783  *	configure RX filtering. When the device doesn't support unicast
2784  *	filtering it is put in promiscuous mode while unicast addresses
2785  *	are present.
2786  */
2787 void __dev_set_rx_mode(struct net_device *dev)
2788 {
2789 	/* dev_open will call this function so the list will stay sane. */
2790 	if (!(dev->flags&IFF_UP))
2791 		return;
2792 
2793 	if (!netif_device_present(dev))
2794 		return;
2795 
2796 	if (dev->set_rx_mode)
2797 		dev->set_rx_mode(dev);
2798 	else {
2799 		/* Unicast address changes may only happen under the rtnl,
2800 		 * therefore calling __dev_set_promiscuity here is safe.
2801 		 */
2802 		if (dev->uc_count > 0 && !dev->uc_promisc) {
2803 			__dev_set_promiscuity(dev, 1);
2804 			dev->uc_promisc = 1;
2805 		} else if (dev->uc_count == 0 && dev->uc_promisc) {
2806 			__dev_set_promiscuity(dev, -1);
2807 			dev->uc_promisc = 0;
2808 		}
2809 
2810 		if (dev->set_multicast_list)
2811 			dev->set_multicast_list(dev);
2812 	}
2813 }
2814 
2815 void dev_set_rx_mode(struct net_device *dev)
2816 {
2817 	netif_tx_lock_bh(dev);
2818 	__dev_set_rx_mode(dev);
2819 	netif_tx_unlock_bh(dev);
2820 }
2821 
2822 int __dev_addr_delete(struct dev_addr_list **list, int *count,
2823 		      void *addr, int alen, int glbl)
2824 {
2825 	struct dev_addr_list *da;
2826 
2827 	for (; (da = *list) != NULL; list = &da->next) {
2828 		if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
2829 		    alen == da->da_addrlen) {
2830 			if (glbl) {
2831 				int old_glbl = da->da_gusers;
2832 				da->da_gusers = 0;
2833 				if (old_glbl == 0)
2834 					break;
2835 			}
2836 			if (--da->da_users)
2837 				return 0;
2838 
2839 			*list = da->next;
2840 			kfree(da);
2841 			(*count)--;
2842 			return 0;
2843 		}
2844 	}
2845 	return -ENOENT;
2846 }
2847 
2848 int __dev_addr_add(struct dev_addr_list **list, int *count,
2849 		   void *addr, int alen, int glbl)
2850 {
2851 	struct dev_addr_list *da;
2852 
2853 	for (da = *list; da != NULL; da = da->next) {
2854 		if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
2855 		    da->da_addrlen == alen) {
2856 			if (glbl) {
2857 				int old_glbl = da->da_gusers;
2858 				da->da_gusers = 1;
2859 				if (old_glbl)
2860 					return 0;
2861 			}
2862 			da->da_users++;
2863 			return 0;
2864 		}
2865 	}
2866 
2867 	da = kmalloc(sizeof(*da), GFP_ATOMIC);
2868 	if (da == NULL)
2869 		return -ENOMEM;
2870 	memcpy(da->da_addr, addr, alen);
2871 	da->da_addrlen = alen;
2872 	da->da_users = 1;
2873 	da->da_gusers = glbl ? 1 : 0;
2874 	da->next = *list;
2875 	*list = da;
2876 	(*count)++;
2877 	return 0;
2878 }
2879 
2880 /**
2881  *	dev_unicast_delete	- Release secondary unicast address.
2882  *	@dev: device
2883  *	@addr: address to delete
2884  *	@alen: length of @addr
2885  *
2886  *	Release reference to a secondary unicast address and remove it
2887  *	from the device if the reference count drops to zero.
2888  *
2889  * 	The caller must hold the rtnl_mutex.
2890  */
2891 int dev_unicast_delete(struct net_device *dev, void *addr, int alen)
2892 {
2893 	int err;
2894 
2895 	ASSERT_RTNL();
2896 
2897 	netif_tx_lock_bh(dev);
2898 	err = __dev_addr_delete(&dev->uc_list, &dev->uc_count, addr, alen, 0);
2899 	if (!err)
2900 		__dev_set_rx_mode(dev);
2901 	netif_tx_unlock_bh(dev);
2902 	return err;
2903 }
2904 EXPORT_SYMBOL(dev_unicast_delete);
2905 
2906 /**
2907  *	dev_unicast_add		- add a secondary unicast address
2908  *	@dev: device
2909  *	@addr: address to add
2910  *	@alen: length of @addr
2911  *
2912  *	Add a secondary unicast address to the device or increase
2913  *	the reference count if it already exists.
2914  *
2915  *	The caller must hold the rtnl_mutex.
2916  */
2917 int dev_unicast_add(struct net_device *dev, void *addr, int alen)
2918 {
2919 	int err;
2920 
2921 	ASSERT_RTNL();
2922 
2923 	netif_tx_lock_bh(dev);
2924 	err = __dev_addr_add(&dev->uc_list, &dev->uc_count, addr, alen, 0);
2925 	if (!err)
2926 		__dev_set_rx_mode(dev);
2927 	netif_tx_unlock_bh(dev);
2928 	return err;
2929 }
2930 EXPORT_SYMBOL(dev_unicast_add);
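
/*
 * Example usage (hedged): a stacked driver such as macvlan listens on an
 * extra MAC address of its lower device by calling, under rtnl_lock(),
 *
 *	dev_unicast_add(lowerdev, dev->dev_addr, ETH_ALEN);
 *
 * when it is opened, and dev_unicast_delete() with the same arguments
 * when it is closed.
 */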
2931 
2932 static void __dev_addr_discard(struct dev_addr_list **list)
2933 {
2934 	struct dev_addr_list *tmp;
2935 
2936 	while (*list != NULL) {
2937 		tmp = *list;
2938 		*list = tmp->next;
2939 		if (tmp->da_users > tmp->da_gusers)
2940 			printk("__dev_addr_discard: address leakage! "
2941 			       "da_users=%d\n", tmp->da_users);
2942 		kfree(tmp);
2943 	}
2944 }
2945 
2946 static void dev_addr_discard(struct net_device *dev)
2947 {
2948 	netif_tx_lock_bh(dev);
2949 
2950 	__dev_addr_discard(&dev->uc_list);
2951 	dev->uc_count = 0;
2952 
2953 	__dev_addr_discard(&dev->mc_list);
2954 	dev->mc_count = 0;
2955 
2956 	netif_tx_unlock_bh(dev);
2957 }
2958 
2959 unsigned dev_get_flags(const struct net_device *dev)
2960 {
2961 	unsigned flags;
2962 
2963 	flags = (dev->flags & ~(IFF_PROMISC |
2964 				IFF_ALLMULTI |
2965 				IFF_RUNNING |
2966 				IFF_LOWER_UP |
2967 				IFF_DORMANT)) |
2968 		(dev->gflags & (IFF_PROMISC |
2969 				IFF_ALLMULTI));
2970 
2971 	if (netif_running(dev)) {
2972 		if (netif_oper_up(dev))
2973 			flags |= IFF_RUNNING;
2974 		if (netif_carrier_ok(dev))
2975 			flags |= IFF_LOWER_UP;
2976 		if (netif_dormant(dev))
2977 			flags |= IFF_DORMANT;
2978 	}
2979 
2980 	return flags;
2981 }
2982 
2983 int dev_change_flags(struct net_device *dev, unsigned flags)
2984 {
2985 	int ret, changes;
2986 	int old_flags = dev->flags;
2987 
2988 	ASSERT_RTNL();
2989 
2990 	/*
2991 	 *	Set the flags on our device.
2992 	 */
2993 
2994 	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
2995 			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
2996 			       IFF_AUTOMEDIA)) |
2997 		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
2998 				    IFF_ALLMULTI));
2999 
3000 	/*
3001 	 *	Load in the correct multicast list now the flags have changed.
3002 	 */
3003 
3004 	if (dev->change_rx_flags && (dev->flags ^ flags) & IFF_MULTICAST)
3005 		dev->change_rx_flags(dev, IFF_MULTICAST);
3006 
3007 	dev_set_rx_mode(dev);
3008 
3009 	/*
3010 	 *	Have we downed the interface? We handle IFF_UP ourselves
3011 	 *	according to user attempts to set it, rather than blindly
3012 	 *	setting it.
3013 	 */
3014 
3015 	ret = 0;
3016 	if ((old_flags ^ flags) & IFF_UP) {	/* Bit is different  ? */
3017 		ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev);
3018 
3019 		if (!ret)
3020 			dev_set_rx_mode(dev);
3021 	}
3022 
3023 	if (dev->flags & IFF_UP &&
3024 	    ((old_flags ^ dev->flags) &~ (IFF_UP | IFF_PROMISC | IFF_ALLMULTI |
3025 					  IFF_VOLATILE)))
3026 		call_netdevice_notifiers(NETDEV_CHANGE, dev);
3027 
3028 	if ((flags ^ dev->gflags) & IFF_PROMISC) {
3029 		int inc = (flags & IFF_PROMISC) ? +1 : -1;
3030 		dev->gflags ^= IFF_PROMISC;
3031 		dev_set_promiscuity(dev, inc);
3032 	}
3033 
3034 	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
3035 	   is important. Some (broken) drivers set IFF_PROMISC when
3036 	   IFF_ALLMULTI is requested, without asking us and without reporting it.
3037 	 */
3038 	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
3039 		int inc = (flags & IFF_ALLMULTI) ? +1 : -1;
3040 		dev->gflags ^= IFF_ALLMULTI;
3041 		dev_set_allmulti(dev, inc);
3042 	}
3043 
3044 	/* Exclude state transition flags, already notified */
3045 	changes = (old_flags ^ dev->flags) & ~(IFF_UP | IFF_RUNNING);
3046 	if (changes)
3047 		rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
3048 
3049 	return ret;
3050 }
3051 
3052 int dev_set_mtu(struct net_device *dev, int new_mtu)
3053 {
3054 	int err;
3055 
3056 	if (new_mtu == dev->mtu)
3057 		return 0;
3058 
3059 	/*	MTU must be positive.	 */
3060 	if (new_mtu < 0)
3061 		return -EINVAL;
3062 
3063 	if (!netif_device_present(dev))
3064 		return -ENODEV;
3065 
3066 	err = 0;
3067 	if (dev->change_mtu)
3068 		err = dev->change_mtu(dev, new_mtu);
3069 	else
3070 		dev->mtu = new_mtu;
3071 	if (!err && dev->flags & IFF_UP)
3072 		call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
3073 	return err;
3074 }
3075 
3076 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
3077 {
3078 	int err;
3079 
3080 	if (!dev->set_mac_address)
3081 		return -EOPNOTSUPP;
3082 	if (sa->sa_family != dev->type)
3083 		return -EINVAL;
3084 	if (!netif_device_present(dev))
3085 		return -ENODEV;
3086 	err = dev->set_mac_address(dev, sa);
3087 	if (!err)
3088 		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
3089 	return err;
3090 }
3091 
3092 /*
3093  *	Perform the SIOCxIFxxx calls, inside read_lock(dev_base_lock)
3094  */
3095 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
3096 {
3097 	int err;
3098 	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
3099 
3100 	if (!dev)
3101 		return -ENODEV;
3102 
3103 	switch (cmd) {
3104 		case SIOCGIFFLAGS:	/* Get interface flags */
3105 			ifr->ifr_flags = dev_get_flags(dev);
3106 			return 0;
3107 
3108 		case SIOCGIFMETRIC:	/* Get the metric on the interface
3109 					   (currently unused) */
3110 			ifr->ifr_metric = 0;
3111 			return 0;
3112 
3113 		case SIOCGIFMTU:	/* Get the MTU of a device */
3114 			ifr->ifr_mtu = dev->mtu;
3115 			return 0;
3116 
3117 		case SIOCGIFHWADDR:
3118 			if (!dev->addr_len)
3119 				memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
3120 			else
3121 				memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
3122 				       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
3123 			ifr->ifr_hwaddr.sa_family = dev->type;
3124 			return 0;
3125 
3126 		case SIOCGIFSLAVE:
3127 			err = -EINVAL;
3128 			break;
3129 
3130 		case SIOCGIFMAP:
3131 			ifr->ifr_map.mem_start = dev->mem_start;
3132 			ifr->ifr_map.mem_end   = dev->mem_end;
3133 			ifr->ifr_map.base_addr = dev->base_addr;
3134 			ifr->ifr_map.irq       = dev->irq;
3135 			ifr->ifr_map.dma       = dev->dma;
3136 			ifr->ifr_map.port      = dev->if_port;
3137 			return 0;
3138 
3139 		case SIOCGIFINDEX:
3140 			ifr->ifr_ifindex = dev->ifindex;
3141 			return 0;
3142 
3143 		case SIOCGIFTXQLEN:
3144 			ifr->ifr_qlen = dev->tx_queue_len;
3145 			return 0;
3146 
3147 		default:
3148 			/* dev_ioctl() should ensure this case
3149 			 * is never reached
3150 			 */
3151 			WARN_ON(1);
3152 			err = -EINVAL;
3153 			break;
3154 
3155 	}
3156 	return err;
3157 }
3158 
3159 /*
3160  *	Perform the SIOCxIFxxx calls, inside rtnl_lock()
3161  */
3162 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
3163 {
3164 	int err;
3165 	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
3166 
3167 	if (!dev)
3168 		return -ENODEV;
3169 
3170 	switch (cmd) {
3171 		case SIOCSIFFLAGS:	/* Set interface flags */
3172 			return dev_change_flags(dev, ifr->ifr_flags);
3173 
3174 		case SIOCSIFMETRIC:	/* Set the metric on the interface
3175 					   (currently unused) */
3176 			return -EOPNOTSUPP;
3177 
3178 		case SIOCSIFMTU:	/* Set the MTU of a device */
3179 			return dev_set_mtu(dev, ifr->ifr_mtu);
3180 
3181 		case SIOCSIFHWADDR:
3182 			return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
3183 
3184 		case SIOCSIFHWBROADCAST:
3185 			if (ifr->ifr_hwaddr.sa_family != dev->type)
3186 				return -EINVAL;
3187 			memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
3188 			       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
3189 			call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
3190 			return 0;
3191 
3192 		case SIOCSIFMAP:
3193 			if (dev->set_config) {
3194 				if (!netif_device_present(dev))
3195 					return -ENODEV;
3196 				return dev->set_config(dev, &ifr->ifr_map);
3197 			}
3198 			return -EOPNOTSUPP;
3199 
3200 		case SIOCADDMULTI:
3201 			if (!dev->set_multicast_list ||
3202 			    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
3203 				return -EINVAL;
3204 			if (!netif_device_present(dev))
3205 				return -ENODEV;
3206 			return dev_mc_add(dev, ifr->ifr_hwaddr.sa_data,
3207 					  dev->addr_len, 1);
3208 
3209 		case SIOCDELMULTI:
3210 			if (!dev->set_multicast_list ||
3211 			    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
3212 				return -EINVAL;
3213 			if (!netif_device_present(dev))
3214 				return -ENODEV;
3215 			return dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data,
3216 					     dev->addr_len, 1);
3217 
3218 		case SIOCSIFTXQLEN:
3219 			if (ifr->ifr_qlen < 0)
3220 				return -EINVAL;
3221 			dev->tx_queue_len = ifr->ifr_qlen;
3222 			return 0;
3223 
3224 		case SIOCSIFNAME:
3225 			ifr->ifr_newname[IFNAMSIZ-1] = '\0';
3226 			return dev_change_name(dev, ifr->ifr_newname);
3227 
3228 		/*
3229 		 *	Unknown or private ioctl
3230 		 */
3231 
3232 		default:
3233 			if ((cmd >= SIOCDEVPRIVATE &&
3234 			    cmd <= SIOCDEVPRIVATE + 15) ||
3235 			    cmd == SIOCBONDENSLAVE ||
3236 			    cmd == SIOCBONDRELEASE ||
3237 			    cmd == SIOCBONDSETHWADDR ||
3238 			    cmd == SIOCBONDSLAVEINFOQUERY ||
3239 			    cmd == SIOCBONDINFOQUERY ||
3240 			    cmd == SIOCBONDCHANGEACTIVE ||
3241 			    cmd == SIOCGMIIPHY ||
3242 			    cmd == SIOCGMIIREG ||
3243 			    cmd == SIOCSMIIREG ||
3244 			    cmd == SIOCBRADDIF ||
3245 			    cmd == SIOCBRDELIF ||
3246 			    cmd == SIOCWANDEV) {
3247 				err = -EOPNOTSUPP;
3248 				if (dev->do_ioctl) {
3249 					if (netif_device_present(dev))
3250 						err = dev->do_ioctl(dev, ifr,
3251 								    cmd);
3252 					else
3253 						err = -ENODEV;
3254 				}
3255 			} else
3256 				err = -EINVAL;
3257 
3258 	}
3259 	return err;
3260 }
3261 
3262 /*
3263  *	This function handles all "interface"-type I/O control requests. The actual
3264  *	'doing' part of this is dev_ifsioc above.
3265  */
3266 
3267 /**
3268  *	dev_ioctl	-	network device ioctl
3269  *	@net: the applicable net namespace
3270  *	@cmd: command to issue
3271  *	@arg: pointer to a struct ifreq in user space
3272  *
3273  *	Issue ioctl functions to devices. This is normally called by the
3274  *	user space syscall interfaces but can sometimes be useful for
3275  *	other purposes. The return value is the return from the syscall if
3276  *	positive or a negative errno code on error.
3277  */
3278 
3279 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3280 {
3281 	struct ifreq ifr;
3282 	int ret;
3283 	char *colon;
3284 
3285 	/* One special case: SIOCGIFCONF takes ifconf argument
3286 	   and requires shared lock, because it sleeps writing
3287 	   to user space.
3288 	 */
3289 
3290 	if (cmd == SIOCGIFCONF) {
3291 		rtnl_lock();
3292 		ret = dev_ifconf(net, (char __user *) arg);
3293 		rtnl_unlock();
3294 		return ret;
3295 	}
3296 	if (cmd == SIOCGIFNAME)
3297 		return dev_ifname(net, (struct ifreq __user *)arg);
3298 
3299 	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3300 		return -EFAULT;
3301 
3302 	ifr.ifr_name[IFNAMSIZ-1] = 0;
3303 
3304 	colon = strchr(ifr.ifr_name, ':');
3305 	if (colon)
3306 		*colon = 0;
3307 
3308 	/*
3309 	 *	See which interface the caller is talking about.
3310 	 */
3311 
3312 	switch (cmd) {
3313 		/*
3314 		 *	These ioctl calls:
3315 		 *	- can be done by all.
3316 		 *	- atomic and do not require locking.
3317 		 *	- return a value
3318 		 */
3319 		case SIOCGIFFLAGS:
3320 		case SIOCGIFMETRIC:
3321 		case SIOCGIFMTU:
3322 		case SIOCGIFHWADDR:
3323 		case SIOCGIFSLAVE:
3324 		case SIOCGIFMAP:
3325 		case SIOCGIFINDEX:
3326 		case SIOCGIFTXQLEN:
3327 			dev_load(net, ifr.ifr_name);
3328 			read_lock(&dev_base_lock);
3329 			ret = dev_ifsioc_locked(net, &ifr, cmd);
3330 			read_unlock(&dev_base_lock);
3331 			if (!ret) {
3332 				if (colon)
3333 					*colon = ':';
3334 				if (copy_to_user(arg, &ifr,
3335 						 sizeof(struct ifreq)))
3336 					ret = -EFAULT;
3337 			}
3338 			return ret;
3339 
3340 		case SIOCETHTOOL:
3341 			dev_load(net, ifr.ifr_name);
3342 			rtnl_lock();
3343 			ret = dev_ethtool(net, &ifr);
3344 			rtnl_unlock();
3345 			if (!ret) {
3346 				if (colon)
3347 					*colon = ':';
3348 				if (copy_to_user(arg, &ifr,
3349 						 sizeof(struct ifreq)))
3350 					ret = -EFAULT;
3351 			}
3352 			return ret;
3353 
3354 		/*
3355 		 *	These ioctl calls:
3356 		 *	- require superuser power.
3357 		 *	- require strict serialization.
3358 		 *	- return a value
3359 		 */
3360 		case SIOCGMIIPHY:
3361 		case SIOCGMIIREG:
3362 		case SIOCSIFNAME:
3363 			if (!capable(CAP_NET_ADMIN))
3364 				return -EPERM;
3365 			dev_load(net, ifr.ifr_name);
3366 			rtnl_lock();
3367 			ret = dev_ifsioc(net, &ifr, cmd);
3368 			rtnl_unlock();
3369 			if (!ret) {
3370 				if (colon)
3371 					*colon = ':';
3372 				if (copy_to_user(arg, &ifr,
3373 						 sizeof(struct ifreq)))
3374 					ret = -EFAULT;
3375 			}
3376 			return ret;
3377 
3378 		/*
3379 		 *	These ioctl calls:
3380 		 *	- require superuser power.
3381 		 *	- require strict serialization.
3382 		 *	- do not return a value
3383 		 */
3384 		case SIOCSIFFLAGS:
3385 		case SIOCSIFMETRIC:
3386 		case SIOCSIFMTU:
3387 		case SIOCSIFMAP:
3388 		case SIOCSIFHWADDR:
3389 		case SIOCSIFSLAVE:
3390 		case SIOCADDMULTI:
3391 		case SIOCDELMULTI:
3392 		case SIOCSIFHWBROADCAST:
3393 		case SIOCSIFTXQLEN:
3394 		case SIOCSMIIREG:
3395 		case SIOCBONDENSLAVE:
3396 		case SIOCBONDRELEASE:
3397 		case SIOCBONDSETHWADDR:
3398 		case SIOCBONDCHANGEACTIVE:
3399 		case SIOCBRADDIF:
3400 		case SIOCBRDELIF:
3401 			if (!capable(CAP_NET_ADMIN))
3402 				return -EPERM;
3403 			/* fall through */
3404 		case SIOCBONDSLAVEINFOQUERY:
3405 		case SIOCBONDINFOQUERY:
3406 			dev_load(net, ifr.ifr_name);
3407 			rtnl_lock();
3408 			ret = dev_ifsioc(net, &ifr, cmd);
3409 			rtnl_unlock();
3410 			return ret;
3411 
3412 		case SIOCGIFMEM:
3413 			/* Get the per device memory space. We can add this but
3414 			 * currently do not support it */
3415 		case SIOCSIFMEM:
3416 			/* Set the per device memory buffer space.
3417 			 * Not applicable in our case */
3418 		case SIOCSIFLINK:
3419 			return -EINVAL;
3420 
3421 		/*
3422 		 *	Unknown or private ioctl.
3423 		 */
3424 		default:
3425 			if (cmd == SIOCWANDEV ||
3426 			    (cmd >= SIOCDEVPRIVATE &&
3427 			     cmd <= SIOCDEVPRIVATE + 15)) {
3428 				dev_load(net, ifr.ifr_name);
3429 				rtnl_lock();
3430 				ret = dev_ifsioc(net, &ifr, cmd);
3431 				rtnl_unlock();
3432 				if (!ret && copy_to_user(arg, &ifr,
3433 							 sizeof(struct ifreq)))
3434 					ret = -EFAULT;
3435 				return ret;
3436 			}
3437 			/* Take care of Wireless Extensions */
3438 			if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
3439 				return wext_handle_ioctl(net, &ifr, cmd, arg);
3440 			return -EINVAL;
3441 	}
3442 }
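
/*
 * Illustrative userspace sketch of the read-only path above: SIOCGIFFLAGS
 * is dispatched to dev_ifsioc_locked() under dev_base_lock and the result
 * copied back to the caller:
 *
 *	struct ifreq ifr;
 *
 *	memset(&ifr, 0, sizeof(ifr));
 *	strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);
 *	if (ioctl(fd, SIOCGIFFLAGS, &ifr) == 0 && (ifr.ifr_flags & IFF_UP))
 *		printf("%s is up\n", ifr.ifr_name);
 */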
3443 
3444 
3445 /**
3446  *	dev_new_index	-	allocate an ifindex
3447  *	@net: the applicable net namespace
3448  *
3449  *	Returns a suitable unique value for a new device interface
3450  *	number.  The caller must hold the rtnl semaphore or the
3451  *	dev_base_lock to be sure it remains unique.
3452  */
3453 static int dev_new_index(struct net *net)
3454 {
3455 	static int ifindex;
3456 	for (;;) {
3457 		if (++ifindex <= 0)
3458 			ifindex = 1;
3459 		if (!__dev_get_by_index(net, ifindex))
3460 			return ifindex;
3461 	}
3462 }
3463 
3464 /* Delayed registration/unregistration */
3465 static DEFINE_SPINLOCK(net_todo_list_lock);
3466 static struct list_head net_todo_list = LIST_HEAD_INIT(net_todo_list);
3467 
3468 static void net_set_todo(struct net_device *dev)
3469 {
3470 	spin_lock(&net_todo_list_lock);
3471 	list_add_tail(&dev->todo_list, &net_todo_list);
3472 	spin_unlock(&net_todo_list_lock);
3473 }
3474 
3475 /**
3476  *	register_netdevice	- register a network device
3477  *	@dev: device to register
3478  *
3479  *	Take a completed network device structure and add it to the kernel
3480  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
3481  *	chain. 0 is returned on success. A negative errno code is returned
3482  *	on a failure to set up the device, or if the name is a duplicate.
3483  *
3484  *	Callers must hold the rtnl semaphore. You may want
3485  *	register_netdev() instead of this.
3486  *
3487  *	BUGS:
3488  *	The locking appears insufficient to guarantee two parallel registers
3489  *	will not get the same name.
3490  */
3491 
3492 int register_netdevice(struct net_device *dev)
3493 {
3494 	struct hlist_head *head;
3495 	struct hlist_node *p;
3496 	int ret;
3497 	struct net *net;
3498 
3499 	BUG_ON(dev_boot_phase);
3500 	ASSERT_RTNL();
3501 
3502 	might_sleep();
3503 
3504 	/* When net_device's are persistent, this will be fatal. */
3505 	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
3506 	BUG_ON(!dev->nd_net);
3507 	net = dev->nd_net;
3508 
3509 	spin_lock_init(&dev->queue_lock);
3510 	spin_lock_init(&dev->_xmit_lock);
3511 	netdev_set_lockdep_class(&dev->_xmit_lock, dev->type);
3512 	dev->xmit_lock_owner = -1;
3513 	spin_lock_init(&dev->ingress_lock);
3514 
3515 	dev->iflink = -1;
3516 
3517 	/* Init, if this function is available */
3518 	if (dev->init) {
3519 		ret = dev->init(dev);
3520 		if (ret) {
3521 			if (ret > 0)
3522 				ret = -EIO;
3523 			goto out;
3524 		}
3525 	}
3526 
3527 	if (!dev_valid_name(dev->name)) {
3528 		ret = -EINVAL;
3529 		goto err_uninit;
3530 	}
3531 
3532 	dev->ifindex = dev_new_index(net);
3533 	if (dev->iflink == -1)
3534 		dev->iflink = dev->ifindex;
3535 
3536 	/* Check for existence of name */
3537 	head = dev_name_hash(net, dev->name);
3538 	hlist_for_each(p, head) {
3539 		struct net_device *d
3540 			= hlist_entry(p, struct net_device, name_hlist);
3541 		if (!strncmp(d->name, dev->name, IFNAMSIZ)) {
3542 			ret = -EEXIST;
3543 			goto err_uninit;
3544 		}
3545 	}
3546 
3547 	/* Fix illegal checksum combinations */
3548 	if ((dev->features & NETIF_F_HW_CSUM) &&
3549 	    (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
3550 		printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
3551 		       dev->name);
3552 		dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
3553 	}
3554 
3555 	if ((dev->features & NETIF_F_NO_CSUM) &&
3556 	    (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
3557 		printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
3558 		       dev->name);
3559 		dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
3560 	}
3561 
3562 
3563 	/* Fix illegal SG+CSUM combinations. */
3564 	if ((dev->features & NETIF_F_SG) &&
3565 	    !(dev->features & NETIF_F_ALL_CSUM)) {
3566 		printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no checksum feature.\n",
3567 		       dev->name);
3568 		dev->features &= ~NETIF_F_SG;
3569 	}
3570 
3571 	/* TSO requires that SG is present as well. */
3572 	if ((dev->features & NETIF_F_TSO) &&
3573 	    !(dev->features & NETIF_F_SG)) {
3574 		printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no SG feature.\n",
3575 		       dev->name);
3576 		dev->features &= ~NETIF_F_TSO;
3577 	}
3578 	if (dev->features & NETIF_F_UFO) {
3579 		if (!(dev->features & NETIF_F_HW_CSUM)) {
3580 			printk(KERN_ERR "%s: Dropping NETIF_F_UFO since no "
3581 					"NETIF_F_HW_CSUM feature.\n",
3582 							dev->name);
3583 			dev->features &= ~NETIF_F_UFO;
3584 		}
3585 		if (!(dev->features & NETIF_F_SG)) {
3586 			printk(KERN_ERR "%s: Dropping NETIF_F_UFO since no "
3587 					"NETIF_F_SG feature.\n",
3588 					dev->name);
3589 			dev->features &= ~NETIF_F_UFO;
3590 		}
3591 	}
3592 
3593 	ret = netdev_register_kobject(dev);
3594 	if (ret)
3595 		goto err_uninit;
3596 	dev->reg_state = NETREG_REGISTERED;
3597 
3598 	/*
3599 	 *	Default initial state at registry is that the
3600 	 *	device is present.
3601 	 */
3602 
3603 	set_bit(__LINK_STATE_PRESENT, &dev->state);
3604 
3605 	dev_init_scheduler(dev);
3606 	dev_hold(dev);
3607 	list_netdevice(dev);
3608 
3609 	/* Notify protocols, that a new device appeared. */
3610 	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
3611 	ret = notifier_to_errno(ret);
3612 	if (ret)
3613 		unregister_netdevice(dev);
3614 
3615 out:
3616 	return ret;
3617 
3618 err_uninit:
3619 	if (dev->uninit)
3620 		dev->uninit(dev);
3621 	goto out;
3622 }
3623 
3624 /**
3625  *	register_netdev	- register a network device
3626  *	@dev: device to register
3627  *
3628  *	Take a completed network device structure and add it to the kernel
3629  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
3630  *	chain. 0 is returned on success. A negative errno code is returned
3631  *	on a failure to set up the device, or if the name is a duplicate.
3632  *
3633  *	This is a wrapper around register_netdevice that takes the rtnl semaphore
3634  *	and expands the device name if you passed a format string to
3635  *	alloc_netdev.
3636  */
3637 int register_netdev(struct net_device *dev)
3638 {
3639 	int err;
3640 
3641 	rtnl_lock();
3642 
3643 	/*
3644 	 * If the name is a format string the caller wants us to do a
3645 	 * name allocation.
3646 	 */
3647 	if (strchr(dev->name, '%')) {
3648 		err = dev_alloc_name(dev, dev->name);
3649 		if (err < 0)
3650 			goto out;
3651 	}
3652 
3653 	err = register_netdevice(dev);
3654 out:
3655 	rtnl_unlock();
3656 	return err;
3657 }
3658 EXPORT_SYMBOL(register_netdev);
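
/*
 * Illustrative sketch of the usual driver lifecycle built on the helpers
 * above ("my_setup" and "struct my_priv" are hypothetical):
 *
 *	dev = alloc_netdev(sizeof(struct my_priv), "myeth%d", my_setup);
 *	if (!dev)
 *		return -ENOMEM;
 *	err = register_netdev(dev);	(expands "myeth%d" to a free name)
 *	if (err) {
 *		free_netdev(dev);
 *		return err;
 *	}
 *	...
 *	unregister_netdev(dev);
 *	free_netdev(dev);
 */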
3659 
3660 /*
3661  * netdev_wait_allrefs - wait until all references are gone.
3662  *
3663  * This is called when unregistering network devices.
3664  *
3665  * Any protocol or device that holds a reference should register
3666  * for netdevice notification, and cleanup and put back the
3667  * reference if they receive an UNREGISTER event.
3668  * We can get stuck here if buggy protocols don't correctly
3669  * call dev_put.
3670  */
3671 static void netdev_wait_allrefs(struct net_device *dev)
3672 {
3673 	unsigned long rebroadcast_time, warning_time;
3674 
3675 	rebroadcast_time = warning_time = jiffies;
3676 	while (atomic_read(&dev->refcnt) != 0) {
3677 		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
3678 			rtnl_lock();
3679 
3680 			/* Rebroadcast unregister notification */
3681 			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
3682 
3683 			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
3684 				     &dev->state)) {
3685 				/* We must not have linkwatch events
3686 				 * pending on unregister. If this
3687 				 * happens, we simply run the queue
3688 				 * unscheduled, resulting in a noop
3689 				 * for this device.
3690 				 */
3691 				linkwatch_run_queue();
3692 			}
3693 
3694 			__rtnl_unlock();
3695 
3696 			rebroadcast_time = jiffies;
3697 		}
3698 
3699 		msleep(250);
3700 
3701 		if (time_after(jiffies, warning_time + 10 * HZ)) {
3702 			printk(KERN_EMERG "unregister_netdevice: "
3703 			       "waiting for %s to become free. Usage "
3704 			       "count = %d\n",
3705 			       dev->name, atomic_read(&dev->refcnt));
3706 			warning_time = jiffies;
3707 		}
3708 	}
3709 }
3710 
3711 /* The sequence is:
3712  *
3713  *	rtnl_lock();
3714  *	...
3715  *	register_netdevice(x1);
3716  *	register_netdevice(x2);
3717  *	...
3718  *	unregister_netdevice(y1);
3719  *	unregister_netdevice(y2);
3720  *      ...
3721  *	rtnl_unlock();
3722  *	free_netdev(y1);
3723  *	free_netdev(y2);
3724  *
3725  * We are invoked by rtnl_unlock() after it drops the semaphore.
3726  * This allows us to deal with problems:
3727  * 1) We can delete sysfs objects which invoke hotplug
3728  *    without deadlocking with linkwatch via keventd.
3729  * 2) Since we run with the RTNL semaphore not held, we can sleep
3730  *    safely in order to wait for the netdev refcnt to drop to zero.
3731  */
3732 static DEFINE_MUTEX(net_todo_run_mutex);
3733 void netdev_run_todo(void)
3734 {
3735 	struct list_head list;
3736 
3737 	/* Need to guard against multiple cpu's getting out of order. */
3738 	mutex_lock(&net_todo_run_mutex);
3739 
3740 	/* Not safe to do outside the semaphore.  We must not return
3741 	 * until all unregister events invoked by the local processor
3742 	 * have been completed (either by this todo run, or one on
3743 	 * another cpu).
3744 	 */
3745 	if (list_empty(&net_todo_list))
3746 		goto out;
3747 
3748 	/* Snapshot list, allow later requests */
3749 	spin_lock(&net_todo_list_lock);
3750 	list_replace_init(&net_todo_list, &list);
3751 	spin_unlock(&net_todo_list_lock);
3752 
3753 	while (!list_empty(&list)) {
3754 		struct net_device *dev
3755 			= list_entry(list.next, struct net_device, todo_list);
3756 		list_del(&dev->todo_list);
3757 
3758 		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
3759 			printk(KERN_ERR "network todo '%s' but state %d\n",
3760 			       dev->name, dev->reg_state);
3761 			dump_stack();
3762 			continue;
3763 		}
3764 
3765 		dev->reg_state = NETREG_UNREGISTERED;
3766 
3767 		netdev_wait_allrefs(dev);
3768 
3769 		/* paranoia */
3770 		BUG_ON(atomic_read(&dev->refcnt));
3771 		BUG_TRAP(!dev->ip_ptr);
3772 		BUG_TRAP(!dev->ip6_ptr);
3773 		BUG_TRAP(!dev->dn_ptr);
3774 
3775 		if (dev->destructor)
3776 			dev->destructor(dev);
3777 
3778 		/* Free network device */
3779 		kobject_put(&dev->dev.kobj);
3780 	}
3781 
3782 out:
3783 	mutex_unlock(&net_todo_run_mutex);
3784 }
3785 
3786 static struct net_device_stats *internal_stats(struct net_device *dev)
3787 {
3788 	return &dev->stats;
3789 }
3790 
3791 /**
3792  *	alloc_netdev_mq - allocate network device
3793  *	@sizeof_priv:	size of private data to allocate space for
3794  *	@name:		device name format string
3795  *	@setup:		callback to initialize device
3796  *	@queue_count:	the number of subqueues to allocate
3797  *
3798  *	Allocates a struct net_device with private data area for driver use
3799  *	and performs basic initialization.  Also allocates subquue structs
3800  *	and performs basic initialization.  Also allocates subqueue structs
3801  */
3802 struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
3803 		void (*setup)(struct net_device *), unsigned int queue_count)
3804 {
3805 	void *p;
3806 	struct net_device *dev;
3807 	int alloc_size;
3808 
3809 	BUG_ON(strlen(name) >= sizeof(dev->name));
3810 
3811 	/* ensure 32-byte alignment of both the device and private area */
3812 	alloc_size = (sizeof(*dev) + NETDEV_ALIGN_CONST +
3813 		     (sizeof(struct net_device_subqueue) * (queue_count - 1))) &
3814 		     ~NETDEV_ALIGN_CONST;
3815 	alloc_size += sizeof_priv + NETDEV_ALIGN_CONST;
3816 
3817 	p = kzalloc(alloc_size, GFP_KERNEL);
3818 	if (!p) {
3819 		printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
3820 		return NULL;
3821 	}
3822 
3823 	dev = (struct net_device *)
3824 		(((long)p + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST);
3825 	dev->padded = (char *)dev - (char *)p;
3826 	dev->nd_net = &init_net;
3827 
3828 	if (sizeof_priv) {
3829 		dev->priv = ((char *)dev +
3830 			     ((sizeof(struct net_device) +
3831 			       (sizeof(struct net_device_subqueue) *
3832 				(queue_count - 1)) + NETDEV_ALIGN_CONST)
3833 			      & ~NETDEV_ALIGN_CONST));
3834 	}
3835 
3836 	dev->egress_subqueue_count = queue_count;
3837 
3838 	dev->get_stats = internal_stats;
3839 	netpoll_netdev_init(dev);
3840 	setup(dev);
3841 	strcpy(dev->name, name);
3842 	return dev;
3843 }
3844 EXPORT_SYMBOL(alloc_netdev_mq);
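
/*
 * Note: single-queue drivers normally use the alloc_netdev() wrapper from
 * <linux/netdevice.h>, which expands to alloc_netdev_mq(sizeof_priv, name,
 * setup, 1); multiqueue-aware Ethernet drivers can use e.g.
 * alloc_etherdev_mq(sizeof(struct my_priv), n_queues) instead.
 */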
3845 
3846 /**
3847  *	free_netdev - free network device
3848  *	@dev: device
3849  *
3850  *	This function does the last stage of destroying an allocated device
3851  * 	interface. The reference to the device object is released.
3852  *	If this is the last reference then it will be freed.
3853  */
3854 void free_netdev(struct net_device *dev)
3855 {
3856 	/*  Compatibility with error handling in drivers */
3857 	if (dev->reg_state == NETREG_UNINITIALIZED) {
3858 		kfree((char *)dev - dev->padded);
3859 		return;
3860 	}
3861 
3862 	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
3863 	dev->reg_state = NETREG_RELEASED;
3864 
3865 	/* will free via device release */
3866 	put_device(&dev->dev);
3867 }
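
/*
 * Sketch of the two legitimate ways a hypothetical driver reaches
 * free_netdev() (the names are illustrative):
 *
 *	probe error path, device never registered:
 *		if (register_netdev(dev)) {
 *			free_netdev(dev);	(kfree()d immediately)
 *			...
 *		}
 *
 *	normal removal, after unregistering:
 *		unregister_netdev(dev);
 *		free_netdev(dev);		(released once the last
 *						 reference goes away)
 */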
3868 
3869 /* Synchronize with packet receive processing. */
3870 void synchronize_net(void)
3871 {
3872 	might_sleep();
3873 	synchronize_rcu();
3874 }
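
/*
 * Typical pattern, sketched for a hypothetical caller: unpublish an object
 * that the receive path may still be traversing, let all in-flight readers
 * drain with synchronize_net(), and only then free it:
 *
 *	list_del_rcu(&obj->list);
 *	synchronize_net();
 *	kfree(obj);
 */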
3875 
3876 /**
3877  *	unregister_netdevice - remove device from the kernel
3878  *	@dev: device
3879  *
3880  *	This function shuts down a device interface and removes it
3881  *	from the kernel tables.
3883  *
3884  *	Callers must hold the rtnl semaphore.  You may want
3885  *	unregister_netdev() instead of this.
3886  */
3887 
3888 void unregister_netdevice(struct net_device *dev)
3889 {
3890 	BUG_ON(dev_boot_phase);
3891 	ASSERT_RTNL();
3892 
3893 	/* Some devices call this without having registered, to unwind a failed initialization. */
3894 	if (dev->reg_state == NETREG_UNINITIALIZED) {
3895 		printk(KERN_DEBUG "unregister_netdevice: device %s/%p never "
3896 				  "was registered\n", dev->name, dev);
3897 
3898 		WARN_ON(1);
3899 		return;
3900 	}
3901 
3902 	BUG_ON(dev->reg_state != NETREG_REGISTERED);
3903 
3904 	/* If device is running, close it first. */
3905 	dev_close(dev);
3906 
3907 	/* And unlink it from device chain. */
3908 	unlist_netdevice(dev);
3909 
3910 	dev->reg_state = NETREG_UNREGISTERING;
3911 
3912 	synchronize_net();
3913 
3914 	/* Shutdown queueing discipline. */
3915 	dev_shutdown(dev);
3916 
3917 
3918 	/* Notify protocols that we are about to destroy
3919 	   this device. They should clean up all of their state.
3920 	*/
3921 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
3922 
3923 	/*
3924 	 *	Flush the unicast and multicast chains
3925 	 */
3926 	dev_addr_discard(dev);
3927 
3928 	if (dev->uninit)
3929 		dev->uninit(dev);
3930 
3931 	/* Notifier chain MUST detach us from master device. */
3932 	BUG_TRAP(!dev->master);
3933 
3934 	/* Remove entries from kobject tree */
3935 	netdev_unregister_kobject(dev);
3936 
3937 	/* Finish processing unregister after unlock */
3938 	net_set_todo(dev);
3939 
3940 	synchronize_net();
3941 
3942 	dev_put(dev);
3943 }
3944 
3945 /**
3946  *	unregister_netdev - remove device from the kernel
3947  *	@dev: device
3948  *
3949  *	This function shuts down a device interface and removes it
3950  *	from the kernel tables.
3952  *
3953  *	This is just a wrapper for unregister_netdevice that takes
3954  *	the rtnl semaphore.  In general you want to use this and not
3955  *	unregister_netdevice.
3956  */
3957 void unregister_netdev(struct net_device *dev)
3958 {
3959 	rtnl_lock();
3960 	unregister_netdevice(dev);
3961 	rtnl_unlock();
3962 }
3963 
3964 EXPORT_SYMBOL(unregister_netdev);
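
/*
 * When a hypothetical driver tears down several devices at once it can
 * hold the rtnl semaphore once and call unregister_netdevice() directly,
 * rather than cycling the lock per device as unregister_netdev() does:
 *
 *	rtnl_lock();
 *	for (i = 0; i < nr_foo_devs; i++)
 *		unregister_netdevice(foo_devs[i]);
 *	rtnl_unlock();
 *
 *	for (i = 0; i < nr_foo_devs; i++)
 *		free_netdev(foo_devs[i]);
 *
 * The deferred unregister work runs as part of rtnl_unlock(), so
 * free_netdev() is safe once that call returns.
 */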
3965 
3966 /**
3967  *	dev_change_net_namespace - move device to a different network namespace
3968  *	@dev: device
3969  *	@net: network namespace
3970  *	@pat: If not NULL name pattern to try if the current device name
3971  *	      is already taken in the destination network namespace.
3972  *
3973  *	This function shuts down a device interface and moves it
3974  *	to a new network namespace. On success 0 is returned, on
3975  *	a failure a negative errno code is returned.
3976  *
3977  *	Callers must hold the rtnl semaphore.
3978  */
3979 
3980 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
3981 {
3982 	char buf[IFNAMSIZ];
3983 	const char *destname;
3984 	int err;
3985 
3986 	ASSERT_RTNL();
3987 
3988 	/* Don't allow namespace local devices to be moved. */
3989 	err = -EINVAL;
3990 	if (dev->features & NETIF_F_NETNS_LOCAL)
3991 		goto out;
3992 
3993 	/* Ensure the device has been registered */
3994 	err = -EINVAL;
3995 	if (dev->reg_state != NETREG_REGISTERED)
3996 		goto out;
3997 
3998 	/* Get out if there is nothing to do */
3999 	err = 0;
4000 	if (dev->nd_net == net)
4001 		goto out;
4002 
4003 	/* Pick the destination device name, and ensure
4004 	 * we can use it in the destination network namespace.
4005 	 */
4006 	err = -EEXIST;
4007 	destname = dev->name;
4008 	if (__dev_get_by_name(net, destname)) {
4009 		/* We get here if we can't use the current device name */
4010 		if (!pat)
4011 			goto out;
4012 		if (!dev_valid_name(pat))
4013 			goto out;
4014 		if (strchr(pat, '%')) {
4015 			if (__dev_alloc_name(net, pat, buf) < 0)
4016 				goto out;
4017 			destname = buf;
4018 		} else
4019 			destname = pat;
4020 		if (__dev_get_by_name(net, destname))
4021 			goto out;
4022 	}
4023 
4024 	/*
4025 	 * And now a mini version of register_netdevice and unregister_netdevice.
4026 	 */
4027 
4028 	/* If device is running close it first. */
4029 	/* If device is running, close it first. */
4030 
4031 	/* And unlink it from device chain */
4032 	err = -ENODEV;
4033 	unlist_netdevice(dev);
4034 
4035 	synchronize_net();
4036 
4037 	/* Shutdown queueing discipline. */
4038 	dev_shutdown(dev);
4039 
4040 	/* Notify protocols that we are about to destroy
4041 	   this device. They should clean up all of their state.
4042 	*/
4043 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4044 
4045 	/*
4046 	 *	Flush the unicast and multicast chains
4047 	 */
4048 	dev_addr_discard(dev);
4049 
4050 	/* Actually switch the network namespace */
4051 	dev->nd_net = net;
4052 
4053 	/* Assign the new device name */
4054 	if (destname != dev->name)
4055 		strcpy(dev->name, destname);
4056 
4057 	/* If there is an ifindex conflict assign a new one */
4058 	if (__dev_get_by_index(net, dev->ifindex)) {
4059 		int iflink = (dev->iflink == dev->ifindex);
4060 		dev->ifindex = dev_new_index(net);
4061 		if (iflink)
4062 			dev->iflink = dev->ifindex;
4063 	}
4064 
4065 	/* Fixup kobjects */
4066 	err = device_rename(&dev->dev, dev->name);
4067 	WARN_ON(err);
4068 
4069 	/* Add the device back in the hashes */
4070 	list_netdevice(dev);
4071 
4072 	/* Notify protocols, that a new device appeared. */
4073 	call_netdevice_notifiers(NETDEV_REGISTER, dev);
4074 
4075 	synchronize_net();
4076 	err = 0;
4077 out:
4078 	return err;
4079 }
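
/*
 * Usage sketch (hypothetical caller that already holds the rtnl
 * semaphore): move a device into another namespace, falling back to a
 * "dev%d"-style name if its current name is already taken there:
 *
 *	err = dev_change_net_namespace(dev, other_net, "dev%d");
 *
 * err is 0 on success or a negative errno code; default_device_exit()
 * below relies on exactly this fallback when a namespace is dismantled.
 */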
4080 
4081 static int dev_cpu_callback(struct notifier_block *nfb,
4082 			    unsigned long action,
4083 			    void *ocpu)
4084 {
4085 	struct sk_buff **list_skb;
4086 	struct net_device **list_net;
4087 	struct sk_buff *skb;
4088 	unsigned int cpu, oldcpu = (unsigned long)ocpu;
4089 	struct softnet_data *sd, *oldsd;
4090 
4091 	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
4092 		return NOTIFY_OK;
4093 
4094 	local_irq_disable();
4095 	cpu = smp_processor_id();
4096 	sd = &per_cpu(softnet_data, cpu);
4097 	oldsd = &per_cpu(softnet_data, oldcpu);
4098 
4099 	/* Find end of our completion_queue. */
4100 	list_skb = &sd->completion_queue;
4101 	while (*list_skb)
4102 		list_skb = &(*list_skb)->next;
4103 	/* Append completion queue from offline CPU. */
4104 	*list_skb = oldsd->completion_queue;
4105 	oldsd->completion_queue = NULL;
4106 
4107 	/* Find end of our output_queue. */
4108 	list_net = &sd->output_queue;
4109 	while (*list_net)
4110 		list_net = &(*list_net)->next_sched;
4111 	/* Append output queue from offline CPU. */
4112 	*list_net = oldsd->output_queue;
4113 	oldsd->output_queue = NULL;
4114 
4115 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
4116 	local_irq_enable();
4117 
4118 	/* Process offline CPU's input_pkt_queue */
4119 	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue)))
4120 		netif_rx(skb);
4121 
4122 	return NOTIFY_OK;
4123 }
4124 
4125 #ifdef CONFIG_NET_DMA
4126 /**
4127  * net_dma_rebalance - try to maintain one DMA channel per CPU
4128  * @net_dma: DMA client and associated data (lock, channels, channel_mask)
4129  *
4130  * This is called when the number of channels allocated to the net_dma client
4131  * changes.  The net_dma client tries to have one DMA channel per CPU.
4132  */
4133 
4134 static void net_dma_rebalance(struct net_dma *net_dma)
4135 {
4136 	unsigned int cpu, i, n, chan_idx;
4137 	struct dma_chan *chan;
4138 
4139 	if (cpus_empty(net_dma->channel_mask)) {
4140 		for_each_online_cpu(cpu)
4141 			rcu_assign_pointer(per_cpu(softnet_data, cpu).net_dma, NULL);
4142 		return;
4143 	}
4144 
4145 	i = 0;
4146 	cpu = first_cpu(cpu_online_map);
4147 
4148 	for_each_cpu_mask(chan_idx, net_dma->channel_mask) {
4149 		chan = net_dma->channels[chan_idx];
4150 
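		/* Give each channel an equal share of the online CPUs; the
		 * first (num_online_cpus() % cpus_weight(channel_mask))
		 * channels take one extra CPU to cover the remainder.
		 */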
4151 		n = ((num_online_cpus() / cpus_weight(net_dma->channel_mask))
4152 		   + (i < (num_online_cpus() %
4153 			cpus_weight(net_dma->channel_mask)) ? 1 : 0));
4154 
4155 		while (n) {
4156 			per_cpu(softnet_data, cpu).net_dma = chan;
4157 			cpu = next_cpu(cpu, cpu_online_map);
4158 			n--;
4159 		}
4160 		i++;
4161 	}
4162 }
4163 
4164 /**
4165  * netdev_dma_event - event callback for the net_dma_client
4166  * @client: should always be net_dma_client
4167  * @chan: DMA channel for the event
4168  * @state: DMA state to be handled
4169  */
4170 static enum dma_state_client
4171 netdev_dma_event(struct dma_client *client, struct dma_chan *chan,
4172 	enum dma_state state)
4173 {
4174 	int i, found = 0, pos = -1;
4175 	struct net_dma *net_dma =
4176 		container_of(client, struct net_dma, client);
4177 	enum dma_state_client ack = DMA_DUP; /* default: take no action */
4178 
4179 	spin_lock(&net_dma->lock);
4180 	switch (state) {
4181 	case DMA_RESOURCE_AVAILABLE:
4182 		for (i = 0; i < NR_CPUS; i++)
4183 			if (net_dma->channels[i] == chan) {
4184 				found = 1;
4185 				break;
4186 			} else if (net_dma->channels[i] == NULL && pos < 0)
4187 				pos = i;
4188 
4189 		if (!found && pos >= 0) {
4190 			ack = DMA_ACK;
4191 			net_dma->channels[pos] = chan;
4192 			cpu_set(pos, net_dma->channel_mask);
4193 			net_dma_rebalance(net_dma);
4194 		}
4195 		break;
4196 	case DMA_RESOURCE_REMOVED:
4197 		for (i = 0; i < NR_CPUS; i++)
4198 			if (net_dma->channels[i] == chan) {
4199 				found = 1;
4200 				pos = i;
4201 				break;
4202 			}
4203 
4204 		if (found) {
4205 			ack = DMA_ACK;
4206 			cpu_clear(pos, net_dma->channel_mask);
4207 			net_dma->channels[i] = NULL;
4208 			net_dma_rebalance(net_dma);
4209 		}
4210 		break;
4211 	default:
4212 		break;
4213 	}
4214 	spin_unlock(&net_dma->lock);
4215 
4216 	return ack;
4217 }
4218 
4219 /**
4220  * netdev_dma_register - register the networking subsystem as a DMA client
4221  */
4222 static int __init netdev_dma_register(void)
4223 {
4224 	spin_lock_init(&net_dma.lock);
4225 	dma_cap_set(DMA_MEMCPY, net_dma.client.cap_mask);
4226 	dma_async_client_register(&net_dma.client);
4227 	dma_async_client_chan_request(&net_dma.client);
4228 	return 0;
4229 }
4230 
4231 #else
4232 static int __init netdev_dma_register(void) { return -ENODEV; }
4233 #endif /* CONFIG_NET_DMA */
4234 
4235 /**
4236  *	netdev_compute_features - compute conjunction of two feature sets
4237  *	@all: first feature set
4238  *	@one: second feature set
4239  *
4240  *	Computes a new feature set after adding a device with feature set
4241  *	@one to the master device with current feature set @all.  Returns
4242  *	the new feature set.
4243  */
4244 int netdev_compute_features(unsigned long all, unsigned long one)
4245 {
4246 	/* if device needs checksumming, downgrade to hw checksumming */
4247 	if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
4248 		all ^= NETIF_F_NO_CSUM | NETIF_F_HW_CSUM;
4249 
4250 	/* if device can't do all checksum, downgrade to ipv4/ipv6 */
4251 	if (all & NETIF_F_HW_CSUM && !(one & NETIF_F_HW_CSUM))
4252 		all ^= NETIF_F_HW_CSUM
4253 			| NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM;
4254 
4255 	if (one & NETIF_F_GSO)
4256 		one |= NETIF_F_GSO_SOFTWARE;
4257 	one |= NETIF_F_GSO;
4258 
4259 	/* If even one device supports robust GSO, enable it for all. */
4260 	if (one & NETIF_F_GSO_ROBUST)
4261 		all |= NETIF_F_GSO_ROBUST;
4262 
4263 	all &= one | NETIF_F_LLTX;
4264 
4265 	if (!(all & NETIF_F_ALL_CSUM))
4266 		all &= ~NETIF_F_SG;
4267 	if (!(all & NETIF_F_SG))
4268 		all &= ~NETIF_F_GSO_MASK;
4269 
4270 	return all;
4271 }
4272 EXPORT_SYMBOL(netdev_compute_features);
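
/*
 * Sketch of how a master device (bonding/bridge style; the slave list and
 * field names here are illustrative, not taken from this file) could fold
 * its slaves' capabilities with this helper:
 *
 *	unsigned long features = master->features;
 *
 *	list_for_each_entry(slave, &slaves, list)
 *		features = netdev_compute_features(features,
 *						   slave->dev->features);
 *	master->features = features;
 */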
4273 
4274 static struct hlist_head *netdev_create_hash(void)
4275 {
4276 	int i;
4277 	struct hlist_head *hash;
4278 
4279 	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
4280 	if (hash != NULL)
4281 		for (i = 0; i < NETDEV_HASHENTRIES; i++)
4282 			INIT_HLIST_HEAD(&hash[i]);
4283 
4284 	return hash;
4285 }
4286 
4287 /* Initialize per network namespace state */
4288 static int __net_init netdev_init(struct net *net)
4289 {
4290 	INIT_LIST_HEAD(&net->dev_base_head);
4291 	rwlock_init(&dev_base_lock);
4292 
4293 	net->dev_name_head = netdev_create_hash();
4294 	if (net->dev_name_head == NULL)
4295 		goto err_name;
4296 
4297 	net->dev_index_head = netdev_create_hash();
4298 	if (net->dev_index_head == NULL)
4299 		goto err_idx;
4300 
4301 	return 0;
4302 
4303 err_idx:
4304 	kfree(net->dev_name_head);
4305 err_name:
4306 	return -ENOMEM;
4307 }
4308 
4309 static void __net_exit netdev_exit(struct net *net)
4310 {
4311 	kfree(net->dev_name_head);
4312 	kfree(net->dev_index_head);
4313 }
4314 
4315 static struct pernet_operations __net_initdata netdev_net_ops = {
4316 	.init = netdev_init,
4317 	.exit = netdev_exit,
4318 };
4319 
4320 static void __net_exit default_device_exit(struct net *net)
4321 {
4322 	struct net_device *dev, *next;
4323 	/*
4324 	 * Push all migratable network devices back to the
4325 	 * initial network namespace
4326 	 */
4327 	rtnl_lock();
4328 	for_each_netdev_safe(net, dev, next) {
4329 		int err;
4330 
4331 		/* Ignore unmovable devices (e.g. loopback) */
4332 		if (dev->features & NETIF_F_NETNS_LOCAL)
4333 			continue;
4334 
4335 		/* Push remaining network devices to init_net */
4336 		err = dev_change_net_namespace(dev, &init_net, "dev%d");
4337 		if (err) {
4338 			printk(KERN_WARNING "%s: failed to move %s to init_net: %d\n",
4339 				__func__, dev->name, err);
4340 			unregister_netdevice(dev);
4341 		}
4342 	}
4343 	rtnl_unlock();
4344 }
4345 
4346 static struct pernet_operations __net_initdata default_device_ops = {
4347 	.exit = default_device_exit,
4348 };
4349 
4350 /*
4351  *	Initialize the DEV module. At boot time this walks the device list and
4352  *	unhooks any devices that fail to initialise (normally hardware not
4353  *	present) and leaves us with a valid list of present and active devices.
4354  *
4355  */
4356 
4357 /*
4358  *       This is called single threaded during boot, so no need
4359  *       to take the rtnl semaphore.
4360  */
4361 static int __init net_dev_init(void)
4362 {
4363 	int i, rc = -ENOMEM;
4364 
4365 	BUG_ON(!dev_boot_phase);
4366 
4367 	if (dev_proc_init())
4368 		goto out;
4369 
4370 	if (netdev_kobject_init())
4371 		goto out;
4372 
4373 	INIT_LIST_HEAD(&ptype_all);
4374 	for (i = 0; i < 16; i++)
4375 		INIT_LIST_HEAD(&ptype_base[i]);
4376 
4377 	if (register_pernet_subsys(&netdev_net_ops))
4378 		goto out;
4379 
4380 	if (register_pernet_device(&default_device_ops))
4381 		goto out;
4382 
4383 	/*
4384 	 *	Initialise the packet receive queues.
4385 	 */
4386 
4387 	for_each_possible_cpu(i) {
4388 		struct softnet_data *queue;
4389 
4390 		queue = &per_cpu(softnet_data, i);
4391 		skb_queue_head_init(&queue->input_pkt_queue);
4392 		queue->completion_queue = NULL;
4393 		INIT_LIST_HEAD(&queue->poll_list);
4394 
4395 		queue->backlog.poll = process_backlog;
4396 		queue->backlog.weight = weight_p;
4397 	}
4398 
4399 	netdev_dma_register();
4400 
4401 	dev_boot_phase = 0;
4402 
4403 	open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL);
4404 	open_softirq(NET_RX_SOFTIRQ, net_rx_action, NULL);
4405 
4406 	hotcpu_notifier(dev_cpu_callback, 0);
4407 	dst_init();
4408 	dev_mcast_init();
4409 	rc = 0;
4410 out:
4411 	return rc;
4412 }
4413 
4414 subsys_initcall(net_dev_init);
4415 
4416 EXPORT_SYMBOL(__dev_get_by_index);
4417 EXPORT_SYMBOL(__dev_get_by_name);
4418 EXPORT_SYMBOL(__dev_remove_pack);
4419 EXPORT_SYMBOL(dev_valid_name);
4420 EXPORT_SYMBOL(dev_add_pack);
4421 EXPORT_SYMBOL(dev_alloc_name);
4422 EXPORT_SYMBOL(dev_close);
4423 EXPORT_SYMBOL(dev_get_by_flags);
4424 EXPORT_SYMBOL(dev_get_by_index);
4425 EXPORT_SYMBOL(dev_get_by_name);
4426 EXPORT_SYMBOL(dev_open);
4427 EXPORT_SYMBOL(dev_queue_xmit);
4428 EXPORT_SYMBOL(dev_remove_pack);
4429 EXPORT_SYMBOL(dev_set_allmulti);
4430 EXPORT_SYMBOL(dev_set_promiscuity);
4431 EXPORT_SYMBOL(dev_change_flags);
4432 EXPORT_SYMBOL(dev_set_mtu);
4433 EXPORT_SYMBOL(dev_set_mac_address);
4434 EXPORT_SYMBOL(free_netdev);
4435 EXPORT_SYMBOL(netdev_boot_setup_check);
4436 EXPORT_SYMBOL(netdev_set_master);
4437 EXPORT_SYMBOL(netdev_state_change);
4438 EXPORT_SYMBOL(netif_receive_skb);
4439 EXPORT_SYMBOL(netif_rx);
4440 EXPORT_SYMBOL(register_gifconf);
4441 EXPORT_SYMBOL(register_netdevice);
4442 EXPORT_SYMBOL(register_netdevice_notifier);
4443 EXPORT_SYMBOL(skb_checksum_help);
4444 EXPORT_SYMBOL(synchronize_net);
4445 EXPORT_SYMBOL(unregister_netdevice);
4446 EXPORT_SYMBOL(unregister_netdevice_notifier);
4447 EXPORT_SYMBOL(net_enable_timestamp);
4448 EXPORT_SYMBOL(net_disable_timestamp);
4449 EXPORT_SYMBOL(dev_get_flags);
4450 
4451 #if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
4452 EXPORT_SYMBOL(br_handle_frame_hook);
4453 EXPORT_SYMBOL(br_fdb_get_hook);
4454 EXPORT_SYMBOL(br_fdb_put_hook);
4455 #endif
4456 
4457 #ifdef CONFIG_KMOD
4458 EXPORT_SYMBOL(dev_load);
4459 #endif
4460 
4461 EXPORT_PER_CPU_SYMBOL(softnet_data);
4462