/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		PACKET - implements raw packet sockets.
 *
 * Version:	$Id: af_packet.c,v 1.61 2002/02/08 03:57:19 davem Exp $
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *
 * Fixes:
 *		Alan Cox	:	verify_area() now used correctly
 *		Alan Cox	:	new skbuff lists, look ma no backlogs!
 *		Alan Cox	:	tidied skbuff lists.
 *		Alan Cox	:	Now uses generic datagram routines I
 *					added. Also fixed the peek/read crash
 *					from all old Linux datagram code.
 *		Alan Cox	:	Uses the improved datagram code.
 *		Alan Cox	:	Added NULL's for socket options.
 *		Alan Cox	:	Re-commented the code.
 *		Alan Cox	:	Use new kernel side addressing
 *		Rob Janssen	:	Correct MTU usage.
 *		Dave Platt	:	Counter leaks caused by incorrect
 *					interrupt locking and some slightly
 *					dubious gcc output. Can you read
 *					compiler: it said _VOLATILE_
 *	Richard Kooijman	:	Timestamp fixes.
 *		Alan Cox	:	New buffers. Use sk->mac.raw.
 *		Alan Cox	:	sendmsg/recvmsg support.
 *		Alan Cox	:	Protocol setting support
 *	Alexey Kuznetsov	:	Untied from IPv4 stack.
 *	Cyrus Durgin		:	Fixed kerneld for kmod.
 *	Michal Ostrowski	:	Module initialization cleanup.
 *	Ulises Alonso		:	Frame number limit removal and
 *					packet_set_ring memory leak.
 *	Eric Biederman		:	Allow for > 8 byte hardware addresses.
 *					The convention is that longer addresses
 *					will simply extend the hardware address
 *					byte arrays at the end of sockaddr_ll
 *					and packet_mreq.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 */

#include <linux/types.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/capability.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/wireless.h>
#include <linux/kmod.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/ioctls.h>
#include <asm/page.h>
#include <asm/cacheflush.h>
#include <asm/io.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/module.h>
#include <linux/init.h>

#ifdef CONFIG_INET
#include <net/inet_common.h>
#endif

#define CONFIG_SOCK_PACKET	1

/*
   Proposed replacement for SIOC{ADD,DEL}MULTI and
   IFF_PROMISC, IFF_ALLMULTI flags.

   It is more expensive, but I believe it is really the correct
   solution: reentrant, safe and fault tolerant.

   IFF_PROMISC/IFF_ALLMULTI/SIOC{ADD/DEL}MULTI are faked by keeping
   a reference count and a global flag, so that the real status is
   (gflag|(count != 0)); this way we can use the obsolete, faulty
   interface without harming clever users.
 */
#define CONFIG_PACKET_MULTICAST	1
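
/*
 * A minimal sketch of the refcount-plus-flag idea above (illustrative
 * only, not part of this file; the names dev_promisc_count and
 * dev_gflag_promisc are hypothetical):
 *
 *	static int dev_promisc_count;	// per-socket references
 *	static int dev_gflag_promisc;	// obsolete global flag
 *
 *	static int dev_promisc_effective(void)
 *	{
 *		// the real status is the OR of the old flag and the refcount
 *		return dev_gflag_promisc | (dev_promisc_count != 0);
 *	}
 */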

/*
   Assumptions:
   - if device has no dev->hard_header routine, it adds and removes ll header
     inside itself. In this case ll header is invisible outside of device,
     but higher levels still should reserve dev->hard_header_len.
     Some devices are clever enough to reallocate skb, when the header
     will not fit into the reserved space (tunnel); others are silly
     (PPP).
   - packet socket receives packets with pulled ll header,
     so that SOCK_RAW should push it back.

On receive:
-----------

Incoming, dev->hard_header!=NULL
   mac.raw -> ll header
   data    -> data

Outgoing, dev->hard_header!=NULL
   mac.raw -> ll header
   data    -> ll header

Incoming, dev->hard_header==NULL
   mac.raw -> UNKNOWN position. It is very likely that it points to ll header.
              PPP does this, which is wrong, because it introduces asymmetry
	      between rx and tx paths.
   data    -> data

Outgoing, dev->hard_header==NULL
   mac.raw -> data. ll header is still not built!
   data    -> data

Summary
  If dev->hard_header==NULL we are unlikely to restore a sensible ll header.


On transmit:
------------

dev->hard_header != NULL
   mac.raw -> ll header
   data    -> ll header

dev->hard_header == NULL (ll header is added by device, we cannot control it)
   mac.raw -> data
   data -> data

   We should set nh.raw on output to the correct position,
   the packet classifier depends on it.
 */
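
/*
 * The table above is what packet_rcv() below relies on: for SOCK_RAW on a
 * dev->hard_header device, the pulled ll header is pushed back with
 * skb_push(skb, skb->data - skb->mac.raw). A minimal sketch of that pointer
 * arithmetic (illustrative, with a made-up Ethernet offset):
 *
 *	// incoming frame: mac.raw points at the ll header,
 *	// data points hdr_len bytes past it
 *	unsigned int hdr_len = skb->data - skb->mac.raw;	// e.g. 14
 *	skb_push(skb, hdr_len);		// now skb->data == skb->mac.raw
 */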

/* List of all packet sockets. */
static HLIST_HEAD(packet_sklist);
static DEFINE_RWLOCK(packet_sklist_lock);

static atomic_t packet_socks_nr;


/* Private packet socket structures. */

#ifdef CONFIG_PACKET_MULTICAST
struct packet_mclist
{
	struct packet_mclist	*next;
	int			ifindex;
	int			count;
	unsigned short		type;
	unsigned short		alen;
	unsigned char		addr[MAX_ADDR_LEN];
};
/* identical to struct packet_mreq except it has
 * a longer address field.
 */
struct packet_mreq_max
{
	int		mr_ifindex;
	unsigned short	mr_type;
	unsigned short	mr_alen;
	unsigned char	mr_address[MAX_ADDR_LEN];
};
#endif
#ifdef CONFIG_PACKET_MMAP
static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing);
#endif

static void packet_flush_mclist(struct sock *sk);

struct packet_sock {
	/* struct sock has to be the first member of packet_sock */
	struct sock		sk;
	struct tpacket_stats	stats;
#ifdef CONFIG_PACKET_MMAP
	char *			*pg_vec;
	unsigned int		head;
	unsigned int		frames_per_block;
	unsigned int		frame_size;
	unsigned int		frame_max;
	int			copy_thresh;
#endif
	struct packet_type	prot_hook;
	spinlock_t		bind_lock;
	char			running;	/* prot_hook is attached*/
	int			ifindex;	/* bound device		*/
	__be16			num;
#ifdef CONFIG_PACKET_MULTICAST
	struct packet_mclist	*mclist;
#endif
#ifdef CONFIG_PACKET_MMAP
	atomic_t		mapped;
	unsigned int		pg_vec_order;
	unsigned int		pg_vec_pages;
	unsigned int		pg_vec_len;
#endif
};

#ifdef CONFIG_PACKET_MMAP

static inline char *packet_lookup_frame(struct packet_sock *po, unsigned int position)
{
	unsigned int pg_vec_pos, frame_offset;
	char *frame;

	pg_vec_pos = position / po->frames_per_block;
	frame_offset = position % po->frames_per_block;

	frame = po->pg_vec[pg_vec_pos] + (frame_offset * po->frame_size);

	return frame;
}
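
/*
 * Worked example for packet_lookup_frame() (illustrative numbers): with
 * frames_per_block = 4 and frame_size = 2048, position 10 maps to
 * pg_vec_pos = 10 / 4 = 2 and frame_offset = 10 % 4 = 2, i.e. the frame
 * lives at pg_vec[2] + 2 * 2048.
 */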
#endif

static inline struct packet_sock *pkt_sk(struct sock *sk)
{
	return (struct packet_sock *)sk;
}

static void packet_sock_destruct(struct sock *sk)
{
	BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc));
	BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc));

	if (!sock_flag(sk, SOCK_DEAD)) {
		printk("Attempt to release alive packet socket: %p\n", sk);
		return;
	}

	atomic_dec(&packet_socks_nr);
#ifdef PACKET_REFCNT_DEBUG
	printk(KERN_DEBUG "PACKET socket %p is free, %d are alive\n", sk, atomic_read(&packet_socks_nr));
#endif
}


static const struct proto_ops packet_ops;

#ifdef CONFIG_SOCK_PACKET
static const struct proto_ops packet_ops_spkt;

static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,  struct packet_type *pt, struct net_device *orig_dev)
{
	struct sock *sk;
	struct sockaddr_pkt *spkt;

	/*
	 *	When we registered the protocol we saved the socket in the data
	 *	field for just this event.
	 */

	sk = pt->af_packet_priv;

	/*
	 *	Yank back the headers [hope the device set this
	 *	right or kerboom...]
	 *
	 *	Incoming packets have the ll header pulled,
	 *	push it back.
	 *
	 *	For outgoing ones skb->data == skb->mac.raw
	 *	so that this procedure is a noop.
	 */

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto out;

	if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
		goto oom;

	/* drop any routing info */
	dst_release(skb->dst);
	skb->dst = NULL;

	/* drop conntrack reference */
	nf_reset(skb);

	spkt = (struct sockaddr_pkt*)skb->cb;

	skb_push(skb, skb->data-skb->mac.raw);

	/*
	 *	The SOCK_PACKET socket receives _all_ frames.
	 */

	spkt->spkt_family = dev->type;
	strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
	spkt->spkt_protocol = skb->protocol;

	/*
	 *	Charge the memory to the socket. This is done specifically
	 *	to prevent sockets using all the memory up.
	 */

	if (sock_queue_rcv_skb(sk,skb) == 0)
		return 0;

out:
	kfree_skb(skb);
oom:
	return 0;
}


/*
 *	Output a raw packet to a device layer. This bypasses all the other
 *	protocol layers and you must therefore supply it with a complete frame.
 */

static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
			       struct msghdr *msg, size_t len)
{
	struct sock *sk = sock->sk;
	struct sockaddr_pkt *saddr=(struct sockaddr_pkt *)msg->msg_name;
	struct sk_buff *skb;
	struct net_device *dev;
	__be16 proto=0;
	int err;

	/*
	 *	Get and verify the address.
	 */

	if (saddr)
	{
		if (msg->msg_namelen < sizeof(struct sockaddr))
			return(-EINVAL);
		if (msg->msg_namelen==sizeof(struct sockaddr_pkt))
			proto=saddr->spkt_protocol;
	}
	else
		return(-ENOTCONN);	/* SOCK_PACKET must be sent giving an address */

	/*
	 *	Find the device first to size check it
	 */

	saddr->spkt_device[13] = 0;
	dev = dev_get_by_name(saddr->spkt_device);
	err = -ENODEV;
	if (dev == NULL)
		goto out_unlock;

	err = -ENETDOWN;
	if (!(dev->flags & IFF_UP))
		goto out_unlock;

	/*
	 *	You may not queue a frame bigger than the mtu. This is the lowest level
	 *	raw protocol and you must do your own fragmentation at this level.
	 */

	err = -EMSGSIZE;
	if (len > dev->mtu + dev->hard_header_len)
		goto out_unlock;

	err = -ENOBUFS;
	skb = sock_wmalloc(sk, len + LL_RESERVED_SPACE(dev), 0, GFP_KERNEL);

	/*
	 *	If the write buffer is full, then tough. At this level the user gets to
	 *	deal with the problem - do your own algorithmic backoffs. That's far
	 *	more flexible.
	 */

	if (skb == NULL)
		goto out_unlock;

	/*
	 *	Fill it in
	 */

	/* FIXME: Save some space for broken drivers that write a
	 * hard header at transmission time by themselves. PPP is the
	 * notable one here. This should really be fixed at the driver level.
	 */
	skb_reserve(skb, LL_RESERVED_SPACE(dev));
	skb->nh.raw = skb->data;

	/* Try to align data part correctly */
	if (dev->hard_header) {
		skb->data -= dev->hard_header_len;
		skb->tail -= dev->hard_header_len;
		if (len < dev->hard_header_len)
			skb->nh.raw = skb->data;
	}

	/* Returns -EFAULT on error */
	err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
	skb->protocol = proto;
	skb->dev = dev;
	skb->priority = sk->sk_priority;
	if (err)
		goto out_free;

	/*
	 *	Now send it
	 */

	dev_queue_xmit(skb);
	dev_put(dev);
	return(len);

out_free:
	kfree_skb(skb);
out_unlock:
	if (dev)
		dev_put(dev);
	return err;
}
#endif
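
/*
 * A minimal user-space sketch of driving the SOCK_PACKET send path above
 * (illustrative only; "eth0" is a placeholder, error handling is omitted,
 * and the frame must already be complete as the comment above requires):
 *
 *	#include <sys/socket.h>
 *	#include <linux/if_packet.h>
 *	#include <linux/if_ether.h>
 *	#include <arpa/inet.h>
 *	#include <string.h>
 *
 *	int send_raw_frame(int fd, const void *frame, size_t len)
 *	{
 *		struct sockaddr_pkt spkt;
 *
 *		memset(&spkt, 0, sizeof(spkt));
 *		spkt.spkt_family = AF_PACKET;
 *		strncpy((char *)spkt.spkt_device, "eth0",
 *			sizeof(spkt.spkt_device));
 *		spkt.spkt_protocol = htons(ETH_P_ALL);
 *		// frame: dst MAC, src MAC, ethertype, payload
 *		return sendto(fd, frame, len, 0,
 *			      (struct sockaddr *)&spkt, sizeof(spkt));
 *	}
 */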

static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
				      unsigned int res)
{
	struct sk_filter *filter;

	rcu_read_lock_bh();
	filter = rcu_dereference(sk->sk_filter);
	if (filter != NULL)
		res = sk_run_filter(skb, filter->insns, filter->len);
	rcu_read_unlock_bh();

	return res;
}
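
/*
 * The filter consulted above is installed from user space with the classic
 * BPF setsockopt. A minimal sketch (illustrative; this one-instruction
 * program accepts every packet up to 0xffff bytes):
 *
 *	#include <sys/socket.h>
 *	#include <linux/filter.h>
 *
 *	static struct sock_filter accept_all[] = {
 *		{ 0x06, 0, 0, 0x0000ffff },	// BPF_RET | BPF_K
 *	};
 *
 *	static int attach_filter(int fd)
 *	{
 *		struct sock_fprog prog = {
 *			.len	= 1,
 *			.filter	= accept_all,
 *		};
 *		return setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER,
 *				  &prog, sizeof(prog));
 *	}
 */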

/*
   This function performs lazy skb cloning in the hope that most packets
   are discarded by BPF.

   Note the tricky part: we DO mangle a shared skb! skb->data, skb->len
   and skb->cb are mangled. It works because (and until) packets
   falling here are owned by the current CPU. Output packets are cloned
   by dev_queue_xmit_nit(), input packets are processed by net_bh
   sequentially, so that if we return the skb to its original state on
   exit, we will not harm anyone.
 */

static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
	struct sock *sk;
	struct sockaddr_ll *sll;
	struct packet_sock *po;
	u8 * skb_head = skb->data;
	int skb_len = skb->len;
	unsigned int snaplen, res;

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto drop;

	sk = pt->af_packet_priv;
	po = pkt_sk(sk);

	skb->dev = dev;

	if (dev->hard_header) {
		/* The device has an explicit notion of ll header,
		   exported to higher levels.

		   Otherwise, the device hides details of its frame
		   structure, so that the corresponding packet head
		   is never delivered to the user.
		 */
		if (sk->sk_type != SOCK_DGRAM)
			skb_push(skb, skb->data - skb->mac.raw);
		else if (skb->pkt_type == PACKET_OUTGOING) {
			/* Special case: outgoing packets have ll header at head */
			skb_pull(skb, skb->nh.raw - skb->data);
		}
	}

	snaplen = skb->len;

	res = run_filter(skb, sk, snaplen);
	if (!res)
		goto drop_n_restore;
	if (snaplen > res)
		snaplen = res;

	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
	    (unsigned)sk->sk_rcvbuf)
		goto drop_n_acct;

	if (skb_shared(skb)) {
		struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
		if (nskb == NULL)
			goto drop_n_acct;

		if (skb_head != skb->data) {
			skb->data = skb_head;
			skb->len = skb_len;
		}
		kfree_skb(skb);
		skb = nskb;
	}

	sll = (struct sockaddr_ll*)skb->cb;
	sll->sll_family = AF_PACKET;
	sll->sll_hatype = dev->type;
	sll->sll_protocol = skb->protocol;
	sll->sll_pkttype = skb->pkt_type;
	sll->sll_ifindex = dev->ifindex;
	sll->sll_halen = 0;

	if (dev->hard_header_parse)
		sll->sll_halen = dev->hard_header_parse(skb, sll->sll_addr);

	if (pskb_trim(skb, snaplen))
		goto drop_n_acct;

	skb_set_owner_r(skb, sk);
	skb->dev = NULL;
	dst_release(skb->dst);
	skb->dst = NULL;

	/* drop conntrack reference */
	nf_reset(skb);

	spin_lock(&sk->sk_receive_queue.lock);
	po->stats.tp_packets++;
	__skb_queue_tail(&sk->sk_receive_queue, skb);
	spin_unlock(&sk->sk_receive_queue.lock);
	sk->sk_data_ready(sk, skb->len);
	return 0;

drop_n_acct:
	spin_lock(&sk->sk_receive_queue.lock);
	po->stats.tp_drops++;
	spin_unlock(&sk->sk_receive_queue.lock);

drop_n_restore:
	if (skb_head != skb->data && skb_shared(skb)) {
		skb->data = skb_head;
		skb->len = skb_len;
	}
drop:
	kfree_skb(skb);
	return 0;
}

#ifdef CONFIG_PACKET_MMAP
static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
	struct sock *sk;
	struct packet_sock *po;
	struct sockaddr_ll *sll;
	struct tpacket_hdr *h;
	u8 * skb_head = skb->data;
	int skb_len = skb->len;
	unsigned int snaplen, res;
	unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
	unsigned short macoff, netoff;
	struct sk_buff *copy_skb = NULL;

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto drop;

	sk = pt->af_packet_priv;
	po = pkt_sk(sk);

	if (dev->hard_header) {
		if (sk->sk_type != SOCK_DGRAM)
			skb_push(skb, skb->data - skb->mac.raw);
		else if (skb->pkt_type == PACKET_OUTGOING) {
			/* Special case: outgoing packets have ll header at head */
			skb_pull(skb, skb->nh.raw - skb->data);
			if (skb->ip_summed == CHECKSUM_PARTIAL)
				status |= TP_STATUS_CSUMNOTREADY;
		}
	}

	snaplen = skb->len;

	res = run_filter(skb, sk, snaplen);
	if (!res)
		goto drop_n_restore;
	if (snaplen > res)
		snaplen = res;

	if (sk->sk_type == SOCK_DGRAM) {
		macoff = netoff = TPACKET_ALIGN(TPACKET_HDRLEN) + 16;
	} else {
		unsigned maclen = skb->nh.raw - skb->data;
		netoff = TPACKET_ALIGN(TPACKET_HDRLEN + (maclen < 16 ? 16 : maclen));
		macoff = netoff - maclen;
	}

	if (macoff + snaplen > po->frame_size) {
		if (po->copy_thresh &&
		    atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
		    (unsigned)sk->sk_rcvbuf) {
			if (skb_shared(skb)) {
				copy_skb = skb_clone(skb, GFP_ATOMIC);
			} else {
				copy_skb = skb_get(skb);
				skb_head = skb->data;
			}
			if (copy_skb)
				skb_set_owner_r(copy_skb, sk);
		}
		snaplen = po->frame_size - macoff;
		if ((int)snaplen < 0)
			snaplen = 0;
	}

	spin_lock(&sk->sk_receive_queue.lock);
	h = (struct tpacket_hdr *)packet_lookup_frame(po, po->head);

	if (h->tp_status)
		goto ring_is_full;
	po->head = po->head != po->frame_max ? po->head+1 : 0;
	po->stats.tp_packets++;
	if (copy_skb) {
		status |= TP_STATUS_COPY;
		__skb_queue_tail(&sk->sk_receive_queue, copy_skb);
	}
	if (!po->stats.tp_drops)
		status &= ~TP_STATUS_LOSING;
	spin_unlock(&sk->sk_receive_queue.lock);

	skb_copy_bits(skb, 0, (u8*)h + macoff, snaplen);

	h->tp_len = skb->len;
	h->tp_snaplen = snaplen;
	h->tp_mac = macoff;
	h->tp_net = netoff;
	if (skb->tstamp.off_sec == 0) {
		__net_timestamp(skb);
		sock_enable_timestamp(sk);
	}
	h->tp_sec = skb->tstamp.off_sec;
	h->tp_usec = skb->tstamp.off_usec;

	sll = (struct sockaddr_ll*)((u8*)h + TPACKET_ALIGN(sizeof(*h)));
	sll->sll_halen = 0;
	if (dev->hard_header_parse)
		sll->sll_halen = dev->hard_header_parse(skb, sll->sll_addr);
	sll->sll_family = AF_PACKET;
	sll->sll_hatype = dev->type;
	sll->sll_protocol = skb->protocol;
	sll->sll_pkttype = skb->pkt_type;
	sll->sll_ifindex = dev->ifindex;

	h->tp_status = status;
	smp_mb();

	{
		struct page *p_start, *p_end;
		u8 *h_end = (u8 *)h + macoff + snaplen - 1;

		p_start = virt_to_page(h);
		p_end = virt_to_page(h_end);
		while (p_start <= p_end) {
			flush_dcache_page(p_start);
			p_start++;
		}
	}

	sk->sk_data_ready(sk, 0);

drop_n_restore:
	if (skb_head != skb->data && skb_shared(skb)) {
		skb->data = skb_head;
		skb->len = skb_len;
	}
drop:
	kfree_skb(skb);
	return 0;

ring_is_full:
	po->stats.tp_drops++;
	spin_unlock(&sk->sk_receive_queue.lock);

	sk->sk_data_ready(sk, 0);
	if (copy_skb)
		kfree_skb(copy_skb);
	goto drop_n_restore;
}

#endif
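
/*
 * User space consumes the ring filled by tpacket_rcv() above by polling
 * tp_status in each mmapped frame. A minimal consumer sketch (illustrative;
 * frame_ptr(), handle_frame(), ring, idx, frame_count and pfd, a struct
 * pollfd for the socket, are all hypothetical):
 *
 *	struct tpacket_hdr *hdr = frame_ptr(ring, idx);
 *
 *	while (!(hdr->tp_status & TP_STATUS_USER))
 *		poll(&pfd, 1, -1);		// wait for the kernel
 *	// frame data starts tp_mac bytes into the frame
 *	handle_frame((char *)hdr + hdr->tp_mac, hdr->tp_snaplen);
 *	hdr->tp_status = TP_STATUS_KERNEL;	// hand the frame back
 *	idx = (idx + 1) % frame_count;
 */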


static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
			  struct msghdr *msg, size_t len)
{
	struct sock *sk = sock->sk;
	struct sockaddr_ll *saddr=(struct sockaddr_ll *)msg->msg_name;
	struct sk_buff *skb;
	struct net_device *dev;
	__be16 proto;
	unsigned char *addr;
	int ifindex, err, reserve = 0;

	/*
	 *	Get and verify the address.
	 */

	if (saddr == NULL) {
		struct packet_sock *po = pkt_sk(sk);

		ifindex	= po->ifindex;
		proto	= po->num;
		addr	= NULL;
	} else {
		err = -EINVAL;
		if (msg->msg_namelen < sizeof(struct sockaddr_ll))
			goto out;
		if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
			goto out;
		ifindex	= saddr->sll_ifindex;
		proto	= saddr->sll_protocol;
		addr	= saddr->sll_addr;
	}


	dev = dev_get_by_index(ifindex);
	err = -ENXIO;
	if (dev == NULL)
		goto out_unlock;
	if (sock->type == SOCK_RAW)
		reserve = dev->hard_header_len;

	err = -ENETDOWN;
	if (!(dev->flags & IFF_UP))
		goto out_unlock;

	err = -EMSGSIZE;
	if (len > dev->mtu+reserve)
		goto out_unlock;

	skb = sock_alloc_send_skb(sk, len + LL_RESERVED_SPACE(dev),
				msg->msg_flags & MSG_DONTWAIT, &err);
	if (skb==NULL)
		goto out_unlock;

	skb_reserve(skb, LL_RESERVED_SPACE(dev));
	skb->nh.raw = skb->data;

	if (dev->hard_header) {
		int res;
		err = -EINVAL;
		res = dev->hard_header(skb, dev, ntohs(proto), addr, NULL, len);
		if (sock->type != SOCK_DGRAM) {
			skb->tail = skb->data;
			skb->len = 0;
		} else if (res < 0)
			goto out_free;
	}

	/* Returns -EFAULT on error */
	err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
	if (err)
		goto out_free;

	skb->protocol = proto;
	skb->dev = dev;
	skb->priority = sk->sk_priority;

	/*
	 *	Now send it
	 */

	err = dev_queue_xmit(skb);
	if (err > 0 && (err = net_xmit_errno(err)) != 0)
		goto out_unlock;

	dev_put(dev);

	return(len);

out_free:
	kfree_skb(skb);
out_unlock:
	if (dev)
		dev_put(dev);
out:
	return err;
}

/*
 *	Close a PACKET socket. This is fairly simple. We immediately go
 *	to 'closed' state and remove our protocol entry in the device list.
 */

static int packet_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po;

	if (!sk)
		return 0;

	po = pkt_sk(sk);

	write_lock_bh(&packet_sklist_lock);
	sk_del_node_init(sk);
	write_unlock_bh(&packet_sklist_lock);

	/*
	 *	Unhook packet receive handler.
	 */

	if (po->running) {
		/*
		 *	Remove the protocol hook
		 */
		dev_remove_pack(&po->prot_hook);
		po->running = 0;
		po->num = 0;
		__sock_put(sk);
	}

#ifdef CONFIG_PACKET_MULTICAST
	packet_flush_mclist(sk);
#endif

#ifdef CONFIG_PACKET_MMAP
	if (po->pg_vec) {
		struct tpacket_req req;
		memset(&req, 0, sizeof(req));
		packet_set_ring(sk, &req, 1);
	}
#endif

	/*
	 *	Now the socket is dead. No more input will appear.
	 */

	sock_orphan(sk);
	sock->sk = NULL;

	/* Purge queues */

	skb_queue_purge(&sk->sk_receive_queue);

	sock_put(sk);
	return 0;
}

/*
 *	Attach a packet hook.
 */

static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
{
	struct packet_sock *po = pkt_sk(sk);
	/*
	 *	Detach an existing hook if present.
	 */

	lock_sock(sk);

	spin_lock(&po->bind_lock);
	if (po->running) {
		__sock_put(sk);
		po->running = 0;
		po->num = 0;
		spin_unlock(&po->bind_lock);
		dev_remove_pack(&po->prot_hook);
		spin_lock(&po->bind_lock);
	}

	po->num = protocol;
	po->prot_hook.type = protocol;
	po->prot_hook.dev = dev;

	po->ifindex = dev ? dev->ifindex : 0;

	if (protocol == 0)
		goto out_unlock;

	if (dev) {
		if (dev->flags&IFF_UP) {
			dev_add_pack(&po->prot_hook);
			sock_hold(sk);
			po->running = 1;
		} else {
			sk->sk_err = ENETDOWN;
			if (!sock_flag(sk, SOCK_DEAD))
				sk->sk_error_report(sk);
		}
	} else {
		dev_add_pack(&po->prot_hook);
		sock_hold(sk);
		po->running = 1;
	}

out_unlock:
	spin_unlock(&po->bind_lock);
	release_sock(sk);
	return 0;
}

/*
 *	Bind a packet socket to a device
 */

#ifdef CONFIG_SOCK_PACKET

static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
	struct sock *sk=sock->sk;
	char name[15];
	struct net_device *dev;
	int err = -ENODEV;

	/*
	 *	Check legality
	 */

	if (addr_len != sizeof(struct sockaddr))
		return -EINVAL;
	strlcpy(name,uaddr->sa_data,sizeof(name));

	dev = dev_get_by_name(name);
	if (dev) {
		err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
		dev_put(dev);
	}
	return err;
}
#endif

static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr;
	struct sock *sk=sock->sk;
	struct net_device *dev = NULL;
	int err;


	/*
	 *	Check legality
	 */

	if (addr_len < sizeof(struct sockaddr_ll))
		return -EINVAL;
	if (sll->sll_family != AF_PACKET)
		return -EINVAL;

	if (sll->sll_ifindex) {
		err = -ENODEV;
		dev = dev_get_by_index(sll->sll_ifindex);
		if (dev == NULL)
			goto out;
	}
	err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
	if (dev)
		dev_put(dev);

out:
	return err;
}

static struct proto packet_proto = {
	.name	  = "PACKET",
	.owner	  = THIS_MODULE,
	.obj_size = sizeof(struct packet_sock),
};

/*
 *	Create a packet socket (SOCK_RAW, SOCK_DGRAM, or SOCK_PACKET).
 */

static int packet_create(struct socket *sock, int protocol)
{
	struct sock *sk;
	struct packet_sock *po;
	__be16 proto = (__force __be16)protocol; /* weird, but documented */
	int err;

	if (!capable(CAP_NET_RAW))
		return -EPERM;
	if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW
#ifdef CONFIG_SOCK_PACKET
	    && sock->type != SOCK_PACKET
#endif
	    )
		return -ESOCKTNOSUPPORT;

	sock->state = SS_UNCONNECTED;

	err = -ENOBUFS;
	sk = sk_alloc(PF_PACKET, GFP_KERNEL, &packet_proto, 1);
	if (sk == NULL)
		goto out;

	sock->ops = &packet_ops;
#ifdef CONFIG_SOCK_PACKET
	if (sock->type == SOCK_PACKET)
		sock->ops = &packet_ops_spkt;
#endif
	sock_init_data(sock, sk);

	po = pkt_sk(sk);
	sk->sk_family = PF_PACKET;
	po->num = proto;

	sk->sk_destruct = packet_sock_destruct;
	atomic_inc(&packet_socks_nr);

	/*
	 *	Attach a protocol block
	 */

	spin_lock_init(&po->bind_lock);
	po->prot_hook.func = packet_rcv;
#ifdef CONFIG_SOCK_PACKET
	if (sock->type == SOCK_PACKET)
		po->prot_hook.func = packet_rcv_spkt;
#endif
	po->prot_hook.af_packet_priv = sk;

	if (proto) {
		po->prot_hook.type = proto;
		dev_add_pack(&po->prot_hook);
		sock_hold(sk);
		po->running = 1;
	}

	write_lock_bh(&packet_sklist_lock);
	sk_add_node(sk, &packet_sklist);
	write_unlock_bh(&packet_sklist_lock);
	return(0);
out:
	return err;
}
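
/*
 * A minimal user-space sketch of what reaches packet_create() (illustrative
 * only; requires CAP_NET_RAW):
 *
 *	#include <sys/socket.h>
 *	#include <linux/if_packet.h>
 *	#include <linux/if_ether.h>
 *	#include <arpa/inet.h>
 *
 *	// SOCK_RAW: frames include the link-level header;
 *	// SOCK_DGRAM delivers/accepts payload only.
 *	int fd = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 */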

/*
 *	Pull a packet from our receive queue and hand it to the user.
 *	If necessary we block.
 */

static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
			  struct msghdr *msg, size_t len, int flags)
{
	struct sock *sk = sock->sk;
	struct sk_buff *skb;
	int copied, err;
	struct sockaddr_ll *sll;

	err = -EINVAL;
	if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT))
		goto out;

#if 0
	/* What error should we return now? EUNATTACH? */
	if (pkt_sk(sk)->ifindex < 0)
		return -ENODEV;
#endif

	/*
	 *	Call the generic datagram receiver. This handles all sorts
	 *	of horrible races and re-entrancy so we can forget about it
	 *	in the protocol layers.
	 *
	 *	Now it will return ENETDOWN, if the device has just gone down,
	 *	but then it will block.
	 */

	skb=skb_recv_datagram(sk,flags,flags&MSG_DONTWAIT,&err);

	/*
	 *	An error occurred so return it. Because skb_recv_datagram()
	 *	handles the blocking we don't need to worry about blocking
	 *	retries.
	 */

	if (skb == NULL)
		goto out;

	/*
	 *	If the address length field is there to be filled in, we fill
	 *	it in now.
	 */

	sll = (struct sockaddr_ll*)skb->cb;
	if (sock->type == SOCK_PACKET)
		msg->msg_namelen = sizeof(struct sockaddr_pkt);
	else
		msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);

	/*
	 *	You lose any data beyond the buffer you gave. If it worries a
	 *	user program they can ask the device for its MTU anyway.
	 */

	copied = skb->len;
	if (copied > len)
	{
		copied=len;
		msg->msg_flags|=MSG_TRUNC;
	}

	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
	if (err)
		goto out_free;

	sock_recv_timestamp(msg, sk, skb);

	if (msg->msg_name)
		memcpy(msg->msg_name, skb->cb, msg->msg_namelen);

	/*
	 *	Free or return the buffer as appropriate. Again this
	 *	hides all the races and re-entrancy issues from us.
	 */
	err = (flags&MSG_TRUNC) ? skb->len : copied;

out_free:
	skb_free_datagram(sk, skb);
out:
	return err;
}
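
/*
 * From user space this path is typically exercised with recvfrom(); the
 * sockaddr_ll filled in above identifies the originating device. A minimal
 * sketch (illustrative; fd and the buffer size are placeholders):
 *
 *	struct sockaddr_ll from;
 *	socklen_t fromlen = sizeof(from);
 *	unsigned char buf[2048];
 *
 *	ssize_t n = recvfrom(fd, buf, sizeof(buf), 0,
 *			     (struct sockaddr *)&from, &fromlen);
 *	// from.sll_ifindex tells which device the frame arrived on;
 *	// with recvmsg(), MSG_TRUNC in msg_flags signals a short buffer
 */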

#ifdef CONFIG_SOCK_PACKET
static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
			       int *uaddr_len, int peer)
{
	struct net_device *dev;
	struct sock *sk	= sock->sk;

	if (peer)
		return -EOPNOTSUPP;

	uaddr->sa_family = AF_PACKET;
	dev = dev_get_by_index(pkt_sk(sk)->ifindex);
	if (dev) {
		strlcpy(uaddr->sa_data, dev->name, 15);
		dev_put(dev);
	} else
		memset(uaddr->sa_data, 0, 14);
	*uaddr_len = sizeof(*uaddr);

	return 0;
}
#endif

static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
			  int *uaddr_len, int peer)
{
	struct net_device *dev;
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr;

	if (peer)
		return -EOPNOTSUPP;

	sll->sll_family = AF_PACKET;
	sll->sll_ifindex = po->ifindex;
	sll->sll_protocol = po->num;
	dev = dev_get_by_index(po->ifindex);
	if (dev) {
		sll->sll_hatype = dev->type;
		sll->sll_halen = dev->addr_len;
		memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
		dev_put(dev);
	} else {
		sll->sll_hatype = 0;	/* Bad: we have no ARPHRD_UNSPEC */
		sll->sll_halen = 0;
	}
	*uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;

	return 0;
}

#ifdef CONFIG_PACKET_MULTICAST
static void packet_dev_mc(struct net_device *dev, struct packet_mclist *i, int what)
{
	switch (i->type) {
	case PACKET_MR_MULTICAST:
		if (what > 0)
			dev_mc_add(dev, i->addr, i->alen, 0);
		else
			dev_mc_delete(dev, i->addr, i->alen, 0);
		break;
	case PACKET_MR_PROMISC:
		dev_set_promiscuity(dev, what);
		break;
	case PACKET_MR_ALLMULTI:
		dev_set_allmulti(dev, what);
		break;
	default:;
	}
}

static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
{
	for ( ; i; i=i->next) {
		if (i->ifindex == dev->ifindex)
			packet_dev_mc(dev, i, what);
	}
}

static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
{
	struct packet_sock *po = pkt_sk(sk);
	struct packet_mclist *ml, *i;
	struct net_device *dev;
	int err;

	rtnl_lock();

	err = -ENODEV;
	dev = __dev_get_by_index(mreq->mr_ifindex);
	if (!dev)
		goto done;

	err = -EINVAL;
	if (mreq->mr_alen > dev->addr_len)
		goto done;

	err = -ENOBUFS;
	i = kmalloc(sizeof(*i), GFP_KERNEL);
	if (i == NULL)
		goto done;

	err = 0;
	for (ml = po->mclist; ml; ml = ml->next) {
		if (ml->ifindex == mreq->mr_ifindex &&
		    ml->type == mreq->mr_type &&
		    ml->alen == mreq->mr_alen &&
		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
			ml->count++;
			/* Free the new element ... */
			kfree(i);
			goto done;
		}
	}

	i->type = mreq->mr_type;
	i->ifindex = mreq->mr_ifindex;
	i->alen = mreq->mr_alen;
	memcpy(i->addr, mreq->mr_address, i->alen);
	i->count = 1;
	i->next = po->mclist;
	po->mclist = i;
	packet_dev_mc(dev, i, +1);

done:
	rtnl_unlock();
	return err;
}

static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
{
	struct packet_mclist *ml, **mlp;

	rtnl_lock();

	for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
		if (ml->ifindex == mreq->mr_ifindex &&
		    ml->type == mreq->mr_type &&
		    ml->alen == mreq->mr_alen &&
		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
			if (--ml->count == 0) {
				struct net_device *dev;
				*mlp = ml->next;
				dev = dev_get_by_index(ml->ifindex);
				if (dev) {
					packet_dev_mc(dev, ml, -1);
					dev_put(dev);
				}
				kfree(ml);
			}
			rtnl_unlock();
			return 0;
		}
	}
	rtnl_unlock();
	return -EADDRNOTAVAIL;
}

static void packet_flush_mclist(struct sock *sk)
{
	struct packet_sock *po = pkt_sk(sk);
	struct packet_mclist *ml;

	if (!po->mclist)
		return;

	rtnl_lock();
	while ((ml = po->mclist) != NULL) {
		struct net_device *dev;

		po->mclist = ml->next;
		if ((dev = dev_get_by_index(ml->ifindex)) != NULL) {
			packet_dev_mc(dev, ml, -1);
			dev_put(dev);
		}
		kfree(ml);
	}
	rtnl_unlock();
}
#endif
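
/*
 * User space reaches packet_mc_add()/packet_mc_drop() through the
 * PACKET_ADD_MEMBERSHIP/PACKET_DROP_MEMBERSHIP socket options. A minimal
 * sketch enabling promiscuous mode via the refcounted interface
 * (illustrative; ifindex is assumed to be a valid interface index):
 *
 *	#include <sys/socket.h>
 *	#include <linux/if_packet.h>
 *	#include <string.h>
 *
 *	static int enable_promisc(int fd, int ifindex)
 *	{
 *		struct packet_mreq mreq;
 *
 *		memset(&mreq, 0, sizeof(mreq));
 *		mreq.mr_ifindex = ifindex;
 *		mreq.mr_type = PACKET_MR_PROMISC;
 *		return setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP,
 *				  &mreq, sizeof(mreq));
 *	}
 */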

static int
packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen)
{
	struct sock *sk = sock->sk;
	int ret;

	if (level != SOL_PACKET)
		return -ENOPROTOOPT;

	switch(optname)	{
#ifdef CONFIG_PACKET_MULTICAST
	case PACKET_ADD_MEMBERSHIP:
	case PACKET_DROP_MEMBERSHIP:
	{
		struct packet_mreq_max mreq;
		int len = optlen;
		memset(&mreq, 0, sizeof(mreq));
		if (len < sizeof(struct packet_mreq))
			return -EINVAL;
		if (len > sizeof(mreq))
			len = sizeof(mreq);
		if (copy_from_user(&mreq,optval,len))
			return -EFAULT;
		if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
			return -EINVAL;
		if (optname == PACKET_ADD_MEMBERSHIP)
			ret = packet_mc_add(sk, &mreq);
		else
			ret = packet_mc_drop(sk, &mreq);
		return ret;
	}
#endif
#ifdef CONFIG_PACKET_MMAP
	case PACKET_RX_RING:
	{
		struct tpacket_req req;

		if (optlen<sizeof(req))
			return -EINVAL;
		if (copy_from_user(&req,optval,sizeof(req)))
			return -EFAULT;
		return packet_set_ring(sk, &req, 0);
	}
	case PACKET_COPY_THRESH:
	{
		int val;

		if (optlen!=sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val,optval,sizeof(val)))
			return -EFAULT;

		pkt_sk(sk)->copy_thresh = val;
		return 0;
	}
#endif
	default:
		return -ENOPROTOOPT;
	}
}

static int packet_getsockopt(struct socket *sock, int level, int optname,
			     char __user *optval, int __user *optlen)
{
	int len;
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);

	if (level != SOL_PACKET)
		return -ENOPROTOOPT;

	if (get_user(len, optlen))
		return -EFAULT;

	if (len < 0)
		return -EINVAL;

	switch(optname)	{
	case PACKET_STATISTICS:
	{
		struct tpacket_stats st;

		if (len > sizeof(struct tpacket_stats))
			len = sizeof(struct tpacket_stats);
		spin_lock_bh(&sk->sk_receive_queue.lock);
		st = po->stats;
		memset(&po->stats, 0, sizeof(st));
		spin_unlock_bh(&sk->sk_receive_queue.lock);
		st.tp_packets += st.tp_drops;

		if (copy_to_user(optval, &st, len))
			return -EFAULT;
		break;
	}
	default:
		return -ENOPROTOOPT;
	}

	if (put_user(len, optlen))
		return -EFAULT;
	return 0;
}


static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
{
	struct sock *sk;
	struct hlist_node *node;
	struct net_device *dev = (struct net_device*)data;

	read_lock(&packet_sklist_lock);
	sk_for_each(sk, node, &packet_sklist) {
		struct packet_sock *po = pkt_sk(sk);

		switch (msg) {
		case NETDEV_UNREGISTER:
#ifdef CONFIG_PACKET_MULTICAST
			if (po->mclist)
				packet_dev_mclist(dev, po->mclist, -1);
			/* fallthrough */
#endif
		case NETDEV_DOWN:
			if (dev->ifindex == po->ifindex) {
				spin_lock(&po->bind_lock);
				if (po->running) {
					__dev_remove_pack(&po->prot_hook);
					__sock_put(sk);
					po->running = 0;
					sk->sk_err = ENETDOWN;
					if (!sock_flag(sk, SOCK_DEAD))
						sk->sk_error_report(sk);
				}
				if (msg == NETDEV_UNREGISTER) {
					po->ifindex = -1;
					po->prot_hook.dev = NULL;
				}
				spin_unlock(&po->bind_lock);
			}
			break;
		case NETDEV_UP:
			spin_lock(&po->bind_lock);
			if (dev->ifindex == po->ifindex && po->num &&
			    !po->running) {
				dev_add_pack(&po->prot_hook);
				sock_hold(sk);
				po->running = 1;
			}
			spin_unlock(&po->bind_lock);
			break;
		}
	}
	read_unlock(&packet_sklist_lock);
	return NOTIFY_DONE;
}


static int packet_ioctl(struct socket *sock, unsigned int cmd,
			unsigned long arg)
{
	struct sock *sk = sock->sk;

	switch(cmd) {
		case SIOCOUTQ:
		{
			int amount = atomic_read(&sk->sk_wmem_alloc);
			return put_user(amount, (int __user *)arg);
		}
		case SIOCINQ:
		{
			struct sk_buff *skb;
			int amount = 0;

			spin_lock_bh(&sk->sk_receive_queue.lock);
			skb = skb_peek(&sk->sk_receive_queue);
			if (skb)
				amount = skb->len;
			spin_unlock_bh(&sk->sk_receive_queue.lock);
			return put_user(amount, (int __user *)arg);
		}
		case SIOCGSTAMP:
			return sock_get_timestamp(sk, (struct timeval __user *)arg);

#ifdef CONFIG_INET
		case SIOCADDRT:
		case SIOCDELRT:
		case SIOCDARP:
		case SIOCGARP:
		case SIOCSARP:
		case SIOCGIFADDR:
		case SIOCSIFADDR:
		case SIOCGIFBRDADDR:
		case SIOCSIFBRDADDR:
		case SIOCGIFNETMASK:
		case SIOCSIFNETMASK:
		case SIOCGIFDSTADDR:
		case SIOCSIFDSTADDR:
		case SIOCSIFFLAGS:
			return inet_dgram_ops.ioctl(sock, cmd, arg);
#endif

		default:
			return -ENOIOCTLCMD;
	}
	return 0;
}

#ifndef CONFIG_PACKET_MMAP
#define packet_mmap sock_no_mmap
#define packet_poll datagram_poll
#else

static unsigned int packet_poll(struct file * file, struct socket *sock,
				poll_table *wait)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	unsigned int mask = datagram_poll(file, sock, wait);

	spin_lock_bh(&sk->sk_receive_queue.lock);
	if (po->pg_vec) {
		unsigned last = po->head ? po->head-1 : po->frame_max;
		struct tpacket_hdr *h;

		h = (struct tpacket_hdr *)packet_lookup_frame(po, last);

		if (h->tp_status)
			mask |= POLLIN | POLLRDNORM;
	}
	spin_unlock_bh(&sk->sk_receive_queue.lock);
	return mask;
}


/* Dirty? Well, I still have not learned a better way to account
 * for user mmaps.
 */

static void packet_mm_open(struct vm_area_struct *vma)
{
	struct file *file = vma->vm_file;
	struct socket * sock = file->private_data;
	struct sock *sk = sock->sk;

	if (sk)
		atomic_inc(&pkt_sk(sk)->mapped);
}

static void packet_mm_close(struct vm_area_struct *vma)
{
	struct file *file = vma->vm_file;
	struct socket * sock = file->private_data;
	struct sock *sk = sock->sk;

	if (sk)
		atomic_dec(&pkt_sk(sk)->mapped);
}

static struct vm_operations_struct packet_mmap_ops = {
	.open =		packet_mm_open,
	.close =	packet_mm_close,
};

static inline struct page *pg_vec_endpage(char *one_pg_vec, unsigned int order)
{
	return virt_to_page(one_pg_vec + (PAGE_SIZE << order) - 1);
}

static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
{
	int i;

	for (i = 0; i < len; i++) {
		if (likely(pg_vec[i]))
			free_pages((unsigned long) pg_vec[i], order);
	}
	kfree(pg_vec);
}

static inline char *alloc_one_pg_vec_page(unsigned long order)
{
	return (char *) __get_free_pages(GFP_KERNEL | __GFP_COMP | __GFP_ZERO,
					 order);
}

static char **alloc_pg_vec(struct tpacket_req *req, int order)
{
	unsigned int block_nr = req->tp_block_nr;
	char **pg_vec;
	int i;

	pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL);
	if (unlikely(!pg_vec))
		goto out;

	for (i = 0; i < block_nr; i++) {
		pg_vec[i] = alloc_one_pg_vec_page(order);
		if (unlikely(!pg_vec[i]))
			goto out_free_pgvec;
	}

out:
	return pg_vec;

out_free_pgvec:
	free_pg_vec(pg_vec, order, block_nr);
	pg_vec = NULL;
	goto out;
}

static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing)
{
	char **pg_vec = NULL;
	struct packet_sock *po = pkt_sk(sk);
	int was_running, order = 0;
	__be16 num;
	int err = 0;

	if (req->tp_block_nr) {
		int i, l;

		/* Sanity tests and some calculations */

		if (unlikely(po->pg_vec))
			return -EBUSY;

		if (unlikely((int)req->tp_block_size <= 0))
			return -EINVAL;
		if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
			return -EINVAL;
		if (unlikely(req->tp_frame_size < TPACKET_HDRLEN))
			return -EINVAL;
		if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
			return -EINVAL;

		po->frames_per_block = req->tp_block_size/req->tp_frame_size;
		if (unlikely(po->frames_per_block <= 0))
			return -EINVAL;
		if (unlikely((po->frames_per_block * req->tp_block_nr) !=
			     req->tp_frame_nr))
			return -EINVAL;

		err = -ENOMEM;
		order = get_order(req->tp_block_size);
		pg_vec = alloc_pg_vec(req, order);
		if (unlikely(!pg_vec))
			goto out;

		l = 0;
		for (i = 0; i < req->tp_block_nr; i++) {
			char *ptr = pg_vec[i];
			struct tpacket_hdr *header;
			int k;

			for (k = 0; k < po->frames_per_block; k++) {
				header = (struct tpacket_hdr *) ptr;
				header->tp_status = TP_STATUS_KERNEL;
				ptr += req->tp_frame_size;
			}
		}
		/* Done */
	} else {
		if (unlikely(req->tp_frame_nr))
			return -EINVAL;
	}

	lock_sock(sk);

	/* Detach socket from network */
	spin_lock(&po->bind_lock);
	was_running = po->running;
	num = po->num;
	if (was_running) {
		__dev_remove_pack(&po->prot_hook);
		po->num = 0;
		po->running = 0;
		__sock_put(sk);
	}
	spin_unlock(&po->bind_lock);

	synchronize_net();

	err = -EBUSY;
	if (closing || atomic_read(&po->mapped) == 0) {
		err = 0;
#define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })

		spin_lock_bh(&sk->sk_receive_queue.lock);
		pg_vec = XC(po->pg_vec, pg_vec);
		po->frame_max = (req->tp_frame_nr - 1);
		po->head = 0;
		po->frame_size = req->tp_frame_size;
		spin_unlock_bh(&sk->sk_receive_queue.lock);

		order = XC(po->pg_vec_order, order);
		req->tp_block_nr = XC(po->pg_vec_len, req->tp_block_nr);

		po->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
		po->prot_hook.func = po->pg_vec ? tpacket_rcv : packet_rcv;
		skb_queue_purge(&sk->sk_receive_queue);
#undef XC
		if (atomic_read(&po->mapped))
			printk(KERN_DEBUG "packet_mmap: vma is busy: %d\n", atomic_read(&po->mapped));
	}

	spin_lock(&po->bind_lock);
	if (was_running && !po->running) {
		sock_hold(sk);
		po->running = 1;
		po->num = num;
		dev_add_pack(&po->prot_hook);
	}
	spin_unlock(&po->bind_lock);

	release_sock(sk);

	if (pg_vec)
		free_pg_vec(pg_vec, order, req->tp_block_nr);
out:
	return err;
}
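
/*
 * A minimal user-space sketch of configuring and mapping the ring that
 * packet_set_ring() builds (illustrative; the geometry is just an example
 * satisfying the sanity checks above, and error handling is omitted):
 *
 *	#include <sys/socket.h>
 *	#include <sys/mman.h>
 *	#include <linux/if_packet.h>
 *
 *	struct tpacket_req req = {
 *		.tp_block_size	= 4096,		// multiple of PAGE_SIZE
 *		.tp_block_nr	= 64,
 *		.tp_frame_size	= 2048,		// TPACKET_ALIGNMENT multiple
 *		.tp_frame_nr	= 64 * 2,	// block_nr * frames_per_block
 *	};
 *
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *	void *ring = mmap(NULL, req.tp_block_size * req.tp_block_nr,
 *			  PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 */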

static int packet_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	unsigned long size;
	unsigned long start;
	int err = -EINVAL;
	int i;

	if (vma->vm_pgoff)
		return -EINVAL;

	size = vma->vm_end - vma->vm_start;

	lock_sock(sk);
	if (po->pg_vec == NULL)
		goto out;
	if (size != po->pg_vec_len*po->pg_vec_pages*PAGE_SIZE)
		goto out;

	start = vma->vm_start;
	for (i = 0; i < po->pg_vec_len; i++) {
		struct page *page = virt_to_page(po->pg_vec[i]);
		int pg_num;

		for (pg_num = 0; pg_num < po->pg_vec_pages; pg_num++, page++) {
			err = vm_insert_page(vma, start, page);
			if (unlikely(err))
				goto out;
			start += PAGE_SIZE;
		}
	}
	atomic_inc(&po->mapped);
	vma->vm_ops = &packet_mmap_ops;
	err = 0;

out:
	release_sock(sk);
	return err;
}
#endif


#ifdef CONFIG_SOCK_PACKET
static const struct proto_ops packet_ops_spkt = {
	.family =	PF_PACKET,
	.owner =	THIS_MODULE,
	.release =	packet_release,
	.bind =		packet_bind_spkt,
	.connect =	sock_no_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	packet_getname_spkt,
	.poll =		datagram_poll,
	.ioctl =	packet_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	packet_sendmsg_spkt,
	.recvmsg =	packet_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
};
#endif

static const struct proto_ops packet_ops = {
	.family =	PF_PACKET,
	.owner =	THIS_MODULE,
	.release =	packet_release,
	.bind =		packet_bind,
	.connect =	sock_no_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	packet_getname,
	.poll =		packet_poll,
	.ioctl =	packet_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
	.setsockopt =	packet_setsockopt,
	.getsockopt =	packet_getsockopt,
	.sendmsg =	packet_sendmsg,
	.recvmsg =	packet_recvmsg,
	.mmap =		packet_mmap,
	.sendpage =	sock_no_sendpage,
};

static struct net_proto_family packet_family_ops = {
	.family =	PF_PACKET,
	.create =	packet_create,
	.owner	=	THIS_MODULE,
};

static struct notifier_block packet_netdev_notifier = {
	.notifier_call =	packet_notifier,
};

#ifdef CONFIG_PROC_FS
static inline struct sock *packet_seq_idx(loff_t off)
{
	struct sock *s;
	struct hlist_node *node;

	sk_for_each(s, node, &packet_sklist) {
		if (!off--)
			return s;
	}
	return NULL;
}

static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
{
	read_lock(&packet_sklist_lock);
	return *pos ? packet_seq_idx(*pos - 1) : SEQ_START_TOKEN;
}

static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;
	return  (v == SEQ_START_TOKEN)
		? sk_head(&packet_sklist)
		: sk_next((struct sock*)v) ;
}

static void packet_seq_stop(struct seq_file *seq, void *v)
{
	read_unlock(&packet_sklist_lock);
}

static int packet_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
	else {
		struct sock *s = v;
		const struct packet_sock *po = pkt_sk(s);

		seq_printf(seq,
			   "%p %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
			   s,
			   atomic_read(&s->sk_refcnt),
			   s->sk_type,
			   ntohs(po->num),
			   po->ifindex,
			   po->running,
			   atomic_read(&s->sk_rmem_alloc),
			   sock_i_uid(s),
			   sock_i_ino(s) );
	}

	return 0;
}

static struct seq_operations packet_seq_ops = {
	.start	= packet_seq_start,
	.next	= packet_seq_next,
	.stop	= packet_seq_stop,
	.show	= packet_seq_show,
};

static int packet_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &packet_seq_ops);
}

static struct file_operations packet_seq_fops = {
	.owner		= THIS_MODULE,
	.open		= packet_seq_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};

#endif

static void __exit packet_exit(void)
{
	proc_net_remove("packet");
	unregister_netdevice_notifier(&packet_netdev_notifier);
	sock_unregister(PF_PACKET);
	proto_unregister(&packet_proto);
}

static int __init packet_init(void)
{
	int rc = proto_register(&packet_proto, 0);

	if (rc != 0)
		goto out;

	sock_register(&packet_family_ops);
	register_netdevice_notifier(&packet_netdev_notifier);
	proc_net_fops_create("packet", 0, &packet_seq_fops);
out:
	return rc;
}

module_init(packet_init);
module_exit(packet_exit);
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_PACKET);