1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		PACKET - implements raw packet sockets.
7  *
8  * Version:	$Id: af_packet.c,v 1.61 2002/02/08 03:57:19 davem Exp $
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
13  *
14  * Fixes:
15  *		Alan Cox	:	verify_area() now used correctly
16  *		Alan Cox	:	new skbuff lists, look ma no backlogs!
17  *		Alan Cox	:	tidied skbuff lists.
18  *		Alan Cox	:	Now uses generic datagram routines I
19  *					added. Also fixed the peek/read crash
20  *					from all old Linux datagram code.
21  *		Alan Cox	:	Uses the improved datagram code.
22  *		Alan Cox	:	Added NULL's for socket options.
23  *		Alan Cox	:	Re-commented the code.
24  *		Alan Cox	:	Use new kernel side addressing
25  *		Rob Janssen	:	Correct MTU usage.
26  *		Dave Platt	:	Counter leaks caused by incorrect
27  *					interrupt locking and some slightly
28  *					dubious gcc output. Can you read
29  *					compiler: it said _VOLATILE_
30  *	Richard Kooijman	:	Timestamp fixes.
31  *		Alan Cox	:	New buffers. Use sk->mac.raw.
32  *		Alan Cox	:	sendmsg/recvmsg support.
33  *		Alan Cox	:	Protocol setting support
34  *	Alexey Kuznetsov	:	Untied from IPv4 stack.
35  *	Cyrus Durgin		:	Fixed kerneld for kmod.
36  *	Michal Ostrowski        :       Module initialization cleanup.
37  *         Ulises Alonso        :       Frame number limit removal and
38  *                                      packet_set_ring memory leak.
39  *		Eric Biederman	:	Allow for > 8 byte hardware addresses.
40  *					The convention is that longer addresses
41  *					will simply extend the hardware address
42  *					byte arrays at the end of sockaddr_ll
43  *					and packet_mreq.
44  *
45  *		This program is free software; you can redistribute it and/or
46  *		modify it under the terms of the GNU General Public License
47  *		as published by the Free Software Foundation; either version
48  *		2 of the License, or (at your option) any later version.
49  *
50  */
51 
52 #include <linux/types.h>
53 #include <linux/mm.h>
54 #include <linux/capability.h>
55 #include <linux/fcntl.h>
56 #include <linux/socket.h>
57 #include <linux/in.h>
58 #include <linux/inet.h>
59 #include <linux/netdevice.h>
60 #include <linux/if_packet.h>
61 #include <linux/wireless.h>
62 #include <linux/kernel.h>
63 #include <linux/kmod.h>
64 #include <net/ip.h>
65 #include <net/protocol.h>
66 #include <linux/skbuff.h>
67 #include <net/sock.h>
68 #include <linux/errno.h>
69 #include <linux/timer.h>
70 #include <asm/system.h>
71 #include <asm/uaccess.h>
72 #include <asm/ioctls.h>
73 #include <asm/page.h>
74 #include <asm/cacheflush.h>
75 #include <asm/io.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/poll.h>
79 #include <linux/module.h>
80 #include <linux/init.h>
81 
82 #ifdef CONFIG_INET
83 #include <net/inet_common.h>
84 #endif
85 
86 #define CONFIG_SOCK_PACKET	1
87 
88 /*
89    Proposed replacement for SIOC{ADD,DEL}MULTI and
90    IFF_PROMISC, IFF_ALLMULTI flags.
91 
92    It is more expensive, but I believe it is the really correct
93    solution: reentrant, safe and fault tolerant.
94 
95    IFF_PROMISC/IFF_ALLMULTI/SIOC{ADD/DEL}MULTI are faked by keeping a
96    reference count and a global flag, so that the real status is
97    (gflag|(count != 0)); this way the obsolete, faulty interface can
98    still be used without harming clever users.
99  */
100 #define CONFIG_PACKET_MULTICAST	1
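
/*
 * Illustrative user-space sketch (not part of this file): how the
 * reference-counted membership interface described above is driven via
 * setsockopt().  The interface name "eth0" and the already-open PF_PACKET
 * socket "fd" are assumptions for the example only.
 *
 *	#include <sys/socket.h>
 *	#include <net/if.h>
 *	#include <linux/if_packet.h>
 *
 *	struct packet_mreq mreq = {
 *		.mr_ifindex = if_nametoindex("eth0"),
 *		.mr_type    = PACKET_MR_PROMISC,
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP, &mreq, sizeof(mreq));
 *	...
 *	setsockopt(fd, SOL_PACKET, PACKET_DROP_MEMBERSHIP, &mreq, sizeof(mreq));
 */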
101 
102 /*
103    Assumptions:
104    - if the device has no dev->hard_header routine, it adds and removes the
105      ll header itself. In this case the ll header is invisible outside of
106      the device, but higher levels still should reserve dev->hard_header_len.
107      Some devices are clever enough to reallocate the skb when the header
108      does not fit into the reserved space (tunnels), others are not
109      (PPP).
110    - a packet socket receives packets with the ll header already pulled,
111      so SOCK_RAW should push it back.
112 
113 On receive:
114 -----------
115 
116 Incoming, dev->hard_header!=NULL
117    mac_header -> ll header
118    data       -> data
119 
120 Outgoing, dev->hard_header!=NULL
121    mac_header -> ll header
122    data       -> ll header
123 
124 Incoming, dev->hard_header==NULL
125    mac_header -> UNKNOWN position. It very likely points to the ll
126 		 header.  PPP does this, which is wrong, because it introduces
127                  asymmetry between the rx and tx paths.
128    data       -> data
129 
130 Outgoing, dev->hard_header==NULL
131    mac_header -> data. ll header is still not built!
132    data       -> data
133 
134 Summary
135   If dev->hard_header==NULL we are unlikely to restore a sensible ll header.
136 
137 
138 On transmit:
139 ------------
140 
141 dev->hard_header != NULL
142    mac_header -> ll header
143    data       -> ll header
144 
145 dev->hard_header == NULL (ll header is added by device, we cannot control it)
146    mac_header -> data
147    data       -> data
148 
149    We should set nh.raw on output to the correct position;
150    the packet classifier depends on it.
151  */
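
/*
 * Illustrative user-space sketch (not part of this file) of the visible
 * difference described above: on SOCK_RAW the ll header is part of the
 * received data, on SOCK_DGRAM it is stripped and described by sockaddr_ll
 * instead.  ETH_P_ALL and the buffer size are example choices.
 *
 *	#include <sys/socket.h>
 *	#include <linux/if_packet.h>
 *	#include <linux/if_ether.h>
 *	#include <arpa/inet.h>
 *
 *	int raw = socket(PF_PACKET, SOCK_RAW,   htons(ETH_P_ALL));
 *	int dgm = socket(PF_PACKET, SOCK_DGRAM, htons(ETH_P_ALL));
 *	unsigned char buf[2048];
 *	struct sockaddr_ll sll;
 *	socklen_t slen = sizeof(sll);
 *
 *	recv(raw, buf, sizeof(buf), 0);
 *		buf[0] is the first byte of the ll header
 *	recvfrom(dgm, buf, sizeof(buf), 0, (struct sockaddr *)&sll, &slen);
 *		buf[0] is the first byte of the network-layer payload;
 *		sll.sll_addr/sll.sll_halen carry the link-level address
 */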
152 
153 /* List of all packet sockets. */
154 static HLIST_HEAD(packet_sklist);
155 static DEFINE_RWLOCK(packet_sklist_lock);
156 
157 static atomic_t packet_socks_nr;
158 
159 
160 /* Private packet socket structures. */
161 
162 #ifdef CONFIG_PACKET_MULTICAST
163 struct packet_mclist
164 {
165 	struct packet_mclist	*next;
166 	int			ifindex;
167 	int			count;
168 	unsigned short		type;
169 	unsigned short		alen;
170 	unsigned char		addr[MAX_ADDR_LEN];
171 };
172 /* identical to struct packet_mreq except it has
173  * a longer address field.
174  */
175 struct packet_mreq_max
176 {
177 	int		mr_ifindex;
178 	unsigned short	mr_type;
179 	unsigned short	mr_alen;
180 	unsigned char	mr_address[MAX_ADDR_LEN];
181 };
182 #endif
183 #ifdef CONFIG_PACKET_MMAP
184 static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing);
185 #endif
186 
187 static void packet_flush_mclist(struct sock *sk);
188 
189 struct packet_sock {
190 	/* struct sock has to be the first member of packet_sock */
191 	struct sock		sk;
192 	struct tpacket_stats	stats;
193 #ifdef CONFIG_PACKET_MMAP
194 	char *			*pg_vec;
195 	unsigned int		head;
196 	unsigned int            frames_per_block;
197 	unsigned int		frame_size;
198 	unsigned int		frame_max;
199 	int			copy_thresh;
200 #endif
201 	struct packet_type	prot_hook;
202 	spinlock_t		bind_lock;
203 	unsigned int		running:1,	/* prot_hook is attached*/
204 				auxdata:1,
205 				origdev:1;
206 	int			ifindex;	/* bound device		*/
207 	__be16			num;
208 #ifdef CONFIG_PACKET_MULTICAST
209 	struct packet_mclist	*mclist;
210 #endif
211 #ifdef CONFIG_PACKET_MMAP
212 	atomic_t		mapped;
213 	unsigned int            pg_vec_order;
214 	unsigned int		pg_vec_pages;
215 	unsigned int		pg_vec_len;
216 #endif
217 };
218 
219 struct packet_skb_cb {
220 	unsigned int origlen;
221 	union {
222 		struct sockaddr_pkt pkt;
223 		struct sockaddr_ll ll;
224 	} sa;
225 };
226 
227 #define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))
228 
229 #ifdef CONFIG_PACKET_MMAP
230 
231 static inline struct tpacket_hdr *packet_lookup_frame(struct packet_sock *po, unsigned int position)
232 {
233 	unsigned int pg_vec_pos, frame_offset;
234 
235 	pg_vec_pos = position / po->frames_per_block;
236 	frame_offset = position % po->frames_per_block;
237 
238 	return (struct tpacket_hdr *)(po->pg_vec[pg_vec_pos] + (frame_offset * po->frame_size));
239 }
240 #endif
241 
242 static inline struct packet_sock *pkt_sk(struct sock *sk)
243 {
244 	return (struct packet_sock *)sk;
245 }
246 
247 static void packet_sock_destruct(struct sock *sk)
248 {
249 	BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc));
250 	BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc));
251 
252 	if (!sock_flag(sk, SOCK_DEAD)) {
253 		printk("Attempt to release alive packet socket: %p\n", sk);
254 		return;
255 	}
256 
257 	atomic_dec(&packet_socks_nr);
258 #ifdef PACKET_REFCNT_DEBUG
259 	printk(KERN_DEBUG "PACKET socket %p is free, %d are alive\n", sk, atomic_read(&packet_socks_nr));
260 #endif
261 }
262 
263 
264 static const struct proto_ops packet_ops;
265 
266 #ifdef CONFIG_SOCK_PACKET
267 static const struct proto_ops packet_ops_spkt;
268 
269 static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,  struct packet_type *pt, struct net_device *orig_dev)
270 {
271 	struct sock *sk;
272 	struct sockaddr_pkt *spkt;
273 
274 	/*
275 	 *	When we registered the protocol we saved the socket in the data
276 	 *	field for just this event.
277 	 */
278 
279 	sk = pt->af_packet_priv;
280 
281 	/*
282 	 *	Yank back the headers [hope the device set this
283 	 *	right or kerboom...]
284 	 *
285 	 *	Incoming packets have ll header pulled,
286 	 *	push it back.
287 	 *
288 	 *	For outgoing ones skb->data == skb_mac_header(skb),
289 	 *	so this procedure is a no-op.
290 	 */
291 
292 	if (skb->pkt_type == PACKET_LOOPBACK)
293 		goto out;
294 
295 	if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
296 		goto oom;
297 
298 	/* drop any routing info */
299 	dst_release(skb->dst);
300 	skb->dst = NULL;
301 
302 	/* drop conntrack reference */
303 	nf_reset(skb);
304 
305 	spkt = &PACKET_SKB_CB(skb)->sa.pkt;
306 
307 	skb_push(skb, skb->data - skb_mac_header(skb));
308 
309 	/*
310 	 *	The SOCK_PACKET socket receives _all_ frames.
311 	 */
312 
313 	spkt->spkt_family = dev->type;
314 	strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
315 	spkt->spkt_protocol = skb->protocol;
316 
317 	/*
318 	 *	Charge the memory to the socket. This is done specifically
319 	 *	to prevent sockets from using up all the memory.
320 	 */
321 
322 	if (sock_queue_rcv_skb(sk,skb) == 0)
323 		return 0;
324 
325 out:
326 	kfree_skb(skb);
327 oom:
328 	return 0;
329 }
330 
331 
332 /*
333  *	Output a raw packet to a device layer. This bypasses all the other
334  *	protocol layers and you must therefore supply it with a complete frame
335  */
336 
337 static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
338 			       struct msghdr *msg, size_t len)
339 {
340 	struct sock *sk = sock->sk;
341 	struct sockaddr_pkt *saddr=(struct sockaddr_pkt *)msg->msg_name;
342 	struct sk_buff *skb;
343 	struct net_device *dev;
344 	__be16 proto=0;
345 	int err;
346 
347 	/*
348 	 *	Get and verify the address.
349 	 */
350 
351 	if (saddr)
352 	{
353 		if (msg->msg_namelen < sizeof(struct sockaddr))
354 			return(-EINVAL);
355 		if (msg->msg_namelen==sizeof(struct sockaddr_pkt))
356 			proto=saddr->spkt_protocol;
357 	}
358 	else
359 		return(-ENOTCONN);	/* SOCK_PACKET must be sent giving an address */
360 
361 	/*
362 	 *	Find the device first to size check it
363 	 */
364 
365 	saddr->spkt_device[13] = 0;
366 	dev = dev_get_by_name(saddr->spkt_device);
367 	err = -ENODEV;
368 	if (dev == NULL)
369 		goto out_unlock;
370 
371 	err = -ENETDOWN;
372 	if (!(dev->flags & IFF_UP))
373 		goto out_unlock;
374 
375 	/*
376 	 *	You may not queue a frame bigger than the mtu. This is the lowest level
377 	 *	raw protocol and you must do your own fragmentation at this level.
378 	 */
379 
380 	err = -EMSGSIZE;
381 	if (len > dev->mtu + dev->hard_header_len)
382 		goto out_unlock;
383 
384 	err = -ENOBUFS;
385 	skb = sock_wmalloc(sk, len + LL_RESERVED_SPACE(dev), 0, GFP_KERNEL);
386 
387 	/*
388 	 *	If the write buffer is full, then tough. At this level the user gets to
389 	 *	deal with the problem - do your own algorithmic backoffs. That's far
390 	 *	more flexible.
391 	 */
392 
393 	if (skb == NULL)
394 		goto out_unlock;
395 
396 	/*
397 	 *	Fill it in
398 	 */
399 
400 	/* FIXME: Save some space for broken drivers that write a
401 	 * hard header at transmission time by themselves. PPP is the
402 	 * notable one here. This should really be fixed at the driver level.
403 	 */
404 	skb_reserve(skb, LL_RESERVED_SPACE(dev));
405 	skb_reset_network_header(skb);
406 
407 	/* Try to align data part correctly */
408 	if (dev->hard_header) {
409 		skb->data -= dev->hard_header_len;
410 		skb->tail -= dev->hard_header_len;
411 		if (len < dev->hard_header_len)
412 			skb_reset_network_header(skb);
413 	}
414 
415 	/* Returns -EFAULT on error */
416 	err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
417 	skb->protocol = proto;
418 	skb->dev = dev;
419 	skb->priority = sk->sk_priority;
420 	if (err)
421 		goto out_free;
422 
423 	/*
424 	 *	Now send it
425 	 */
426 
427 	dev_queue_xmit(skb);
428 	dev_put(dev);
429 	return(len);
430 
431 out_free:
432 	kfree_skb(skb);
433 out_unlock:
434 	if (dev)
435 		dev_put(dev);
436 	return err;
437 }
438 #endif
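
/*
 * Illustrative user-space sketch (not part of this file) of the legacy
 * SOCK_PACKET transmit path handled by packet_sendmsg_spkt() above: the
 * destination device is named in sockaddr_pkt and the caller supplies a
 * complete frame.  "eth0", "frame" and "frame_len" are assumptions for
 * the example only.
 *
 *	#include <sys/socket.h>
 *	#include <linux/if_packet.h>
 *	#include <linux/if_ether.h>
 *	#include <arpa/inet.h>
 *	#include <string.h>
 *
 *	int fd = socket(PF_PACKET, SOCK_PACKET, htons(ETH_P_ALL));
 *	struct sockaddr_pkt spkt;
 *
 *	memset(&spkt, 0, sizeof(spkt));
 *	spkt.spkt_family   = AF_PACKET;
 *	spkt.spkt_protocol = htons(ETH_P_IP);
 *	strncpy((char *)spkt.spkt_device, "eth0", sizeof(spkt.spkt_device));
 *	sendto(fd, frame, frame_len, 0,
 *	       (struct sockaddr *)&spkt, sizeof(spkt));
 */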
439 
440 static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
441 				      unsigned int res)
442 {
443 	struct sk_filter *filter;
444 
445 	rcu_read_lock_bh();
446 	filter = rcu_dereference(sk->sk_filter);
447 	if (filter != NULL)
448 		res = sk_run_filter(skb, filter->insns, filter->len);
449 	rcu_read_unlock_bh();
450 
451 	return res;
452 }
453 
454 /*
455    This function does lazy skb cloning in the hope that most packets
456    are discarded by BPF.
457 
458    Note the tricky part: we DO mangle the shared skb! skb->data, skb->len
459    and skb->cb are mangled. It works because (and until) packets
460    falling here are owned by the current CPU. Output packets are cloned
461    by dev_queue_xmit_nit(), input packets are processed by net_bh
462    sequentially, so if we return the skb to its original state on exit,
463    we will not harm anyone.
464  */
465 
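/*
 * Illustrative user-space sketch (not part of this file): attaching a
 * classic BPF program with SO_ATTACH_FILTER, so that most packets can be
 * discarded by the filter as described above.  This particular program
 * accepts only ARP frames (EtherType at offset 12) and is just an example;
 * "fd" is assumed to be an open packet socket.
 *
 *	#include <sys/socket.h>
 *	#include <linux/filter.h>
 *	#include <linux/if_ether.h>
 *
 *	struct sock_filter code[] = {
 *		BPF_STMT(BPF_LD  | BPF_H   | BPF_ABS, 12),
 *		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, ETH_P_ARP, 0, 1),
 *		BPF_STMT(BPF_RET | BPF_K, 0xffff),
 *		BPF_STMT(BPF_RET | BPF_K, 0),
 *	};
 *	struct sock_fprog prog = { .len = 4, .filter = code };
 *
 *	setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog));
 */
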
466 static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
467 {
468 	struct sock *sk;
469 	struct sockaddr_ll *sll;
470 	struct packet_sock *po;
471 	u8 * skb_head = skb->data;
472 	int skb_len = skb->len;
473 	unsigned int snaplen, res;
474 
475 	if (skb->pkt_type == PACKET_LOOPBACK)
476 		goto drop;
477 
478 	sk = pt->af_packet_priv;
479 	po = pkt_sk(sk);
480 
481 	skb->dev = dev;
482 
483 	if (dev->hard_header) {
484 		/* The device has an explicit notion of ll header,
485 		   exported to higher levels.
486 
487 		   Otherwise, the device hides the details of its frame
488 		   structure, so the corresponding packet head is
489 		   never delivered to the user.
490 		 */
491 		if (sk->sk_type != SOCK_DGRAM)
492 			skb_push(skb, skb->data - skb_mac_header(skb));
493 		else if (skb->pkt_type == PACKET_OUTGOING) {
494 			/* Special case: outgoing packets have ll header at head */
495 			skb_pull(skb, skb_network_offset(skb));
496 		}
497 	}
498 
499 	snaplen = skb->len;
500 
501 	res = run_filter(skb, sk, snaplen);
502 	if (!res)
503 		goto drop_n_restore;
504 	if (snaplen > res)
505 		snaplen = res;
506 
507 	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
508 	    (unsigned)sk->sk_rcvbuf)
509 		goto drop_n_acct;
510 
511 	if (skb_shared(skb)) {
512 		struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
513 		if (nskb == NULL)
514 			goto drop_n_acct;
515 
516 		if (skb_head != skb->data) {
517 			skb->data = skb_head;
518 			skb->len = skb_len;
519 		}
520 		kfree_skb(skb);
521 		skb = nskb;
522 	}
523 
524 	BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
525 		     sizeof(skb->cb));
526 
527 	sll = &PACKET_SKB_CB(skb)->sa.ll;
528 	sll->sll_family = AF_PACKET;
529 	sll->sll_hatype = dev->type;
530 	sll->sll_protocol = skb->protocol;
531 	sll->sll_pkttype = skb->pkt_type;
532 	if (unlikely(po->origdev) && skb->pkt_type == PACKET_HOST)
533 		sll->sll_ifindex = orig_dev->ifindex;
534 	else
535 		sll->sll_ifindex = dev->ifindex;
536 	sll->sll_halen = 0;
537 
538 	if (dev->hard_header_parse)
539 		sll->sll_halen = dev->hard_header_parse(skb, sll->sll_addr);
540 
541 	PACKET_SKB_CB(skb)->origlen = skb->len;
542 
543 	if (pskb_trim(skb, snaplen))
544 		goto drop_n_acct;
545 
546 	skb_set_owner_r(skb, sk);
547 	skb->dev = NULL;
548 	dst_release(skb->dst);
549 	skb->dst = NULL;
550 
551 	/* drop conntrack reference */
552 	nf_reset(skb);
553 
554 	spin_lock(&sk->sk_receive_queue.lock);
555 	po->stats.tp_packets++;
556 	__skb_queue_tail(&sk->sk_receive_queue, skb);
557 	spin_unlock(&sk->sk_receive_queue.lock);
558 	sk->sk_data_ready(sk, skb->len);
559 	return 0;
560 
561 drop_n_acct:
562 	spin_lock(&sk->sk_receive_queue.lock);
563 	po->stats.tp_drops++;
564 	spin_unlock(&sk->sk_receive_queue.lock);
565 
566 drop_n_restore:
567 	if (skb_head != skb->data && skb_shared(skb)) {
568 		skb->data = skb_head;
569 		skb->len = skb_len;
570 	}
571 drop:
572 	kfree_skb(skb);
573 	return 0;
574 }
575 
576 #ifdef CONFIG_PACKET_MMAP
577 static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
578 {
579 	struct sock *sk;
580 	struct packet_sock *po;
581 	struct sockaddr_ll *sll;
582 	struct tpacket_hdr *h;
583 	u8 * skb_head = skb->data;
584 	int skb_len = skb->len;
585 	unsigned int snaplen, res;
586 	unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
587 	unsigned short macoff, netoff;
588 	struct sk_buff *copy_skb = NULL;
589 	struct timeval tv;
590 
591 	if (skb->pkt_type == PACKET_LOOPBACK)
592 		goto drop;
593 
594 	sk = pt->af_packet_priv;
595 	po = pkt_sk(sk);
596 
597 	if (dev->hard_header) {
598 		if (sk->sk_type != SOCK_DGRAM)
599 			skb_push(skb, skb->data - skb_mac_header(skb));
600 		else if (skb->pkt_type == PACKET_OUTGOING) {
601 			/* Special case: outgoing packets have ll header at head */
602 			skb_pull(skb, skb_network_offset(skb));
603 		}
604 	}
605 
606 	if (skb->ip_summed == CHECKSUM_PARTIAL)
607 		status |= TP_STATUS_CSUMNOTREADY;
608 
609 	snaplen = skb->len;
610 
611 	res = run_filter(skb, sk, snaplen);
612 	if (!res)
613 		goto drop_n_restore;
614 	if (snaplen > res)
615 		snaplen = res;
616 
617 	if (sk->sk_type == SOCK_DGRAM) {
618 		macoff = netoff = TPACKET_ALIGN(TPACKET_HDRLEN) + 16;
619 	} else {
620 		unsigned maclen = skb_network_offset(skb);
621 		netoff = TPACKET_ALIGN(TPACKET_HDRLEN + (maclen < 16 ? 16 : maclen));
622 		macoff = netoff - maclen;
623 	}
624 
625 	if (macoff + snaplen > po->frame_size) {
626 		if (po->copy_thresh &&
627 		    atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
628 		    (unsigned)sk->sk_rcvbuf) {
629 			if (skb_shared(skb)) {
630 				copy_skb = skb_clone(skb, GFP_ATOMIC);
631 			} else {
632 				copy_skb = skb_get(skb);
633 				skb_head = skb->data;
634 			}
635 			if (copy_skb)
636 				skb_set_owner_r(copy_skb, sk);
637 		}
638 		snaplen = po->frame_size - macoff;
639 		if ((int)snaplen < 0)
640 			snaplen = 0;
641 	}
642 
643 	spin_lock(&sk->sk_receive_queue.lock);
644 	h = packet_lookup_frame(po, po->head);
645 
646 	if (h->tp_status)
647 		goto ring_is_full;
648 	po->head = po->head != po->frame_max ? po->head+1 : 0;
649 	po->stats.tp_packets++;
650 	if (copy_skb) {
651 		status |= TP_STATUS_COPY;
652 		__skb_queue_tail(&sk->sk_receive_queue, copy_skb);
653 	}
654 	if (!po->stats.tp_drops)
655 		status &= ~TP_STATUS_LOSING;
656 	spin_unlock(&sk->sk_receive_queue.lock);
657 
658 	skb_copy_bits(skb, 0, (u8*)h + macoff, snaplen);
659 
660 	h->tp_len = skb->len;
661 	h->tp_snaplen = snaplen;
662 	h->tp_mac = macoff;
663 	h->tp_net = netoff;
664 	if (skb->tstamp.tv64 == 0) {
665 		__net_timestamp(skb);
666 		sock_enable_timestamp(sk);
667 	}
668 	tv = ktime_to_timeval(skb->tstamp);
669 	h->tp_sec = tv.tv_sec;
670 	h->tp_usec = tv.tv_usec;
671 
672 	sll = (struct sockaddr_ll*)((u8*)h + TPACKET_ALIGN(sizeof(*h)));
673 	sll->sll_halen = 0;
674 	if (dev->hard_header_parse)
675 		sll->sll_halen = dev->hard_header_parse(skb, sll->sll_addr);
676 	sll->sll_family = AF_PACKET;
677 	sll->sll_hatype = dev->type;
678 	sll->sll_protocol = skb->protocol;
679 	sll->sll_pkttype = skb->pkt_type;
680 	if (unlikely(po->origdev) && skb->pkt_type == PACKET_HOST)
681 		sll->sll_ifindex = orig_dev->ifindex;
682 	else
683 		sll->sll_ifindex = dev->ifindex;
684 
685 	h->tp_status = status;
686 	smp_mb();
687 
688 	{
689 		struct page *p_start, *p_end;
690 		u8 *h_end = (u8 *)h + macoff + snaplen - 1;
691 
692 		p_start = virt_to_page(h);
693 		p_end = virt_to_page(h_end);
694 		while (p_start <= p_end) {
695 			flush_dcache_page(p_start);
696 			p_start++;
697 		}
698 	}
699 
700 	sk->sk_data_ready(sk, 0);
701 
702 drop_n_restore:
703 	if (skb_head != skb->data && skb_shared(skb)) {
704 		skb->data = skb_head;
705 		skb->len = skb_len;
706 	}
707 drop:
708 	kfree_skb(skb);
709 	return 0;
710 
711 ring_is_full:
712 	po->stats.tp_drops++;
713 	spin_unlock(&sk->sk_receive_queue.lock);
714 
715 	sk->sk_data_ready(sk, 0);
716 	if (copy_skb)
717 		kfree_skb(copy_skb);
718 	goto drop_n_restore;
719 }
720 
721 #endif
722 
723 
724 static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
725 			  struct msghdr *msg, size_t len)
726 {
727 	struct sock *sk = sock->sk;
728 	struct sockaddr_ll *saddr=(struct sockaddr_ll *)msg->msg_name;
729 	struct sk_buff *skb;
730 	struct net_device *dev;
731 	__be16 proto;
732 	unsigned char *addr;
733 	int ifindex, err, reserve = 0;
734 
735 	/*
736 	 *	Get and verify the address.
737 	 */
738 
739 	if (saddr == NULL) {
740 		struct packet_sock *po = pkt_sk(sk);
741 
742 		ifindex	= po->ifindex;
743 		proto	= po->num;
744 		addr	= NULL;
745 	} else {
746 		err = -EINVAL;
747 		if (msg->msg_namelen < sizeof(struct sockaddr_ll))
748 			goto out;
749 		if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
750 			goto out;
751 		ifindex	= saddr->sll_ifindex;
752 		proto	= saddr->sll_protocol;
753 		addr	= saddr->sll_addr;
754 	}
755 
756 
757 	dev = dev_get_by_index(ifindex);
758 	err = -ENXIO;
759 	if (dev == NULL)
760 		goto out_unlock;
761 	if (sock->type == SOCK_RAW)
762 		reserve = dev->hard_header_len;
763 
764 	err = -ENETDOWN;
765 	if (!(dev->flags & IFF_UP))
766 		goto out_unlock;
767 
768 	err = -EMSGSIZE;
769 	if (len > dev->mtu+reserve)
770 		goto out_unlock;
771 
772 	skb = sock_alloc_send_skb(sk, len + LL_RESERVED_SPACE(dev),
773 				msg->msg_flags & MSG_DONTWAIT, &err);
774 	if (skb==NULL)
775 		goto out_unlock;
776 
777 	skb_reserve(skb, LL_RESERVED_SPACE(dev));
778 	skb_reset_network_header(skb);
779 
780 	if (dev->hard_header) {
781 		int res;
782 		err = -EINVAL;
783 		res = dev->hard_header(skb, dev, ntohs(proto), addr, NULL, len);
784 		if (sock->type != SOCK_DGRAM) {
785 			skb_reset_tail_pointer(skb);
786 			skb->len = 0;
787 		} else if (res < 0)
788 			goto out_free;
789 	}
790 
791 	/* Returns -EFAULT on error */
792 	err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
793 	if (err)
794 		goto out_free;
795 
796 	skb->protocol = proto;
797 	skb->dev = dev;
798 	skb->priority = sk->sk_priority;
799 
800 	/*
801 	 *	Now send it
802 	 */
803 
804 	err = dev_queue_xmit(skb);
805 	if (err > 0 && (err = net_xmit_errno(err)) != 0)
806 		goto out_unlock;
807 
808 	dev_put(dev);
809 
810 	return(len);
811 
812 out_free:
813 	kfree_skb(skb);
814 out_unlock:
815 	if (dev)
816 		dev_put(dev);
817 out:
818 	return err;
819 }
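
/*
 * Illustrative user-space sketch (not part of this file) of the
 * packet_sendmsg() path above for a SOCK_DGRAM packet socket, where the
 * kernel builds the ll header from the sockaddr_ll destination address.
 * "eth0", "dest_mac", "payload", "payload_len" and the open socket "fd"
 * are assumptions for the example only.
 *
 *	#include <sys/socket.h>
 *	#include <net/if.h>
 *	#include <linux/if_packet.h>
 *	#include <linux/if_ether.h>
 *	#include <arpa/inet.h>
 *	#include <string.h>
 *
 *	struct sockaddr_ll sll;
 *
 *	memset(&sll, 0, sizeof(sll));
 *	sll.sll_family   = AF_PACKET;
 *	sll.sll_protocol = htons(ETH_P_IP);
 *	sll.sll_ifindex  = if_nametoindex("eth0");
 *	sll.sll_halen    = ETH_ALEN;
 *	memcpy(sll.sll_addr, dest_mac, ETH_ALEN);
 *
 *	sendto(fd, payload, payload_len, 0,
 *	       (struct sockaddr *)&sll, sizeof(sll));
 */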
820 
821 /*
822  *	Close a PACKET socket. This is fairly simple. We immediately go
823  *	to 'closed' state and remove our protocol entry in the device list.
824  */
825 
826 static int packet_release(struct socket *sock)
827 {
828 	struct sock *sk = sock->sk;
829 	struct packet_sock *po;
830 
831 	if (!sk)
832 		return 0;
833 
834 	po = pkt_sk(sk);
835 
836 	write_lock_bh(&packet_sklist_lock);
837 	sk_del_node_init(sk);
838 	write_unlock_bh(&packet_sklist_lock);
839 
840 	/*
841 	 *	Unhook packet receive handler.
842 	 */
843 
844 	if (po->running) {
845 		/*
846 		 *	Remove the protocol hook
847 		 */
848 		dev_remove_pack(&po->prot_hook);
849 		po->running = 0;
850 		po->num = 0;
851 		__sock_put(sk);
852 	}
853 
854 #ifdef CONFIG_PACKET_MULTICAST
855 	packet_flush_mclist(sk);
856 #endif
857 
858 #ifdef CONFIG_PACKET_MMAP
859 	if (po->pg_vec) {
860 		struct tpacket_req req;
861 		memset(&req, 0, sizeof(req));
862 		packet_set_ring(sk, &req, 1);
863 	}
864 #endif
865 
866 	/*
867 	 *	Now the socket is dead. No more input will appear.
868 	 */
869 
870 	sock_orphan(sk);
871 	sock->sk = NULL;
872 
873 	/* Purge queues */
874 
875 	skb_queue_purge(&sk->sk_receive_queue);
876 
877 	sock_put(sk);
878 	return 0;
879 }
880 
881 /*
882  *	Attach a packet hook.
883  */
884 
885 static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
886 {
887 	struct packet_sock *po = pkt_sk(sk);
888 	/*
889 	 *	Detach an existing hook if present.
890 	 */
891 
892 	lock_sock(sk);
893 
894 	spin_lock(&po->bind_lock);
895 	if (po->running) {
896 		__sock_put(sk);
897 		po->running = 0;
898 		po->num = 0;
899 		spin_unlock(&po->bind_lock);
900 		dev_remove_pack(&po->prot_hook);
901 		spin_lock(&po->bind_lock);
902 	}
903 
904 	po->num = protocol;
905 	po->prot_hook.type = protocol;
906 	po->prot_hook.dev = dev;
907 
908 	po->ifindex = dev ? dev->ifindex : 0;
909 
910 	if (protocol == 0)
911 		goto out_unlock;
912 
913 	if (dev) {
914 		if (dev->flags&IFF_UP) {
915 			dev_add_pack(&po->prot_hook);
916 			sock_hold(sk);
917 			po->running = 1;
918 		} else {
919 			sk->sk_err = ENETDOWN;
920 			if (!sock_flag(sk, SOCK_DEAD))
921 				sk->sk_error_report(sk);
922 		}
923 	} else {
924 		dev_add_pack(&po->prot_hook);
925 		sock_hold(sk);
926 		po->running = 1;
927 	}
928 
929 out_unlock:
930 	spin_unlock(&po->bind_lock);
931 	release_sock(sk);
932 	return 0;
933 }
934 
935 /*
936  *	Bind a packet socket to a device
937  */
938 
939 #ifdef CONFIG_SOCK_PACKET
940 
941 static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, int addr_len)
942 {
943 	struct sock *sk=sock->sk;
944 	char name[15];
945 	struct net_device *dev;
946 	int err = -ENODEV;
947 
948 	/*
949 	 *	Check legality
950 	 */
951 
952 	if (addr_len != sizeof(struct sockaddr))
953 		return -EINVAL;
954 	strlcpy(name,uaddr->sa_data,sizeof(name));
955 
956 	dev = dev_get_by_name(name);
957 	if (dev) {
958 		err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
959 		dev_put(dev);
960 	}
961 	return err;
962 }
963 #endif
964 
965 static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
966 {
967 	struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr;
968 	struct sock *sk=sock->sk;
969 	struct net_device *dev = NULL;
970 	int err;
971 
972 
973 	/*
974 	 *	Check legality
975 	 */
976 
977 	if (addr_len < sizeof(struct sockaddr_ll))
978 		return -EINVAL;
979 	if (sll->sll_family != AF_PACKET)
980 		return -EINVAL;
981 
982 	if (sll->sll_ifindex) {
983 		err = -ENODEV;
984 		dev = dev_get_by_index(sll->sll_ifindex);
985 		if (dev == NULL)
986 			goto out;
987 	}
988 	err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
989 	if (dev)
990 		dev_put(dev);
991 
992 out:
993 	return err;
994 }
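
/*
 * Illustrative user-space sketch (not part of this file) of the bind()
 * call that reaches packet_bind() above, attaching the socket to one
 * interface and one protocol.  "eth0" and the open socket "fd" are
 * assumptions for the example only.
 *
 *	#include <sys/socket.h>
 *	#include <net/if.h>
 *	#include <linux/if_packet.h>
 *	#include <linux/if_ether.h>
 *	#include <arpa/inet.h>
 *	#include <string.h>
 *
 *	struct sockaddr_ll sll;
 *
 *	memset(&sll, 0, sizeof(sll));
 *	sll.sll_family   = AF_PACKET;
 *	sll.sll_protocol = htons(ETH_P_ALL);
 *	sll.sll_ifindex  = if_nametoindex("eth0");
 *	bind(fd, (struct sockaddr *)&sll, sizeof(sll));
 */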
995 
996 static struct proto packet_proto = {
997 	.name	  = "PACKET",
998 	.owner	  = THIS_MODULE,
999 	.obj_size = sizeof(struct packet_sock),
1000 };
1001 
1002 /*
1003  *	Create a packet of type SOCK_PACKET.
1004  */
1005 
1006 static int packet_create(struct socket *sock, int protocol)
1007 {
1008 	struct sock *sk;
1009 	struct packet_sock *po;
1010 	__be16 proto = (__force __be16)protocol; /* weird, but documented */
1011 	int err;
1012 
1013 	if (!capable(CAP_NET_RAW))
1014 		return -EPERM;
1015 	if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW
1016 #ifdef CONFIG_SOCK_PACKET
1017 	    && sock->type != SOCK_PACKET
1018 #endif
1019 	    )
1020 		return -ESOCKTNOSUPPORT;
1021 
1022 	sock->state = SS_UNCONNECTED;
1023 
1024 	err = -ENOBUFS;
1025 	sk = sk_alloc(PF_PACKET, GFP_KERNEL, &packet_proto, 1);
1026 	if (sk == NULL)
1027 		goto out;
1028 
1029 	sock->ops = &packet_ops;
1030 #ifdef CONFIG_SOCK_PACKET
1031 	if (sock->type == SOCK_PACKET)
1032 		sock->ops = &packet_ops_spkt;
1033 #endif
1034 	sock_init_data(sock, sk);
1035 
1036 	po = pkt_sk(sk);
1037 	sk->sk_family = PF_PACKET;
1038 	po->num = proto;
1039 
1040 	sk->sk_destruct = packet_sock_destruct;
1041 	atomic_inc(&packet_socks_nr);
1042 
1043 	/*
1044 	 *	Attach a protocol block
1045 	 */
1046 
1047 	spin_lock_init(&po->bind_lock);
1048 	po->prot_hook.func = packet_rcv;
1049 #ifdef CONFIG_SOCK_PACKET
1050 	if (sock->type == SOCK_PACKET)
1051 		po->prot_hook.func = packet_rcv_spkt;
1052 #endif
1053 	po->prot_hook.af_packet_priv = sk;
1054 
1055 	if (proto) {
1056 		po->prot_hook.type = proto;
1057 		dev_add_pack(&po->prot_hook);
1058 		sock_hold(sk);
1059 		po->running = 1;
1060 	}
1061 
1062 	write_lock_bh(&packet_sklist_lock);
1063 	sk_add_node(sk, &packet_sklist);
1064 	write_unlock_bh(&packet_sklist_lock);
1065 	return(0);
1066 out:
1067 	return err;
1068 }
1069 
1070 /*
1071  *	Pull a packet from our receive queue and hand it to the user.
1072  *	If necessary we block.
1073  */
1074 
1075 static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
1076 			  struct msghdr *msg, size_t len, int flags)
1077 {
1078 	struct sock *sk = sock->sk;
1079 	struct sk_buff *skb;
1080 	int copied, err;
1081 	struct sockaddr_ll *sll;
1082 
1083 	err = -EINVAL;
1084 	if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT))
1085 		goto out;
1086 
1087 #if 0
1088 	/* What error should we return now? EUNATTACH? */
1089 	if (pkt_sk(sk)->ifindex < 0)
1090 		return -ENODEV;
1091 #endif
1092 
1093 	/*
1094 	 *	Call the generic datagram receiver. This handles all sorts
1095 	 *	of horrible races and re-entrancy so we can forget about it
1096 	 *	in the protocol layers.
1097 	 *
1098 	 *	Now it will return ENETDOWN if the device has just gone down,
1099 	 *	but then it will block.
1100 	 */
1101 
1102 	skb=skb_recv_datagram(sk,flags,flags&MSG_DONTWAIT,&err);
1103 
1104 	/*
1105 	 *	An error occurred, so return it. Because skb_recv_datagram()
1106 	 *	handles the blocking, we don't see or worry about blocking
1107 	 *	retries.
1108 	 */
1109 
1110 	if (skb == NULL)
1111 		goto out;
1112 
1113 	/*
1114 	 *	If the address length field is there to be filled in, we fill
1115 	 *	it in now.
1116 	 */
1117 
1118 	sll = &PACKET_SKB_CB(skb)->sa.ll;
1119 	if (sock->type == SOCK_PACKET)
1120 		msg->msg_namelen = sizeof(struct sockaddr_pkt);
1121 	else
1122 		msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);
1123 
1124 	/*
1125 	 *	You lose any data beyond the buffer you gave. If that worries a
1126 	 *	user program, it can ask the device for its MTU anyway.
1127 	 */
1128 
1129 	copied = skb->len;
1130 	if (copied > len)
1131 	{
1132 		copied=len;
1133 		msg->msg_flags|=MSG_TRUNC;
1134 	}
1135 
1136 	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
1137 	if (err)
1138 		goto out_free;
1139 
1140 	sock_recv_timestamp(msg, sk, skb);
1141 
1142 	if (msg->msg_name)
1143 		memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
1144 		       msg->msg_namelen);
1145 
1146 	if (pkt_sk(sk)->auxdata) {
1147 		struct tpacket_auxdata aux;
1148 
1149 		aux.tp_status = TP_STATUS_USER;
1150 		if (skb->ip_summed == CHECKSUM_PARTIAL)
1151 			aux.tp_status |= TP_STATUS_CSUMNOTREADY;
1152 		aux.tp_len = PACKET_SKB_CB(skb)->origlen;
1153 		aux.tp_snaplen = skb->len;
1154 		aux.tp_mac = 0;
1155 		aux.tp_net = skb_network_offset(skb);
1156 
1157 		put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
1158 	}
1159 
1160 	/*
1161 	 *	Free or return the buffer as appropriate. Again this
1162 	 *	hides all the races and re-entrancy issues from us.
1163 	 */
1164 	err = (flags&MSG_TRUNC) ? skb->len : copied;
1165 
1166 out_free:
1167 	skb_free_datagram(sk, skb);
1168 out:
1169 	return err;
1170 }
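
/*
 * Illustrative user-space sketch (not part of this file): enabling
 * PACKET_AUXDATA and reading the tpacket_auxdata control message that
 * packet_recvmsg() above attaches.  The open socket "fd" is an assumption
 * for the example only.
 *
 *	#include <sys/socket.h>
 *	#include <linux/if_packet.h>
 *
 *	int one = 1;
 *	setsockopt(fd, SOL_PACKET, PACKET_AUXDATA, &one, sizeof(one));
 *
 *	char data[2048], control[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
 *	struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
 *	struct msghdr msg = {
 *		.msg_iov        = &iov,
 *		.msg_iovlen     = 1,
 *		.msg_control    = control,
 *		.msg_controllen = sizeof(control),
 *	};
 *	struct cmsghdr *cmsg;
 *
 *	recvmsg(fd, &msg, 0);
 *	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg))
 *		if (cmsg->cmsg_level == SOL_PACKET &&
 *		    cmsg->cmsg_type == PACKET_AUXDATA) {
 *			struct tpacket_auxdata *aux = (void *)CMSG_DATA(cmsg);
 *			... aux->tp_len is the original length,
 *			    aux->tp_snaplen is what was captured ...
 *		}
 */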
1171 
1172 #ifdef CONFIG_SOCK_PACKET
1173 static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
1174 			       int *uaddr_len, int peer)
1175 {
1176 	struct net_device *dev;
1177 	struct sock *sk	= sock->sk;
1178 
1179 	if (peer)
1180 		return -EOPNOTSUPP;
1181 
1182 	uaddr->sa_family = AF_PACKET;
1183 	dev = dev_get_by_index(pkt_sk(sk)->ifindex);
1184 	if (dev) {
1185 		strlcpy(uaddr->sa_data, dev->name, 15);
1186 		dev_put(dev);
1187 	} else
1188 		memset(uaddr->sa_data, 0, 14);
1189 	*uaddr_len = sizeof(*uaddr);
1190 
1191 	return 0;
1192 }
1193 #endif
1194 
1195 static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
1196 			  int *uaddr_len, int peer)
1197 {
1198 	struct net_device *dev;
1199 	struct sock *sk = sock->sk;
1200 	struct packet_sock *po = pkt_sk(sk);
1201 	struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr;
1202 
1203 	if (peer)
1204 		return -EOPNOTSUPP;
1205 
1206 	sll->sll_family = AF_PACKET;
1207 	sll->sll_ifindex = po->ifindex;
1208 	sll->sll_protocol = po->num;
1209 	dev = dev_get_by_index(po->ifindex);
1210 	if (dev) {
1211 		sll->sll_hatype = dev->type;
1212 		sll->sll_halen = dev->addr_len;
1213 		memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1214 		dev_put(dev);
1215 	} else {
1216 		sll->sll_hatype = 0;	/* Bad: we have no ARPHRD_UNSPEC */
1217 		sll->sll_halen = 0;
1218 	}
1219 	*uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1220 
1221 	return 0;
1222 }
1223 
1224 #ifdef CONFIG_PACKET_MULTICAST
1225 static void packet_dev_mc(struct net_device *dev, struct packet_mclist *i, int what)
1226 {
1227 	switch (i->type) {
1228 	case PACKET_MR_MULTICAST:
1229 		if (what > 0)
1230 			dev_mc_add(dev, i->addr, i->alen, 0);
1231 		else
1232 			dev_mc_delete(dev, i->addr, i->alen, 0);
1233 		break;
1234 	case PACKET_MR_PROMISC:
1235 		dev_set_promiscuity(dev, what);
1236 		break;
1237 	case PACKET_MR_ALLMULTI:
1238 		dev_set_allmulti(dev, what);
1239 		break;
1240 	default:;
1241 	}
1242 }
1243 
1244 static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
1245 {
1246 	for ( ; i; i=i->next) {
1247 		if (i->ifindex == dev->ifindex)
1248 			packet_dev_mc(dev, i, what);
1249 	}
1250 }
1251 
1252 static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1253 {
1254 	struct packet_sock *po = pkt_sk(sk);
1255 	struct packet_mclist *ml, *i;
1256 	struct net_device *dev;
1257 	int err;
1258 
1259 	rtnl_lock();
1260 
1261 	err = -ENODEV;
1262 	dev = __dev_get_by_index(mreq->mr_ifindex);
1263 	if (!dev)
1264 		goto done;
1265 
1266 	err = -EINVAL;
1267 	if (mreq->mr_alen > dev->addr_len)
1268 		goto done;
1269 
1270 	err = -ENOBUFS;
1271 	i = kmalloc(sizeof(*i), GFP_KERNEL);
1272 	if (i == NULL)
1273 		goto done;
1274 
1275 	err = 0;
1276 	for (ml = po->mclist; ml; ml = ml->next) {
1277 		if (ml->ifindex == mreq->mr_ifindex &&
1278 		    ml->type == mreq->mr_type &&
1279 		    ml->alen == mreq->mr_alen &&
1280 		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1281 			ml->count++;
1282 			/* Free the new element ... */
1283 			kfree(i);
1284 			goto done;
1285 		}
1286 	}
1287 
1288 	i->type = mreq->mr_type;
1289 	i->ifindex = mreq->mr_ifindex;
1290 	i->alen = mreq->mr_alen;
1291 	memcpy(i->addr, mreq->mr_address, i->alen);
1292 	i->count = 1;
1293 	i->next = po->mclist;
1294 	po->mclist = i;
1295 	packet_dev_mc(dev, i, +1);
1296 
1297 done:
1298 	rtnl_unlock();
1299 	return err;
1300 }
1301 
1302 static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1303 {
1304 	struct packet_mclist *ml, **mlp;
1305 
1306 	rtnl_lock();
1307 
1308 	for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
1309 		if (ml->ifindex == mreq->mr_ifindex &&
1310 		    ml->type == mreq->mr_type &&
1311 		    ml->alen == mreq->mr_alen &&
1312 		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1313 			if (--ml->count == 0) {
1314 				struct net_device *dev;
1315 				*mlp = ml->next;
1316 				dev = dev_get_by_index(ml->ifindex);
1317 				if (dev) {
1318 					packet_dev_mc(dev, ml, -1);
1319 					dev_put(dev);
1320 				}
1321 				kfree(ml);
1322 			}
1323 			rtnl_unlock();
1324 			return 0;
1325 		}
1326 	}
1327 	rtnl_unlock();
1328 	return -EADDRNOTAVAIL;
1329 }
1330 
1331 static void packet_flush_mclist(struct sock *sk)
1332 {
1333 	struct packet_sock *po = pkt_sk(sk);
1334 	struct packet_mclist *ml;
1335 
1336 	if (!po->mclist)
1337 		return;
1338 
1339 	rtnl_lock();
1340 	while ((ml = po->mclist) != NULL) {
1341 		struct net_device *dev;
1342 
1343 		po->mclist = ml->next;
1344 		if ((dev = dev_get_by_index(ml->ifindex)) != NULL) {
1345 			packet_dev_mc(dev, ml, -1);
1346 			dev_put(dev);
1347 		}
1348 		kfree(ml);
1349 	}
1350 	rtnl_unlock();
1351 }
1352 #endif
1353 
1354 static int
1355 packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen)
1356 {
1357 	struct sock *sk = sock->sk;
1358 	struct packet_sock *po = pkt_sk(sk);
1359 	int ret;
1360 
1361 	if (level != SOL_PACKET)
1362 		return -ENOPROTOOPT;
1363 
1364 	switch(optname)	{
1365 #ifdef CONFIG_PACKET_MULTICAST
1366 	case PACKET_ADD_MEMBERSHIP:
1367 	case PACKET_DROP_MEMBERSHIP:
1368 	{
1369 		struct packet_mreq_max mreq;
1370 		int len = optlen;
1371 		memset(&mreq, 0, sizeof(mreq));
1372 		if (len < sizeof(struct packet_mreq))
1373 			return -EINVAL;
1374 		if (len > sizeof(mreq))
1375 			len = sizeof(mreq);
1376 		if (copy_from_user(&mreq,optval,len))
1377 			return -EFAULT;
1378 		if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
1379 			return -EINVAL;
1380 		if (optname == PACKET_ADD_MEMBERSHIP)
1381 			ret = packet_mc_add(sk, &mreq);
1382 		else
1383 			ret = packet_mc_drop(sk, &mreq);
1384 		return ret;
1385 	}
1386 #endif
1387 #ifdef CONFIG_PACKET_MMAP
1388 	case PACKET_RX_RING:
1389 	{
1390 		struct tpacket_req req;
1391 
1392 		if (optlen<sizeof(req))
1393 			return -EINVAL;
1394 		if (copy_from_user(&req,optval,sizeof(req)))
1395 			return -EFAULT;
1396 		return packet_set_ring(sk, &req, 0);
1397 	}
1398 	case PACKET_COPY_THRESH:
1399 	{
1400 		int val;
1401 
1402 		if (optlen!=sizeof(val))
1403 			return -EINVAL;
1404 		if (copy_from_user(&val,optval,sizeof(val)))
1405 			return -EFAULT;
1406 
1407 		pkt_sk(sk)->copy_thresh = val;
1408 		return 0;
1409 	}
1410 #endif
1411 	case PACKET_AUXDATA:
1412 	{
1413 		int val;
1414 
1415 		if (optlen < sizeof(val))
1416 			return -EINVAL;
1417 		if (copy_from_user(&val, optval, sizeof(val)))
1418 			return -EFAULT;
1419 
1420 		po->auxdata = !!val;
1421 		return 0;
1422 	}
1423 	case PACKET_ORIGDEV:
1424 	{
1425 		int val;
1426 
1427 		if (optlen < sizeof(val))
1428 			return -EINVAL;
1429 		if (copy_from_user(&val, optval, sizeof(val)))
1430 			return -EFAULT;
1431 
1432 		po->origdev = !!val;
1433 		return 0;
1434 	}
1435 	default:
1436 		return -ENOPROTOOPT;
1437 	}
1438 }
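
/*
 * Illustrative user-space sketch (not part of this file) of setting up
 * the PACKET_RX_RING handled above and mapping it.  The geometry below
 * (64 blocks of 4 KiB, two 2 KiB frames per block) is only an example; it
 * satisfies the checks in packet_set_ring(): the block size is a multiple
 * of the page size, the frame size is 16-byte aligned, and
 * tp_frame_nr == tp_block_nr * (tp_block_size / tp_frame_size).
 *
 *	#include <sys/socket.h>
 *	#include <sys/mman.h>
 *	#include <linux/if_packet.h>
 *
 *	struct tpacket_req req = {
 *		.tp_block_size = 4096,
 *		.tp_block_nr   = 64,
 *		.tp_frame_size = 2048,
 *		.tp_frame_nr   = 128,
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *
 *	void *ring = mmap(NULL, (size_t)req.tp_block_size * req.tp_block_nr,
 *			  PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 */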
1439 
1440 static int packet_getsockopt(struct socket *sock, int level, int optname,
1441 			     char __user *optval, int __user *optlen)
1442 {
1443 	int len;
1444 	int val;
1445 	struct sock *sk = sock->sk;
1446 	struct packet_sock *po = pkt_sk(sk);
1447 	void *data;
1448 	struct tpacket_stats st;
1449 
1450 	if (level != SOL_PACKET)
1451 		return -ENOPROTOOPT;
1452 
1453 	if (get_user(len, optlen))
1454 		return -EFAULT;
1455 
1456 	if (len < 0)
1457 		return -EINVAL;
1458 
1459 	switch(optname)	{
1460 	case PACKET_STATISTICS:
1461 		if (len > sizeof(struct tpacket_stats))
1462 			len = sizeof(struct tpacket_stats);
1463 		spin_lock_bh(&sk->sk_receive_queue.lock);
1464 		st = po->stats;
1465 		memset(&po->stats, 0, sizeof(st));
1466 		spin_unlock_bh(&sk->sk_receive_queue.lock);
1467 		st.tp_packets += st.tp_drops;
1468 
1469 		data = &st;
1470 		break;
1471 	case PACKET_AUXDATA:
1472 		if (len > sizeof(int))
1473 			len = sizeof(int);
1474 		val = po->auxdata;
1475 
1476 		data = &val;
1477 		break;
1478 	case PACKET_ORIGDEV:
1479 		if (len > sizeof(int))
1480 			len = sizeof(int);
1481 		val = po->origdev;
1482 
1483 		data = &val;
1484 		break;
1485 	default:
1486 		return -ENOPROTOOPT;
1487 	}
1488 
1489 	if (put_user(len, optlen))
1490 		return -EFAULT;
1491 	if (copy_to_user(optval, data, len))
1492 		return -EFAULT;
1493 	return 0;
1494 }
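
/*
 * Illustrative user-space sketch (not part of this file): reading the
 * PACKET_STATISTICS counters returned above.  Note that the kernel resets
 * the counters after copying them out, so each call reports the activity
 * since the previous one; "fd" is assumed to be an open packet socket.
 *
 *	#include <sys/socket.h>
 *	#include <linux/if_packet.h>
 *	#include <stdio.h>
 *
 *	struct tpacket_stats st;
 *	socklen_t len = sizeof(st);
 *
 *	getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &st, &len);
 *	printf("seen %u, dropped %u\n", st.tp_packets, st.tp_drops);
 */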
1495 
1496 
1497 static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
1498 {
1499 	struct sock *sk;
1500 	struct hlist_node *node;
1501 	struct net_device *dev = data;
1502 
1503 	read_lock(&packet_sklist_lock);
1504 	sk_for_each(sk, node, &packet_sklist) {
1505 		struct packet_sock *po = pkt_sk(sk);
1506 
1507 		switch (msg) {
1508 		case NETDEV_UNREGISTER:
1509 #ifdef CONFIG_PACKET_MULTICAST
1510 			if (po->mclist)
1511 				packet_dev_mclist(dev, po->mclist, -1);
1512 			/* fallthrough */
1513 #endif
1514 		case NETDEV_DOWN:
1515 			if (dev->ifindex == po->ifindex) {
1516 				spin_lock(&po->bind_lock);
1517 				if (po->running) {
1518 					__dev_remove_pack(&po->prot_hook);
1519 					__sock_put(sk);
1520 					po->running = 0;
1521 					sk->sk_err = ENETDOWN;
1522 					if (!sock_flag(sk, SOCK_DEAD))
1523 						sk->sk_error_report(sk);
1524 				}
1525 				if (msg == NETDEV_UNREGISTER) {
1526 					po->ifindex = -1;
1527 					po->prot_hook.dev = NULL;
1528 				}
1529 				spin_unlock(&po->bind_lock);
1530 			}
1531 			break;
1532 		case NETDEV_UP:
1533 			spin_lock(&po->bind_lock);
1534 			if (dev->ifindex == po->ifindex && po->num &&
1535 			    !po->running) {
1536 				dev_add_pack(&po->prot_hook);
1537 				sock_hold(sk);
1538 				po->running = 1;
1539 			}
1540 			spin_unlock(&po->bind_lock);
1541 			break;
1542 		}
1543 	}
1544 	read_unlock(&packet_sklist_lock);
1545 	return NOTIFY_DONE;
1546 }
1547 
1548 
1549 static int packet_ioctl(struct socket *sock, unsigned int cmd,
1550 			unsigned long arg)
1551 {
1552 	struct sock *sk = sock->sk;
1553 
1554 	switch(cmd) {
1555 		case SIOCOUTQ:
1556 		{
1557 			int amount = atomic_read(&sk->sk_wmem_alloc);
1558 			return put_user(amount, (int __user *)arg);
1559 		}
1560 		case SIOCINQ:
1561 		{
1562 			struct sk_buff *skb;
1563 			int amount = 0;
1564 
1565 			spin_lock_bh(&sk->sk_receive_queue.lock);
1566 			skb = skb_peek(&sk->sk_receive_queue);
1567 			if (skb)
1568 				amount = skb->len;
1569 			spin_unlock_bh(&sk->sk_receive_queue.lock);
1570 			return put_user(amount, (int __user *)arg);
1571 		}
1572 		case SIOCGSTAMP:
1573 			return sock_get_timestamp(sk, (struct timeval __user *)arg);
1574 		case SIOCGSTAMPNS:
1575 			return sock_get_timestampns(sk, (struct timespec __user *)arg);
1576 
1577 #ifdef CONFIG_INET
1578 		case SIOCADDRT:
1579 		case SIOCDELRT:
1580 		case SIOCDARP:
1581 		case SIOCGARP:
1582 		case SIOCSARP:
1583 		case SIOCGIFADDR:
1584 		case SIOCSIFADDR:
1585 		case SIOCGIFBRDADDR:
1586 		case SIOCSIFBRDADDR:
1587 		case SIOCGIFNETMASK:
1588 		case SIOCSIFNETMASK:
1589 		case SIOCGIFDSTADDR:
1590 		case SIOCSIFDSTADDR:
1591 		case SIOCSIFFLAGS:
1592 			return inet_dgram_ops.ioctl(sock, cmd, arg);
1593 #endif
1594 
1595 		default:
1596 			return -ENOIOCTLCMD;
1597 	}
1598 	return 0;
1599 }
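
/*
 * Illustrative user-space sketch (not part of this file): fetching the
 * timestamp of the last received packet via the SIOCGSTAMP ioctl handled
 * above, right after a successful recv().  The open socket "fd" is an
 * assumption for the example only.
 *
 *	#include <sys/ioctl.h>
 *	#include <sys/time.h>
 *	#include <linux/sockios.h>
 *
 *	struct timeval tv;
 *
 *	ioctl(fd, SIOCGSTAMP, &tv);
 */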
1600 
1601 #ifndef CONFIG_PACKET_MMAP
1602 #define packet_mmap sock_no_mmap
1603 #define packet_poll datagram_poll
1604 #else
1605 
1606 static unsigned int packet_poll(struct file * file, struct socket *sock,
1607 				poll_table *wait)
1608 {
1609 	struct sock *sk = sock->sk;
1610 	struct packet_sock *po = pkt_sk(sk);
1611 	unsigned int mask = datagram_poll(file, sock, wait);
1612 
1613 	spin_lock_bh(&sk->sk_receive_queue.lock);
1614 	if (po->pg_vec) {
1615 		unsigned last = po->head ? po->head-1 : po->frame_max;
1616 		struct tpacket_hdr *h;
1617 
1618 		h = packet_lookup_frame(po, last);
1619 
1620 		if (h->tp_status)
1621 			mask |= POLLIN | POLLRDNORM;
1622 	}
1623 	spin_unlock_bh(&sk->sk_receive_queue.lock);
1624 	return mask;
1625 }
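
/*
 * Illustrative user-space sketch (not part of this file): the matching
 * consumer loop for the mmap ring, polling as above and then walking the
 * frames.  "ring" and "req" are assumed to come from the PACKET_RX_RING
 * setup shown earlier, and handle() stands for whatever processing the
 * application does; the simple "i * tp_frame_size" stride is valid only
 * because tp_block_size is an exact multiple of tp_frame_size there.
 *
 *	#include <poll.h>
 *	#include <linux/if_packet.h>
 *
 *	unsigned int i = 0;
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *
 *	for (;;) {
 *		struct tpacket_hdr *hdr = (struct tpacket_hdr *)
 *			((char *)ring + (size_t)i * req.tp_frame_size);
 *
 *		if (!(hdr->tp_status & TP_STATUS_USER)) {
 *			poll(&pfd, 1, -1);
 *			continue;
 *		}
 *		handle((char *)hdr + hdr->tp_mac, hdr->tp_snaplen);
 *		hdr->tp_status = TP_STATUS_KERNEL;
 *		i = (i + 1) % req.tp_frame_nr;
 *	}
 */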
1626 
1627 
1628 /* Dirty? Well, I still have not learned a better way to account
1629  * for user mmaps.
1630  */
1631 
1632 static void packet_mm_open(struct vm_area_struct *vma)
1633 {
1634 	struct file *file = vma->vm_file;
1635 	struct socket * sock = file->private_data;
1636 	struct sock *sk = sock->sk;
1637 
1638 	if (sk)
1639 		atomic_inc(&pkt_sk(sk)->mapped);
1640 }
1641 
1642 static void packet_mm_close(struct vm_area_struct *vma)
1643 {
1644 	struct file *file = vma->vm_file;
1645 	struct socket * sock = file->private_data;
1646 	struct sock *sk = sock->sk;
1647 
1648 	if (sk)
1649 		atomic_dec(&pkt_sk(sk)->mapped);
1650 }
1651 
1652 static struct vm_operations_struct packet_mmap_ops = {
1653 	.open =	packet_mm_open,
1654 	.close =packet_mm_close,
1655 };
1656 
1657 static inline struct page *pg_vec_endpage(char *one_pg_vec, unsigned int order)
1658 {
1659 	return virt_to_page(one_pg_vec + (PAGE_SIZE << order) - 1);
1660 }
1661 
1662 static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
1663 {
1664 	int i;
1665 
1666 	for (i = 0; i < len; i++) {
1667 		if (likely(pg_vec[i]))
1668 			free_pages((unsigned long) pg_vec[i], order);
1669 	}
1670 	kfree(pg_vec);
1671 }
1672 
1673 static inline char *alloc_one_pg_vec_page(unsigned long order)
1674 {
1675 	return (char *) __get_free_pages(GFP_KERNEL | __GFP_COMP | __GFP_ZERO,
1676 					 order);
1677 }
1678 
1679 static char **alloc_pg_vec(struct tpacket_req *req, int order)
1680 {
1681 	unsigned int block_nr = req->tp_block_nr;
1682 	char **pg_vec;
1683 	int i;
1684 
1685 	pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL);
1686 	if (unlikely(!pg_vec))
1687 		goto out;
1688 
1689 	for (i = 0; i < block_nr; i++) {
1690 		pg_vec[i] = alloc_one_pg_vec_page(order);
1691 		if (unlikely(!pg_vec[i]))
1692 			goto out_free_pgvec;
1693 	}
1694 
1695 out:
1696 	return pg_vec;
1697 
1698 out_free_pgvec:
1699 	free_pg_vec(pg_vec, order, block_nr);
1700 	pg_vec = NULL;
1701 	goto out;
1702 }
1703 
1704 static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing)
1705 {
1706 	char **pg_vec = NULL;
1707 	struct packet_sock *po = pkt_sk(sk);
1708 	int was_running, order = 0;
1709 	__be16 num;
1710 	int err = 0;
1711 
1712 	if (req->tp_block_nr) {
1713 		int i, l;
1714 
1715 		/* Sanity tests and some calculations */
1716 
1717 		if (unlikely(po->pg_vec))
1718 			return -EBUSY;
1719 
1720 		if (unlikely((int)req->tp_block_size <= 0))
1721 			return -EINVAL;
1722 		if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
1723 			return -EINVAL;
1724 		if (unlikely(req->tp_frame_size < TPACKET_HDRLEN))
1725 			return -EINVAL;
1726 		if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
1727 			return -EINVAL;
1728 
1729 		po->frames_per_block = req->tp_block_size/req->tp_frame_size;
1730 		if (unlikely(po->frames_per_block <= 0))
1731 			return -EINVAL;
1732 		if (unlikely((po->frames_per_block * req->tp_block_nr) !=
1733 			     req->tp_frame_nr))
1734 			return -EINVAL;
1735 
1736 		err = -ENOMEM;
1737 		order = get_order(req->tp_block_size);
1738 		pg_vec = alloc_pg_vec(req, order);
1739 		if (unlikely(!pg_vec))
1740 			goto out;
1741 
1742 		l = 0;
1743 		for (i = 0; i < req->tp_block_nr; i++) {
1744 			char *ptr = pg_vec[i];
1745 			struct tpacket_hdr *header;
1746 			int k;
1747 
1748 			for (k = 0; k < po->frames_per_block; k++) {
1749 				header = (struct tpacket_hdr *) ptr;
1750 				header->tp_status = TP_STATUS_KERNEL;
1751 				ptr += req->tp_frame_size;
1752 			}
1753 		}
1754 		/* Done */
1755 	} else {
1756 		if (unlikely(req->tp_frame_nr))
1757 			return -EINVAL;
1758 	}
1759 
1760 	lock_sock(sk);
1761 
1762 	/* Detach socket from network */
1763 	spin_lock(&po->bind_lock);
1764 	was_running = po->running;
1765 	num = po->num;
1766 	if (was_running) {
1767 		__dev_remove_pack(&po->prot_hook);
1768 		po->num = 0;
1769 		po->running = 0;
1770 		__sock_put(sk);
1771 	}
1772 	spin_unlock(&po->bind_lock);
1773 
1774 	synchronize_net();
1775 
1776 	err = -EBUSY;
1777 	if (closing || atomic_read(&po->mapped) == 0) {
1778 		err = 0;
1779 #define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })
1780 
1781 		spin_lock_bh(&sk->sk_receive_queue.lock);
1782 		pg_vec = XC(po->pg_vec, pg_vec);
1783 		po->frame_max = (req->tp_frame_nr - 1);
1784 		po->head = 0;
1785 		po->frame_size = req->tp_frame_size;
1786 		spin_unlock_bh(&sk->sk_receive_queue.lock);
1787 
1788 		order = XC(po->pg_vec_order, order);
1789 		req->tp_block_nr = XC(po->pg_vec_len, req->tp_block_nr);
1790 
1791 		po->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
1792 		po->prot_hook.func = po->pg_vec ? tpacket_rcv : packet_rcv;
1793 		skb_queue_purge(&sk->sk_receive_queue);
1794 #undef XC
1795 		if (atomic_read(&po->mapped))
1796 			printk(KERN_DEBUG "packet_mmap: vma is busy: %d\n", atomic_read(&po->mapped));
1797 	}
1798 
1799 	spin_lock(&po->bind_lock);
1800 	if (was_running && !po->running) {
1801 		sock_hold(sk);
1802 		po->running = 1;
1803 		po->num = num;
1804 		dev_add_pack(&po->prot_hook);
1805 	}
1806 	spin_unlock(&po->bind_lock);
1807 
1808 	release_sock(sk);
1809 
1810 	if (pg_vec)
1811 		free_pg_vec(pg_vec, order, req->tp_block_nr);
1812 out:
1813 	return err;
1814 }
1815 
1816 static int packet_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
1817 {
1818 	struct sock *sk = sock->sk;
1819 	struct packet_sock *po = pkt_sk(sk);
1820 	unsigned long size;
1821 	unsigned long start;
1822 	int err = -EINVAL;
1823 	int i;
1824 
1825 	if (vma->vm_pgoff)
1826 		return -EINVAL;
1827 
1828 	size = vma->vm_end - vma->vm_start;
1829 
1830 	lock_sock(sk);
1831 	if (po->pg_vec == NULL)
1832 		goto out;
1833 	if (size != po->pg_vec_len*po->pg_vec_pages*PAGE_SIZE)
1834 		goto out;
1835 
1836 	start = vma->vm_start;
1837 	for (i = 0; i < po->pg_vec_len; i++) {
1838 		struct page *page = virt_to_page(po->pg_vec[i]);
1839 		int pg_num;
1840 
1841 		for (pg_num = 0; pg_num < po->pg_vec_pages; pg_num++, page++) {
1842 			err = vm_insert_page(vma, start, page);
1843 			if (unlikely(err))
1844 				goto out;
1845 			start += PAGE_SIZE;
1846 		}
1847 	}
1848 	atomic_inc(&po->mapped);
1849 	vma->vm_ops = &packet_mmap_ops;
1850 	err = 0;
1851 
1852 out:
1853 	release_sock(sk);
1854 	return err;
1855 }
1856 #endif
1857 
1858 
1859 #ifdef CONFIG_SOCK_PACKET
1860 static const struct proto_ops packet_ops_spkt = {
1861 	.family =	PF_PACKET,
1862 	.owner =	THIS_MODULE,
1863 	.release =	packet_release,
1864 	.bind =		packet_bind_spkt,
1865 	.connect =	sock_no_connect,
1866 	.socketpair =	sock_no_socketpair,
1867 	.accept =	sock_no_accept,
1868 	.getname =	packet_getname_spkt,
1869 	.poll =		datagram_poll,
1870 	.ioctl =	packet_ioctl,
1871 	.listen =	sock_no_listen,
1872 	.shutdown =	sock_no_shutdown,
1873 	.setsockopt =	sock_no_setsockopt,
1874 	.getsockopt =	sock_no_getsockopt,
1875 	.sendmsg =	packet_sendmsg_spkt,
1876 	.recvmsg =	packet_recvmsg,
1877 	.mmap =		sock_no_mmap,
1878 	.sendpage =	sock_no_sendpage,
1879 };
1880 #endif
1881 
1882 static const struct proto_ops packet_ops = {
1883 	.family =	PF_PACKET,
1884 	.owner =	THIS_MODULE,
1885 	.release =	packet_release,
1886 	.bind =		packet_bind,
1887 	.connect =	sock_no_connect,
1888 	.socketpair =	sock_no_socketpair,
1889 	.accept =	sock_no_accept,
1890 	.getname =	packet_getname,
1891 	.poll =		packet_poll,
1892 	.ioctl =	packet_ioctl,
1893 	.listen =	sock_no_listen,
1894 	.shutdown =	sock_no_shutdown,
1895 	.setsockopt =	packet_setsockopt,
1896 	.getsockopt =	packet_getsockopt,
1897 	.sendmsg =	packet_sendmsg,
1898 	.recvmsg =	packet_recvmsg,
1899 	.mmap =		packet_mmap,
1900 	.sendpage =	sock_no_sendpage,
1901 };
1902 
1903 static struct net_proto_family packet_family_ops = {
1904 	.family =	PF_PACKET,
1905 	.create =	packet_create,
1906 	.owner	=	THIS_MODULE,
1907 };
1908 
1909 static struct notifier_block packet_netdev_notifier = {
1910 	.notifier_call =packet_notifier,
1911 };
1912 
1913 #ifdef CONFIG_PROC_FS
1914 static inline struct sock *packet_seq_idx(loff_t off)
1915 {
1916 	struct sock *s;
1917 	struct hlist_node *node;
1918 
1919 	sk_for_each(s, node, &packet_sklist) {
1920 		if (!off--)
1921 			return s;
1922 	}
1923 	return NULL;
1924 }
1925 
1926 static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
1927 {
1928 	read_lock(&packet_sklist_lock);
1929 	return *pos ? packet_seq_idx(*pos - 1) : SEQ_START_TOKEN;
1930 }
1931 
1932 static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1933 {
1934 	++*pos;
1935 	return  (v == SEQ_START_TOKEN)
1936 		? sk_head(&packet_sklist)
1937 		: sk_next((struct sock*)v) ;
1938 }
1939 
1940 static void packet_seq_stop(struct seq_file *seq, void *v)
1941 {
1942 	read_unlock(&packet_sklist_lock);
1943 }
1944 
1945 static int packet_seq_show(struct seq_file *seq, void *v)
1946 {
1947 	if (v == SEQ_START_TOKEN)
1948 		seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
1949 	else {
1950 		struct sock *s = v;
1951 		const struct packet_sock *po = pkt_sk(s);
1952 
1953 		seq_printf(seq,
1954 			   "%p %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
1955 			   s,
1956 			   atomic_read(&s->sk_refcnt),
1957 			   s->sk_type,
1958 			   ntohs(po->num),
1959 			   po->ifindex,
1960 			   po->running,
1961 			   atomic_read(&s->sk_rmem_alloc),
1962 			   sock_i_uid(s),
1963 			   sock_i_ino(s) );
1964 	}
1965 
1966 	return 0;
1967 }
1968 
1969 static struct seq_operations packet_seq_ops = {
1970 	.start	= packet_seq_start,
1971 	.next	= packet_seq_next,
1972 	.stop	= packet_seq_stop,
1973 	.show	= packet_seq_show,
1974 };
1975 
1976 static int packet_seq_open(struct inode *inode, struct file *file)
1977 {
1978 	return seq_open(file, &packet_seq_ops);
1979 }
1980 
1981 static const struct file_operations packet_seq_fops = {
1982 	.owner		= THIS_MODULE,
1983 	.open		= packet_seq_open,
1984 	.read		= seq_read,
1985 	.llseek		= seq_lseek,
1986 	.release	= seq_release,
1987 };
1988 
1989 #endif
1990 
1991 static void __exit packet_exit(void)
1992 {
1993 	proc_net_remove("packet");
1994 	unregister_netdevice_notifier(&packet_netdev_notifier);
1995 	sock_unregister(PF_PACKET);
1996 	proto_unregister(&packet_proto);
1997 }
1998 
1999 static int __init packet_init(void)
2000 {
2001 	int rc = proto_register(&packet_proto, 0);
2002 
2003 	if (rc != 0)
2004 		goto out;
2005 
2006 	sock_register(&packet_family_ops);
2007 	register_netdevice_notifier(&packet_netdev_notifier);
2008 	proc_net_fops_create("packet", 0, &packet_seq_fops);
2009 out:
2010 	return rc;
2011 }
2012 
2013 module_init(packet_init);
2014 module_exit(packet_exit);
2015 MODULE_LICENSE("GPL");
2016 MODULE_ALIAS_NETPROTO(PF_PACKET);
2017