xref: /linux/net/packet/af_packet.c (revision 2624f124b3b5d550ab2fbef7ee3bc0e1fed09722)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		PACKET - implements raw packet sockets.
7  *
8  * Version:	$Id: af_packet.c,v 1.61 2002/02/08 03:57:19 davem Exp $
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
13  *
14  * Fixes:
15  *		Alan Cox	:	verify_area() now used correctly
16  *		Alan Cox	:	new skbuff lists, look ma no backlogs!
17  *		Alan Cox	:	tidied skbuff lists.
18  *		Alan Cox	:	Now uses generic datagram routines I
19  *					added. Also fixed the peek/read crash
20  *					from all old Linux datagram code.
21  *		Alan Cox	:	Uses the improved datagram code.
22  *		Alan Cox	:	Added NULL's for socket options.
23  *		Alan Cox	:	Re-commented the code.
24  *		Alan Cox	:	Use new kernel side addressing
25  *		Rob Janssen	:	Correct MTU usage.
26  *		Dave Platt	:	Counter leaks caused by incorrect
27  *					interrupt locking and some slightly
28  *					dubious gcc output. Can you read
29  *					compiler: it said _VOLATILE_
30  *	Richard Kooijman	:	Timestamp fixes.
31  *		Alan Cox	:	New buffers. Use sk->mac.raw.
32  *		Alan Cox	:	sendmsg/recvmsg support.
33  *		Alan Cox	:	Protocol setting support
34  *	Alexey Kuznetsov	:	Untied from IPv4 stack.
35  *	Cyrus Durgin		:	Fixed kerneld for kmod.
36  *	Michal Ostrowski        :       Module initialization cleanup.
37  *         Ulises Alonso        :       Frame number limit removal and
38  *                                      packet_set_ring memory leak.
39  *
40  *		This program is free software; you can redistribute it and/or
41  *		modify it under the terms of the GNU General Public License
42  *		as published by the Free Software Foundation; either version
43  *		2 of the License, or (at your option) any later version.
44  *
45  */
46 
47 #include <linux/config.h>
48 #include <linux/types.h>
49 #include <linux/sched.h>
50 #include <linux/mm.h>
51 #include <linux/fcntl.h>
52 #include <linux/socket.h>
53 #include <linux/in.h>
54 #include <linux/inet.h>
55 #include <linux/netdevice.h>
56 #include <linux/if_packet.h>
57 #include <linux/wireless.h>
58 #include <linux/kmod.h>
59 #include <net/ip.h>
60 #include <net/protocol.h>
61 #include <linux/skbuff.h>
62 #include <net/sock.h>
63 #include <linux/errno.h>
64 #include <linux/timer.h>
65 #include <asm/system.h>
66 #include <asm/uaccess.h>
67 #include <asm/ioctls.h>
68 #include <asm/page.h>
69 #include <asm/io.h>
70 #include <linux/proc_fs.h>
71 #include <linux/seq_file.h>
72 #include <linux/poll.h>
73 #include <linux/module.h>
74 #include <linux/init.h>
75 
76 #ifdef CONFIG_INET
77 #include <net/inet_common.h>
78 #endif
79 
80 #define CONFIG_SOCK_PACKET	1
81 
82 /*
83    Proposed replacement for SIOC{ADD,DEL}MULTI and
84    IFF_PROMISC, IFF_ALLMULTI flags.
85 
86    It is more expensive, but I believe it is the really correct
87    solution: re-entrant, safe and fault tolerant.
88 
89    IFF_PROMISC/IFF_ALLMULTI/SIOC{ADD/DEL}MULTI are faked by keeping
90    reference count and global flag, so that real status is
91    (gflag|(count != 0)), so that we can use obsolete faulty interface
92    not harming clever users.
93  */
94 #define CONFIG_PACKET_MULTICAST	1
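
/*
 *	The interface described above is driven from user space through
 *	setsockopt(SOL_PACKET, PACKET_{ADD,DROP}_MEMBERSHIP) with a
 *	struct packet_mreq; see packet_mc_add()/packet_mc_drop() below.
 *	A minimal illustrative user space sketch (function name and
 *	interface name are placeholders, error handling is minimal):
 */
#if 0
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <net/if.h>		/* if_nametoindex() */
#include <netinet/in.h>		/* htons() */
#include <linux/if_ether.h>	/* ETH_P_ALL */
#include <linux/if_packet.h>	/* struct packet_mreq, PACKET_MR_PROMISC */

static int open_promisc_socket(const char *ifname)
{
	struct packet_mreq mreq;
	int fd = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL));

	if (fd < 0)
		return -1;

	memset(&mreq, 0, sizeof(mreq));
	mreq.mr_ifindex = if_nametoindex(ifname);
	mreq.mr_type    = PACKET_MR_PROMISC;

	/* reference counted promiscuous mode, undone automatically when
	 * the socket is released (see packet_flush_mclist()) */
	if (setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP,
		       &mreq, sizeof(mreq)) < 0) {
		close(fd);
		return -1;
	}
	return fd;
}
#endif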
95 
96 /*
97    Assumptions:
98    - if a device has no dev->hard_header routine, it adds and removes the
99      ll header itself. In this case the ll header is invisible outside the
100      device, but higher levels should still reserve dev->hard_header_len.
101      Some devices are clever enough to reallocate the skb when the header
102      does not fit into the reserved space (tunnels); others are silly
103      (PPP).
104    - a packet socket receives packets with the ll header already pulled,
105      so SOCK_RAW has to push it back.
106 
107 On receive:
108 -----------
109 
110 Incoming, dev->hard_header!=NULL
111    mac.raw -> ll header
112    data    -> data
113 
114 Outgoing, dev->hard_header!=NULL
115    mac.raw -> ll header
116    data    -> ll header
117 
118 Incoming, dev->hard_header==NULL
119    mac.raw -> UNKNOWN position. It very likely points to the ll header.
120               PPP does this, which is wrong, because it introduces
121               asymmetry between the rx and tx paths.
122    data    -> data
123 
124 Outgoing, dev->hard_header==NULL
125    mac.raw -> data. ll header is still not built!
126    data    -> data
127 
128 In summary:
129   If dev->hard_header==NULL we are unlikely to restore a sensible ll header.
130 
131 
132 On transmit:
133 ------------
134 
135 dev->hard_header != NULL
136    mac.raw -> ll header
137    data    -> ll header
138 
139 dev->hard_header == NULL (ll header is added by device, we cannot control it)
140    mac.raw -> data
141    data -> data
142 
143    We should set nh.raw on output to the correct position;
144    the packet classifier depends on it.
145  */
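
/*
 *	In practice this is what a reader sees on the two socket types:
 *	with SOCK_RAW the ll header is pushed back in front of the data,
 *	with SOCK_DGRAM it is stripped and only described by the
 *	sockaddr_ll.  A minimal illustrative user space sketch (the caller
 *	is assumed to have opened one SOCK_RAW and one SOCK_DGRAM
 *	PF_PACKET socket; error handling omitted):
 */
#if 0
#include <sys/types.h>
#include <sys/socket.h>
#include <linux/if_packet.h>	/* struct sockaddr_ll */

void show_header_difference(int raw_fd, int dgram_fd)
{
	unsigned char buf[2048];
	struct sockaddr_ll from;
	socklen_t fromlen = sizeof(from);
	ssize_t n;

	/* SOCK_RAW: the ll header is part of the data; for Ethernet,
	 * buf[0..13] is the struct ethhdr itself. */
	n = recvfrom(raw_fd, buf, sizeof(buf), 0,
		     (struct sockaddr *)&from, &fromlen);

	/* SOCK_DGRAM: the ll header has been removed; buf starts with the
	 * payload and the link level information is only reported in
	 * "from" (sll_addr, sll_halen, sll_pkttype, ...). */
	fromlen = sizeof(from);
	n = recvfrom(dgram_fd, buf, sizeof(buf), 0,
		     (struct sockaddr *)&from, &fromlen);
	(void)n;
}
#endif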
146 
147 /* List of all packet sockets. */
148 static HLIST_HEAD(packet_sklist);
149 static DEFINE_RWLOCK(packet_sklist_lock);
150 
151 static atomic_t packet_socks_nr;
152 
153 
154 /* Private packet socket structures. */
155 
156 #ifdef CONFIG_PACKET_MULTICAST
157 struct packet_mclist
158 {
159 	struct packet_mclist	*next;
160 	int			ifindex;
161 	int			count;
162 	unsigned short		type;
163 	unsigned short		alen;
164 	unsigned char		addr[8];
165 };
166 #endif
167 #ifdef CONFIG_PACKET_MMAP
168 static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing);
169 #endif
170 
171 static void packet_flush_mclist(struct sock *sk);
172 
173 struct packet_sock {
174 	/* struct sock has to be the first member of packet_sock */
175 	struct sock		sk;
176 	struct tpacket_stats	stats;
177 #ifdef CONFIG_PACKET_MMAP
178 	char *			*pg_vec;
179 	unsigned int		head;
180 	unsigned int            frames_per_block;
181 	unsigned int		frame_size;
182 	unsigned int		frame_max;
183 	int			copy_thresh;
184 #endif
185 	struct packet_type	prot_hook;
186 	spinlock_t		bind_lock;
187 	char			running;	/* prot_hook is attached*/
188 	int			ifindex;	/* bound device		*/
189 	unsigned short		num;
190 #ifdef CONFIG_PACKET_MULTICAST
191 	struct packet_mclist	*mclist;
192 #endif
193 #ifdef CONFIG_PACKET_MMAP
194 	atomic_t		mapped;
195 	unsigned int            pg_vec_order;
196 	unsigned int		pg_vec_pages;
197 	unsigned int		pg_vec_len;
198 #endif
199 };
200 
201 #ifdef CONFIG_PACKET_MMAP
202 
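/* Map a linear frame index onto the rx ring: pg_vec is a vector of
 * blocks, each block holding frames_per_block fixed size frames, and
 * frames never cross a block boundary.
 */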
203 static inline char *packet_lookup_frame(struct packet_sock *po, unsigned int position)
204 {
205 	unsigned int pg_vec_pos, frame_offset;
206 	char *frame;
207 
208 	pg_vec_pos = position / po->frames_per_block;
209 	frame_offset = position % po->frames_per_block;
210 
211 	frame = po->pg_vec[pg_vec_pos] + (frame_offset * po->frame_size);
212 
213 	return frame;
214 }
215 #endif
216 
217 static inline struct packet_sock *pkt_sk(struct sock *sk)
218 {
219 	return (struct packet_sock *)sk;
220 }
221 
222 static void packet_sock_destruct(struct sock *sk)
223 {
224 	BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc));
225 	BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc));
226 
227 	if (!sock_flag(sk, SOCK_DEAD)) {
228 		printk(KERN_ERR "Attempt to release alive packet socket: %p\n", sk);
229 		return;
230 	}
231 
232 	atomic_dec(&packet_socks_nr);
233 #ifdef PACKET_REFCNT_DEBUG
234 	printk(KERN_DEBUG "PACKET socket %p is free, %d are alive\n", sk, atomic_read(&packet_socks_nr));
235 #endif
236 }
237 
238 
239 static struct proto_ops packet_ops;
240 
241 #ifdef CONFIG_SOCK_PACKET
242 static struct proto_ops packet_ops_spkt;
243 
244 static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,  struct packet_type *pt, struct net_device *orig_dev)
245 {
246 	struct sock *sk;
247 	struct sockaddr_pkt *spkt;
248 
249 	/*
250 	 *	When we registered the protocol we saved the socket in the data
251 	 *	field for just this event.
252 	 */
253 
254 	sk = pt->af_packet_priv;
255 
256 	/*
257 	 *	Yank back the headers [hope the device set this
258 	 *	right or kerboom...]
259 	 *
260 	 *	Incoming packets have ll header pulled,
261 	 *	push it back.
262 	 *
263 	 *	For outgoing ones skb->data == skb->mac.raw,
264 	 *	so this procedure is a no-op.
265 	 */
266 
267 	if (skb->pkt_type == PACKET_LOOPBACK)
268 		goto out;
269 
270 	if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
271 		goto oom;
272 
273 	/* drop any routing info */
274 	dst_release(skb->dst);
275 	skb->dst = NULL;
276 
277 	/* drop conntrack reference */
278 	nf_reset(skb);
279 
280 	spkt = (struct sockaddr_pkt*)skb->cb;
281 
282 	skb_push(skb, skb->data-skb->mac.raw);
283 
284 	/*
285 	 *	The SOCK_PACKET socket receives _all_ frames.
286 	 */
287 
288 	spkt->spkt_family = dev->type;
289 	strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
290 	spkt->spkt_protocol = skb->protocol;
291 
292 	/*
293 	 *	Charge the memory to the socket. This is done specifically
294 	 *	to prevent sockets from using up all the memory.
295 	 */
296 
297 	if (sock_queue_rcv_skb(sk,skb) == 0)
298 		return 0;
299 
300 out:
301 	kfree_skb(skb);
302 oom:
303 	return 0;
304 }
305 
306 
307 /*
308  *	Output a raw packet to the device layer. This bypasses all the other
309  *	protocol layers, so you must supply it with a complete frame.
310  */
311 
312 static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
313 			       struct msghdr *msg, size_t len)
314 {
315 	struct sock *sk = sock->sk;
316 	struct sockaddr_pkt *saddr=(struct sockaddr_pkt *)msg->msg_name;
317 	struct sk_buff *skb;
318 	struct net_device *dev;
319 	unsigned short proto=0;
320 	int err;
321 
322 	/*
323 	 *	Get and verify the address.
324 	 */
325 
326 	if (saddr)
327 	{
328 		if (msg->msg_namelen < sizeof(struct sockaddr))
329 			return(-EINVAL);
330 		if (msg->msg_namelen==sizeof(struct sockaddr_pkt))
331 			proto=saddr->spkt_protocol;
332 	}
333 	else
334 		return(-ENOTCONN);	/* SOCK_PACKET must be sent giving an address */
335 
336 	/*
337 	 *	Find the device first to size check it
338 	 */
339 
340 	saddr->spkt_device[13] = 0;
341 	dev = dev_get_by_name(saddr->spkt_device);
342 	err = -ENODEV;
343 	if (dev == NULL)
344 		goto out_unlock;
345 
346 	/*
347 	 *	You may not queue a frame bigger than the mtu. This is the lowest level
348 	 *	raw protocol and you must do your own fragmentation at this level.
349 	 */
350 
351 	err = -EMSGSIZE;
352  	if(len>dev->mtu+dev->hard_header_len)
353 		goto out_unlock;
354 
355 	err = -ENOBUFS;
356 	skb = sock_wmalloc(sk, len + LL_RESERVED_SPACE(dev), 0, GFP_KERNEL);
357 
358 	/*
359 	 *	If the write buffer is full, then tough. At this level the user gets to
360 	 *	deal with the problem - do your own algorithmic backoffs. That's far
361 	 *	more flexible.
362 	 */
363 
364 	if (skb == NULL)
365 		goto out_unlock;
366 
367 	/*
368 	 *	Fill it in
369 	 */
370 
371 	/* FIXME: Save some space for broken drivers that write a
372 	 * hard header at transmission time by themselves. PPP is the
373 	 * notable one here. This should really be fixed at the driver level.
374 	 */
375 	skb_reserve(skb, LL_RESERVED_SPACE(dev));
376 	skb->nh.raw = skb->data;
377 
378 	/* Try to align data part correctly */
379 	if (dev->hard_header) {
380 		skb->data -= dev->hard_header_len;
381 		skb->tail -= dev->hard_header_len;
382 		if (len < dev->hard_header_len)
383 			skb->nh.raw = skb->data;
384 	}
385 
386 	/* Returns -EFAULT on error */
387 	err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
388 	skb->protocol = proto;
389 	skb->dev = dev;
390 	skb->priority = sk->sk_priority;
391 	if (err)
392 		goto out_free;
393 
394 	err = -ENETDOWN;
395 	if (!(dev->flags & IFF_UP))
396 		goto out_free;
397 
398 	/*
399 	 *	Now send it
400 	 */
401 
402 	dev_queue_xmit(skb);
403 	dev_put(dev);
404 	return(len);
405 
406 out_free:
407 	kfree_skb(skb);
408 out_unlock:
409 	if (dev)
410 		dev_put(dev);
411 	return err;
412 }
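
/*
 *	For the (obsolete) SOCK_PACKET interface the destination is named
 *	by device name in a struct sockaddr_pkt and the caller supplies the
 *	complete frame, ll header included.  A minimal illustrative user
 *	space sketch ("eth0", the protocol and the function name are
 *	placeholders; error handling is minimal):
 */
#if 0
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/if_ether.h>	/* ETH_P_ALL, ETH_P_IP */
#include <linux/if_packet.h>	/* struct sockaddr_pkt */

static ssize_t send_spkt_frame(const unsigned char *frame, size_t len)
{
	struct sockaddr_pkt spkt;
	ssize_t ret;
	int fd = socket(PF_PACKET, SOCK_PACKET, htons(ETH_P_ALL));

	if (fd < 0)
		return -1;

	memset(&spkt, 0, sizeof(spkt));
	spkt.spkt_family = AF_PACKET;
	strncpy((char *)spkt.spkt_device, "eth0", sizeof(spkt.spkt_device));
	spkt.spkt_protocol = htons(ETH_P_IP);

	/* "frame" must be a complete frame of at most
	 * dev->mtu + hard_header_len bytes, see the check above */
	ret = sendto(fd, frame, len, 0,
		     (struct sockaddr *)&spkt, sizeof(spkt));
	close(fd);
	return ret;
}
#endif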
413 #endif
414 
415 static inline unsigned run_filter(struct sk_buff *skb, struct sock *sk, unsigned res)
416 {
417 	struct sk_filter *filter;
418 
419 	bh_lock_sock(sk);
420 	filter = sk->sk_filter;
421 	/*
422 	 * Our caller already checked that filter != NULL but we need to
423 	 * verify that under bh_lock_sock() to be safe
424 	 */
425 	if (likely(filter != NULL))
426 		res = sk_run_filter(skb, filter->insns, filter->len);
427 	bh_unlock_sock(sk);
428 
429 	return res;
430 }
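
/*
 *	The filter run here is the classic BPF program that user space
 *	attaches with setsockopt(SOL_SOCKET, SO_ATTACH_FILTER).  A minimal
 *	illustrative sketch; the one-instruction program below accepts
 *	every packet in full (a return value of 0 would drop it, a smaller
 *	value would truncate the captured part to that many bytes):
 */
#if 0
#include <sys/socket.h>
#include <linux/filter.h>	/* struct sock_filter, struct sock_fprog */

static int attach_accept_all_filter(int fd)
{
	static struct sock_filter code[] = {
		/* BPF_RET | BPF_K: return a constant, i.e. the snaplen */
		{ BPF_RET | BPF_K, 0, 0, 0xffffffff },
	};
	struct sock_fprog prog = {
		.len	= sizeof(code) / sizeof(code[0]),
		.filter	= code,
	};

	return setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER,
			  &prog, sizeof(prog));
}
#endif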
431 
432 /*
433    This function does lazy skb cloning in the hope that most packets
434    are discarded by BPF.
435 
436    Note the tricky part: we DO mangle a shared skb! skb->data, skb->len
437    and skb->cb are mangled. It works because (and until) packets
438    arriving here are owned by the current CPU. Output packets are cloned
439    by dev_queue_xmit_nit() and input packets are processed by net_bh
440    sequentially, so as long as we return the skb to its original state
441    on exit, we will not harm anyone.
442  */
443 
444 static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
445 {
446 	struct sock *sk;
447 	struct sockaddr_ll *sll;
448 	struct packet_sock *po;
449 	u8 * skb_head = skb->data;
450 	int skb_len = skb->len;
451 	unsigned snaplen;
452 
453 	if (skb->pkt_type == PACKET_LOOPBACK)
454 		goto drop;
455 
456 	sk = pt->af_packet_priv;
457 	po = pkt_sk(sk);
458 
459 	skb->dev = dev;
460 
461 	if (dev->hard_header) {
462 		/* The device has an explicit notion of ll header,
463 		   exported to higher levels.
464 
465 		   Otherwise, the device hides the details of its frame
466 		   structure, so the corresponding packet header is
467 		   never delivered to the user.
468 		 */
469 		if (sk->sk_type != SOCK_DGRAM)
470 			skb_push(skb, skb->data - skb->mac.raw);
471 		else if (skb->pkt_type == PACKET_OUTGOING) {
472 			/* Special case: outgoing packets have ll header at head */
473 			skb_pull(skb, skb->nh.raw - skb->data);
474 		}
475 	}
476 
477 	snaplen = skb->len;
478 
479 	if (sk->sk_filter) {
480 		unsigned res = run_filter(skb, sk, snaplen);
481 		if (res == 0)
482 			goto drop_n_restore;
483 		if (snaplen > res)
484 			snaplen = res;
485 	}
486 
487 	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
488 	    (unsigned)sk->sk_rcvbuf)
489 		goto drop_n_acct;
490 
491 	if (skb_shared(skb)) {
492 		struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
493 		if (nskb == NULL)
494 			goto drop_n_acct;
495 
496 		if (skb_head != skb->data) {
497 			skb->data = skb_head;
498 			skb->len = skb_len;
499 		}
500 		kfree_skb(skb);
501 		skb = nskb;
502 	}
503 
504 	sll = (struct sockaddr_ll*)skb->cb;
505 	sll->sll_family = AF_PACKET;
506 	sll->sll_hatype = dev->type;
507 	sll->sll_protocol = skb->protocol;
508 	sll->sll_pkttype = skb->pkt_type;
509 	sll->sll_ifindex = dev->ifindex;
510 	sll->sll_halen = 0;
511 
512 	if (dev->hard_header_parse)
513 		sll->sll_halen = dev->hard_header_parse(skb, sll->sll_addr);
514 
515 	if (pskb_trim(skb, snaplen))
516 		goto drop_n_acct;
517 
518 	skb_set_owner_r(skb, sk);
519 	skb->dev = NULL;
520 	dst_release(skb->dst);
521 	skb->dst = NULL;
522 
523 	/* drop conntrack reference */
524 	nf_reset(skb);
525 
526 	spin_lock(&sk->sk_receive_queue.lock);
527 	po->stats.tp_packets++;
528 	__skb_queue_tail(&sk->sk_receive_queue, skb);
529 	spin_unlock(&sk->sk_receive_queue.lock);
530 	sk->sk_data_ready(sk, skb->len);
531 	return 0;
532 
533 drop_n_acct:
534 	spin_lock(&sk->sk_receive_queue.lock);
535 	po->stats.tp_drops++;
536 	spin_unlock(&sk->sk_receive_queue.lock);
537 
538 drop_n_restore:
539 	if (skb_head != skb->data && skb_shared(skb)) {
540 		skb->data = skb_head;
541 		skb->len = skb_len;
542 	}
543 drop:
544 	kfree_skb(skb);
545 	return 0;
546 }
547 
548 #ifdef CONFIG_PACKET_MMAP
549 static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
550 {
551 	struct sock *sk;
552 	struct packet_sock *po;
553 	struct sockaddr_ll *sll;
554 	struct tpacket_hdr *h;
555 	u8 * skb_head = skb->data;
556 	int skb_len = skb->len;
557 	unsigned snaplen;
558 	unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
559 	unsigned short macoff, netoff;
560 	struct sk_buff *copy_skb = NULL;
561 
562 	if (skb->pkt_type == PACKET_LOOPBACK)
563 		goto drop;
564 
565 	sk = pt->af_packet_priv;
566 	po = pkt_sk(sk);
567 
568 	if (dev->hard_header) {
569 		if (sk->sk_type != SOCK_DGRAM)
570 			skb_push(skb, skb->data - skb->mac.raw);
571 		else if (skb->pkt_type == PACKET_OUTGOING) {
572 			/* Special case: outgoing packets have ll header at head */
573 			skb_pull(skb, skb->nh.raw - skb->data);
574 			if (skb->ip_summed == CHECKSUM_HW)
575 				status |= TP_STATUS_CSUMNOTREADY;
576 		}
577 	}
578 
579 	snaplen = skb->len;
580 
581 	if (sk->sk_filter) {
582 		unsigned res = run_filter(skb, sk, snaplen);
583 		if (res == 0)
584 			goto drop_n_restore;
585 		if (snaplen > res)
586 			snaplen = res;
587 	}
588 
589 	if (sk->sk_type == SOCK_DGRAM) {
590 		macoff = netoff = TPACKET_ALIGN(TPACKET_HDRLEN) + 16;
591 	} else {
592 		unsigned maclen = skb->nh.raw - skb->data;
593 		netoff = TPACKET_ALIGN(TPACKET_HDRLEN + (maclen < 16 ? 16 : maclen));
594 		macoff = netoff - maclen;
595 	}
596 
597 	if (macoff + snaplen > po->frame_size) {
598 		if (po->copy_thresh &&
599 		    atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
600 		    (unsigned)sk->sk_rcvbuf) {
601 			if (skb_shared(skb)) {
602 				copy_skb = skb_clone(skb, GFP_ATOMIC);
603 			} else {
604 				copy_skb = skb_get(skb);
605 				skb_head = skb->data;
606 			}
607 			if (copy_skb)
608 				skb_set_owner_r(copy_skb, sk);
609 		}
610 		snaplen = po->frame_size - macoff;
611 		if ((int)snaplen < 0)
612 			snaplen = 0;
613 	}
614 	if (snaplen > skb->len-skb->data_len)
615 		snaplen = skb->len-skb->data_len;
616 
617 	spin_lock(&sk->sk_receive_queue.lock);
618 	h = (struct tpacket_hdr *)packet_lookup_frame(po, po->head);
619 
620 	if (h->tp_status)
621 		goto ring_is_full;
622 	po->head = po->head != po->frame_max ? po->head+1 : 0;
623 	po->stats.tp_packets++;
624 	if (copy_skb) {
625 		status |= TP_STATUS_COPY;
626 		__skb_queue_tail(&sk->sk_receive_queue, copy_skb);
627 	}
628 	if (!po->stats.tp_drops)
629 		status &= ~TP_STATUS_LOSING;
630 	spin_unlock(&sk->sk_receive_queue.lock);
631 
632 	memcpy((u8*)h + macoff, skb->data, snaplen);
633 
634 	h->tp_len = skb->len;
635 	h->tp_snaplen = snaplen;
636 	h->tp_mac = macoff;
637 	h->tp_net = netoff;
638 	if (skb->tstamp.off_sec == 0) {
639 		__net_timestamp(skb);
640 		sock_enable_timestamp(sk);
641 	}
642 	h->tp_sec = skb_tv_base.tv_sec + skb->tstamp.off_sec;
643 	h->tp_usec = skb_tv_base.tv_usec + skb->tstamp.off_usec;
644 
645 	sll = (struct sockaddr_ll*)((u8*)h + TPACKET_ALIGN(sizeof(*h)));
646 	sll->sll_halen = 0;
647 	if (dev->hard_header_parse)
648 		sll->sll_halen = dev->hard_header_parse(skb, sll->sll_addr);
649 	sll->sll_family = AF_PACKET;
650 	sll->sll_hatype = dev->type;
651 	sll->sll_protocol = skb->protocol;
652 	sll->sll_pkttype = skb->pkt_type;
653 	sll->sll_ifindex = dev->ifindex;
654 
655 	h->tp_status = status;
656 	mb();
657 
658 	{
659 		struct page *p_start, *p_end;
660 		u8 *h_end = (u8 *)h + macoff + snaplen - 1;
661 
662 		p_start = virt_to_page(h);
663 		p_end = virt_to_page(h_end);
664 		while (p_start <= p_end) {
665 			flush_dcache_page(p_start);
666 			p_start++;
667 		}
668 	}
669 
670 	sk->sk_data_ready(sk, 0);
671 
672 drop_n_restore:
673 	if (skb_head != skb->data && skb_shared(skb)) {
674 		skb->data = skb_head;
675 		skb->len = skb_len;
676 	}
677 drop:
678 	kfree_skb(skb);
679 	return 0;
680 
681 ring_is_full:
682 	po->stats.tp_drops++;
683 	spin_unlock(&sk->sk_receive_queue.lock);
684 
685 	sk->sk_data_ready(sk, 0);
686 	if (copy_skb)
687 		kfree_skb(copy_skb);
688 	goto drop_n_restore;
689 }
690 
691 #endif
692 
693 
694 static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
695 			  struct msghdr *msg, size_t len)
696 {
697 	struct sock *sk = sock->sk;
698 	struct sockaddr_ll *saddr=(struct sockaddr_ll *)msg->msg_name;
699 	struct sk_buff *skb;
700 	struct net_device *dev;
701 	unsigned short proto;
702 	unsigned char *addr;
703 	int ifindex, err, reserve = 0;
704 
705 	/*
706 	 *	Get and verify the address.
707 	 */
708 
709 	if (saddr == NULL) {
710 		struct packet_sock *po = pkt_sk(sk);
711 
712 		ifindex	= po->ifindex;
713 		proto	= po->num;
714 		addr	= NULL;
715 	} else {
716 		err = -EINVAL;
717 		if (msg->msg_namelen < sizeof(struct sockaddr_ll))
718 			goto out;
719 		ifindex	= saddr->sll_ifindex;
720 		proto	= saddr->sll_protocol;
721 		addr	= saddr->sll_addr;
722 	}
723 
724 
725 	dev = dev_get_by_index(ifindex);
726 	err = -ENXIO;
727 	if (dev == NULL)
728 		goto out_unlock;
729 	if (sock->type == SOCK_RAW)
730 		reserve = dev->hard_header_len;
731 
732 	err = -EMSGSIZE;
733 	if (len > dev->mtu+reserve)
734 		goto out_unlock;
735 
736 	skb = sock_alloc_send_skb(sk, len + LL_RESERVED_SPACE(dev),
737 				msg->msg_flags & MSG_DONTWAIT, &err);
738 	if (skb==NULL)
739 		goto out_unlock;
740 
741 	skb_reserve(skb, LL_RESERVED_SPACE(dev));
742 	skb->nh.raw = skb->data;
743 
744 	if (dev->hard_header) {
745 		int res;
746 		err = -EINVAL;
747 		res = dev->hard_header(skb, dev, ntohs(proto), addr, NULL, len);
748 		if (sock->type != SOCK_DGRAM) {
749 			skb->tail = skb->data;
750 			skb->len = 0;
751 		} else if (res < 0)
752 			goto out_free;
753 	}
754 
755 	/* Returns -EFAULT on error */
756 	err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
757 	if (err)
758 		goto out_free;
759 
760 	skb->protocol = proto;
761 	skb->dev = dev;
762 	skb->priority = sk->sk_priority;
763 
764 	err = -ENETDOWN;
765 	if (!(dev->flags & IFF_UP))
766 		goto out_free;
767 
768 	/*
769 	 *	Now send it
770 	 */
771 
772 	err = dev_queue_xmit(skb);
773 	if (err > 0 && (err = net_xmit_errno(err)) != 0)
774 		goto out_unlock;
775 
776 	dev_put(dev);
777 
778 	return(len);
779 
780 out_free:
781 	kfree_skb(skb);
782 out_unlock:
783 	if (dev)
784 		dev_put(dev);
785 out:
786 	return err;
787 }
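
/*
 *	With SOCK_DGRAM the caller supplies only the payload and names the
 *	destination in a struct sockaddr_ll; dev->hard_header() above then
 *	builds the ll header.  A minimal illustrative user space sketch
 *	(interface name, destination MAC and protocol are placeholders;
 *	error handling is minimal):
 */
#if 0
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <net/if.h>		/* if_nametoindex() */
#include <netinet/in.h>
#include <linux/if_ether.h>	/* ETH_P_IP, ETH_ALEN */
#include <linux/if_packet.h>	/* struct sockaddr_ll */

static ssize_t send_dgram_payload(const void *payload, size_t len,
				  const unsigned char dst_mac[ETH_ALEN])
{
	struct sockaddr_ll sll;
	ssize_t ret;
	int fd = socket(PF_PACKET, SOCK_DGRAM, htons(ETH_P_IP));

	if (fd < 0)
		return -1;

	memset(&sll, 0, sizeof(sll));
	sll.sll_family   = AF_PACKET;
	sll.sll_protocol = htons(ETH_P_IP);
	sll.sll_ifindex  = if_nametoindex("eth0");
	sll.sll_halen    = ETH_ALEN;
	memcpy(sll.sll_addr, dst_mac, ETH_ALEN);

	/* len is limited to dev->mtu, see the -EMSGSIZE check above */
	ret = sendto(fd, payload, len, 0,
		     (struct sockaddr *)&sll, sizeof(sll));
	close(fd);
	return ret;
}
#endif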
788 
789 /*
790  *	Close a PACKET socket. This is fairly simple. We immediately go
791  *	to 'closed' state and remove our protocol entry in the device list.
792  */
793 
794 static int packet_release(struct socket *sock)
795 {
796 	struct sock *sk = sock->sk;
797 	struct packet_sock *po;
798 
799 	if (!sk)
800 		return 0;
801 
802 	po = pkt_sk(sk);
803 
804 	write_lock_bh(&packet_sklist_lock);
805 	sk_del_node_init(sk);
806 	write_unlock_bh(&packet_sklist_lock);
807 
808 	/*
809 	 *	Unhook packet receive handler.
810 	 */
811 
812 	if (po->running) {
813 		/*
814 		 *	Remove the protocol hook
815 		 */
816 		dev_remove_pack(&po->prot_hook);
817 		po->running = 0;
818 		po->num = 0;
819 		__sock_put(sk);
820 	}
821 
822 #ifdef CONFIG_PACKET_MULTICAST
823 	packet_flush_mclist(sk);
824 #endif
825 
826 #ifdef CONFIG_PACKET_MMAP
827 	if (po->pg_vec) {
828 		struct tpacket_req req;
829 		memset(&req, 0, sizeof(req));
830 		packet_set_ring(sk, &req, 1);
831 	}
832 #endif
833 
834 	/*
835 	 *	Now the socket is dead. No more input will appear.
836 	 */
837 
838 	sock_orphan(sk);
839 	sock->sk = NULL;
840 
841 	/* Purge queues */
842 
843 	skb_queue_purge(&sk->sk_receive_queue);
844 
845 	sock_put(sk);
846 	return 0;
847 }
848 
849 /*
850  *	Attach a packet hook.
851  */
852 
853 static int packet_do_bind(struct sock *sk, struct net_device *dev, int protocol)
854 {
855 	struct packet_sock *po = pkt_sk(sk);
856 	/*
857 	 *	Detach an existing hook if present.
858 	 */
859 
860 	lock_sock(sk);
861 
862 	spin_lock(&po->bind_lock);
863 	if (po->running) {
864 		__sock_put(sk);
865 		po->running = 0;
866 		po->num = 0;
867 		spin_unlock(&po->bind_lock);
868 		dev_remove_pack(&po->prot_hook);
869 		spin_lock(&po->bind_lock);
870 	}
871 
872 	po->num = protocol;
873 	po->prot_hook.type = protocol;
874 	po->prot_hook.dev = dev;
875 
876 	po->ifindex = dev ? dev->ifindex : 0;
877 
878 	if (protocol == 0)
879 		goto out_unlock;
880 
881 	if (dev) {
882 		if (dev->flags&IFF_UP) {
883 			dev_add_pack(&po->prot_hook);
884 			sock_hold(sk);
885 			po->running = 1;
886 		} else {
887 			sk->sk_err = ENETDOWN;
888 			if (!sock_flag(sk, SOCK_DEAD))
889 				sk->sk_error_report(sk);
890 		}
891 	} else {
892 		dev_add_pack(&po->prot_hook);
893 		sock_hold(sk);
894 		po->running = 1;
895 	}
896 
897 out_unlock:
898 	spin_unlock(&po->bind_lock);
899 	release_sock(sk);
900 	return 0;
901 }
902 
903 /*
904  *	Bind a packet socket to a device
905  */
906 
907 #ifdef CONFIG_SOCK_PACKET
908 
909 static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, int addr_len)
910 {
911 	struct sock *sk=sock->sk;
912 	char name[15];
913 	struct net_device *dev;
914 	int err = -ENODEV;
915 
916 	/*
917 	 *	Check legality
918 	 */
919 
920 	if(addr_len!=sizeof(struct sockaddr))
921 		return -EINVAL;
922 	strlcpy(name,uaddr->sa_data,sizeof(name));
923 
924 	dev = dev_get_by_name(name);
925 	if (dev) {
926 		err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
927 		dev_put(dev);
928 	}
929 	return err;
930 }
931 #endif
932 
933 static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
934 {
935 	struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr;
936 	struct sock *sk=sock->sk;
937 	struct net_device *dev = NULL;
938 	int err;
939 
940 
941 	/*
942 	 *	Check legality
943 	 */
944 
945 	if (addr_len < sizeof(struct sockaddr_ll))
946 		return -EINVAL;
947 	if (sll->sll_family != AF_PACKET)
948 		return -EINVAL;
949 
950 	if (sll->sll_ifindex) {
951 		err = -ENODEV;
952 		dev = dev_get_by_index(sll->sll_ifindex);
953 		if (dev == NULL)
954 			goto out;
955 	}
956 	err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
957 	if (dev)
958 		dev_put(dev);
959 
960 out:
961 	return err;
962 }
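
/*
 *	From user space this bind() is typically used to restrict an
 *	already open PF_PACKET socket to a single interface.  A minimal
 *	illustrative sketch (the function name is a placeholder):
 */
#if 0
#include <string.h>
#include <sys/socket.h>
#include <net/if.h>		/* if_nametoindex() */
#include <netinet/in.h>
#include <linux/if_ether.h>	/* ETH_P_ALL */
#include <linux/if_packet.h>	/* struct sockaddr_ll */

static int bind_to_interface(int fd, const char *ifname)
{
	struct sockaddr_ll sll;

	memset(&sll, 0, sizeof(sll));
	sll.sll_family   = AF_PACKET;
	sll.sll_protocol = htons(ETH_P_ALL);	/* 0 keeps the current protocol */
	sll.sll_ifindex  = if_nametoindex(ifname);

	return bind(fd, (struct sockaddr *)&sll, sizeof(sll));
}
#endif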
963 
964 static struct proto packet_proto = {
965 	.name	  = "PACKET",
966 	.owner	  = THIS_MODULE,
967 	.obj_size = sizeof(struct packet_sock),
968 };
969 
970 /*
971  *	Create a packet of type SOCK_PACKET.
972  */
973 
974 static int packet_create(struct socket *sock, int protocol)
975 {
976 	struct sock *sk;
977 	struct packet_sock *po;
978 	int err;
979 
980 	if (!capable(CAP_NET_RAW))
981 		return -EPERM;
982 	if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW
983 #ifdef CONFIG_SOCK_PACKET
984 	    && sock->type != SOCK_PACKET
985 #endif
986 	    )
987 		return -ESOCKTNOSUPPORT;
988 
989 	sock->state = SS_UNCONNECTED;
990 
991 	err = -ENOBUFS;
992 	sk = sk_alloc(PF_PACKET, GFP_KERNEL, &packet_proto, 1);
993 	if (sk == NULL)
994 		goto out;
995 
996 	sock->ops = &packet_ops;
997 #ifdef CONFIG_SOCK_PACKET
998 	if (sock->type == SOCK_PACKET)
999 		sock->ops = &packet_ops_spkt;
1000 #endif
1001 	sock_init_data(sock, sk);
1002 
1003 	po = pkt_sk(sk);
1004 	sk->sk_family = PF_PACKET;
1005 	po->num = protocol;
1006 
1007 	sk->sk_destruct = packet_sock_destruct;
1008 	atomic_inc(&packet_socks_nr);
1009 
1010 	/*
1011 	 *	Attach a protocol block
1012 	 */
1013 
1014 	spin_lock_init(&po->bind_lock);
1015 	po->prot_hook.func = packet_rcv;
1016 #ifdef CONFIG_SOCK_PACKET
1017 	if (sock->type == SOCK_PACKET)
1018 		po->prot_hook.func = packet_rcv_spkt;
1019 #endif
1020 	po->prot_hook.af_packet_priv = sk;
1021 
1022 	if (protocol) {
1023 		po->prot_hook.type = protocol;
1024 		dev_add_pack(&po->prot_hook);
1025 		sock_hold(sk);
1026 		po->running = 1;
1027 	}
1028 
1029 	write_lock_bh(&packet_sklist_lock);
1030 	sk_add_node(sk, &packet_sklist);
1031 	write_unlock_bh(&packet_sklist_lock);
1032 	return(0);
1033 out:
1034 	return err;
1035 }
1036 
1037 /*
1038  *	Pull a packet from our receive queue and hand it to the user.
1039  *	If necessary we block.
1040  */
1041 
1042 static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
1043 			  struct msghdr *msg, size_t len, int flags)
1044 {
1045 	struct sock *sk = sock->sk;
1046 	struct sk_buff *skb;
1047 	int copied, err;
1048 
1049 	err = -EINVAL;
1050 	if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT))
1051 		goto out;
1052 
1053 #if 0
1054 	/* What error should we return now? EUNATTACH? */
1055 	if (pkt_sk(sk)->ifindex < 0)
1056 		return -ENODEV;
1057 #endif
1058 
1059 	/*
1060 	 *	If the address length field is there to be filled in, we fill
1061 	 *	it in now.
1062 	 */
1063 
1064 	if (sock->type == SOCK_PACKET)
1065 		msg->msg_namelen = sizeof(struct sockaddr_pkt);
1066 	else
1067 		msg->msg_namelen = sizeof(struct sockaddr_ll);
1068 
1069 	/*
1070 	 *	Call the generic datagram receiver. This handles all sorts
1071 	 *	of horrible races and re-entrancy so we can forget about it
1072 	 *	in the protocol layers.
1073 	 *
1074 	 *	Now it will return ENETDOWN if the device has just gone down,
1075 	 *	but then it will block.
1076 	 */
1077 
1078 	skb=skb_recv_datagram(sk,flags,flags&MSG_DONTWAIT,&err);
1079 
1080 	/*
1081 	 *	An error occurred, so return it. Because skb_recv_datagram()
1082 	 *	handles the blocking, we don't have to see or worry about
1083 	 *	blocking retries.
1084 	 */
1085 
1086 	if(skb==NULL)
1087 		goto out;
1088 
1089 	/*
1090 	 *	You lose any data beyond the buffer you gave. If it worries a
1091 	 *	You lose any data beyond the buffer you gave. If this worries a
1092 	 *	user program, it can ask the device for its MTU anyway.
1093 
1094 	copied = skb->len;
1095 	if (copied > len)
1096 	{
1097 		copied=len;
1098 		msg->msg_flags|=MSG_TRUNC;
1099 	}
1100 
1101 	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
1102 	if (err)
1103 		goto out_free;
1104 
1105 	sock_recv_timestamp(msg, sk, skb);
1106 
1107 	if (msg->msg_name)
1108 		memcpy(msg->msg_name, skb->cb, msg->msg_namelen);
1109 
1110 	/*
1111 	 *	Free or return the buffer as appropriate. Again this
1112 	 *	hides all the races and re-entrancy issues from us.
1113 	 */
1114 	err = (flags&MSG_TRUNC) ? skb->len : copied;
1115 
1116 out_free:
1117 	skb_free_datagram(sk, skb);
1118 out:
1119 	return err;
1120 }
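
/*
 *	Seen from user space: recvfrom() returns the sockaddr_ll (or
 *	sockaddr_pkt) prepared in skb->cb on receive, and passing MSG_TRUNC
 *	makes the call return the original packet length even when the
 *	buffer was smaller.  A minimal illustrative sketch:
 */
#if 0
#include <sys/types.h>
#include <sys/socket.h>
#include <linux/if_packet.h>	/* struct sockaddr_ll */

static ssize_t read_one_packet(int fd, void *buf, size_t buflen,
			       struct sockaddr_ll *from)
{
	socklen_t fromlen = sizeof(*from);

	/* the return value is the full on-wire length thanks to MSG_TRUNC;
	 * from->sll_ifindex, sll_pkttype and sll_addr describe the origin */
	return recvfrom(fd, buf, buflen, MSG_TRUNC,
			(struct sockaddr *)from, &fromlen);
}
#endif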
1121 
1122 #ifdef CONFIG_SOCK_PACKET
1123 static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
1124 			       int *uaddr_len, int peer)
1125 {
1126 	struct net_device *dev;
1127 	struct sock *sk	= sock->sk;
1128 
1129 	if (peer)
1130 		return -EOPNOTSUPP;
1131 
1132 	uaddr->sa_family = AF_PACKET;
1133 	dev = dev_get_by_index(pkt_sk(sk)->ifindex);
1134 	if (dev) {
1135 		strlcpy(uaddr->sa_data, dev->name, 15);
1136 		dev_put(dev);
1137 	} else
1138 		memset(uaddr->sa_data, 0, 14);
1139 	*uaddr_len = sizeof(*uaddr);
1140 
1141 	return 0;
1142 }
1143 #endif
1144 
1145 static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
1146 			  int *uaddr_len, int peer)
1147 {
1148 	struct net_device *dev;
1149 	struct sock *sk = sock->sk;
1150 	struct packet_sock *po = pkt_sk(sk);
1151 	struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr;
1152 
1153 	if (peer)
1154 		return -EOPNOTSUPP;
1155 
1156 	sll->sll_family = AF_PACKET;
1157 	sll->sll_ifindex = po->ifindex;
1158 	sll->sll_protocol = po->num;
1159 	dev = dev_get_by_index(po->ifindex);
1160 	if (dev) {
1161 		sll->sll_hatype = dev->type;
1162 		sll->sll_halen = dev->addr_len;
1163 		memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1164 		dev_put(dev);
1165 	} else {
1166 		sll->sll_hatype = 0;	/* Bad: we have no ARPHRD_UNSPEC */
1167 		sll->sll_halen = 0;
1168 	}
1169 	*uaddr_len = sizeof(*sll);
1170 
1171 	return 0;
1172 }
1173 
1174 #ifdef CONFIG_PACKET_MULTICAST
1175 static void packet_dev_mc(struct net_device *dev, struct packet_mclist *i, int what)
1176 {
1177 	switch (i->type) {
1178 	case PACKET_MR_MULTICAST:
1179 		if (what > 0)
1180 			dev_mc_add(dev, i->addr, i->alen, 0);
1181 		else
1182 			dev_mc_delete(dev, i->addr, i->alen, 0);
1183 		break;
1184 	case PACKET_MR_PROMISC:
1185 		dev_set_promiscuity(dev, what);
1186 		break;
1187 	case PACKET_MR_ALLMULTI:
1188 		dev_set_allmulti(dev, what);
1189 		break;
1190 	default:;
1191 	}
1192 }
1193 
1194 static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
1195 {
1196 	for ( ; i; i=i->next) {
1197 		if (i->ifindex == dev->ifindex)
1198 			packet_dev_mc(dev, i, what);
1199 	}
1200 }
1201 
1202 static int packet_mc_add(struct sock *sk, struct packet_mreq *mreq)
1203 {
1204 	struct packet_sock *po = pkt_sk(sk);
1205 	struct packet_mclist *ml, *i;
1206 	struct net_device *dev;
1207 	int err;
1208 
1209 	rtnl_lock();
1210 
1211 	err = -ENODEV;
1212 	dev = __dev_get_by_index(mreq->mr_ifindex);
1213 	if (!dev)
1214 		goto done;
1215 
1216 	err = -EINVAL;
1217 	if (mreq->mr_alen > dev->addr_len)
1218 		goto done;
1219 
1220 	err = -ENOBUFS;
1221 	i = kmalloc(sizeof(*i), GFP_KERNEL);
1222 	if (i == NULL)
1223 		goto done;
1224 
1225 	err = 0;
1226 	for (ml = po->mclist; ml; ml = ml->next) {
1227 		if (ml->ifindex == mreq->mr_ifindex &&
1228 		    ml->type == mreq->mr_type &&
1229 		    ml->alen == mreq->mr_alen &&
1230 		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1231 			ml->count++;
1232 			/* Free the new element ... */
1233 			kfree(i);
1234 			goto done;
1235 		}
1236 	}
1237 
1238 	i->type = mreq->mr_type;
1239 	i->ifindex = mreq->mr_ifindex;
1240 	i->alen = mreq->mr_alen;
1241 	memcpy(i->addr, mreq->mr_address, i->alen);
1242 	i->count = 1;
1243 	i->next = po->mclist;
1244 	po->mclist = i;
1245 	packet_dev_mc(dev, i, +1);
1246 
1247 done:
1248 	rtnl_unlock();
1249 	return err;
1250 }
1251 
1252 static int packet_mc_drop(struct sock *sk, struct packet_mreq *mreq)
1253 {
1254 	struct packet_mclist *ml, **mlp;
1255 
1256 	rtnl_lock();
1257 
1258 	for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
1259 		if (ml->ifindex == mreq->mr_ifindex &&
1260 		    ml->type == mreq->mr_type &&
1261 		    ml->alen == mreq->mr_alen &&
1262 		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1263 			if (--ml->count == 0) {
1264 				struct net_device *dev;
1265 				*mlp = ml->next;
1266 				dev = dev_get_by_index(ml->ifindex);
1267 				if (dev) {
1268 					packet_dev_mc(dev, ml, -1);
1269 					dev_put(dev);
1270 				}
1271 				kfree(ml);
1272 			}
1273 			rtnl_unlock();
1274 			return 0;
1275 		}
1276 	}
1277 	rtnl_unlock();
1278 	return -EADDRNOTAVAIL;
1279 }
1280 
1281 static void packet_flush_mclist(struct sock *sk)
1282 {
1283 	struct packet_sock *po = pkt_sk(sk);
1284 	struct packet_mclist *ml;
1285 
1286 	if (!po->mclist)
1287 		return;
1288 
1289 	rtnl_lock();
1290 	while ((ml = po->mclist) != NULL) {
1291 		struct net_device *dev;
1292 
1293 		po->mclist = ml->next;
1294 		if ((dev = dev_get_by_index(ml->ifindex)) != NULL) {
1295 			packet_dev_mc(dev, ml, -1);
1296 			dev_put(dev);
1297 		}
1298 		kfree(ml);
1299 	}
1300 	rtnl_unlock();
1301 }
1302 #endif
1303 
1304 static int
1305 packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen)
1306 {
1307 	struct sock *sk = sock->sk;
1308 	int ret;
1309 
1310 	if (level != SOL_PACKET)
1311 		return -ENOPROTOOPT;
1312 
1313 	switch(optname)	{
1314 #ifdef CONFIG_PACKET_MULTICAST
1315 	case PACKET_ADD_MEMBERSHIP:
1316 	case PACKET_DROP_MEMBERSHIP:
1317 	{
1318 		struct packet_mreq mreq;
1319 		if (optlen<sizeof(mreq))
1320 			return -EINVAL;
1321 		if (copy_from_user(&mreq,optval,sizeof(mreq)))
1322 			return -EFAULT;
1323 		if (optname == PACKET_ADD_MEMBERSHIP)
1324 			ret = packet_mc_add(sk, &mreq);
1325 		else
1326 			ret = packet_mc_drop(sk, &mreq);
1327 		return ret;
1328 	}
1329 #endif
1330 #ifdef CONFIG_PACKET_MMAP
1331 	case PACKET_RX_RING:
1332 	{
1333 		struct tpacket_req req;
1334 
1335 		if (optlen<sizeof(req))
1336 			return -EINVAL;
1337 		if (copy_from_user(&req,optval,sizeof(req)))
1338 			return -EFAULT;
1339 		return packet_set_ring(sk, &req, 0);
1340 	}
1341 	case PACKET_COPY_THRESH:
1342 	{
1343 		int val;
1344 
1345 		if (optlen!=sizeof(val))
1346 			return -EINVAL;
1347 		if (copy_from_user(&val,optval,sizeof(val)))
1348 			return -EFAULT;
1349 
1350 		pkt_sk(sk)->copy_thresh = val;
1351 		return 0;
1352 	}
1353 #endif
1354 	default:
1355 		return -ENOPROTOOPT;
1356 	}
1357 }
1358 
1359 static int packet_getsockopt(struct socket *sock, int level, int optname,
1360 			     char __user *optval, int __user *optlen)
1361 {
1362 	int len;
1363 	struct sock *sk = sock->sk;
1364 	struct packet_sock *po = pkt_sk(sk);
1365 
1366 	if (level != SOL_PACKET)
1367 		return -ENOPROTOOPT;
1368 
1369   	if (get_user(len,optlen))
1370   		return -EFAULT;
1371 
1372 	if (len < 0)
1373 		return -EINVAL;
1374 
1375 	switch(optname)	{
1376 	case PACKET_STATISTICS:
1377 	{
1378 		struct tpacket_stats st;
1379 
1380 		if (len > sizeof(struct tpacket_stats))
1381 			len = sizeof(struct tpacket_stats);
1382 		spin_lock_bh(&sk->sk_receive_queue.lock);
1383 		st = po->stats;
1384 		memset(&po->stats, 0, sizeof(st));
1385 		spin_unlock_bh(&sk->sk_receive_queue.lock);
1386 		st.tp_packets += st.tp_drops;
1387 
1388 		if (copy_to_user(optval, &st, len))
1389 			return -EFAULT;
1390 		break;
1391 	}
1392 	default:
1393 		return -ENOPROTOOPT;
1394 	}
1395 
1396   	if (put_user(len, optlen))
1397   		return -EFAULT;
1398   	return 0;
1399 }
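
/*
 *	PACKET_STATISTICS is read back with getsockopt().  As coded above,
 *	tp_packets already includes tp_drops, and both counters are reset
 *	on every read.  A minimal illustrative user space sketch:
 */
#if 0
#include <stdio.h>
#include <sys/socket.h>
#include <linux/if_packet.h>	/* struct tpacket_stats, PACKET_STATISTICS */

static void print_packet_stats(int fd)
{
	struct tpacket_stats st;
	socklen_t len = sizeof(st);

	if (getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &st, &len) == 0)
		printf("received %u, dropped %u\n", st.tp_packets, st.tp_drops);
}
#endif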
1400 
1401 
1402 static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
1403 {
1404 	struct sock *sk;
1405 	struct hlist_node *node;
1406 	struct net_device *dev = (struct net_device*)data;
1407 
1408 	read_lock(&packet_sklist_lock);
1409 	sk_for_each(sk, node, &packet_sklist) {
1410 		struct packet_sock *po = pkt_sk(sk);
1411 
1412 		switch (msg) {
1413 		case NETDEV_UNREGISTER:
1414 #ifdef CONFIG_PACKET_MULTICAST
1415 			if (po->mclist)
1416 				packet_dev_mclist(dev, po->mclist, -1);
1417 			/* fallthrough */
1418 #endif
1419 		case NETDEV_DOWN:
1420 			if (dev->ifindex == po->ifindex) {
1421 				spin_lock(&po->bind_lock);
1422 				if (po->running) {
1423 					__dev_remove_pack(&po->prot_hook);
1424 					__sock_put(sk);
1425 					po->running = 0;
1426 					sk->sk_err = ENETDOWN;
1427 					if (!sock_flag(sk, SOCK_DEAD))
1428 						sk->sk_error_report(sk);
1429 				}
1430 				if (msg == NETDEV_UNREGISTER) {
1431 					po->ifindex = -1;
1432 					po->prot_hook.dev = NULL;
1433 				}
1434 				spin_unlock(&po->bind_lock);
1435 			}
1436 			break;
1437 		case NETDEV_UP:
1438 			spin_lock(&po->bind_lock);
1439 			if (dev->ifindex == po->ifindex && po->num &&
1440 			    !po->running) {
1441 				dev_add_pack(&po->prot_hook);
1442 				sock_hold(sk);
1443 				po->running = 1;
1444 			}
1445 			spin_unlock(&po->bind_lock);
1446 			break;
1447 		}
1448 	}
1449 	read_unlock(&packet_sklist_lock);
1450 	return NOTIFY_DONE;
1451 }
1452 
1453 
1454 static int packet_ioctl(struct socket *sock, unsigned int cmd,
1455 			unsigned long arg)
1456 {
1457 	struct sock *sk = sock->sk;
1458 
1459 	switch(cmd) {
1460 		case SIOCOUTQ:
1461 		{
1462 			int amount = atomic_read(&sk->sk_wmem_alloc);
1463 			return put_user(amount, (int __user *)arg);
1464 		}
1465 		case SIOCINQ:
1466 		{
1467 			struct sk_buff *skb;
1468 			int amount = 0;
1469 
1470 			spin_lock_bh(&sk->sk_receive_queue.lock);
1471 			skb = skb_peek(&sk->sk_receive_queue);
1472 			if (skb)
1473 				amount = skb->len;
1474 			spin_unlock_bh(&sk->sk_receive_queue.lock);
1475 			return put_user(amount, (int __user *)arg);
1476 		}
1477 		case SIOCGSTAMP:
1478 			return sock_get_timestamp(sk, (struct timeval __user *)arg);
1479 
1480 #ifdef CONFIG_INET
1481 		case SIOCADDRT:
1482 		case SIOCDELRT:
1483 		case SIOCDARP:
1484 		case SIOCGARP:
1485 		case SIOCSARP:
1486 		case SIOCGIFADDR:
1487 		case SIOCSIFADDR:
1488 		case SIOCGIFBRDADDR:
1489 		case SIOCSIFBRDADDR:
1490 		case SIOCGIFNETMASK:
1491 		case SIOCSIFNETMASK:
1492 		case SIOCGIFDSTADDR:
1493 		case SIOCSIFDSTADDR:
1494 		case SIOCSIFFLAGS:
1495 			return inet_dgram_ops.ioctl(sock, cmd, arg);
1496 #endif
1497 
1498 		default:
1499 			return dev_ioctl(cmd, (void __user *)arg);
1500 	}
1501 	return 0;
1502 }
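
/*
 *	The SIOCGSTAMP branch above reports the receive timestamp of the
 *	last packet handed to this socket.  A minimal illustrative user
 *	space sketch:
 */
#if 0
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/time.h>
#include <linux/sockios.h>	/* SIOCGSTAMP */

static void print_last_packet_time(int fd)
{
	struct timeval tv;

	if (ioctl(fd, SIOCGSTAMP, &tv) == 0)
		printf("%ld.%06ld\n", (long)tv.tv_sec, (long)tv.tv_usec);
}
#endif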
1503 
1504 #ifndef CONFIG_PACKET_MMAP
1505 #define packet_mmap sock_no_mmap
1506 #define packet_poll datagram_poll
1507 #else
1508 
1509 static unsigned int packet_poll(struct file * file, struct socket *sock,
1510 				poll_table *wait)
1511 {
1512 	struct sock *sk = sock->sk;
1513 	struct packet_sock *po = pkt_sk(sk);
1514 	unsigned int mask = datagram_poll(file, sock, wait);
1515 
1516 	spin_lock_bh(&sk->sk_receive_queue.lock);
1517 	if (po->pg_vec) {
1518 		unsigned last = po->head ? po->head-1 : po->frame_max;
1519 		struct tpacket_hdr *h;
1520 
1521 		h = (struct tpacket_hdr *)packet_lookup_frame(po, last);
1522 
1523 		if (h->tp_status)
1524 			mask |= POLLIN | POLLRDNORM;
1525 	}
1526 	spin_unlock_bh(&sk->sk_receive_queue.lock);
1527 	return mask;
1528 }
1529 
1530 
1531 /* Dirty? Well, I still have not learned a better way to account
1532  * for user mmaps.
1533  */
1534 
1535 static void packet_mm_open(struct vm_area_struct *vma)
1536 {
1537 	struct file *file = vma->vm_file;
1538 	struct socket * sock = file->private_data;
1539 	struct sock *sk = sock->sk;
1540 
1541 	if (sk)
1542 		atomic_inc(&pkt_sk(sk)->mapped);
1543 }
1544 
1545 static void packet_mm_close(struct vm_area_struct *vma)
1546 {
1547 	struct file *file = vma->vm_file;
1548 	struct socket * sock = file->private_data;
1549 	struct sock *sk = sock->sk;
1550 
1551 	if (sk)
1552 		atomic_dec(&pkt_sk(sk)->mapped);
1553 }
1554 
1555 static struct vm_operations_struct packet_mmap_ops = {
1556 	.open =	packet_mm_open,
1557 	.close =	packet_mm_close,
1558 };
1559 
1560 static inline struct page *pg_vec_endpage(char *one_pg_vec, unsigned int order)
1561 {
1562 	return virt_to_page(one_pg_vec + (PAGE_SIZE << order) - 1);
1563 }
1564 
1565 static void free_pg_vec(char **pg_vec, unsigned order, unsigned len)
1566 {
1567 	int i;
1568 
1569 	for (i=0; i<len; i++) {
1570 		if (pg_vec[i]) {
1571 			struct page *page, *pend;
1572 
1573 			pend = pg_vec_endpage(pg_vec[i], order);
1574 			for (page = virt_to_page(pg_vec[i]); page <= pend; page++)
1575 				ClearPageReserved(page);
1576 			free_pages((unsigned long)pg_vec[i], order);
1577 		}
1578 	}
1579 	kfree(pg_vec);
1580 }
1581 
1582 
1583 static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing)
1584 {
1585 	char **pg_vec = NULL;
1586 	struct packet_sock *po = pkt_sk(sk);
1587 	int was_running, num, order = 0;
1588 	int err = 0;
1589 
1590 	if (req->tp_block_nr) {
1591 		int i, l;
1592 
1593 		/* Sanity tests and some calculations */
1594 
1595 		if (po->pg_vec)
1596 			return -EBUSY;
1597 
1598 		if ((int)req->tp_block_size <= 0)
1599 			return -EINVAL;
1600 		if (req->tp_block_size&(PAGE_SIZE-1))
1601 			return -EINVAL;
1602 		if (req->tp_frame_size < TPACKET_HDRLEN)
1603 			return -EINVAL;
1604 		if (req->tp_frame_size&(TPACKET_ALIGNMENT-1))
1605 			return -EINVAL;
1606 
1607 		po->frames_per_block = req->tp_block_size/req->tp_frame_size;
1608 		if (po->frames_per_block <= 0)
1609 			return -EINVAL;
1610 		if (po->frames_per_block*req->tp_block_nr != req->tp_frame_nr)
1611 			return -EINVAL;
1612 		/* OK! */
1613 
1614 		/* Allocate page vector */
1615 		while ((PAGE_SIZE<<order) < req->tp_block_size)
1616 			order++;
1617 
1618 		err = -ENOMEM;
1619 
1620 		pg_vec = kmalloc(req->tp_block_nr*sizeof(char *), GFP_KERNEL);
1621 		if (pg_vec == NULL)
1622 			goto out;
1623 		memset(pg_vec, 0, req->tp_block_nr*sizeof(char **));
1624 
1625 		for (i=0; i<req->tp_block_nr; i++) {
1626 			struct page *page, *pend;
1627 			pg_vec[i] = (char *)__get_free_pages(GFP_KERNEL, order);
1628 			if (!pg_vec[i])
1629 				goto out_free_pgvec;
1630 
1631 			pend = pg_vec_endpage(pg_vec[i], order);
1632 			for (page = virt_to_page(pg_vec[i]); page <= pend; page++)
1633 				SetPageReserved(page);
1634 		}
1635 		/* Page vector is allocated */
1636 
1637 		l = 0;
1638 		for (i=0; i<req->tp_block_nr; i++) {
1639 			char *ptr = pg_vec[i];
1640 			struct tpacket_hdr *header;
1641 			int k;
1642 
1643 			for (k=0; k<po->frames_per_block; k++) {
1644 
1645 				header = (struct tpacket_hdr*)ptr;
1646 				header->tp_status = TP_STATUS_KERNEL;
1647 				ptr += req->tp_frame_size;
1648 			}
1649 		}
1650 		/* Done */
1651 	} else {
1652 		if (req->tp_frame_nr)
1653 			return -EINVAL;
1654 	}
1655 
1656 	lock_sock(sk);
1657 
1658 	/* Detach socket from network */
1659 	spin_lock(&po->bind_lock);
1660 	was_running = po->running;
1661 	num = po->num;
1662 	if (was_running) {
1663 		__dev_remove_pack(&po->prot_hook);
1664 		po->num = 0;
1665 		po->running = 0;
1666 		__sock_put(sk);
1667 	}
1668 	spin_unlock(&po->bind_lock);
1669 
1670 	synchronize_net();
1671 
1672 	err = -EBUSY;
1673 	if (closing || atomic_read(&po->mapped) == 0) {
1674 		err = 0;
1675 #define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })
1676 
1677 		spin_lock_bh(&sk->sk_receive_queue.lock);
1678 		pg_vec = XC(po->pg_vec, pg_vec);
1679 		po->frame_max = req->tp_frame_nr-1;
1680 		po->head = 0;
1681 		po->frame_size = req->tp_frame_size;
1682 		spin_unlock_bh(&sk->sk_receive_queue.lock);
1683 
1684 		order = XC(po->pg_vec_order, order);
1685 		req->tp_block_nr = XC(po->pg_vec_len, req->tp_block_nr);
1686 
1687 		po->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
1688 		po->prot_hook.func = po->pg_vec ? tpacket_rcv : packet_rcv;
1689 		skb_queue_purge(&sk->sk_receive_queue);
1690 #undef XC
1691 		if (atomic_read(&po->mapped))
1692 			printk(KERN_DEBUG "packet_mmap: vma is busy: %d\n", atomic_read(&po->mapped));
1693 	}
1694 
1695 	spin_lock(&po->bind_lock);
1696 	if (was_running && !po->running) {
1697 		sock_hold(sk);
1698 		po->running = 1;
1699 		po->num = num;
1700 		dev_add_pack(&po->prot_hook);
1701 	}
1702 	spin_unlock(&po->bind_lock);
1703 
1704 	release_sock(sk);
1705 
1706 out_free_pgvec:
1707 	if (pg_vec)
1708 		free_pg_vec(pg_vec, order, req->tp_block_nr);
1709 out:
1710 	return err;
1711 }
1712 
1713 static int packet_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
1714 {
1715 	struct sock *sk = sock->sk;
1716 	struct packet_sock *po = pkt_sk(sk);
1717 	unsigned long size;
1718 	unsigned long start;
1719 	int err = -EINVAL;
1720 	int i;
1721 
1722 	if (vma->vm_pgoff)
1723 		return -EINVAL;
1724 
1725 	size = vma->vm_end - vma->vm_start;
1726 
1727 	lock_sock(sk);
1728 	if (po->pg_vec == NULL)
1729 		goto out;
1730 	if (size != po->pg_vec_len*po->pg_vec_pages*PAGE_SIZE)
1731 		goto out;
1732 
1733 	atomic_inc(&po->mapped);
1734 	start = vma->vm_start;
1735 	err = -EAGAIN;
1736 	for (i=0; i<po->pg_vec_len; i++) {
1737 		if (remap_pfn_range(vma, start,
1738 				     __pa(po->pg_vec[i]) >> PAGE_SHIFT,
1739 				     po->pg_vec_pages*PAGE_SIZE,
1740 				     vma->vm_page_prot))
1741 			goto out;
1742 		start += po->pg_vec_pages*PAGE_SIZE;
1743 	}
1744 	vma->vm_ops = &packet_mmap_ops;
1745 	err = 0;
1746 
1747 out:
1748 	release_sock(sk);
1749 	return err;
1750 }
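
/*
 *	Putting packet_set_ring(), packet_mmap() and packet_poll() together,
 *	user space drives the rx ring roughly as in the sketch below:
 *	request the ring with PACKET_RX_RING, mmap() it, then walk the
 *	frames and hand each one back by resetting tp_status.  The sizes are
 *	example values only (they must pass the sanity checks in
 *	packet_set_ring() above); with block_size an exact multiple of
 *	frame_size the mapping can be addressed linearly as done here.
 *	Error handling is omitted.
 */
#if 0
#include <string.h>
#include <stdio.h>
#include <poll.h>
#include <sys/socket.h>
#include <sys/mman.h>
#include <netinet/in.h>
#include <linux/if_ether.h>	/* ETH_P_ALL */
#include <linux/if_packet.h>	/* struct tpacket_req, struct tpacket_hdr */

static void rx_ring_loop(void)
{
	struct tpacket_req req;
	unsigned int i = 0;
	char *ring;
	int fd = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL));

	memset(&req, 0, sizeof(req));
	req.tp_block_size = 4096;	/* multiple of PAGE_SIZE */
	req.tp_block_nr   = 64;
	req.tp_frame_size = 2048;	/* multiple of TPACKET_ALIGNMENT */
	req.tp_frame_nr   = req.tp_block_nr *
			    (req.tp_block_size / req.tp_frame_size);

	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
	ring = mmap(NULL, (size_t)req.tp_block_size * req.tp_block_nr,
		    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

	for (;;) {
		struct tpacket_hdr *h = (struct tpacket_hdr *)
			(ring + (size_t)i * req.tp_frame_size);

		if (!(h->tp_status & TP_STATUS_USER)) {
			/* nothing ready in this frame yet, wait for data */
			struct pollfd pfd = { .fd = fd, .events = POLLIN };
			poll(&pfd, 1, -1);
			continue;
		}

		/* frame data starts tp_mac bytes into the frame */
		printf("frame %u: %u bytes captured of %u\n",
		       i, h->tp_snaplen, h->tp_len);

		h->tp_status = TP_STATUS_KERNEL;	/* give it back */
		i = (i + 1) % req.tp_frame_nr;
	}
}
#endif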
1751 #endif
1752 
1753 
1754 #ifdef CONFIG_SOCK_PACKET
1755 static struct proto_ops packet_ops_spkt = {
1756 	.family =	PF_PACKET,
1757 	.owner =	THIS_MODULE,
1758 	.release =	packet_release,
1759 	.bind =		packet_bind_spkt,
1760 	.connect =	sock_no_connect,
1761 	.socketpair =	sock_no_socketpair,
1762 	.accept =	sock_no_accept,
1763 	.getname =	packet_getname_spkt,
1764 	.poll =		datagram_poll,
1765 	.ioctl =	packet_ioctl,
1766 	.listen =	sock_no_listen,
1767 	.shutdown =	sock_no_shutdown,
1768 	.setsockopt =	sock_no_setsockopt,
1769 	.getsockopt =	sock_no_getsockopt,
1770 	.sendmsg =	packet_sendmsg_spkt,
1771 	.recvmsg =	packet_recvmsg,
1772 	.mmap =		sock_no_mmap,
1773 	.sendpage =	sock_no_sendpage,
1774 };
1775 #endif
1776 
1777 static struct proto_ops packet_ops = {
1778 	.family =	PF_PACKET,
1779 	.owner =	THIS_MODULE,
1780 	.release =	packet_release,
1781 	.bind =		packet_bind,
1782 	.connect =	sock_no_connect,
1783 	.socketpair =	sock_no_socketpair,
1784 	.accept =	sock_no_accept,
1785 	.getname =	packet_getname,
1786 	.poll =		packet_poll,
1787 	.ioctl =	packet_ioctl,
1788 	.listen =	sock_no_listen,
1789 	.shutdown =	sock_no_shutdown,
1790 	.setsockopt =	packet_setsockopt,
1791 	.getsockopt =	packet_getsockopt,
1792 	.sendmsg =	packet_sendmsg,
1793 	.recvmsg =	packet_recvmsg,
1794 	.mmap =		packet_mmap,
1795 	.sendpage =	sock_no_sendpage,
1796 };
1797 
1798 static struct net_proto_family packet_family_ops = {
1799 	.family =	PF_PACKET,
1800 	.create =	packet_create,
1801 	.owner	=	THIS_MODULE,
1802 };
1803 
1804 static struct notifier_block packet_netdev_notifier = {
1805 	.notifier_call =packet_notifier,
1806 };
1807 
1808 #ifdef CONFIG_PROC_FS
1809 static inline struct sock *packet_seq_idx(loff_t off)
1810 {
1811 	struct sock *s;
1812 	struct hlist_node *node;
1813 
1814 	sk_for_each(s, node, &packet_sklist) {
1815 		if (!off--)
1816 			return s;
1817 	}
1818 	return NULL;
1819 }
1820 
1821 static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
1822 {
1823 	read_lock(&packet_sklist_lock);
1824 	return *pos ? packet_seq_idx(*pos - 1) : SEQ_START_TOKEN;
1825 }
1826 
1827 static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1828 {
1829 	++*pos;
1830 	return  (v == SEQ_START_TOKEN)
1831 		? sk_head(&packet_sklist)
1832 		: sk_next((struct sock*)v) ;
1833 }
1834 
1835 static void packet_seq_stop(struct seq_file *seq, void *v)
1836 {
1837 	read_unlock(&packet_sklist_lock);
1838 }
1839 
1840 static int packet_seq_show(struct seq_file *seq, void *v)
1841 {
1842 	if (v == SEQ_START_TOKEN)
1843 		seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
1844 	else {
1845 		struct sock *s = v;
1846 		const struct packet_sock *po = pkt_sk(s);
1847 
1848 		seq_printf(seq,
1849 			   "%p %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
1850 			   s,
1851 			   atomic_read(&s->sk_refcnt),
1852 			   s->sk_type,
1853 			   ntohs(po->num),
1854 			   po->ifindex,
1855 			   po->running,
1856 			   atomic_read(&s->sk_rmem_alloc),
1857 			   sock_i_uid(s),
1858 			   sock_i_ino(s) );
1859 	}
1860 
1861 	return 0;
1862 }
1863 
1864 static struct seq_operations packet_seq_ops = {
1865 	.start	= packet_seq_start,
1866 	.next	= packet_seq_next,
1867 	.stop	= packet_seq_stop,
1868 	.show	= packet_seq_show,
1869 };
1870 
1871 static int packet_seq_open(struct inode *inode, struct file *file)
1872 {
1873 	return seq_open(file, &packet_seq_ops);
1874 }
1875 
1876 static struct file_operations packet_seq_fops = {
1877 	.owner		= THIS_MODULE,
1878 	.open		= packet_seq_open,
1879 	.read		= seq_read,
1880 	.llseek		= seq_lseek,
1881 	.release	= seq_release,
1882 };
1883 
1884 #endif
1885 
1886 static void __exit packet_exit(void)
1887 {
1888 	proc_net_remove("packet");
1889 	unregister_netdevice_notifier(&packet_netdev_notifier);
1890 	sock_unregister(PF_PACKET);
1891 	proto_unregister(&packet_proto);
1892 }
1893 
1894 static int __init packet_init(void)
1895 {
1896 	int rc = proto_register(&packet_proto, 0);
1897 
1898 	if (rc != 0)
1899 		goto out;
1900 
1901 	sock_register(&packet_family_ops);
1902 	register_netdevice_notifier(&packet_netdev_notifier);
1903 	proc_net_fops_create("packet", 0, &packet_seq_fops);
1904 out:
1905 	return rc;
1906 }
1907 
1908 module_init(packet_init);
1909 module_exit(packet_exit);
1910 MODULE_LICENSE("GPL");
1911 MODULE_ALIAS_NETPROTO(PF_PACKET);
1912