xref: /linux/net/packet/af_packet.c (revision 5bdef865eb358b6f3760e25e591ae115e9eeddef)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		PACKET - implements raw packet sockets.
7  *
8  * Authors:	Ross Biro
9  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *
12  * Fixes:
13  *		Alan Cox	:	verify_area() now used correctly
14  *		Alan Cox	:	new skbuff lists, look ma no backlogs!
15  *		Alan Cox	:	tidied skbuff lists.
16  *		Alan Cox	:	Now uses generic datagram routines I
17  *					added. Also fixed the peek/read crash
18  *					from all old Linux datagram code.
19  *		Alan Cox	:	Uses the improved datagram code.
20  *		Alan Cox	:	Added NULL's for socket options.
21  *		Alan Cox	:	Re-commented the code.
22  *		Alan Cox	:	Use new kernel side addressing
23  *		Rob Janssen	:	Correct MTU usage.
24  *		Dave Platt	:	Counter leaks caused by incorrect
25  *					interrupt locking and some slightly
26  *					dubious gcc output. Can you read
27  *					compiler: it said _VOLATILE_
28  *	Richard Kooijman	:	Timestamp fixes.
29  *		Alan Cox	:	New buffers. Use sk->mac.raw.
30  *		Alan Cox	:	sendmsg/recvmsg support.
31  *		Alan Cox	:	Protocol setting support
32  *	Alexey Kuznetsov	:	Untied from IPv4 stack.
33  *	Cyrus Durgin		:	Fixed kerneld for kmod.
34  *	Michal Ostrowski        :       Module initialization cleanup.
35  *         Ulises Alonso        :       Frame number limit removal and
36  *                                      packet_set_ring memory leak.
37  *		Eric Biederman	:	Allow for > 8 byte hardware addresses.
38  *					The convention is that longer addresses
39  *					will simply extend the hardware address
40  *					byte arrays at the end of sockaddr_ll
41  *					and packet_mreq.
42  *		Johann Baudy	:	Added TX RING.
43  *
44  *		This program is free software; you can redistribute it and/or
45  *		modify it under the terms of the GNU General Public License
46  *		as published by the Free Software Foundation; either version
47  *		2 of the License, or (at your option) any later version.
48  *
49  */
50 
51 #include <linux/types.h>
52 #include <linux/mm.h>
53 #include <linux/capability.h>
54 #include <linux/fcntl.h>
55 #include <linux/socket.h>
56 #include <linux/in.h>
57 #include <linux/inet.h>
58 #include <linux/netdevice.h>
59 #include <linux/if_packet.h>
60 #include <linux/wireless.h>
61 #include <linux/kernel.h>
62 #include <linux/kmod.h>
63 #include <net/net_namespace.h>
64 #include <net/ip.h>
65 #include <net/protocol.h>
66 #include <linux/skbuff.h>
67 #include <net/sock.h>
68 #include <linux/errno.h>
69 #include <linux/timer.h>
70 #include <asm/system.h>
71 #include <asm/uaccess.h>
72 #include <asm/ioctls.h>
73 #include <asm/page.h>
74 #include <asm/cacheflush.h>
75 #include <asm/io.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/poll.h>
79 #include <linux/module.h>
80 #include <linux/init.h>
81 #include <linux/mutex.h>
82 
83 #ifdef CONFIG_INET
84 #include <net/inet_common.h>
85 #endif
86 
87 /*
88    Assumptions:
89    - if a device has no dev->hard_header routine, it adds and removes the ll
90      header itself. In this case the ll header is invisible outside the device,
91      but higher levels should still reserve dev->hard_header_len.
92      Some devices are clever enough to reallocate the skb when the header
93      does not fit into the reserved space (tunnels); others are not so
94      clever (PPP).
95    - a packet socket receives packets with the ll header already pulled,
96      so SOCK_RAW has to push it back.
97 
98 On receive:
99 -----------
100 
101 Incoming, dev->hard_header!=NULL
102    mac_header -> ll header
103    data       -> data
104 
105 Outgoing, dev->hard_header!=NULL
106    mac_header -> ll header
107    data       -> ll header
108 
109 Incoming, dev->hard_header==NULL
110    mac_header -> UNKNOWN position. It very likely points to the ll
111 		 header.  PPP does this, which is wrong, because it introduces
112 		 asymmetry between the rx and tx paths.
113    data       -> data
114 
115 Outgoing, dev->hard_header==NULL
116    mac_header -> data. ll header is still not built!
117    data       -> data
118 
119 Summary
120   If dev->hard_header==NULL we are unlikely to restore a sensible ll header.
121 
122 
123 On transmit:
124 ------------
125 
126 dev->hard_header != NULL
127    mac_header -> ll header
128    data       -> ll header
129 
130 dev->hard_header == NULL (ll header is added by device, we cannot control it)
131    mac_header -> data
132    data       -> data
133 
134    We should set nh.raw on output to the correct position;
135    the packet classifier depends on it.
136  */
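
/*
 * Illustrative userspace sketch of the receive rules above (not part of
 * this file; the interface name "eth0" is an assumption): with SOCK_RAW
 * the ll header is part of the data handed to the application, with
 * SOCK_DGRAM it is stripped and only described by the sockaddr_ll.
 *
 *	int fd = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *	struct sockaddr_ll sll = {
 *		.sll_family   = AF_PACKET,
 *		.sll_protocol = htons(ETH_P_ALL),
 *		.sll_ifindex  = if_nametoindex("eth0"),
 *	};
 *	unsigned char buf[2048];
 *
 *	bind(fd, (struct sockaddr *)&sll, sizeof(sll));
 *	recv(fd, buf, sizeof(buf), 0);	   buf starts with the Ethernet header
 */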
137 
138 /* Private packet socket structures. */
139 
140 struct packet_mclist
141 {
142 	struct packet_mclist	*next;
143 	int			ifindex;
144 	int			count;
145 	unsigned short		type;
146 	unsigned short		alen;
147 	unsigned char		addr[MAX_ADDR_LEN];
148 };
149 /* identical to struct packet_mreq except it has
150  * a longer address field.
151  */
152 struct packet_mreq_max
153 {
154 	int		mr_ifindex;
155 	unsigned short	mr_type;
156 	unsigned short	mr_alen;
157 	unsigned char	mr_address[MAX_ADDR_LEN];
158 };
159 
160 #ifdef CONFIG_PACKET_MMAP
161 static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
162 		int closing, int tx_ring);
163 
164 struct packet_ring_buffer {
165 	char *			*pg_vec;
166 	unsigned int		head;
167 	unsigned int		frames_per_block;
168 	unsigned int		frame_size;
169 	unsigned int		frame_max;
170 
171 	unsigned int		pg_vec_order;
172 	unsigned int		pg_vec_pages;
173 	unsigned int		pg_vec_len;
174 
175 	atomic_t		pending;
176 };
177 
178 struct packet_sock;
179 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
180 #endif
181 
182 static void packet_flush_mclist(struct sock *sk);
183 
184 struct packet_sock {
185 	/* struct sock has to be the first member of packet_sock */
186 	struct sock		sk;
187 	struct tpacket_stats	stats;
188 #ifdef CONFIG_PACKET_MMAP
189 	struct packet_ring_buffer	rx_ring;
190 	struct packet_ring_buffer	tx_ring;
191 	int			copy_thresh;
192 #endif
193 	struct packet_type	prot_hook;
194 	spinlock_t		bind_lock;
195 	struct mutex		pg_vec_lock;
196 	unsigned int		running:1,	/* prot_hook is attached*/
197 				auxdata:1,
198 				origdev:1;
199 	int			ifindex;	/* bound device		*/
200 	__be16			num;
201 	struct packet_mclist	*mclist;
202 #ifdef CONFIG_PACKET_MMAP
203 	atomic_t		mapped;
204 	enum tpacket_versions	tp_version;
205 	unsigned int		tp_hdrlen;
206 	unsigned int		tp_reserve;
207 	unsigned int		tp_loss:1;
208 #endif
209 };
210 
211 struct packet_skb_cb {
212 	unsigned int origlen;
213 	union {
214 		struct sockaddr_pkt pkt;
215 		struct sockaddr_ll ll;
216 	} sa;
217 };
218 
219 #define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))
220 
221 #ifdef CONFIG_PACKET_MMAP
222 
223 static void __packet_set_status(struct packet_sock *po, void *frame, int status)
224 {
225 	union {
226 		struct tpacket_hdr *h1;
227 		struct tpacket2_hdr *h2;
228 		void *raw;
229 	} h;
230 
231 	h.raw = frame;
232 	switch (po->tp_version) {
233 	case TPACKET_V1:
234 		h.h1->tp_status = status;
235 		flush_dcache_page(virt_to_page(&h.h1->tp_status));
236 		break;
237 	case TPACKET_V2:
238 		h.h2->tp_status = status;
239 		flush_dcache_page(virt_to_page(&h.h2->tp_status));
240 		break;
241 	default:
242 		printk(KERN_ERR "TPACKET version not supported\n");
243 		BUG();
244 	}
245 
246 	smp_wmb();
247 }
248 
249 static int __packet_get_status(struct packet_sock *po, void *frame)
250 {
251 	union {
252 		struct tpacket_hdr *h1;
253 		struct tpacket2_hdr *h2;
254 		void *raw;
255 	} h;
256 
257 	smp_rmb();
258 
259 	h.raw = frame;
260 	switch (po->tp_version) {
261 	case TPACKET_V1:
262 		flush_dcache_page(virt_to_page(&h.h1->tp_status));
263 		return h.h1->tp_status;
264 	case TPACKET_V2:
265 		flush_dcache_page(virt_to_page(&h.h2->tp_status));
266 		return h.h2->tp_status;
267 	default:
268 		printk(KERN_ERR "TPACKET version not supported\n");
269 		BUG();
270 		return 0;
271 	}
272 }
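
/*
 * Userspace side of the status handshake maintained by the two helpers
 * above, as an illustrative sketch (not part of this file; "ring", "req",
 * "i" and "pfd" are assumed to come from a prior PACKET_RX_RING
 * setsockopt(), mmap() and poll() setup, and tp_block_size is assumed to
 * be a multiple of tp_frame_size so frames can be indexed linearly):
 *
 *	struct tpacket_hdr *hdr = (void *)((char *)ring
 *					   + i * req.tp_frame_size);
 *
 *	while (!(hdr->tp_status & TP_STATUS_USER))
 *		poll(&pfd, 1, -1);		wait for the kernel to fill it
 *	... read the frame ...
 *	hdr->tp_status = TP_STATUS_KERNEL;	hand the slot back
 *
 * The TP_STATUS_* word is the only synchronization between producer and
 * consumer, which is why the helpers above pair it with memory barriers.
 */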
273 
274 static void *packet_lookup_frame(struct packet_sock *po,
275 		struct packet_ring_buffer *rb,
276 		unsigned int position,
277 		int status)
278 {
279 	unsigned int pg_vec_pos, frame_offset;
280 	union {
281 		struct tpacket_hdr *h1;
282 		struct tpacket2_hdr *h2;
283 		void *raw;
284 	} h;
285 
286 	pg_vec_pos = position / rb->frames_per_block;
287 	frame_offset = position % rb->frames_per_block;
288 
289 	h.raw = rb->pg_vec[pg_vec_pos] + (frame_offset * rb->frame_size);
290 
291 	if (status != __packet_get_status(po, h.raw))
292 		return NULL;
293 
294 	return h.raw;
295 }
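
/*
 * Worked example of the index arithmetic above: with tp_block_size 4096
 * and tp_frame_size 2048, frames_per_block is 2, so frame number 5 lives
 * in pg_vec[5 / 2] == pg_vec[2], at byte offset (5 % 2) * 2048 == 2048
 * inside that block.
 */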
296 
297 static inline void *packet_current_frame(struct packet_sock *po,
298 		struct packet_ring_buffer *rb,
299 		int status)
300 {
301 	return packet_lookup_frame(po, rb, rb->head, status);
302 }
303 
304 static inline void *packet_previous_frame(struct packet_sock *po,
305 		struct packet_ring_buffer *rb,
306 		int status)
307 {
308 	unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
309 	return packet_lookup_frame(po, rb, previous, status);
310 }
311 
312 static inline void packet_increment_head(struct packet_ring_buffer *buff)
313 {
314 	buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
315 }
316 
317 #endif
318 
319 static inline struct packet_sock *pkt_sk(struct sock *sk)
320 {
321 	return (struct packet_sock *)sk;
322 }
323 
324 static void packet_sock_destruct(struct sock *sk)
325 {
326 	WARN_ON(atomic_read(&sk->sk_rmem_alloc));
327 	WARN_ON(atomic_read(&sk->sk_wmem_alloc));
328 
329 	if (!sock_flag(sk, SOCK_DEAD)) {
330 		printk(KERN_ERR "Attempt to release alive packet socket: %p\n", sk);
331 		return;
332 	}
333 
334 	sk_refcnt_debug_dec(sk);
335 }
336 
337 
338 static const struct proto_ops packet_ops;
339 
340 static const struct proto_ops packet_ops_spkt;
341 
342 static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,  struct packet_type *pt, struct net_device *orig_dev)
343 {
344 	struct sock *sk;
345 	struct sockaddr_pkt *spkt;
346 
347 	/*
348 	 *	When we registered the protocol we saved the socket in the data
349 	 *	field for just this event.
350 	 */
351 
352 	sk = pt->af_packet_priv;
353 
354 	/*
355 	 *	Yank back the headers [hope the device set this
356 	 *	right or kerboom...]
357 	 *
358 	 *	Incoming packets have ll header pulled,
359 	 *	push it back.
360 	 *
361 	 *	For outgoing packets skb->data == skb_mac_header(skb),
362 	 *	so this procedure is a no-op.
363 	 */
364 
365 	if (skb->pkt_type == PACKET_LOOPBACK)
366 		goto out;
367 
368 	if (dev_net(dev) != sock_net(sk))
369 		goto out;
370 
371 	if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
372 		goto oom;
373 
374 	/* drop any routing info */
375 	skb_dst_drop(skb);
376 
377 	/* drop conntrack reference */
378 	nf_reset(skb);
379 
380 	spkt = &PACKET_SKB_CB(skb)->sa.pkt;
381 
382 	skb_push(skb, skb->data - skb_mac_header(skb));
383 
384 	/*
385 	 *	The SOCK_PACKET socket receives _all_ frames.
386 	 */
387 
388 	spkt->spkt_family = dev->type;
389 	strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
390 	spkt->spkt_protocol = skb->protocol;
391 
392 	/*
393 	 *	Charge the memory to the socket. This is done specifically
394 	 *	to prevent sockets from using up all the memory.
395 	 */
396 
397 	if (sock_queue_rcv_skb(sk,skb) == 0)
398 		return 0;
399 
400 out:
401 	kfree_skb(skb);
402 oom:
403 	return 0;
404 }
405 
406 
407 /*
408  *	Output a raw packet to the device layer. This bypasses all the other
409  *	protocol layers and you must therefore supply a complete frame.
410  */
411 
412 static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
413 			       struct msghdr *msg, size_t len)
414 {
415 	struct sock *sk = sock->sk;
416 	struct sockaddr_pkt *saddr=(struct sockaddr_pkt *)msg->msg_name;
417 	struct sk_buff *skb;
418 	struct net_device *dev;
419 	__be16 proto=0;
420 	int err;
421 
422 	/*
423 	 *	Get and verify the address.
424 	 */
425 
426 	if (saddr) {
427 		if (msg->msg_namelen < sizeof(struct sockaddr))
428 			return -EINVAL;
429 		if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
430 			proto = saddr->spkt_protocol;
431 	} else
432 		return -ENOTCONN;	/* SOCK_PACKET must be sent giving an address */
435 
436 	/*
437 	 *	Find the device first to size check it
438 	 */
439 
440 	saddr->spkt_device[13] = 0;
441 	dev = dev_get_by_name(sock_net(sk), saddr->spkt_device);
442 	err = -ENODEV;
443 	if (dev == NULL)
444 		goto out_unlock;
445 
446 	err = -ENETDOWN;
447 	if (!(dev->flags & IFF_UP))
448 		goto out_unlock;
449 
450 	/*
451 	 *	You may not queue a frame bigger than the mtu. This is the lowest level
452 	 *	raw protocol and you must do your own fragmentation at this level.
453 	 */
454 
455 	err = -EMSGSIZE;
456 	if (len > dev->mtu + dev->hard_header_len)
457 		goto out_unlock;
458 
459 	err = -ENOBUFS;
460 	skb = sock_wmalloc(sk, len + LL_RESERVED_SPACE(dev), 0, GFP_KERNEL);
461 
462 	/*
463 	 *	If the write buffer is full, then tough. At this level the user gets to
464 	 *	deal with the problem - do your own algorithmic backoffs. That's far
465 	 *	more flexible.
466 	 */
467 
468 	if (skb == NULL)
469 		goto out_unlock;
470 
471 	/*
472 	 *	Fill it in
473 	 */
474 
475 	/* FIXME: Save some space for broken drivers that write a
476 	 * hard header at transmission time by themselves. PPP is the
477 	 * notable one here. This should really be fixed at the driver level.
478 	 */
479 	skb_reserve(skb, LL_RESERVED_SPACE(dev));
480 	skb_reset_network_header(skb);
481 
482 	/* Try to align data part correctly */
483 	if (dev->header_ops) {
484 		skb->data -= dev->hard_header_len;
485 		skb->tail -= dev->hard_header_len;
486 		if (len < dev->hard_header_len)
487 			skb_reset_network_header(skb);
488 	}
489 
490 	/* Returns -EFAULT on error */
491 	err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
492 	skb->protocol = proto;
493 	skb->dev = dev;
494 	skb->priority = sk->sk_priority;
495 	if (err)
496 		goto out_free;
497 
498 	/*
499 	 *	Now send it
500 	 */
501 
502 	dev_queue_xmit(skb);
503 	dev_put(dev);
504 	return len;
505 
506 out_free:
507 	kfree_skb(skb);
508 out_unlock:
509 	if (dev)
510 		dev_put(dev);
511 	return err;
512 }
513 
514 static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
515 				      unsigned int res)
516 {
517 	struct sk_filter *filter;
518 
519 	rcu_read_lock_bh();
520 	filter = rcu_dereference(sk->sk_filter);
521 	if (filter != NULL)
522 		res = sk_run_filter(skb, filter->insns, filter->len);
523 	rcu_read_unlock_bh();
524 
525 	return res;
526 }
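
/*
 * The filter run here is the classic BPF program a user attaches with
 * SO_ATTACH_FILTER.  Illustrative userspace sketch (not part of this
 * file; "fd" is assumed to be an open packet socket) that accepts only
 * IPv4 frames:
 *
 *	struct sock_filter code[] = {
 *		{ BPF_LD  | BPF_H   | BPF_ABS, 0, 0, 12 },	  load ethertype
 *		{ BPF_JMP | BPF_JEQ | BPF_K,   0, 1, ETH_P_IP },
 *		{ BPF_RET | BPF_K,             0, 0, 0xffff },	  accept
 *		{ BPF_RET | BPF_K,             0, 0, 0 },	  drop
 *	};
 *	struct sock_fprog prog = { .len = 4, .filter = code };
 *
 *	setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog));
 *
 * A return value of 0 from the filter drops the packet; a non-zero value
 * caps the number of bytes kept (the snaplen), which is how run_filter()'s
 * result is used by the callers below.
 */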
527 
528 /*
529    This function does lazy skb cloning in the hope that most packets
530    are discarded by BPF.
531 
532    Note tricky part: we DO mangle shared skb! skb->data, skb->len
533    and skb->cb are mangled. It works because (and until) packets
534    falling here are owned by current CPU. Output packets are cloned
535    by dev_queue_xmit_nit(), input packets are processed by net_bh
536    sequentially, so if we return the skb to its original state on exit,
537    we will not harm anyone.
538  */
539 
540 static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
541 {
542 	struct sock *sk;
543 	struct sockaddr_ll *sll;
544 	struct packet_sock *po;
545 	u8 * skb_head = skb->data;
546 	int skb_len = skb->len;
547 	unsigned int snaplen, res;
548 
549 	if (skb->pkt_type == PACKET_LOOPBACK)
550 		goto drop;
551 
552 	sk = pt->af_packet_priv;
553 	po = pkt_sk(sk);
554 
555 	if (dev_net(dev) != sock_net(sk))
556 		goto drop;
557 
558 	skb->dev = dev;
559 
560 	if (dev->header_ops) {
561 		/* The device has an explicit notion of ll header,
562 		   exported to higher levels.
563 
564 		   Otherwise, the device hides the details of its frame
565 		   structure, so that the corresponding packet header is
566 		   never delivered to the user.
567 		 */
568 		if (sk->sk_type != SOCK_DGRAM)
569 			skb_push(skb, skb->data - skb_mac_header(skb));
570 		else if (skb->pkt_type == PACKET_OUTGOING) {
571 			/* Special case: outgoing packets have ll header at head */
572 			skb_pull(skb, skb_network_offset(skb));
573 		}
574 	}
575 
576 	snaplen = skb->len;
577 
578 	res = run_filter(skb, sk, snaplen);
579 	if (!res)
580 		goto drop_n_restore;
581 	if (snaplen > res)
582 		snaplen = res;
583 
584 	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
585 	    (unsigned)sk->sk_rcvbuf)
586 		goto drop_n_acct;
587 
588 	if (skb_shared(skb)) {
589 		struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
590 		if (nskb == NULL)
591 			goto drop_n_acct;
592 
593 		if (skb_head != skb->data) {
594 			skb->data = skb_head;
595 			skb->len = skb_len;
596 		}
597 		kfree_skb(skb);
598 		skb = nskb;
599 	}
600 
601 	BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
602 		     sizeof(skb->cb));
603 
604 	sll = &PACKET_SKB_CB(skb)->sa.ll;
605 	sll->sll_family = AF_PACKET;
606 	sll->sll_hatype = dev->type;
607 	sll->sll_protocol = skb->protocol;
608 	sll->sll_pkttype = skb->pkt_type;
609 	if (unlikely(po->origdev))
610 		sll->sll_ifindex = orig_dev->ifindex;
611 	else
612 		sll->sll_ifindex = dev->ifindex;
613 
614 	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
615 
616 	PACKET_SKB_CB(skb)->origlen = skb->len;
617 
618 	if (pskb_trim(skb, snaplen))
619 		goto drop_n_acct;
620 
621 	skb_set_owner_r(skb, sk);
622 	skb->dev = NULL;
623 	skb_dst_drop(skb);
624 
625 	/* drop conntrack reference */
626 	nf_reset(skb);
627 
628 	spin_lock(&sk->sk_receive_queue.lock);
629 	po->stats.tp_packets++;
630 	__skb_queue_tail(&sk->sk_receive_queue, skb);
631 	spin_unlock(&sk->sk_receive_queue.lock);
632 	sk->sk_data_ready(sk, skb->len);
633 	return 0;
634 
635 drop_n_acct:
636 	spin_lock(&sk->sk_receive_queue.lock);
637 	po->stats.tp_drops++;
638 	spin_unlock(&sk->sk_receive_queue.lock);
639 
640 drop_n_restore:
641 	if (skb_head != skb->data && skb_shared(skb)) {
642 		skb->data = skb_head;
643 		skb->len = skb_len;
644 	}
645 drop:
646 	consume_skb(skb);
647 	return 0;
648 }
649 
650 #ifdef CONFIG_PACKET_MMAP
651 static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
652 {
653 	struct sock *sk;
654 	struct packet_sock *po;
655 	struct sockaddr_ll *sll;
656 	union {
657 		struct tpacket_hdr *h1;
658 		struct tpacket2_hdr *h2;
659 		void *raw;
660 	} h;
661 	u8 * skb_head = skb->data;
662 	int skb_len = skb->len;
663 	unsigned int snaplen, res;
664 	unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
665 	unsigned short macoff, netoff, hdrlen;
666 	struct sk_buff *copy_skb = NULL;
667 	struct timeval tv;
668 	struct timespec ts;
669 
670 	if (skb->pkt_type == PACKET_LOOPBACK)
671 		goto drop;
672 
673 	sk = pt->af_packet_priv;
674 	po = pkt_sk(sk);
675 
676 	if (dev_net(dev) != sock_net(sk))
677 		goto drop;
678 
679 	if (dev->header_ops) {
680 		if (sk->sk_type != SOCK_DGRAM)
681 			skb_push(skb, skb->data - skb_mac_header(skb));
682 		else if (skb->pkt_type == PACKET_OUTGOING) {
683 			/* Special case: outgoing packets have ll header at head */
684 			skb_pull(skb, skb_network_offset(skb));
685 		}
686 	}
687 
688 	if (skb->ip_summed == CHECKSUM_PARTIAL)
689 		status |= TP_STATUS_CSUMNOTREADY;
690 
691 	snaplen = skb->len;
692 
693 	res = run_filter(skb, sk, snaplen);
694 	if (!res)
695 		goto drop_n_restore;
696 	if (snaplen > res)
697 		snaplen = res;
698 
699 	if (sk->sk_type == SOCK_DGRAM) {
700 		macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
701 				  po->tp_reserve;
702 	} else {
703 		unsigned maclen = skb_network_offset(skb);
704 		netoff = TPACKET_ALIGN(po->tp_hdrlen +
705 				       (maclen < 16 ? 16 : maclen)) +
706 			po->tp_reserve;
707 		macoff = netoff - maclen;
708 	}
709 
710 	if (macoff + snaplen > po->rx_ring.frame_size) {
711 		if (po->copy_thresh &&
712 		    atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
713 		    (unsigned)sk->sk_rcvbuf) {
714 			if (skb_shared(skb)) {
715 				copy_skb = skb_clone(skb, GFP_ATOMIC);
716 			} else {
717 				copy_skb = skb_get(skb);
718 				skb_head = skb->data;
719 			}
720 			if (copy_skb)
721 				skb_set_owner_r(copy_skb, sk);
722 		}
723 		snaplen = po->rx_ring.frame_size - macoff;
724 		if ((int)snaplen < 0)
725 			snaplen = 0;
726 	}
727 
728 	spin_lock(&sk->sk_receive_queue.lock);
729 	h.raw = packet_current_frame(po, &po->rx_ring, TP_STATUS_KERNEL);
730 	if (!h.raw)
731 		goto ring_is_full;
732 	packet_increment_head(&po->rx_ring);
733 	po->stats.tp_packets++;
734 	if (copy_skb) {
735 		status |= TP_STATUS_COPY;
736 		__skb_queue_tail(&sk->sk_receive_queue, copy_skb);
737 	}
738 	if (!po->stats.tp_drops)
739 		status &= ~TP_STATUS_LOSING;
740 	spin_unlock(&sk->sk_receive_queue.lock);
741 
742 	skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
743 
744 	switch (po->tp_version) {
745 	case TPACKET_V1:
746 		h.h1->tp_len = skb->len;
747 		h.h1->tp_snaplen = snaplen;
748 		h.h1->tp_mac = macoff;
749 		h.h1->tp_net = netoff;
750 		if (skb->tstamp.tv64)
751 			tv = ktime_to_timeval(skb->tstamp);
752 		else
753 			do_gettimeofday(&tv);
754 		h.h1->tp_sec = tv.tv_sec;
755 		h.h1->tp_usec = tv.tv_usec;
756 		hdrlen = sizeof(*h.h1);
757 		break;
758 	case TPACKET_V2:
759 		h.h2->tp_len = skb->len;
760 		h.h2->tp_snaplen = snaplen;
761 		h.h2->tp_mac = macoff;
762 		h.h2->tp_net = netoff;
763 		if (skb->tstamp.tv64)
764 			ts = ktime_to_timespec(skb->tstamp);
765 		else
766 			getnstimeofday(&ts);
767 		h.h2->tp_sec = ts.tv_sec;
768 		h.h2->tp_nsec = ts.tv_nsec;
769 		h.h2->tp_vlan_tci = skb->vlan_tci;
770 		hdrlen = sizeof(*h.h2);
771 		break;
772 	default:
773 		BUG();
774 	}
775 
776 	sll = h.raw + TPACKET_ALIGN(hdrlen);
777 	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
778 	sll->sll_family = AF_PACKET;
779 	sll->sll_hatype = dev->type;
780 	sll->sll_protocol = skb->protocol;
781 	sll->sll_pkttype = skb->pkt_type;
782 	if (unlikely(po->origdev))
783 		sll->sll_ifindex = orig_dev->ifindex;
784 	else
785 		sll->sll_ifindex = dev->ifindex;
786 
787 	__packet_set_status(po, h.raw, status);
788 	smp_mb();
789 	{
790 		struct page *p_start, *p_end;
791 		u8 *h_end = h.raw + macoff + snaplen - 1;
792 
793 		p_start = virt_to_page(h.raw);
794 		p_end = virt_to_page(h_end);
795 		while (p_start <= p_end) {
796 			flush_dcache_page(p_start);
797 			p_start++;
798 		}
799 	}
800 
801 	sk->sk_data_ready(sk, 0);
802 
803 drop_n_restore:
804 	if (skb_head != skb->data && skb_shared(skb)) {
805 		skb->data = skb_head;
806 		skb->len = skb_len;
807 	}
808 drop:
809 	kfree_skb(skb);
810 	return 0;
811 
812 ring_is_full:
813 	po->stats.tp_drops++;
814 	spin_unlock(&sk->sk_receive_queue.lock);
815 
816 	sk->sk_data_ready(sk, 0);
817 	kfree_skb(copy_skb);
818 	goto drop_n_restore;
819 }
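
/*
 * Layout of a filled RX ring frame as seen by userspace, assuming
 * TPACKET_V2 (illustrative sketch, not part of this file; "frame" is
 * assumed to be a pointer into the mmap()ed ring):
 *
 *	struct tpacket2_hdr *h = frame;
 *
 *	if (h->tp_status & TP_STATUS_USER) {
 *		struct sockaddr_ll *sll = (void *)((char *)frame +
 *				TPACKET_ALIGN(sizeof(struct tpacket2_hdr)));
 *		unsigned char *mac = (unsigned char *)frame + h->tp_mac;
 *		unsigned char *net = (unsigned char *)frame + h->tp_net;
 *		... consume h->tp_snaplen bytes starting at mac ...
 *		h->tp_status = TP_STATUS_KERNEL;
 *	}
 *
 * tp_mac/tp_net are the macoff/netoff values computed above, and the
 * sockaddr_ll is written at TPACKET_ALIGN(hdrlen) just as in this
 * function.
 */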
820 
821 static void tpacket_destruct_skb(struct sk_buff *skb)
822 {
823 	struct packet_sock *po = pkt_sk(skb->sk);
824 	void * ph;
825 
826 	BUG_ON(skb == NULL);
827 
828 	if (likely(po->tx_ring.pg_vec)) {
829 		ph = skb_shinfo(skb)->destructor_arg;
830 		BUG_ON(__packet_get_status(po, ph) != TP_STATUS_SENDING);
831 		BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
832 		atomic_dec(&po->tx_ring.pending);
833 		__packet_set_status(po, ph, TP_STATUS_AVAILABLE);
834 	}
835 
836 	sock_wfree(skb);
837 }
838 
839 static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff * skb,
840 		void * frame, struct net_device *dev, int size_max,
841 		__be16 proto, unsigned char * addr)
842 {
843 	union {
844 		struct tpacket_hdr *h1;
845 		struct tpacket2_hdr *h2;
846 		void *raw;
847 	} ph;
848 	int to_write, offset, len, tp_len, nr_frags, len_max;
849 	struct socket *sock = po->sk.sk_socket;
850 	struct page *page;
851 	void *data;
852 	int err;
853 
854 	ph.raw = frame;
855 
856 	skb->protocol = proto;
857 	skb->dev = dev;
858 	skb->priority = po->sk.sk_priority;
859 	skb_shinfo(skb)->destructor_arg = ph.raw;
860 
861 	switch (po->tp_version) {
862 	case TPACKET_V2:
863 		tp_len = ph.h2->tp_len;
864 		break;
865 	default:
866 		tp_len = ph.h1->tp_len;
867 		break;
868 	}
869 	if (unlikely(tp_len > size_max)) {
870 		printk(KERN_ERR "packet size is too long (%d > %d)\n",
871 				tp_len, size_max);
872 		return -EMSGSIZE;
873 	}
874 
875 	skb_reserve(skb, LL_RESERVED_SPACE(dev));
876 	skb_reset_network_header(skb);
877 
878 	data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
879 	to_write = tp_len;
880 
881 	if (sock->type == SOCK_DGRAM) {
882 		err = dev_hard_header(skb, dev, ntohs(proto), addr,
883 				NULL, tp_len);
884 		if (unlikely(err < 0))
885 			return -EINVAL;
886 	} else if (dev->hard_header_len) {
887 		/* net device doesn't like empty head */
888 		if (unlikely(tp_len <= dev->hard_header_len)) {
889 			printk(KERN_ERR "packet size is too short "
890 					"(%d <= %d)\n", tp_len,
891 					dev->hard_header_len);
892 			return -EINVAL;
893 		}
894 
895 		skb_push(skb, dev->hard_header_len);
896 		err = skb_store_bits(skb, 0, data,
897 				dev->hard_header_len);
898 		if (unlikely(err))
899 			return err;
900 
901 		data += dev->hard_header_len;
902 		to_write -= dev->hard_header_len;
903 	}
904 
905 	err = -EFAULT;
906 	page = virt_to_page(data);
907 	offset = offset_in_page(data);
908 	len_max = PAGE_SIZE - offset;
909 	len = ((to_write > len_max) ? len_max : to_write);
910 
911 	skb->data_len = to_write;
912 	skb->len += to_write;
913 	skb->truesize += to_write;
914 	atomic_add(to_write, &po->sk.sk_wmem_alloc);
915 
916 	while (likely(to_write)) {
917 		nr_frags = skb_shinfo(skb)->nr_frags;
918 
919 		if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
920 			printk(KERN_ERR "Packet exceeds the number "
921 					"of skb frags (%lu)\n",
922 					MAX_SKB_FRAGS);
923 			return -EFAULT;
924 		}
925 
926 		flush_dcache_page(page);
927 		get_page(page);
928 		skb_fill_page_desc(skb,
929 				nr_frags,
930 				page++, offset, len);
931 		to_write -= len;
932 		offset = 0;
933 		len_max = PAGE_SIZE;
934 		len = ((to_write > len_max) ? len_max : to_write);
935 	}
936 
937 	return tp_len;
938 }
939 
940 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
941 {
942 	struct socket *sock;
943 	struct sk_buff *skb;
944 	struct net_device *dev;
945 	__be16 proto;
946 	int ifindex, err, reserve = 0;
947 	void * ph;
948 	struct sockaddr_ll *saddr=(struct sockaddr_ll *)msg->msg_name;
949 	int tp_len, size_max;
950 	unsigned char *addr;
951 	int len_sum = 0;
952 	int status = 0;
953 
954 	sock = po->sk.sk_socket;
955 
956 	mutex_lock(&po->pg_vec_lock);
957 
958 	err = -EBUSY;
959 	if (saddr == NULL) {
960 		ifindex	= po->ifindex;
961 		proto	= po->num;
962 		addr	= NULL;
963 	} else {
964 		err = -EINVAL;
965 		if (msg->msg_namelen < sizeof(struct sockaddr_ll))
966 			goto out;
967 		if (msg->msg_namelen < (saddr->sll_halen
968 					+ offsetof(struct sockaddr_ll,
969 						sll_addr)))
970 			goto out;
971 		ifindex	= saddr->sll_ifindex;
972 		proto	= saddr->sll_protocol;
973 		addr	= saddr->sll_addr;
974 	}
975 
976 	dev = dev_get_by_index(sock_net(&po->sk), ifindex);
977 	err = -ENXIO;
978 	if (unlikely(dev == NULL))
979 		goto out;
980 
981 	reserve = dev->hard_header_len;
982 
983 	err = -ENETDOWN;
984 	if (unlikely(!(dev->flags & IFF_UP)))
985 		goto out_put;
986 
987 	size_max = po->tx_ring.frame_size
988 		- sizeof(struct skb_shared_info)
989 		- po->tp_hdrlen
990 		- LL_ALLOCATED_SPACE(dev)
991 		- sizeof(struct sockaddr_ll);
992 
993 	if (size_max > dev->mtu + reserve)
994 		size_max = dev->mtu + reserve;
995 
996 	do {
997 		ph = packet_current_frame(po, &po->tx_ring,
998 				TP_STATUS_SEND_REQUEST);
999 
1000 		if (unlikely(ph == NULL)) {
1001 			schedule();
1002 			continue;
1003 		}
1004 
1005 		status = TP_STATUS_SEND_REQUEST;
1006 		skb = sock_alloc_send_skb(&po->sk,
1007 				LL_ALLOCATED_SPACE(dev)
1008 				+ sizeof(struct sockaddr_ll),
1009 				0, &err);
1010 
1011 		if (unlikely(skb == NULL))
1012 			goto out_status;
1013 
1014 		tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
1015 				addr);
1016 
1017 		if (unlikely(tp_len < 0)) {
1018 			if (po->tp_loss) {
1019 				__packet_set_status(po, ph,
1020 						TP_STATUS_AVAILABLE);
1021 				packet_increment_head(&po->tx_ring);
1022 				kfree_skb(skb);
1023 				continue;
1024 			} else {
1025 				status = TP_STATUS_WRONG_FORMAT;
1026 				err = tp_len;
1027 				goto out_status;
1028 			}
1029 		}
1030 
1031 		skb->destructor = tpacket_destruct_skb;
1032 		__packet_set_status(po, ph, TP_STATUS_SENDING);
1033 		atomic_inc(&po->tx_ring.pending);
1034 
1035 		status = TP_STATUS_SEND_REQUEST;
1036 		err = dev_queue_xmit(skb);
1037 		if (unlikely(err > 0 && (err = net_xmit_errno(err)) != 0))
1038 			goto out_xmit;
1039 		packet_increment_head(&po->tx_ring);
1040 		len_sum += tp_len;
1041 	}
1042 	while (likely((ph != NULL) || ((!(msg->msg_flags & MSG_DONTWAIT))
1043 					&& (atomic_read(&po->tx_ring.pending))))
1044 	      );
1045 
1046 	err = len_sum;
1047 	goto out_put;
1048 
1049 out_xmit:
1050 	skb->destructor = sock_wfree;
1051 	atomic_dec(&po->tx_ring.pending);
1052 out_status:
1053 	__packet_set_status(po, ph, status);
1054 	kfree_skb(skb);
1055 out_put:
1056 	dev_put(dev);
1057 out:
1058 	mutex_unlock(&po->pg_vec_lock);
1059 	return err;
1060 }
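
/*
 * Userspace counterpart of tpacket_snd(), as an illustrative sketch (not
 * part of this file; assumes a TPACKET_V2 TX ring already set up with
 * PACKET_TX_RING and mmap(), a SOCK_RAW socket "fd", a "frame" slot whose
 * status is TP_STATUS_AVAILABLE, and a prebuilt packet "pkt" of "pkt_len"
 * bytes that includes the link-layer header):
 *
 *	struct tpacket2_hdr *h = frame;
 *	void *data = (char *)frame + TPACKET2_HDRLEN
 *			- sizeof(struct sockaddr_ll);
 *
 *	memcpy(data, pkt, pkt_len);
 *	h->tp_len = pkt_len;
 *	h->tp_status = TP_STATUS_SEND_REQUEST;
 *	send(fd, NULL, 0, 0);		   kicks the transmit loop above
 *
 * The data offset mirrors the "po->tp_hdrlen - sizeof(struct sockaddr_ll)"
 * computation in tpacket_fill_skb().
 */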
1061 #endif
1062 
1063 static int packet_snd(struct socket *sock,
1064 			  struct msghdr *msg, size_t len)
1065 {
1066 	struct sock *sk = sock->sk;
1067 	struct sockaddr_ll *saddr=(struct sockaddr_ll *)msg->msg_name;
1068 	struct sk_buff *skb;
1069 	struct net_device *dev;
1070 	__be16 proto;
1071 	unsigned char *addr;
1072 	int ifindex, err, reserve = 0;
1073 
1074 	/*
1075 	 *	Get and verify the address.
1076 	 */
1077 
1078 	if (saddr == NULL) {
1079 		struct packet_sock *po = pkt_sk(sk);
1080 
1081 		ifindex	= po->ifindex;
1082 		proto	= po->num;
1083 		addr	= NULL;
1084 	} else {
1085 		err = -EINVAL;
1086 		if (msg->msg_namelen < sizeof(struct sockaddr_ll))
1087 			goto out;
1088 		if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
1089 			goto out;
1090 		ifindex	= saddr->sll_ifindex;
1091 		proto	= saddr->sll_protocol;
1092 		addr	= saddr->sll_addr;
1093 	}
1094 
1095 
1096 	dev = dev_get_by_index(sock_net(sk), ifindex);
1097 	err = -ENXIO;
1098 	if (dev == NULL)
1099 		goto out_unlock;
1100 	if (sock->type == SOCK_RAW)
1101 		reserve = dev->hard_header_len;
1102 
1103 	err = -ENETDOWN;
1104 	if (!(dev->flags & IFF_UP))
1105 		goto out_unlock;
1106 
1107 	err = -EMSGSIZE;
1108 	if (len > dev->mtu+reserve)
1109 		goto out_unlock;
1110 
1111 	skb = sock_alloc_send_skb(sk, len + LL_ALLOCATED_SPACE(dev),
1112 				msg->msg_flags & MSG_DONTWAIT, &err);
1113 	if (skb==NULL)
1114 		goto out_unlock;
1115 
1116 	skb_reserve(skb, LL_RESERVED_SPACE(dev));
1117 	skb_reset_network_header(skb);
1118 
1119 	err = -EINVAL;
1120 	if (sock->type == SOCK_DGRAM &&
1121 	    dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len) < 0)
1122 		goto out_free;
1123 
1124 	/* Returns -EFAULT on error */
1125 	err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
1126 	if (err)
1127 		goto out_free;
1128 
1129 	skb->protocol = proto;
1130 	skb->dev = dev;
1131 	skb->priority = sk->sk_priority;
1132 
1133 	/*
1134 	 *	Now send it
1135 	 */
1136 
1137 	err = dev_queue_xmit(skb);
1138 	if (err > 0 && (err = net_xmit_errno(err)) != 0)
1139 		goto out_unlock;
1140 
1141 	dev_put(dev);
1142 
1143 	return len;
1144 
1145 out_free:
1146 	kfree_skb(skb);
1147 out_unlock:
1148 	if (dev)
1149 		dev_put(dev);
1150 out:
1151 	return err;
1152 }
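
/*
 * Illustrative userspace sketch of the SOCK_DGRAM path handled above (not
 * part of this file; "ifindex", "dst_mac", "payload" and "payload_len"
 * are assumptions): dev_hard_header() builds the link-layer header from
 * the sockaddr_ll, so the application supplies only the payload.
 *
 *	int fd = socket(PF_PACKET, SOCK_DGRAM, htons(ETH_P_IP));
 *	struct sockaddr_ll sll = {
 *		.sll_family   = AF_PACKET,
 *		.sll_protocol = htons(ETH_P_IP),
 *		.sll_ifindex  = ifindex,
 *		.sll_halen    = ETH_ALEN,
 *	};
 *
 *	memcpy(sll.sll_addr, dst_mac, ETH_ALEN);
 *	sendto(fd, payload, payload_len, 0,
 *	       (struct sockaddr *)&sll, sizeof(sll));
 *
 * With SOCK_RAW the application would instead place the Ethernet header
 * at the start of the buffer itself.
 */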
1153 
1154 static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
1155 		struct msghdr *msg, size_t len)
1156 {
1157 #ifdef CONFIG_PACKET_MMAP
1158 	struct sock *sk = sock->sk;
1159 	struct packet_sock *po = pkt_sk(sk);
1160 	if (po->tx_ring.pg_vec)
1161 		return tpacket_snd(po, msg);
1162 	else
1163 #endif
1164 		return packet_snd(sock, msg, len);
1165 }
1166 
1167 /*
1168  *	Close a PACKET socket. This is fairly simple. We immediately go
1169  *	to 'closed' state and remove our protocol entry in the device list.
1170  */
1171 
1172 static int packet_release(struct socket *sock)
1173 {
1174 	struct sock *sk = sock->sk;
1175 	struct packet_sock *po;
1176 	struct net *net;
1177 #ifdef CONFIG_PACKET_MMAP
1178 	struct tpacket_req req;
1179 #endif
1180 
1181 	if (!sk)
1182 		return 0;
1183 
1184 	net = sock_net(sk);
1185 	po = pkt_sk(sk);
1186 
1187 	write_lock_bh(&net->packet.sklist_lock);
1188 	sk_del_node_init(sk);
1189 	sock_prot_inuse_add(net, sk->sk_prot, -1);
1190 	write_unlock_bh(&net->packet.sklist_lock);
1191 
1192 	/*
1193 	 *	Unhook packet receive handler.
1194 	 */
1195 
1196 	if (po->running) {
1197 		/*
1198 		 *	Remove the protocol hook
1199 		 */
1200 		dev_remove_pack(&po->prot_hook);
1201 		po->running = 0;
1202 		po->num = 0;
1203 		__sock_put(sk);
1204 	}
1205 
1206 	packet_flush_mclist(sk);
1207 
1208 #ifdef CONFIG_PACKET_MMAP
1209 	memset(&req, 0, sizeof(req));
1210 
1211 	if (po->rx_ring.pg_vec)
1212 		packet_set_ring(sk, &req, 1, 0);
1213 
1214 	if (po->tx_ring.pg_vec)
1215 		packet_set_ring(sk, &req, 1, 1);
1216 #endif
1217 
1218 	/*
1219 	 *	Now the socket is dead. No more input will appear.
1220 	 */
1221 
1222 	sock_orphan(sk);
1223 	sock->sk = NULL;
1224 
1225 	/* Purge queues */
1226 
1227 	skb_queue_purge(&sk->sk_receive_queue);
1228 	sk_refcnt_debug_release(sk);
1229 
1230 	sock_put(sk);
1231 	return 0;
1232 }
1233 
1234 /*
1235  *	Attach a packet hook.
1236  */
1237 
1238 static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
1239 {
1240 	struct packet_sock *po = pkt_sk(sk);
1241 	/*
1242 	 *	Detach an existing hook if present.
1243 	 */
1244 
1245 	lock_sock(sk);
1246 
1247 	spin_lock(&po->bind_lock);
1248 	if (po->running) {
1249 		__sock_put(sk);
1250 		po->running = 0;
1251 		po->num = 0;
1252 		spin_unlock(&po->bind_lock);
1253 		dev_remove_pack(&po->prot_hook);
1254 		spin_lock(&po->bind_lock);
1255 	}
1256 
1257 	po->num = protocol;
1258 	po->prot_hook.type = protocol;
1259 	po->prot_hook.dev = dev;
1260 
1261 	po->ifindex = dev ? dev->ifindex : 0;
1262 
1263 	if (protocol == 0)
1264 		goto out_unlock;
1265 
1266 	if (!dev || (dev->flags & IFF_UP)) {
1267 		dev_add_pack(&po->prot_hook);
1268 		sock_hold(sk);
1269 		po->running = 1;
1270 	} else {
1271 		sk->sk_err = ENETDOWN;
1272 		if (!sock_flag(sk, SOCK_DEAD))
1273 			sk->sk_error_report(sk);
1274 	}
1275 
1276 out_unlock:
1277 	spin_unlock(&po->bind_lock);
1278 	release_sock(sk);
1279 	return 0;
1280 }
1281 
1282 /*
1283  *	Bind a packet socket to a device
1284  */
1285 
1286 static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1287 {
1288 	struct sock *sk=sock->sk;
1289 	char name[15];
1290 	struct net_device *dev;
1291 	int err = -ENODEV;
1292 
1293 	/*
1294 	 *	Check legality
1295 	 */
1296 
1297 	if (addr_len != sizeof(struct sockaddr))
1298 		return -EINVAL;
1299 	strlcpy(name,uaddr->sa_data,sizeof(name));
1300 
1301 	dev = dev_get_by_name(sock_net(sk), name);
1302 	if (dev) {
1303 		err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
1304 		dev_put(dev);
1305 	}
1306 	return err;
1307 }
1308 
1309 static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1310 {
1311 	struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr;
1312 	struct sock *sk=sock->sk;
1313 	struct net_device *dev = NULL;
1314 	int err;
1315 
1316 
1317 	/*
1318 	 *	Check legality
1319 	 */
1320 
1321 	if (addr_len < sizeof(struct sockaddr_ll))
1322 		return -EINVAL;
1323 	if (sll->sll_family != AF_PACKET)
1324 		return -EINVAL;
1325 
1326 	if (sll->sll_ifindex) {
1327 		err = -ENODEV;
1328 		dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
1329 		if (dev == NULL)
1330 			goto out;
1331 	}
1332 	err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
1333 	if (dev)
1334 		dev_put(dev);
1335 
1336 out:
1337 	return err;
1338 }
1339 
1340 static struct proto packet_proto = {
1341 	.name	  = "PACKET",
1342 	.owner	  = THIS_MODULE,
1343 	.obj_size = sizeof(struct packet_sock),
1344 };
1345 
1346 /*
1347  *	Create a packet socket.
1348  */
1349 
1350 static int packet_create(struct net *net, struct socket *sock, int protocol)
1351 {
1352 	struct sock *sk;
1353 	struct packet_sock *po;
1354 	__be16 proto = (__force __be16)protocol; /* weird, but documented */
1355 	int err;
1356 
1357 	if (!capable(CAP_NET_RAW))
1358 		return -EPERM;
1359 	if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
1360 	    sock->type != SOCK_PACKET)
1361 		return -ESOCKTNOSUPPORT;
1362 
1363 	sock->state = SS_UNCONNECTED;
1364 
1365 	err = -ENOBUFS;
1366 	sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
1367 	if (sk == NULL)
1368 		goto out;
1369 
1370 	sock->ops = &packet_ops;
1371 	if (sock->type == SOCK_PACKET)
1372 		sock->ops = &packet_ops_spkt;
1373 
1374 	sock_init_data(sock, sk);
1375 
1376 	po = pkt_sk(sk);
1377 	sk->sk_family = PF_PACKET;
1378 	po->num = proto;
1379 
1380 	sk->sk_destruct = packet_sock_destruct;
1381 	sk_refcnt_debug_inc(sk);
1382 
1383 	/*
1384 	 *	Attach a protocol block
1385 	 */
1386 
1387 	spin_lock_init(&po->bind_lock);
1388 	mutex_init(&po->pg_vec_lock);
1389 	po->prot_hook.func = packet_rcv;
1390 
1391 	if (sock->type == SOCK_PACKET)
1392 		po->prot_hook.func = packet_rcv_spkt;
1393 
1394 	po->prot_hook.af_packet_priv = sk;
1395 
1396 	if (proto) {
1397 		po->prot_hook.type = proto;
1398 		dev_add_pack(&po->prot_hook);
1399 		sock_hold(sk);
1400 		po->running = 1;
1401 	}
1402 
1403 	write_lock_bh(&net->packet.sklist_lock);
1404 	sk_add_node(sk, &net->packet.sklist);
1405 	sock_prot_inuse_add(net, &packet_proto, 1);
1406 	write_unlock_bh(&net->packet.sklist_lock);
1407 	return 0;
1408 out:
1409 	return err;
1410 }
1411 
1412 /*
1413  *	Pull a packet from our receive queue and hand it to the user.
1414  *	If necessary we block.
1415  */
1416 
1417 static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
1418 			  struct msghdr *msg, size_t len, int flags)
1419 {
1420 	struct sock *sk = sock->sk;
1421 	struct sk_buff *skb;
1422 	int copied, err;
1423 	struct sockaddr_ll *sll;
1424 
1425 	err = -EINVAL;
1426 	if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT))
1427 		goto out;
1428 
1429 #if 0
1430 	/* What error should we return now? EUNATTACH? */
1431 	if (pkt_sk(sk)->ifindex < 0)
1432 		return -ENODEV;
1433 #endif
1434 
1435 	/*
1436 	 *	Call the generic datagram receiver. This handles all sorts
1437 	 *	of horrible races and re-entrancy so we can forget about it
1438 	 *	in the protocol layers.
1439 	 *
1440 	 *	Now it will return ENETDOWN if the device has just gone down,
1441 	 *	but then it will block.
1442 	 */
1443 
1444 	skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
1445 
1446 	/*
1447 	 *	An error occurred so return it. Because skb_recv_datagram()
1448 	 *	handles the blocking we don't need to see or worry about
1449 	 *	blocking retries.
1450 	 */
1451 
1452 	if (skb == NULL)
1453 		goto out;
1454 
1455 	/*
1456 	 *	If the address length field is there to be filled in, we fill
1457 	 *	it in now.
1458 	 */
1459 
1460 	sll = &PACKET_SKB_CB(skb)->sa.ll;
1461 	if (sock->type == SOCK_PACKET)
1462 		msg->msg_namelen = sizeof(struct sockaddr_pkt);
1463 	else
1464 		msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);
1465 
1466 	/*
1467 	 *	You lose any data beyond the buffer you gave. If it worries
1468 	 *	a user program, it can ask the device for its MTU anyway.
1469 	 */
1470 
1471 	copied = skb->len;
1472 	if (copied > len) {
1473 		copied = len;
1474 		msg->msg_flags |= MSG_TRUNC;
1475 	}
1477 
1478 	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
1479 	if (err)
1480 		goto out_free;
1481 
1482 	sock_recv_timestamp(msg, sk, skb);
1483 
1484 	if (msg->msg_name)
1485 		memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
1486 		       msg->msg_namelen);
1487 
1488 	if (pkt_sk(sk)->auxdata) {
1489 		struct tpacket_auxdata aux;
1490 
1491 		aux.tp_status = TP_STATUS_USER;
1492 		if (skb->ip_summed == CHECKSUM_PARTIAL)
1493 			aux.tp_status |= TP_STATUS_CSUMNOTREADY;
1494 		aux.tp_len = PACKET_SKB_CB(skb)->origlen;
1495 		aux.tp_snaplen = skb->len;
1496 		aux.tp_mac = 0;
1497 		aux.tp_net = skb_network_offset(skb);
1498 		aux.tp_vlan_tci = skb->vlan_tci;
1499 
1500 		put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
1501 	}
1502 
1503 	/*
1504 	 *	Free or return the buffer as appropriate. Again this
1505 	 *	hides all the races and re-entrancy issues from us.
1506 	 */
1507 	err = (flags&MSG_TRUNC) ? skb->len : copied;
1508 
1509 out_free:
1510 	skb_free_datagram(sk, skb);
1511 out:
1512 	return err;
1513 }
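
/*
 * Reading the PACKET_AUXDATA control message filled in above, as an
 * illustrative userspace sketch (not part of this file; "fd", "buf" and
 * "cbuf" are assumed to be an open packet socket, a data buffer and a
 * control buffer large enough for the cmsg):
 *
 *	int one = 1;
 *	setsockopt(fd, SOL_PACKET, PACKET_AUXDATA, &one, sizeof(one));
 *
 *	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
 *	struct msghdr msg = {
 *		.msg_iov = &iov, .msg_iovlen = 1,
 *		.msg_control = cbuf, .msg_controllen = sizeof(cbuf),
 *	};
 *	struct cmsghdr *cmsg;
 *
 *	recvmsg(fd, &msg, 0);
 *	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg))
 *		if (cmsg->cmsg_level == SOL_PACKET &&
 *		    cmsg->cmsg_type == PACKET_AUXDATA) {
 *			struct tpacket_auxdata *aux = (void *)CMSG_DATA(cmsg);
 *			tp_len is the original length, tp_snaplen what was kept
 *		}
 */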
1514 
1515 static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
1516 			       int *uaddr_len, int peer)
1517 {
1518 	struct net_device *dev;
1519 	struct sock *sk	= sock->sk;
1520 
1521 	if (peer)
1522 		return -EOPNOTSUPP;
1523 
1524 	uaddr->sa_family = AF_PACKET;
1525 	dev = dev_get_by_index(sock_net(sk), pkt_sk(sk)->ifindex);
1526 	if (dev) {
1527 		strlcpy(uaddr->sa_data, dev->name, 15);
1528 		dev_put(dev);
1529 	} else
1530 		memset(uaddr->sa_data, 0, 14);
1531 	*uaddr_len = sizeof(*uaddr);
1532 
1533 	return 0;
1534 }
1535 
1536 static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
1537 			  int *uaddr_len, int peer)
1538 {
1539 	struct net_device *dev;
1540 	struct sock *sk = sock->sk;
1541 	struct packet_sock *po = pkt_sk(sk);
1542 	struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr;
1543 
1544 	if (peer)
1545 		return -EOPNOTSUPP;
1546 
1547 	sll->sll_family = AF_PACKET;
1548 	sll->sll_ifindex = po->ifindex;
1549 	sll->sll_protocol = po->num;
1550 	dev = dev_get_by_index(sock_net(sk), po->ifindex);
1551 	if (dev) {
1552 		sll->sll_hatype = dev->type;
1553 		sll->sll_halen = dev->addr_len;
1554 		memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1555 		dev_put(dev);
1556 	} else {
1557 		sll->sll_hatype = 0;	/* Bad: we have no ARPHRD_UNSPEC */
1558 		sll->sll_halen = 0;
1559 	}
1560 	*uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1561 
1562 	return 0;
1563 }
1564 
1565 static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
1566 			 int what)
1567 {
1568 	switch (i->type) {
1569 	case PACKET_MR_MULTICAST:
1570 		if (what > 0)
1571 			return dev_mc_add(dev, i->addr, i->alen, 0);
1572 		else
1573 			return dev_mc_delete(dev, i->addr, i->alen, 0);
1574 		break;
1575 	case PACKET_MR_PROMISC:
1576 		return dev_set_promiscuity(dev, what);
1577 		break;
1578 	case PACKET_MR_ALLMULTI:
1579 		return dev_set_allmulti(dev, what);
1580 		break;
1581 	case PACKET_MR_UNICAST:
1582 		if (what > 0)
1583 			return dev_unicast_add(dev, i->addr);
1584 		else
1585 			return dev_unicast_delete(dev, i->addr);
1586 		break;
1587 	default:;
1588 	}
1589 	return 0;
1590 }
1591 
1592 static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
1593 {
1594 	for ( ; i; i=i->next) {
1595 		if (i->ifindex == dev->ifindex)
1596 			packet_dev_mc(dev, i, what);
1597 	}
1598 }
1599 
1600 static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1601 {
1602 	struct packet_sock *po = pkt_sk(sk);
1603 	struct packet_mclist *ml, *i;
1604 	struct net_device *dev;
1605 	int err;
1606 
1607 	rtnl_lock();
1608 
1609 	err = -ENODEV;
1610 	dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1611 	if (!dev)
1612 		goto done;
1613 
1614 	err = -EINVAL;
1615 	if (mreq->mr_alen > dev->addr_len)
1616 		goto done;
1617 
1618 	err = -ENOBUFS;
1619 	i = kmalloc(sizeof(*i), GFP_KERNEL);
1620 	if (i == NULL)
1621 		goto done;
1622 
1623 	err = 0;
1624 	for (ml = po->mclist; ml; ml = ml->next) {
1625 		if (ml->ifindex == mreq->mr_ifindex &&
1626 		    ml->type == mreq->mr_type &&
1627 		    ml->alen == mreq->mr_alen &&
1628 		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1629 			ml->count++;
1630 			/* Free the new element ... */
1631 			kfree(i);
1632 			goto done;
1633 		}
1634 	}
1635 
1636 	i->type = mreq->mr_type;
1637 	i->ifindex = mreq->mr_ifindex;
1638 	i->alen = mreq->mr_alen;
1639 	memcpy(i->addr, mreq->mr_address, i->alen);
1640 	i->count = 1;
1641 	i->next = po->mclist;
1642 	po->mclist = i;
1643 	err = packet_dev_mc(dev, i, 1);
1644 	if (err) {
1645 		po->mclist = i->next;
1646 		kfree(i);
1647 	}
1648 
1649 done:
1650 	rtnl_unlock();
1651 	return err;
1652 }
1653 
1654 static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1655 {
1656 	struct packet_mclist *ml, **mlp;
1657 
1658 	rtnl_lock();
1659 
1660 	for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
1661 		if (ml->ifindex == mreq->mr_ifindex &&
1662 		    ml->type == mreq->mr_type &&
1663 		    ml->alen == mreq->mr_alen &&
1664 		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1665 			if (--ml->count == 0) {
1666 				struct net_device *dev;
1667 				*mlp = ml->next;
1668 				dev = dev_get_by_index(sock_net(sk), ml->ifindex);
1669 				if (dev) {
1670 					packet_dev_mc(dev, ml, -1);
1671 					dev_put(dev);
1672 				}
1673 				kfree(ml);
1674 			}
1675 			rtnl_unlock();
1676 			return 0;
1677 		}
1678 	}
1679 	rtnl_unlock();
1680 	return -EADDRNOTAVAIL;
1681 }
1682 
1683 static void packet_flush_mclist(struct sock *sk)
1684 {
1685 	struct packet_sock *po = pkt_sk(sk);
1686 	struct packet_mclist *ml;
1687 
1688 	if (!po->mclist)
1689 		return;
1690 
1691 	rtnl_lock();
1692 	while ((ml = po->mclist) != NULL) {
1693 		struct net_device *dev;
1694 
1695 		po->mclist = ml->next;
1696 		if ((dev = dev_get_by_index(sock_net(sk), ml->ifindex)) != NULL) {
1697 			packet_dev_mc(dev, ml, -1);
1698 			dev_put(dev);
1699 		}
1700 		kfree(ml);
1701 	}
1702 	rtnl_unlock();
1703 }
1704 
1705 static int
1706 packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen)
1707 {
1708 	struct sock *sk = sock->sk;
1709 	struct packet_sock *po = pkt_sk(sk);
1710 	int ret;
1711 
1712 	if (level != SOL_PACKET)
1713 		return -ENOPROTOOPT;
1714 
1715 	switch (optname) {
1716 	case PACKET_ADD_MEMBERSHIP:
1717 	case PACKET_DROP_MEMBERSHIP:
1718 	{
1719 		struct packet_mreq_max mreq;
1720 		int len = optlen;
1721 		memset(&mreq, 0, sizeof(mreq));
1722 		if (len < sizeof(struct packet_mreq))
1723 			return -EINVAL;
1724 		if (len > sizeof(mreq))
1725 			len = sizeof(mreq);
1726 		if (copy_from_user(&mreq,optval,len))
1727 			return -EFAULT;
1728 		if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
1729 			return -EINVAL;
1730 		if (optname == PACKET_ADD_MEMBERSHIP)
1731 			ret = packet_mc_add(sk, &mreq);
1732 		else
1733 			ret = packet_mc_drop(sk, &mreq);
1734 		return ret;
1735 	}
1736 
1737 #ifdef CONFIG_PACKET_MMAP
1738 	case PACKET_RX_RING:
1739 	case PACKET_TX_RING:
1740 	{
1741 		struct tpacket_req req;
1742 
1743 		if (optlen<sizeof(req))
1744 			return -EINVAL;
1745 		if (copy_from_user(&req,optval,sizeof(req)))
1746 			return -EFAULT;
1747 		return packet_set_ring(sk, &req, 0, optname == PACKET_TX_RING);
1748 	}
1749 	case PACKET_COPY_THRESH:
1750 	{
1751 		int val;
1752 
1753 		if (optlen!=sizeof(val))
1754 			return -EINVAL;
1755 		if (copy_from_user(&val,optval,sizeof(val)))
1756 			return -EFAULT;
1757 
1758 		pkt_sk(sk)->copy_thresh = val;
1759 		return 0;
1760 	}
1761 	case PACKET_VERSION:
1762 	{
1763 		int val;
1764 
1765 		if (optlen != sizeof(val))
1766 			return -EINVAL;
1767 		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1768 			return -EBUSY;
1769 		if (copy_from_user(&val, optval, sizeof(val)))
1770 			return -EFAULT;
1771 		switch (val) {
1772 		case TPACKET_V1:
1773 		case TPACKET_V2:
1774 			po->tp_version = val;
1775 			return 0;
1776 		default:
1777 			return -EINVAL;
1778 		}
1779 	}
1780 	case PACKET_RESERVE:
1781 	{
1782 		unsigned int val;
1783 
1784 		if (optlen != sizeof(val))
1785 			return -EINVAL;
1786 		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1787 			return -EBUSY;
1788 		if (copy_from_user(&val, optval, sizeof(val)))
1789 			return -EFAULT;
1790 		po->tp_reserve = val;
1791 		return 0;
1792 	}
1793 	case PACKET_LOSS:
1794 	{
1795 		unsigned int val;
1796 
1797 		if (optlen != sizeof(val))
1798 			return -EINVAL;
1799 		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1800 			return -EBUSY;
1801 		if (copy_from_user(&val, optval, sizeof(val)))
1802 			return -EFAULT;
1803 		po->tp_loss = !!val;
1804 		return 0;
1805 	}
1806 #endif
1807 	case PACKET_AUXDATA:
1808 	{
1809 		int val;
1810 
1811 		if (optlen < sizeof(val))
1812 			return -EINVAL;
1813 		if (copy_from_user(&val, optval, sizeof(val)))
1814 			return -EFAULT;
1815 
1816 		po->auxdata = !!val;
1817 		return 0;
1818 	}
1819 	case PACKET_ORIGDEV:
1820 	{
1821 		int val;
1822 
1823 		if (optlen < sizeof(val))
1824 			return -EINVAL;
1825 		if (copy_from_user(&val, optval, sizeof(val)))
1826 			return -EFAULT;
1827 
1828 		po->origdev = !!val;
1829 		return 0;
1830 	}
1831 	default:
1832 		return -ENOPROTOOPT;
1833 	}
1834 }
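
/*
 * Illustrative userspace sketch of the membership options handled above
 * (not part of this file; "fd" and "ifindex" are assumptions).  Putting
 * the interface into promiscuous mode through the refcounted mclist:
 *
 *	struct packet_mreq mreq = {
 *		.mr_ifindex = ifindex,
 *		.mr_type    = PACKET_MR_PROMISC,
 *	};
 *
 *	setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP, &mreq, sizeof(mreq));
 *
 * A matching PACKET_DROP_MEMBERSHIP call undoes it, and any remaining
 * references are dropped by packet_flush_mclist() when the socket closes.
 */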
1835 
1836 static int packet_getsockopt(struct socket *sock, int level, int optname,
1837 			     char __user *optval, int __user *optlen)
1838 {
1839 	int len;
1840 	int val;
1841 	struct sock *sk = sock->sk;
1842 	struct packet_sock *po = pkt_sk(sk);
1843 	void *data;
1844 	struct tpacket_stats st;
1845 
1846 	if (level != SOL_PACKET)
1847 		return -ENOPROTOOPT;
1848 
1849 	if (get_user(len, optlen))
1850 		return -EFAULT;
1851 
1852 	if (len < 0)
1853 		return -EINVAL;
1854 
1855 	switch (optname) {
1856 	case PACKET_STATISTICS:
1857 		if (len > sizeof(struct tpacket_stats))
1858 			len = sizeof(struct tpacket_stats);
1859 		spin_lock_bh(&sk->sk_receive_queue.lock);
1860 		st = po->stats;
1861 		memset(&po->stats, 0, sizeof(st));
1862 		spin_unlock_bh(&sk->sk_receive_queue.lock);
1863 		st.tp_packets += st.tp_drops;
1864 
1865 		data = &st;
1866 		break;
1867 	case PACKET_AUXDATA:
1868 		if (len > sizeof(int))
1869 			len = sizeof(int);
1870 		val = po->auxdata;
1871 
1872 		data = &val;
1873 		break;
1874 	case PACKET_ORIGDEV:
1875 		if (len > sizeof(int))
1876 			len = sizeof(int);
1877 		val = po->origdev;
1878 
1879 		data = &val;
1880 		break;
1881 #ifdef CONFIG_PACKET_MMAP
1882 	case PACKET_VERSION:
1883 		if (len > sizeof(int))
1884 			len = sizeof(int);
1885 		val = po->tp_version;
1886 		data = &val;
1887 		break;
1888 	case PACKET_HDRLEN:
1889 		if (len > sizeof(int))
1890 			len = sizeof(int);
1891 		if (copy_from_user(&val, optval, len))
1892 			return -EFAULT;
1893 		switch (val) {
1894 		case TPACKET_V1:
1895 			val = sizeof(struct tpacket_hdr);
1896 			break;
1897 		case TPACKET_V2:
1898 			val = sizeof(struct tpacket2_hdr);
1899 			break;
1900 		default:
1901 			return -EINVAL;
1902 		}
1903 		data = &val;
1904 		break;
1905 	case PACKET_RESERVE:
1906 		if (len > sizeof(unsigned int))
1907 			len = sizeof(unsigned int);
1908 		val = po->tp_reserve;
1909 		data = &val;
1910 		break;
1911 	case PACKET_LOSS:
1912 		if (len > sizeof(unsigned int))
1913 			len = sizeof(unsigned int);
1914 		val = po->tp_loss;
1915 		data = &val;
1916 		break;
1917 #endif
1918 	default:
1919 		return -ENOPROTOOPT;
1920 	}
1921 
1922 	if (put_user(len, optlen))
1923 		return -EFAULT;
1924 	if (copy_to_user(optval, data, len))
1925 		return -EFAULT;
1926 	return 0;
1927 }
1928 
1929 
1930 static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
1931 {
1932 	struct sock *sk;
1933 	struct hlist_node *node;
1934 	struct net_device *dev = data;
1935 	struct net *net = dev_net(dev);
1936 
1937 	read_lock(&net->packet.sklist_lock);
1938 	sk_for_each(sk, node, &net->packet.sklist) {
1939 		struct packet_sock *po = pkt_sk(sk);
1940 
1941 		switch (msg) {
1942 		case NETDEV_UNREGISTER:
1943 			if (po->mclist)
1944 				packet_dev_mclist(dev, po->mclist, -1);
1945 			/* fallthrough */
1946 
1947 		case NETDEV_DOWN:
1948 			if (dev->ifindex == po->ifindex) {
1949 				spin_lock(&po->bind_lock);
1950 				if (po->running) {
1951 					__dev_remove_pack(&po->prot_hook);
1952 					__sock_put(sk);
1953 					po->running = 0;
1954 					sk->sk_err = ENETDOWN;
1955 					if (!sock_flag(sk, SOCK_DEAD))
1956 						sk->sk_error_report(sk);
1957 				}
1958 				if (msg == NETDEV_UNREGISTER) {
1959 					po->ifindex = -1;
1960 					po->prot_hook.dev = NULL;
1961 				}
1962 				spin_unlock(&po->bind_lock);
1963 			}
1964 			break;
1965 		case NETDEV_UP:
1966 			spin_lock(&po->bind_lock);
1967 			if (dev->ifindex == po->ifindex && po->num &&
1968 			    !po->running) {
1969 				dev_add_pack(&po->prot_hook);
1970 				sock_hold(sk);
1971 				po->running = 1;
1972 			}
1973 			spin_unlock(&po->bind_lock);
1974 			break;
1975 		}
1976 	}
1977 	read_unlock(&net->packet.sklist_lock);
1978 	return NOTIFY_DONE;
1979 }
1980 
1981 
1982 static int packet_ioctl(struct socket *sock, unsigned int cmd,
1983 			unsigned long arg)
1984 {
1985 	struct sock *sk = sock->sk;
1986 
1987 	switch (cmd) {
1988 		case SIOCOUTQ:
1989 		{
1990 			int amount = sk_wmem_alloc_get(sk);
1991 
1992 			return put_user(amount, (int __user *)arg);
1993 		}
1994 		case SIOCINQ:
1995 		{
1996 			struct sk_buff *skb;
1997 			int amount = 0;
1998 
1999 			spin_lock_bh(&sk->sk_receive_queue.lock);
2000 			skb = skb_peek(&sk->sk_receive_queue);
2001 			if (skb)
2002 				amount = skb->len;
2003 			spin_unlock_bh(&sk->sk_receive_queue.lock);
2004 			return put_user(amount, (int __user *)arg);
2005 		}
2006 		case SIOCGSTAMP:
2007 			return sock_get_timestamp(sk, (struct timeval __user *)arg);
2008 		case SIOCGSTAMPNS:
2009 			return sock_get_timestampns(sk, (struct timespec __user *)arg);
2010 
2011 #ifdef CONFIG_INET
2012 		case SIOCADDRT:
2013 		case SIOCDELRT:
2014 		case SIOCDARP:
2015 		case SIOCGARP:
2016 		case SIOCSARP:
2017 		case SIOCGIFADDR:
2018 		case SIOCSIFADDR:
2019 		case SIOCGIFBRDADDR:
2020 		case SIOCSIFBRDADDR:
2021 		case SIOCGIFNETMASK:
2022 		case SIOCSIFNETMASK:
2023 		case SIOCGIFDSTADDR:
2024 		case SIOCSIFDSTADDR:
2025 		case SIOCSIFFLAGS:
2026 			if (!net_eq(sock_net(sk), &init_net))
2027 				return -ENOIOCTLCMD;
2028 			return inet_dgram_ops.ioctl(sock, cmd, arg);
2029 #endif
2030 
2031 		default:
2032 			return -ENOIOCTLCMD;
2033 	}
2034 	return 0;
2035 }
2036 
2037 #ifndef CONFIG_PACKET_MMAP
2038 #define packet_mmap sock_no_mmap
2039 #define packet_poll datagram_poll
2040 #else
2041 
2042 static unsigned int packet_poll(struct file * file, struct socket *sock,
2043 				poll_table *wait)
2044 {
2045 	struct sock *sk = sock->sk;
2046 	struct packet_sock *po = pkt_sk(sk);
2047 	unsigned int mask = datagram_poll(file, sock, wait);
2048 
2049 	spin_lock_bh(&sk->sk_receive_queue.lock);
2050 	if (po->rx_ring.pg_vec) {
2051 		if (!packet_previous_frame(po, &po->rx_ring, TP_STATUS_KERNEL))
2052 			mask |= POLLIN | POLLRDNORM;
2053 	}
2054 	spin_unlock_bh(&sk->sk_receive_queue.lock);
2055 	spin_lock_bh(&sk->sk_write_queue.lock);
2056 	if (po->tx_ring.pg_vec) {
2057 		if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
2058 			mask |= POLLOUT | POLLWRNORM;
2059 	}
2060 	spin_unlock_bh(&sk->sk_write_queue.lock);
2061 	return mask;
2062 }
2063 
2064 
2065 /* Dirty? Well, I still have not learned a better way to account
2066  * for user mmaps.
2067  */
2068 
2069 static void packet_mm_open(struct vm_area_struct *vma)
2070 {
2071 	struct file *file = vma->vm_file;
2072 	struct socket * sock = file->private_data;
2073 	struct sock *sk = sock->sk;
2074 
2075 	if (sk)
2076 		atomic_inc(&pkt_sk(sk)->mapped);
2077 }
2078 
2079 static void packet_mm_close(struct vm_area_struct *vma)
2080 {
2081 	struct file *file = vma->vm_file;
2082 	struct socket * sock = file->private_data;
2083 	struct sock *sk = sock->sk;
2084 
2085 	if (sk)
2086 		atomic_dec(&pkt_sk(sk)->mapped);
2087 }
2088 
2089 static struct vm_operations_struct packet_mmap_ops = {
2090 	.open	= packet_mm_open,
2091 	.close	= packet_mm_close,
2092 };
2093 
2094 static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
2095 {
2096 	int i;
2097 
2098 	for (i = 0; i < len; i++) {
2099 		if (likely(pg_vec[i]))
2100 			free_pages((unsigned long) pg_vec[i], order);
2101 	}
2102 	kfree(pg_vec);
2103 }
2104 
2105 static inline char *alloc_one_pg_vec_page(unsigned long order)
2106 {
2107 	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | __GFP_NOWARN;
2108 
2109 	return (char *) __get_free_pages(gfp_flags, order);
2110 }
2111 
2112 static char **alloc_pg_vec(struct tpacket_req *req, int order)
2113 {
2114 	unsigned int block_nr = req->tp_block_nr;
2115 	char **pg_vec;
2116 	int i;
2117 
2118 	pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL);
2119 	if (unlikely(!pg_vec))
2120 		goto out;
2121 
2122 	for (i = 0; i < block_nr; i++) {
2123 		pg_vec[i] = alloc_one_pg_vec_page(order);
2124 		if (unlikely(!pg_vec[i]))
2125 			goto out_free_pgvec;
2126 	}
2127 
2128 out:
2129 	return pg_vec;
2130 
2131 out_free_pgvec:
2132 	free_pg_vec(pg_vec, order, block_nr);
2133 	pg_vec = NULL;
2134 	goto out;
2135 }
2136 
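/*
 * Configure (or, for an all-zero request, tear down) the rx or tx ring:
 * sanity-check the request, allocate the block vector, temporarily unhook
 * the protocol handler, swap the new vector in under pg_vec_lock (choosing
 * tpacket_rcv vs packet_rcv as the receive function), purge the pending
 * queue and rehook the handler.  Fails with -EBUSY while the old ring is
 * still mmap()ed or has pending tx frames.
 */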
2137 static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
2138 		int closing, int tx_ring)
2139 {
2140 	char **pg_vec = NULL;
2141 	struct packet_sock *po = pkt_sk(sk);
2142 	int was_running, order = 0;
2143 	struct packet_ring_buffer *rb;
2144 	struct sk_buff_head *rb_queue;
2145 	__be16 num;
2146 	int err;
2147 
2148 	rb = tx_ring ? &po->tx_ring : &po->rx_ring;
2149 	rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
2150 
2151 	err = -EBUSY;
2152 	if (!closing) {
2153 		if (atomic_read(&po->mapped))
2154 			goto out;
2155 		if (atomic_read(&rb->pending))
2156 			goto out;
2157 	}
2158 
2159 	if (req->tp_block_nr) {
2160 		/* Sanity tests and some calculations */
2161 		err = -EBUSY;
2162 		if (unlikely(rb->pg_vec))
2163 			goto out;
2164 
2165 		switch (po->tp_version) {
2166 		case TPACKET_V1:
2167 			po->tp_hdrlen = TPACKET_HDRLEN;
2168 			break;
2169 		case TPACKET_V2:
2170 			po->tp_hdrlen = TPACKET2_HDRLEN;
2171 			break;
2172 		}
2173 
2174 		err = -EINVAL;
2175 		if (unlikely((int)req->tp_block_size <= 0))
2176 			goto out;
2177 		if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
2178 			goto out;
2179 		if (unlikely(req->tp_frame_size < po->tp_hdrlen +
2180 					po->tp_reserve))
2181 			goto out;
2182 		if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
2183 			goto out;
2184 
2185 		rb->frames_per_block = req->tp_block_size / req->tp_frame_size;
2186 		if (unlikely(rb->frames_per_block <= 0))
2187 			goto out;
2188 		if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
2189 					req->tp_frame_nr))
2190 			goto out;
2191 
2192 		err = -ENOMEM;
2193 		order = get_order(req->tp_block_size);
2194 		pg_vec = alloc_pg_vec(req, order);
2195 		if (unlikely(!pg_vec))
2196 			goto out;
2197 	} else {
2198 		/* A zero-block request (no frames either) tears the ring down */
2200 		err = -EINVAL;
2201 		if (unlikely(req->tp_frame_nr))
2202 			goto out;
2203 	}
2204 
2205 	lock_sock(sk);
2206 
2207 	/* Detach socket from network */
2208 	spin_lock(&po->bind_lock);
2209 	was_running = po->running;
2210 	num = po->num;
2211 	if (was_running) {
2212 		__dev_remove_pack(&po->prot_hook);
2213 		po->num = 0;
2214 		po->running = 0;
2215 		__sock_put(sk);
2216 	}
2217 	spin_unlock(&po->bind_lock);
2218 
2219 	synchronize_net();
2220 
2221 	err = -EBUSY;
2222 	mutex_lock(&po->pg_vec_lock);
2223 	if (closing || atomic_read(&po->mapped) == 0) {
2224 		err = 0;
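/* XC(a, b): store b in a and evaluate to the old value of a */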
2225 #define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })
2226 		spin_lock_bh(&rb_queue->lock);
2227 		pg_vec = XC(rb->pg_vec, pg_vec);
2228 		rb->frame_max = (req->tp_frame_nr - 1);
2229 		rb->head = 0;
2230 		rb->frame_size = req->tp_frame_size;
2231 		spin_unlock_bh(&rb_queue->lock);
2232 
2233 		order = XC(rb->pg_vec_order, order);
2234 		req->tp_block_nr = XC(rb->pg_vec_len, req->tp_block_nr);
2235 
2236 		rb->pg_vec_pages = req->tp_block_size / PAGE_SIZE;
2237 		po->prot_hook.func = (po->rx_ring.pg_vec) ?
2238 						tpacket_rcv : packet_rcv;
2239 		skb_queue_purge(rb_queue);
2240 #undef XC
2241 		if (atomic_read(&po->mapped))
2242 			printk(KERN_DEBUG "packet_mmap: vma is busy: %d\n",
2243 						atomic_read(&po->mapped));
2244 	}
2245 	mutex_unlock(&po->pg_vec_lock);
2246 
2247 	spin_lock(&po->bind_lock);
2248 	if (was_running && !po->running) {
2249 		sock_hold(sk);
2250 		po->running = 1;
2251 		po->num = num;
2252 		dev_add_pack(&po->prot_hook);
2253 	}
2254 	spin_unlock(&po->bind_lock);
2255 
2256 	release_sock(sk);
2257 
2258 	if (pg_vec)
2259 		free_pg_vec(pg_vec, order, req->tp_block_nr);
2260 out:
2261 	return err;
2262 }
2263 
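/*
 * Map the configured ring(s) into user space: the rx ring first, then the
 * tx ring, back to back.  The mapping must start at offset 0 and its length
 * must equal the combined size of all configured rings.
 *
 * Purely illustrative (not part of this file): a minimal user-space sketch
 * of creating and mapping an rx ring.  The sizes are example values that
 * satisfy packet_set_ring()'s checks (block size a multiple of PAGE_SIZE,
 * frame size TPACKET_ALIGNMENT-aligned, frames/block * blocks == frames);
 * error handling is omitted.
 *
 *	struct tpacket_req req = {
 *		.tp_block_size	= 4096,
 *		.tp_block_nr	= 64,
 *		.tp_frame_size	= 2048,
 *		.tp_frame_nr	= 128,
 *	};
 *	int fd = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *	void *ring = mmap(NULL, req.tp_block_size * req.tp_block_nr,
 *			  PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 */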
2264 static int packet_mmap(struct file *file, struct socket *sock,
2265 		struct vm_area_struct *vma)
2266 {
2267 	struct sock *sk = sock->sk;
2268 	struct packet_sock *po = pkt_sk(sk);
2269 	unsigned long size, expected_size;
2270 	struct packet_ring_buffer *rb;
2271 	unsigned long start;
2272 	int err = -EINVAL;
2273 	int i;
2274 
2275 	if (vma->vm_pgoff)
2276 		return -EINVAL;
2277 
2278 	mutex_lock(&po->pg_vec_lock);
2279 
2280 	expected_size = 0;
2281 	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
2282 		if (rb->pg_vec) {
2283 			expected_size += rb->pg_vec_len
2284 						* rb->pg_vec_pages
2285 						* PAGE_SIZE;
2286 		}
2287 	}
2288 
2289 	if (expected_size == 0)
2290 		goto out;
2291 
2292 	size = vma->vm_end - vma->vm_start;
2293 	if (size != expected_size)
2294 		goto out;
2295 
2296 	start = vma->vm_start;
2297 	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
2298 		if (rb->pg_vec == NULL)
2299 			continue;
2300 
2301 		for (i = 0; i < rb->pg_vec_len; i++) {
2302 			struct page *page = virt_to_page(rb->pg_vec[i]);
2303 			int pg_num;
2304 
2305 			for (pg_num = 0; pg_num < rb->pg_vec_pages;
2306 					pg_num++, page++) {
2307 				err = vm_insert_page(vma, start, page);
2308 				if (unlikely(err))
2309 					goto out;
2310 				start += PAGE_SIZE;
2311 			}
2312 		}
2313 	}
2314 
2315 	atomic_inc(&po->mapped);
2316 	vma->vm_ops = &packet_mmap_ops;
2317 	err = 0;
2318 
2319 out:
2320 	mutex_unlock(&po->pg_vec_lock);
2321 	return err;
2322 }
2323 #endif
2324 
2325 
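/*
 * Two sets of socket operations: packet_ops_spkt backs the obsolete
 * SOCK_PACKET interface, packet_ops backs SOCK_RAW and SOCK_DGRAM
 * packet sockets.
 */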
2326 static const struct proto_ops packet_ops_spkt = {
2327 	.family =	PF_PACKET,
2328 	.owner =	THIS_MODULE,
2329 	.release =	packet_release,
2330 	.bind =		packet_bind_spkt,
2331 	.connect =	sock_no_connect,
2332 	.socketpair =	sock_no_socketpair,
2333 	.accept =	sock_no_accept,
2334 	.getname =	packet_getname_spkt,
2335 	.poll =		datagram_poll,
2336 	.ioctl =	packet_ioctl,
2337 	.listen =	sock_no_listen,
2338 	.shutdown =	sock_no_shutdown,
2339 	.setsockopt =	sock_no_setsockopt,
2340 	.getsockopt =	sock_no_getsockopt,
2341 	.sendmsg =	packet_sendmsg_spkt,
2342 	.recvmsg =	packet_recvmsg,
2343 	.mmap =		sock_no_mmap,
2344 	.sendpage =	sock_no_sendpage,
2345 };
2346 
2347 static const struct proto_ops packet_ops = {
2348 	.family =	PF_PACKET,
2349 	.owner =	THIS_MODULE,
2350 	.release =	packet_release,
2351 	.bind =		packet_bind,
2352 	.connect =	sock_no_connect,
2353 	.socketpair =	sock_no_socketpair,
2354 	.accept =	sock_no_accept,
2355 	.getname =	packet_getname,
2356 	.poll =		packet_poll,
2357 	.ioctl =	packet_ioctl,
2358 	.listen =	sock_no_listen,
2359 	.shutdown =	sock_no_shutdown,
2360 	.setsockopt =	packet_setsockopt,
2361 	.getsockopt =	packet_getsockopt,
2362 	.sendmsg =	packet_sendmsg,
2363 	.recvmsg =	packet_recvmsg,
2364 	.mmap =		packet_mmap,
2365 	.sendpage =	sock_no_sendpage,
2366 };
2367 
2368 static struct net_proto_family packet_family_ops = {
2369 	.family =	PF_PACKET,
2370 	.create =	packet_create,
2371 	.owner	=	THIS_MODULE,
2372 };
2373 
2374 static struct notifier_block packet_netdev_notifier = {
2375 	.notifier_call = packet_notifier,
2376 };
2377 
2378 #ifdef CONFIG_PROC_FS
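/* seq_file plumbing behind /proc/net/packet */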
2379 static inline struct sock *packet_seq_idx(struct net *net, loff_t off)
2380 {
2381 	struct sock *s;
2382 	struct hlist_node *node;
2383 
2384 	sk_for_each(s, node, &net->packet.sklist) {
2385 		if (!off--)
2386 			return s;
2387 	}
2388 	return NULL;
2389 }
2390 
2391 static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
2392 	__acquires(seq_file_net(seq)->packet.sklist_lock)
2393 {
2394 	struct net *net = seq_file_net(seq);
2395 	read_lock(&net->packet.sklist_lock);
2396 	return *pos ? packet_seq_idx(net, *pos - 1) : SEQ_START_TOKEN;
2397 }
2398 
2399 static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2400 {
2401 	struct net *net = seq_file_net(seq);
2402 	++*pos;
2403 	return (v == SEQ_START_TOKEN)
2404 		? sk_head(&net->packet.sklist)
2405 		: sk_next((struct sock *)v);
2406 }
2407 
2408 static void packet_seq_stop(struct seq_file *seq, void *v)
2409 	__releases(seq_file_net(seq)->packet.sklist_lock)
2410 {
2411 	struct net *net = seq_file_net(seq);
2412 	read_unlock(&net->packet.sklist_lock);
2413 }
2414 
2415 static int packet_seq_show(struct seq_file *seq, void *v)
2416 {
2417 	if (v == SEQ_START_TOKEN)
2418 		seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
2419 	else {
2420 		struct sock *s = v;
2421 		const struct packet_sock *po = pkt_sk(s);
2422 
2423 		seq_printf(seq,
2424 			   "%p %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
2425 			   s,
2426 			   atomic_read(&s->sk_refcnt),
2427 			   s->sk_type,
2428 			   ntohs(po->num),
2429 			   po->ifindex,
2430 			   po->running,
2431 			   atomic_read(&s->sk_rmem_alloc),
2432 			   sock_i_uid(s),
2433 			   sock_i_ino(s));
2434 	}
2435 
2436 	return 0;
2437 }
2438 
2439 static const struct seq_operations packet_seq_ops = {
2440 	.start	= packet_seq_start,
2441 	.next	= packet_seq_next,
2442 	.stop	= packet_seq_stop,
2443 	.show	= packet_seq_show,
2444 };
2445 
2446 static int packet_seq_open(struct inode *inode, struct file *file)
2447 {
2448 	return seq_open_net(inode, file, &packet_seq_ops,
2449 			    sizeof(struct seq_net_private));
2450 }
2451 
2452 static const struct file_operations packet_seq_fops = {
2453 	.owner		= THIS_MODULE,
2454 	.open		= packet_seq_open,
2455 	.read		= seq_read,
2456 	.llseek		= seq_lseek,
2457 	.release	= seq_release_net,
2458 };
2459 
2460 #endif
2461 
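/*
 * Per-namespace setup: initialise the socket list and create the
 * /proc/net/packet entry; packet_net_exit() removes it again.
 */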
2462 static int packet_net_init(struct net *net)
2463 {
2464 	rwlock_init(&net->packet.sklist_lock);
2465 	INIT_HLIST_HEAD(&net->packet.sklist);
2466 
2467 	if (!proc_net_fops_create(net, "packet", 0, &packet_seq_fops))
2468 		return -ENOMEM;
2469 
2470 	return 0;
2471 }
2472 
2473 static void packet_net_exit(struct net *net)
2474 {
2475 	proc_net_remove(net, "packet");
2476 }
2477 
2478 static struct pernet_operations packet_net_ops = {
2479 	.init = packet_net_init,
2480 	.exit = packet_net_exit,
2481 };
2482 
2483 
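/*
 * Module init registers the proto, the PF_PACKET socket family, the
 * per-namespace ops and the netdevice notifier; module exit unwinds
 * them in the reverse order.
 */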
2484 static void __exit packet_exit(void)
2485 {
2486 	unregister_netdevice_notifier(&packet_netdev_notifier);
2487 	unregister_pernet_subsys(&packet_net_ops);
2488 	sock_unregister(PF_PACKET);
2489 	proto_unregister(&packet_proto);
2490 }
2491 
2492 static int __init packet_init(void)
2493 {
2494 	int rc = proto_register(&packet_proto, 0);
2495 
2496 	if (rc != 0)
2497 		goto out;
2498 
2499 	sock_register(&packet_family_ops);
2500 	register_pernet_subsys(&packet_net_ops);
2501 	register_netdevice_notifier(&packet_netdev_notifier);
2502 out:
2503 	return rc;
2504 }
2505 
2506 module_init(packet_init);
2507 module_exit(packet_exit);
2508 MODULE_LICENSE("GPL");
2509 MODULE_ALIAS_NETPROTO(PF_PACKET);
2510