xref: /linux/net/packet/af_packet.c (revision e27ecdd94d81e5bc3d1f68591701db5adb342f0d)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		PACKET - implements raw packet sockets.
7  *
8  * Authors:	Ross Biro
9  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *
12  * Fixes:
13  *		Alan Cox	:	verify_area() now used correctly
14  *		Alan Cox	:	new skbuff lists, look ma no backlogs!
15  *		Alan Cox	:	tidied skbuff lists.
16  *		Alan Cox	:	Now uses generic datagram routines I
17  *					added. Also fixed the peek/read crash
18  *					from all old Linux datagram code.
19  *		Alan Cox	:	Uses the improved datagram code.
20  *		Alan Cox	:	Added NULL's for socket options.
21  *		Alan Cox	:	Re-commented the code.
22  *		Alan Cox	:	Use new kernel side addressing
23  *		Rob Janssen	:	Correct MTU usage.
24  *		Dave Platt	:	Counter leaks caused by incorrect
25  *					interrupt locking and some slightly
26  *					dubious gcc output. Can you read
27  *					compiler: it said _VOLATILE_
28  *	Richard Kooijman	:	Timestamp fixes.
29  *		Alan Cox	:	New buffers. Use sk->mac.raw.
30  *		Alan Cox	:	sendmsg/recvmsg support.
31  *		Alan Cox	:	Protocol setting support
32  *	Alexey Kuznetsov	:	Untied from IPv4 stack.
33  *	Cyrus Durgin		:	Fixed kerneld for kmod.
34  *	Michal Ostrowski        :       Module initialization cleanup.
35  *         Ulises Alonso        :       Frame number limit removal and
36  *                                      packet_set_ring memory leak.
37  *		Eric Biederman	:	Allow for > 8 byte hardware addresses.
38  *					The convention is that longer addresses
39  *					will simply extend the hardware address
40  *					byte arrays at the end of sockaddr_ll
41  *					and packet_mreq.
42  *		Johann Baudy	:	Added TX RING.
43  *
44  *		This program is free software; you can redistribute it and/or
45  *		modify it under the terms of the GNU General Public License
46  *		as published by the Free Software Foundation; either version
47  *		2 of the License, or (at your option) any later version.
48  *
49  */
50 
51 #include <linux/types.h>
52 #include <linux/mm.h>
53 #include <linux/capability.h>
54 #include <linux/fcntl.h>
55 #include <linux/socket.h>
56 #include <linux/in.h>
57 #include <linux/inet.h>
58 #include <linux/netdevice.h>
59 #include <linux/if_packet.h>
60 #include <linux/wireless.h>
61 #include <linux/kernel.h>
62 #include <linux/kmod.h>
63 #include <net/net_namespace.h>
64 #include <net/ip.h>
65 #include <net/protocol.h>
66 #include <linux/skbuff.h>
67 #include <net/sock.h>
68 #include <linux/errno.h>
69 #include <linux/timer.h>
70 #include <asm/system.h>
71 #include <asm/uaccess.h>
72 #include <asm/ioctls.h>
73 #include <asm/page.h>
74 #include <asm/cacheflush.h>
75 #include <asm/io.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/poll.h>
79 #include <linux/module.h>
80 #include <linux/init.h>
81 #include <linux/mutex.h>
82 
83 #ifdef CONFIG_INET
84 #include <net/inet_common.h>
85 #endif
86 
87 /*
88    Assumptions:
89    - if the device has no dev->hard_header routine, it adds and removes the ll
90      header itself. In this case the ll header is invisible outside the device,
91      but higher levels should still reserve dev->hard_header_len.
92      Some devices are clever enough to reallocate the skb when the header
93      will not fit in the reserved space (tunnels); others are not (PPP).
94    - the packet socket receives packets with the ll header already pulled,
95      so SOCK_RAW should push it back.
97 
98 On receive:
99 -----------
100 
101 Incoming, dev->hard_header!=NULL
102    mac_header -> ll header
103    data       -> data
104 
105 Outgoing, dev->hard_header!=NULL
106    mac_header -> ll header
107    data       -> ll header
108 
109 Incoming, dev->hard_header==NULL
110    mac_header -> UNKNOWN position. It very likely points to the ll
111 		 header.  PPP does this, which is wrong, because it introduces
112 		 asymmetry between the rx and tx paths.
113    data       -> data
114 
115 Outgoing, dev->hard_header==NULL
116    mac_header -> data. ll header is still not built!
117    data       -> data
118 
119 Summary
120   If dev->hard_header==NULL we are unlikely to restore a sensible ll header.
121 
122 
123 On transmit:
124 ------------
125 
126 dev->hard_header != NULL
127    mac_header -> ll header
128    data       -> ll header
129 
130 dev->hard_header == NULL (ll header is added by device, we cannot control it)
131    mac_header -> data
132    data       -> data
133 
134    We should set nh.raw on output to the correct position;
135    the packet classifier depends on it.
136  */
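
/*
 * Purely illustrative user-space sketch (not part of this file) of the
 * distinction above: a SOCK_RAW packet socket sees frames starting at the
 * ll header, a SOCK_DGRAM one starting at the network header.  The
 * interface name is an assumption.
 *
 *	#include <sys/socket.h>
 *	#include <linux/if_packet.h>
 *	#include <linux/if_ether.h>
 *	#include <net/if.h>
 *	#include <arpa/inet.h>
 *	#include <string.h>
 *
 *	static int open_packet_sock(const char *ifname, int type)
 *	{
 *		// type is SOCK_RAW (ll header visible) or SOCK_DGRAM
 *		int fd = socket(AF_PACKET, type, htons(ETH_P_ALL));
 *		struct sockaddr_ll sll;
 *
 *		if (fd < 0)
 *			return -1;
 *		memset(&sll, 0, sizeof(sll));
 *		sll.sll_family   = AF_PACKET;
 *		sll.sll_protocol = htons(ETH_P_ALL);
 *		sll.sll_ifindex  = if_nametoindex(ifname);	// e.g. "eth0"
 *		if (bind(fd, (struct sockaddr *)&sll, sizeof(sll)) < 0)
 *			return -1;
 *		return fd;
 *	}
 */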
137 
138 /* Private packet socket structures. */
139 
140 struct packet_mclist
141 {
142 	struct packet_mclist	*next;
143 	int			ifindex;
144 	int			count;
145 	unsigned short		type;
146 	unsigned short		alen;
147 	unsigned char		addr[MAX_ADDR_LEN];
148 };
149 /* identical to struct packet_mreq except it has
150  * a longer address field.
151  */
152 struct packet_mreq_max
153 {
154 	int		mr_ifindex;
155 	unsigned short	mr_type;
156 	unsigned short	mr_alen;
157 	unsigned char	mr_address[MAX_ADDR_LEN];
158 };
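
/*
 * A user-space caller passes the shorter struct packet_mreq from
 * <linux/if_packet.h>; packet_setsockopt() below copies it into the
 * kernel-side packet_mreq_max.  Sketch (the interface name is an
 * assumption, fd is an AF_PACKET socket):
 *
 *	struct packet_mreq mreq;
 *
 *	memset(&mreq, 0, sizeof(mreq));
 *	mreq.mr_ifindex = if_nametoindex("eth0");
 *	mreq.mr_type    = PACKET_MR_PROMISC;
 *	setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP, &mreq, sizeof(mreq));
 *
 * The refcounted packet_mclist above undoes this when the socket is
 * closed, unlike a raw SIOCSIFFLAGS toggle of IFF_PROMISC.
 */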
159 
160 #ifdef CONFIG_PACKET_MMAP
161 static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
162 		int closing, int tx_ring);
163 
164 struct packet_ring_buffer {
165 	char *			*pg_vec;
166 	unsigned int		head;
167 	unsigned int		frames_per_block;
168 	unsigned int		frame_size;
169 	unsigned int		frame_max;
170 
171 	unsigned int		pg_vec_order;
172 	unsigned int		pg_vec_pages;
173 	unsigned int		pg_vec_len;
174 
175 	atomic_t		pending;
176 };
177 
178 struct packet_sock;
179 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
180 #endif
181 
182 static void packet_flush_mclist(struct sock *sk);
183 
184 struct packet_sock {
185 	/* struct sock has to be the first member of packet_sock */
186 	struct sock		sk;
187 	struct tpacket_stats	stats;
188 #ifdef CONFIG_PACKET_MMAP
189 	struct packet_ring_buffer	rx_ring;
190 	struct packet_ring_buffer	tx_ring;
191 	int			copy_thresh;
192 #endif
193 	struct packet_type	prot_hook;
194 	spinlock_t		bind_lock;
195 	struct mutex		pg_vec_lock;
196 	unsigned int		running:1,	/* prot_hook is attached*/
197 				auxdata:1,
198 				origdev:1;
199 	int			ifindex;	/* bound device		*/
200 	__be16			num;
201 	struct packet_mclist	*mclist;
202 #ifdef CONFIG_PACKET_MMAP
203 	atomic_t		mapped;
204 	enum tpacket_versions	tp_version;
205 	unsigned int		tp_hdrlen;
206 	unsigned int		tp_reserve;
207 	unsigned int		tp_loss:1;
208 #endif
209 };
210 
211 struct packet_skb_cb {
212 	unsigned int origlen;
213 	union {
214 		struct sockaddr_pkt pkt;
215 		struct sockaddr_ll ll;
216 	} sa;
217 };
218 
219 #define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))
220 
221 #ifdef CONFIG_PACKET_MMAP
222 
223 static void __packet_set_status(struct packet_sock *po, void *frame, int status)
224 {
225 	union {
226 		struct tpacket_hdr *h1;
227 		struct tpacket2_hdr *h2;
228 		void *raw;
229 	} h;
230 
231 	h.raw = frame;
232 	switch (po->tp_version) {
233 	case TPACKET_V1:
234 		h.h1->tp_status = status;
235 		flush_dcache_page(virt_to_page(&h.h1->tp_status));
236 		break;
237 	case TPACKET_V2:
238 		h.h2->tp_status = status;
239 		flush_dcache_page(virt_to_page(&h.h2->tp_status));
240 		break;
241 	default:
242 		printk(KERN_ERR "TPACKET version not supported\n");
243 		BUG();
244 	}
245 
246 	smp_wmb();
247 }
248 
249 static int __packet_get_status(struct packet_sock *po, void *frame)
250 {
251 	union {
252 		struct tpacket_hdr *h1;
253 		struct tpacket2_hdr *h2;
254 		void *raw;
255 	} h;
256 
257 	smp_rmb();
258 
259 	h.raw = frame;
260 	switch (po->tp_version) {
261 	case TPACKET_V1:
262 		flush_dcache_page(virt_to_page(&h.h1->tp_status));
263 		return h.h1->tp_status;
264 	case TPACKET_V2:
265 		flush_dcache_page(virt_to_page(&h.h2->tp_status));
266 		return h.h2->tp_status;
267 	default:
268 		printk(KERN_ERR "TPACKET version not supported\n");
269 		BUG();
270 		return 0;
271 	}
272 }
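
/*
 * The tp_status word written and read above is the only synchronization
 * with user space.  A minimal user-space rx loop (sketch, TPACKET_V1
 * assumed; "frame" points into the mmap'ed ring and pfd is an assumed
 * struct pollfd for the socket) mirrors these helpers:
 *
 *	struct tpacket_hdr *hdr = frame;
 *
 *	while (!(hdr->tp_status & TP_STATUS_USER))
 *		poll(&pfd, 1, -1);		// wait for tpacket_rcv() to fill it
 *	// payload: hdr->tp_snaplen bytes at (char *)frame + hdr->tp_mac
 *	hdr->tp_status = TP_STATUS_KERNEL;	// hand the frame back to the kernel
 */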
273 
274 static void *packet_lookup_frame(struct packet_sock *po,
275 		struct packet_ring_buffer *rb,
276 		unsigned int position,
277 		int status)
278 {
279 	unsigned int pg_vec_pos, frame_offset;
280 	union {
281 		struct tpacket_hdr *h1;
282 		struct tpacket2_hdr *h2;
283 		void *raw;
284 	} h;
285 
286 	pg_vec_pos = position / rb->frames_per_block;
287 	frame_offset = position % rb->frames_per_block;
288 
289 	h.raw = rb->pg_vec[pg_vec_pos] + (frame_offset * rb->frame_size);
290 
291 	if (status != __packet_get_status(po, h.raw))
292 		return NULL;
293 
294 	return h.raw;
295 }
296 
297 static inline void *packet_current_frame(struct packet_sock *po,
298 		struct packet_ring_buffer *rb,
299 		int status)
300 {
301 	return packet_lookup_frame(po, rb, rb->head, status);
302 }
303 
304 static inline void *packet_previous_frame(struct packet_sock *po,
305 		struct packet_ring_buffer *rb,
306 		int status)
307 {
308 	unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
309 	return packet_lookup_frame(po, rb, previous, status);
310 }
311 
312 static inline void packet_increment_head(struct packet_ring_buffer *buff)
313 {
314 	buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
315 }
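
/*
 * The position -> address mapping above is what user space recomputes
 * after a single mmap() of the ring (sketch; "ring" is the address
 * returned by mmap(), blocks are mapped in order): frame i lives at
 *
 *	ring + (i / frames_per_block) * tp_block_size
 *	     + (i % frames_per_block) * tp_frame_size
 */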
316 
317 #endif
318 
319 static inline struct packet_sock *pkt_sk(struct sock *sk)
320 {
321 	return (struct packet_sock *)sk;
322 }
323 
324 static void packet_sock_destruct(struct sock *sk)
325 {
326 	WARN_ON(atomic_read(&sk->sk_rmem_alloc));
327 	WARN_ON(atomic_read(&sk->sk_wmem_alloc));
328 
329 	if (!sock_flag(sk, SOCK_DEAD)) {
330 		printk("Attempt to release alive packet socket: %p\n", sk);
331 		return;
332 	}
333 
334 	sk_refcnt_debug_dec(sk);
335 }
336 
337 
338 static const struct proto_ops packet_ops;
339 
340 static const struct proto_ops packet_ops_spkt;
341 
342 static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,  struct packet_type *pt, struct net_device *orig_dev)
343 {
344 	struct sock *sk;
345 	struct sockaddr_pkt *spkt;
346 
347 	/*
348 	 *	When we registered the protocol we saved the socket in the data
349 	 *	field for just this event.
350 	 */
351 
352 	sk = pt->af_packet_priv;
353 
354 	/*
355 	 *	Yank back the headers [hope the device set this
356 	 *	right or kerboom...]
357 	 *
358 	 *	Incoming packets have the ll header pulled,
359 	 *	so push it back.
360 	 *
361 	 *	For outgoing ones skb->data == skb_mac_header(skb),
362 	 *	so this procedure is a no-op.
363 	 */
364 
365 	if (skb->pkt_type == PACKET_LOOPBACK)
366 		goto out;
367 
368 	if (dev_net(dev) != sock_net(sk))
369 		goto out;
370 
371 	if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
372 		goto oom;
373 
374 	/* drop any routing info */
375 	skb_dst_drop(skb);
376 
377 	/* drop conntrack reference */
378 	nf_reset(skb);
379 
380 	spkt = &PACKET_SKB_CB(skb)->sa.pkt;
381 
382 	skb_push(skb, skb->data - skb_mac_header(skb));
383 
384 	/*
385 	 *	The SOCK_PACKET socket receives _all_ frames.
386 	 */
387 
388 	spkt->spkt_family = dev->type;
389 	strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
390 	spkt->spkt_protocol = skb->protocol;
391 
392 	/*
393 	 *	Charge the memory to the socket. This is done specifically
394 	 *	to prevent sockets from using up all the memory.
395 	 */
396 
397 	if (sock_queue_rcv_skb(sk,skb) == 0)
398 		return 0;
399 
400 out:
401 	kfree_skb(skb);
402 oom:
403 	return 0;
404 }
405 
406 
407 /*
408  *	Output a raw packet to the device layer. This bypasses all the other
409  *	protocol layers and you must therefore supply it with a complete frame.
410  */
411 
412 static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
413 			       struct msghdr *msg, size_t len)
414 {
415 	struct sock *sk = sock->sk;
416 	struct sockaddr_pkt *saddr=(struct sockaddr_pkt *)msg->msg_name;
417 	struct sk_buff *skb;
418 	struct net_device *dev;
419 	__be16 proto=0;
420 	int err;
421 
422 	/*
423 	 *	Get and verify the address.
424 	 */
425 
426 	if (saddr)
427 	{
428 		if (msg->msg_namelen < sizeof(struct sockaddr))
429 			return(-EINVAL);
430 		if (msg->msg_namelen==sizeof(struct sockaddr_pkt))
431 			proto=saddr->spkt_protocol;
432 	}
433 	else
434 		return(-ENOTCONN);	/* SOCK_PACKET must be sent giving an address */
435 
436 	/*
437 	 *	Find the device first to size check it
438 	 */
439 
440 	saddr->spkt_device[13] = 0;
441 	dev = dev_get_by_name(sock_net(sk), saddr->spkt_device);
442 	err = -ENODEV;
443 	if (dev == NULL)
444 		goto out_unlock;
445 
446 	err = -ENETDOWN;
447 	if (!(dev->flags & IFF_UP))
448 		goto out_unlock;
449 
450 	/*
451 	 *	You may not queue a frame bigger than the mtu. This is the lowest level
452 	 *	raw protocol and you must do your own fragmentation at this level.
453 	 */
454 
455 	err = -EMSGSIZE;
456 	if (len > dev->mtu + dev->hard_header_len)
457 		goto out_unlock;
458 
459 	err = -ENOBUFS;
460 	skb = sock_wmalloc(sk, len + LL_RESERVED_SPACE(dev), 0, GFP_KERNEL);
461 
462 	/*
463 	 *	If the write buffer is full, then tough. At this level the user gets to
464 	 *	deal with the problem - do your own algorithmic backoffs. That's far
465 	 *	more flexible.
466 	 */
467 
468 	if (skb == NULL)
469 		goto out_unlock;
470 
471 	/*
472 	 *	Fill it in
473 	 */
474 
475 	/* FIXME: Save some space for broken drivers that write a
476 	 * hard header at transmission time by themselves. PPP is the
477 	 * notable one here. This should really be fixed at the driver level.
478 	 */
479 	skb_reserve(skb, LL_RESERVED_SPACE(dev));
480 	skb_reset_network_header(skb);
481 
482 	/* Try to align data part correctly */
483 	if (dev->header_ops) {
484 		skb->data -= dev->hard_header_len;
485 		skb->tail -= dev->hard_header_len;
486 		if (len < dev->hard_header_len)
487 			skb_reset_network_header(skb);
488 	}
489 
490 	/* Returns -EFAULT on error */
491 	err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
492 	skb->protocol = proto;
493 	skb->dev = dev;
494 	skb->priority = sk->sk_priority;
495 	if (err)
496 		goto out_free;
497 
498 	/*
499 	 *	Now send it
500 	 */
501 
502 	dev_queue_xmit(skb);
503 	dev_put(dev);
504 	return(len);
505 
506 out_free:
507 	kfree_skb(skb);
508 out_unlock:
509 	if (dev)
510 		dev_put(dev);
511 	return err;
512 }
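
/*
 * User-space sketch of the (obsolete) SOCK_PACKET transmit path above:
 * the destination is named by device rather than ifindex, and "frame"
 * must be a complete ll frame.  fd is assumed to be a SOCK_PACKET
 * socket; device name, protocol and frame length are assumptions.
 *
 *	struct sockaddr_pkt spkt;
 *
 *	memset(&spkt, 0, sizeof(spkt));
 *	spkt.spkt_family = AF_PACKET;
 *	strncpy((char *)spkt.spkt_device, "eth0", sizeof(spkt.spkt_device));
 *	spkt.spkt_protocol = htons(ETH_P_IP);	// becomes skb->protocol
 *	sendto(fd, frame, frame_len, 0,
 *	       (struct sockaddr *)&spkt, sizeof(spkt));
 */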
513 
514 static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
515 				      unsigned int res)
516 {
517 	struct sk_filter *filter;
518 
519 	rcu_read_lock_bh();
520 	filter = rcu_dereference(sk->sk_filter);
521 	if (filter != NULL)
522 		res = sk_run_filter(skb, filter->insns, filter->len);
523 	rcu_read_unlock_bh();
524 
525 	return res;
526 }
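
/*
 * The filter run here is a classic BPF program attached from user space
 * with SO_ATTACH_FILTER (struct sock_filter/sock_fprog come from
 * <linux/filter.h>).  Minimal sketch that accepts only ARP frames:
 *
 *	struct sock_filter code[] = {
 *		{ 0x28, 0, 0, 0x0000000c },	// ldh [12]         load ethertype
 *		{ 0x15, 0, 1, 0x00000806 },	// jeq #ETH_P_ARP   jt 2, jf 3
 *		{ 0x06, 0, 0, 0x0000ffff },	// ret #0xffff      accept
 *		{ 0x06, 0, 0, 0x00000000 },	// ret #0           drop
 *	};
 *	struct sock_fprog prog = { .len = 4, .filter = code };
 *
 *	setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog));
 *
 * The value returned by the filter caps "snaplen" below, so frames may be
 * truncated rather than dropped.
 */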
527 
528 /*
529    This function does lazy skb cloning in the hope that most packets
530    are discarded by BPF.
531 
532    Note the tricky part: we DO mangle a shared skb! skb->data, skb->len
533    and skb->cb are mangled. It works because (and as long as) packets
534    falling here are owned by the current CPU. Output packets are cloned
535    by dev_queue_xmit_nit(), input packets are processed by net_bh
536    sequentially, so if we return the skb to its original state on exit,
537    we will not harm anyone.
538  */
539 
540 static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
541 {
542 	struct sock *sk;
543 	struct sockaddr_ll *sll;
544 	struct packet_sock *po;
545 	u8 * skb_head = skb->data;
546 	int skb_len = skb->len;
547 	unsigned int snaplen, res;
548 
549 	if (skb->pkt_type == PACKET_LOOPBACK)
550 		goto drop;
551 
552 	sk = pt->af_packet_priv;
553 	po = pkt_sk(sk);
554 
555 	if (dev_net(dev) != sock_net(sk))
556 		goto drop;
557 
558 	skb->dev = dev;
559 
560 	if (dev->header_ops) {
561 		/* The device has an explicit notion of ll header,
562 		   exported to higher levels.
563 
564 		   Otherwise, the device hides the details of its frame
565 		   structure, so the corresponding packet head is
566 		   never delivered to the user.
567 		 */
568 		if (sk->sk_type != SOCK_DGRAM)
569 			skb_push(skb, skb->data - skb_mac_header(skb));
570 		else if (skb->pkt_type == PACKET_OUTGOING) {
571 			/* Special case: outgoing packets have ll header at head */
572 			skb_pull(skb, skb_network_offset(skb));
573 		}
574 	}
575 
576 	snaplen = skb->len;
577 
578 	res = run_filter(skb, sk, snaplen);
579 	if (!res)
580 		goto drop_n_restore;
581 	if (snaplen > res)
582 		snaplen = res;
583 
584 	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
585 	    (unsigned)sk->sk_rcvbuf)
586 		goto drop_n_acct;
587 
588 	if (skb_shared(skb)) {
589 		struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
590 		if (nskb == NULL)
591 			goto drop_n_acct;
592 
593 		if (skb_head != skb->data) {
594 			skb->data = skb_head;
595 			skb->len = skb_len;
596 		}
597 		kfree_skb(skb);
598 		skb = nskb;
599 	}
600 
601 	BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
602 		     sizeof(skb->cb));
603 
604 	sll = &PACKET_SKB_CB(skb)->sa.ll;
605 	sll->sll_family = AF_PACKET;
606 	sll->sll_hatype = dev->type;
607 	sll->sll_protocol = skb->protocol;
608 	sll->sll_pkttype = skb->pkt_type;
609 	if (unlikely(po->origdev))
610 		sll->sll_ifindex = orig_dev->ifindex;
611 	else
612 		sll->sll_ifindex = dev->ifindex;
613 
614 	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
615 
616 	PACKET_SKB_CB(skb)->origlen = skb->len;
617 
618 	if (pskb_trim(skb, snaplen))
619 		goto drop_n_acct;
620 
621 	skb_set_owner_r(skb, sk);
622 	skb->dev = NULL;
623 	skb_dst_drop(skb);
624 
625 	/* drop conntrack reference */
626 	nf_reset(skb);
627 
628 	spin_lock(&sk->sk_receive_queue.lock);
629 	po->stats.tp_packets++;
630 	__skb_queue_tail(&sk->sk_receive_queue, skb);
631 	spin_unlock(&sk->sk_receive_queue.lock);
632 	sk->sk_data_ready(sk, skb->len);
633 	return 0;
634 
635 drop_n_acct:
636 	spin_lock(&sk->sk_receive_queue.lock);
637 	po->stats.tp_drops++;
638 	spin_unlock(&sk->sk_receive_queue.lock);
639 
640 drop_n_restore:
641 	if (skb_head != skb->data && skb_shared(skb)) {
642 		skb->data = skb_head;
643 		skb->len = skb_len;
644 	}
645 drop:
646 	consume_skb(skb);
647 	return 0;
648 }
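
/*
 * What the sockaddr_ll filled in above looks like from user space once
 * the skb is dequeued by packet_recvmsg() (sketch; fd and the buffer
 * size are assumptions):
 *
 *	struct sockaddr_ll from;
 *	socklen_t fromlen = sizeof(from);
 *	unsigned char buf[2048];
 *
 *	ssize_t n = recvfrom(fd, buf, sizeof(buf), 0,
 *			     (struct sockaddr *)&from, &fromlen);
 *	// from.sll_ifindex: receiving device (or orig_dev with PACKET_ORIGDEV)
 *	// from.sll_pkttype: PACKET_HOST, PACKET_BROADCAST, PACKET_OUTGOING, ...
 *	// from.sll_halen/sll_addr: link-level source address from dev_parse_header()
 */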
649 
650 #ifdef CONFIG_PACKET_MMAP
651 static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
652 {
653 	struct sock *sk;
654 	struct packet_sock *po;
655 	struct sockaddr_ll *sll;
656 	union {
657 		struct tpacket_hdr *h1;
658 		struct tpacket2_hdr *h2;
659 		void *raw;
660 	} h;
661 	u8 * skb_head = skb->data;
662 	int skb_len = skb->len;
663 	unsigned int snaplen, res;
664 	unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
665 	unsigned short macoff, netoff, hdrlen;
666 	struct sk_buff *copy_skb = NULL;
667 	struct timeval tv;
668 	struct timespec ts;
669 
670 	if (skb->pkt_type == PACKET_LOOPBACK)
671 		goto drop;
672 
673 	sk = pt->af_packet_priv;
674 	po = pkt_sk(sk);
675 
676 	if (dev_net(dev) != sock_net(sk))
677 		goto drop;
678 
679 	if (dev->header_ops) {
680 		if (sk->sk_type != SOCK_DGRAM)
681 			skb_push(skb, skb->data - skb_mac_header(skb));
682 		else if (skb->pkt_type == PACKET_OUTGOING) {
683 			/* Special case: outgoing packets have ll header at head */
684 			skb_pull(skb, skb_network_offset(skb));
685 		}
686 	}
687 
688 	if (skb->ip_summed == CHECKSUM_PARTIAL)
689 		status |= TP_STATUS_CSUMNOTREADY;
690 
691 	snaplen = skb->len;
692 
693 	res = run_filter(skb, sk, snaplen);
694 	if (!res)
695 		goto drop_n_restore;
696 	if (snaplen > res)
697 		snaplen = res;
698 
699 	if (sk->sk_type == SOCK_DGRAM) {
700 		macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
701 				  po->tp_reserve;
702 	} else {
703 		unsigned maclen = skb_network_offset(skb);
704 		netoff = TPACKET_ALIGN(po->tp_hdrlen +
705 				       (maclen < 16 ? 16 : maclen)) +
706 			po->tp_reserve;
707 		macoff = netoff - maclen;
708 	}
709 
710 	if (macoff + snaplen > po->rx_ring.frame_size) {
711 		if (po->copy_thresh &&
712 		    atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
713 		    (unsigned)sk->sk_rcvbuf) {
714 			if (skb_shared(skb)) {
715 				copy_skb = skb_clone(skb, GFP_ATOMIC);
716 			} else {
717 				copy_skb = skb_get(skb);
718 				skb_head = skb->data;
719 			}
720 			if (copy_skb)
721 				skb_set_owner_r(copy_skb, sk);
722 		}
723 		snaplen = po->rx_ring.frame_size - macoff;
724 		if ((int)snaplen < 0)
725 			snaplen = 0;
726 	}
727 
728 	spin_lock(&sk->sk_receive_queue.lock);
729 	h.raw = packet_current_frame(po, &po->rx_ring, TP_STATUS_KERNEL);
730 	if (!h.raw)
731 		goto ring_is_full;
732 	packet_increment_head(&po->rx_ring);
733 	po->stats.tp_packets++;
734 	if (copy_skb) {
735 		status |= TP_STATUS_COPY;
736 		__skb_queue_tail(&sk->sk_receive_queue, copy_skb);
737 	}
738 	if (!po->stats.tp_drops)
739 		status &= ~TP_STATUS_LOSING;
740 	spin_unlock(&sk->sk_receive_queue.lock);
741 
742 	skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
743 
744 	switch (po->tp_version) {
745 	case TPACKET_V1:
746 		h.h1->tp_len = skb->len;
747 		h.h1->tp_snaplen = snaplen;
748 		h.h1->tp_mac = macoff;
749 		h.h1->tp_net = netoff;
750 		if (skb->tstamp.tv64)
751 			tv = ktime_to_timeval(skb->tstamp);
752 		else
753 			do_gettimeofday(&tv);
754 		h.h1->tp_sec = tv.tv_sec;
755 		h.h1->tp_usec = tv.tv_usec;
756 		hdrlen = sizeof(*h.h1);
757 		break;
758 	case TPACKET_V2:
759 		h.h2->tp_len = skb->len;
760 		h.h2->tp_snaplen = snaplen;
761 		h.h2->tp_mac = macoff;
762 		h.h2->tp_net = netoff;
763 		if (skb->tstamp.tv64)
764 			ts = ktime_to_timespec(skb->tstamp);
765 		else
766 			getnstimeofday(&ts);
767 		h.h2->tp_sec = ts.tv_sec;
768 		h.h2->tp_nsec = ts.tv_nsec;
769 		h.h2->tp_vlan_tci = skb->vlan_tci;
770 		hdrlen = sizeof(*h.h2);
771 		break;
772 	default:
773 		BUG();
774 	}
775 
776 	sll = h.raw + TPACKET_ALIGN(hdrlen);
777 	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
778 	sll->sll_family = AF_PACKET;
779 	sll->sll_hatype = dev->type;
780 	sll->sll_protocol = skb->protocol;
781 	sll->sll_pkttype = skb->pkt_type;
782 	if (unlikely(po->origdev))
783 		sll->sll_ifindex = orig_dev->ifindex;
784 	else
785 		sll->sll_ifindex = dev->ifindex;
786 
787 	__packet_set_status(po, h.raw, status);
788 	smp_mb();
789 	{
790 		struct page *p_start, *p_end;
791 		u8 *h_end = h.raw + macoff + snaplen - 1;
792 
793 		p_start = virt_to_page(h.raw);
794 		p_end = virt_to_page(h_end);
795 		while (p_start <= p_end) {
796 			flush_dcache_page(p_start);
797 			p_start++;
798 		}
799 	}
800 
801 	sk->sk_data_ready(sk, 0);
802 
803 drop_n_restore:
804 	if (skb_head != skb->data && skb_shared(skb)) {
805 		skb->data = skb_head;
806 		skb->len = skb_len;
807 	}
808 drop:
809 	kfree_skb(skb);
810 	return 0;
811 
812 ring_is_full:
813 	po->stats.tp_drops++;
814 	spin_unlock(&sk->sk_receive_queue.lock);
815 
816 	sk->sk_data_ready(sk, 0);
817 	kfree_skb(copy_skb);
818 	goto drop_n_restore;
819 }
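
/*
 * tpacket_rcv() only runs once user space has configured an rx ring.
 * Minimal setup sketch (sizes are assumptions; packet_set_ring() below
 * requires tp_block_size to be a multiple of the page size, tp_frame_size
 * to be TPACKET_ALIGNMENT-aligned, and tp_frame_nr to equal
 * frames_per_block * tp_block_nr):
 *
 *	struct tpacket_req req = {
 *		.tp_block_size	= 4096,
 *		.tp_block_nr	= 64,
 *		.tp_frame_size	= 2048,
 *		.tp_frame_nr	= 128,		// (4096 / 2048) * 64
 *	};
 *	void *ring;
 *
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *	ring = mmap(NULL, (size_t)req.tp_block_size * req.tp_block_nr,
 *		    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 */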
820 
821 static void tpacket_destruct_skb(struct sk_buff *skb)
822 {
823 	struct packet_sock *po = pkt_sk(skb->sk);
824 	void * ph;
825 
826 	BUG_ON(skb == NULL);
827 
828 	if (likely(po->tx_ring.pg_vec)) {
829 		ph = skb_shinfo(skb)->destructor_arg;
830 		BUG_ON(__packet_get_status(po, ph) != TP_STATUS_SENDING);
831 		BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
832 		atomic_dec(&po->tx_ring.pending);
833 		__packet_set_status(po, ph, TP_STATUS_AVAILABLE);
834 	}
835 
836 	sock_wfree(skb);
837 }
838 
839 static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff * skb,
840 		void * frame, struct net_device *dev, int size_max,
841 		__be16 proto, unsigned char * addr)
842 {
843 	union {
844 		struct tpacket_hdr *h1;
845 		struct tpacket2_hdr *h2;
846 		void *raw;
847 	} ph;
848 	int to_write, offset, len, tp_len, nr_frags, len_max;
849 	struct socket *sock = po->sk.sk_socket;
850 	struct page *page;
851 	void *data;
852 	int err;
853 
854 	ph.raw = frame;
855 
856 	skb->protocol = proto;
857 	skb->dev = dev;
858 	skb->priority = po->sk.sk_priority;
859 	skb_shinfo(skb)->destructor_arg = ph.raw;
860 
861 	switch (po->tp_version) {
862 	case TPACKET_V2:
863 		tp_len = ph.h2->tp_len;
864 		break;
865 	default:
866 		tp_len = ph.h1->tp_len;
867 		break;
868 	}
869 	if (unlikely(tp_len > size_max)) {
870 		printk(KERN_ERR "packet size is too long (%d > %d)\n",
871 				tp_len, size_max);
872 		return -EMSGSIZE;
873 	}
874 
875 	skb_reserve(skb, LL_RESERVED_SPACE(dev));
876 	skb_reset_network_header(skb);
877 
878 	data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
879 	to_write = tp_len;
880 
881 	if (sock->type == SOCK_DGRAM) {
882 		err = dev_hard_header(skb, dev, ntohs(proto), addr,
883 				NULL, tp_len);
884 		if (unlikely(err < 0))
885 			return -EINVAL;
886 	} else if (dev->hard_header_len ) {
887 		/* net device doesn't like empty head */
888 		if (unlikely(tp_len <= dev->hard_header_len)) {
889 			printk(KERN_ERR "packet size is too short "
890 					"(%d < %d)\n", tp_len,
891 					dev->hard_header_len);
892 			return -EINVAL;
893 		}
894 
895 		skb_push(skb, dev->hard_header_len);
896 		err = skb_store_bits(skb, 0, data,
897 				dev->hard_header_len);
898 		if (unlikely(err))
899 			return err;
900 
901 		data += dev->hard_header_len;
902 		to_write -= dev->hard_header_len;
903 	}
904 
905 	err = -EFAULT;
906 	page = virt_to_page(data);
907 	offset = offset_in_page(data);
908 	len_max = PAGE_SIZE - offset;
909 	len = ((to_write > len_max) ? len_max : to_write);
910 
911 	skb->data_len = to_write;
912 	skb->len += to_write;
913 	skb->truesize += to_write;
914 	atomic_add(to_write, &po->sk.sk_wmem_alloc);
915 
916 	while (likely(to_write)) {
917 		nr_frags = skb_shinfo(skb)->nr_frags;
918 
919 		if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
920 			printk(KERN_ERR "Packet exceed the number "
921 					"of skb frags(%lu)\n",
922 					MAX_SKB_FRAGS);
923 			return -EFAULT;
924 		}
925 
926 		flush_dcache_page(page);
927 		get_page(page);
928 		skb_fill_page_desc(skb,
929 				nr_frags,
930 				page++, offset, len);
931 		to_write -= len;
932 		offset = 0;
933 		len_max = PAGE_SIZE;
934 		len = ((to_write > len_max) ? len_max : to_write);
935 	}
936 
937 	return tp_len;
938 }
939 
940 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
941 {
942 	struct socket *sock;
943 	struct sk_buff *skb;
944 	struct net_device *dev;
945 	__be16 proto;
946 	int ifindex, err, reserve = 0;
947 	void * ph;
948 	struct sockaddr_ll *saddr=(struct sockaddr_ll *)msg->msg_name;
949 	int tp_len, size_max;
950 	unsigned char *addr;
951 	int len_sum = 0;
952 	int status = 0;
953 
954 	sock = po->sk.sk_socket;
955 
956 	mutex_lock(&po->pg_vec_lock);
957 
958 	err = -EBUSY;
959 	if (saddr == NULL) {
960 		ifindex	= po->ifindex;
961 		proto	= po->num;
962 		addr	= NULL;
963 	} else {
964 		err = -EINVAL;
965 		if (msg->msg_namelen < sizeof(struct sockaddr_ll))
966 			goto out;
967 		if (msg->msg_namelen < (saddr->sll_halen
968 					+ offsetof(struct sockaddr_ll,
969 						sll_addr)))
970 			goto out;
971 		ifindex	= saddr->sll_ifindex;
972 		proto	= saddr->sll_protocol;
973 		addr	= saddr->sll_addr;
974 	}
975 
976 	dev = dev_get_by_index(sock_net(&po->sk), ifindex);
977 	err = -ENXIO;
978 	if (unlikely(dev == NULL))
979 		goto out;
980 
981 	reserve = dev->hard_header_len;
982 
983 	err = -ENETDOWN;
984 	if (unlikely(!(dev->flags & IFF_UP)))
985 		goto out_put;
986 
987 	size_max = po->tx_ring.frame_size
988 		- sizeof(struct skb_shared_info)
989 		- po->tp_hdrlen
990 		- LL_ALLOCATED_SPACE(dev)
991 		- sizeof(struct sockaddr_ll);
992 
993 	if (size_max > dev->mtu + reserve)
994 		size_max = dev->mtu + reserve;
995 
996 	do {
997 		ph = packet_current_frame(po, &po->tx_ring,
998 				TP_STATUS_SEND_REQUEST);
999 
1000 		if (unlikely(ph == NULL)) {
1001 			schedule();
1002 			continue;
1003 		}
1004 
1005 		status = TP_STATUS_SEND_REQUEST;
1006 		skb = sock_alloc_send_skb(&po->sk,
1007 				LL_ALLOCATED_SPACE(dev)
1008 				+ sizeof(struct sockaddr_ll),
1009 				0, &err);
1010 
1011 		if (unlikely(skb == NULL))
1012 			goto out_status;
1013 
1014 		tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
1015 				addr);
1016 
1017 		if (unlikely(tp_len < 0)) {
1018 			if (po->tp_loss) {
1019 				__packet_set_status(po, ph,
1020 						TP_STATUS_AVAILABLE);
1021 				packet_increment_head(&po->tx_ring);
1022 				kfree_skb(skb);
1023 				continue;
1024 			} else {
1025 				status = TP_STATUS_WRONG_FORMAT;
1026 				err = tp_len;
1027 				goto out_status;
1028 			}
1029 		}
1030 
1031 		skb->destructor = tpacket_destruct_skb;
1032 		__packet_set_status(po, ph, TP_STATUS_SENDING);
1033 		atomic_inc(&po->tx_ring.pending);
1034 
1035 		status = TP_STATUS_SEND_REQUEST;
1036 		err = dev_queue_xmit(skb);
1037 		if (unlikely(err > 0 && (err = net_xmit_errno(err)) != 0))
1038 			goto out_xmit;
1039 		packet_increment_head(&po->tx_ring);
1040 		len_sum += tp_len;
1041 	}
1042 	while (likely((ph != NULL) || ((!(msg->msg_flags & MSG_DONTWAIT))
1043 					&& (atomic_read(&po->tx_ring.pending))))
1044 	      );
1045 
1046 	err = len_sum;
1047 	goto out_put;
1048 
1049 out_xmit:
1050 	skb->destructor = sock_wfree;
1051 	atomic_dec(&po->tx_ring.pending);
1052 out_status:
1053 	__packet_set_status(po, ph, status);
1054 	kfree_skb(skb);
1055 out_put:
1056 	dev_put(dev);
1057 out:
1058 	mutex_unlock(&po->pg_vec_lock);
1059 	return err;
1060 }
1061 #endif
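
/*
 * The matching user-space tx-ring loop (sketch, TPACKET_V1 assumed; fd,
 * pkt and pkt_len are assumptions, and the data offset below is derived
 * from tpacket_fill_skb() above):
 *
 *	struct tpacket_hdr *hdr = frame;	// frame points into the mmap'ed tx ring
 *	void *data = (char *)frame + TPACKET_ALIGN(sizeof(struct tpacket_hdr));
 *
 *	if (hdr->tp_status == TP_STATUS_AVAILABLE) {
 *		memcpy(data, pkt, pkt_len);	// complete ll frame for SOCK_RAW
 *		hdr->tp_len = pkt_len;
 *		hdr->tp_status = TP_STATUS_SEND_REQUEST;
 *		send(fd, NULL, 0, 0);		// kicks tpacket_snd()
 *	}
 */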
1062 
1063 static int packet_snd(struct socket *sock,
1064 			  struct msghdr *msg, size_t len)
1065 {
1066 	struct sock *sk = sock->sk;
1067 	struct sockaddr_ll *saddr=(struct sockaddr_ll *)msg->msg_name;
1068 	struct sk_buff *skb;
1069 	struct net_device *dev;
1070 	__be16 proto;
1071 	unsigned char *addr;
1072 	int ifindex, err, reserve = 0;
1073 
1074 	/*
1075 	 *	Get and verify the address.
1076 	 */
1077 
1078 	if (saddr == NULL) {
1079 		struct packet_sock *po = pkt_sk(sk);
1080 
1081 		ifindex	= po->ifindex;
1082 		proto	= po->num;
1083 		addr	= NULL;
1084 	} else {
1085 		err = -EINVAL;
1086 		if (msg->msg_namelen < sizeof(struct sockaddr_ll))
1087 			goto out;
1088 		if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
1089 			goto out;
1090 		ifindex	= saddr->sll_ifindex;
1091 		proto	= saddr->sll_protocol;
1092 		addr	= saddr->sll_addr;
1093 	}
1094 
1095 
1096 	dev = dev_get_by_index(sock_net(sk), ifindex);
1097 	err = -ENXIO;
1098 	if (dev == NULL)
1099 		goto out_unlock;
1100 	if (sock->type == SOCK_RAW)
1101 		reserve = dev->hard_header_len;
1102 
1103 	err = -ENETDOWN;
1104 	if (!(dev->flags & IFF_UP))
1105 		goto out_unlock;
1106 
1107 	err = -EMSGSIZE;
1108 	if (len > dev->mtu+reserve)
1109 		goto out_unlock;
1110 
1111 	skb = sock_alloc_send_skb(sk, len + LL_ALLOCATED_SPACE(dev),
1112 				msg->msg_flags & MSG_DONTWAIT, &err);
1113 	if (skb==NULL)
1114 		goto out_unlock;
1115 
1116 	skb_reserve(skb, LL_RESERVED_SPACE(dev));
1117 	skb_reset_network_header(skb);
1118 
1119 	err = -EINVAL;
1120 	if (sock->type == SOCK_DGRAM &&
1121 	    dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len) < 0)
1122 		goto out_free;
1123 
1124 	/* Returns -EFAULT on error */
1125 	err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
1126 	if (err)
1127 		goto out_free;
1128 
1129 	skb->protocol = proto;
1130 	skb->dev = dev;
1131 	skb->priority = sk->sk_priority;
1132 
1133 	/*
1134 	 *	Now send it
1135 	 */
1136 
1137 	err = dev_queue_xmit(skb);
1138 	if (err > 0 && (err = net_xmit_errno(err)) != 0)
1139 		goto out_unlock;
1140 
1141 	dev_put(dev);
1142 
1143 	return(len);
1144 
1145 out_free:
1146 	kfree_skb(skb);
1147 out_unlock:
1148 	if (dev)
1149 		dev_put(dev);
1150 out:
1151 	return err;
1152 }
1153 
1154 static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
1155 		struct msghdr *msg, size_t len)
1156 {
1157 #ifdef CONFIG_PACKET_MMAP
1158 	struct sock *sk = sock->sk;
1159 	struct packet_sock *po = pkt_sk(sk);
1160 	if (po->tx_ring.pg_vec)
1161 		return tpacket_snd(po, msg);
1162 	else
1163 #endif
1164 		return packet_snd(sock, msg, len);
1165 }
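
/*
 * For the non-ring path, a SOCK_DGRAM caller lets packet_snd() build the
 * ll header via dev_hard_header().  Sketch (destination MAC, ifindex,
 * protocol and payload are assumptions):
 *
 *	struct sockaddr_ll dst;
 *
 *	memset(&dst, 0, sizeof(dst));
 *	dst.sll_family   = AF_PACKET;
 *	dst.sll_ifindex  = ifindex;
 *	dst.sll_protocol = htons(ETH_P_IP);
 *	dst.sll_halen    = ETH_ALEN;
 *	memcpy(dst.sll_addr, dst_mac, ETH_ALEN);
 *	sendto(fd, payload, payload_len, 0,
 *	       (struct sockaddr *)&dst, sizeof(dst));
 */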
1166 
1167 /*
1168  *	Close a PACKET socket. This is fairly simple. We immediately go
1169  *	to 'closed' state and remove our protocol entry in the device list.
1170  */
1171 
1172 static int packet_release(struct socket *sock)
1173 {
1174 	struct sock *sk = sock->sk;
1175 	struct packet_sock *po;
1176 	struct net *net;
1177 #ifdef CONFIG_PACKET_MMAP
1178 	struct tpacket_req req;
1179 #endif
1180 
1181 	if (!sk)
1182 		return 0;
1183 
1184 	net = sock_net(sk);
1185 	po = pkt_sk(sk);
1186 
1187 	write_lock_bh(&net->packet.sklist_lock);
1188 	sk_del_node_init(sk);
1189 	sock_prot_inuse_add(net, sk->sk_prot, -1);
1190 	write_unlock_bh(&net->packet.sklist_lock);
1191 
1192 	/*
1193 	 *	Unhook packet receive handler.
1194 	 */
1195 
1196 	if (po->running) {
1197 		/*
1198 		 *	Remove the protocol hook
1199 		 */
1200 		dev_remove_pack(&po->prot_hook);
1201 		po->running = 0;
1202 		po->num = 0;
1203 		__sock_put(sk);
1204 	}
1205 
1206 	packet_flush_mclist(sk);
1207 
1208 #ifdef CONFIG_PACKET_MMAP
1209 	memset(&req, 0, sizeof(req));
1210 
1211 	if (po->rx_ring.pg_vec)
1212 		packet_set_ring(sk, &req, 1, 0);
1213 
1214 	if (po->tx_ring.pg_vec)
1215 		packet_set_ring(sk, &req, 1, 1);
1216 #endif
1217 
1218 	/*
1219 	 *	Now the socket is dead. No more input will appear.
1220 	 */
1221 
1222 	sock_orphan(sk);
1223 	sock->sk = NULL;
1224 
1225 	/* Purge queues */
1226 
1227 	skb_queue_purge(&sk->sk_receive_queue);
1228 	sk_refcnt_debug_release(sk);
1229 
1230 	sock_put(sk);
1231 	return 0;
1232 }
1233 
1234 /*
1235  *	Attach a packet hook.
1236  */
1237 
1238 static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
1239 {
1240 	struct packet_sock *po = pkt_sk(sk);
1241 	/*
1242 	 *	Detach an existing hook if present.
1243 	 */
1244 
1245 	lock_sock(sk);
1246 
1247 	spin_lock(&po->bind_lock);
1248 	if (po->running) {
1249 		__sock_put(sk);
1250 		po->running = 0;
1251 		po->num = 0;
1252 		spin_unlock(&po->bind_lock);
1253 		dev_remove_pack(&po->prot_hook);
1254 		spin_lock(&po->bind_lock);
1255 	}
1256 
1257 	po->num = protocol;
1258 	po->prot_hook.type = protocol;
1259 	po->prot_hook.dev = dev;
1260 
1261 	po->ifindex = dev ? dev->ifindex : 0;
1262 
1263 	if (protocol == 0)
1264 		goto out_unlock;
1265 
1266 	if (!dev || (dev->flags & IFF_UP)) {
1267 		dev_add_pack(&po->prot_hook);
1268 		sock_hold(sk);
1269 		po->running = 1;
1270 	} else {
1271 		sk->sk_err = ENETDOWN;
1272 		if (!sock_flag(sk, SOCK_DEAD))
1273 			sk->sk_error_report(sk);
1274 	}
1275 
1276 out_unlock:
1277 	spin_unlock(&po->bind_lock);
1278 	release_sock(sk);
1279 	return 0;
1280 }
1281 
1282 /*
1283  *	Bind a packet socket to a device
1284  */
1285 
1286 static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1287 {
1288 	struct sock *sk=sock->sk;
1289 	char name[15];
1290 	struct net_device *dev;
1291 	int err = -ENODEV;
1292 
1293 	/*
1294 	 *	Check legality
1295 	 */
1296 
1297 	if (addr_len != sizeof(struct sockaddr))
1298 		return -EINVAL;
1299 	strlcpy(name,uaddr->sa_data,sizeof(name));
1300 
1301 	dev = dev_get_by_name(sock_net(sk), name);
1302 	if (dev) {
1303 		err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
1304 		dev_put(dev);
1305 	}
1306 	return err;
1307 }
1308 
1309 static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1310 {
1311 	struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr;
1312 	struct sock *sk=sock->sk;
1313 	struct net_device *dev = NULL;
1314 	int err;
1315 
1316 
1317 	/*
1318 	 *	Check legality
1319 	 */
1320 
1321 	if (addr_len < sizeof(struct sockaddr_ll))
1322 		return -EINVAL;
1323 	if (sll->sll_family != AF_PACKET)
1324 		return -EINVAL;
1325 
1326 	if (sll->sll_ifindex) {
1327 		err = -ENODEV;
1328 		dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
1329 		if (dev == NULL)
1330 			goto out;
1331 	}
1332 	err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
1333 	if (dev)
1334 		dev_put(dev);
1335 
1336 out:
1337 	return err;
1338 }
1339 
1340 static struct proto packet_proto = {
1341 	.name	  = "PACKET",
1342 	.owner	  = THIS_MODULE,
1343 	.obj_size = sizeof(struct packet_sock),
1344 };
1345 
1346 /*
1347  *	Create a packet of type SOCK_PACKET.
1348  */
1349 
1350 static int packet_create(struct net *net, struct socket *sock, int protocol)
1351 {
1352 	struct sock *sk;
1353 	struct packet_sock *po;
1354 	__be16 proto = (__force __be16)protocol; /* weird, but documented */
1355 	int err;
1356 
1357 	if (!capable(CAP_NET_RAW))
1358 		return -EPERM;
1359 	if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
1360 	    sock->type != SOCK_PACKET)
1361 		return -ESOCKTNOSUPPORT;
1362 
1363 	sock->state = SS_UNCONNECTED;
1364 
1365 	err = -ENOBUFS;
1366 	sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
1367 	if (sk == NULL)
1368 		goto out;
1369 
1370 	sock->ops = &packet_ops;
1371 	if (sock->type == SOCK_PACKET)
1372 		sock->ops = &packet_ops_spkt;
1373 
1374 	sock_init_data(sock, sk);
1375 
1376 	po = pkt_sk(sk);
1377 	sk->sk_family = PF_PACKET;
1378 	po->num = proto;
1379 
1380 	sk->sk_destruct = packet_sock_destruct;
1381 	sk_refcnt_debug_inc(sk);
1382 
1383 	/*
1384 	 *	Attach a protocol block
1385 	 */
1386 
1387 	spin_lock_init(&po->bind_lock);
1388 	mutex_init(&po->pg_vec_lock);
1389 	po->prot_hook.func = packet_rcv;
1390 
1391 	if (sock->type == SOCK_PACKET)
1392 		po->prot_hook.func = packet_rcv_spkt;
1393 
1394 	po->prot_hook.af_packet_priv = sk;
1395 
1396 	if (proto) {
1397 		po->prot_hook.type = proto;
1398 		dev_add_pack(&po->prot_hook);
1399 		sock_hold(sk);
1400 		po->running = 1;
1401 	}
1402 
1403 	write_lock_bh(&net->packet.sklist_lock);
1404 	sk_add_node(sk, &net->packet.sklist);
1405 	sock_prot_inuse_add(net, &packet_proto, 1);
1406 	write_unlock_bh(&net->packet.sklist_lock);
1407 	return(0);
1408 out:
1409 	return err;
1410 }
1411 
1412 /*
1413  *	Pull a packet from our receive queue and hand it to the user.
1414  *	If necessary we block.
1415  */
1416 
1417 static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
1418 			  struct msghdr *msg, size_t len, int flags)
1419 {
1420 	struct sock *sk = sock->sk;
1421 	struct sk_buff *skb;
1422 	int copied, err;
1423 	struct sockaddr_ll *sll;
1424 
1425 	err = -EINVAL;
1426 	if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT))
1427 		goto out;
1428 
1429 #if 0
1430 	/* What error should we return now? EUNATTACH? */
1431 	if (pkt_sk(sk)->ifindex < 0)
1432 		return -ENODEV;
1433 #endif
1434 
1435 	/*
1436 	 *	Call the generic datagram receiver. This handles all sorts
1437 	 *	of horrible races and re-entrancy so we can forget about it
1438 	 *	in the protocol layers.
1439 	 *
1440 	 *	Now it will return ENETDOWN if the device has just gone down,
1441 	 *	but then it will block.
1442 	 */
1443 
1444 	skb=skb_recv_datagram(sk,flags,flags&MSG_DONTWAIT,&err);
1445 
1446 	/*
1447 	 *	If an error occurred, return it. Because skb_recv_datagram()
1448 	 *	handles the blocking, we don't need to see or worry about
1449 	 *	blocking retries.
1450 	 */
1451 
1452 	if (skb == NULL)
1453 		goto out;
1454 
1455 	/*
1456 	 *	If the address length field is there to be filled in, we fill
1457 	 *	it in now.
1458 	 */
1459 
1460 	sll = &PACKET_SKB_CB(skb)->sa.ll;
1461 	if (sock->type == SOCK_PACKET)
1462 		msg->msg_namelen = sizeof(struct sockaddr_pkt);
1463 	else
1464 		msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);
1465 
1466 	/*
1467 	 *	You lose any data beyond the buffer you gave. If this worries a
1468 	 *	user program, it can ask the device for its MTU anyway.
1469 	 */
1470 
1471 	copied = skb->len;
1472 	if (copied > len)
1473 	{
1474 		copied=len;
1475 		msg->msg_flags|=MSG_TRUNC;
1476 	}
1477 
1478 	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
1479 	if (err)
1480 		goto out_free;
1481 
1482 	sock_recv_timestamp(msg, sk, skb);
1483 
1484 	if (msg->msg_name)
1485 		memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
1486 		       msg->msg_namelen);
1487 
1488 	if (pkt_sk(sk)->auxdata) {
1489 		struct tpacket_auxdata aux;
1490 
1491 		aux.tp_status = TP_STATUS_USER;
1492 		if (skb->ip_summed == CHECKSUM_PARTIAL)
1493 			aux.tp_status |= TP_STATUS_CSUMNOTREADY;
1494 		aux.tp_len = PACKET_SKB_CB(skb)->origlen;
1495 		aux.tp_snaplen = skb->len;
1496 		aux.tp_mac = 0;
1497 		aux.tp_net = skb_network_offset(skb);
1498 		aux.tp_vlan_tci = skb->vlan_tci;
1499 
1500 		put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
1501 	}
1502 
1503 	/*
1504 	 *	Free or return the buffer as appropriate. Again this
1505 	 *	hides all the races and re-entrancy issues from us.
1506 	 */
1507 	err = (flags&MSG_TRUNC) ? skb->len : copied;
1508 
1509 out_free:
1510 	skb_free_datagram(sk, skb);
1511 out:
1512 	return err;
1513 }
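
/*
 * The PACKET_AUXDATA block above surfaces as ancillary data.  User-space
 * sketch for reading it (buf and fd are assumptions, and
 * setsockopt(fd, SOL_PACKET, PACKET_AUXDATA, ...) must have been enabled
 * first):
 *
 *	struct iovec iov = { buf, sizeof(buf) };
 *	char cbuf[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
 *	struct msghdr msg;
 *	struct cmsghdr *cmsg;
 *
 *	memset(&msg, 0, sizeof(msg));
 *	msg.msg_iov = &iov;
 *	msg.msg_iovlen = 1;
 *	msg.msg_control = cbuf;
 *	msg.msg_controllen = sizeof(cbuf);
 *	recvmsg(fd, &msg, 0);
 *	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg))
 *		if (cmsg->cmsg_level == SOL_PACKET &&
 *		    cmsg->cmsg_type == PACKET_AUXDATA) {
 *			struct tpacket_auxdata *aux = (void *)CMSG_DATA(cmsg);
 *			// aux->tp_len is the original length, aux->tp_snaplen
 *			// what was captured, aux->tp_vlan_tci the VLAN tag
 *		}
 */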
1514 
1515 static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
1516 			       int *uaddr_len, int peer)
1517 {
1518 	struct net_device *dev;
1519 	struct sock *sk	= sock->sk;
1520 
1521 	if (peer)
1522 		return -EOPNOTSUPP;
1523 
1524 	uaddr->sa_family = AF_PACKET;
1525 	dev = dev_get_by_index(sock_net(sk), pkt_sk(sk)->ifindex);
1526 	if (dev) {
1527 		strlcpy(uaddr->sa_data, dev->name, 15);
1528 		dev_put(dev);
1529 	} else
1530 		memset(uaddr->sa_data, 0, 14);
1531 	*uaddr_len = sizeof(*uaddr);
1532 
1533 	return 0;
1534 }
1535 
1536 static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
1537 			  int *uaddr_len, int peer)
1538 {
1539 	struct net_device *dev;
1540 	struct sock *sk = sock->sk;
1541 	struct packet_sock *po = pkt_sk(sk);
1542 	struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr;
1543 
1544 	if (peer)
1545 		return -EOPNOTSUPP;
1546 
1547 	sll->sll_family = AF_PACKET;
1548 	sll->sll_ifindex = po->ifindex;
1549 	sll->sll_protocol = po->num;
1550 	dev = dev_get_by_index(sock_net(sk), po->ifindex);
1551 	if (dev) {
1552 		sll->sll_hatype = dev->type;
1553 		sll->sll_halen = dev->addr_len;
1554 		memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1555 		dev_put(dev);
1556 	} else {
1557 		sll->sll_hatype = 0;	/* Bad: we have no ARPHRD_UNSPEC */
1558 		sll->sll_halen = 0;
1559 	}
1560 	*uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1561 
1562 	return 0;
1563 }
1564 
1565 static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
1566 			 int what)
1567 {
1568 	switch (i->type) {
1569 	case PACKET_MR_MULTICAST:
1570 		if (what > 0)
1571 			return dev_mc_add(dev, i->addr, i->alen, 0);
1572 		else
1573 			return dev_mc_delete(dev, i->addr, i->alen, 0);
1574 		break;
1575 	case PACKET_MR_PROMISC:
1576 		return dev_set_promiscuity(dev, what);
1577 		break;
1578 	case PACKET_MR_ALLMULTI:
1579 		return dev_set_allmulti(dev, what);
1580 		break;
1581 	case PACKET_MR_UNICAST:
1582 		if (what > 0)
1583 			return dev_unicast_add(dev, i->addr);
1584 		else
1585 			return dev_unicast_delete(dev, i->addr);
1586 		break;
1587 	default:;
1588 	}
1589 	return 0;
1590 }
1591 
1592 static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
1593 {
1594 	for ( ; i; i=i->next) {
1595 		if (i->ifindex == dev->ifindex)
1596 			packet_dev_mc(dev, i, what);
1597 	}
1598 }
1599 
1600 static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1601 {
1602 	struct packet_sock *po = pkt_sk(sk);
1603 	struct packet_mclist *ml, *i;
1604 	struct net_device *dev;
1605 	int err;
1606 
1607 	rtnl_lock();
1608 
1609 	err = -ENODEV;
1610 	dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1611 	if (!dev)
1612 		goto done;
1613 
1614 	err = -EINVAL;
1615 	if (mreq->mr_alen > dev->addr_len)
1616 		goto done;
1617 
1618 	err = -ENOBUFS;
1619 	i = kmalloc(sizeof(*i), GFP_KERNEL);
1620 	if (i == NULL)
1621 		goto done;
1622 
1623 	err = 0;
1624 	for (ml = po->mclist; ml; ml = ml->next) {
1625 		if (ml->ifindex == mreq->mr_ifindex &&
1626 		    ml->type == mreq->mr_type &&
1627 		    ml->alen == mreq->mr_alen &&
1628 		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1629 			ml->count++;
1630 			/* Free the new element ... */
1631 			kfree(i);
1632 			goto done;
1633 		}
1634 	}
1635 
1636 	i->type = mreq->mr_type;
1637 	i->ifindex = mreq->mr_ifindex;
1638 	i->alen = mreq->mr_alen;
1639 	memcpy(i->addr, mreq->mr_address, i->alen);
1640 	i->count = 1;
1641 	i->next = po->mclist;
1642 	po->mclist = i;
1643 	err = packet_dev_mc(dev, i, 1);
1644 	if (err) {
1645 		po->mclist = i->next;
1646 		kfree(i);
1647 	}
1648 
1649 done:
1650 	rtnl_unlock();
1651 	return err;
1652 }
1653 
1654 static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1655 {
1656 	struct packet_mclist *ml, **mlp;
1657 
1658 	rtnl_lock();
1659 
1660 	for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
1661 		if (ml->ifindex == mreq->mr_ifindex &&
1662 		    ml->type == mreq->mr_type &&
1663 		    ml->alen == mreq->mr_alen &&
1664 		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1665 			if (--ml->count == 0) {
1666 				struct net_device *dev;
1667 				*mlp = ml->next;
1668 				dev = dev_get_by_index(sock_net(sk), ml->ifindex);
1669 				if (dev) {
1670 					packet_dev_mc(dev, ml, -1);
1671 					dev_put(dev);
1672 				}
1673 				kfree(ml);
1674 			}
1675 			rtnl_unlock();
1676 			return 0;
1677 		}
1678 	}
1679 	rtnl_unlock();
1680 	return -EADDRNOTAVAIL;
1681 }
1682 
1683 static void packet_flush_mclist(struct sock *sk)
1684 {
1685 	struct packet_sock *po = pkt_sk(sk);
1686 	struct packet_mclist *ml;
1687 
1688 	if (!po->mclist)
1689 		return;
1690 
1691 	rtnl_lock();
1692 	while ((ml = po->mclist) != NULL) {
1693 		struct net_device *dev;
1694 
1695 		po->mclist = ml->next;
1696 		if ((dev = dev_get_by_index(sock_net(sk), ml->ifindex)) != NULL) {
1697 			packet_dev_mc(dev, ml, -1);
1698 			dev_put(dev);
1699 		}
1700 		kfree(ml);
1701 	}
1702 	rtnl_unlock();
1703 }
1704 
1705 static int
1706 packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen)
1707 {
1708 	struct sock *sk = sock->sk;
1709 	struct packet_sock *po = pkt_sk(sk);
1710 	int ret;
1711 
1712 	if (level != SOL_PACKET)
1713 		return -ENOPROTOOPT;
1714 
1715 	switch (optname) {
1716 	case PACKET_ADD_MEMBERSHIP:
1717 	case PACKET_DROP_MEMBERSHIP:
1718 	{
1719 		struct packet_mreq_max mreq;
1720 		int len = optlen;
1721 		memset(&mreq, 0, sizeof(mreq));
1722 		if (len < sizeof(struct packet_mreq))
1723 			return -EINVAL;
1724 		if (len > sizeof(mreq))
1725 			len = sizeof(mreq);
1726 		if (copy_from_user(&mreq,optval,len))
1727 			return -EFAULT;
1728 		if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
1729 			return -EINVAL;
1730 		if (optname == PACKET_ADD_MEMBERSHIP)
1731 			ret = packet_mc_add(sk, &mreq);
1732 		else
1733 			ret = packet_mc_drop(sk, &mreq);
1734 		return ret;
1735 	}
1736 
1737 #ifdef CONFIG_PACKET_MMAP
1738 	case PACKET_RX_RING:
1739 	case PACKET_TX_RING:
1740 	{
1741 		struct tpacket_req req;
1742 
1743 		if (optlen<sizeof(req))
1744 			return -EINVAL;
1745 		if (copy_from_user(&req,optval,sizeof(req)))
1746 			return -EFAULT;
1747 		return packet_set_ring(sk, &req, 0, optname == PACKET_TX_RING);
1748 	}
1749 	case PACKET_COPY_THRESH:
1750 	{
1751 		int val;
1752 
1753 		if (optlen!=sizeof(val))
1754 			return -EINVAL;
1755 		if (copy_from_user(&val,optval,sizeof(val)))
1756 			return -EFAULT;
1757 
1758 		pkt_sk(sk)->copy_thresh = val;
1759 		return 0;
1760 	}
1761 	case PACKET_VERSION:
1762 	{
1763 		int val;
1764 
1765 		if (optlen != sizeof(val))
1766 			return -EINVAL;
1767 		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1768 			return -EBUSY;
1769 		if (copy_from_user(&val, optval, sizeof(val)))
1770 			return -EFAULT;
1771 		switch (val) {
1772 		case TPACKET_V1:
1773 		case TPACKET_V2:
1774 			po->tp_version = val;
1775 			return 0;
1776 		default:
1777 			return -EINVAL;
1778 		}
1779 	}
1780 	case PACKET_RESERVE:
1781 	{
1782 		unsigned int val;
1783 
1784 		if (optlen != sizeof(val))
1785 			return -EINVAL;
1786 		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1787 			return -EBUSY;
1788 		if (copy_from_user(&val, optval, sizeof(val)))
1789 			return -EFAULT;
1790 		po->tp_reserve = val;
1791 		return 0;
1792 	}
1793 	case PACKET_LOSS:
1794 	{
1795 		unsigned int val;
1796 
1797 		if (optlen != sizeof(val))
1798 			return -EINVAL;
1799 		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1800 			return -EBUSY;
1801 		if (copy_from_user(&val, optval, sizeof(val)))
1802 			return -EFAULT;
1803 		po->tp_loss = !!val;
1804 		return 0;
1805 	}
1806 #endif
1807 	case PACKET_AUXDATA:
1808 	{
1809 		int val;
1810 
1811 		if (optlen < sizeof(val))
1812 			return -EINVAL;
1813 		if (copy_from_user(&val, optval, sizeof(val)))
1814 			return -EFAULT;
1815 
1816 		po->auxdata = !!val;
1817 		return 0;
1818 	}
1819 	case PACKET_ORIGDEV:
1820 	{
1821 		int val;
1822 
1823 		if (optlen < sizeof(val))
1824 			return -EINVAL;
1825 		if (copy_from_user(&val, optval, sizeof(val)))
1826 			return -EFAULT;
1827 
1828 		po->origdev = !!val;
1829 		return 0;
1830 	}
1831 	default:
1832 		return -ENOPROTOOPT;
1833 	}
1834 }
1835 
1836 static int packet_getsockopt(struct socket *sock, int level, int optname,
1837 			     char __user *optval, int __user *optlen)
1838 {
1839 	int len;
1840 	int val;
1841 	struct sock *sk = sock->sk;
1842 	struct packet_sock *po = pkt_sk(sk);
1843 	void *data;
1844 	struct tpacket_stats st;
1845 
1846 	if (level != SOL_PACKET)
1847 		return -ENOPROTOOPT;
1848 
1849 	if (get_user(len, optlen))
1850 		return -EFAULT;
1851 
1852 	if (len < 0)
1853 		return -EINVAL;
1854 
1855 	switch (optname) {
1856 	case PACKET_STATISTICS:
1857 		if (len > sizeof(struct tpacket_stats))
1858 			len = sizeof(struct tpacket_stats);
1859 		spin_lock_bh(&sk->sk_receive_queue.lock);
1860 		st = po->stats;
1861 		memset(&po->stats, 0, sizeof(st));
1862 		spin_unlock_bh(&sk->sk_receive_queue.lock);
1863 		st.tp_packets += st.tp_drops;
1864 
1865 		data = &st;
1866 		break;
1867 	case PACKET_AUXDATA:
1868 		if (len > sizeof(int))
1869 			len = sizeof(int);
1870 		val = po->auxdata;
1871 
1872 		data = &val;
1873 		break;
1874 	case PACKET_ORIGDEV:
1875 		if (len > sizeof(int))
1876 			len = sizeof(int);
1877 		val = po->origdev;
1878 
1879 		data = &val;
1880 		break;
1881 #ifdef CONFIG_PACKET_MMAP
1882 	case PACKET_VERSION:
1883 		if (len > sizeof(int))
1884 			len = sizeof(int);
1885 		val = po->tp_version;
1886 		data = &val;
1887 		break;
1888 	case PACKET_HDRLEN:
1889 		if (len > sizeof(int))
1890 			len = sizeof(int);
1891 		if (copy_from_user(&val, optval, len))
1892 			return -EFAULT;
1893 		switch (val) {
1894 		case TPACKET_V1:
1895 			val = sizeof(struct tpacket_hdr);
1896 			break;
1897 		case TPACKET_V2:
1898 			val = sizeof(struct tpacket2_hdr);
1899 			break;
1900 		default:
1901 			return -EINVAL;
1902 		}
1903 		data = &val;
1904 		break;
1905 	case PACKET_RESERVE:
1906 		if (len > sizeof(unsigned int))
1907 			len = sizeof(unsigned int);
1908 		val = po->tp_reserve;
1909 		data = &val;
1910 		break;
1911 	case PACKET_LOSS:
1912 		if (len > sizeof(unsigned int))
1913 			len = sizeof(unsigned int);
1914 		val = po->tp_loss;
1915 		data = &val;
1916 		break;
1917 #endif
1918 	default:
1919 		return -ENOPROTOOPT;
1920 	}
1921 
1922 	if (put_user(len, optlen))
1923 		return -EFAULT;
1924 	if (copy_to_user(optval, data, len))
1925 		return -EFAULT;
1926 	return 0;
1927 }
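
/*
 * Reading the counters maintained by packet_rcv()/tpacket_rcv() (sketch;
 * fd is an assumption).  Note the code above zeroes the counters on every
 * read and returns tp_packets with tp_drops already added in:
 *
 *	struct tpacket_stats st;
 *	socklen_t len = sizeof(st);
 *
 *	getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &st, &len);
 *	// st.tp_packets: frames seen since the last read
 *	// st.tp_drops:   frames dropped (ring full or rcvbuf exceeded)
 */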
1928 
1929 
1930 static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
1931 {
1932 	struct sock *sk;
1933 	struct hlist_node *node;
1934 	struct net_device *dev = data;
1935 	struct net *net = dev_net(dev);
1936 
1937 	read_lock(&net->packet.sklist_lock);
1938 	sk_for_each(sk, node, &net->packet.sklist) {
1939 		struct packet_sock *po = pkt_sk(sk);
1940 
1941 		switch (msg) {
1942 		case NETDEV_UNREGISTER:
1943 			if (po->mclist)
1944 				packet_dev_mclist(dev, po->mclist, -1);
1945 			/* fallthrough */
1946 
1947 		case NETDEV_DOWN:
1948 			if (dev->ifindex == po->ifindex) {
1949 				spin_lock(&po->bind_lock);
1950 				if (po->running) {
1951 					__dev_remove_pack(&po->prot_hook);
1952 					__sock_put(sk);
1953 					po->running = 0;
1954 					sk->sk_err = ENETDOWN;
1955 					if (!sock_flag(sk, SOCK_DEAD))
1956 						sk->sk_error_report(sk);
1957 				}
1958 				if (msg == NETDEV_UNREGISTER) {
1959 					po->ifindex = -1;
1960 					po->prot_hook.dev = NULL;
1961 				}
1962 				spin_unlock(&po->bind_lock);
1963 			}
1964 			break;
1965 		case NETDEV_UP:
1966 			spin_lock(&po->bind_lock);
1967 			if (dev->ifindex == po->ifindex && po->num &&
1968 			    !po->running) {
1969 				dev_add_pack(&po->prot_hook);
1970 				sock_hold(sk);
1971 				po->running = 1;
1972 			}
1973 			spin_unlock(&po->bind_lock);
1974 			break;
1975 		}
1976 	}
1977 	read_unlock(&net->packet.sklist_lock);
1978 	return NOTIFY_DONE;
1979 }
1980 
1981 
1982 static int packet_ioctl(struct socket *sock, unsigned int cmd,
1983 			unsigned long arg)
1984 {
1985 	struct sock *sk = sock->sk;
1986 
1987 	switch (cmd) {
1988 		case SIOCOUTQ:
1989 		{
1990 			int amount = atomic_read(&sk->sk_wmem_alloc);
1991 			return put_user(amount, (int __user *)arg);
1992 		}
1993 		case SIOCINQ:
1994 		{
1995 			struct sk_buff *skb;
1996 			int amount = 0;
1997 
1998 			spin_lock_bh(&sk->sk_receive_queue.lock);
1999 			skb = skb_peek(&sk->sk_receive_queue);
2000 			if (skb)
2001 				amount = skb->len;
2002 			spin_unlock_bh(&sk->sk_receive_queue.lock);
2003 			return put_user(amount, (int __user *)arg);
2004 		}
2005 		case SIOCGSTAMP:
2006 			return sock_get_timestamp(sk, (struct timeval __user *)arg);
2007 		case SIOCGSTAMPNS:
2008 			return sock_get_timestampns(sk, (struct timespec __user *)arg);
2009 
2010 #ifdef CONFIG_INET
2011 		case SIOCADDRT:
2012 		case SIOCDELRT:
2013 		case SIOCDARP:
2014 		case SIOCGARP:
2015 		case SIOCSARP:
2016 		case SIOCGIFADDR:
2017 		case SIOCSIFADDR:
2018 		case SIOCGIFBRDADDR:
2019 		case SIOCSIFBRDADDR:
2020 		case SIOCGIFNETMASK:
2021 		case SIOCSIFNETMASK:
2022 		case SIOCGIFDSTADDR:
2023 		case SIOCSIFDSTADDR:
2024 		case SIOCSIFFLAGS:
2025 			if (!net_eq(sock_net(sk), &init_net))
2026 				return -ENOIOCTLCMD;
2027 			return inet_dgram_ops.ioctl(sock, cmd, arg);
2028 #endif
2029 
2030 		default:
2031 			return -ENOIOCTLCMD;
2032 	}
2033 	return 0;
2034 }
2035 
2036 #ifndef CONFIG_PACKET_MMAP
2037 #define packet_mmap sock_no_mmap
2038 #define packet_poll datagram_poll
2039 #else
2040 
2041 static unsigned int packet_poll(struct file * file, struct socket *sock,
2042 				poll_table *wait)
2043 {
2044 	struct sock *sk = sock->sk;
2045 	struct packet_sock *po = pkt_sk(sk);
2046 	unsigned int mask = datagram_poll(file, sock, wait);
2047 
2048 	spin_lock_bh(&sk->sk_receive_queue.lock);
2049 	if (po->rx_ring.pg_vec) {
2050 		if (!packet_previous_frame(po, &po->rx_ring, TP_STATUS_KERNEL))
2051 			mask |= POLLIN | POLLRDNORM;
2052 	}
2053 	spin_unlock_bh(&sk->sk_receive_queue.lock);
2054 	spin_lock_bh(&sk->sk_write_queue.lock);
2055 	if (po->tx_ring.pg_vec) {
2056 		if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
2057 			mask |= POLLOUT | POLLWRNORM;
2058 	}
2059 	spin_unlock_bh(&sk->sk_write_queue.lock);
2060 	return mask;
2061 }
2062 
2063 
2064 /* Dirty? Well, I still have not learned a better way to account
2065  * for user mmaps.
2066  */
2067 
2068 static void packet_mm_open(struct vm_area_struct *vma)
2069 {
2070 	struct file *file = vma->vm_file;
2071 	struct socket * sock = file->private_data;
2072 	struct sock *sk = sock->sk;
2073 
2074 	if (sk)
2075 		atomic_inc(&pkt_sk(sk)->mapped);
2076 }
2077 
2078 static void packet_mm_close(struct vm_area_struct *vma)
2079 {
2080 	struct file *file = vma->vm_file;
2081 	struct socket * sock = file->private_data;
2082 	struct sock *sk = sock->sk;
2083 
2084 	if (sk)
2085 		atomic_dec(&pkt_sk(sk)->mapped);
2086 }
2087 
2088 static struct vm_operations_struct packet_mmap_ops = {
2089 	.open =	packet_mm_open,
2090 	.close =packet_mm_close,
2091 };
2092 
2093 static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
2094 {
2095 	int i;
2096 
2097 	for (i = 0; i < len; i++) {
2098 		if (likely(pg_vec[i]))
2099 			free_pages((unsigned long) pg_vec[i], order);
2100 	}
2101 	kfree(pg_vec);
2102 }
2103 
2104 static inline char *alloc_one_pg_vec_page(unsigned long order)
2105 {
2106 	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | __GFP_NOWARN;
2107 
2108 	return (char *) __get_free_pages(gfp_flags, order);
2109 }
2110 
2111 static char **alloc_pg_vec(struct tpacket_req *req, int order)
2112 {
2113 	unsigned int block_nr = req->tp_block_nr;
2114 	char **pg_vec;
2115 	int i;
2116 
2117 	pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL);
2118 	if (unlikely(!pg_vec))
2119 		goto out;
2120 
2121 	for (i = 0; i < block_nr; i++) {
2122 		pg_vec[i] = alloc_one_pg_vec_page(order);
2123 		if (unlikely(!pg_vec[i]))
2124 			goto out_free_pgvec;
2125 	}
2126 
2127 out:
2128 	return pg_vec;
2129 
2130 out_free_pgvec:
2131 	free_pg_vec(pg_vec, order, block_nr);
2132 	pg_vec = NULL;
2133 	goto out;
2134 }
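/*
 * Worked example (illustrative): with 4 KiB pages and a requested
 * tp_block_size of 64 KiB, the caller's get_order(req->tp_block_size)
 * yields 4, so every pg_vec[] entry above points at 2^4 = 16
 * physically contiguous pages, and tp_block_nr such blocks are
 * allocated in total.
 */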
2135 
2136 static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
2137 		int closing, int tx_ring)
2138 {
2139 	char **pg_vec = NULL;
2140 	struct packet_sock *po = pkt_sk(sk);
2141 	int was_running, order = 0;
2142 	struct packet_ring_buffer *rb;
2143 	struct sk_buff_head *rb_queue;
2144 	__be16 num;
2145 	int err;
2146 
2147 	rb = tx_ring ? &po->tx_ring : &po->rx_ring;
2148 	rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
2149 
2150 	err = -EBUSY;
2151 	if (!closing) {
2152 		if (atomic_read(&po->mapped))
2153 			goto out;
2154 		if (atomic_read(&rb->pending))
2155 			goto out;
2156 	}
2157 
2158 	if (req->tp_block_nr) {
2159 		/* Sanity tests and some calculations */
2160 		err = -EBUSY;
2161 		if (unlikely(rb->pg_vec))
2162 			goto out;
2163 
2164 		switch (po->tp_version) {
2165 		case TPACKET_V1:
2166 			po->tp_hdrlen = TPACKET_HDRLEN;
2167 			break;
2168 		case TPACKET_V2:
2169 			po->tp_hdrlen = TPACKET2_HDRLEN;
2170 			break;
2171 		}
2172 
2173 		err = -EINVAL;
2174 		if (unlikely((int)req->tp_block_size <= 0))
2175 			goto out;
2176 		if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
2177 			goto out;
2178 		if (unlikely(req->tp_frame_size < po->tp_hdrlen +
2179 					po->tp_reserve))
2180 			goto out;
2181 		if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
2182 			goto out;
2183 
2184 		rb->frames_per_block = req->tp_block_size / req->tp_frame_size;
2185 		if (unlikely(rb->frames_per_block == 0))
2186 			goto out;
2187 		if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
2188 					req->tp_frame_nr))
2189 			goto out;
2190 
2191 		err = -ENOMEM;
2192 		order = get_order(req->tp_block_size);
2193 		pg_vec = alloc_pg_vec(req, order);
2194 		if (unlikely(!pg_vec))
2195 			goto out;
2196 	}
2197 	/* Done */
2198 	else {
2199 		err = -EINVAL;
2200 		if (unlikely(req->tp_frame_nr))
2201 			goto out;
2202 	}
2203 
2204 	lock_sock(sk);
2205 
2206 	/* Detach socket from network */
2207 	spin_lock(&po->bind_lock);
2208 	was_running = po->running;
2209 	num = po->num;
2210 	if (was_running) {
2211 		__dev_remove_pack(&po->prot_hook);
2212 		po->num = 0;
2213 		po->running = 0;
2214 		__sock_put(sk);
2215 	}
2216 	spin_unlock(&po->bind_lock);
2217 
2218 	synchronize_net();
2219 
2220 	err = -EBUSY;
2221 	mutex_lock(&po->pg_vec_lock);
2222 	if (closing || atomic_read(&po->mapped) == 0) {
2223 		err = 0;
2224 #define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })
2225 		spin_lock_bh(&rb_queue->lock);
2226 		pg_vec = XC(rb->pg_vec, pg_vec);
2227 		rb->frame_max = (req->tp_frame_nr - 1);
2228 		rb->head = 0;
2229 		rb->frame_size = req->tp_frame_size;
2230 		spin_unlock_bh(&rb_queue->lock);
2231 
2232 		order = XC(rb->pg_vec_order, order);
2233 		req->tp_block_nr = XC(rb->pg_vec_len, req->tp_block_nr);
2234 
2235 		rb->pg_vec_pages = req->tp_block_size / PAGE_SIZE;
2236 		po->prot_hook.func = (po->rx_ring.pg_vec) ?
2237 						tpacket_rcv : packet_rcv;
2238 		skb_queue_purge(rb_queue);
2239 #undef XC
2240 		if (atomic_read(&po->mapped))
2241 			printk(KERN_DEBUG "packet_mmap: vma is busy: %d\n",
2242 						atomic_read(&po->mapped));
2243 	}
2244 	mutex_unlock(&po->pg_vec_lock);
2245 
2246 	spin_lock(&po->bind_lock);
2247 	if (was_running && !po->running) {
2248 		sock_hold(sk);
2249 		po->running = 1;
2250 		po->num = num;
2251 		dev_add_pack(&po->prot_hook);
2252 	}
2253 	spin_unlock(&po->bind_lock);
2254 
2255 	release_sock(sk);
2256 
2257 	if (pg_vec)
2258 		free_pg_vec(pg_vec, order, req->tp_block_nr);
2259 out:
2260 	return err;
2261 }
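/*
 * Illustrative example (not part of this file): a tpacket_req that
 * passes the sanity checks above, assuming 4 KiB pages and TPACKET_V2
 * (TPACKET2_HDRLEN plus a zero tp_reserve fits comfortably in a 2 KiB
 * frame):
 *
 *	struct tpacket_req req = {
 *		.tp_block_size	= 1 << 12,	// one page, multiple of PAGE_SIZE
 *		.tp_block_nr	= 64,
 *		.tp_frame_size	= 1 << 11,	// multiple of TPACKET_ALIGNMENT (16)
 *		.tp_frame_nr	= 128,		// frames_per_block (2) * tp_block_nr
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *
 * packet_setsockopt() hands PACKET_RX_RING/PACKET_TX_RING requests to
 * this function with the appropriate tx_ring flag.
 */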
2262 
2263 static int packet_mmap(struct file *file, struct socket *sock,
2264 		struct vm_area_struct *vma)
2265 {
2266 	struct sock *sk = sock->sk;
2267 	struct packet_sock *po = pkt_sk(sk);
2268 	unsigned long size, expected_size;
2269 	struct packet_ring_buffer *rb;
2270 	unsigned long start;
2271 	int err = -EINVAL;
2272 	int i;
2273 
2274 	if (vma->vm_pgoff)
2275 		return -EINVAL;
2276 
2277 	mutex_lock(&po->pg_vec_lock);
2278 
2279 	expected_size = 0;
2280 	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
2281 		if (rb->pg_vec) {
2282 			expected_size += rb->pg_vec_len
2283 						* rb->pg_vec_pages
2284 						* PAGE_SIZE;
2285 		}
2286 	}
2287 
2288 	if (expected_size == 0)
2289 		goto out;
2290 
2291 	size = vma->vm_end - vma->vm_start;
2292 	if (size != expected_size)
2293 		goto out;
2294 
2295 	start = vma->vm_start;
2296 	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
2297 		if (rb->pg_vec == NULL)
2298 			continue;
2299 
2300 		for (i = 0; i < rb->pg_vec_len; i++) {
2301 			struct page *page = virt_to_page(rb->pg_vec[i]);
2302 			int pg_num;
2303 
2304 			for (pg_num = 0; pg_num < rb->pg_vec_pages;
2305 					pg_num++, page++) {
2306 				err = vm_insert_page(vma, start, page);
2307 				if (unlikely(err))
2308 					goto out;
2309 				start += PAGE_SIZE;
2310 			}
2311 		}
2312 	}
2313 
2314 	atomic_inc(&po->mapped);
2315 	vma->vm_ops = &packet_mmap_ops;
2316 	err = 0;
2317 
2318 out:
2319 	mutex_unlock(&po->pg_vec_lock);
2320 	return err;
2321 }
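/*
 * Illustrative example (not part of this file): the matching
 * user-space mmap() call.  The offset must be zero (a nonzero
 * vm_pgoff is rejected above) and the length must equal the rx ring
 * followed by the tx ring, mapped back to back:
 *
 *	size_t len = rx_req.tp_block_size * rx_req.tp_block_nr
 *		   + tx_req.tp_block_size * tx_req.tp_block_nr;
 *	void *ring = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			  MAP_SHARED, fd, 0);
 *
 * A sketch assuming both rings were configured; with only an rx ring,
 * len is just the rx portion.
 */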
2322 #endif
2323 
2324 
2325 static const struct proto_ops packet_ops_spkt = {
2326 	.family =	PF_PACKET,
2327 	.owner =	THIS_MODULE,
2328 	.release =	packet_release,
2329 	.bind =		packet_bind_spkt,
2330 	.connect =	sock_no_connect,
2331 	.socketpair =	sock_no_socketpair,
2332 	.accept =	sock_no_accept,
2333 	.getname =	packet_getname_spkt,
2334 	.poll =		datagram_poll,
2335 	.ioctl =	packet_ioctl,
2336 	.listen =	sock_no_listen,
2337 	.shutdown =	sock_no_shutdown,
2338 	.setsockopt =	sock_no_setsockopt,
2339 	.getsockopt =	sock_no_getsockopt,
2340 	.sendmsg =	packet_sendmsg_spkt,
2341 	.recvmsg =	packet_recvmsg,
2342 	.mmap =		sock_no_mmap,
2343 	.sendpage =	sock_no_sendpage,
2344 };
2345 
2346 static const struct proto_ops packet_ops = {
2347 	.family =	PF_PACKET,
2348 	.owner =	THIS_MODULE,
2349 	.release =	packet_release,
2350 	.bind =		packet_bind,
2351 	.connect =	sock_no_connect,
2352 	.socketpair =	sock_no_socketpair,
2353 	.accept =	sock_no_accept,
2354 	.getname =	packet_getname,
2355 	.poll =		packet_poll,
2356 	.ioctl =	packet_ioctl,
2357 	.listen =	sock_no_listen,
2358 	.shutdown =	sock_no_shutdown,
2359 	.setsockopt =	packet_setsockopt,
2360 	.getsockopt =	packet_getsockopt,
2361 	.sendmsg =	packet_sendmsg,
2362 	.recvmsg =	packet_recvmsg,
2363 	.mmap =		packet_mmap,
2364 	.sendpage =	sock_no_sendpage,
2365 };
2366 
2367 static struct net_proto_family packet_family_ops = {
2368 	.family =	PF_PACKET,
2369 	.create =	packet_create,
2370 	.owner	=	THIS_MODULE,
2371 };
2372 
2373 static struct notifier_block packet_netdev_notifier = {
2374 	.notifier_call =	packet_notifier,
2375 };
2376 
2377 #ifdef CONFIG_PROC_FS
2378 static inline struct sock *packet_seq_idx(struct net *net, loff_t off)
2379 {
2380 	struct sock *s;
2381 	struct hlist_node *node;
2382 
2383 	sk_for_each(s, node, &net->packet.sklist) {
2384 		if (!off--)
2385 			return s;
2386 	}
2387 	return NULL;
2388 }
2389 
2390 static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
2391 	__acquires(seq_file_net(seq)->packet.sklist_lock)
2392 {
2393 	struct net *net = seq_file_net(seq);
2394 	read_lock(&net->packet.sklist_lock);
2395 	return *pos ? packet_seq_idx(net, *pos - 1) : SEQ_START_TOKEN;
2396 }
2397 
2398 static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2399 {
2400 	struct net *net = seq_file_net(seq);
2401 	++*pos;
2402 	return (v == SEQ_START_TOKEN)
2403 		? sk_head(&net->packet.sklist)
2404 		: sk_next((struct sock *)v);
2405 }
2406 
2407 static void packet_seq_stop(struct seq_file *seq, void *v)
2408 	__releases(seq_file_net(seq)->packet.sklist_lock)
2409 {
2410 	struct net *net = seq_file_net(seq);
2411 	read_unlock(&net->packet.sklist_lock);
2412 }
2413 
2414 static int packet_seq_show(struct seq_file *seq, void *v)
2415 {
2416 	if (v == SEQ_START_TOKEN)
2417 		seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
2418 	else {
2419 		struct sock *s = v;
2420 		const struct packet_sock *po = pkt_sk(s);
2421 
2422 		seq_printf(seq,
2423 			   "%p %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
2424 			   s,
2425 			   atomic_read(&s->sk_refcnt),
2426 			   s->sk_type,
2427 			   ntohs(po->num),
2428 			   po->ifindex,
2429 			   po->running,
2430 			   atomic_read(&s->sk_rmem_alloc),
2431 			   sock_i_uid(s),
2432 			   sock_i_ino(s));
2433 	}
2434 
2435 	return 0;
2436 }
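/*
 * Example of the resulting /proc/net/packet output (values invented
 * for illustration; type 3 is SOCK_RAW, proto 0003 is ETH_P_ALL):
 *
 *	sk       RefCnt Type Proto  Iface R Rmem   User   Inode
 *	ffff88003a5c8000 3      3    0003   2     1 0      0      10362
 */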
2437 
2438 static const struct seq_operations packet_seq_ops = {
2439 	.start	= packet_seq_start,
2440 	.next	= packet_seq_next,
2441 	.stop	= packet_seq_stop,
2442 	.show	= packet_seq_show,
2443 };
2444 
2445 static int packet_seq_open(struct inode *inode, struct file *file)
2446 {
2447 	return seq_open_net(inode, file, &packet_seq_ops,
2448 			    sizeof(struct seq_net_private));
2449 }
2450 
2451 static const struct file_operations packet_seq_fops = {
2452 	.owner		= THIS_MODULE,
2453 	.open		= packet_seq_open,
2454 	.read		= seq_read,
2455 	.llseek		= seq_lseek,
2456 	.release	= seq_release_net,
2457 };
2458 
2459 #endif
2460 
2461 static int packet_net_init(struct net *net)
2462 {
2463 	rwlock_init(&net->packet.sklist_lock);
2464 	INIT_HLIST_HEAD(&net->packet.sklist);
2465 
2466 	if (!proc_net_fops_create(net, "packet", 0, &packet_seq_fops))
2467 		return -ENOMEM;
2468 
2469 	return 0;
2470 }
2471 
2472 static void packet_net_exit(struct net *net)
2473 {
2474 	proc_net_remove(net, "packet");
2475 }
2476 
2477 static struct pernet_operations packet_net_ops = {
2478 	.init = packet_net_init,
2479 	.exit = packet_net_exit,
2480 };
2481 
2482 
2483 static void __exit packet_exit(void)
2484 {
2485 	unregister_netdevice_notifier(&packet_netdev_notifier);
2486 	unregister_pernet_subsys(&packet_net_ops);
2487 	sock_unregister(PF_PACKET);
2488 	proto_unregister(&packet_proto);
2489 }
2490 
2491 static int __init packet_init(void)
2492 {
2493 	int rc = proto_register(&packet_proto, 0);
2494 
2495 	if (rc != 0)
2496 		goto out;
2497 
2498 	sock_register(&packet_family_ops);
2499 	register_pernet_subsys(&packet_net_ops);
2500 	register_netdevice_notifier(&packet_netdev_notifier);
2501 out:
2502 	return rc;
2503 }
2504 
2505 module_init(packet_init);
2506 module_exit(packet_exit);
2507 MODULE_LICENSE("GPL");
2508 MODULE_ALIAS_NETPROTO(PF_PACKET);
2509