xref: /linux/net/packet/af_packet.c (revision b3b77c8caef1750ebeea1054e39e358550ea9f55)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		PACKET - implements raw packet sockets.
7  *
8  * Authors:	Ross Biro
9  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *
12  * Fixes:
13  *		Alan Cox	:	verify_area() now used correctly
14  *		Alan Cox	:	new skbuff lists, look ma no backlogs!
15  *		Alan Cox	:	tidied skbuff lists.
16  *		Alan Cox	:	Now uses generic datagram routines I
17  *					added. Also fixed the peek/read crash
18  *					from all old Linux datagram code.
19  *		Alan Cox	:	Uses the improved datagram code.
20  *		Alan Cox	:	Added NULL's for socket options.
21  *		Alan Cox	:	Re-commented the code.
22  *		Alan Cox	:	Use new kernel side addressing
23  *		Rob Janssen	:	Correct MTU usage.
24  *		Dave Platt	:	Counter leaks caused by incorrect
25  *					interrupt locking and some slightly
26  *					dubious gcc output. Can you read
27  *					compiler: it said _VOLATILE_
28  *	Richard Kooijman	:	Timestamp fixes.
29  *		Alan Cox	:	New buffers. Use sk->mac.raw.
30  *		Alan Cox	:	sendmsg/recvmsg support.
31  *		Alan Cox	:	Protocol setting support
32  *	Alexey Kuznetsov	:	Untied from IPv4 stack.
33  *	Cyrus Durgin		:	Fixed kerneld for kmod.
34  *	Michal Ostrowski        :       Module initialization cleanup.
35  *         Ulises Alonso        :       Frame number limit removal and
36  *                                      packet_set_ring memory leak.
37  *		Eric Biederman	:	Allow for > 8 byte hardware addresses.
38  *					The convention is that longer addresses
39  *					will simply extend the hardware address
40  *					byte arrays at the end of sockaddr_ll
41  *					and packet_mreq.
42  *		Johann Baudy	:	Added TX RING.
43  *
44  *		This program is free software; you can redistribute it and/or
45  *		modify it under the terms of the GNU General Public License
46  *		as published by the Free Software Foundation; either version
47  *		2 of the License, or (at your option) any later version.
48  *
49  */
50 
51 #include <linux/types.h>
52 #include <linux/mm.h>
53 #include <linux/capability.h>
54 #include <linux/fcntl.h>
55 #include <linux/socket.h>
56 #include <linux/in.h>
57 #include <linux/inet.h>
58 #include <linux/netdevice.h>
59 #include <linux/if_packet.h>
60 #include <linux/wireless.h>
61 #include <linux/kernel.h>
62 #include <linux/kmod.h>
63 #include <linux/slab.h>
64 #include <net/net_namespace.h>
65 #include <net/ip.h>
66 #include <net/protocol.h>
67 #include <linux/skbuff.h>
68 #include <net/sock.h>
69 #include <linux/errno.h>
70 #include <linux/timer.h>
71 #include <asm/system.h>
72 #include <asm/uaccess.h>
73 #include <asm/ioctls.h>
74 #include <asm/page.h>
75 #include <asm/cacheflush.h>
76 #include <asm/io.h>
77 #include <linux/proc_fs.h>
78 #include <linux/seq_file.h>
79 #include <linux/poll.h>
80 #include <linux/module.h>
81 #include <linux/init.h>
82 #include <linux/mutex.h>
83 #include <linux/if_vlan.h>
84 #include <linux/virtio_net.h>
85 #include <linux/errqueue.h>
86 
87 #ifdef CONFIG_INET
88 #include <net/inet_common.h>
89 #endif
90 
91 /*
92    Assumptions:
93    - if a device has no dev->hard_header routine, it adds and removes the ll
94      header inside itself. In this case the ll header is invisible outside of
95      the device, but higher levels should still reserve dev->hard_header_len.
96      Some devices are clever enough to reallocate the skb when the header
97      does not fit into the reserved space (tunnels); others are not
98      (PPP).
99    - a packet socket receives packets with the ll header already pulled,
100      so SOCK_RAW should push it back.
101 
102 On receive:
103 -----------
104 
105 Incoming, dev->hard_header!=NULL
106    mac_header -> ll header
107    data       -> data
108 
109 Outgoing, dev->hard_header!=NULL
110    mac_header -> ll header
111    data       -> ll header
112 
113 Incoming, dev->hard_header==NULL
114    mac_header -> UNKNOWN position. It very likely points to the ll
115 		 header.  PPP does this, which is wrong, because it introduces
116 		 asymmetry between the rx and tx paths.
117    data       -> data
118 
119 Outgoing, dev->hard_header==NULL
120    mac_header -> data. ll header is still not built!
121    data       -> data
122 
123 Summary
124   If dev->hard_header==NULL we are unlikely to restore a sensible ll header.
125 
126 
127 On transmit:
128 ------------
129 
130 dev->hard_header != NULL
131    mac_header -> ll header
132    data       -> ll header
133 
134 dev->hard_header == NULL (ll header is added by device, we cannot control it)
135    mac_header -> data
136    data       -> data
137 
138    We should set nh.raw (the network header) on output to the correct
139    position; the packet classifier depends on it.
140  */
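
/*
 * Illustrative sketch (not part of this file): what the conventions above mean
 * for a user-space sender.  On a SOCK_RAW packet socket the supplied buffer
 * must already start with the full link-layer header, while on SOCK_DGRAM the
 * device builds that header from the sll_addr in the destination address.
 * Assumed headers: <sys/socket.h>, <linux/if_packet.h>, <net/ethernet.h>;
 * the interface index and frame contents are placeholders and error handling
 * is omitted.
 *
 *	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *	struct sockaddr_ll sll = {
 *		.sll_family   = AF_PACKET,
 *		.sll_protocol = htons(ETH_P_ALL),
 *		.sll_ifindex  = 2,
 *	};
 *
 *	frame[] = dst MAC | src MAC | ethertype | payload
 *	sendto(fd, frame, frame_len, 0,
 *	       (struct sockaddr *)&sll, sizeof(sll));
 */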
141 
142 /* Private packet socket structures. */
143 
144 struct packet_mclist {
145 	struct packet_mclist	*next;
146 	int			ifindex;
147 	int			count;
148 	unsigned short		type;
149 	unsigned short		alen;
150 	unsigned char		addr[MAX_ADDR_LEN];
151 };
152 /* identical to struct packet_mreq except it has
153  * a longer address field.
154  */
155 struct packet_mreq_max {
156 	int		mr_ifindex;
157 	unsigned short	mr_type;
158 	unsigned short	mr_alen;
159 	unsigned char	mr_address[MAX_ADDR_LEN];
160 };
161 
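/*
 * Illustrative sketch (not part of this file): user space configures these
 * memberships with setsockopt() and struct packet_mreq from
 * <linux/if_packet.h>; packet_mreq_max above is only the kernel-side superset
 * that can hold longer hardware addresses.  The interface index is a
 * placeholder and error handling is omitted.
 *
 *	struct packet_mreq mreq = {
 *		.mr_ifindex = 2,
 *		.mr_type    = PACKET_MR_PROMISC,
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP,
 *		   &mreq, sizeof(mreq));
 */
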
162 static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
163 		int closing, int tx_ring);
164 
165 struct packet_ring_buffer {
166 	char			**pg_vec;
167 	unsigned int		head;
168 	unsigned int		frames_per_block;
169 	unsigned int		frame_size;
170 	unsigned int		frame_max;
171 
172 	unsigned int		pg_vec_order;
173 	unsigned int		pg_vec_pages;
174 	unsigned int		pg_vec_len;
175 
176 	atomic_t		pending;
177 };
178 
179 struct packet_sock;
180 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
181 
182 static void packet_flush_mclist(struct sock *sk);
183 
184 struct packet_sock {
185 	/* struct sock has to be the first member of packet_sock */
186 	struct sock		sk;
187 	struct tpacket_stats	stats;
188 	struct packet_ring_buffer	rx_ring;
189 	struct packet_ring_buffer	tx_ring;
190 	int			copy_thresh;
191 	spinlock_t		bind_lock;
192 	struct mutex		pg_vec_lock;
193 	unsigned int		running:1,	/* prot_hook is attached*/
194 				auxdata:1,
195 				origdev:1,
196 				has_vnet_hdr:1;
197 	int			ifindex;	/* bound device		*/
198 	__be16			num;
199 	struct packet_mclist	*mclist;
200 	atomic_t		mapped;
201 	enum tpacket_versions	tp_version;
202 	unsigned int		tp_hdrlen;
203 	unsigned int		tp_reserve;
204 	unsigned int		tp_loss:1;
205 	struct packet_type	prot_hook ____cacheline_aligned_in_smp;
206 };
207 
208 struct packet_skb_cb {
209 	unsigned int origlen;
210 	union {
211 		struct sockaddr_pkt pkt;
212 		struct sockaddr_ll ll;
213 	} sa;
214 };
215 
216 #define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))
217 
218 static void __packet_set_status(struct packet_sock *po, void *frame, int status)
219 {
220 	union {
221 		struct tpacket_hdr *h1;
222 		struct tpacket2_hdr *h2;
223 		void *raw;
224 	} h;
225 
226 	h.raw = frame;
227 	switch (po->tp_version) {
228 	case TPACKET_V1:
229 		h.h1->tp_status = status;
230 		flush_dcache_page(virt_to_page(&h.h1->tp_status));
231 		break;
232 	case TPACKET_V2:
233 		h.h2->tp_status = status;
234 		flush_dcache_page(virt_to_page(&h.h2->tp_status));
235 		break;
236 	default:
237 		pr_err("TPACKET version not supported\n");
238 		BUG();
239 	}
240 
241 	smp_wmb();
242 }
243 
244 static int __packet_get_status(struct packet_sock *po, void *frame)
245 {
246 	union {
247 		struct tpacket_hdr *h1;
248 		struct tpacket2_hdr *h2;
249 		void *raw;
250 	} h;
251 
252 	smp_rmb();
253 
254 	h.raw = frame;
255 	switch (po->tp_version) {
256 	case TPACKET_V1:
257 		flush_dcache_page(virt_to_page(&h.h1->tp_status));
258 		return h.h1->tp_status;
259 	case TPACKET_V2:
260 		flush_dcache_page(virt_to_page(&h.h2->tp_status));
261 		return h.h2->tp_status;
262 	default:
263 		pr_err("TPACKET version not supported\n");
264 		BUG();
265 		return 0;
266 	}
267 }
268 
269 static void *packet_lookup_frame(struct packet_sock *po,
270 		struct packet_ring_buffer *rb,
271 		unsigned int position,
272 		int status)
273 {
274 	unsigned int pg_vec_pos, frame_offset;
275 	union {
276 		struct tpacket_hdr *h1;
277 		struct tpacket2_hdr *h2;
278 		void *raw;
279 	} h;
280 
281 	pg_vec_pos = position / rb->frames_per_block;
282 	frame_offset = position % rb->frames_per_block;
283 
284 	h.raw = rb->pg_vec[pg_vec_pos] + (frame_offset * rb->frame_size);
285 
286 	if (status != __packet_get_status(po, h.raw))
287 		return NULL;
288 
289 	return h.raw;
290 }
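
/*
 * Illustrative sketch (not part of this file): a user-space reader of the
 * mmap()ed ring uses the same frame addressing as packet_lookup_frame()
 * above, because the pg_vec blocks are mapped back to back in order.
 * "ring" is the address returned by mmap() and "req" the tpacket_req used to
 * create the ring; both names are placeholders.
 *
 *	static void *frame_ptr(void *ring, const struct tpacket_req *req,
 *			       unsigned int i)
 *	{
 *		unsigned int fpb = req->tp_block_size / req->tp_frame_size;
 *
 *		return (char *)ring + (i / fpb) * req->tp_block_size
 *				    + (i % fpb) * req->tp_frame_size;
 *	}
 */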
291 
292 static inline void *packet_current_frame(struct packet_sock *po,
293 		struct packet_ring_buffer *rb,
294 		int status)
295 {
296 	return packet_lookup_frame(po, rb, rb->head, status);
297 }
298 
299 static inline void *packet_previous_frame(struct packet_sock *po,
300 		struct packet_ring_buffer *rb,
301 		int status)
302 {
303 	unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
304 	return packet_lookup_frame(po, rb, previous, status);
305 }
306 
307 static inline void packet_increment_head(struct packet_ring_buffer *buff)
308 {
309 	buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
310 }
311 
312 static inline struct packet_sock *pkt_sk(struct sock *sk)
313 {
314 	return (struct packet_sock *)sk;
315 }
316 
317 static void packet_sock_destruct(struct sock *sk)
318 {
319 	skb_queue_purge(&sk->sk_error_queue);
320 
321 	WARN_ON(atomic_read(&sk->sk_rmem_alloc));
322 	WARN_ON(atomic_read(&sk->sk_wmem_alloc));
323 
324 	if (!sock_flag(sk, SOCK_DEAD)) {
325 		pr_err("Attempt to release alive packet socket: %p\n", sk);
326 		return;
327 	}
328 
329 	sk_refcnt_debug_dec(sk);
330 }
331 
332 
333 static const struct proto_ops packet_ops;
334 
335 static const struct proto_ops packet_ops_spkt;
336 
337 static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
338 			   struct packet_type *pt, struct net_device *orig_dev)
339 {
340 	struct sock *sk;
341 	struct sockaddr_pkt *spkt;
342 
343 	/*
344 	 *	When we registered the protocol we saved the socket in the data
345 	 *	field for just this event.
346 	 */
347 
348 	sk = pt->af_packet_priv;
349 
350 	/*
351 	 *	Yank back the headers [and hope the device set this
352 	 *	up right, or kaboom...]
353 	 *
354 	 *	Incoming packets have the ll header pulled;
355 	 *	push it back.
356 	 *
357 	 *	For outgoing packets skb->data == skb_mac_header(skb),
358 	 *	so this procedure is a no-op.
359 	 */
360 
361 	if (skb->pkt_type == PACKET_LOOPBACK)
362 		goto out;
363 
364 	if (!net_eq(dev_net(dev), sock_net(sk)))
365 		goto out;
366 
367 	skb = skb_share_check(skb, GFP_ATOMIC);
368 	if (skb == NULL)
369 		goto oom;
370 
371 	/* drop any routing info */
372 	skb_dst_drop(skb);
373 
374 	/* drop conntrack reference */
375 	nf_reset(skb);
376 
377 	spkt = &PACKET_SKB_CB(skb)->sa.pkt;
378 
379 	skb_push(skb, skb->data - skb_mac_header(skb));
380 
381 	/*
382 	 *	The SOCK_PACKET socket receives _all_ frames.
383 	 */
384 
385 	spkt->spkt_family = dev->type;
386 	strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
387 	spkt->spkt_protocol = skb->protocol;
388 
389 	/*
390 	 *	Charge the memory to the socket. This is done specifically
391 	 *	to prevent sockets using all the memory up.
392 	 */
393 
394 	if (sock_queue_rcv_skb(sk, skb) == 0)
395 		return 0;
396 
397 out:
398 	kfree_skb(skb);
399 oom:
400 	return 0;
401 }
402 
403 
404 /*
405  *	Output a raw packet to the device layer. This bypasses all the other
406  *	protocol layers, so you must supply a complete frame.
407  */
408 
409 static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
410 			       struct msghdr *msg, size_t len)
411 {
412 	struct sock *sk = sock->sk;
413 	struct sockaddr_pkt *saddr = (struct sockaddr_pkt *)msg->msg_name;
414 	struct sk_buff *skb = NULL;
415 	struct net_device *dev;
416 	__be16 proto = 0;
417 	int err;
418 
419 	/*
420 	 *	Get and verify the address.
421 	 */
422 
423 	if (saddr) {
424 		if (msg->msg_namelen < sizeof(struct sockaddr))
425 			return -EINVAL;
426 		if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
427 			proto = saddr->spkt_protocol;
428 	} else
429 		return -ENOTCONN;	/* SOCK_PACKET must be sent giving an address */
430 
431 	/*
432 	 *	Find the device first so that we can size-check against it
433 	 */
434 
435 	saddr->spkt_device[13] = 0;
436 retry:
437 	rcu_read_lock();
438 	dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
439 	err = -ENODEV;
440 	if (dev == NULL)
441 		goto out_unlock;
442 
443 	err = -ENETDOWN;
444 	if (!(dev->flags & IFF_UP))
445 		goto out_unlock;
446 
447 	/*
448 	 * You may not queue a frame bigger than the MTU. This is the lowest-level
449 	 * raw protocol and you must do your own fragmentation at this level.
450 	 */
451 
452 	err = -EMSGSIZE;
453 	if (len > dev->mtu + dev->hard_header_len)
454 		goto out_unlock;
455 
456 	if (!skb) {
457 		size_t reserved = LL_RESERVED_SPACE(dev);
458 		unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
459 
460 		rcu_read_unlock();
461 		skb = sock_wmalloc(sk, len + reserved, 0, GFP_KERNEL);
462 		if (skb == NULL)
463 			return -ENOBUFS;
464 		/* FIXME: Save some space for broken drivers that write a hard
465 		 * header at transmission time by themselves. PPP is the notable
466 		 * one here. This should really be fixed at the driver level.
467 		 */
468 		skb_reserve(skb, reserved);
469 		skb_reset_network_header(skb);
470 
471 		/* Try to align data part correctly */
472 		if (hhlen) {
473 			skb->data -= hhlen;
474 			skb->tail -= hhlen;
475 			if (len < hhlen)
476 				skb_reset_network_header(skb);
477 		}
478 		err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
479 		if (err)
480 			goto out_free;
481 		goto retry;
482 	}
483 
484 
485 	skb->protocol = proto;
486 	skb->dev = dev;
487 	skb->priority = sk->sk_priority;
488 	skb->mark = sk->sk_mark;
489 	err = sock_tx_timestamp(msg, sk, skb_tx(skb));
490 	if (err < 0)
491 		goto out_unlock;
492 
493 	dev_queue_xmit(skb);
494 	rcu_read_unlock();
495 	return len;
496 
497 out_unlock:
498 	rcu_read_unlock();
499 out_free:
500 	kfree_skb(skb);
501 	return err;
502 }
503 
504 static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
505 				      unsigned int res)
506 {
507 	struct sk_filter *filter;
508 
509 	rcu_read_lock_bh();
510 	filter = rcu_dereference_bh(sk->sk_filter);
511 	if (filter != NULL)
512 		res = sk_run_filter(skb, filter->insns, filter->len);
513 	rcu_read_unlock_bh();
514 
515 	return res;
516 }
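
/*
 * Illustrative sketch (not part of this file): the filter consulted above is
 * the classic BPF program a user attaches with SO_ATTACH_FILTER.  The single
 * instruction below (BPF_RET | BPF_K with k = 96) accepts every packet and
 * snaps it to 96 bytes; real programs are usually generated by libpcap.
 * Assumes <linux/filter.h>; error handling is omitted.
 *
 *	struct sock_filter code[] = {
 *		{ 0x06, 0, 0, 96 },
 *	};
 *	struct sock_fprog prog = {
 *		.len    = 1,
 *		.filter = code,
 *	};
 *	setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog));
 */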
517 
518 /*
519    This function does lazy skb cloning in the hope that most packets
520    are discarded by BPF.
521 
522    Note the tricky part: we DO mangle the shared skb! skb->data, skb->len
523    and skb->cb are mangled. It works because (and only as long as) packets
524    falling here are owned by the current CPU. Output packets are cloned
525    by dev_queue_xmit_nit(), input packets are processed by net_bh
526    sequentially, so if we return the skb to its original state on exit,
527    we will not harm anyone.
528  */
529 
530 static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
531 		      struct packet_type *pt, struct net_device *orig_dev)
532 {
533 	struct sock *sk;
534 	struct sockaddr_ll *sll;
535 	struct packet_sock *po;
536 	u8 *skb_head = skb->data;
537 	int skb_len = skb->len;
538 	unsigned int snaplen, res;
539 
540 	if (skb->pkt_type == PACKET_LOOPBACK)
541 		goto drop;
542 
543 	sk = pt->af_packet_priv;
544 	po = pkt_sk(sk);
545 
546 	if (!net_eq(dev_net(dev), sock_net(sk)))
547 		goto drop;
548 
549 	skb->dev = dev;
550 
551 	if (dev->header_ops) {
552 		/* The device has an explicit notion of ll header,
553 		   exported to higher levels.
554 
555 		   Otherwise, the device hides the details of its frame
556 		   structure, so that the corresponding packet head is
557 		   never delivered to the user.
558 		 */
559 		if (sk->sk_type != SOCK_DGRAM)
560 			skb_push(skb, skb->data - skb_mac_header(skb));
561 		else if (skb->pkt_type == PACKET_OUTGOING) {
562 			/* Special case: outgoing packets have ll header at head */
563 			skb_pull(skb, skb_network_offset(skb));
564 		}
565 	}
566 
567 	snaplen = skb->len;
568 
569 	res = run_filter(skb, sk, snaplen);
570 	if (!res)
571 		goto drop_n_restore;
572 	if (snaplen > res)
573 		snaplen = res;
574 
575 	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
576 	    (unsigned)sk->sk_rcvbuf)
577 		goto drop_n_acct;
578 
579 	if (skb_shared(skb)) {
580 		struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
581 		if (nskb == NULL)
582 			goto drop_n_acct;
583 
584 		if (skb_head != skb->data) {
585 			skb->data = skb_head;
586 			skb->len = skb_len;
587 		}
588 		kfree_skb(skb);
589 		skb = nskb;
590 	}
591 
592 	BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
593 		     sizeof(skb->cb));
594 
595 	sll = &PACKET_SKB_CB(skb)->sa.ll;
596 	sll->sll_family = AF_PACKET;
597 	sll->sll_hatype = dev->type;
598 	sll->sll_protocol = skb->protocol;
599 	sll->sll_pkttype = skb->pkt_type;
600 	if (unlikely(po->origdev))
601 		sll->sll_ifindex = orig_dev->ifindex;
602 	else
603 		sll->sll_ifindex = dev->ifindex;
604 
605 	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
606 
607 	PACKET_SKB_CB(skb)->origlen = skb->len;
608 
609 	if (pskb_trim(skb, snaplen))
610 		goto drop_n_acct;
611 
612 	skb_set_owner_r(skb, sk);
613 	skb->dev = NULL;
614 	skb_dst_drop(skb);
615 
616 	/* drop conntrack reference */
617 	nf_reset(skb);
618 
619 	spin_lock(&sk->sk_receive_queue.lock);
620 	po->stats.tp_packets++;
621 	skb->dropcount = atomic_read(&sk->sk_drops);
622 	__skb_queue_tail(&sk->sk_receive_queue, skb);
623 	spin_unlock(&sk->sk_receive_queue.lock);
624 	sk->sk_data_ready(sk, skb->len);
625 	return 0;
626 
627 drop_n_acct:
628 	po->stats.tp_drops = atomic_inc_return(&sk->sk_drops);
629 
630 drop_n_restore:
631 	if (skb_head != skb->data && skb_shared(skb)) {
632 		skb->data = skb_head;
633 		skb->len = skb_len;
634 	}
635 drop:
636 	consume_skb(skb);
637 	return 0;
638 }
639 
640 static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
641 		       struct packet_type *pt, struct net_device *orig_dev)
642 {
643 	struct sock *sk;
644 	struct packet_sock *po;
645 	struct sockaddr_ll *sll;
646 	union {
647 		struct tpacket_hdr *h1;
648 		struct tpacket2_hdr *h2;
649 		void *raw;
650 	} h;
651 	u8 *skb_head = skb->data;
652 	int skb_len = skb->len;
653 	unsigned int snaplen, res;
654 	unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
655 	unsigned short macoff, netoff, hdrlen;
656 	struct sk_buff *copy_skb = NULL;
657 	struct timeval tv;
658 	struct timespec ts;
659 
660 	if (skb->pkt_type == PACKET_LOOPBACK)
661 		goto drop;
662 
663 	sk = pt->af_packet_priv;
664 	po = pkt_sk(sk);
665 
666 	if (!net_eq(dev_net(dev), sock_net(sk)))
667 		goto drop;
668 
669 	if (dev->header_ops) {
670 		if (sk->sk_type != SOCK_DGRAM)
671 			skb_push(skb, skb->data - skb_mac_header(skb));
672 		else if (skb->pkt_type == PACKET_OUTGOING) {
673 			/* Special case: outgoing packets have ll header at head */
674 			skb_pull(skb, skb_network_offset(skb));
675 		}
676 	}
677 
678 	if (skb->ip_summed == CHECKSUM_PARTIAL)
679 		status |= TP_STATUS_CSUMNOTREADY;
680 
681 	snaplen = skb->len;
682 
683 	res = run_filter(skb, sk, snaplen);
684 	if (!res)
685 		goto drop_n_restore;
686 	if (snaplen > res)
687 		snaplen = res;
688 
689 	if (sk->sk_type == SOCK_DGRAM) {
690 		macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
691 				  po->tp_reserve;
692 	} else {
693 		unsigned maclen = skb_network_offset(skb);
694 		netoff = TPACKET_ALIGN(po->tp_hdrlen +
695 				       (maclen < 16 ? 16 : maclen)) +
696 			po->tp_reserve;
697 		macoff = netoff - maclen;
698 	}
699 
700 	if (macoff + snaplen > po->rx_ring.frame_size) {
701 		if (po->copy_thresh &&
702 		    atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
703 		    (unsigned)sk->sk_rcvbuf) {
704 			if (skb_shared(skb)) {
705 				copy_skb = skb_clone(skb, GFP_ATOMIC);
706 			} else {
707 				copy_skb = skb_get(skb);
708 				skb_head = skb->data;
709 			}
710 			if (copy_skb)
711 				skb_set_owner_r(copy_skb, sk);
712 		}
713 		snaplen = po->rx_ring.frame_size - macoff;
714 		if ((int)snaplen < 0)
715 			snaplen = 0;
716 	}
717 
718 	spin_lock(&sk->sk_receive_queue.lock);
719 	h.raw = packet_current_frame(po, &po->rx_ring, TP_STATUS_KERNEL);
720 	if (!h.raw)
721 		goto ring_is_full;
722 	packet_increment_head(&po->rx_ring);
723 	po->stats.tp_packets++;
724 	if (copy_skb) {
725 		status |= TP_STATUS_COPY;
726 		__skb_queue_tail(&sk->sk_receive_queue, copy_skb);
727 	}
728 	if (!po->stats.tp_drops)
729 		status &= ~TP_STATUS_LOSING;
730 	spin_unlock(&sk->sk_receive_queue.lock);
731 
732 	skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
733 
734 	switch (po->tp_version) {
735 	case TPACKET_V1:
736 		h.h1->tp_len = skb->len;
737 		h.h1->tp_snaplen = snaplen;
738 		h.h1->tp_mac = macoff;
739 		h.h1->tp_net = netoff;
740 		if (skb->tstamp.tv64)
741 			tv = ktime_to_timeval(skb->tstamp);
742 		else
743 			do_gettimeofday(&tv);
744 		h.h1->tp_sec = tv.tv_sec;
745 		h.h1->tp_usec = tv.tv_usec;
746 		hdrlen = sizeof(*h.h1);
747 		break;
748 	case TPACKET_V2:
749 		h.h2->tp_len = skb->len;
750 		h.h2->tp_snaplen = snaplen;
751 		h.h2->tp_mac = macoff;
752 		h.h2->tp_net = netoff;
753 		if (skb->tstamp.tv64)
754 			ts = ktime_to_timespec(skb->tstamp);
755 		else
756 			getnstimeofday(&ts);
757 		h.h2->tp_sec = ts.tv_sec;
758 		h.h2->tp_nsec = ts.tv_nsec;
759 		h.h2->tp_vlan_tci = vlan_tx_tag_get(skb);
760 		hdrlen = sizeof(*h.h2);
761 		break;
762 	default:
763 		BUG();
764 	}
765 
766 	sll = h.raw + TPACKET_ALIGN(hdrlen);
767 	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
768 	sll->sll_family = AF_PACKET;
769 	sll->sll_hatype = dev->type;
770 	sll->sll_protocol = skb->protocol;
771 	sll->sll_pkttype = skb->pkt_type;
772 	if (unlikely(po->origdev))
773 		sll->sll_ifindex = orig_dev->ifindex;
774 	else
775 		sll->sll_ifindex = dev->ifindex;
776 
777 	__packet_set_status(po, h.raw, status);
778 	smp_mb();
779 	{
780 		struct page *p_start, *p_end;
781 		u8 *h_end = h.raw + macoff + snaplen - 1;
782 
783 		p_start = virt_to_page(h.raw);
784 		p_end = virt_to_page(h_end);
785 		while (p_start <= p_end) {
786 			flush_dcache_page(p_start);
787 			p_start++;
788 		}
789 	}
790 
791 	sk->sk_data_ready(sk, 0);
792 
793 drop_n_restore:
794 	if (skb_head != skb->data && skb_shared(skb)) {
795 		skb->data = skb_head;
796 		skb->len = skb_len;
797 	}
798 drop:
799 	kfree_skb(skb);
800 	return 0;
801 
802 ring_is_full:
803 	po->stats.tp_drops++;
804 	spin_unlock(&sk->sk_receive_queue.lock);
805 
806 	sk->sk_data_ready(sk, 0);
807 	kfree_skb(copy_skb);
808 	goto drop_n_restore;
809 }
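
/*
 * Illustrative sketch (not part of this file): the user-space half of the RX
 * ring protocol implemented above.  The reader consumes frames whose
 * tp_status has TP_STATUS_USER set and hands each one back by storing
 * TP_STATUS_KERNEL; a real reader also needs memory barriers around the
 * status accesses, mirroring the smp_rmb()/smp_wmb() in
 * __packet_get_status()/__packet_set_status().  Assumes TPACKET_V2, the
 * frame_ptr() helper sketched earlier, and placeholder names "nframes" and
 * process(); error handling is omitted.
 *
 *	unsigned int i = 0;
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *
 *	for (;;) {
 *		struct tpacket2_hdr *hdr = frame_ptr(ring, &req, i);
 *
 *		if (!(hdr->tp_status & TP_STATUS_USER)) {
 *			poll(&pfd, 1, -1);
 *			continue;
 *		}
 *		process((char *)hdr + hdr->tp_mac, hdr->tp_snaplen);
 *		hdr->tp_status = TP_STATUS_KERNEL;
 *		i = (i + 1) % nframes;
 *	}
 */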
810 
811 static void tpacket_destruct_skb(struct sk_buff *skb)
812 {
813 	struct packet_sock *po = pkt_sk(skb->sk);
814 	void *ph;
815 
816 	BUG_ON(skb == NULL);
817 
818 	if (likely(po->tx_ring.pg_vec)) {
819 		ph = skb_shinfo(skb)->destructor_arg;
820 		BUG_ON(__packet_get_status(po, ph) != TP_STATUS_SENDING);
821 		BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
822 		atomic_dec(&po->tx_ring.pending);
823 		__packet_set_status(po, ph, TP_STATUS_AVAILABLE);
824 	}
825 
826 	sock_wfree(skb);
827 }
828 
829 static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
830 		void *frame, struct net_device *dev, int size_max,
831 		__be16 proto, unsigned char *addr)
832 {
833 	union {
834 		struct tpacket_hdr *h1;
835 		struct tpacket2_hdr *h2;
836 		void *raw;
837 	} ph;
838 	int to_write, offset, len, tp_len, nr_frags, len_max;
839 	struct socket *sock = po->sk.sk_socket;
840 	struct page *page;
841 	void *data;
842 	int err;
843 
844 	ph.raw = frame;
845 
846 	skb->protocol = proto;
847 	skb->dev = dev;
848 	skb->priority = po->sk.sk_priority;
849 	skb->mark = po->sk.sk_mark;
850 	skb_shinfo(skb)->destructor_arg = ph.raw;
851 
852 	switch (po->tp_version) {
853 	case TPACKET_V2:
854 		tp_len = ph.h2->tp_len;
855 		break;
856 	default:
857 		tp_len = ph.h1->tp_len;
858 		break;
859 	}
860 	if (unlikely(tp_len > size_max)) {
861 		pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
862 		return -EMSGSIZE;
863 	}
864 
865 	skb_reserve(skb, LL_RESERVED_SPACE(dev));
866 	skb_reset_network_header(skb);
867 
868 	data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
869 	to_write = tp_len;
870 
871 	if (sock->type == SOCK_DGRAM) {
872 		err = dev_hard_header(skb, dev, ntohs(proto), addr,
873 				NULL, tp_len);
874 		if (unlikely(err < 0))
875 			return -EINVAL;
876 	} else if (dev->hard_header_len) {
877 		/* net device doesn't like empty head */
878 		if (unlikely(tp_len <= dev->hard_header_len)) {
879 			pr_err("packet size is too short (%d < %d)\n",
880 			       tp_len, dev->hard_header_len);
881 			return -EINVAL;
882 		}
883 
884 		skb_push(skb, dev->hard_header_len);
885 		err = skb_store_bits(skb, 0, data,
886 				dev->hard_header_len);
887 		if (unlikely(err))
888 			return err;
889 
890 		data += dev->hard_header_len;
891 		to_write -= dev->hard_header_len;
892 	}
893 
894 	err = -EFAULT;
895 	page = virt_to_page(data);
896 	offset = offset_in_page(data);
897 	len_max = PAGE_SIZE - offset;
898 	len = ((to_write > len_max) ? len_max : to_write);
899 
900 	skb->data_len = to_write;
901 	skb->len += to_write;
902 	skb->truesize += to_write;
903 	atomic_add(to_write, &po->sk.sk_wmem_alloc);
904 
905 	while (likely(to_write)) {
906 		nr_frags = skb_shinfo(skb)->nr_frags;
907 
908 		if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
909 			pr_err("Packet exceed the number of skb frags(%lu)\n",
910 			       MAX_SKB_FRAGS);
911 			return -EFAULT;
912 		}
913 
914 		flush_dcache_page(page);
915 		get_page(page);
916 		skb_fill_page_desc(skb,
917 				nr_frags,
918 				page++, offset, len);
919 		to_write -= len;
920 		offset = 0;
921 		len_max = PAGE_SIZE;
922 		len = ((to_write > len_max) ? len_max : to_write);
923 	}
924 
925 	return tp_len;
926 }
927 
928 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
929 {
930 	struct socket *sock;
931 	struct sk_buff *skb;
932 	struct net_device *dev;
933 	__be16 proto;
934 	int ifindex, err, reserve = 0;
935 	void *ph;
936 	struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
937 	int tp_len, size_max;
938 	unsigned char *addr;
939 	int len_sum = 0;
940 	int status = 0;
941 
942 	sock = po->sk.sk_socket;
943 
944 	mutex_lock(&po->pg_vec_lock);
945 
946 	err = -EBUSY;
947 	if (saddr == NULL) {
948 		ifindex	= po->ifindex;
949 		proto	= po->num;
950 		addr	= NULL;
951 	} else {
952 		err = -EINVAL;
953 		if (msg->msg_namelen < sizeof(struct sockaddr_ll))
954 			goto out;
955 		if (msg->msg_namelen < (saddr->sll_halen
956 					+ offsetof(struct sockaddr_ll,
957 						sll_addr)))
958 			goto out;
959 		ifindex	= saddr->sll_ifindex;
960 		proto	= saddr->sll_protocol;
961 		addr	= saddr->sll_addr;
962 	}
963 
964 	dev = dev_get_by_index(sock_net(&po->sk), ifindex);
965 	err = -ENXIO;
966 	if (unlikely(dev == NULL))
967 		goto out;
968 
969 	reserve = dev->hard_header_len;
970 
971 	err = -ENETDOWN;
972 	if (unlikely(!(dev->flags & IFF_UP)))
973 		goto out_put;
974 
975 	size_max = po->tx_ring.frame_size
976 		- (po->tp_hdrlen - sizeof(struct sockaddr_ll));
977 
978 	if (size_max > dev->mtu + reserve)
979 		size_max = dev->mtu + reserve;
980 
981 	do {
982 		ph = packet_current_frame(po, &po->tx_ring,
983 				TP_STATUS_SEND_REQUEST);
984 
985 		if (unlikely(ph == NULL)) {
986 			schedule();
987 			continue;
988 		}
989 
990 		status = TP_STATUS_SEND_REQUEST;
991 		skb = sock_alloc_send_skb(&po->sk,
992 				LL_ALLOCATED_SPACE(dev)
993 				+ sizeof(struct sockaddr_ll),
994 				0, &err);
995 
996 		if (unlikely(skb == NULL))
997 			goto out_status;
998 
999 		tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
1000 				addr);
1001 
1002 		if (unlikely(tp_len < 0)) {
1003 			if (po->tp_loss) {
1004 				__packet_set_status(po, ph,
1005 						TP_STATUS_AVAILABLE);
1006 				packet_increment_head(&po->tx_ring);
1007 				kfree_skb(skb);
1008 				continue;
1009 			} else {
1010 				status = TP_STATUS_WRONG_FORMAT;
1011 				err = tp_len;
1012 				goto out_status;
1013 			}
1014 		}
1015 
1016 		skb->destructor = tpacket_destruct_skb;
1017 		__packet_set_status(po, ph, TP_STATUS_SENDING);
1018 		atomic_inc(&po->tx_ring.pending);
1019 
1020 		status = TP_STATUS_SEND_REQUEST;
1021 		err = dev_queue_xmit(skb);
1022 		if (unlikely(err > 0)) {
1023 			err = net_xmit_errno(err);
1024 			if (err && __packet_get_status(po, ph) ==
1025 				   TP_STATUS_AVAILABLE) {
1026 				/* skb was destructed already */
1027 				skb = NULL;
1028 				goto out_status;
1029 			}
1030 			/*
1031 			 * skb was dropped but not destructed yet;
1032 			 * let's treat it like congestion or err < 0
1033 			 */
1034 			err = 0;
1035 		}
1036 		packet_increment_head(&po->tx_ring);
1037 		len_sum += tp_len;
1038 	} while (likely((ph != NULL) ||
1039 			((!(msg->msg_flags & MSG_DONTWAIT)) &&
1040 			 (atomic_read(&po->tx_ring.pending))))
1041 		);
1042 
1043 	err = len_sum;
1044 	goto out_put;
1045 
1046 out_status:
1047 	__packet_set_status(po, ph, status);
1048 	kfree_skb(skb);
1049 out_put:
1050 	dev_put(dev);
1051 out:
1052 	mutex_unlock(&po->pg_vec_lock);
1053 	return err;
1054 }
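
/*
 * Illustrative sketch (not part of this file): the user-space half of the TX
 * ring loop above.  The sender fills an available frame, sets its length,
 * flips the status to TP_STATUS_SEND_REQUEST and kicks the kernel with a
 * plain send(), which ends up in tpacket_snd().  Assumes TPACKET_V2 and the
 * frame_ptr() placeholder from the earlier sketch; error handling is omitted.
 *
 *	struct tpacket2_hdr *hdr = frame_ptr(ring, &req, i);
 *
 *	if (hdr->tp_status == TP_STATUS_AVAILABLE) {
 *		void *data = (char *)hdr + TPACKET2_HDRLEN -
 *			     sizeof(struct sockaddr_ll);
 *
 *		memcpy(data, frame, frame_len);
 *		hdr->tp_len = frame_len;
 *		hdr->tp_status = TP_STATUS_SEND_REQUEST;
 *		send(fd, NULL, 0, 0);
 *	}
 */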
1055 
1056 static inline struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
1057 					       size_t reserve, size_t len,
1058 					       size_t linear, int noblock,
1059 					       int *err)
1060 {
1061 	struct sk_buff *skb;
1062 
1063 	/* Under a page?  Don't bother with paged skb. */
1064 	if (prepad + len < PAGE_SIZE || !linear)
1065 		linear = len;
1066 
1067 	skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
1068 				   err);
1069 	if (!skb)
1070 		return NULL;
1071 
1072 	skb_reserve(skb, reserve);
1073 	skb_put(skb, linear);
1074 	skb->data_len = len - linear;
1075 	skb->len += len - linear;
1076 
1077 	return skb;
1078 }
1079 
1080 static int packet_snd(struct socket *sock,
1081 			  struct msghdr *msg, size_t len)
1082 {
1083 	struct sock *sk = sock->sk;
1084 	struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
1085 	struct sk_buff *skb;
1086 	struct net_device *dev;
1087 	__be16 proto;
1088 	unsigned char *addr;
1089 	int ifindex, err, reserve = 0;
1090 	struct virtio_net_hdr vnet_hdr = { 0 };
1091 	int offset = 0;
1092 	int vnet_hdr_len;
1093 	struct packet_sock *po = pkt_sk(sk);
1094 	unsigned short gso_type = 0;
1095 
1096 	/*
1097 	 *	Get and verify the address.
1098 	 */
1099 
1100 	if (saddr == NULL) {
1101 		ifindex	= po->ifindex;
1102 		proto	= po->num;
1103 		addr	= NULL;
1104 	} else {
1105 		err = -EINVAL;
1106 		if (msg->msg_namelen < sizeof(struct sockaddr_ll))
1107 			goto out;
1108 		if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
1109 			goto out;
1110 		ifindex	= saddr->sll_ifindex;
1111 		proto	= saddr->sll_protocol;
1112 		addr	= saddr->sll_addr;
1113 	}
1114 
1115 
1116 	dev = dev_get_by_index(sock_net(sk), ifindex);
1117 	err = -ENXIO;
1118 	if (dev == NULL)
1119 		goto out_unlock;
1120 	if (sock->type == SOCK_RAW)
1121 		reserve = dev->hard_header_len;
1122 
1123 	err = -ENETDOWN;
1124 	if (!(dev->flags & IFF_UP))
1125 		goto out_unlock;
1126 
1127 	if (po->has_vnet_hdr) {
1128 		vnet_hdr_len = sizeof(vnet_hdr);
1129 
1130 		err = -EINVAL;
1131 		if (len < vnet_hdr_len)
1132 			goto out_unlock;
1133 
1134 		len -= vnet_hdr_len;
1135 
1136 		err = memcpy_fromiovec((void *)&vnet_hdr, msg->msg_iov,
1137 				       vnet_hdr_len);
1138 		if (err < 0)
1139 			goto out_unlock;
1140 
1141 		if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
1142 		    (vnet_hdr.csum_start + vnet_hdr.csum_offset + 2 >
1143 		      vnet_hdr.hdr_len))
1144 			vnet_hdr.hdr_len = vnet_hdr.csum_start +
1145 						 vnet_hdr.csum_offset + 2;
1146 
1147 		err = -EINVAL;
1148 		if (vnet_hdr.hdr_len > len)
1149 			goto out_unlock;
1150 
1151 		if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
1152 			switch (vnet_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
1153 			case VIRTIO_NET_HDR_GSO_TCPV4:
1154 				gso_type = SKB_GSO_TCPV4;
1155 				break;
1156 			case VIRTIO_NET_HDR_GSO_TCPV6:
1157 				gso_type = SKB_GSO_TCPV6;
1158 				break;
1159 			case VIRTIO_NET_HDR_GSO_UDP:
1160 				gso_type = SKB_GSO_UDP;
1161 				break;
1162 			default:
1163 				goto out_unlock;
1164 			}
1165 
1166 			if (vnet_hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN)
1167 				gso_type |= SKB_GSO_TCP_ECN;
1168 
1169 			if (vnet_hdr.gso_size == 0)
1170 				goto out_unlock;
1171 
1172 		}
1173 	}
1174 
1175 	err = -EMSGSIZE;
1176 	if (!gso_type && (len > dev->mtu+reserve))
1177 		goto out_unlock;
1178 
1179 	err = -ENOBUFS;
1180 	skb = packet_alloc_skb(sk, LL_ALLOCATED_SPACE(dev),
1181 			       LL_RESERVED_SPACE(dev), len, vnet_hdr.hdr_len,
1182 			       msg->msg_flags & MSG_DONTWAIT, &err);
1183 	if (skb == NULL)
1184 		goto out_unlock;
1185 
1186 	skb_set_network_header(skb, reserve);
1187 
1188 	err = -EINVAL;
1189 	if (sock->type == SOCK_DGRAM &&
1190 	    (offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len)) < 0)
1191 		goto out_free;
1192 
1193 	/* Returns -EFAULT on error */
1194 	err = skb_copy_datagram_from_iovec(skb, offset, msg->msg_iov, 0, len);
1195 	if (err)
1196 		goto out_free;
1197 	err = sock_tx_timestamp(msg, sk, skb_tx(skb));
1198 	if (err < 0)
1199 		goto out_free;
1200 
1201 	skb->protocol = proto;
1202 	skb->dev = dev;
1203 	skb->priority = sk->sk_priority;
1204 	skb->mark = sk->sk_mark;
1205 
1206 	if (po->has_vnet_hdr) {
1207 		if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
1208 			if (!skb_partial_csum_set(skb, vnet_hdr.csum_start,
1209 						  vnet_hdr.csum_offset)) {
1210 				err = -EINVAL;
1211 				goto out_free;
1212 			}
1213 		}
1214 
1215 		skb_shinfo(skb)->gso_size = vnet_hdr.gso_size;
1216 		skb_shinfo(skb)->gso_type = gso_type;
1217 
1218 		/* Header must be checked, and gso_segs computed. */
1219 		skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
1220 		skb_shinfo(skb)->gso_segs = 0;
1221 
1222 		len += vnet_hdr_len;
1223 	}
1224 
1225 	/*
1226 	 *	Now send it
1227 	 */
1228 
1229 	err = dev_queue_xmit(skb);
1230 	if (err > 0 && (err = net_xmit_errno(err)) != 0)
1231 		goto out_unlock;
1232 
1233 	dev_put(dev);
1234 
1235 	return len;
1236 
1237 out_free:
1238 	kfree_skb(skb);
1239 out_unlock:
1240 	if (dev)
1241 		dev_put(dev);
1242 out:
1243 	return err;
1244 }
1245 
1246 static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
1247 		struct msghdr *msg, size_t len)
1248 {
1249 	struct sock *sk = sock->sk;
1250 	struct packet_sock *po = pkt_sk(sk);
1251 	if (po->tx_ring.pg_vec)
1252 		return tpacket_snd(po, msg);
1253 	else
1254 		return packet_snd(sock, msg, len);
1255 }
1256 
1257 /*
1258  *	Close a PACKET socket. This is fairly simple. We immediately go
1259  *	to 'closed' state and remove our protocol entry in the device list.
1260  */
1261 
1262 static int packet_release(struct socket *sock)
1263 {
1264 	struct sock *sk = sock->sk;
1265 	struct packet_sock *po;
1266 	struct net *net;
1267 	struct tpacket_req req;
1268 
1269 	if (!sk)
1270 		return 0;
1271 
1272 	net = sock_net(sk);
1273 	po = pkt_sk(sk);
1274 
1275 	spin_lock_bh(&net->packet.sklist_lock);
1276 	sk_del_node_init_rcu(sk);
1277 	sock_prot_inuse_add(net, sk->sk_prot, -1);
1278 	spin_unlock_bh(&net->packet.sklist_lock);
1279 
1280 	spin_lock(&po->bind_lock);
1281 	if (po->running) {
1282 		/*
1283 		 * Remove from protocol table
1284 		 */
1285 		po->running = 0;
1286 		po->num = 0;
1287 		__dev_remove_pack(&po->prot_hook);
1288 		__sock_put(sk);
1289 	}
1290 	spin_unlock(&po->bind_lock);
1291 
1292 	packet_flush_mclist(sk);
1293 
1294 	memset(&req, 0, sizeof(req));
1295 
1296 	if (po->rx_ring.pg_vec)
1297 		packet_set_ring(sk, &req, 1, 0);
1298 
1299 	if (po->tx_ring.pg_vec)
1300 		packet_set_ring(sk, &req, 1, 1);
1301 
1302 	synchronize_net();
1303 	/*
1304 	 *	Now the socket is dead. No more input will appear.
1305 	 */
1306 	sock_orphan(sk);
1307 	sock->sk = NULL;
1308 
1309 	/* Purge queues */
1310 
1311 	skb_queue_purge(&sk->sk_receive_queue);
1312 	sk_refcnt_debug_release(sk);
1313 
1314 	sock_put(sk);
1315 	return 0;
1316 }
1317 
1318 /*
1319  *	Attach a packet hook.
1320  */
1321 
1322 static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
1323 {
1324 	struct packet_sock *po = pkt_sk(sk);
1325 	/*
1326 	 *	Detach an existing hook if present.
1327 	 */
1328 
1329 	lock_sock(sk);
1330 
1331 	spin_lock(&po->bind_lock);
1332 	if (po->running) {
1333 		__sock_put(sk);
1334 		po->running = 0;
1335 		po->num = 0;
1336 		spin_unlock(&po->bind_lock);
1337 		dev_remove_pack(&po->prot_hook);
1338 		spin_lock(&po->bind_lock);
1339 	}
1340 
1341 	po->num = protocol;
1342 	po->prot_hook.type = protocol;
1343 	po->prot_hook.dev = dev;
1344 
1345 	po->ifindex = dev ? dev->ifindex : 0;
1346 
1347 	if (protocol == 0)
1348 		goto out_unlock;
1349 
1350 	if (!dev || (dev->flags & IFF_UP)) {
1351 		dev_add_pack(&po->prot_hook);
1352 		sock_hold(sk);
1353 		po->running = 1;
1354 	} else {
1355 		sk->sk_err = ENETDOWN;
1356 		if (!sock_flag(sk, SOCK_DEAD))
1357 			sk->sk_error_report(sk);
1358 	}
1359 
1360 out_unlock:
1361 	spin_unlock(&po->bind_lock);
1362 	release_sock(sk);
1363 	return 0;
1364 }
1365 
1366 /*
1367  *	Bind a packet socket to a device
1368  */
1369 
1370 static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
1371 			    int addr_len)
1372 {
1373 	struct sock *sk = sock->sk;
1374 	char name[15];
1375 	struct net_device *dev;
1376 	int err = -ENODEV;
1377 
1378 	/*
1379 	 *	Check legality
1380 	 */
1381 
1382 	if (addr_len != sizeof(struct sockaddr))
1383 		return -EINVAL;
1384 	strlcpy(name, uaddr->sa_data, sizeof(name));
1385 
1386 	dev = dev_get_by_name(sock_net(sk), name);
1387 	if (dev) {
1388 		err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
1389 		dev_put(dev);
1390 	}
1391 	return err;
1392 }
1393 
1394 static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1395 {
1396 	struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
1397 	struct sock *sk = sock->sk;
1398 	struct net_device *dev = NULL;
1399 	int err;
1400 
1401 
1402 	/*
1403 	 *	Check legality
1404 	 */
1405 
1406 	if (addr_len < sizeof(struct sockaddr_ll))
1407 		return -EINVAL;
1408 	if (sll->sll_family != AF_PACKET)
1409 		return -EINVAL;
1410 
1411 	if (sll->sll_ifindex) {
1412 		err = -ENODEV;
1413 		dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
1414 		if (dev == NULL)
1415 			goto out;
1416 	}
1417 	err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
1418 	if (dev)
1419 		dev_put(dev);
1420 
1421 out:
1422 	return err;
1423 }
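
/*
 * Illustrative sketch (not part of this file): a typical user-space bind to a
 * single interface, which lands in packet_do_bind() above.  Only sll_family,
 * sll_protocol and sll_ifindex are used here; the interface name is a
 * placeholder and error handling is omitted.
 *
 *	struct sockaddr_ll sll = {
 *		.sll_family   = AF_PACKET,
 *		.sll_protocol = htons(ETH_P_ALL),
 *		.sll_ifindex  = if_nametoindex("eth0"),
 *	};
 *	bind(fd, (struct sockaddr *)&sll, sizeof(sll));
 */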
1424 
1425 static struct proto packet_proto = {
1426 	.name	  = "PACKET",
1427 	.owner	  = THIS_MODULE,
1428 	.obj_size = sizeof(struct packet_sock),
1429 };
1430 
1431 /*
1432  *	Create a packet socket (SOCK_DGRAM, SOCK_RAW or SOCK_PACKET).
1433  */
1434 
1435 static int packet_create(struct net *net, struct socket *sock, int protocol,
1436 			 int kern)
1437 {
1438 	struct sock *sk;
1439 	struct packet_sock *po;
1440 	__be16 proto = (__force __be16)protocol; /* weird, but documented */
1441 	int err;
1442 
1443 	if (!capable(CAP_NET_RAW))
1444 		return -EPERM;
1445 	if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
1446 	    sock->type != SOCK_PACKET)
1447 		return -ESOCKTNOSUPPORT;
1448 
1449 	sock->state = SS_UNCONNECTED;
1450 
1451 	err = -ENOBUFS;
1452 	sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
1453 	if (sk == NULL)
1454 		goto out;
1455 
1456 	sock->ops = &packet_ops;
1457 	if (sock->type == SOCK_PACKET)
1458 		sock->ops = &packet_ops_spkt;
1459 
1460 	sock_init_data(sock, sk);
1461 
1462 	po = pkt_sk(sk);
1463 	sk->sk_family = PF_PACKET;
1464 	po->num = proto;
1465 
1466 	sk->sk_destruct = packet_sock_destruct;
1467 	sk_refcnt_debug_inc(sk);
1468 
1469 	/*
1470 	 *	Attach a protocol block
1471 	 */
1472 
1473 	spin_lock_init(&po->bind_lock);
1474 	mutex_init(&po->pg_vec_lock);
1475 	po->prot_hook.func = packet_rcv;
1476 
1477 	if (sock->type == SOCK_PACKET)
1478 		po->prot_hook.func = packet_rcv_spkt;
1479 
1480 	po->prot_hook.af_packet_priv = sk;
1481 
1482 	if (proto) {
1483 		po->prot_hook.type = proto;
1484 		dev_add_pack(&po->prot_hook);
1485 		sock_hold(sk);
1486 		po->running = 1;
1487 	}
1488 
1489 	spin_lock_bh(&net->packet.sklist_lock);
1490 	sk_add_node_rcu(sk, &net->packet.sklist);
1491 	sock_prot_inuse_add(net, &packet_proto, 1);
1492 	spin_unlock_bh(&net->packet.sklist_lock);
1493 
1494 	return 0;
1495 out:
1496 	return err;
1497 }
1498 
1499 static int packet_recv_error(struct sock *sk, struct msghdr *msg, int len)
1500 {
1501 	struct sock_exterr_skb *serr;
1502 	struct sk_buff *skb, *skb2;
1503 	int copied, err;
1504 
1505 	err = -EAGAIN;
1506 	skb = skb_dequeue(&sk->sk_error_queue);
1507 	if (skb == NULL)
1508 		goto out;
1509 
1510 	copied = skb->len;
1511 	if (copied > len) {
1512 		msg->msg_flags |= MSG_TRUNC;
1513 		copied = len;
1514 	}
1515 	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
1516 	if (err)
1517 		goto out_free_skb;
1518 
1519 	sock_recv_timestamp(msg, sk, skb);
1520 
1521 	serr = SKB_EXT_ERR(skb);
1522 	put_cmsg(msg, SOL_PACKET, PACKET_TX_TIMESTAMP,
1523 		 sizeof(serr->ee), &serr->ee);
1524 
1525 	msg->msg_flags |= MSG_ERRQUEUE;
1526 	err = copied;
1527 
1528 	/* Reset and regenerate socket error */
1529 	spin_lock_bh(&sk->sk_error_queue.lock);
1530 	sk->sk_err = 0;
1531 	if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) {
1532 		sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno;
1533 		spin_unlock_bh(&sk->sk_error_queue.lock);
1534 		sk->sk_error_report(sk);
1535 	} else
1536 		spin_unlock_bh(&sk->sk_error_queue.lock);
1537 
1538 out_free_skb:
1539 	kfree_skb(skb);
1540 out:
1541 	return err;
1542 }
1543 
1544 /*
1545  *	Pull a packet from our receive queue and hand it to the user.
1546  *	If necessary we block.
1547  */
1548 
1549 static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
1550 			  struct msghdr *msg, size_t len, int flags)
1551 {
1552 	struct sock *sk = sock->sk;
1553 	struct sk_buff *skb;
1554 	int copied, err;
1555 	struct sockaddr_ll *sll;
1556 	int vnet_hdr_len = 0;
1557 
1558 	err = -EINVAL;
1559 	if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
1560 		goto out;
1561 
1562 #if 0
1563 	/* What error should we return now? EUNATTACH? */
1564 	if (pkt_sk(sk)->ifindex < 0)
1565 		return -ENODEV;
1566 #endif
1567 
1568 	if (flags & MSG_ERRQUEUE) {
1569 		err = packet_recv_error(sk, msg, len);
1570 		goto out;
1571 	}
1572 
1573 	/*
1574 	 *	Call the generic datagram receiver. This handles all sorts
1575 	 *	of horrible races and re-entrancy so we can forget about it
1576 	 *	in the protocol layers.
1577 	 *
1578 	 *	It will now return ENETDOWN if the device has just gone down,
1579 	 *	but then it will block.
1580 	 */
1581 
1582 	skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
1583 
1584 	/*
1585 	 *	If an error occurred, return it. Because skb_recv_datagram()
1586 	 *	handles the blocking, we don't need to see or worry about
1587 	 *	blocking retries.
1588 	 */
1589 
1590 	if (skb == NULL)
1591 		goto out;
1592 
1593 	if (pkt_sk(sk)->has_vnet_hdr) {
1594 		struct virtio_net_hdr vnet_hdr = { 0 };
1595 
1596 		err = -EINVAL;
1597 		vnet_hdr_len = sizeof(vnet_hdr);
1598 		if ((len -= vnet_hdr_len) < 0)
1599 			goto out_free;
1600 
1601 		if (skb_is_gso(skb)) {
1602 			struct skb_shared_info *sinfo = skb_shinfo(skb);
1603 
1604 			/* This is a hint as to how much should be linear. */
1605 			vnet_hdr.hdr_len = skb_headlen(skb);
1606 			vnet_hdr.gso_size = sinfo->gso_size;
1607 			if (sinfo->gso_type & SKB_GSO_TCPV4)
1608 				vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
1609 			else if (sinfo->gso_type & SKB_GSO_TCPV6)
1610 				vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
1611 			else if (sinfo->gso_type & SKB_GSO_UDP)
1612 				vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP;
1613 			else if (sinfo->gso_type & SKB_GSO_FCOE)
1614 				goto out_free;
1615 			else
1616 				BUG();
1617 			if (sinfo->gso_type & SKB_GSO_TCP_ECN)
1618 				vnet_hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
1619 		} else
1620 			vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;
1621 
1622 		if (skb->ip_summed == CHECKSUM_PARTIAL) {
1623 			vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
1624 			vnet_hdr.csum_start = skb->csum_start -
1625 							skb_headroom(skb);
1626 			vnet_hdr.csum_offset = skb->csum_offset;
1627 		} /* else everything is zero */
1628 
1629 		err = memcpy_toiovec(msg->msg_iov, (void *)&vnet_hdr,
1630 				     vnet_hdr_len);
1631 		if (err < 0)
1632 			goto out_free;
1633 	}
1634 
1635 	/*
1636 	 *	If the address length field is there to be filled in, we fill
1637 	 *	it in now.
1638 	 */
1639 
1640 	sll = &PACKET_SKB_CB(skb)->sa.ll;
1641 	if (sock->type == SOCK_PACKET)
1642 		msg->msg_namelen = sizeof(struct sockaddr_pkt);
1643 	else
1644 		msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);
1645 
1646 	/*
1647 	 *	You lose any data beyond the buffer you gave. If this worries a
1648 	 *	user program, it can always ask the device for its MTU.
1649 	 */
1650 
1651 	copied = skb->len;
1652 	if (copied > len) {
1653 		copied = len;
1654 		msg->msg_flags |= MSG_TRUNC;
1655 	}
1656 
1657 	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
1658 	if (err)
1659 		goto out_free;
1660 
1661 	sock_recv_ts_and_drops(msg, sk, skb);
1662 
1663 	if (msg->msg_name)
1664 		memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
1665 		       msg->msg_namelen);
1666 
1667 	if (pkt_sk(sk)->auxdata) {
1668 		struct tpacket_auxdata aux;
1669 
1670 		aux.tp_status = TP_STATUS_USER;
1671 		if (skb->ip_summed == CHECKSUM_PARTIAL)
1672 			aux.tp_status |= TP_STATUS_CSUMNOTREADY;
1673 		aux.tp_len = PACKET_SKB_CB(skb)->origlen;
1674 		aux.tp_snaplen = skb->len;
1675 		aux.tp_mac = 0;
1676 		aux.tp_net = skb_network_offset(skb);
1677 		aux.tp_vlan_tci = vlan_tx_tag_get(skb);
1678 
1679 		put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
1680 	}
1681 
1682 	/*
1683 	 *	Free or return the buffer as appropriate. Again this
1684 	 *	hides all the races and re-entrancy issues from us.
1685 	 */
1686 	err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
1687 
1688 out_free:
1689 	skb_free_datagram(sk, skb);
1690 out:
1691 	return err;
1692 }
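
/*
 * Illustrative sketch (not part of this file): receiving the PACKET_AUXDATA
 * control message that the code above emits when the socket option is set.
 * Assumes <sys/socket.h> and <linux/if_packet.h>; the buffer sizes are
 * placeholders and error handling is omitted.
 *
 *	char data[2048], ctrl[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
 *	struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
 *	struct msghdr msg = {
 *		.msg_iov        = &iov,
 *		.msg_iovlen     = 1,
 *		.msg_control    = ctrl,
 *		.msg_controllen = sizeof(ctrl),
 *	};
 *	struct cmsghdr *cmsg;
 *
 *	recvmsg(fd, &msg, 0);
 *	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
 *		if (cmsg->cmsg_level == SOL_PACKET &&
 *		    cmsg->cmsg_type == PACKET_AUXDATA) {
 *			struct tpacket_auxdata *aux = (void *)CMSG_DATA(cmsg);
 *			... use aux->tp_len, aux->tp_snaplen, aux->tp_vlan_tci
 *		}
 *	}
 */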
1693 
1694 static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
1695 			       int *uaddr_len, int peer)
1696 {
1697 	struct net_device *dev;
1698 	struct sock *sk	= sock->sk;
1699 
1700 	if (peer)
1701 		return -EOPNOTSUPP;
1702 
1703 	uaddr->sa_family = AF_PACKET;
1704 	rcu_read_lock();
1705 	dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
1706 	if (dev)
1707 		strlcpy(uaddr->sa_data, dev->name, 15);
1708 	else
1709 		memset(uaddr->sa_data, 0, 14);
1710 	rcu_read_unlock();
1711 	*uaddr_len = sizeof(*uaddr);
1712 
1713 	return 0;
1714 }
1715 
1716 static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
1717 			  int *uaddr_len, int peer)
1718 {
1719 	struct net_device *dev;
1720 	struct sock *sk = sock->sk;
1721 	struct packet_sock *po = pkt_sk(sk);
1722 	DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
1723 
1724 	if (peer)
1725 		return -EOPNOTSUPP;
1726 
1727 	sll->sll_family = AF_PACKET;
1728 	sll->sll_ifindex = po->ifindex;
1729 	sll->sll_protocol = po->num;
1730 	rcu_read_lock();
1731 	dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
1732 	if (dev) {
1733 		sll->sll_hatype = dev->type;
1734 		sll->sll_halen = dev->addr_len;
1735 		memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1736 	} else {
1737 		sll->sll_hatype = 0;	/* Bad: we have no ARPHRD_UNSPEC */
1738 		sll->sll_halen = 0;
1739 	}
1740 	rcu_read_unlock();
1741 	*uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1742 
1743 	return 0;
1744 }
1745 
1746 static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
1747 			 int what)
1748 {
1749 	switch (i->type) {
1750 	case PACKET_MR_MULTICAST:
1751 		if (i->alen != dev->addr_len)
1752 			return -EINVAL;
1753 		if (what > 0)
1754 			return dev_mc_add(dev, i->addr);
1755 		else
1756 			return dev_mc_del(dev, i->addr);
1757 		break;
1758 	case PACKET_MR_PROMISC:
1759 		return dev_set_promiscuity(dev, what);
1760 		break;
1761 	case PACKET_MR_ALLMULTI:
1762 		return dev_set_allmulti(dev, what);
1763 		break;
1764 	case PACKET_MR_UNICAST:
1765 		if (i->alen != dev->addr_len)
1766 			return -EINVAL;
1767 		if (what > 0)
1768 			return dev_uc_add(dev, i->addr);
1769 		else
1770 			return dev_uc_del(dev, i->addr);
1771 		break;
1772 	default:
1773 		break;
1774 	}
1775 	return 0;
1776 }
1777 
1778 static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
1779 {
1780 	for ( ; i; i = i->next) {
1781 		if (i->ifindex == dev->ifindex)
1782 			packet_dev_mc(dev, i, what);
1783 	}
1784 }
1785 
1786 static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1787 {
1788 	struct packet_sock *po = pkt_sk(sk);
1789 	struct packet_mclist *ml, *i;
1790 	struct net_device *dev;
1791 	int err;
1792 
1793 	rtnl_lock();
1794 
1795 	err = -ENODEV;
1796 	dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1797 	if (!dev)
1798 		goto done;
1799 
1800 	err = -EINVAL;
1801 	if (mreq->mr_alen > dev->addr_len)
1802 		goto done;
1803 
1804 	err = -ENOBUFS;
1805 	i = kmalloc(sizeof(*i), GFP_KERNEL);
1806 	if (i == NULL)
1807 		goto done;
1808 
1809 	err = 0;
1810 	for (ml = po->mclist; ml; ml = ml->next) {
1811 		if (ml->ifindex == mreq->mr_ifindex &&
1812 		    ml->type == mreq->mr_type &&
1813 		    ml->alen == mreq->mr_alen &&
1814 		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1815 			ml->count++;
1816 			/* Free the new element ... */
1817 			kfree(i);
1818 			goto done;
1819 		}
1820 	}
1821 
1822 	i->type = mreq->mr_type;
1823 	i->ifindex = mreq->mr_ifindex;
1824 	i->alen = mreq->mr_alen;
1825 	memcpy(i->addr, mreq->mr_address, i->alen);
1826 	i->count = 1;
1827 	i->next = po->mclist;
1828 	po->mclist = i;
1829 	err = packet_dev_mc(dev, i, 1);
1830 	if (err) {
1831 		po->mclist = i->next;
1832 		kfree(i);
1833 	}
1834 
1835 done:
1836 	rtnl_unlock();
1837 	return err;
1838 }
1839 
1840 static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1841 {
1842 	struct packet_mclist *ml, **mlp;
1843 
1844 	rtnl_lock();
1845 
1846 	for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
1847 		if (ml->ifindex == mreq->mr_ifindex &&
1848 		    ml->type == mreq->mr_type &&
1849 		    ml->alen == mreq->mr_alen &&
1850 		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1851 			if (--ml->count == 0) {
1852 				struct net_device *dev;
1853 				*mlp = ml->next;
1854 				dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
1855 				if (dev)
1856 					packet_dev_mc(dev, ml, -1);
1857 				kfree(ml);
1858 			}
1859 			rtnl_unlock();
1860 			return 0;
1861 		}
1862 	}
1863 	rtnl_unlock();
1864 	return -EADDRNOTAVAIL;
1865 }
1866 
1867 static void packet_flush_mclist(struct sock *sk)
1868 {
1869 	struct packet_sock *po = pkt_sk(sk);
1870 	struct packet_mclist *ml;
1871 
1872 	if (!po->mclist)
1873 		return;
1874 
1875 	rtnl_lock();
1876 	while ((ml = po->mclist) != NULL) {
1877 		struct net_device *dev;
1878 
1879 		po->mclist = ml->next;
1880 		dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
1881 		if (dev != NULL)
1882 			packet_dev_mc(dev, ml, -1);
1883 		kfree(ml);
1884 	}
1885 	rtnl_unlock();
1886 }
1887 
1888 static int
1889 packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
1890 {
1891 	struct sock *sk = sock->sk;
1892 	struct packet_sock *po = pkt_sk(sk);
1893 	int ret;
1894 
1895 	if (level != SOL_PACKET)
1896 		return -ENOPROTOOPT;
1897 
1898 	switch (optname) {
1899 	case PACKET_ADD_MEMBERSHIP:
1900 	case PACKET_DROP_MEMBERSHIP:
1901 	{
1902 		struct packet_mreq_max mreq;
1903 		int len = optlen;
1904 		memset(&mreq, 0, sizeof(mreq));
1905 		if (len < sizeof(struct packet_mreq))
1906 			return -EINVAL;
1907 		if (len > sizeof(mreq))
1908 			len = sizeof(mreq);
1909 		if (copy_from_user(&mreq, optval, len))
1910 			return -EFAULT;
1911 		if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
1912 			return -EINVAL;
1913 		if (optname == PACKET_ADD_MEMBERSHIP)
1914 			ret = packet_mc_add(sk, &mreq);
1915 		else
1916 			ret = packet_mc_drop(sk, &mreq);
1917 		return ret;
1918 	}
1919 
1920 	case PACKET_RX_RING:
1921 	case PACKET_TX_RING:
1922 	{
1923 		struct tpacket_req req;
1924 
1925 		if (optlen < sizeof(req))
1926 			return -EINVAL;
1927 		if (pkt_sk(sk)->has_vnet_hdr)
1928 			return -EINVAL;
1929 		if (copy_from_user(&req, optval, sizeof(req)))
1930 			return -EFAULT;
1931 		return packet_set_ring(sk, &req, 0, optname == PACKET_TX_RING);
1932 	}
1933 	case PACKET_COPY_THRESH:
1934 	{
1935 		int val;
1936 
1937 		if (optlen != sizeof(val))
1938 			return -EINVAL;
1939 		if (copy_from_user(&val, optval, sizeof(val)))
1940 			return -EFAULT;
1941 
1942 		pkt_sk(sk)->copy_thresh = val;
1943 		return 0;
1944 	}
1945 	case PACKET_VERSION:
1946 	{
1947 		int val;
1948 
1949 		if (optlen != sizeof(val))
1950 			return -EINVAL;
1951 		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1952 			return -EBUSY;
1953 		if (copy_from_user(&val, optval, sizeof(val)))
1954 			return -EFAULT;
1955 		switch (val) {
1956 		case TPACKET_V1:
1957 		case TPACKET_V2:
1958 			po->tp_version = val;
1959 			return 0;
1960 		default:
1961 			return -EINVAL;
1962 		}
1963 	}
1964 	case PACKET_RESERVE:
1965 	{
1966 		unsigned int val;
1967 
1968 		if (optlen != sizeof(val))
1969 			return -EINVAL;
1970 		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1971 			return -EBUSY;
1972 		if (copy_from_user(&val, optval, sizeof(val)))
1973 			return -EFAULT;
1974 		po->tp_reserve = val;
1975 		return 0;
1976 	}
1977 	case PACKET_LOSS:
1978 	{
1979 		unsigned int val;
1980 
1981 		if (optlen != sizeof(val))
1982 			return -EINVAL;
1983 		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1984 			return -EBUSY;
1985 		if (copy_from_user(&val, optval, sizeof(val)))
1986 			return -EFAULT;
1987 		po->tp_loss = !!val;
1988 		return 0;
1989 	}
1990 	case PACKET_AUXDATA:
1991 	{
1992 		int val;
1993 
1994 		if (optlen < sizeof(val))
1995 			return -EINVAL;
1996 		if (copy_from_user(&val, optval, sizeof(val)))
1997 			return -EFAULT;
1998 
1999 		po->auxdata = !!val;
2000 		return 0;
2001 	}
2002 	case PACKET_ORIGDEV:
2003 	{
2004 		int val;
2005 
2006 		if (optlen < sizeof(val))
2007 			return -EINVAL;
2008 		if (copy_from_user(&val, optval, sizeof(val)))
2009 			return -EFAULT;
2010 
2011 		po->origdev = !!val;
2012 		return 0;
2013 	}
2014 	case PACKET_VNET_HDR:
2015 	{
2016 		int val;
2017 
2018 		if (sock->type != SOCK_RAW)
2019 			return -EINVAL;
2020 		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
2021 			return -EBUSY;
2022 		if (optlen < sizeof(val))
2023 			return -EINVAL;
2024 		if (copy_from_user(&val, optval, sizeof(val)))
2025 			return -EFAULT;
2026 
2027 		po->has_vnet_hdr = !!val;
2028 		return 0;
2029 	}
2030 	default:
2031 		return -ENOPROTOOPT;
2032 	}
2033 }
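
/*
 * Illustrative sketch (not part of this file): setting up an RX ring through
 * the PACKET_VERSION and PACKET_RX_RING cases above, then mapping it with
 * mmap().  The geometry below is a placeholder (tp_frame_nr must equal
 * tp_block_nr * (tp_block_size / tp_frame_size)); error handling is omitted.
 *
 *	int ver = TPACKET_V2;
 *	struct tpacket_req req = {
 *		.tp_block_size = 4096,
 *		.tp_block_nr   = 64,
 *		.tp_frame_size = 2048,
 *		.tp_frame_nr   = 128,
 *	};
 *
 *	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *	ring = mmap(NULL, (size_t)req.tp_block_size * req.tp_block_nr,
 *		    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 */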
2034 
2035 static int packet_getsockopt(struct socket *sock, int level, int optname,
2036 			     char __user *optval, int __user *optlen)
2037 {
2038 	int len;
2039 	int val;
2040 	struct sock *sk = sock->sk;
2041 	struct packet_sock *po = pkt_sk(sk);
2042 	void *data;
2043 	struct tpacket_stats st;
2044 
2045 	if (level != SOL_PACKET)
2046 		return -ENOPROTOOPT;
2047 
2048 	if (get_user(len, optlen))
2049 		return -EFAULT;
2050 
2051 	if (len < 0)
2052 		return -EINVAL;
2053 
2054 	switch (optname) {
2055 	case PACKET_STATISTICS:
2056 		if (len > sizeof(struct tpacket_stats))
2057 			len = sizeof(struct tpacket_stats);
2058 		spin_lock_bh(&sk->sk_receive_queue.lock);
2059 		st = po->stats;
2060 		memset(&po->stats, 0, sizeof(st));
2061 		spin_unlock_bh(&sk->sk_receive_queue.lock);
2062 		st.tp_packets += st.tp_drops;
2063 
2064 		data = &st;
2065 		break;
2066 	case PACKET_AUXDATA:
2067 		if (len > sizeof(int))
2068 			len = sizeof(int);
2069 		val = po->auxdata;
2070 
2071 		data = &val;
2072 		break;
2073 	case PACKET_ORIGDEV:
2074 		if (len > sizeof(int))
2075 			len = sizeof(int);
2076 		val = po->origdev;
2077 
2078 		data = &val;
2079 		break;
2080 	case PACKET_VNET_HDR:
2081 		if (len > sizeof(int))
2082 			len = sizeof(int);
2083 		val = po->has_vnet_hdr;
2084 
2085 		data = &val;
2086 		break;
2087 	case PACKET_VERSION:
2088 		if (len > sizeof(int))
2089 			len = sizeof(int);
2090 		val = po->tp_version;
2091 		data = &val;
2092 		break;
2093 	case PACKET_HDRLEN:
2094 		if (len > sizeof(int))
2095 			len = sizeof(int);
2096 		if (copy_from_user(&val, optval, len))
2097 			return -EFAULT;
2098 		switch (val) {
2099 		case TPACKET_V1:
2100 			val = sizeof(struct tpacket_hdr);
2101 			break;
2102 		case TPACKET_V2:
2103 			val = sizeof(struct tpacket2_hdr);
2104 			break;
2105 		default:
2106 			return -EINVAL;
2107 		}
2108 		data = &val;
2109 		break;
2110 	case PACKET_RESERVE:
2111 		if (len > sizeof(unsigned int))
2112 			len = sizeof(unsigned int);
2113 		val = po->tp_reserve;
2114 		data = &val;
2115 		break;
2116 	case PACKET_LOSS:
2117 		if (len > sizeof(unsigned int))
2118 			len = sizeof(unsigned int);
2119 		val = po->tp_loss;
2120 		data = &val;
2121 		break;
2122 	default:
2123 		return -ENOPROTOOPT;
2124 	}
2125 
2126 	if (put_user(len, optlen))
2127 		return -EFAULT;
2128 	if (copy_to_user(optval, data, len))
2129 		return -EFAULT;
2130 	return 0;
2131 }
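
/*
 * PACKET_HDRLEN above is a two-way option: userspace writes the ring version
 * into optval and reads the matching per-frame header length back through
 * the same buffer.  A minimal sketch, assuming an open PF_PACKET socket fd:
 *
 *	int hdrlen = TPACKET_V2;
 *	socklen_t len = sizeof(hdrlen);
 *
 *	if (getsockopt(fd, SOL_PACKET, PACKET_HDRLEN, &hdrlen, &len) == 0)
 *		printf("tpacket2_hdr length: %d\n", hdrlen);
 */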
2132 
2133 
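/* React to device state changes: drop the protocol hook and raise ENETDOWN
 * when the bound device goes down or unregisters, and re-attach the hook
 * when the device comes back up.
 */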
2134 static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
2135 {
2136 	struct sock *sk;
2137 	struct hlist_node *node;
2138 	struct net_device *dev = data;
2139 	struct net *net = dev_net(dev);
2140 
2141 	rcu_read_lock();
2142 	sk_for_each_rcu(sk, node, &net->packet.sklist) {
2143 		struct packet_sock *po = pkt_sk(sk);
2144 
2145 		switch (msg) {
2146 		case NETDEV_UNREGISTER:
2147 			if (po->mclist)
2148 				packet_dev_mclist(dev, po->mclist, -1);
2149 			/* fallthrough */
2150 
2151 		case NETDEV_DOWN:
2152 			if (dev->ifindex == po->ifindex) {
2153 				spin_lock(&po->bind_lock);
2154 				if (po->running) {
2155 					__dev_remove_pack(&po->prot_hook);
2156 					__sock_put(sk);
2157 					po->running = 0;
2158 					sk->sk_err = ENETDOWN;
2159 					if (!sock_flag(sk, SOCK_DEAD))
2160 						sk->sk_error_report(sk);
2161 				}
2162 				if (msg == NETDEV_UNREGISTER) {
2163 					po->ifindex = -1;
2164 					po->prot_hook.dev = NULL;
2165 				}
2166 				spin_unlock(&po->bind_lock);
2167 			}
2168 			break;
2169 		case NETDEV_UP:
2170 			if (dev->ifindex == po->ifindex) {
2171 				spin_lock(&po->bind_lock);
2172 				if (po->num && !po->running) {
2173 					dev_add_pack(&po->prot_hook);
2174 					sock_hold(sk);
2175 					po->running = 1;
2176 				}
2177 				spin_unlock(&po->bind_lock);
2178 			}
2179 			break;
2180 		}
2181 	}
2182 	rcu_read_unlock();
2183 	return NOTIFY_DONE;
2184 }
2185 
2186 
2187 static int packet_ioctl(struct socket *sock, unsigned int cmd,
2188 			unsigned long arg)
2189 {
2190 	struct sock *sk = sock->sk;
2191 
2192 	switch (cmd) {
2193 	case SIOCOUTQ:
2194 	{
2195 		int amount = sk_wmem_alloc_get(sk);
2196 
2197 		return put_user(amount, (int __user *)arg);
2198 	}
2199 	case SIOCINQ:
2200 	{
2201 		struct sk_buff *skb;
2202 		int amount = 0;
2203 
2204 		spin_lock_bh(&sk->sk_receive_queue.lock);
2205 		skb = skb_peek(&sk->sk_receive_queue);
2206 		if (skb)
2207 			amount = skb->len;
2208 		spin_unlock_bh(&sk->sk_receive_queue.lock);
2209 		return put_user(amount, (int __user *)arg);
2210 	}
2211 	case SIOCGSTAMP:
2212 		return sock_get_timestamp(sk, (struct timeval __user *)arg);
2213 	case SIOCGSTAMPNS:
2214 		return sock_get_timestampns(sk, (struct timespec __user *)arg);
2215 
2216 #ifdef CONFIG_INET
2217 	case SIOCADDRT:
2218 	case SIOCDELRT:
2219 	case SIOCDARP:
2220 	case SIOCGARP:
2221 	case SIOCSARP:
2222 	case SIOCGIFADDR:
2223 	case SIOCSIFADDR:
2224 	case SIOCGIFBRDADDR:
2225 	case SIOCSIFBRDADDR:
2226 	case SIOCGIFNETMASK:
2227 	case SIOCSIFNETMASK:
2228 	case SIOCGIFDSTADDR:
2229 	case SIOCSIFDSTADDR:
2230 	case SIOCSIFFLAGS:
2231 		return inet_dgram_ops.ioctl(sock, cmd, arg);
2232 #endif
2233 
2234 	default:
2235 		return -ENOIOCTLCMD;
2236 	}
2237 	return 0;
2238 }
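
/*
 * As implemented above, SIOCINQ reports the length of the packet at the head
 * of the receive queue (not the total of all queued bytes), and SIOCOUTQ
 * reports the outstanding transmit allocation.  A minimal userspace sketch:
 *
 *	int next_len = 0;
 *
 *	if (ioctl(fd, SIOCINQ, &next_len) == 0)
 *		printf("next packet is %d bytes\n", next_len);
 */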
2239 
2240 static unsigned int packet_poll(struct file *file, struct socket *sock,
2241 				poll_table *wait)
2242 {
2243 	struct sock *sk = sock->sk;
2244 	struct packet_sock *po = pkt_sk(sk);
2245 	unsigned int mask = datagram_poll(file, sock, wait);
2246 
2247 	spin_lock_bh(&sk->sk_receive_queue.lock);
2248 	if (po->rx_ring.pg_vec) {
2249 		if (!packet_previous_frame(po, &po->rx_ring, TP_STATUS_KERNEL))
2250 			mask |= POLLIN | POLLRDNORM;
2251 	}
2252 	spin_unlock_bh(&sk->sk_receive_queue.lock);
2253 	spin_lock_bh(&sk->sk_write_queue.lock);
2254 	if (po->tx_ring.pg_vec) {
2255 		if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
2256 			mask |= POLLOUT | POLLWRNORM;
2257 	}
2258 	spin_unlock_bh(&sk->sk_write_queue.lock);
2259 	return mask;
2260 }
2261 
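/*
 * With a ring mapped, poll() is the intended wakeup path: the handler above
 * reports POLLIN/POLLRDNORM once the previous RX frame is no longer owned by
 * the kernel, and POLLOUT/POLLWRNORM while a TX slot is available.  A minimal
 * sketch of the matching userspace loop:
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLERR };
 *
 *	while (poll(&pfd, 1, -1) >= 0)
 *		walk_rx_ring();
 *
 * where walk_rx_ring() (hypothetical) consumes frames whose tp_status has
 * TP_STATUS_USER set and hands them back by storing TP_STATUS_KERNEL.
 */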
2262 
2263 /* Dirty? Well, I still have not learned a better way to account
2264  * for user mmaps.
2265  */
2266 
2267 static void packet_mm_open(struct vm_area_struct *vma)
2268 {
2269 	struct file *file = vma->vm_file;
2270 	struct socket *sock = file->private_data;
2271 	struct sock *sk = sock->sk;
2272 
2273 	if (sk)
2274 		atomic_inc(&pkt_sk(sk)->mapped);
2275 }
2276 
2277 static void packet_mm_close(struct vm_area_struct *vma)
2278 {
2279 	struct file *file = vma->vm_file;
2280 	struct socket *sock = file->private_data;
2281 	struct sock *sk = sock->sk;
2282 
2283 	if (sk)
2284 		atomic_dec(&pkt_sk(sk)->mapped);
2285 }
2286 
2287 static const struct vm_operations_struct packet_mmap_ops = {
2288 	.open	=	packet_mm_open,
2289 	.close	=	packet_mm_close,
2290 };
2291 
2292 static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
2293 {
2294 	int i;
2295 
2296 	for (i = 0; i < len; i++) {
2297 		if (likely(pg_vec[i]))
2298 			free_pages((unsigned long) pg_vec[i], order);
2299 	}
2300 	kfree(pg_vec);
2301 }
2302 
2303 static inline char *alloc_one_pg_vec_page(unsigned long order)
2304 {
2305 	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | __GFP_NOWARN;
2306 
2307 	return (char *) __get_free_pages(gfp_flags, order);
2308 }
2309 
2310 static char **alloc_pg_vec(struct tpacket_req *req, int order)
2311 {
2312 	unsigned int block_nr = req->tp_block_nr;
2313 	char **pg_vec;
2314 	int i;
2315 
2316 	pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL);
2317 	if (unlikely(!pg_vec))
2318 		goto out;
2319 
2320 	for (i = 0; i < block_nr; i++) {
2321 		pg_vec[i] = alloc_one_pg_vec_page(order);
2322 		if (unlikely(!pg_vec[i]))
2323 			goto out_free_pgvec;
2324 	}
2325 
2326 out:
2327 	return pg_vec;
2328 
2329 out_free_pgvec:
2330 	free_pg_vec(pg_vec, order, block_nr);
2331 	pg_vec = NULL;
2332 	goto out;
2333 }
2334 
2335 static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
2336 		int closing, int tx_ring)
2337 {
2338 	char **pg_vec = NULL;
2339 	struct packet_sock *po = pkt_sk(sk);
2340 	int was_running, order = 0;
2341 	struct packet_ring_buffer *rb;
2342 	struct sk_buff_head *rb_queue;
2343 	__be16 num;
2344 	int err;
2345 
2346 	rb = tx_ring ? &po->tx_ring : &po->rx_ring;
2347 	rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
2348 
2349 	err = -EBUSY;
2350 	if (!closing) {
2351 		if (atomic_read(&po->mapped))
2352 			goto out;
2353 		if (atomic_read(&rb->pending))
2354 			goto out;
2355 	}
2356 
2357 	if (req->tp_block_nr) {
2358 		/* Sanity tests and some calculations */
2359 		err = -EBUSY;
2360 		if (unlikely(rb->pg_vec))
2361 			goto out;
2362 
2363 		switch (po->tp_version) {
2364 		case TPACKET_V1:
2365 			po->tp_hdrlen = TPACKET_HDRLEN;
2366 			break;
2367 		case TPACKET_V2:
2368 			po->tp_hdrlen = TPACKET2_HDRLEN;
2369 			break;
2370 		}
2371 
2372 		err = -EINVAL;
2373 		if (unlikely((int)req->tp_block_size <= 0))
2374 			goto out;
2375 		if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
2376 			goto out;
2377 		if (unlikely(req->tp_frame_size < po->tp_hdrlen +
2378 					po->tp_reserve))
2379 			goto out;
2380 		if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
2381 			goto out;
2382 
2383 		rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
2384 		if (unlikely(rb->frames_per_block <= 0))
2385 			goto out;
2386 		if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
2387 					req->tp_frame_nr))
2388 			goto out;
2389 
2390 		err = -ENOMEM;
2391 		order = get_order(req->tp_block_size);
2392 		pg_vec = alloc_pg_vec(req, order);
2393 		if (unlikely(!pg_vec))
2394 			goto out;
2395 	}
2396 	/* Done */
2397 	else {
2398 		err = -EINVAL;
2399 		if (unlikely(req->tp_frame_nr))
2400 			goto out;
2401 	}
2402 
2403 	lock_sock(sk);
2404 
2405 	/* Detach socket from network */
2406 	spin_lock(&po->bind_lock);
2407 	was_running = po->running;
2408 	num = po->num;
2409 	if (was_running) {
2410 		__dev_remove_pack(&po->prot_hook);
2411 		po->num = 0;
2412 		po->running = 0;
2413 		__sock_put(sk);
2414 	}
2415 	spin_unlock(&po->bind_lock);
2416 
2417 	synchronize_net();
2418 
2419 	err = -EBUSY;
2420 	mutex_lock(&po->pg_vec_lock);
2421 	if (closing || atomic_read(&po->mapped) == 0) {
2422 		err = 0;
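/* XC(a, b): store b in a and evaluate to a's previous value (exchange) */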
2423 #define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })
2424 		spin_lock_bh(&rb_queue->lock);
2425 		pg_vec = XC(rb->pg_vec, pg_vec);
2426 		rb->frame_max = (req->tp_frame_nr - 1);
2427 		rb->head = 0;
2428 		rb->frame_size = req->tp_frame_size;
2429 		spin_unlock_bh(&rb_queue->lock);
2430 
2431 		order = XC(rb->pg_vec_order, order);
2432 		req->tp_block_nr = XC(rb->pg_vec_len, req->tp_block_nr);
2433 
2434 		rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
2435 		po->prot_hook.func = (po->rx_ring.pg_vec) ?
2436 						tpacket_rcv : packet_rcv;
2437 		skb_queue_purge(rb_queue);
2438 #undef XC
2439 		if (atomic_read(&po->mapped))
2440 			pr_err("packet_mmap: vma is busy: %d\n",
2441 			       atomic_read(&po->mapped));
2442 	}
2443 	mutex_unlock(&po->pg_vec_lock);
2444 
2445 	spin_lock(&po->bind_lock);
2446 	if (was_running && !po->running) {
2447 		sock_hold(sk);
2448 		po->running = 1;
2449 		po->num = num;
2450 		dev_add_pack(&po->prot_hook);
2451 	}
2452 	spin_unlock(&po->bind_lock);
2453 
2454 	release_sock(sk);
2455 
2456 	if (pg_vec)
2457 		free_pg_vec(pg_vec, order, req->tp_block_nr);
2458 out:
2459 	return err;
2460 }
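
/*
 * The sanity checks above translate into this userspace contract for
 * PACKET_RX_RING / PACKET_TX_RING: tp_block_size must be a positive multiple
 * of PAGE_SIZE, tp_frame_size must be TPACKET_ALIGNMENT aligned and at least
 * tp_hdrlen + tp_reserve, and tp_frame_nr must equal
 * tp_block_nr * (tp_block_size / tp_frame_size).  A sketch with illustrative
 * sizes (4096 assumes 4 KiB pages):
 *
 *	struct tpacket_req req = {
 *		.tp_block_size	= 4096,
 *		.tp_block_nr	= 64,
 *		.tp_frame_size	= 2048,
 *		.tp_frame_nr	= 64 * (4096 / 2048),
 *	};
 *
 *	if (setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req)) < 0)
 *		perror("PACKET_RX_RING");
 */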
2461 
2462 static int packet_mmap(struct file *file, struct socket *sock,
2463 		struct vm_area_struct *vma)
2464 {
2465 	struct sock *sk = sock->sk;
2466 	struct packet_sock *po = pkt_sk(sk);
2467 	unsigned long size, expected_size;
2468 	struct packet_ring_buffer *rb;
2469 	unsigned long start;
2470 	int err = -EINVAL;
2471 	int i;
2472 
2473 	if (vma->vm_pgoff)
2474 		return -EINVAL;
2475 
2476 	mutex_lock(&po->pg_vec_lock);
2477 
2478 	expected_size = 0;
2479 	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
2480 		if (rb->pg_vec) {
2481 			expected_size += rb->pg_vec_len
2482 						* rb->pg_vec_pages
2483 						* PAGE_SIZE;
2484 		}
2485 	}
2486 
2487 	if (expected_size == 0)
2488 		goto out;
2489 
2490 	size = vma->vm_end - vma->vm_start;
2491 	if (size != expected_size)
2492 		goto out;
2493 
2494 	start = vma->vm_start;
2495 	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
2496 		if (rb->pg_vec == NULL)
2497 			continue;
2498 
2499 		for (i = 0; i < rb->pg_vec_len; i++) {
2500 			struct page *page = virt_to_page(rb->pg_vec[i]);
2501 			int pg_num;
2502 
2503 			for (pg_num = 0; pg_num < rb->pg_vec_pages;
2504 					pg_num++, page++) {
2505 				err = vm_insert_page(vma, start, page);
2506 				if (unlikely(err))
2507 					goto out;
2508 				start += PAGE_SIZE;
2509 			}
2510 		}
2511 	}
2512 
2513 	atomic_inc(&po->mapped);
2514 	vma->vm_ops = &packet_mmap_ops;
2515 	err = 0;
2516 
2517 out:
2518 	mutex_unlock(&po->pg_vec_lock);
2519 	return err;
2520 }
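
/*
 * packet_mmap() above requires vm_pgoff == 0 and a mapping length exactly
 * equal to the combined RX + TX ring size (the RX ring is mapped first).  A
 * minimal userspace sketch, assuming only an RX ring was configured with the
 * tpacket_req shown earlier:
 *
 *	size_t ring_sz = (size_t)req.tp_block_size * req.tp_block_nr;
 *	void *ring = mmap(NULL, ring_sz, PROT_READ | PROT_WRITE,
 *			  MAP_SHARED, fd, 0);
 *
 *	if (ring == MAP_FAILED)
 *		perror("mmap");
 */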
2521 
2522 static const struct proto_ops packet_ops_spkt = {
2523 	.family =	PF_PACKET,
2524 	.owner =	THIS_MODULE,
2525 	.release =	packet_release,
2526 	.bind =		packet_bind_spkt,
2527 	.connect =	sock_no_connect,
2528 	.socketpair =	sock_no_socketpair,
2529 	.accept =	sock_no_accept,
2530 	.getname =	packet_getname_spkt,
2531 	.poll =		datagram_poll,
2532 	.ioctl =	packet_ioctl,
2533 	.listen =	sock_no_listen,
2534 	.shutdown =	sock_no_shutdown,
2535 	.setsockopt =	sock_no_setsockopt,
2536 	.getsockopt =	sock_no_getsockopt,
2537 	.sendmsg =	packet_sendmsg_spkt,
2538 	.recvmsg =	packet_recvmsg,
2539 	.mmap =		sock_no_mmap,
2540 	.sendpage =	sock_no_sendpage,
2541 };
2542 
2543 static const struct proto_ops packet_ops = {
2544 	.family =	PF_PACKET,
2545 	.owner =	THIS_MODULE,
2546 	.release =	packet_release,
2547 	.bind =		packet_bind,
2548 	.connect =	sock_no_connect,
2549 	.socketpair =	sock_no_socketpair,
2550 	.accept =	sock_no_accept,
2551 	.getname =	packet_getname,
2552 	.poll =		packet_poll,
2553 	.ioctl =	packet_ioctl,
2554 	.listen =	sock_no_listen,
2555 	.shutdown =	sock_no_shutdown,
2556 	.setsockopt =	packet_setsockopt,
2557 	.getsockopt =	packet_getsockopt,
2558 	.sendmsg =	packet_sendmsg,
2559 	.recvmsg =	packet_recvmsg,
2560 	.mmap =		packet_mmap,
2561 	.sendpage =	sock_no_sendpage,
2562 };
2563 
2564 static const struct net_proto_family packet_family_ops = {
2565 	.family =	PF_PACKET,
2566 	.create =	packet_create,
2567 	.owner	=	THIS_MODULE,
2568 };
2569 
2570 static struct notifier_block packet_netdev_notifier = {
2571 	.notifier_call =	packet_notifier,
2572 };
2573 
2574 #ifdef CONFIG_PROC_FS
2575 
2576 static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
2577 	__acquires(RCU)
2578 {
2579 	struct net *net = seq_file_net(seq);
2580 
2581 	rcu_read_lock();
2582 	return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
2583 }
2584 
2585 static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2586 {
2587 	struct net *net = seq_file_net(seq);
2588 	return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
2589 }
2590 
2591 static void packet_seq_stop(struct seq_file *seq, void *v)
2592 	__releases(RCU)
2593 {
2594 	rcu_read_unlock();
2595 }
2596 
2597 static int packet_seq_show(struct seq_file *seq, void *v)
2598 {
2599 	if (v == SEQ_START_TOKEN)
2600 		seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
2601 	else {
2602 		struct sock *s = sk_entry(v);
2603 		const struct packet_sock *po = pkt_sk(s);
2604 
2605 		seq_printf(seq,
2606 			   "%p %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
2607 			   s,
2608 			   atomic_read(&s->sk_refcnt),
2609 			   s->sk_type,
2610 			   ntohs(po->num),
2611 			   po->ifindex,
2612 			   po->running,
2613 			   atomic_read(&s->sk_rmem_alloc),
2614 			   sock_i_uid(s),
2615 			   sock_i_ino(s));
2616 	}
2617 
2618 	return 0;
2619 }
2620 
2621 static const struct seq_operations packet_seq_ops = {
2622 	.start	= packet_seq_start,
2623 	.next	= packet_seq_next,
2624 	.stop	= packet_seq_stop,
2625 	.show	= packet_seq_show,
2626 };
2627 
2628 static int packet_seq_open(struct inode *inode, struct file *file)
2629 {
2630 	return seq_open_net(inode, file, &packet_seq_ops,
2631 			    sizeof(struct seq_net_private));
2632 }
2633 
2634 static const struct file_operations packet_seq_fops = {
2635 	.owner		= THIS_MODULE,
2636 	.open		= packet_seq_open,
2637 	.read		= seq_read,
2638 	.llseek		= seq_lseek,
2639 	.release	= seq_release_net,
2640 };
2641 
2642 #endif
2643 
2644 static int __net_init packet_net_init(struct net *net)
2645 {
2646 	spin_lock_init(&net->packet.sklist_lock);
2647 	INIT_HLIST_HEAD(&net->packet.sklist);
2648 
2649 	if (!proc_net_fops_create(net, "packet", 0, &packet_seq_fops))
2650 		return -ENOMEM;
2651 
2652 	return 0;
2653 }
2654 
2655 static void __net_exit packet_net_exit(struct net *net)
2656 {
2657 	proc_net_remove(net, "packet");
2658 }
2659 
2660 static struct pernet_operations packet_net_ops = {
2661 	.init = packet_net_init,
2662 	.exit = packet_net_exit,
2663 };
2664 
2665 
2666 static void __exit packet_exit(void)
2667 {
2668 	unregister_netdevice_notifier(&packet_netdev_notifier);
2669 	unregister_pernet_subsys(&packet_net_ops);
2670 	sock_unregister(PF_PACKET);
2671 	proto_unregister(&packet_proto);
2672 }
2673 
2674 static int __init packet_init(void)
2675 {
2676 	int rc = proto_register(&packet_proto, 0);
2677 
2678 	if (rc != 0)
2679 		goto out;
2680 
2681 	sock_register(&packet_family_ops);
2682 	register_pernet_subsys(&packet_net_ops);
2683 	register_netdevice_notifier(&packet_netdev_notifier);
2684 out:
2685 	return rc;
2686 }
2687 
2688 module_init(packet_init);
2689 module_exit(packet_exit);
2690 MODULE_LICENSE("GPL");
2691 MODULE_ALIAS_NETPROTO(PF_PACKET);
2692