xref: /linux/net/packet/af_packet.c (revision a33f32244d8550da8b4a26e277ce07d5c6d158b5)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		PACKET - implements raw packet sockets.
7  *
8  * Authors:	Ross Biro
9  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *
12  * Fixes:
13  *		Alan Cox	:	verify_area() now used correctly
14  *		Alan Cox	:	new skbuff lists, look ma no backlogs!
15  *		Alan Cox	:	tidied skbuff lists.
16  *		Alan Cox	:	Now uses generic datagram routines I
17  *					added. Also fixed the peek/read crash
18  *					from all old Linux datagram code.
19  *		Alan Cox	:	Uses the improved datagram code.
20  *		Alan Cox	:	Added NULL's for socket options.
21  *		Alan Cox	:	Re-commented the code.
22  *		Alan Cox	:	Use new kernel side addressing
23  *		Rob Janssen	:	Correct MTU usage.
24  *		Dave Platt	:	Counter leaks caused by incorrect
25  *					interrupt locking and some slightly
26  *					dubious gcc output. Can you read
27  *					compiler: it said _VOLATILE_
28  *	Richard Kooijman	:	Timestamp fixes.
29  *		Alan Cox	:	New buffers. Use sk->mac.raw.
30  *		Alan Cox	:	sendmsg/recvmsg support.
31  *		Alan Cox	:	Protocol setting support
32  *	Alexey Kuznetsov	:	Untied from IPv4 stack.
33  *	Cyrus Durgin		:	Fixed kerneld for kmod.
34  *	Michal Ostrowski        :       Module initialization cleanup.
35  *         Ulises Alonso        :       Frame number limit removal and
36  *                                      packet_set_ring memory leak.
37  *		Eric Biederman	:	Allow for > 8 byte hardware addresses.
38  *					The convention is that longer addresses
39  *					will simply extend the hardware address
40  *					byte arrays at the end of sockaddr_ll
41  *					and packet_mreq.
42  *		Johann Baudy	:	Added TX RING.
43  *
44  *		This program is free software; you can redistribute it and/or
45  *		modify it under the terms of the GNU General Public License
46  *		as published by the Free Software Foundation; either version
47  *		2 of the License, or (at your option) any later version.
48  *
49  */
50 
51 #include <linux/types.h>
52 #include <linux/mm.h>
53 #include <linux/capability.h>
54 #include <linux/fcntl.h>
55 #include <linux/socket.h>
56 #include <linux/in.h>
57 #include <linux/inet.h>
58 #include <linux/netdevice.h>
59 #include <linux/if_packet.h>
60 #include <linux/wireless.h>
61 #include <linux/kernel.h>
62 #include <linux/kmod.h>
63 #include <linux/slab.h>
64 #include <net/net_namespace.h>
65 #include <net/ip.h>
66 #include <net/protocol.h>
67 #include <linux/skbuff.h>
68 #include <net/sock.h>
69 #include <linux/errno.h>
70 #include <linux/timer.h>
71 #include <asm/system.h>
72 #include <asm/uaccess.h>
73 #include <asm/ioctls.h>
74 #include <asm/page.h>
75 #include <asm/cacheflush.h>
76 #include <asm/io.h>
77 #include <linux/proc_fs.h>
78 #include <linux/seq_file.h>
79 #include <linux/poll.h>
80 #include <linux/module.h>
81 #include <linux/init.h>
82 #include <linux/mutex.h>
83 #include <linux/if_vlan.h>
84 #include <linux/virtio_net.h>
85 
86 #ifdef CONFIG_INET
87 #include <net/inet_common.h>
88 #endif
89 
90 /*
91    Assumptions:
92    - if the device has no dev->hard_header routine, it adds and removes the
93      ll header internally. In this case the ll header is invisible outside
94      the device, but higher levels should still reserve dev->hard_header_len.
95      Some devices are clever enough to reallocate the skb when the header
96      does not fit into the reserved space (tunnels); others are not
97      (PPP).
98    - the packet socket receives packets with the ll header already pulled,
99      so SOCK_RAW should push it back.
100 
101 On receive:
102 -----------
103 
104 Incoming, dev->hard_header!=NULL
105    mac_header -> ll header
106    data       -> data
107 
108 Outgoing, dev->hard_header!=NULL
109    mac_header -> ll header
110    data       -> ll header
111 
112 Incoming, dev->hard_header==NULL
113    mac_header -> UNKNOWN position. It very likely points to the ll
114 		 header.  PPP does this, which is wrong because it introduces
115 		 asymmetry between the rx and tx paths.
116    data       -> data
117 
118 Outgoing, dev->hard_header==NULL
119    mac_header -> data. ll header is still not built!
120    data       -> data
121 
122 Summary
123   If dev->hard_header==NULL we are unlikely to restore a sensible ll header.
124 
125 
126 On transmit:
127 ------------
128 
129 dev->hard_header != NULL
130    mac_header -> ll header
131    data       -> ll header
132 
133 dev->hard_header == NULL (ll header is added by device, we cannot control it)
134    mac_header -> data
135    data       -> data
136 
137    We should set nh.raw on output to the correct position,
138    since the packet classifier depends on it.
139  */
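/*
 * For orientation, a minimal userspace sketch of the two socket flavours
 * implemented here (illustrative only, not used by this file; the includes
 * and constants are the standard uapi ones).  SOCK_RAW sees the ll header
 * laid out as described above, SOCK_DGRAM does not:
 *
 *	#include <sys/socket.h>
 *	#include <arpa/inet.h>
 *	#include <linux/if_packet.h>
 *	#include <linux/if_ether.h>
 *
 *	int raw   = socket(AF_PACKET, SOCK_RAW,   htons(ETH_P_ALL));
 *	int dgram = socket(AF_PACKET, SOCK_DGRAM, htons(ETH_P_ALL));
 *	// raw:   reads return frames starting at the ll header
 *	// dgram: reads return frames starting at the network header;
 *	//        the ll info arrives via the sockaddr_ll address instead
 */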
140 
141 /* Private packet socket structures. */
142 
143 struct packet_mclist {
144 	struct packet_mclist	*next;
145 	int			ifindex;
146 	int			count;
147 	unsigned short		type;
148 	unsigned short		alen;
149 	unsigned char		addr[MAX_ADDR_LEN];
150 };
151 /* identical to struct packet_mreq except it has
152  * a longer address field.
153  */
154 struct packet_mreq_max {
155 	int		mr_ifindex;
156 	unsigned short	mr_type;
157 	unsigned short	mr_alen;
158 	unsigned char	mr_address[MAX_ADDR_LEN];
159 };
160 
161 static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
162 		int closing, int tx_ring);
163 
164 struct packet_ring_buffer {
165 	char			**pg_vec;
166 	unsigned int		head;
167 	unsigned int		frames_per_block;
168 	unsigned int		frame_size;
169 	unsigned int		frame_max;
170 
171 	unsigned int		pg_vec_order;
172 	unsigned int		pg_vec_pages;
173 	unsigned int		pg_vec_len;
174 
175 	atomic_t		pending;
176 };
177 
178 struct packet_sock;
179 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
180 
181 static void packet_flush_mclist(struct sock *sk);
182 
183 struct packet_sock {
184 	/* struct sock has to be the first member of packet_sock */
185 	struct sock		sk;
186 	struct tpacket_stats	stats;
187 	struct packet_ring_buffer	rx_ring;
188 	struct packet_ring_buffer	tx_ring;
189 	int			copy_thresh;
190 	spinlock_t		bind_lock;
191 	struct mutex		pg_vec_lock;
192 	unsigned int		running:1,	/* prot_hook is attached*/
193 				auxdata:1,
194 				origdev:1,
195 				has_vnet_hdr:1;
196 	int			ifindex;	/* bound device		*/
197 	__be16			num;
198 	struct packet_mclist	*mclist;
199 	atomic_t		mapped;
200 	enum tpacket_versions	tp_version;
201 	unsigned int		tp_hdrlen;
202 	unsigned int		tp_reserve;
203 	unsigned int		tp_loss:1;
204 	struct packet_type	prot_hook ____cacheline_aligned_in_smp;
205 };
206 
207 struct packet_skb_cb {
208 	unsigned int origlen;
209 	union {
210 		struct sockaddr_pkt pkt;
211 		struct sockaddr_ll ll;
212 	} sa;
213 };
214 
215 #define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))
216 
217 static void __packet_set_status(struct packet_sock *po, void *frame, int status)
218 {
219 	union {
220 		struct tpacket_hdr *h1;
221 		struct tpacket2_hdr *h2;
222 		void *raw;
223 	} h;
224 
225 	h.raw = frame;
226 	switch (po->tp_version) {
227 	case TPACKET_V1:
228 		h.h1->tp_status = status;
229 		flush_dcache_page(virt_to_page(&h.h1->tp_status));
230 		break;
231 	case TPACKET_V2:
232 		h.h2->tp_status = status;
233 		flush_dcache_page(virt_to_page(&h.h2->tp_status));
234 		break;
235 	default:
236 		pr_err("TPACKET version not supported\n");
237 		BUG();
238 	}
239 
240 	smp_wmb();
241 }
242 
243 static int __packet_get_status(struct packet_sock *po, void *frame)
244 {
245 	union {
246 		struct tpacket_hdr *h1;
247 		struct tpacket2_hdr *h2;
248 		void *raw;
249 	} h;
250 
251 	smp_rmb();
252 
253 	h.raw = frame;
254 	switch (po->tp_version) {
255 	case TPACKET_V1:
256 		flush_dcache_page(virt_to_page(&h.h1->tp_status));
257 		return h.h1->tp_status;
258 	case TPACKET_V2:
259 		flush_dcache_page(virt_to_page(&h.h2->tp_status));
260 		return h.h2->tp_status;
261 	default:
262 		pr_err("TPACKET version not supported\n");
263 		BUG();
264 		return 0;
265 	}
266 }
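
/*
 * The tp_status word written/read above is the hand-off point between the
 * kernel and a process that has mmap()ed the ring: the kernel only fills
 * frames it owns (TP_STATUS_KERNEL on rx, TP_STATUS_AVAILABLE on tx) and
 * flips the status when done; userspace does the reverse.  A rough sketch
 * of the userspace side of the RX handshake for TPACKET_V1 ("frame" and
 * "pfd" are assumed to be set up elsewhere, handle_frame() is a placeholder):
 *
 *	struct tpacket_hdr *hdr = frame;
 *	while (!(hdr->tp_status & TP_STATUS_USER))
 *		poll(&pfd, 1, -1);			// wait for the kernel
 *	handle_frame((char *)frame + hdr->tp_mac, hdr->tp_snaplen);
 *	hdr->tp_status = TP_STATUS_KERNEL;		// hand the frame back
 */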
267 
268 static void *packet_lookup_frame(struct packet_sock *po,
269 		struct packet_ring_buffer *rb,
270 		unsigned int position,
271 		int status)
272 {
273 	unsigned int pg_vec_pos, frame_offset;
274 	union {
275 		struct tpacket_hdr *h1;
276 		struct tpacket2_hdr *h2;
277 		void *raw;
278 	} h;
279 
280 	pg_vec_pos = position / rb->frames_per_block;
281 	frame_offset = position % rb->frames_per_block;
282 
283 	h.raw = rb->pg_vec[pg_vec_pos] + (frame_offset * rb->frame_size);
284 
285 	if (status != __packet_get_status(po, h.raw))
286 		return NULL;
287 
288 	return h.raw;
289 }
290 
291 static inline void *packet_current_frame(struct packet_sock *po,
292 		struct packet_ring_buffer *rb,
293 		int status)
294 {
295 	return packet_lookup_frame(po, rb, rb->head, status);
296 }
297 
298 static inline void *packet_previous_frame(struct packet_sock *po,
299 		struct packet_ring_buffer *rb,
300 		int status)
301 {
302 	unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
303 	return packet_lookup_frame(po, rb, previous, status);
304 }
305 
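/* rb->head walks 0..frame_max inclusive: frame_max is the index of the
 * last frame in the ring, not a frame count (see the wrap-around below).
 */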
306 static inline void packet_increment_head(struct packet_ring_buffer *buff)
307 {
308 	buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
309 }
310 
311 static inline struct packet_sock *pkt_sk(struct sock *sk)
312 {
313 	return (struct packet_sock *)sk;
314 }
315 
316 static void packet_sock_destruct(struct sock *sk)
317 {
318 	WARN_ON(atomic_read(&sk->sk_rmem_alloc));
319 	WARN_ON(atomic_read(&sk->sk_wmem_alloc));
320 
321 	if (!sock_flag(sk, SOCK_DEAD)) {
322 		pr_err("Attempt to release alive packet socket: %p\n", sk);
323 		return;
324 	}
325 
326 	sk_refcnt_debug_dec(sk);
327 }
328 
329 
330 static const struct proto_ops packet_ops;
331 
332 static const struct proto_ops packet_ops_spkt;
333 
334 static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
335 			   struct packet_type *pt, struct net_device *orig_dev)
336 {
337 	struct sock *sk;
338 	struct sockaddr_pkt *spkt;
339 
340 	/*
341 	 *	When we registered the protocol we saved the socket in the data
342 	 *	field for just this event.
343 	 */
344 
345 	sk = pt->af_packet_priv;
346 
347 	/*
348 	 *	Yank back the headers [hope the device set this
349 	 *	right or kerboom...]
350 	 *
351 	 *	Incoming packets have the ll header pulled;
352 	 *	push it back.
353 	 *
354 	 *	For outgoing ones skb->data == skb_mac_header(skb),
355 	 *	so this procedure is a no-op.
356 	 */
357 
358 	if (skb->pkt_type == PACKET_LOOPBACK)
359 		goto out;
360 
361 	if (!net_eq(dev_net(dev), sock_net(sk)))
362 		goto out;
363 
364 	skb = skb_share_check(skb, GFP_ATOMIC);
365 	if (skb == NULL)
366 		goto oom;
367 
368 	/* drop any routing info */
369 	skb_dst_drop(skb);
370 
371 	/* drop conntrack reference */
372 	nf_reset(skb);
373 
374 	spkt = &PACKET_SKB_CB(skb)->sa.pkt;
375 
376 	skb_push(skb, skb->data - skb_mac_header(skb));
377 
378 	/*
379 	 *	The SOCK_PACKET socket receives _all_ frames.
380 	 */
381 
382 	spkt->spkt_family = dev->type;
383 	strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
384 	spkt->spkt_protocol = skb->protocol;
385 
386 	/*
387 	 *	Charge the memory to the socket. This is done specifically
388 	 *	to prevent a socket from using up all the memory.
389 	 */
390 
391 	if (sock_queue_rcv_skb(sk, skb) == 0)
392 		return 0;
393 
394 out:
395 	kfree_skb(skb);
396 oom:
397 	return 0;
398 }
399 
400 
401 /*
402  *	Output a raw packet to a device layer. This bypasses all the other
403  *	protocol layers and you must therefore supply it with a complete frame
404  */
405 
406 static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
407 			       struct msghdr *msg, size_t len)
408 {
409 	struct sock *sk = sock->sk;
410 	struct sockaddr_pkt *saddr = (struct sockaddr_pkt *)msg->msg_name;
411 	struct sk_buff *skb = NULL;
412 	struct net_device *dev;
413 	__be16 proto = 0;
414 	int err;
415 
416 	/*
417 	 *	Get and verify the address.
418 	 */
419 
420 	if (saddr) {
421 		if (msg->msg_namelen < sizeof(struct sockaddr))
422 			return -EINVAL;
423 		if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
424 			proto = saddr->spkt_protocol;
425 	} else
426 		return -ENOTCONN;	/* SOCK_PACKET must be sent giving an address */
427 
428 	/*
429 	 *	Find the device first to size check it
430 	 */
431 
432 	saddr->spkt_device[13] = 0;
433 retry:
434 	rcu_read_lock();
435 	dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
436 	err = -ENODEV;
437 	if (dev == NULL)
438 		goto out_unlock;
439 
440 	err = -ENETDOWN;
441 	if (!(dev->flags & IFF_UP))
442 		goto out_unlock;
443 
444 	/*
445 	 * You may not queue a frame bigger than the mtu. This is the lowest level
446 	 * raw protocol and you must do your own fragmentation at this level.
447 	 */
448 
449 	err = -EMSGSIZE;
450 	if (len > dev->mtu + dev->hard_header_len)
451 		goto out_unlock;
452 
453 	if (!skb) {
454 		size_t reserved = LL_RESERVED_SPACE(dev);
455 		unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
456 
457 		rcu_read_unlock();
458 		skb = sock_wmalloc(sk, len + reserved, 0, GFP_KERNEL);
459 		if (skb == NULL)
460 			return -ENOBUFS;
461 		/* FIXME: Save some space for broken drivers that write a hard
462 		 * header at transmission time by themselves. PPP is the notable
463 		 * one here. This should really be fixed at the driver level.
464 		 */
465 		skb_reserve(skb, reserved);
466 		skb_reset_network_header(skb);
467 
468 		/* Try to align data part correctly */
469 		if (hhlen) {
470 			skb->data -= hhlen;
471 			skb->tail -= hhlen;
472 			if (len < hhlen)
473 				skb_reset_network_header(skb);
474 		}
475 		err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
476 		if (err)
477 			goto out_free;
478 		goto retry;
479 	}
480 
481 
482 	skb->protocol = proto;
483 	skb->dev = dev;
484 	skb->priority = sk->sk_priority;
485 	skb->mark = sk->sk_mark;
486 
487 	dev_queue_xmit(skb);
488 	rcu_read_unlock();
489 	return len;
490 
491 out_unlock:
492 	rcu_read_unlock();
493 out_free:
494 	kfree_skb(skb);
495 	return err;
496 }
497 
498 static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
499 				      unsigned int res)
500 {
501 	struct sk_filter *filter;
502 
503 	rcu_read_lock_bh();
504 	filter = rcu_dereference_bh(sk->sk_filter);
505 	if (filter != NULL)
506 		res = sk_run_filter(skb, filter->insns, filter->len);
507 	rcu_read_unlock_bh();
508 
509 	return res;
510 }
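
/*
 * The filter run above is a classic BPF program that the socket owner
 * attached with setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
 * sizeof(fprog)) (struct sock_fprog).  A return value of 0 drops the
 * packet; a non-zero return caps the snap length in the callers below.
 */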
511 
512 /*
513    This function does lazy skb cloning in the hope that most packets
514    are discarded by BPF.
515 
516    Note the tricky part: we DO mangle the shared skb! skb->data, skb->len
517    and skb->cb are mangled. It works because (and as long as) packets
518    arriving here are owned by the current CPU. Output packets are cloned
519    by dev_queue_xmit_nit(), input packets are processed by net_bh
520    sequentially, so if we restore the skb to its original state on exit,
521    we will not harm anyone.
522  */
523 
524 static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
525 		      struct packet_type *pt, struct net_device *orig_dev)
526 {
527 	struct sock *sk;
528 	struct sockaddr_ll *sll;
529 	struct packet_sock *po;
530 	u8 *skb_head = skb->data;
531 	int skb_len = skb->len;
532 	unsigned int snaplen, res;
533 
534 	if (skb->pkt_type == PACKET_LOOPBACK)
535 		goto drop;
536 
537 	sk = pt->af_packet_priv;
538 	po = pkt_sk(sk);
539 
540 	if (!net_eq(dev_net(dev), sock_net(sk)))
541 		goto drop;
542 
543 	skb->dev = dev;
544 
545 	if (dev->header_ops) {
546 		/* The device has an explicit notion of ll header,
547 		   exported to higher levels.
548 
549 		   Otherwise, the device hides the details of its frame
550 		   structure, so the corresponding packet head is
551 		   never delivered to the user.
552 		 */
553 		if (sk->sk_type != SOCK_DGRAM)
554 			skb_push(skb, skb->data - skb_mac_header(skb));
555 		else if (skb->pkt_type == PACKET_OUTGOING) {
556 			/* Special case: outgoing packets have ll header at head */
557 			skb_pull(skb, skb_network_offset(skb));
558 		}
559 	}
560 
561 	snaplen = skb->len;
562 
563 	res = run_filter(skb, sk, snaplen);
564 	if (!res)
565 		goto drop_n_restore;
566 	if (snaplen > res)
567 		snaplen = res;
568 
569 	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
570 	    (unsigned)sk->sk_rcvbuf)
571 		goto drop_n_acct;
572 
573 	if (skb_shared(skb)) {
574 		struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
575 		if (nskb == NULL)
576 			goto drop_n_acct;
577 
578 		if (skb_head != skb->data) {
579 			skb->data = skb_head;
580 			skb->len = skb_len;
581 		}
582 		kfree_skb(skb);
583 		skb = nskb;
584 	}
585 
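	/* The sockaddr_ll for this packet is stashed in skb->cb below; make
	 * sure even a maximal (MAX_ADDR_LEN byte) hardware address still fits.
	 * The "- 8" accounts for the 8 sll_addr bytes already counted in
	 * sizeof(struct sockaddr_ll). */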
586 	BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
587 		     sizeof(skb->cb));
588 
589 	sll = &PACKET_SKB_CB(skb)->sa.ll;
590 	sll->sll_family = AF_PACKET;
591 	sll->sll_hatype = dev->type;
592 	sll->sll_protocol = skb->protocol;
593 	sll->sll_pkttype = skb->pkt_type;
594 	if (unlikely(po->origdev))
595 		sll->sll_ifindex = orig_dev->ifindex;
596 	else
597 		sll->sll_ifindex = dev->ifindex;
598 
599 	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
600 
601 	PACKET_SKB_CB(skb)->origlen = skb->len;
602 
603 	if (pskb_trim(skb, snaplen))
604 		goto drop_n_acct;
605 
606 	skb_set_owner_r(skb, sk);
607 	skb->dev = NULL;
608 	skb_dst_drop(skb);
609 
610 	/* drop conntrack reference */
611 	nf_reset(skb);
612 
613 	spin_lock(&sk->sk_receive_queue.lock);
614 	po->stats.tp_packets++;
615 	skb->dropcount = atomic_read(&sk->sk_drops);
616 	__skb_queue_tail(&sk->sk_receive_queue, skb);
617 	spin_unlock(&sk->sk_receive_queue.lock);
618 	sk->sk_data_ready(sk, skb->len);
619 	return 0;
620 
621 drop_n_acct:
622 	po->stats.tp_drops = atomic_inc_return(&sk->sk_drops);
623 
624 drop_n_restore:
625 	if (skb_head != skb->data && skb_shared(skb)) {
626 		skb->data = skb_head;
627 		skb->len = skb_len;
628 	}
629 drop:
630 	consume_skb(skb);
631 	return 0;
632 }
633 
634 static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
635 		       struct packet_type *pt, struct net_device *orig_dev)
636 {
637 	struct sock *sk;
638 	struct packet_sock *po;
639 	struct sockaddr_ll *sll;
640 	union {
641 		struct tpacket_hdr *h1;
642 		struct tpacket2_hdr *h2;
643 		void *raw;
644 	} h;
645 	u8 *skb_head = skb->data;
646 	int skb_len = skb->len;
647 	unsigned int snaplen, res;
648 	unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
649 	unsigned short macoff, netoff, hdrlen;
650 	struct sk_buff *copy_skb = NULL;
651 	struct timeval tv;
652 	struct timespec ts;
653 
654 	if (skb->pkt_type == PACKET_LOOPBACK)
655 		goto drop;
656 
657 	sk = pt->af_packet_priv;
658 	po = pkt_sk(sk);
659 
660 	if (!net_eq(dev_net(dev), sock_net(sk)))
661 		goto drop;
662 
663 	if (dev->header_ops) {
664 		if (sk->sk_type != SOCK_DGRAM)
665 			skb_push(skb, skb->data - skb_mac_header(skb));
666 		else if (skb->pkt_type == PACKET_OUTGOING) {
667 			/* Special case: outgoing packets have ll header at head */
668 			skb_pull(skb, skb_network_offset(skb));
669 		}
670 	}
671 
672 	if (skb->ip_summed == CHECKSUM_PARTIAL)
673 		status |= TP_STATUS_CSUMNOTREADY;
674 
675 	snaplen = skb->len;
676 
677 	res = run_filter(skb, sk, snaplen);
678 	if (!res)
679 		goto drop_n_restore;
680 	if (snaplen > res)
681 		snaplen = res;
682 
683 	if (sk->sk_type == SOCK_DGRAM) {
684 		macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
685 				  po->tp_reserve;
686 	} else {
687 		unsigned maclen = skb_network_offset(skb);
688 		netoff = TPACKET_ALIGN(po->tp_hdrlen +
689 				       (maclen < 16 ? 16 : maclen)) +
690 			po->tp_reserve;
691 		macoff = netoff - maclen;
692 	}
693 
694 	if (macoff + snaplen > po->rx_ring.frame_size) {
695 		if (po->copy_thresh &&
696 		    atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
697 		    (unsigned)sk->sk_rcvbuf) {
698 			if (skb_shared(skb)) {
699 				copy_skb = skb_clone(skb, GFP_ATOMIC);
700 			} else {
701 				copy_skb = skb_get(skb);
702 				skb_head = skb->data;
703 			}
704 			if (copy_skb)
705 				skb_set_owner_r(copy_skb, sk);
706 		}
707 		snaplen = po->rx_ring.frame_size - macoff;
708 		if ((int)snaplen < 0)
709 			snaplen = 0;
710 	}
711 
712 	spin_lock(&sk->sk_receive_queue.lock);
713 	h.raw = packet_current_frame(po, &po->rx_ring, TP_STATUS_KERNEL);
714 	if (!h.raw)
715 		goto ring_is_full;
716 	packet_increment_head(&po->rx_ring);
717 	po->stats.tp_packets++;
718 	if (copy_skb) {
719 		status |= TP_STATUS_COPY;
720 		__skb_queue_tail(&sk->sk_receive_queue, copy_skb);
721 	}
722 	if (!po->stats.tp_drops)
723 		status &= ~TP_STATUS_LOSING;
724 	spin_unlock(&sk->sk_receive_queue.lock);
725 
726 	skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
727 
728 	switch (po->tp_version) {
729 	case TPACKET_V1:
730 		h.h1->tp_len = skb->len;
731 		h.h1->tp_snaplen = snaplen;
732 		h.h1->tp_mac = macoff;
733 		h.h1->tp_net = netoff;
734 		if (skb->tstamp.tv64)
735 			tv = ktime_to_timeval(skb->tstamp);
736 		else
737 			do_gettimeofday(&tv);
738 		h.h1->tp_sec = tv.tv_sec;
739 		h.h1->tp_usec = tv.tv_usec;
740 		hdrlen = sizeof(*h.h1);
741 		break;
742 	case TPACKET_V2:
743 		h.h2->tp_len = skb->len;
744 		h.h2->tp_snaplen = snaplen;
745 		h.h2->tp_mac = macoff;
746 		h.h2->tp_net = netoff;
747 		if (skb->tstamp.tv64)
748 			ts = ktime_to_timespec(skb->tstamp);
749 		else
750 			getnstimeofday(&ts);
751 		h.h2->tp_sec = ts.tv_sec;
752 		h.h2->tp_nsec = ts.tv_nsec;
753 		h.h2->tp_vlan_tci = vlan_tx_tag_get(skb);
754 		hdrlen = sizeof(*h.h2);
755 		break;
756 	default:
757 		BUG();
758 	}
759 
760 	sll = h.raw + TPACKET_ALIGN(hdrlen);
761 	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
762 	sll->sll_family = AF_PACKET;
763 	sll->sll_hatype = dev->type;
764 	sll->sll_protocol = skb->protocol;
765 	sll->sll_pkttype = skb->pkt_type;
766 	if (unlikely(po->origdev))
767 		sll->sll_ifindex = orig_dev->ifindex;
768 	else
769 		sll->sll_ifindex = dev->ifindex;
770 
771 	__packet_set_status(po, h.raw, status);
772 	smp_mb();
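
	/* Flush every cache page the frame spans so a userspace mapping of
	 * the ring on a non-cache-coherent architecture sees the data and
	 * status that were just written. */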
773 	{
774 		struct page *p_start, *p_end;
775 		u8 *h_end = h.raw + macoff + snaplen - 1;
776 
777 		p_start = virt_to_page(h.raw);
778 		p_end = virt_to_page(h_end);
779 		while (p_start <= p_end) {
780 			flush_dcache_page(p_start);
781 			p_start++;
782 		}
783 	}
784 
785 	sk->sk_data_ready(sk, 0);
786 
787 drop_n_restore:
788 	if (skb_head != skb->data && skb_shared(skb)) {
789 		skb->data = skb_head;
790 		skb->len = skb_len;
791 	}
792 drop:
793 	kfree_skb(skb);
794 	return 0;
795 
796 ring_is_full:
797 	po->stats.tp_drops++;
798 	spin_unlock(&sk->sk_receive_queue.lock);
799 
800 	sk->sk_data_ready(sk, 0);
801 	kfree_skb(copy_skb);
802 	goto drop_n_restore;
803 }
804 
805 static void tpacket_destruct_skb(struct sk_buff *skb)
806 {
807 	struct packet_sock *po = pkt_sk(skb->sk);
808 	void *ph;
809 
810 	BUG_ON(skb == NULL);
811 
812 	if (likely(po->tx_ring.pg_vec)) {
813 		ph = skb_shinfo(skb)->destructor_arg;
814 		BUG_ON(__packet_get_status(po, ph) != TP_STATUS_SENDING);
815 		BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
816 		atomic_dec(&po->tx_ring.pending);
817 		__packet_set_status(po, ph, TP_STATUS_AVAILABLE);
818 	}
819 
820 	sock_wfree(skb);
821 }
822 
823 static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
824 		void *frame, struct net_device *dev, int size_max,
825 		__be16 proto, unsigned char *addr)
826 {
827 	union {
828 		struct tpacket_hdr *h1;
829 		struct tpacket2_hdr *h2;
830 		void *raw;
831 	} ph;
832 	int to_write, offset, len, tp_len, nr_frags, len_max;
833 	struct socket *sock = po->sk.sk_socket;
834 	struct page *page;
835 	void *data;
836 	int err;
837 
838 	ph.raw = frame;
839 
840 	skb->protocol = proto;
841 	skb->dev = dev;
842 	skb->priority = po->sk.sk_priority;
843 	skb->mark = po->sk.sk_mark;
844 	skb_shinfo(skb)->destructor_arg = ph.raw;
845 
846 	switch (po->tp_version) {
847 	case TPACKET_V2:
848 		tp_len = ph.h2->tp_len;
849 		break;
850 	default:
851 		tp_len = ph.h1->tp_len;
852 		break;
853 	}
854 	if (unlikely(tp_len > size_max)) {
855 		pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
856 		return -EMSGSIZE;
857 	}
858 
859 	skb_reserve(skb, LL_RESERVED_SPACE(dev));
860 	skb_reset_network_header(skb);
861 
862 	data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
863 	to_write = tp_len;
864 
865 	if (sock->type == SOCK_DGRAM) {
866 		err = dev_hard_header(skb, dev, ntohs(proto), addr,
867 				NULL, tp_len);
868 		if (unlikely(err < 0))
869 			return -EINVAL;
870 	} else if (dev->hard_header_len) {
871 		/* net device doesn't like empty head */
872 		if (unlikely(tp_len <= dev->hard_header_len)) {
873 			pr_err("packet size is too short (%d < %d)\n",
874 			       tp_len, dev->hard_header_len);
875 			return -EINVAL;
876 		}
877 
878 		skb_push(skb, dev->hard_header_len);
879 		err = skb_store_bits(skb, 0, data,
880 				dev->hard_header_len);
881 		if (unlikely(err))
882 			return err;
883 
884 		data += dev->hard_header_len;
885 		to_write -= dev->hard_header_len;
886 	}
887 
888 	err = -EFAULT;
889 	page = virt_to_page(data);
890 	offset = offset_in_page(data);
891 	len_max = PAGE_SIZE - offset;
892 	len = ((to_write > len_max) ? len_max : to_write);
893 
894 	skb->data_len = to_write;
895 	skb->len += to_write;
896 	skb->truesize += to_write;
897 	atomic_add(to_write, &po->sk.sk_wmem_alloc);
898 
899 	while (likely(to_write)) {
900 		nr_frags = skb_shinfo(skb)->nr_frags;
901 
902 		if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
903 			pr_err("Packet exceed the number of skb frags(%lu)\n",
904 			       MAX_SKB_FRAGS);
905 			return -EFAULT;
906 		}
907 
908 		flush_dcache_page(page);
909 		get_page(page);
910 		skb_fill_page_desc(skb,
911 				nr_frags,
912 				page++, offset, len);
913 		to_write -= len;
914 		offset = 0;
915 		len_max = PAGE_SIZE;
916 		len = ((to_write > len_max) ? len_max : to_write);
917 	}
918 
919 	return tp_len;
920 }
921 
922 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
923 {
924 	struct socket *sock;
925 	struct sk_buff *skb;
926 	struct net_device *dev;
927 	__be16 proto;
928 	int ifindex, err, reserve = 0;
929 	void *ph;
930 	struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
931 	int tp_len, size_max;
932 	unsigned char *addr;
933 	int len_sum = 0;
934 	int status = 0;
935 
936 	sock = po->sk.sk_socket;
937 
938 	mutex_lock(&po->pg_vec_lock);
939 
940 	err = -EBUSY;
941 	if (saddr == NULL) {
942 		ifindex	= po->ifindex;
943 		proto	= po->num;
944 		addr	= NULL;
945 	} else {
946 		err = -EINVAL;
947 		if (msg->msg_namelen < sizeof(struct sockaddr_ll))
948 			goto out;
949 		if (msg->msg_namelen < (saddr->sll_halen
950 					+ offsetof(struct sockaddr_ll,
951 						sll_addr)))
952 			goto out;
953 		ifindex	= saddr->sll_ifindex;
954 		proto	= saddr->sll_protocol;
955 		addr	= saddr->sll_addr;
956 	}
957 
958 	dev = dev_get_by_index(sock_net(&po->sk), ifindex);
959 	err = -ENXIO;
960 	if (unlikely(dev == NULL))
961 		goto out;
962 
963 	reserve = dev->hard_header_len;
964 
965 	err = -ENETDOWN;
966 	if (unlikely(!(dev->flags & IFF_UP)))
967 		goto out_put;
968 
969 	size_max = po->tx_ring.frame_size
970 		- (po->tp_hdrlen - sizeof(struct sockaddr_ll));
971 
972 	if (size_max > dev->mtu + reserve)
973 		size_max = dev->mtu + reserve;
974 
975 	do {
976 		ph = packet_current_frame(po, &po->tx_ring,
977 				TP_STATUS_SEND_REQUEST);
978 
979 		if (unlikely(ph == NULL)) {
980 			schedule();
981 			continue;
982 		}
983 
984 		status = TP_STATUS_SEND_REQUEST;
985 		skb = sock_alloc_send_skb(&po->sk,
986 				LL_ALLOCATED_SPACE(dev)
987 				+ sizeof(struct sockaddr_ll),
988 				0, &err);
989 
990 		if (unlikely(skb == NULL))
991 			goto out_status;
992 
993 		tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
994 				addr);
995 
996 		if (unlikely(tp_len < 0)) {
997 			if (po->tp_loss) {
998 				__packet_set_status(po, ph,
999 						TP_STATUS_AVAILABLE);
1000 				packet_increment_head(&po->tx_ring);
1001 				kfree_skb(skb);
1002 				continue;
1003 			} else {
1004 				status = TP_STATUS_WRONG_FORMAT;
1005 				err = tp_len;
1006 				goto out_status;
1007 			}
1008 		}
1009 
1010 		skb->destructor = tpacket_destruct_skb;
1011 		__packet_set_status(po, ph, TP_STATUS_SENDING);
1012 		atomic_inc(&po->tx_ring.pending);
1013 
1014 		status = TP_STATUS_SEND_REQUEST;
1015 		err = dev_queue_xmit(skb);
1016 		if (unlikely(err > 0)) {
1017 			err = net_xmit_errno(err);
1018 			if (err && __packet_get_status(po, ph) ==
1019 				   TP_STATUS_AVAILABLE) {
1020 				/* skb was destructed already */
1021 				skb = NULL;
1022 				goto out_status;
1023 			}
1024 			/*
1025 			 * skb was dropped but not destructed yet;
1026 			 * let's treat it like congestion or err < 0
1027 			 */
1028 			err = 0;
1029 		}
1030 		packet_increment_head(&po->tx_ring);
1031 		len_sum += tp_len;
1032 	} while (likely((ph != NULL) ||
1033 			((!(msg->msg_flags & MSG_DONTWAIT)) &&
1034 			 (atomic_read(&po->tx_ring.pending))))
1035 		);
1036 
1037 	err = len_sum;
1038 	goto out_put;
1039 
1040 out_status:
1041 	__packet_set_status(po, ph, status);
1042 	kfree_skb(skb);
1043 out_put:
1044 	dev_put(dev);
1045 out:
1046 	mutex_unlock(&po->pg_vec_lock);
1047 	return err;
1048 }
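
/*
 * Userspace drives tpacket_snd() by filling a mapped TX frame, marking it
 * TP_STATUS_SEND_REQUEST and issuing a send().  A rough TPACKET_V1 sketch
 * (error handling omitted; "fd", "frame", "pkt" and "pkt_len" are assumed
 * to have been set up via PACKET_TX_RING and mmap()):
 *
 *	struct tpacket_hdr *hdr = frame;
 *	void *data = (char *)frame + TPACKET_HDRLEN - sizeof(struct sockaddr_ll);
 *	memcpy(data, pkt, pkt_len);
 *	hdr->tp_len = pkt_len;
 *	hdr->tp_status = TP_STATUS_SEND_REQUEST;
 *	send(fd, NULL, 0, 0);			// kicks packet_sendmsg()
 */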
1049 
1050 static inline struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
1051 					       size_t reserve, size_t len,
1052 					       size_t linear, int noblock,
1053 					       int *err)
1054 {
1055 	struct sk_buff *skb;
1056 
1057 	/* Under a page?  Don't bother with paged skb. */
1058 	if (prepad + len < PAGE_SIZE || !linear)
1059 		linear = len;
1060 
1061 	skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
1062 				   err);
1063 	if (!skb)
1064 		return NULL;
1065 
1066 	skb_reserve(skb, reserve);
1067 	skb_put(skb, linear);
1068 	skb->data_len = len - linear;
1069 	skb->len += len - linear;
1070 
1071 	return skb;
1072 }
1073 
1074 static int packet_snd(struct socket *sock,
1075 			  struct msghdr *msg, size_t len)
1076 {
1077 	struct sock *sk = sock->sk;
1078 	struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
1079 	struct sk_buff *skb;
1080 	struct net_device *dev;
1081 	__be16 proto;
1082 	unsigned char *addr;
1083 	int ifindex, err, reserve = 0;
1084 	struct virtio_net_hdr vnet_hdr = { 0 };
1085 	int offset = 0;
1086 	int vnet_hdr_len;
1087 	struct packet_sock *po = pkt_sk(sk);
1088 	unsigned short gso_type = 0;
1089 
1090 	/*
1091 	 *	Get and verify the address.
1092 	 */
1093 
1094 	if (saddr == NULL) {
1095 		ifindex	= po->ifindex;
1096 		proto	= po->num;
1097 		addr	= NULL;
1098 	} else {
1099 		err = -EINVAL;
1100 		if (msg->msg_namelen < sizeof(struct sockaddr_ll))
1101 			goto out;
1102 		if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
1103 			goto out;
1104 		ifindex	= saddr->sll_ifindex;
1105 		proto	= saddr->sll_protocol;
1106 		addr	= saddr->sll_addr;
1107 	}
1108 
1109 
1110 	dev = dev_get_by_index(sock_net(sk), ifindex);
1111 	err = -ENXIO;
1112 	if (dev == NULL)
1113 		goto out_unlock;
1114 	if (sock->type == SOCK_RAW)
1115 		reserve = dev->hard_header_len;
1116 
1117 	err = -ENETDOWN;
1118 	if (!(dev->flags & IFF_UP))
1119 		goto out_unlock;
1120 
1121 	if (po->has_vnet_hdr) {
1122 		vnet_hdr_len = sizeof(vnet_hdr);
1123 
1124 		err = -EINVAL;
1125 		if (len < vnet_hdr_len)
1126 			goto out_unlock;
1127 
1128 		len -= vnet_hdr_len;
1129 
1130 		err = memcpy_fromiovec((void *)&vnet_hdr, msg->msg_iov,
1131 				       vnet_hdr_len);
1132 		if (err < 0)
1133 			goto out_unlock;
1134 
1135 		if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
1136 		    (vnet_hdr.csum_start + vnet_hdr.csum_offset + 2 >
1137 		      vnet_hdr.hdr_len))
1138 			vnet_hdr.hdr_len = vnet_hdr.csum_start +
1139 						 vnet_hdr.csum_offset + 2;
1140 
1141 		err = -EINVAL;
1142 		if (vnet_hdr.hdr_len > len)
1143 			goto out_unlock;
1144 
1145 		if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
1146 			switch (vnet_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
1147 			case VIRTIO_NET_HDR_GSO_TCPV4:
1148 				gso_type = SKB_GSO_TCPV4;
1149 				break;
1150 			case VIRTIO_NET_HDR_GSO_TCPV6:
1151 				gso_type = SKB_GSO_TCPV6;
1152 				break;
1153 			case VIRTIO_NET_HDR_GSO_UDP:
1154 				gso_type = SKB_GSO_UDP;
1155 				break;
1156 			default:
1157 				goto out_unlock;
1158 			}
1159 
1160 			if (vnet_hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN)
1161 				gso_type |= SKB_GSO_TCP_ECN;
1162 
1163 			if (vnet_hdr.gso_size == 0)
1164 				goto out_unlock;
1165 
1166 		}
1167 	}
1168 
1169 	err = -EMSGSIZE;
1170 	if (!gso_type && (len > dev->mtu+reserve))
1171 		goto out_unlock;
1172 
1173 	err = -ENOBUFS;
1174 	skb = packet_alloc_skb(sk, LL_ALLOCATED_SPACE(dev),
1175 			       LL_RESERVED_SPACE(dev), len, vnet_hdr.hdr_len,
1176 			       msg->msg_flags & MSG_DONTWAIT, &err);
1177 	if (skb == NULL)
1178 		goto out_unlock;
1179 
1180 	skb_set_network_header(skb, reserve);
1181 
1182 	err = -EINVAL;
1183 	if (sock->type == SOCK_DGRAM &&
1184 	    (offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len)) < 0)
1185 		goto out_free;
1186 
1187 	/* Returns -EFAULT on error */
1188 	err = skb_copy_datagram_from_iovec(skb, offset, msg->msg_iov, 0, len);
1189 	if (err)
1190 		goto out_free;
1191 
1192 	skb->protocol = proto;
1193 	skb->dev = dev;
1194 	skb->priority = sk->sk_priority;
1195 	skb->mark = sk->sk_mark;
1196 
1197 	if (po->has_vnet_hdr) {
1198 		if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
1199 			if (!skb_partial_csum_set(skb, vnet_hdr.csum_start,
1200 						  vnet_hdr.csum_offset)) {
1201 				err = -EINVAL;
1202 				goto out_free;
1203 			}
1204 		}
1205 
1206 		skb_shinfo(skb)->gso_size = vnet_hdr.gso_size;
1207 		skb_shinfo(skb)->gso_type = gso_type;
1208 
1209 		/* Header must be checked, and gso_segs computed. */
1210 		skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
1211 		skb_shinfo(skb)->gso_segs = 0;
1212 
1213 		len += vnet_hdr_len;
1214 	}
1215 
1216 	/*
1217 	 *	Now send it
1218 	 */
1219 
1220 	err = dev_queue_xmit(skb);
1221 	if (err > 0 && (err = net_xmit_errno(err)) != 0)
1222 		goto out_unlock;
1223 
1224 	dev_put(dev);
1225 
1226 	return len;
1227 
1228 out_free:
1229 	kfree_skb(skb);
1230 out_unlock:
1231 	if (dev)
1232 		dev_put(dev);
1233 out:
1234 	return err;
1235 }
1236 
1237 static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
1238 		struct msghdr *msg, size_t len)
1239 {
1240 	struct sock *sk = sock->sk;
1241 	struct packet_sock *po = pkt_sk(sk);
1242 	if (po->tx_ring.pg_vec)
1243 		return tpacket_snd(po, msg);
1244 	else
1245 		return packet_snd(sock, msg, len);
1246 }
1247 
1248 /*
1249  *	Close a PACKET socket. This is fairly simple. We immediately go
1250  *	to 'closed' state and remove our protocol entry in the device list.
1251  */
1252 
1253 static int packet_release(struct socket *sock)
1254 {
1255 	struct sock *sk = sock->sk;
1256 	struct packet_sock *po;
1257 	struct net *net;
1258 	struct tpacket_req req;
1259 
1260 	if (!sk)
1261 		return 0;
1262 
1263 	net = sock_net(sk);
1264 	po = pkt_sk(sk);
1265 
1266 	spin_lock_bh(&net->packet.sklist_lock);
1267 	sk_del_node_init_rcu(sk);
1268 	sock_prot_inuse_add(net, sk->sk_prot, -1);
1269 	spin_unlock_bh(&net->packet.sklist_lock);
1270 
1271 	spin_lock(&po->bind_lock);
1272 	if (po->running) {
1273 		/*
1274 		 * Remove from protocol table
1275 		 */
1276 		po->running = 0;
1277 		po->num = 0;
1278 		__dev_remove_pack(&po->prot_hook);
1279 		__sock_put(sk);
1280 	}
1281 	spin_unlock(&po->bind_lock);
1282 
1283 	packet_flush_mclist(sk);
1284 
1285 	memset(&req, 0, sizeof(req));
1286 
1287 	if (po->rx_ring.pg_vec)
1288 		packet_set_ring(sk, &req, 1, 0);
1289 
1290 	if (po->tx_ring.pg_vec)
1291 		packet_set_ring(sk, &req, 1, 1);
1292 
1293 	synchronize_net();
1294 	/*
1295 	 *	Now the socket is dead. No more input will appear.
1296 	 */
1297 	sock_orphan(sk);
1298 	sock->sk = NULL;
1299 
1300 	/* Purge queues */
1301 
1302 	skb_queue_purge(&sk->sk_receive_queue);
1303 	sk_refcnt_debug_release(sk);
1304 
1305 	sock_put(sk);
1306 	return 0;
1307 }
1308 
1309 /*
1310  *	Attach a packet hook.
1311  */
1312 
1313 static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
1314 {
1315 	struct packet_sock *po = pkt_sk(sk);
1316 	/*
1317 	 *	Detach an existing hook if present.
1318 	 */
1319 
1320 	lock_sock(sk);
1321 
1322 	spin_lock(&po->bind_lock);
1323 	if (po->running) {
1324 		__sock_put(sk);
1325 		po->running = 0;
1326 		po->num = 0;
1327 		spin_unlock(&po->bind_lock);
1328 		dev_remove_pack(&po->prot_hook);
1329 		spin_lock(&po->bind_lock);
1330 	}
1331 
1332 	po->num = protocol;
1333 	po->prot_hook.type = protocol;
1334 	po->prot_hook.dev = dev;
1335 
1336 	po->ifindex = dev ? dev->ifindex : 0;
1337 
1338 	if (protocol == 0)
1339 		goto out_unlock;
1340 
1341 	if (!dev || (dev->flags & IFF_UP)) {
1342 		dev_add_pack(&po->prot_hook);
1343 		sock_hold(sk);
1344 		po->running = 1;
1345 	} else {
1346 		sk->sk_err = ENETDOWN;
1347 		if (!sock_flag(sk, SOCK_DEAD))
1348 			sk->sk_error_report(sk);
1349 	}
1350 
1351 out_unlock:
1352 	spin_unlock(&po->bind_lock);
1353 	release_sock(sk);
1354 	return 0;
1355 }
1356 
1357 /*
1358  *	Bind a packet socket to a device
1359  */
1360 
1361 static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
1362 			    int addr_len)
1363 {
1364 	struct sock *sk = sock->sk;
1365 	char name[15];
1366 	struct net_device *dev;
1367 	int err = -ENODEV;
1368 
1369 	/*
1370 	 *	Check legality
1371 	 */
1372 
1373 	if (addr_len != sizeof(struct sockaddr))
1374 		return -EINVAL;
1375 	strlcpy(name, uaddr->sa_data, sizeof(name));
1376 
1377 	dev = dev_get_by_name(sock_net(sk), name);
1378 	if (dev) {
1379 		err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
1380 		dev_put(dev);
1381 	}
1382 	return err;
1383 }
1384 
1385 static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1386 {
1387 	struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
1388 	struct sock *sk = sock->sk;
1389 	struct net_device *dev = NULL;
1390 	int err;
1391 
1392 
1393 	/*
1394 	 *	Check legality
1395 	 */
1396 
1397 	if (addr_len < sizeof(struct sockaddr_ll))
1398 		return -EINVAL;
1399 	if (sll->sll_family != AF_PACKET)
1400 		return -EINVAL;
1401 
1402 	if (sll->sll_ifindex) {
1403 		err = -ENODEV;
1404 		dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
1405 		if (dev == NULL)
1406 			goto out;
1407 	}
1408 	err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
1409 	if (dev)
1410 		dev_put(dev);
1411 
1412 out:
1413 	return err;
1414 }
1415 
1416 static struct proto packet_proto = {
1417 	.name	  = "PACKET",
1418 	.owner	  = THIS_MODULE,
1419 	.obj_size = sizeof(struct packet_sock),
1420 };
1421 
1422 /*
1423  *	Create a packet of type SOCK_PACKET.
1424  */
1425 
1426 static int packet_create(struct net *net, struct socket *sock, int protocol,
1427 			 int kern)
1428 {
1429 	struct sock *sk;
1430 	struct packet_sock *po;
1431 	__be16 proto = (__force __be16)protocol; /* weird, but documented */
1432 	int err;
1433 
1434 	if (!capable(CAP_NET_RAW))
1435 		return -EPERM;
1436 	if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
1437 	    sock->type != SOCK_PACKET)
1438 		return -ESOCKTNOSUPPORT;
1439 
1440 	sock->state = SS_UNCONNECTED;
1441 
1442 	err = -ENOBUFS;
1443 	sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
1444 	if (sk == NULL)
1445 		goto out;
1446 
1447 	sock->ops = &packet_ops;
1448 	if (sock->type == SOCK_PACKET)
1449 		sock->ops = &packet_ops_spkt;
1450 
1451 	sock_init_data(sock, sk);
1452 
1453 	po = pkt_sk(sk);
1454 	sk->sk_family = PF_PACKET;
1455 	po->num = proto;
1456 
1457 	sk->sk_destruct = packet_sock_destruct;
1458 	sk_refcnt_debug_inc(sk);
1459 
1460 	/*
1461 	 *	Attach a protocol block
1462 	 */
1463 
1464 	spin_lock_init(&po->bind_lock);
1465 	mutex_init(&po->pg_vec_lock);
1466 	po->prot_hook.func = packet_rcv;
1467 
1468 	if (sock->type == SOCK_PACKET)
1469 		po->prot_hook.func = packet_rcv_spkt;
1470 
1471 	po->prot_hook.af_packet_priv = sk;
1472 
1473 	if (proto) {
1474 		po->prot_hook.type = proto;
1475 		dev_add_pack(&po->prot_hook);
1476 		sock_hold(sk);
1477 		po->running = 1;
1478 	}
1479 
1480 	spin_lock_bh(&net->packet.sklist_lock);
1481 	sk_add_node_rcu(sk, &net->packet.sklist);
1482 	sock_prot_inuse_add(net, &packet_proto, 1);
1483 	spin_unlock_bh(&net->packet.sklist_lock);
1484 
1485 	return 0;
1486 out:
1487 	return err;
1488 }
1489 
1490 /*
1491  *	Pull a packet from our receive queue and hand it to the user.
1492  *	If necessary we block.
1493  */
1494 
1495 static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
1496 			  struct msghdr *msg, size_t len, int flags)
1497 {
1498 	struct sock *sk = sock->sk;
1499 	struct sk_buff *skb;
1500 	int copied, err;
1501 	struct sockaddr_ll *sll;
1502 	int vnet_hdr_len = 0;
1503 
1504 	err = -EINVAL;
1505 	if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT))
1506 		goto out;
1507 
1508 #if 0
1509 	/* What error should we return now? EUNATTACH? */
1510 	if (pkt_sk(sk)->ifindex < 0)
1511 		return -ENODEV;
1512 #endif
1513 
1514 	/*
1515 	 *	Call the generic datagram receiver. This handles all sorts
1516 	 *	of horrible races and re-entrancy so we can forget about it
1517 	 *	in the protocol layers.
1518 	 *
1519 	 *	Now it will return ENETDOWN if the device has just gone down,
1520 	 *	but then it will block.
1521 	 */
1522 
1523 	skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
1524 
1525 	/*
1526 	 *	An error occurred, so return it. Because skb_recv_datagram()
1527 	 *	handles the blocking, we don't need to see or worry about blocking
1528 	 *	retries.
1529 	 */
1530 
1531 	if (skb == NULL)
1532 		goto out;
1533 
1534 	if (pkt_sk(sk)->has_vnet_hdr) {
1535 		struct virtio_net_hdr vnet_hdr = { 0 };
1536 
1537 		err = -EINVAL;
1538 		vnet_hdr_len = sizeof(vnet_hdr);
1539 		if ((len -= vnet_hdr_len) < 0)
1540 			goto out_free;
1541 
1542 		if (skb_is_gso(skb)) {
1543 			struct skb_shared_info *sinfo = skb_shinfo(skb);
1544 
1545 			/* This is a hint as to how much should be linear. */
1546 			vnet_hdr.hdr_len = skb_headlen(skb);
1547 			vnet_hdr.gso_size = sinfo->gso_size;
1548 			if (sinfo->gso_type & SKB_GSO_TCPV4)
1549 				vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
1550 			else if (sinfo->gso_type & SKB_GSO_TCPV6)
1551 				vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
1552 			else if (sinfo->gso_type & SKB_GSO_UDP)
1553 				vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP;
1554 			else if (sinfo->gso_type & SKB_GSO_FCOE)
1555 				goto out_free;
1556 			else
1557 				BUG();
1558 			if (sinfo->gso_type & SKB_GSO_TCP_ECN)
1559 				vnet_hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
1560 		} else
1561 			vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;
1562 
1563 		if (skb->ip_summed == CHECKSUM_PARTIAL) {
1564 			vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
1565 			vnet_hdr.csum_start = skb->csum_start -
1566 							skb_headroom(skb);
1567 			vnet_hdr.csum_offset = skb->csum_offset;
1568 		} /* else everything is zero */
1569 
1570 		err = memcpy_toiovec(msg->msg_iov, (void *)&vnet_hdr,
1571 				     vnet_hdr_len);
1572 		if (err < 0)
1573 			goto out_free;
1574 	}
1575 
1576 	/*
1577 	 *	If the address length field is there to be filled in, we fill
1578 	 *	it in now.
1579 	 */
1580 
1581 	sll = &PACKET_SKB_CB(skb)->sa.ll;
1582 	if (sock->type == SOCK_PACKET)
1583 		msg->msg_namelen = sizeof(struct sockaddr_pkt);
1584 	else
1585 		msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);
1586 
1587 	/*
1588 	 *	You lose any data beyond the buffer you gave. If this worries a
1589 	 *	user program, it can ask the device for its MTU anyway.
1590 	 */
1591 
1592 	copied = skb->len;
1593 	if (copied > len) {
1594 		copied = len;
1595 		msg->msg_flags |= MSG_TRUNC;
1596 	}
1597 
1598 	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
1599 	if (err)
1600 		goto out_free;
1601 
1602 	sock_recv_ts_and_drops(msg, sk, skb);
1603 
1604 	if (msg->msg_name)
1605 		memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
1606 		       msg->msg_namelen);
1607 
1608 	if (pkt_sk(sk)->auxdata) {
1609 		struct tpacket_auxdata aux;
1610 
1611 		aux.tp_status = TP_STATUS_USER;
1612 		if (skb->ip_summed == CHECKSUM_PARTIAL)
1613 			aux.tp_status |= TP_STATUS_CSUMNOTREADY;
1614 		aux.tp_len = PACKET_SKB_CB(skb)->origlen;
1615 		aux.tp_snaplen = skb->len;
1616 		aux.tp_mac = 0;
1617 		aux.tp_net = skb_network_offset(skb);
1618 		aux.tp_vlan_tci = vlan_tx_tag_get(skb);
1619 
1620 		put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
1621 	}
1622 
1623 	/*
1624 	 *	Free or return the buffer as appropriate. Again this
1625 	 *	hides all the races and re-entrancy issues from us.
1626 	 */
1627 	err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
1628 
1629 out_free:
1630 	skb_free_datagram(sk, skb);
1631 out:
1632 	return err;
1633 }
1634 
1635 static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
1636 			       int *uaddr_len, int peer)
1637 {
1638 	struct net_device *dev;
1639 	struct sock *sk	= sock->sk;
1640 
1641 	if (peer)
1642 		return -EOPNOTSUPP;
1643 
1644 	uaddr->sa_family = AF_PACKET;
1645 	rcu_read_lock();
1646 	dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
1647 	if (dev)
1648 		strlcpy(uaddr->sa_data, dev->name, 15);
1649 	else
1650 		memset(uaddr->sa_data, 0, 14);
1651 	rcu_read_unlock();
1652 	*uaddr_len = sizeof(*uaddr);
1653 
1654 	return 0;
1655 }
1656 
1657 static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
1658 			  int *uaddr_len, int peer)
1659 {
1660 	struct net_device *dev;
1661 	struct sock *sk = sock->sk;
1662 	struct packet_sock *po = pkt_sk(sk);
1663 	DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
1664 
1665 	if (peer)
1666 		return -EOPNOTSUPP;
1667 
1668 	sll->sll_family = AF_PACKET;
1669 	sll->sll_ifindex = po->ifindex;
1670 	sll->sll_protocol = po->num;
1671 	rcu_read_lock();
1672 	dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
1673 	if (dev) {
1674 		sll->sll_hatype = dev->type;
1675 		sll->sll_halen = dev->addr_len;
1676 		memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1677 	} else {
1678 		sll->sll_hatype = 0;	/* Bad: we have no ARPHRD_UNSPEC */
1679 		sll->sll_halen = 0;
1680 	}
1681 	rcu_read_unlock();
1682 	*uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1683 
1684 	return 0;
1685 }
1686 
1687 static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
1688 			 int what)
1689 {
1690 	switch (i->type) {
1691 	case PACKET_MR_MULTICAST:
1692 		if (i->alen != dev->addr_len)
1693 			return -EINVAL;
1694 		if (what > 0)
1695 			return dev_mc_add(dev, i->addr, i->alen, 0);
1696 		else
1697 			return dev_mc_delete(dev, i->addr, i->alen, 0);
1698 		break;
1699 	case PACKET_MR_PROMISC:
1700 		return dev_set_promiscuity(dev, what);
1701 		break;
1702 	case PACKET_MR_ALLMULTI:
1703 		return dev_set_allmulti(dev, what);
1704 		break;
1705 	case PACKET_MR_UNICAST:
1706 		if (i->alen != dev->addr_len)
1707 			return -EINVAL;
1708 		if (what > 0)
1709 			return dev_unicast_add(dev, i->addr);
1710 		else
1711 			return dev_unicast_delete(dev, i->addr);
1712 		break;
1713 	default:
1714 		break;
1715 	}
1716 	return 0;
1717 }
1718 
1719 static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
1720 {
1721 	for ( ; i; i = i->next) {
1722 		if (i->ifindex == dev->ifindex)
1723 			packet_dev_mc(dev, i, what);
1724 	}
1725 }
1726 
1727 static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1728 {
1729 	struct packet_sock *po = pkt_sk(sk);
1730 	struct packet_mclist *ml, *i;
1731 	struct net_device *dev;
1732 	int err;
1733 
1734 	rtnl_lock();
1735 
1736 	err = -ENODEV;
1737 	dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1738 	if (!dev)
1739 		goto done;
1740 
1741 	err = -EINVAL;
1742 	if (mreq->mr_alen > dev->addr_len)
1743 		goto done;
1744 
1745 	err = -ENOBUFS;
1746 	i = kmalloc(sizeof(*i), GFP_KERNEL);
1747 	if (i == NULL)
1748 		goto done;
1749 
1750 	err = 0;
1751 	for (ml = po->mclist; ml; ml = ml->next) {
1752 		if (ml->ifindex == mreq->mr_ifindex &&
1753 		    ml->type == mreq->mr_type &&
1754 		    ml->alen == mreq->mr_alen &&
1755 		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1756 			ml->count++;
1757 			/* Free the new element ... */
1758 			kfree(i);
1759 			goto done;
1760 		}
1761 	}
1762 
1763 	i->type = mreq->mr_type;
1764 	i->ifindex = mreq->mr_ifindex;
1765 	i->alen = mreq->mr_alen;
1766 	memcpy(i->addr, mreq->mr_address, i->alen);
1767 	i->count = 1;
1768 	i->next = po->mclist;
1769 	po->mclist = i;
1770 	err = packet_dev_mc(dev, i, 1);
1771 	if (err) {
1772 		po->mclist = i->next;
1773 		kfree(i);
1774 	}
1775 
1776 done:
1777 	rtnl_unlock();
1778 	return err;
1779 }
1780 
1781 static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1782 {
1783 	struct packet_mclist *ml, **mlp;
1784 
1785 	rtnl_lock();
1786 
1787 	for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
1788 		if (ml->ifindex == mreq->mr_ifindex &&
1789 		    ml->type == mreq->mr_type &&
1790 		    ml->alen == mreq->mr_alen &&
1791 		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1792 			if (--ml->count == 0) {
1793 				struct net_device *dev;
1794 				*mlp = ml->next;
1795 				dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
1796 				if (dev)
1797 					packet_dev_mc(dev, ml, -1);
1798 				kfree(ml);
1799 			}
1800 			rtnl_unlock();
1801 			return 0;
1802 		}
1803 	}
1804 	rtnl_unlock();
1805 	return -EADDRNOTAVAIL;
1806 }
1807 
1808 static void packet_flush_mclist(struct sock *sk)
1809 {
1810 	struct packet_sock *po = pkt_sk(sk);
1811 	struct packet_mclist *ml;
1812 
1813 	if (!po->mclist)
1814 		return;
1815 
1816 	rtnl_lock();
1817 	while ((ml = po->mclist) != NULL) {
1818 		struct net_device *dev;
1819 
1820 		po->mclist = ml->next;
1821 		dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
1822 		if (dev != NULL)
1823 			packet_dev_mc(dev, ml, -1);
1824 		kfree(ml);
1825 	}
1826 	rtnl_unlock();
1827 }
1828 
1829 static int
1830 packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
1831 {
1832 	struct sock *sk = sock->sk;
1833 	struct packet_sock *po = pkt_sk(sk);
1834 	int ret;
1835 
1836 	if (level != SOL_PACKET)
1837 		return -ENOPROTOOPT;
1838 
1839 	switch (optname) {
1840 	case PACKET_ADD_MEMBERSHIP:
1841 	case PACKET_DROP_MEMBERSHIP:
1842 	{
1843 		struct packet_mreq_max mreq;
1844 		int len = optlen;
1845 		memset(&mreq, 0, sizeof(mreq));
1846 		if (len < sizeof(struct packet_mreq))
1847 			return -EINVAL;
1848 		if (len > sizeof(mreq))
1849 			len = sizeof(mreq);
1850 		if (copy_from_user(&mreq, optval, len))
1851 			return -EFAULT;
1852 		if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
1853 			return -EINVAL;
1854 		if (optname == PACKET_ADD_MEMBERSHIP)
1855 			ret = packet_mc_add(sk, &mreq);
1856 		else
1857 			ret = packet_mc_drop(sk, &mreq);
1858 		return ret;
1859 	}
1860 
1861 	case PACKET_RX_RING:
1862 	case PACKET_TX_RING:
1863 	{
1864 		struct tpacket_req req;
1865 
1866 		if (optlen < sizeof(req))
1867 			return -EINVAL;
1868 		if (pkt_sk(sk)->has_vnet_hdr)
1869 			return -EINVAL;
1870 		if (copy_from_user(&req, optval, sizeof(req)))
1871 			return -EFAULT;
1872 		return packet_set_ring(sk, &req, 0, optname == PACKET_TX_RING);
1873 	}
1874 	case PACKET_COPY_THRESH:
1875 	{
1876 		int val;
1877 
1878 		if (optlen != sizeof(val))
1879 			return -EINVAL;
1880 		if (copy_from_user(&val, optval, sizeof(val)))
1881 			return -EFAULT;
1882 
1883 		pkt_sk(sk)->copy_thresh = val;
1884 		return 0;
1885 	}
1886 	case PACKET_VERSION:
1887 	{
1888 		int val;
1889 
1890 		if (optlen != sizeof(val))
1891 			return -EINVAL;
1892 		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1893 			return -EBUSY;
1894 		if (copy_from_user(&val, optval, sizeof(val)))
1895 			return -EFAULT;
1896 		switch (val) {
1897 		case TPACKET_V1:
1898 		case TPACKET_V2:
1899 			po->tp_version = val;
1900 			return 0;
1901 		default:
1902 			return -EINVAL;
1903 		}
1904 	}
1905 	case PACKET_RESERVE:
1906 	{
1907 		unsigned int val;
1908 
1909 		if (optlen != sizeof(val))
1910 			return -EINVAL;
1911 		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1912 			return -EBUSY;
1913 		if (copy_from_user(&val, optval, sizeof(val)))
1914 			return -EFAULT;
1915 		po->tp_reserve = val;
1916 		return 0;
1917 	}
1918 	case PACKET_LOSS:
1919 	{
1920 		unsigned int val;
1921 
1922 		if (optlen != sizeof(val))
1923 			return -EINVAL;
1924 		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1925 			return -EBUSY;
1926 		if (copy_from_user(&val, optval, sizeof(val)))
1927 			return -EFAULT;
1928 		po->tp_loss = !!val;
1929 		return 0;
1930 	}
1931 	case PACKET_AUXDATA:
1932 	{
1933 		int val;
1934 
1935 		if (optlen < sizeof(val))
1936 			return -EINVAL;
1937 		if (copy_from_user(&val, optval, sizeof(val)))
1938 			return -EFAULT;
1939 
1940 		po->auxdata = !!val;
1941 		return 0;
1942 	}
1943 	case PACKET_ORIGDEV:
1944 	{
1945 		int val;
1946 
1947 		if (optlen < sizeof(val))
1948 			return -EINVAL;
1949 		if (copy_from_user(&val, optval, sizeof(val)))
1950 			return -EFAULT;
1951 
1952 		po->origdev = !!val;
1953 		return 0;
1954 	}
1955 	case PACKET_VNET_HDR:
1956 	{
1957 		int val;
1958 
1959 		if (sock->type != SOCK_RAW)
1960 			return -EINVAL;
1961 		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1962 			return -EBUSY;
1963 		if (optlen < sizeof(val))
1964 			return -EINVAL;
1965 		if (copy_from_user(&val, optval, sizeof(val)))
1966 			return -EFAULT;
1967 
1968 		po->has_vnet_hdr = !!val;
1969 		return 0;
1970 	}
1971 	default:
1972 		return -ENOPROTOOPT;
1973 	}
1974 }
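
/*
 * A typical ring setup from userspace, matching the PACKET_VERSION and
 * PACKET_RX_RING handling above (sizes are arbitrary but must satisfy
 * tp_frame_nr == (tp_block_size / tp_frame_size) * tp_block_nr; the
 * version must be chosen before the ring is created):
 *
 *	int ver = TPACKET_V2;
 *	struct tpacket_req req = {
 *		.tp_block_size	= 4096,
 *		.tp_block_nr	= 64,
 *		.tp_frame_size	= 2048,
 *		.tp_frame_nr	= 128,		// (4096 / 2048) * 64
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *	void *ring = mmap(NULL, (size_t)req.tp_block_size * req.tp_block_nr,
 *			  PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 */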
1975 
1976 static int packet_getsockopt(struct socket *sock, int level, int optname,
1977 			     char __user *optval, int __user *optlen)
1978 {
1979 	int len;
1980 	int val;
1981 	struct sock *sk = sock->sk;
1982 	struct packet_sock *po = pkt_sk(sk);
1983 	void *data;
1984 	struct tpacket_stats st;
1985 
1986 	if (level != SOL_PACKET)
1987 		return -ENOPROTOOPT;
1988 
1989 	if (get_user(len, optlen))
1990 		return -EFAULT;
1991 
1992 	if (len < 0)
1993 		return -EINVAL;
1994 
1995 	switch (optname) {
1996 	case PACKET_STATISTICS:
1997 		if (len > sizeof(struct tpacket_stats))
1998 			len = sizeof(struct tpacket_stats);
1999 		spin_lock_bh(&sk->sk_receive_queue.lock);
2000 		st = po->stats;
2001 		memset(&po->stats, 0, sizeof(st));
2002 		spin_unlock_bh(&sk->sk_receive_queue.lock);
2003 		st.tp_packets += st.tp_drops;
2004 
2005 		data = &st;
2006 		break;
2007 	case PACKET_AUXDATA:
2008 		if (len > sizeof(int))
2009 			len = sizeof(int);
2010 		val = po->auxdata;
2011 
2012 		data = &val;
2013 		break;
2014 	case PACKET_ORIGDEV:
2015 		if (len > sizeof(int))
2016 			len = sizeof(int);
2017 		val = po->origdev;
2018 
2019 		data = &val;
2020 		break;
2021 	case PACKET_VNET_HDR:
2022 		if (len > sizeof(int))
2023 			len = sizeof(int);
2024 		val = po->has_vnet_hdr;
2025 
2026 		data = &val;
2027 		break;
2028 	case PACKET_VERSION:
2029 		if (len > sizeof(int))
2030 			len = sizeof(int);
2031 		val = po->tp_version;
2032 		data = &val;
2033 		break;
2034 	case PACKET_HDRLEN:
2035 		if (len > sizeof(int))
2036 			len = sizeof(int);
2037 		if (copy_from_user(&val, optval, len))
2038 			return -EFAULT;
2039 		switch (val) {
2040 		case TPACKET_V1:
2041 			val = sizeof(struct tpacket_hdr);
2042 			break;
2043 		case TPACKET_V2:
2044 			val = sizeof(struct tpacket2_hdr);
2045 			break;
2046 		default:
2047 			return -EINVAL;
2048 		}
2049 		data = &val;
2050 		break;
2051 	case PACKET_RESERVE:
2052 		if (len > sizeof(unsigned int))
2053 			len = sizeof(unsigned int);
2054 		val = po->tp_reserve;
2055 		data = &val;
2056 		break;
2057 	case PACKET_LOSS:
2058 		if (len > sizeof(unsigned int))
2059 			len = sizeof(unsigned int);
2060 		val = po->tp_loss;
2061 		data = &val;
2062 		break;
2063 	default:
2064 		return -ENOPROTOOPT;
2065 	}
2066 
2067 	if (put_user(len, optlen))
2068 		return -EFAULT;
2069 	if (copy_to_user(optval, data, len))
2070 		return -EFAULT;
2071 	return 0;
2072 }
2073 
2074 
2075 static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
2076 {
2077 	struct sock *sk;
2078 	struct hlist_node *node;
2079 	struct net_device *dev = data;
2080 	struct net *net = dev_net(dev);
2081 
2082 	rcu_read_lock();
2083 	sk_for_each_rcu(sk, node, &net->packet.sklist) {
2084 		struct packet_sock *po = pkt_sk(sk);
2085 
2086 		switch (msg) {
2087 		case NETDEV_UNREGISTER:
2088 			if (po->mclist)
2089 				packet_dev_mclist(dev, po->mclist, -1);
2090 			/* fallthrough */
2091 
2092 		case NETDEV_DOWN:
2093 			if (dev->ifindex == po->ifindex) {
2094 				spin_lock(&po->bind_lock);
2095 				if (po->running) {
2096 					__dev_remove_pack(&po->prot_hook);
2097 					__sock_put(sk);
2098 					po->running = 0;
2099 					sk->sk_err = ENETDOWN;
2100 					if (!sock_flag(sk, SOCK_DEAD))
2101 						sk->sk_error_report(sk);
2102 				}
2103 				if (msg == NETDEV_UNREGISTER) {
2104 					po->ifindex = -1;
2105 					po->prot_hook.dev = NULL;
2106 				}
2107 				spin_unlock(&po->bind_lock);
2108 			}
2109 			break;
2110 		case NETDEV_UP:
2111 			if (dev->ifindex == po->ifindex) {
2112 				spin_lock(&po->bind_lock);
2113 				if (po->num && !po->running) {
2114 					dev_add_pack(&po->prot_hook);
2115 					sock_hold(sk);
2116 					po->running = 1;
2117 				}
2118 				spin_unlock(&po->bind_lock);
2119 			}
2120 			break;
2121 		}
2122 	}
2123 	rcu_read_unlock();
2124 	return NOTIFY_DONE;
2125 }
2126 
2127 
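/*
 * Socket ioctls: SIOCOUTQ reports bytes queued for transmit, SIOCINQ the
 * length of the next packet in the receive queue, and the timestamp
 * ioctls use the generic socket helpers.  With CONFIG_INET the classic
 * interface, routing and ARP ioctls are forwarded to inet_dgram_ops.
 */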
2128 static int packet_ioctl(struct socket *sock, unsigned int cmd,
2129 			unsigned long arg)
2130 {
2131 	struct sock *sk = sock->sk;
2132 
2133 	switch (cmd) {
2134 	case SIOCOUTQ:
2135 	{
2136 		int amount = sk_wmem_alloc_get(sk);
2137 
2138 		return put_user(amount, (int __user *)arg);
2139 	}
2140 	case SIOCINQ:
2141 	{
2142 		struct sk_buff *skb;
2143 		int amount = 0;
2144 
2145 		spin_lock_bh(&sk->sk_receive_queue.lock);
2146 		skb = skb_peek(&sk->sk_receive_queue);
2147 		if (skb)
2148 			amount = skb->len;
2149 		spin_unlock_bh(&sk->sk_receive_queue.lock);
2150 		return put_user(amount, (int __user *)arg);
2151 	}
2152 	case SIOCGSTAMP:
2153 		return sock_get_timestamp(sk, (struct timeval __user *)arg);
2154 	case SIOCGSTAMPNS:
2155 		return sock_get_timestampns(sk, (struct timespec __user *)arg);
2156 
2157 #ifdef CONFIG_INET
2158 	case SIOCADDRT:
2159 	case SIOCDELRT:
2160 	case SIOCDARP:
2161 	case SIOCGARP:
2162 	case SIOCSARP:
2163 	case SIOCGIFADDR:
2164 	case SIOCSIFADDR:
2165 	case SIOCGIFBRDADDR:
2166 	case SIOCSIFBRDADDR:
2167 	case SIOCGIFNETMASK:
2168 	case SIOCSIFNETMASK:
2169 	case SIOCGIFDSTADDR:
2170 	case SIOCSIFDSTADDR:
2171 	case SIOCSIFFLAGS:
2172 		return inet_dgram_ops.ioctl(sock, cmd, arg);
2173 #endif
2174 
2175 	default:
2176 		return -ENOIOCTLCMD;
2177 	}
2178 	return 0;
2179 }
2180 
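/*
 * Generic datagram poll plus ring awareness: the socket is readable once
 * the rx ring has handed a frame to user space, and writable while the
 * current tx ring frame is still free for the application to fill.
 */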
2181 static unsigned int packet_poll(struct file *file, struct socket *sock,
2182 				poll_table *wait)
2183 {
2184 	struct sock *sk = sock->sk;
2185 	struct packet_sock *po = pkt_sk(sk);
2186 	unsigned int mask = datagram_poll(file, sock, wait);
2187 
2188 	spin_lock_bh(&sk->sk_receive_queue.lock);
2189 	if (po->rx_ring.pg_vec) {
2190 		if (!packet_previous_frame(po, &po->rx_ring, TP_STATUS_KERNEL))
2191 			mask |= POLLIN | POLLRDNORM;
2192 	}
2193 	spin_unlock_bh(&sk->sk_receive_queue.lock);
2194 	spin_lock_bh(&sk->sk_write_queue.lock);
2195 	if (po->tx_ring.pg_vec) {
2196 		if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
2197 			mask |= POLLOUT | POLLWRNORM;
2198 	}
2199 	spin_unlock_bh(&sk->sk_write_queue.lock);
2200 	return mask;
2201 }
2202 
2203 
2204 /* Crude, but we have not found a better way to account for user mmaps:
2205  * po->mapped tracks the number of live mappings of the ring buffers.
2206  */
2207 
2208 static void packet_mm_open(struct vm_area_struct *vma)
2209 {
2210 	struct file *file = vma->vm_file;
2211 	struct socket *sock = file->private_data;
2212 	struct sock *sk = sock->sk;
2213 
2214 	if (sk)
2215 		atomic_inc(&pkt_sk(sk)->mapped);
2216 }
2217 
2218 static void packet_mm_close(struct vm_area_struct *vma)
2219 {
2220 	struct file *file = vma->vm_file;
2221 	struct socket *sock = file->private_data;
2222 	struct sock *sk = sock->sk;
2223 
2224 	if (sk)
2225 		atomic_dec(&pkt_sk(sk)->mapped);
2226 }
2227 
2228 static const struct vm_operations_struct packet_mmap_ops = {
2229 	.open	=	packet_mm_open,
2230 	.close	=	packet_mm_close,
2231 };
2232 
2233 static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
2234 {
2235 	int i;
2236 
2237 	for (i = 0; i < len; i++) {
2238 		if (likely(pg_vec[i]))
2239 			free_pages((unsigned long) pg_vec[i], order);
2240 	}
2241 	kfree(pg_vec);
2242 }
2243 
2244 static inline char *alloc_one_pg_vec_page(unsigned long order)
2245 {
2246 	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | __GFP_NOWARN;
2247 
2248 	return (char *) __get_free_pages(gfp_flags, order);
2249 }
2250 
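/*
 * Allocate the ring's block vector: tp_block_nr pointers, each to a
 * zeroed, physically contiguous allocation of 2^order pages.  On any
 * failure everything allocated so far is released and NULL is returned.
 */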
2251 static char **alloc_pg_vec(struct tpacket_req *req, int order)
2252 {
2253 	unsigned int block_nr = req->tp_block_nr;
2254 	char **pg_vec;
2255 	int i;
2256 
2257 	pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL);
2258 	if (unlikely(!pg_vec))
2259 		goto out;
2260 
2261 	for (i = 0; i < block_nr; i++) {
2262 		pg_vec[i] = alloc_one_pg_vec_page(order);
2263 		if (unlikely(!pg_vec[i]))
2264 			goto out_free_pgvec;
2265 	}
2266 
2267 out:
2268 	return pg_vec;
2269 
2270 out_free_pgvec:
2271 	free_pg_vec(pg_vec, order, block_nr);
2272 	pg_vec = NULL;
2273 	goto out;
2274 }
2275 
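/*
 * Install or tear down an rx/tx ring (PACKET_RX_RING/PACKET_TX_RING).
 * The request is validated (block size a positive multiple of PAGE_SIZE,
 * frame size aligned and large enough for the header plus tp_reserve,
 * frame/block counts consistent), a new block vector is allocated, the
 * protocol hook is temporarily removed, the old and new vectors are
 * swapped under pg_vec_lock and the queue lock, and the hook is re-added.
 * The swap is refused with -EBUSY while the rings are mapped or tx frames
 * are still pending.
 *
 * A minimal user-space sketch (illustrative values, no error handling):
 *
 *	struct tpacket_req req = {
 *		.tp_block_size = 4096,
 *		.tp_block_nr   = 64,
 *		.tp_frame_size = 2048,
 *		.tp_frame_nr   = 128,
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *	ring = mmap(NULL, req.tp_block_nr * req.tp_block_size,
 *		    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 */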
2276 static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
2277 		int closing, int tx_ring)
2278 {
2279 	char **pg_vec = NULL;
2280 	struct packet_sock *po = pkt_sk(sk);
2281 	int was_running, order = 0;
2282 	struct packet_ring_buffer *rb;
2283 	struct sk_buff_head *rb_queue;
2284 	__be16 num;
2285 	int err;
2286 
2287 	rb = tx_ring ? &po->tx_ring : &po->rx_ring;
2288 	rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
2289 
2290 	err = -EBUSY;
2291 	if (!closing) {
2292 		if (atomic_read(&po->mapped))
2293 			goto out;
2294 		if (atomic_read(&rb->pending))
2295 			goto out;
2296 	}
2297 
2298 	if (req->tp_block_nr) {
2299 		/* Sanity-check the request and derive the ring geometry */
2300 		err = -EBUSY;
2301 		if (unlikely(rb->pg_vec))
2302 			goto out;
2303 
2304 		switch (po->tp_version) {
2305 		case TPACKET_V1:
2306 			po->tp_hdrlen = TPACKET_HDRLEN;
2307 			break;
2308 		case TPACKET_V2:
2309 			po->tp_hdrlen = TPACKET2_HDRLEN;
2310 			break;
2311 		}
2312 
2313 		err = -EINVAL;
2314 		if (unlikely((int)req->tp_block_size <= 0))
2315 			goto out;
2316 		if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
2317 			goto out;
2318 		if (unlikely(req->tp_frame_size < po->tp_hdrlen +
2319 					po->tp_reserve))
2320 			goto out;
2321 		if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
2322 			goto out;
2323 
2324 		rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
2325 		if (unlikely(rb->frames_per_block <= 0))
2326 			goto out;
2327 		if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
2328 					req->tp_frame_nr))
2329 			goto out;
2330 
2331 		err = -ENOMEM;
2332 		order = get_order(req->tp_block_size);
2333 		pg_vec = alloc_pg_vec(req, order);
2334 		if (unlikely(!pg_vec))
2335 			goto out;
2336 	}
2337 	/* tp_block_nr == 0: tear down any existing ring */
2338 	else {
2339 		err = -EINVAL;
2340 		if (unlikely(req->tp_frame_nr))
2341 			goto out;
2342 	}
2343 
2344 	lock_sock(sk);
2345 
2346 	/* Detach socket from network */
2347 	spin_lock(&po->bind_lock);
2348 	was_running = po->running;
2349 	num = po->num;
2350 	if (was_running) {
2351 		__dev_remove_pack(&po->prot_hook);
2352 		po->num = 0;
2353 		po->running = 0;
2354 		__sock_put(sk);
2355 	}
2356 	spin_unlock(&po->bind_lock);
2357 
2358 	synchronize_net();
2359 
2360 	err = -EBUSY;
2361 	mutex_lock(&po->pg_vec_lock);
2362 	if (closing || atomic_read(&po->mapped) == 0) {
2363 		err = 0;
2364 #define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })
2365 		spin_lock_bh(&rb_queue->lock);
2366 		pg_vec = XC(rb->pg_vec, pg_vec);
2367 		rb->frame_max = (req->tp_frame_nr - 1);
2368 		rb->head = 0;
2369 		rb->frame_size = req->tp_frame_size;
2370 		spin_unlock_bh(&rb_queue->lock);
2371 
2372 		order = XC(rb->pg_vec_order, order);
2373 		req->tp_block_nr = XC(rb->pg_vec_len, req->tp_block_nr);
2374 
2375 		rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
2376 		po->prot_hook.func = (po->rx_ring.pg_vec) ?
2377 						tpacket_rcv : packet_rcv;
2378 		skb_queue_purge(rb_queue);
2379 #undef XC
2380 		if (atomic_read(&po->mapped))
2381 			pr_err("packet_mmap: vma is busy: %d\n",
2382 			       atomic_read(&po->mapped));
2383 	}
2384 	mutex_unlock(&po->pg_vec_lock);
2385 
2386 	spin_lock(&po->bind_lock);
2387 	if (was_running && !po->running) {
2388 		sock_hold(sk);
2389 		po->running = 1;
2390 		po->num = num;
2391 		dev_add_pack(&po->prot_hook);
2392 	}
2393 	spin_unlock(&po->bind_lock);
2394 
2395 	release_sock(sk);
2396 
2397 	if (pg_vec)
2398 		free_pg_vec(pg_vec, order, req->tp_block_nr);
2399 out:
2400 	return err;
2401 }
2402 
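/*
 * Map the rx ring followed by the tx ring into one contiguous VMA.  The
 * mapping must start at offset 0 and be exactly the combined size of the
 * configured rings; each page is installed with vm_insert_page() and
 * po->mapped is raised so packet_set_ring() will not free the blocks
 * while they are still mapped.
 */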
2403 static int packet_mmap(struct file *file, struct socket *sock,
2404 		struct vm_area_struct *vma)
2405 {
2406 	struct sock *sk = sock->sk;
2407 	struct packet_sock *po = pkt_sk(sk);
2408 	unsigned long size, expected_size;
2409 	struct packet_ring_buffer *rb;
2410 	unsigned long start;
2411 	int err = -EINVAL;
2412 	int i;
2413 
2414 	if (vma->vm_pgoff)
2415 		return -EINVAL;
2416 
2417 	mutex_lock(&po->pg_vec_lock);
2418 
2419 	expected_size = 0;
2420 	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
2421 		if (rb->pg_vec) {
2422 			expected_size += rb->pg_vec_len
2423 						* rb->pg_vec_pages
2424 						* PAGE_SIZE;
2425 		}
2426 	}
2427 
2428 	if (expected_size == 0)
2429 		goto out;
2430 
2431 	size = vma->vm_end - vma->vm_start;
2432 	if (size != expected_size)
2433 		goto out;
2434 
2435 	start = vma->vm_start;
2436 	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
2437 		if (rb->pg_vec == NULL)
2438 			continue;
2439 
2440 		for (i = 0; i < rb->pg_vec_len; i++) {
2441 			struct page *page = virt_to_page(rb->pg_vec[i]);
2442 			int pg_num;
2443 
2444 			for (pg_num = 0; pg_num < rb->pg_vec_pages;
2445 					pg_num++, page++) {
2446 				err = vm_insert_page(vma, start, page);
2447 				if (unlikely(err))
2448 					goto out;
2449 				start += PAGE_SIZE;
2450 			}
2451 		}
2452 	}
2453 
2454 	atomic_inc(&po->mapped);
2455 	vma->vm_ops = &packet_mmap_ops;
2456 	err = 0;
2457 
2458 out:
2459 	mutex_unlock(&po->pg_vec_lock);
2460 	return err;
2461 }
2462 
2463 static const struct proto_ops packet_ops_spkt = {
2464 	.family =	PF_PACKET,
2465 	.owner =	THIS_MODULE,
2466 	.release =	packet_release,
2467 	.bind =		packet_bind_spkt,
2468 	.connect =	sock_no_connect,
2469 	.socketpair =	sock_no_socketpair,
2470 	.accept =	sock_no_accept,
2471 	.getname =	packet_getname_spkt,
2472 	.poll =		datagram_poll,
2473 	.ioctl =	packet_ioctl,
2474 	.listen =	sock_no_listen,
2475 	.shutdown =	sock_no_shutdown,
2476 	.setsockopt =	sock_no_setsockopt,
2477 	.getsockopt =	sock_no_getsockopt,
2478 	.sendmsg =	packet_sendmsg_spkt,
2479 	.recvmsg =	packet_recvmsg,
2480 	.mmap =		sock_no_mmap,
2481 	.sendpage =	sock_no_sendpage,
2482 };
2483 
2484 static const struct proto_ops packet_ops = {
2485 	.family =	PF_PACKET,
2486 	.owner =	THIS_MODULE,
2487 	.release =	packet_release,
2488 	.bind =		packet_bind,
2489 	.connect =	sock_no_connect,
2490 	.socketpair =	sock_no_socketpair,
2491 	.accept =	sock_no_accept,
2492 	.getname =	packet_getname,
2493 	.poll =		packet_poll,
2494 	.ioctl =	packet_ioctl,
2495 	.listen =	sock_no_listen,
2496 	.shutdown =	sock_no_shutdown,
2497 	.setsockopt =	packet_setsockopt,
2498 	.getsockopt =	packet_getsockopt,
2499 	.sendmsg =	packet_sendmsg,
2500 	.recvmsg =	packet_recvmsg,
2501 	.mmap =		packet_mmap,
2502 	.sendpage =	sock_no_sendpage,
2503 };
2504 
2505 static const struct net_proto_family packet_family_ops = {
2506 	.family =	PF_PACKET,
2507 	.create =	packet_create,
2508 	.owner	=	THIS_MODULE,
2509 };
2510 
2511 static struct notifier_block packet_netdev_notifier = {
2512 	.notifier_call =	packet_notifier,
2513 };
2514 
2515 #ifdef CONFIG_PROC_FS
2516 
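/* /proc/net/packet: one line per packet socket, walked under RCU. */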
2517 static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
2518 	__acquires(RCU)
2519 {
2520 	struct net *net = seq_file_net(seq);
2521 
2522 	rcu_read_lock();
2523 	return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
2524 }
2525 
2526 static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2527 {
2528 	struct net *net = seq_file_net(seq);
2529 	return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
2530 }
2531 
2532 static void packet_seq_stop(struct seq_file *seq, void *v)
2533 	__releases(RCU)
2534 {
2535 	rcu_read_unlock();
2536 }
2537 
2538 static int packet_seq_show(struct seq_file *seq, void *v)
2539 {
2540 	if (v == SEQ_START_TOKEN)
2541 		seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
2542 	else {
2543 		struct sock *s = sk_entry(v);
2544 		const struct packet_sock *po = pkt_sk(s);
2545 
2546 		seq_printf(seq,
2547 			   "%p %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
2548 			   s,
2549 			   atomic_read(&s->sk_refcnt),
2550 			   s->sk_type,
2551 			   ntohs(po->num),
2552 			   po->ifindex,
2553 			   po->running,
2554 			   atomic_read(&s->sk_rmem_alloc),
2555 			   sock_i_uid(s),
2556 			   sock_i_ino(s));
2557 	}
2558 
2559 	return 0;
2560 }
2561 
2562 static const struct seq_operations packet_seq_ops = {
2563 	.start	= packet_seq_start,
2564 	.next	= packet_seq_next,
2565 	.stop	= packet_seq_stop,
2566 	.show	= packet_seq_show,
2567 };
2568 
2569 static int packet_seq_open(struct inode *inode, struct file *file)
2570 {
2571 	return seq_open_net(inode, file, &packet_seq_ops,
2572 			    sizeof(struct seq_net_private));
2573 }
2574 
2575 static const struct file_operations packet_seq_fops = {
2576 	.owner		= THIS_MODULE,
2577 	.open		= packet_seq_open,
2578 	.read		= seq_read,
2579 	.llseek		= seq_lseek,
2580 	.release	= seq_release_net,
2581 };
2582 
2583 #endif
2584 
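/*
 * Per-namespace setup: initialize the socket list and create the
 * /proc/net/packet entry; the exit hook removes the proc entry again.
 */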
2585 static int __net_init packet_net_init(struct net *net)
2586 {
2587 	spin_lock_init(&net->packet.sklist_lock);
2588 	INIT_HLIST_HEAD(&net->packet.sklist);
2589 
2590 	if (!proc_net_fops_create(net, "packet", 0, &packet_seq_fops))
2591 		return -ENOMEM;
2592 
2593 	return 0;
2594 }
2595 
2596 static void __net_exit packet_net_exit(struct net *net)
2597 {
2598 	proc_net_remove(net, "packet");
2599 }
2600 
2601 static struct pernet_operations packet_net_ops = {
2602 	.init = packet_net_init,
2603 	.exit = packet_net_exit,
2604 };
2605 
2606 
2607 static void __exit packet_exit(void)
2608 {
2609 	unregister_netdevice_notifier(&packet_netdev_notifier);
2610 	unregister_pernet_subsys(&packet_net_ops);
2611 	sock_unregister(PF_PACKET);
2612 	proto_unregister(&packet_proto);
2613 }
2614 
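/*
 * Module init: register the protocol, the PF_PACKET socket family, the
 * per-namespace operations and the netdevice notifier; packet_exit()
 * above unwinds these registrations.
 */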
2615 static int __init packet_init(void)
2616 {
2617 	int rc = proto_register(&packet_proto, 0);
2618 
2619 	if (rc != 0)
2620 		goto out;
2621 
2622 	sock_register(&packet_family_ops);
2623 	register_pernet_subsys(&packet_net_ops);
2624 	register_netdevice_notifier(&packet_netdev_notifier);
2625 out:
2626 	return rc;
2627 }
2628 
2629 module_init(packet_init);
2630 module_exit(packet_exit);
2631 MODULE_LICENSE("GPL");
2632 MODULE_ALIAS_NETPROTO(PF_PACKET);
2633