xref: /linux/net/packet/af_packet.c (revision dfc349402de8e95f6a42e8341e9ea193b718eee3)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		PACKET - implements raw packet sockets.
7  *
8  * Authors:	Ross Biro
9  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *
12  * Fixes:
13  *		Alan Cox	:	verify_area() now used correctly
14  *		Alan Cox	:	new skbuff lists, look ma no backlogs!
15  *		Alan Cox	:	tidied skbuff lists.
16  *		Alan Cox	:	Now uses generic datagram routines I
17  *					added. Also fixed the peek/read crash
18  *					from all old Linux datagram code.
19  *		Alan Cox	:	Uses the improved datagram code.
20  *		Alan Cox	:	Added NULL's for socket options.
21  *		Alan Cox	:	Re-commented the code.
22  *		Alan Cox	:	Use new kernel side addressing
23  *		Rob Janssen	:	Correct MTU usage.
24  *		Dave Platt	:	Counter leaks caused by incorrect
25  *					interrupt locking and some slightly
26  *					dubious gcc output. Can you read
27  *					compiler: it said _VOLATILE_
28  *	Richard Kooijman	:	Timestamp fixes.
29  *		Alan Cox	:	New buffers. Use sk->mac.raw.
30  *		Alan Cox	:	sendmsg/recvmsg support.
31  *		Alan Cox	:	Protocol setting support
32  *	Alexey Kuznetsov	:	Untied from IPv4 stack.
33  *	Cyrus Durgin		:	Fixed kerneld for kmod.
34  *	Michal Ostrowski        :       Module initialization cleanup.
35  *         Ulises Alonso        :       Frame number limit removal and
36  *                                      packet_set_ring memory leak.
37  *		Eric Biederman	:	Allow for > 8 byte hardware addresses.
38  *					The convention is that longer addresses
39  *					will simply extend the hardware address
40  *					byte arrays at the end of sockaddr_ll
41  *					and packet_mreq.
42  *		Johann Baudy	:	Added TX RING.
43  *
44  *		This program is free software; you can redistribute it and/or
45  *		modify it under the terms of the GNU General Public License
46  *		as published by the Free Software Foundation; either version
47  *		2 of the License, or (at your option) any later version.
48  *
49  */
50 
51 #include <linux/types.h>
52 #include <linux/mm.h>
53 #include <linux/capability.h>
54 #include <linux/fcntl.h>
55 #include <linux/socket.h>
56 #include <linux/in.h>
57 #include <linux/inet.h>
58 #include <linux/netdevice.h>
59 #include <linux/if_packet.h>
60 #include <linux/wireless.h>
61 #include <linux/kernel.h>
62 #include <linux/kmod.h>
63 #include <net/net_namespace.h>
64 #include <net/ip.h>
65 #include <net/protocol.h>
66 #include <linux/skbuff.h>
67 #include <net/sock.h>
68 #include <linux/errno.h>
69 #include <linux/timer.h>
70 #include <asm/system.h>
71 #include <asm/uaccess.h>
72 #include <asm/ioctls.h>
73 #include <asm/page.h>
74 #include <asm/cacheflush.h>
75 #include <asm/io.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/poll.h>
79 #include <linux/module.h>
80 #include <linux/init.h>
81 #include <linux/mutex.h>
82 
83 #ifdef CONFIG_INET
84 #include <net/inet_common.h>
85 #endif
86 
87 /*
88    Assumptions:
89    - if the device has no dev->hard_header routine, it adds and removes the ll
90      header itself. In this case the ll header is invisible outside the device,
91      but higher levels should still reserve dev->hard_header_len.
92      Some devices are clever enough to reallocate the skb when the header
93      does not fit into the reserved space (tunnels); others are not
94      (PPP).
95    - the packet socket receives packets with the ll header already pulled,
96      so SOCK_RAW has to push it back.
97 
98 On receive:
99 -----------
100 
101 Incoming, dev->hard_header!=NULL
102    mac_header -> ll header
103    data       -> data
104 
105 Outgoing, dev->hard_header!=NULL
106    mac_header -> ll header
107    data       -> ll header
108 
109 Incoming, dev->hard_header==NULL
110    mac_header -> UNKNOWN position. It is very likely that it points to the ll
111 		 header.  PPP does this, which is wrong, because it introduces
112 		 asymmetry between the rx and tx paths.
113    data       -> data
114 
115 Outgoing, dev->hard_header==NULL
116    mac_header -> data. ll header is still not built!
117    data       -> data
118 
119 Summary
120   If dev->hard_header==NULL we are unlikely to restore a sensible ll header.
121 
122 
123 On transmit:
124 ------------
125 
126 dev->hard_header != NULL
127    mac_header -> ll header
128    data       -> ll header
129 
130 dev->hard_header == NULL (ll header is added by device, we cannot control it)
131    mac_header -> data
132    data       -> data
133 
134    We should set nh.raw on output to the correct position;
135    the packet classifier depends on it.
136  */
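/*
 * Editor's illustration (not part of the kernel sources): how the layout
 * above looks from userspace, assuming an Ethernet device.  A SOCK_RAW
 * packet socket sees the link-layer header, a SOCK_DGRAM one does not:
 *
 *	int raw  = socket(AF_PACKET, SOCK_RAW,   htons(ETH_P_ALL));
 *	int dgrm = socket(AF_PACKET, SOCK_DGRAM, htons(ETH_P_ALL));
 *
 *	char buf[2048];
 *	ssize_t n = recv(raw, buf, sizeof(buf), 0);
 *	(buf[0..13] is the Ethernet header)
 *	n = recv(dgrm, buf, sizeof(buf), 0);
 *	(buf starts at the network header; link-layer information is
 *	 reported via the sockaddr_ll filled in by recvfrom() instead)
 */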
137 
138 /* Private packet socket structures. */
139 
140 struct packet_mclist {
141 	struct packet_mclist	*next;
142 	int			ifindex;
143 	int			count;
144 	unsigned short		type;
145 	unsigned short		alen;
146 	unsigned char		addr[MAX_ADDR_LEN];
147 };
148 /* identical to struct packet_mreq except it has
149  * a longer address field.
150  */
151 struct packet_mreq_max {
152 	int		mr_ifindex;
153 	unsigned short	mr_type;
154 	unsigned short	mr_alen;
155 	unsigned char	mr_address[MAX_ADDR_LEN];
156 };
157 
158 #ifdef CONFIG_PACKET_MMAP
159 static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
160 		int closing, int tx_ring);
161 
162 struct packet_ring_buffer {
163 	char			**pg_vec;
164 	unsigned int		head;
165 	unsigned int		frames_per_block;
166 	unsigned int		frame_size;
167 	unsigned int		frame_max;
168 
169 	unsigned int		pg_vec_order;
170 	unsigned int		pg_vec_pages;
171 	unsigned int		pg_vec_len;
172 
173 	atomic_t		pending;
174 };
175 
176 struct packet_sock;
177 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
178 #endif
179 
180 static void packet_flush_mclist(struct sock *sk);
181 
182 struct packet_sock {
183 	/* struct sock has to be the first member of packet_sock */
184 	struct sock		sk;
185 	struct tpacket_stats	stats;
186 #ifdef CONFIG_PACKET_MMAP
187 	struct packet_ring_buffer	rx_ring;
188 	struct packet_ring_buffer	tx_ring;
189 	int			copy_thresh;
190 #endif
191 	struct packet_type	prot_hook;
192 	spinlock_t		bind_lock;
193 	struct mutex		pg_vec_lock;
194 	unsigned int		running:1,	/* prot_hook is attached*/
195 				auxdata:1,
196 				origdev:1;
197 	int			ifindex;	/* bound device		*/
198 	__be16			num;
199 	struct packet_mclist	*mclist;
200 #ifdef CONFIG_PACKET_MMAP
201 	atomic_t		mapped;
202 	enum tpacket_versions	tp_version;
203 	unsigned int		tp_hdrlen;
204 	unsigned int		tp_reserve;
205 	unsigned int		tp_loss:1;
206 #endif
207 };
208 
209 struct packet_skb_cb {
210 	unsigned int origlen;
211 	union {
212 		struct sockaddr_pkt pkt;
213 		struct sockaddr_ll ll;
214 	} sa;
215 };
216 
217 #define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))
218 
219 #ifdef CONFIG_PACKET_MMAP
220 
221 static void __packet_set_status(struct packet_sock *po, void *frame, int status)
222 {
223 	union {
224 		struct tpacket_hdr *h1;
225 		struct tpacket2_hdr *h2;
226 		void *raw;
227 	} h;
228 
229 	h.raw = frame;
230 	switch (po->tp_version) {
231 	case TPACKET_V1:
232 		h.h1->tp_status = status;
233 		flush_dcache_page(virt_to_page(&h.h1->tp_status));
234 		break;
235 	case TPACKET_V2:
236 		h.h2->tp_status = status;
237 		flush_dcache_page(virt_to_page(&h.h2->tp_status));
238 		break;
239 	default:
240 		pr_err("TPACKET version not supported\n");
241 		BUG();
242 	}
243 
244 	smp_wmb();
245 }
246 
247 static int __packet_get_status(struct packet_sock *po, void *frame)
248 {
249 	union {
250 		struct tpacket_hdr *h1;
251 		struct tpacket2_hdr *h2;
252 		void *raw;
253 	} h;
254 
255 	smp_rmb();
256 
257 	h.raw = frame;
258 	switch (po->tp_version) {
259 	case TPACKET_V1:
260 		flush_dcache_page(virt_to_page(&h.h1->tp_status));
261 		return h.h1->tp_status;
262 	case TPACKET_V2:
263 		flush_dcache_page(virt_to_page(&h.h2->tp_status));
264 		return h.h2->tp_status;
265 	default:
266 		pr_err("TPACKET version not supported\n");
267 		BUG();
268 		return 0;
269 	}
270 }
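/*
 * The two helpers above implement the kernel side of the tp_status
 * handshake: the kernel owns a frame while tp_status == TP_STATUS_KERNEL
 * and hands it over by setting TP_STATUS_USER; userspace returns it by
 * writing TP_STATUS_KERNEL back.  A minimal sketch of the userspace side
 * (illustrative only; assumes TPACKET_V2 and a frame pointer "frame"
 * into the mmap()ed ring):
 *
 *	struct tpacket2_hdr *hdr = (struct tpacket2_hdr *)frame;
 *	while (!(hdr->tp_status & TP_STATUS_USER))
 *		poll(&pfd, 1, -1);              (wait for the kernel)
 *	handle(frame + hdr->tp_mac, hdr->tp_snaplen);
 *	hdr->tp_status = TP_STATUS_KERNEL;      (give the frame back)
 */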
271 
272 static void *packet_lookup_frame(struct packet_sock *po,
273 		struct packet_ring_buffer *rb,
274 		unsigned int position,
275 		int status)
276 {
277 	unsigned int pg_vec_pos, frame_offset;
278 	union {
279 		struct tpacket_hdr *h1;
280 		struct tpacket2_hdr *h2;
281 		void *raw;
282 	} h;
283 
284 	pg_vec_pos = position / rb->frames_per_block;
285 	frame_offset = position % rb->frames_per_block;
286 
287 	h.raw = rb->pg_vec[pg_vec_pos] + (frame_offset * rb->frame_size);
288 
289 	if (status != __packet_get_status(po, h.raw))
290 		return NULL;
291 
292 	return h.raw;
293 }
294 
295 static inline void *packet_current_frame(struct packet_sock *po,
296 		struct packet_ring_buffer *rb,
297 		int status)
298 {
299 	return packet_lookup_frame(po, rb, rb->head, status);
300 }
301 
302 static inline void *packet_previous_frame(struct packet_sock *po,
303 		struct packet_ring_buffer *rb,
304 		int status)
305 {
306 	unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
307 	return packet_lookup_frame(po, rb, previous, status);
308 }
309 
310 static inline void packet_increment_head(struct packet_ring_buffer *buff)
311 {
312 	buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
313 }
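/*
 * Worked example for the lookup and head arithmetic above (numbers are
 * illustrative only): with tp_block_size 4096, tp_frame_size 2048 and
 * tp_block_nr 4, frames_per_block is 2 and frame_max is 7.  Frame number
 * 5 then lives in block 5 / 2 = 2 at offset (5 % 2) * 2048 = 2048, and
 * packet_increment_head() wraps the head back to 0 after frame 7.
 */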
314 
315 #endif
316 
317 static inline struct packet_sock *pkt_sk(struct sock *sk)
318 {
319 	return (struct packet_sock *)sk;
320 }
321 
322 static void packet_sock_destruct(struct sock *sk)
323 {
324 	WARN_ON(atomic_read(&sk->sk_rmem_alloc));
325 	WARN_ON(atomic_read(&sk->sk_wmem_alloc));
326 
327 	if (!sock_flag(sk, SOCK_DEAD)) {
328 		pr_err("Attempt to release alive packet socket: %p\n", sk);
329 		return;
330 	}
331 
332 	sk_refcnt_debug_dec(sk);
333 }
334 
335 
336 static const struct proto_ops packet_ops;
337 
338 static const struct proto_ops packet_ops_spkt;
339 
340 static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
341 			   struct packet_type *pt, struct net_device *orig_dev)
342 {
343 	struct sock *sk;
344 	struct sockaddr_pkt *spkt;
345 
346 	/*
347 	 *	When we registered the protocol we saved the socket in the data
348 	 *	field for just this event.
349 	 */
350 
351 	sk = pt->af_packet_priv;
352 
353 	/*
354 	 *	Yank back the headers [hope the device set this
355 	 *	right or kerboom...]
356 	 *
357 	 *	Incoming packets have ll header pulled,
358 	 *	push it back.
359 	 *
360 	 *	For outgoing ones skb->data == skb_mac_header(skb),
361 	 *	so this procedure is a no-op.
362 	 */
363 
364 	if (skb->pkt_type == PACKET_LOOPBACK)
365 		goto out;
366 
367 	if (dev_net(dev) != sock_net(sk))
368 		goto out;
369 
370 	skb = skb_share_check(skb, GFP_ATOMIC);
371 	if (skb == NULL)
372 		goto oom;
373 
374 	/* drop any routing info */
375 	skb_dst_drop(skb);
376 
377 	/* drop conntrack reference */
378 	nf_reset(skb);
379 
380 	spkt = &PACKET_SKB_CB(skb)->sa.pkt;
381 
382 	skb_push(skb, skb->data - skb_mac_header(skb));
383 
384 	/*
385 	 *	The SOCK_PACKET socket receives _all_ frames.
386 	 */
387 
388 	spkt->spkt_family = dev->type;
389 	strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
390 	spkt->spkt_protocol = skb->protocol;
391 
392 	/*
393 	 *	Charge the memory to the socket. This is done specifically
394 	 *	to prevent sockets from using up all the memory.
395 	 */
396 
397 	if (sock_queue_rcv_skb(sk, skb) == 0)
398 		return 0;
399 
400 out:
401 	kfree_skb(skb);
402 oom:
403 	return 0;
404 }
405 
406 
407 /*
408  *	Output a raw packet to a device layer. This bypasses all the other
409  *	protocol layers and you must therefore supply it with a complete frame
410  */
411 
412 static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
413 			       struct msghdr *msg, size_t len)
414 {
415 	struct sock *sk = sock->sk;
416 	struct sockaddr_pkt *saddr = (struct sockaddr_pkt *)msg->msg_name;
417 	struct sk_buff *skb;
418 	struct net_device *dev;
419 	__be16 proto = 0;
420 	int err;
421 
422 	/*
423 	 *	Get and verify the address.
424 	 */
425 
426 	if (saddr) {
427 		if (msg->msg_namelen < sizeof(struct sockaddr))
428 			return -EINVAL;
429 		if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
430 			proto = saddr->spkt_protocol;
431 	} else
432 		return -ENOTCONN;	/* SOCK_PACKET must be sent giving an address */
433 
434 	/*
435 	 *	Find the device first to size check it
436 	 */
437 
438 	saddr->spkt_device[13] = 0;
439 	dev = dev_get_by_name(sock_net(sk), saddr->spkt_device);
440 	err = -ENODEV;
441 	if (dev == NULL)
442 		goto out_unlock;
443 
444 	err = -ENETDOWN;
445 	if (!(dev->flags & IFF_UP))
446 		goto out_unlock;
447 
448 	/*
449 	 * You may not queue a frame bigger than the MTU. This is the lowest level
450 	 * raw protocol and you must do your own fragmentation at this level.
451 	 */
452 
453 	err = -EMSGSIZE;
454 	if (len > dev->mtu + dev->hard_header_len)
455 		goto out_unlock;
456 
457 	err = -ENOBUFS;
458 	skb = sock_wmalloc(sk, len + LL_RESERVED_SPACE(dev), 0, GFP_KERNEL);
459 
460 	/*
461 	 * If the write buffer is full, then tough. At this level the user
462 	 * gets to deal with the problem - do your own algorithmic backoffs.
463 	 * That's far more flexible.
464 	 */
465 
466 	if (skb == NULL)
467 		goto out_unlock;
468 
469 	/*
470 	 *	Fill it in
471 	 */
472 
473 	/* FIXME: Save some space for broken drivers that write a
474 	 * hard header at transmission time by themselves. PPP is the
475 	 * notable one here. This should really be fixed at the driver level.
476 	 */
477 	skb_reserve(skb, LL_RESERVED_SPACE(dev));
478 	skb_reset_network_header(skb);
479 
480 	/* Try to align data part correctly */
481 	if (dev->header_ops) {
482 		skb->data -= dev->hard_header_len;
483 		skb->tail -= dev->hard_header_len;
484 		if (len < dev->hard_header_len)
485 			skb_reset_network_header(skb);
486 	}
487 
488 	/* Returns -EFAULT on error */
489 	err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
490 	skb->protocol = proto;
491 	skb->dev = dev;
492 	skb->priority = sk->sk_priority;
493 	if (err)
494 		goto out_free;
495 
496 	/*
497 	 *	Now send it
498 	 */
499 
500 	dev_queue_xmit(skb);
501 	dev_put(dev);
502 	return len;
503 
504 out_free:
505 	kfree_skb(skb);
506 out_unlock:
507 	if (dev)
508 		dev_put(dev);
509 	return err;
510 }
511 
512 static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
513 				      unsigned int res)
514 {
515 	struct sk_filter *filter;
516 
517 	rcu_read_lock_bh();
518 	filter = rcu_dereference(sk->sk_filter);
519 	if (filter != NULL)
520 		res = sk_run_filter(skb, filter->insns, filter->len);
521 	rcu_read_unlock_bh();
522 
523 	return res;
524 }
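/*
 * run_filter() executes whatever classic BPF program userspace attached
 * with SO_ATTACH_FILTER.  A minimal sketch (illustrative only, assumes an
 * Ethernet device) that keeps ARP frames and drops everything else:
 *
 *	struct sock_filter code[] = {
 *		{ BPF_LD  | BPF_H   | BPF_ABS, 0, 0, 12 },          (load ethertype)
 *		{ BPF_JMP | BPF_JEQ | BPF_K,   0, 1, ETH_P_ARP },   (is it ARP?)
 *		{ BPF_RET | BPF_K,             0, 0, 0xffffffff },  (yes: keep all)
 *		{ BPF_RET | BPF_K,             0, 0, 0 },           (no: drop)
 *	};
 *	struct sock_fprog fprog = { .len = 4, .filter = code };
 *	setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog, sizeof(fprog));
 *
 * The value returned by the filter becomes "res" in the callers below:
 * zero drops the packet, anything else caps snaplen.
 */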
525 
526 /*
527    This function performs lazy skb cloning in the hope that most packets
528    are discarded by BPF.
529 
530    Note the tricky part: we DO mangle the shared skb! skb->data, skb->len
531    and skb->cb are mangled. It works because (and as long as) packets
532    arriving here are owned by the current CPU. Output packets are cloned
533    by dev_queue_xmit_nit() and input packets are processed by net_bh
534    sequentially, so if we restore the skb to its original state on exit,
535    we will not harm anyone.
536  */
537 
538 static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
539 		      struct packet_type *pt, struct net_device *orig_dev)
540 {
541 	struct sock *sk;
542 	struct sockaddr_ll *sll;
543 	struct packet_sock *po;
544 	u8 *skb_head = skb->data;
545 	int skb_len = skb->len;
546 	unsigned int snaplen, res;
547 
548 	if (skb->pkt_type == PACKET_LOOPBACK)
549 		goto drop;
550 
551 	sk = pt->af_packet_priv;
552 	po = pkt_sk(sk);
553 
554 	if (dev_net(dev) != sock_net(sk))
555 		goto drop;
556 
557 	skb->dev = dev;
558 
559 	if (dev->header_ops) {
560 		/* The device has an explicit notion of ll header,
561 		   exported to higher levels.
562 
563 		   Otherwise, the device hides the details of its frame
564 		   structure, so the corresponding packet header is
565 		   never delivered to the user.
566 		 */
567 		if (sk->sk_type != SOCK_DGRAM)
568 			skb_push(skb, skb->data - skb_mac_header(skb));
569 		else if (skb->pkt_type == PACKET_OUTGOING) {
570 			/* Special case: outgoing packets have ll header at head */
571 			skb_pull(skb, skb_network_offset(skb));
572 		}
573 	}
574 
575 	snaplen = skb->len;
576 
577 	res = run_filter(skb, sk, snaplen);
578 	if (!res)
579 		goto drop_n_restore;
580 	if (snaplen > res)
581 		snaplen = res;
582 
583 	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
584 	    (unsigned)sk->sk_rcvbuf)
585 		goto drop_n_acct;
586 
587 	if (skb_shared(skb)) {
588 		struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
589 		if (nskb == NULL)
590 			goto drop_n_acct;
591 
592 		if (skb_head != skb->data) {
593 			skb->data = skb_head;
594 			skb->len = skb_len;
595 		}
596 		kfree_skb(skb);
597 		skb = nskb;
598 	}
599 
600 	BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
601 		     sizeof(skb->cb));
602 
603 	sll = &PACKET_SKB_CB(skb)->sa.ll;
604 	sll->sll_family = AF_PACKET;
605 	sll->sll_hatype = dev->type;
606 	sll->sll_protocol = skb->protocol;
607 	sll->sll_pkttype = skb->pkt_type;
608 	if (unlikely(po->origdev))
609 		sll->sll_ifindex = orig_dev->ifindex;
610 	else
611 		sll->sll_ifindex = dev->ifindex;
612 
613 	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
614 
615 	PACKET_SKB_CB(skb)->origlen = skb->len;
616 
617 	if (pskb_trim(skb, snaplen))
618 		goto drop_n_acct;
619 
620 	skb_set_owner_r(skb, sk);
621 	skb->dev = NULL;
622 	skb_dst_drop(skb);
623 
624 	/* drop conntrack reference */
625 	nf_reset(skb);
626 
627 	spin_lock(&sk->sk_receive_queue.lock);
628 	po->stats.tp_packets++;
629 	__skb_queue_tail(&sk->sk_receive_queue, skb);
630 	spin_unlock(&sk->sk_receive_queue.lock);
631 	sk->sk_data_ready(sk, skb->len);
632 	return 0;
633 
634 drop_n_acct:
635 	spin_lock(&sk->sk_receive_queue.lock);
636 	po->stats.tp_drops++;
637 	spin_unlock(&sk->sk_receive_queue.lock);
638 
639 drop_n_restore:
640 	if (skb_head != skb->data && skb_shared(skb)) {
641 		skb->data = skb_head;
642 		skb->len = skb_len;
643 	}
644 drop:
645 	consume_skb(skb);
646 	return 0;
647 }
648 
649 #ifdef CONFIG_PACKET_MMAP
650 static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
651 		       struct packet_type *pt, struct net_device *orig_dev)
652 {
653 	struct sock *sk;
654 	struct packet_sock *po;
655 	struct sockaddr_ll *sll;
656 	union {
657 		struct tpacket_hdr *h1;
658 		struct tpacket2_hdr *h2;
659 		void *raw;
660 	} h;
661 	u8 *skb_head = skb->data;
662 	int skb_len = skb->len;
663 	unsigned int snaplen, res;
664 	unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
665 	unsigned short macoff, netoff, hdrlen;
666 	struct sk_buff *copy_skb = NULL;
667 	struct timeval tv;
668 	struct timespec ts;
669 
670 	if (skb->pkt_type == PACKET_LOOPBACK)
671 		goto drop;
672 
673 	sk = pt->af_packet_priv;
674 	po = pkt_sk(sk);
675 
676 	if (dev_net(dev) != sock_net(sk))
677 		goto drop;
678 
679 	if (dev->header_ops) {
680 		if (sk->sk_type != SOCK_DGRAM)
681 			skb_push(skb, skb->data - skb_mac_header(skb));
682 		else if (skb->pkt_type == PACKET_OUTGOING) {
683 			/* Special case: outgoing packets have ll header at head */
684 			skb_pull(skb, skb_network_offset(skb));
685 		}
686 	}
687 
688 	if (skb->ip_summed == CHECKSUM_PARTIAL)
689 		status |= TP_STATUS_CSUMNOTREADY;
690 
691 	snaplen = skb->len;
692 
693 	res = run_filter(skb, sk, snaplen);
694 	if (!res)
695 		goto drop_n_restore;
696 	if (snaplen > res)
697 		snaplen = res;
698 
699 	if (sk->sk_type == SOCK_DGRAM) {
700 		macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
701 				  po->tp_reserve;
702 	} else {
703 		unsigned maclen = skb_network_offset(skb);
704 		netoff = TPACKET_ALIGN(po->tp_hdrlen +
705 				       (maclen < 16 ? 16 : maclen)) +
706 			po->tp_reserve;
707 		macoff = netoff - maclen;
708 	}
709 
710 	if (macoff + snaplen > po->rx_ring.frame_size) {
711 		if (po->copy_thresh &&
712 		    atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
713 		    (unsigned)sk->sk_rcvbuf) {
714 			if (skb_shared(skb)) {
715 				copy_skb = skb_clone(skb, GFP_ATOMIC);
716 			} else {
717 				copy_skb = skb_get(skb);
718 				skb_head = skb->data;
719 			}
720 			if (copy_skb)
721 				skb_set_owner_r(copy_skb, sk);
722 		}
723 		snaplen = po->rx_ring.frame_size - macoff;
724 		if ((int)snaplen < 0)
725 			snaplen = 0;
726 	}
727 
728 	spin_lock(&sk->sk_receive_queue.lock);
729 	h.raw = packet_current_frame(po, &po->rx_ring, TP_STATUS_KERNEL);
730 	if (!h.raw)
731 		goto ring_is_full;
732 	packet_increment_head(&po->rx_ring);
733 	po->stats.tp_packets++;
734 	if (copy_skb) {
735 		status |= TP_STATUS_COPY;
736 		__skb_queue_tail(&sk->sk_receive_queue, copy_skb);
737 	}
738 	if (!po->stats.tp_drops)
739 		status &= ~TP_STATUS_LOSING;
740 	spin_unlock(&sk->sk_receive_queue.lock);
741 
742 	skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
743 
744 	switch (po->tp_version) {
745 	case TPACKET_V1:
746 		h.h1->tp_len = skb->len;
747 		h.h1->tp_snaplen = snaplen;
748 		h.h1->tp_mac = macoff;
749 		h.h1->tp_net = netoff;
750 		if (skb->tstamp.tv64)
751 			tv = ktime_to_timeval(skb->tstamp);
752 		else
753 			do_gettimeofday(&tv);
754 		h.h1->tp_sec = tv.tv_sec;
755 		h.h1->tp_usec = tv.tv_usec;
756 		hdrlen = sizeof(*h.h1);
757 		break;
758 	case TPACKET_V2:
759 		h.h2->tp_len = skb->len;
760 		h.h2->tp_snaplen = snaplen;
761 		h.h2->tp_mac = macoff;
762 		h.h2->tp_net = netoff;
763 		if (skb->tstamp.tv64)
764 			ts = ktime_to_timespec(skb->tstamp);
765 		else
766 			getnstimeofday(&ts);
767 		h.h2->tp_sec = ts.tv_sec;
768 		h.h2->tp_nsec = ts.tv_nsec;
769 		h.h2->tp_vlan_tci = skb->vlan_tci;
770 		hdrlen = sizeof(*h.h2);
771 		break;
772 	default:
773 		BUG();
774 	}
775 
776 	sll = h.raw + TPACKET_ALIGN(hdrlen);
777 	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
778 	sll->sll_family = AF_PACKET;
779 	sll->sll_hatype = dev->type;
780 	sll->sll_protocol = skb->protocol;
781 	sll->sll_pkttype = skb->pkt_type;
782 	if (unlikely(po->origdev))
783 		sll->sll_ifindex = orig_dev->ifindex;
784 	else
785 		sll->sll_ifindex = dev->ifindex;
786 
787 	__packet_set_status(po, h.raw, status);
788 	smp_mb();
789 	{
790 		struct page *p_start, *p_end;
791 		u8 *h_end = h.raw + macoff + snaplen - 1;
792 
793 		p_start = virt_to_page(h.raw);
794 		p_end = virt_to_page(h_end);
795 		while (p_start <= p_end) {
796 			flush_dcache_page(p_start);
797 			p_start++;
798 		}
799 	}
800 
801 	sk->sk_data_ready(sk, 0);
802 
803 drop_n_restore:
804 	if (skb_head != skb->data && skb_shared(skb)) {
805 		skb->data = skb_head;
806 		skb->len = skb_len;
807 	}
808 drop:
809 	kfree_skb(skb);
810 	return 0;
811 
812 ring_is_full:
813 	po->stats.tp_drops++;
814 	spin_unlock(&sk->sk_receive_queue.lock);
815 
816 	sk->sk_data_ready(sk, 0);
817 	kfree_skb(copy_skb);
818 	goto drop_n_restore;
819 }
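/*
 * Userspace side of the RX ring filled in by tpacket_rcv() above -- a
 * minimal sketch, assuming TPACKET_V2 (illustrative numbers, error
 * handling omitted):
 *
 *	struct tpacket_req req = {
 *		.tp_block_size = 4096, .tp_block_nr = 64,
 *		.tp_frame_size = 2048, .tp_frame_nr  = 128,
 *	};
 *	int ver = TPACKET_V2;
 *	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *	void *ring = mmap(NULL, req.tp_block_size * req.tp_block_nr,
 *			  PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *
 * Each frame starts with a struct tpacket2_hdr; tp_mac and tp_net are the
 * offsets written by the switch statement above, and tp_status carries the
 * TP_STATUS_USER/TP_STATUS_LOSING/TP_STATUS_COPY bits set here.
 */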
820 
821 static void tpacket_destruct_skb(struct sk_buff *skb)
822 {
823 	struct packet_sock *po = pkt_sk(skb->sk);
824 	void *ph;
825 
826 	BUG_ON(skb == NULL);
827 
828 	if (likely(po->tx_ring.pg_vec)) {
829 		ph = skb_shinfo(skb)->destructor_arg;
830 		BUG_ON(__packet_get_status(po, ph) != TP_STATUS_SENDING);
831 		BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
832 		atomic_dec(&po->tx_ring.pending);
833 		__packet_set_status(po, ph, TP_STATUS_AVAILABLE);
834 	}
835 
836 	sock_wfree(skb);
837 }
838 
839 static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
840 		void *frame, struct net_device *dev, int size_max,
841 		__be16 proto, unsigned char *addr)
842 {
843 	union {
844 		struct tpacket_hdr *h1;
845 		struct tpacket2_hdr *h2;
846 		void *raw;
847 	} ph;
848 	int to_write, offset, len, tp_len, nr_frags, len_max;
849 	struct socket *sock = po->sk.sk_socket;
850 	struct page *page;
851 	void *data;
852 	int err;
853 
854 	ph.raw = frame;
855 
856 	skb->protocol = proto;
857 	skb->dev = dev;
858 	skb->priority = po->sk.sk_priority;
859 	skb_shinfo(skb)->destructor_arg = ph.raw;
860 
861 	switch (po->tp_version) {
862 	case TPACKET_V2:
863 		tp_len = ph.h2->tp_len;
864 		break;
865 	default:
866 		tp_len = ph.h1->tp_len;
867 		break;
868 	}
869 	if (unlikely(tp_len > size_max)) {
870 		pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
871 		return -EMSGSIZE;
872 	}
873 
874 	skb_reserve(skb, LL_RESERVED_SPACE(dev));
875 	skb_reset_network_header(skb);
876 
877 	data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
878 	to_write = tp_len;
879 
880 	if (sock->type == SOCK_DGRAM) {
881 		err = dev_hard_header(skb, dev, ntohs(proto), addr,
882 				NULL, tp_len);
883 		if (unlikely(err < 0))
884 			return -EINVAL;
885 	} else if (dev->hard_header_len) {
886 		/* net device doesn't like empty head */
887 		if (unlikely(tp_len <= dev->hard_header_len)) {
888 			pr_err("packet size is too short (%d < %d)\n",
889 			       tp_len, dev->hard_header_len);
890 			return -EINVAL;
891 		}
892 
893 		skb_push(skb, dev->hard_header_len);
894 		err = skb_store_bits(skb, 0, data,
895 				dev->hard_header_len);
896 		if (unlikely(err))
897 			return err;
898 
899 		data += dev->hard_header_len;
900 		to_write -= dev->hard_header_len;
901 	}
902 
903 	err = -EFAULT;
904 	page = virt_to_page(data);
905 	offset = offset_in_page(data);
906 	len_max = PAGE_SIZE - offset;
907 	len = ((to_write > len_max) ? len_max : to_write);
908 
909 	skb->data_len = to_write;
910 	skb->len += to_write;
911 	skb->truesize += to_write;
912 	atomic_add(to_write, &po->sk.sk_wmem_alloc);
913 
914 	while (likely(to_write)) {
915 		nr_frags = skb_shinfo(skb)->nr_frags;
916 
917 		if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
918 			pr_err("Packet exceeds the number of skb frags (%lu)\n",
919 			       MAX_SKB_FRAGS);
920 			return -EFAULT;
921 		}
922 
923 		flush_dcache_page(page);
924 		get_page(page);
925 		skb_fill_page_desc(skb,
926 				nr_frags,
927 				page++, offset, len);
928 		to_write -= len;
929 		offset = 0;
930 		len_max = PAGE_SIZE;
931 		len = ((to_write > len_max) ? len_max : to_write);
932 	}
933 
934 	return tp_len;
935 }
936 
937 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
938 {
939 	struct socket *sock;
940 	struct sk_buff *skb;
941 	struct net_device *dev;
942 	__be16 proto;
943 	int ifindex, err, reserve = 0;
944 	void *ph;
945 	struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
946 	int tp_len, size_max;
947 	unsigned char *addr;
948 	int len_sum = 0;
949 	int status = 0;
950 
951 	sock = po->sk.sk_socket;
952 
953 	mutex_lock(&po->pg_vec_lock);
954 
955 	err = -EBUSY;
956 	if (saddr == NULL) {
957 		ifindex	= po->ifindex;
958 		proto	= po->num;
959 		addr	= NULL;
960 	} else {
961 		err = -EINVAL;
962 		if (msg->msg_namelen < sizeof(struct sockaddr_ll))
963 			goto out;
964 		if (msg->msg_namelen < (saddr->sll_halen
965 					+ offsetof(struct sockaddr_ll,
966 						sll_addr)))
967 			goto out;
968 		ifindex	= saddr->sll_ifindex;
969 		proto	= saddr->sll_protocol;
970 		addr	= saddr->sll_addr;
971 	}
972 
973 	dev = dev_get_by_index(sock_net(&po->sk), ifindex);
974 	err = -ENXIO;
975 	if (unlikely(dev == NULL))
976 		goto out;
977 
978 	reserve = dev->hard_header_len;
979 
980 	err = -ENETDOWN;
981 	if (unlikely(!(dev->flags & IFF_UP)))
982 		goto out_put;
983 
984 	size_max = po->tx_ring.frame_size
985 		- (po->tp_hdrlen - sizeof(struct sockaddr_ll));
986 
987 	if (size_max > dev->mtu + reserve)
988 		size_max = dev->mtu + reserve;
989 
990 	do {
991 		ph = packet_current_frame(po, &po->tx_ring,
992 				TP_STATUS_SEND_REQUEST);
993 
994 		if (unlikely(ph == NULL)) {
995 			schedule();
996 			continue;
997 		}
998 
999 		status = TP_STATUS_SEND_REQUEST;
1000 		skb = sock_alloc_send_skb(&po->sk,
1001 				LL_ALLOCATED_SPACE(dev)
1002 				+ sizeof(struct sockaddr_ll),
1003 				0, &err);
1004 
1005 		if (unlikely(skb == NULL))
1006 			goto out_status;
1007 
1008 		tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
1009 				addr);
1010 
1011 		if (unlikely(tp_len < 0)) {
1012 			if (po->tp_loss) {
1013 				__packet_set_status(po, ph,
1014 						TP_STATUS_AVAILABLE);
1015 				packet_increment_head(&po->tx_ring);
1016 				kfree_skb(skb);
1017 				continue;
1018 			} else {
1019 				status = TP_STATUS_WRONG_FORMAT;
1020 				err = tp_len;
1021 				goto out_status;
1022 			}
1023 		}
1024 
1025 		skb->destructor = tpacket_destruct_skb;
1026 		__packet_set_status(po, ph, TP_STATUS_SENDING);
1027 		atomic_inc(&po->tx_ring.pending);
1028 
1029 		status = TP_STATUS_SEND_REQUEST;
1030 		err = dev_queue_xmit(skb);
1031 		if (unlikely(err > 0 && (err = net_xmit_errno(err)) != 0))
1032 			goto out_xmit;
1033 		packet_increment_head(&po->tx_ring);
1034 		len_sum += tp_len;
1035 	} while (likely((ph != NULL) || ((!(msg->msg_flags & MSG_DONTWAIT))
1036 					&& (atomic_read(&po->tx_ring.pending))))
1037 	      );
1038 
1039 	err = len_sum;
1040 	goto out_put;
1041 
1042 out_xmit:
1043 	skb->destructor = sock_wfree;
1044 	atomic_dec(&po->tx_ring.pending);
1045 out_status:
1046 	__packet_set_status(po, ph, status);
1047 	kfree_skb(skb);
1048 out_put:
1049 	dev_put(dev);
1050 out:
1051 	mutex_unlock(&po->pg_vec_lock);
1052 	return err;
1053 }
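/*
 * Userspace counterpart of tpacket_snd() above: frames are filled in the
 * mapped TX ring, marked TP_STATUS_SEND_REQUEST, and a plain send() kicks
 * the loop.  A minimal sketch, assuming TPACKET_V2, a SOCK_RAW socket
 * already bound to an interface, and a complete Ethernet frame in
 * eth_frame/eth_len (illustrative only):
 *
 *	struct tpacket2_hdr *hdr = (struct tpacket2_hdr *)frame;
 *	void *data = frame + TPACKET2_HDRLEN - sizeof(struct sockaddr_ll);
 *	memcpy(data, eth_frame, eth_len);
 *	hdr->tp_len = eth_len;
 *	hdr->tp_status = TP_STATUS_SEND_REQUEST;
 *	send(fd, NULL, 0, 0);                   (returns bytes queued)
 *
 * When transmission completes, tpacket_destruct_skb() flips the frame back
 * to TP_STATUS_AVAILABLE so it can be reused.
 */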
1054 #endif
1055 
1056 static int packet_snd(struct socket *sock,
1057 			  struct msghdr *msg, size_t len)
1058 {
1059 	struct sock *sk = sock->sk;
1060 	struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
1061 	struct sk_buff *skb;
1062 	struct net_device *dev;
1063 	__be16 proto;
1064 	unsigned char *addr;
1065 	int ifindex, err, reserve = 0;
1066 
1067 	/*
1068 	 *	Get and verify the address.
1069 	 */
1070 
1071 	if (saddr == NULL) {
1072 		struct packet_sock *po = pkt_sk(sk);
1073 
1074 		ifindex	= po->ifindex;
1075 		proto	= po->num;
1076 		addr	= NULL;
1077 	} else {
1078 		err = -EINVAL;
1079 		if (msg->msg_namelen < sizeof(struct sockaddr_ll))
1080 			goto out;
1081 		if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
1082 			goto out;
1083 		ifindex	= saddr->sll_ifindex;
1084 		proto	= saddr->sll_protocol;
1085 		addr	= saddr->sll_addr;
1086 	}
1087 
1088 
1089 	dev = dev_get_by_index(sock_net(sk), ifindex);
1090 	err = -ENXIO;
1091 	if (dev == NULL)
1092 		goto out_unlock;
1093 	if (sock->type == SOCK_RAW)
1094 		reserve = dev->hard_header_len;
1095 
1096 	err = -ENETDOWN;
1097 	if (!(dev->flags & IFF_UP))
1098 		goto out_unlock;
1099 
1100 	err = -EMSGSIZE;
1101 	if (len > dev->mtu+reserve)
1102 		goto out_unlock;
1103 
1104 	skb = sock_alloc_send_skb(sk, len + LL_ALLOCATED_SPACE(dev),
1105 				msg->msg_flags & MSG_DONTWAIT, &err);
1106 	if (skb == NULL)
1107 		goto out_unlock;
1108 
1109 	skb_reserve(skb, LL_RESERVED_SPACE(dev));
1110 	skb_reset_network_header(skb);
1111 
1112 	err = -EINVAL;
1113 	if (sock->type == SOCK_DGRAM &&
1114 	    dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len) < 0)
1115 		goto out_free;
1116 
1117 	/* Returns -EFAULT on error */
1118 	err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
1119 	if (err)
1120 		goto out_free;
1121 
1122 	skb->protocol = proto;
1123 	skb->dev = dev;
1124 	skb->priority = sk->sk_priority;
1125 
1126 	/*
1127 	 *	Now send it
1128 	 */
1129 
1130 	err = dev_queue_xmit(skb);
1131 	if (err > 0 && (err = net_xmit_errno(err)) != 0)
1132 		goto out_unlock;
1133 
1134 	dev_put(dev);
1135 
1136 	return len;
1137 
1138 out_free:
1139 	kfree_skb(skb);
1140 out_unlock:
1141 	if (dev)
1142 		dev_put(dev);
1143 out:
1144 	return err;
1145 }
1146 
1147 static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
1148 		struct msghdr *msg, size_t len)
1149 {
1150 #ifdef CONFIG_PACKET_MMAP
1151 	struct sock *sk = sock->sk;
1152 	struct packet_sock *po = pkt_sk(sk);
1153 	if (po->tx_ring.pg_vec)
1154 		return tpacket_snd(po, msg);
1155 	else
1156 #endif
1157 		return packet_snd(sock, msg, len);
1158 }
1159 
1160 /*
1161  *	Close a PACKET socket. This is fairly simple. We immediately go
1162  *	to 'closed' state and remove our protocol entry in the device list.
1163  */
1164 
1165 static int packet_release(struct socket *sock)
1166 {
1167 	struct sock *sk = sock->sk;
1168 	struct packet_sock *po;
1169 	struct net *net;
1170 #ifdef CONFIG_PACKET_MMAP
1171 	struct tpacket_req req;
1172 #endif
1173 
1174 	if (!sk)
1175 		return 0;
1176 
1177 	net = sock_net(sk);
1178 	po = pkt_sk(sk);
1179 
1180 	write_lock_bh(&net->packet.sklist_lock);
1181 	sk_del_node_init(sk);
1182 	sock_prot_inuse_add(net, sk->sk_prot, -1);
1183 	write_unlock_bh(&net->packet.sklist_lock);
1184 
1185 	/*
1186 	 *	Unhook packet receive handler.
1187 	 */
1188 
1189 	if (po->running) {
1190 		/*
1191 		 *	Remove the protocol hook
1192 		 */
1193 		dev_remove_pack(&po->prot_hook);
1194 		po->running = 0;
1195 		po->num = 0;
1196 		__sock_put(sk);
1197 	}
1198 
1199 	packet_flush_mclist(sk);
1200 
1201 #ifdef CONFIG_PACKET_MMAP
1202 	memset(&req, 0, sizeof(req));
1203 
1204 	if (po->rx_ring.pg_vec)
1205 		packet_set_ring(sk, &req, 1, 0);
1206 
1207 	if (po->tx_ring.pg_vec)
1208 		packet_set_ring(sk, &req, 1, 1);
1209 #endif
1210 
1211 	/*
1212 	 *	Now the socket is dead. No more input will appear.
1213 	 */
1214 
1215 	sock_orphan(sk);
1216 	sock->sk = NULL;
1217 
1218 	/* Purge queues */
1219 
1220 	skb_queue_purge(&sk->sk_receive_queue);
1221 	sk_refcnt_debug_release(sk);
1222 
1223 	sock_put(sk);
1224 	return 0;
1225 }
1226 
1227 /*
1228  *	Attach a packet hook.
1229  */
1230 
1231 static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
1232 {
1233 	struct packet_sock *po = pkt_sk(sk);
1234 	/*
1235 	 *	Detach an existing hook if present.
1236 	 */
1237 
1238 	lock_sock(sk);
1239 
1240 	spin_lock(&po->bind_lock);
1241 	if (po->running) {
1242 		__sock_put(sk);
1243 		po->running = 0;
1244 		po->num = 0;
1245 		spin_unlock(&po->bind_lock);
1246 		dev_remove_pack(&po->prot_hook);
1247 		spin_lock(&po->bind_lock);
1248 	}
1249 
1250 	po->num = protocol;
1251 	po->prot_hook.type = protocol;
1252 	po->prot_hook.dev = dev;
1253 
1254 	po->ifindex = dev ? dev->ifindex : 0;
1255 
1256 	if (protocol == 0)
1257 		goto out_unlock;
1258 
1259 	if (!dev || (dev->flags & IFF_UP)) {
1260 		dev_add_pack(&po->prot_hook);
1261 		sock_hold(sk);
1262 		po->running = 1;
1263 	} else {
1264 		sk->sk_err = ENETDOWN;
1265 		if (!sock_flag(sk, SOCK_DEAD))
1266 			sk->sk_error_report(sk);
1267 	}
1268 
1269 out_unlock:
1270 	spin_unlock(&po->bind_lock);
1271 	release_sock(sk);
1272 	return 0;
1273 }
1274 
1275 /*
1276  *	Bind a packet socket to a device
1277  */
1278 
1279 static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
1280 			    int addr_len)
1281 {
1282 	struct sock *sk = sock->sk;
1283 	char name[15];
1284 	struct net_device *dev;
1285 	int err = -ENODEV;
1286 
1287 	/*
1288 	 *	Check legality
1289 	 */
1290 
1291 	if (addr_len != sizeof(struct sockaddr))
1292 		return -EINVAL;
1293 	strlcpy(name, uaddr->sa_data, sizeof(name));
1294 
1295 	dev = dev_get_by_name(sock_net(sk), name);
1296 	if (dev) {
1297 		err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
1298 		dev_put(dev);
1299 	}
1300 	return err;
1301 }
1302 
1303 static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1304 {
1305 	struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
1306 	struct sock *sk = sock->sk;
1307 	struct net_device *dev = NULL;
1308 	int err;
1309 
1310 
1311 	/*
1312 	 *	Check legality
1313 	 */
1314 
1315 	if (addr_len < sizeof(struct sockaddr_ll))
1316 		return -EINVAL;
1317 	if (sll->sll_family != AF_PACKET)
1318 		return -EINVAL;
1319 
1320 	if (sll->sll_ifindex) {
1321 		err = -ENODEV;
1322 		dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
1323 		if (dev == NULL)
1324 			goto out;
1325 	}
1326 	err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
1327 	if (dev)
1328 		dev_put(dev);
1329 
1330 out:
1331 	return err;
1332 }
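/*
 * A minimal sketch of the bind call handled above (illustrative only;
 * assumes <net/if.h> and a device named "eth0"):
 *
 *	struct sockaddr_ll sll = {
 *		.sll_family   = AF_PACKET,
 *		.sll_protocol = htons(ETH_P_ALL),
 *		.sll_ifindex  = if_nametoindex("eth0"),
 *	};
 *	bind(fd, (struct sockaddr *)&sll, sizeof(sll));
 *
 * A zero sll_ifindex binds to all interfaces, and a zero sll_protocol
 * keeps the protocol the socket was created with (see packet_do_bind()).
 */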
1333 
1334 static struct proto packet_proto = {
1335 	.name	  = "PACKET",
1336 	.owner	  = THIS_MODULE,
1337 	.obj_size = sizeof(struct packet_sock),
1338 };
1339 
1340 /*
1341  *	Create a packet of type SOCK_PACKET.
1342  */
1343 
1344 static int packet_create(struct net *net, struct socket *sock, int protocol)
1345 {
1346 	struct sock *sk;
1347 	struct packet_sock *po;
1348 	__be16 proto = (__force __be16)protocol; /* weird, but documented */
1349 	int err;
1350 
1351 	if (!capable(CAP_NET_RAW))
1352 		return -EPERM;
1353 	if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
1354 	    sock->type != SOCK_PACKET)
1355 		return -ESOCKTNOSUPPORT;
1356 
1357 	sock->state = SS_UNCONNECTED;
1358 
1359 	err = -ENOBUFS;
1360 	sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
1361 	if (sk == NULL)
1362 		goto out;
1363 
1364 	sock->ops = &packet_ops;
1365 	if (sock->type == SOCK_PACKET)
1366 		sock->ops = &packet_ops_spkt;
1367 
1368 	sock_init_data(sock, sk);
1369 
1370 	po = pkt_sk(sk);
1371 	sk->sk_family = PF_PACKET;
1372 	po->num = proto;
1373 
1374 	sk->sk_destruct = packet_sock_destruct;
1375 	sk_refcnt_debug_inc(sk);
1376 
1377 	/*
1378 	 *	Attach a protocol block
1379 	 */
1380 
1381 	spin_lock_init(&po->bind_lock);
1382 	mutex_init(&po->pg_vec_lock);
1383 	po->prot_hook.func = packet_rcv;
1384 
1385 	if (sock->type == SOCK_PACKET)
1386 		po->prot_hook.func = packet_rcv_spkt;
1387 
1388 	po->prot_hook.af_packet_priv = sk;
1389 
1390 	if (proto) {
1391 		po->prot_hook.type = proto;
1392 		dev_add_pack(&po->prot_hook);
1393 		sock_hold(sk);
1394 		po->running = 1;
1395 	}
1396 
1397 	write_lock_bh(&net->packet.sklist_lock);
1398 	sk_add_node(sk, &net->packet.sklist);
1399 	sock_prot_inuse_add(net, &packet_proto, 1);
1400 	write_unlock_bh(&net->packet.sklist_lock);
1401 	return 0;
1402 out:
1403 	return err;
1404 }
1405 
1406 /*
1407  *	Pull a packet from our receive queue and hand it to the user.
1408  *	If necessary we block.
1409  */
1410 
1411 static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
1412 			  struct msghdr *msg, size_t len, int flags)
1413 {
1414 	struct sock *sk = sock->sk;
1415 	struct sk_buff *skb;
1416 	int copied, err;
1417 	struct sockaddr_ll *sll;
1418 
1419 	err = -EINVAL;
1420 	if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT))
1421 		goto out;
1422 
1423 #if 0
1424 	/* What error should we return now? EUNATTACH? */
1425 	if (pkt_sk(sk)->ifindex < 0)
1426 		return -ENODEV;
1427 #endif
1428 
1429 	/*
1430 	 *	Call the generic datagram receiver. This handles all sorts
1431 	 *	of horrible races and re-entrancy so we can forget about it
1432 	 *	in the protocol layers.
1433 	 *
1434 	 *	Now it will return ENETDOWN if the device has just gone down,
1435 	 *	but then it will block.
1436 	 */
1437 
1438 	skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
1439 
1440 	/*
1441 	 *	An error occurred, so return it. Because skb_recv_datagram()
1442 	 *	handles the blocking, we don't have to see or worry about
1443 	 *	blocking retries.
1444 	 */
1445 
1446 	if (skb == NULL)
1447 		goto out;
1448 
1449 	/*
1450 	 *	If the address length field is there to be filled in, we fill
1451 	 *	it in now.
1452 	 */
1453 
1454 	sll = &PACKET_SKB_CB(skb)->sa.ll;
1455 	if (sock->type == SOCK_PACKET)
1456 		msg->msg_namelen = sizeof(struct sockaddr_pkt);
1457 	else
1458 		msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);
1459 
1460 	/*
1461 	 *	You lose any data beyond the buffer you gave. If it worries a
1462 	 *	user program, it can ask the device for its MTU anyway.
1463 	 */
1464 
1465 	copied = skb->len;
1466 	if (copied > len) {
1467 		copied = len;
1468 		msg->msg_flags |= MSG_TRUNC;
1469 	}
1470 
1471 	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
1472 	if (err)
1473 		goto out_free;
1474 
1475 	sock_recv_timestamp(msg, sk, skb);
1476 
1477 	if (msg->msg_name)
1478 		memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
1479 		       msg->msg_namelen);
1480 
1481 	if (pkt_sk(sk)->auxdata) {
1482 		struct tpacket_auxdata aux;
1483 
1484 		aux.tp_status = TP_STATUS_USER;
1485 		if (skb->ip_summed == CHECKSUM_PARTIAL)
1486 			aux.tp_status |= TP_STATUS_CSUMNOTREADY;
1487 		aux.tp_len = PACKET_SKB_CB(skb)->origlen;
1488 		aux.tp_snaplen = skb->len;
1489 		aux.tp_mac = 0;
1490 		aux.tp_net = skb_network_offset(skb);
1491 		aux.tp_vlan_tci = skb->vlan_tci;
1492 
1493 		put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
1494 	}
1495 
1496 	/*
1497 	 *	Free or return the buffer as appropriate. Again this
1498 	 *	hides all the races and re-entrancy issues from us.
1499 	 */
1500 	err = (flags&MSG_TRUNC) ? skb->len : copied;
1501 
1502 out_free:
1503 	skb_free_datagram(sk, skb);
1504 out:
1505 	return err;
1506 }
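/*
 * Sketch of how the PACKET_AUXDATA control message built above is consumed
 * (illustrative only; assumes the option was enabled with setsockopt):
 *
 *	char buf[65536];
 *	char cbuf[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
 *	struct iovec iov = { buf, sizeof(buf) };
 *	struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1,
 *			      .msg_control = cbuf,
 *			      .msg_controllen = sizeof(cbuf) };
 *	recvmsg(fd, &msg, 0);
 *	for (struct cmsghdr *c = CMSG_FIRSTHDR(&msg); c;
 *	     c = CMSG_NXTHDR(&msg, c))
 *		if (c->cmsg_level == SOL_PACKET &&
 *		    c->cmsg_type == PACKET_AUXDATA) {
 *			struct tpacket_auxdata *aux = (void *)CMSG_DATA(c);
 *			(aux->tp_len is the original length,
 *			 aux->tp_snaplen what was actually copied)
 *		}
 */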
1507 
1508 static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
1509 			       int *uaddr_len, int peer)
1510 {
1511 	struct net_device *dev;
1512 	struct sock *sk	= sock->sk;
1513 
1514 	if (peer)
1515 		return -EOPNOTSUPP;
1516 
1517 	uaddr->sa_family = AF_PACKET;
1518 	dev = dev_get_by_index(sock_net(sk), pkt_sk(sk)->ifindex);
1519 	if (dev) {
1520 		strlcpy(uaddr->sa_data, dev->name, 15);
1521 		dev_put(dev);
1522 	} else
1523 		memset(uaddr->sa_data, 0, 14);
1524 	*uaddr_len = sizeof(*uaddr);
1525 
1526 	return 0;
1527 }
1528 
1529 static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
1530 			  int *uaddr_len, int peer)
1531 {
1532 	struct net_device *dev;
1533 	struct sock *sk = sock->sk;
1534 	struct packet_sock *po = pkt_sk(sk);
1535 	struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
1536 
1537 	if (peer)
1538 		return -EOPNOTSUPP;
1539 
1540 	sll->sll_family = AF_PACKET;
1541 	sll->sll_ifindex = po->ifindex;
1542 	sll->sll_protocol = po->num;
1543 	dev = dev_get_by_index(sock_net(sk), po->ifindex);
1544 	if (dev) {
1545 		sll->sll_hatype = dev->type;
1546 		sll->sll_halen = dev->addr_len;
1547 		memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1548 		dev_put(dev);
1549 	} else {
1550 		sll->sll_hatype = 0;	/* Bad: we have no ARPHRD_UNSPEC */
1551 		sll->sll_halen = 0;
1552 	}
1553 	*uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1554 
1555 	return 0;
1556 }
1557 
1558 static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
1559 			 int what)
1560 {
1561 	switch (i->type) {
1562 	case PACKET_MR_MULTICAST:
1563 		if (what > 0)
1564 			return dev_mc_add(dev, i->addr, i->alen, 0);
1565 		else
1566 			return dev_mc_delete(dev, i->addr, i->alen, 0);
1567 		break;
1568 	case PACKET_MR_PROMISC:
1569 		return dev_set_promiscuity(dev, what);
1570 		break;
1571 	case PACKET_MR_ALLMULTI:
1572 		return dev_set_allmulti(dev, what);
1573 		break;
1574 	case PACKET_MR_UNICAST:
1575 		if (what > 0)
1576 			return dev_unicast_add(dev, i->addr);
1577 		else
1578 			return dev_unicast_delete(dev, i->addr);
1579 		break;
1580 	default:
1581 		break;
1582 	}
1583 	return 0;
1584 }
1585 
1586 static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
1587 {
1588 	for ( ; i; i = i->next) {
1589 		if (i->ifindex == dev->ifindex)
1590 			packet_dev_mc(dev, i, what);
1591 	}
1592 }
1593 
1594 static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1595 {
1596 	struct packet_sock *po = pkt_sk(sk);
1597 	struct packet_mclist *ml, *i;
1598 	struct net_device *dev;
1599 	int err;
1600 
1601 	rtnl_lock();
1602 
1603 	err = -ENODEV;
1604 	dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1605 	if (!dev)
1606 		goto done;
1607 
1608 	err = -EINVAL;
1609 	if (mreq->mr_alen > dev->addr_len)
1610 		goto done;
1611 
1612 	err = -ENOBUFS;
1613 	i = kmalloc(sizeof(*i), GFP_KERNEL);
1614 	if (i == NULL)
1615 		goto done;
1616 
1617 	err = 0;
1618 	for (ml = po->mclist; ml; ml = ml->next) {
1619 		if (ml->ifindex == mreq->mr_ifindex &&
1620 		    ml->type == mreq->mr_type &&
1621 		    ml->alen == mreq->mr_alen &&
1622 		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1623 			ml->count++;
1624 			/* Free the new element ... */
1625 			kfree(i);
1626 			goto done;
1627 		}
1628 	}
1629 
1630 	i->type = mreq->mr_type;
1631 	i->ifindex = mreq->mr_ifindex;
1632 	i->alen = mreq->mr_alen;
1633 	memcpy(i->addr, mreq->mr_address, i->alen);
1634 	i->count = 1;
1635 	i->next = po->mclist;
1636 	po->mclist = i;
1637 	err = packet_dev_mc(dev, i, 1);
1638 	if (err) {
1639 		po->mclist = i->next;
1640 		kfree(i);
1641 	}
1642 
1643 done:
1644 	rtnl_unlock();
1645 	return err;
1646 }
1647 
1648 static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1649 {
1650 	struct packet_mclist *ml, **mlp;
1651 
1652 	rtnl_lock();
1653 
1654 	for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
1655 		if (ml->ifindex == mreq->mr_ifindex &&
1656 		    ml->type == mreq->mr_type &&
1657 		    ml->alen == mreq->mr_alen &&
1658 		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1659 			if (--ml->count == 0) {
1660 				struct net_device *dev;
1661 				*mlp = ml->next;
1662 				dev = dev_get_by_index(sock_net(sk), ml->ifindex);
1663 				if (dev) {
1664 					packet_dev_mc(dev, ml, -1);
1665 					dev_put(dev);
1666 				}
1667 				kfree(ml);
1668 			}
1669 			rtnl_unlock();
1670 			return 0;
1671 		}
1672 	}
1673 	rtnl_unlock();
1674 	return -EADDRNOTAVAIL;
1675 }
1676 
1677 static void packet_flush_mclist(struct sock *sk)
1678 {
1679 	struct packet_sock *po = pkt_sk(sk);
1680 	struct packet_mclist *ml;
1681 
1682 	if (!po->mclist)
1683 		return;
1684 
1685 	rtnl_lock();
1686 	while ((ml = po->mclist) != NULL) {
1687 		struct net_device *dev;
1688 
1689 		po->mclist = ml->next;
1690 		dev = dev_get_by_index(sock_net(sk), ml->ifindex);
1691 		if (dev != NULL) {
1692 			packet_dev_mc(dev, ml, -1);
1693 			dev_put(dev);
1694 		}
1695 		kfree(ml);
1696 	}
1697 	rtnl_unlock();
1698 }
1699 
1700 static int
1701 packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
1702 {
1703 	struct sock *sk = sock->sk;
1704 	struct packet_sock *po = pkt_sk(sk);
1705 	int ret;
1706 
1707 	if (level != SOL_PACKET)
1708 		return -ENOPROTOOPT;
1709 
1710 	switch (optname) {
1711 	case PACKET_ADD_MEMBERSHIP:
1712 	case PACKET_DROP_MEMBERSHIP:
1713 	{
1714 		struct packet_mreq_max mreq;
1715 		int len = optlen;
1716 		memset(&mreq, 0, sizeof(mreq));
1717 		if (len < sizeof(struct packet_mreq))
1718 			return -EINVAL;
1719 		if (len > sizeof(mreq))
1720 			len = sizeof(mreq);
1721 		if (copy_from_user(&mreq, optval, len))
1722 			return -EFAULT;
1723 		if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
1724 			return -EINVAL;
1725 		if (optname == PACKET_ADD_MEMBERSHIP)
1726 			ret = packet_mc_add(sk, &mreq);
1727 		else
1728 			ret = packet_mc_drop(sk, &mreq);
1729 		return ret;
1730 	}
1731 
1732 #ifdef CONFIG_PACKET_MMAP
1733 	case PACKET_RX_RING:
1734 	case PACKET_TX_RING:
1735 	{
1736 		struct tpacket_req req;
1737 
1738 		if (optlen < sizeof(req))
1739 			return -EINVAL;
1740 		if (copy_from_user(&req, optval, sizeof(req)))
1741 			return -EFAULT;
1742 		return packet_set_ring(sk, &req, 0, optname == PACKET_TX_RING);
1743 	}
1744 	case PACKET_COPY_THRESH:
1745 	{
1746 		int val;
1747 
1748 		if (optlen != sizeof(val))
1749 			return -EINVAL;
1750 		if (copy_from_user(&val, optval, sizeof(val)))
1751 			return -EFAULT;
1752 
1753 		pkt_sk(sk)->copy_thresh = val;
1754 		return 0;
1755 	}
1756 	case PACKET_VERSION:
1757 	{
1758 		int val;
1759 
1760 		if (optlen != sizeof(val))
1761 			return -EINVAL;
1762 		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1763 			return -EBUSY;
1764 		if (copy_from_user(&val, optval, sizeof(val)))
1765 			return -EFAULT;
1766 		switch (val) {
1767 		case TPACKET_V1:
1768 		case TPACKET_V2:
1769 			po->tp_version = val;
1770 			return 0;
1771 		default:
1772 			return -EINVAL;
1773 		}
1774 	}
1775 	case PACKET_RESERVE:
1776 	{
1777 		unsigned int val;
1778 
1779 		if (optlen != sizeof(val))
1780 			return -EINVAL;
1781 		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1782 			return -EBUSY;
1783 		if (copy_from_user(&val, optval, sizeof(val)))
1784 			return -EFAULT;
1785 		po->tp_reserve = val;
1786 		return 0;
1787 	}
1788 	case PACKET_LOSS:
1789 	{
1790 		unsigned int val;
1791 
1792 		if (optlen != sizeof(val))
1793 			return -EINVAL;
1794 		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1795 			return -EBUSY;
1796 		if (copy_from_user(&val, optval, sizeof(val)))
1797 			return -EFAULT;
1798 		po->tp_loss = !!val;
1799 		return 0;
1800 	}
1801 #endif
1802 	case PACKET_AUXDATA:
1803 	{
1804 		int val;
1805 
1806 		if (optlen < sizeof(val))
1807 			return -EINVAL;
1808 		if (copy_from_user(&val, optval, sizeof(val)))
1809 			return -EFAULT;
1810 
1811 		po->auxdata = !!val;
1812 		return 0;
1813 	}
1814 	case PACKET_ORIGDEV:
1815 	{
1816 		int val;
1817 
1818 		if (optlen < sizeof(val))
1819 			return -EINVAL;
1820 		if (copy_from_user(&val, optval, sizeof(val)))
1821 			return -EFAULT;
1822 
1823 		po->origdev = !!val;
1824 		return 0;
1825 	}
1826 	default:
1827 		return -ENOPROTOOPT;
1828 	}
1829 }
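/*
 * Sketch of the membership options handled above (illustrative only):
 * putting an interface into promiscuous mode through the packet socket,
 * which is reference counted and undone when the socket is closed:
 *
 *	struct packet_mreq mreq = {
 *		.mr_ifindex = if_nametoindex("eth0"),
 *		.mr_type    = PACKET_MR_PROMISC,
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP,
 *		   &mreq, sizeof(mreq));
 *
 * PACKET_MR_MULTICAST additionally needs mr_alen/mr_address filled in with
 * the link-layer multicast address to join.
 */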
1830 
1831 static int packet_getsockopt(struct socket *sock, int level, int optname,
1832 			     char __user *optval, int __user *optlen)
1833 {
1834 	int len;
1835 	int val;
1836 	struct sock *sk = sock->sk;
1837 	struct packet_sock *po = pkt_sk(sk);
1838 	void *data;
1839 	struct tpacket_stats st;
1840 
1841 	if (level != SOL_PACKET)
1842 		return -ENOPROTOOPT;
1843 
1844 	if (get_user(len, optlen))
1845 		return -EFAULT;
1846 
1847 	if (len < 0)
1848 		return -EINVAL;
1849 
1850 	switch (optname) {
1851 	case PACKET_STATISTICS:
1852 		if (len > sizeof(struct tpacket_stats))
1853 			len = sizeof(struct tpacket_stats);
1854 		spin_lock_bh(&sk->sk_receive_queue.lock);
1855 		st = po->stats;
1856 		memset(&po->stats, 0, sizeof(st));
1857 		spin_unlock_bh(&sk->sk_receive_queue.lock);
1858 		st.tp_packets += st.tp_drops;
1859 
1860 		data = &st;
1861 		break;
1862 	case PACKET_AUXDATA:
1863 		if (len > sizeof(int))
1864 			len = sizeof(int);
1865 		val = po->auxdata;
1866 
1867 		data = &val;
1868 		break;
1869 	case PACKET_ORIGDEV:
1870 		if (len > sizeof(int))
1871 			len = sizeof(int);
1872 		val = po->origdev;
1873 
1874 		data = &val;
1875 		break;
1876 #ifdef CONFIG_PACKET_MMAP
1877 	case PACKET_VERSION:
1878 		if (len > sizeof(int))
1879 			len = sizeof(int);
1880 		val = po->tp_version;
1881 		data = &val;
1882 		break;
1883 	case PACKET_HDRLEN:
1884 		if (len > sizeof(int))
1885 			len = sizeof(int);
1886 		if (copy_from_user(&val, optval, len))
1887 			return -EFAULT;
1888 		switch (val) {
1889 		case TPACKET_V1:
1890 			val = sizeof(struct tpacket_hdr);
1891 			break;
1892 		case TPACKET_V2:
1893 			val = sizeof(struct tpacket2_hdr);
1894 			break;
1895 		default:
1896 			return -EINVAL;
1897 		}
1898 		data = &val;
1899 		break;
1900 	case PACKET_RESERVE:
1901 		if (len > sizeof(unsigned int))
1902 			len = sizeof(unsigned int);
1903 		val = po->tp_reserve;
1904 		data = &val;
1905 		break;
1906 	case PACKET_LOSS:
1907 		if (len > sizeof(unsigned int))
1908 			len = sizeof(unsigned int);
1909 		val = po->tp_loss;
1910 		data = &val;
1911 		break;
1912 #endif
1913 	default:
1914 		return -ENOPROTOOPT;
1915 	}
1916 
1917 	if (put_user(len, optlen))
1918 		return -EFAULT;
1919 	if (copy_to_user(optval, data, len))
1920 		return -EFAULT;
1921 	return 0;
1922 }
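/*
 * Reading PACKET_STATISTICS is destructive: the counters are zeroed under
 * the receive queue lock and tp_packets includes the drops, as coded above.
 * A minimal sketch:
 *
 *	struct tpacket_stats st;
 *	socklen_t len = sizeof(st);
 *	getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &st, &len);
 *	printf("%u packets, %u dropped\n", st.tp_packets, st.tp_drops);
 */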
1923 
1924 
1925 static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
1926 {
1927 	struct sock *sk;
1928 	struct hlist_node *node;
1929 	struct net_device *dev = data;
1930 	struct net *net = dev_net(dev);
1931 
1932 	read_lock(&net->packet.sklist_lock);
1933 	sk_for_each(sk, node, &net->packet.sklist) {
1934 		struct packet_sock *po = pkt_sk(sk);
1935 
1936 		switch (msg) {
1937 		case NETDEV_UNREGISTER:
1938 			if (po->mclist)
1939 				packet_dev_mclist(dev, po->mclist, -1);
1940 			/* fallthrough */
1941 
1942 		case NETDEV_DOWN:
1943 			if (dev->ifindex == po->ifindex) {
1944 				spin_lock(&po->bind_lock);
1945 				if (po->running) {
1946 					__dev_remove_pack(&po->prot_hook);
1947 					__sock_put(sk);
1948 					po->running = 0;
1949 					sk->sk_err = ENETDOWN;
1950 					if (!sock_flag(sk, SOCK_DEAD))
1951 						sk->sk_error_report(sk);
1952 				}
1953 				if (msg == NETDEV_UNREGISTER) {
1954 					po->ifindex = -1;
1955 					po->prot_hook.dev = NULL;
1956 				}
1957 				spin_unlock(&po->bind_lock);
1958 			}
1959 			break;
1960 		case NETDEV_UP:
1961 			spin_lock(&po->bind_lock);
1962 			if (dev->ifindex == po->ifindex && po->num &&
1963 			    !po->running) {
1964 				dev_add_pack(&po->prot_hook);
1965 				sock_hold(sk);
1966 				po->running = 1;
1967 			}
1968 			spin_unlock(&po->bind_lock);
1969 			break;
1970 		}
1971 	}
1972 	read_unlock(&net->packet.sklist_lock);
1973 	return NOTIFY_DONE;
1974 }
1975 
1976 
1977 static int packet_ioctl(struct socket *sock, unsigned int cmd,
1978 			unsigned long arg)
1979 {
1980 	struct sock *sk = sock->sk;
1981 
1982 	switch (cmd) {
1983 	case SIOCOUTQ:
1984 	{
1985 		int amount = sk_wmem_alloc_get(sk);
1986 
1987 		return put_user(amount, (int __user *)arg);
1988 	}
1989 	case SIOCINQ:
1990 	{
1991 		struct sk_buff *skb;
1992 		int amount = 0;
1993 
1994 		spin_lock_bh(&sk->sk_receive_queue.lock);
1995 		skb = skb_peek(&sk->sk_receive_queue);
1996 		if (skb)
1997 			amount = skb->len;
1998 		spin_unlock_bh(&sk->sk_receive_queue.lock);
1999 		return put_user(amount, (int __user *)arg);
2000 	}
2001 	case SIOCGSTAMP:
2002 		return sock_get_timestamp(sk, (struct timeval __user *)arg);
2003 	case SIOCGSTAMPNS:
2004 		return sock_get_timestampns(sk, (struct timespec __user *)arg);
2005 
2006 #ifdef CONFIG_INET
2007 	case SIOCADDRT:
2008 	case SIOCDELRT:
2009 	case SIOCDARP:
2010 	case SIOCGARP:
2011 	case SIOCSARP:
2012 	case SIOCGIFADDR:
2013 	case SIOCSIFADDR:
2014 	case SIOCGIFBRDADDR:
2015 	case SIOCSIFBRDADDR:
2016 	case SIOCGIFNETMASK:
2017 	case SIOCSIFNETMASK:
2018 	case SIOCGIFDSTADDR:
2019 	case SIOCSIFDSTADDR:
2020 	case SIOCSIFFLAGS:
2021 		if (!net_eq(sock_net(sk), &init_net))
2022 			return -ENOIOCTLCMD;
2023 		return inet_dgram_ops.ioctl(sock, cmd, arg);
2024 #endif
2025 
2026 	default:
2027 		return -ENOIOCTLCMD;
2028 	}
2029 	return 0;
2030 }
2031 
2032 #ifndef CONFIG_PACKET_MMAP
2033 #define packet_mmap sock_no_mmap
2034 #define packet_poll datagram_poll
2035 #else
2036 
2037 static unsigned int packet_poll(struct file *file, struct socket *sock,
2038 				poll_table *wait)
2039 {
2040 	struct sock *sk = sock->sk;
2041 	struct packet_sock *po = pkt_sk(sk);
2042 	unsigned int mask = datagram_poll(file, sock, wait);
2043 
2044 	spin_lock_bh(&sk->sk_receive_queue.lock);
2045 	if (po->rx_ring.pg_vec) {
2046 		if (!packet_previous_frame(po, &po->rx_ring, TP_STATUS_KERNEL))
2047 			mask |= POLLIN | POLLRDNORM;
2048 	}
2049 	spin_unlock_bh(&sk->sk_receive_queue.lock);
2050 	spin_lock_bh(&sk->sk_write_queue.lock);
2051 	if (po->tx_ring.pg_vec) {
2052 		if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
2053 			mask |= POLLOUT | POLLWRNORM;
2054 	}
2055 	spin_unlock_bh(&sk->sk_write_queue.lock);
2056 	return mask;
2057 }
2058 
2059 
2060 /* Dirty? Well, I still have not found a better way to account
2061  * for user mmaps.
2062  */
2063 
2064 static void packet_mm_open(struct vm_area_struct *vma)
2065 {
2066 	struct file *file = vma->vm_file;
2067 	struct socket *sock = file->private_data;
2068 	struct sock *sk = sock->sk;
2069 
2070 	if (sk)
2071 		atomic_inc(&pkt_sk(sk)->mapped);
2072 }
2073 
2074 static void packet_mm_close(struct vm_area_struct *vma)
2075 {
2076 	struct file *file = vma->vm_file;
2077 	struct socket *sock = file->private_data;
2078 	struct sock *sk = sock->sk;
2079 
2080 	if (sk)
2081 		atomic_dec(&pkt_sk(sk)->mapped);
2082 }
2083 
2084 static const struct vm_operations_struct packet_mmap_ops = {
2085 	.open	=	packet_mm_open,
2086 	.close	=	packet_mm_close,
2087 };
2088 
2089 static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
2090 {
2091 	int i;
2092 
2093 	for (i = 0; i < len; i++) {
2094 		if (likely(pg_vec[i]))
2095 			free_pages((unsigned long) pg_vec[i], order);
2096 	}
2097 	kfree(pg_vec);
2098 }
2099 
2100 static inline char *alloc_one_pg_vec_page(unsigned long order)
2101 {
2102 	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | __GFP_NOWARN;
2103 
2104 	return (char *) __get_free_pages(gfp_flags, order);
2105 }
2106 
2107 static char **alloc_pg_vec(struct tpacket_req *req, int order)
2108 {
2109 	unsigned int block_nr = req->tp_block_nr;
2110 	char **pg_vec;
2111 	int i;
2112 
2113 	pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL);
2114 	if (unlikely(!pg_vec))
2115 		goto out;
2116 
2117 	for (i = 0; i < block_nr; i++) {
2118 		pg_vec[i] = alloc_one_pg_vec_page(order);
2119 		if (unlikely(!pg_vec[i]))
2120 			goto out_free_pgvec;
2121 	}
2122 
2123 out:
2124 	return pg_vec;
2125 
2126 out_free_pgvec:
2127 	free_pg_vec(pg_vec, order, block_nr);
2128 	pg_vec = NULL;
2129 	goto out;
2130 }
2131 
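/* Set up (or, when closing / tp_block_nr == 0, tear down) the rx or tx
 * ring of a socket.  The geometry is validated (page-multiple blocks,
 * aligned frames large enough for the header, frame count matching the
 * block layout), the blocks are allocated, the protocol hook is detached
 * while the old and new rings are swapped under pg_vec_lock, and the old
 * blocks are freed once the socket has been re-attached.
 */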
2132 static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
2133 		int closing, int tx_ring)
2134 {
2135 	char **pg_vec = NULL;
2136 	struct packet_sock *po = pkt_sk(sk);
2137 	int was_running, order = 0;
2138 	struct packet_ring_buffer *rb;
2139 	struct sk_buff_head *rb_queue;
2140 	__be16 num;
2141 	int err;
2142 
2143 	rb = tx_ring ? &po->tx_ring : &po->rx_ring;
2144 	rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
2145 
2146 	err = -EBUSY;
2147 	if (!closing) {
2148 		if (atomic_read(&po->mapped))
2149 			goto out;
2150 		if (atomic_read(&rb->pending))
2151 			goto out;
2152 	}
2153 
2154 	if (req->tp_block_nr) {
2155 		/* Sanity tests and some calculations */
2156 		err = -EBUSY;
2157 		if (unlikely(rb->pg_vec))
2158 			goto out;
2159 
2160 		switch (po->tp_version) {
2161 		case TPACKET_V1:
2162 			po->tp_hdrlen = TPACKET_HDRLEN;
2163 			break;
2164 		case TPACKET_V2:
2165 			po->tp_hdrlen = TPACKET2_HDRLEN;
2166 			break;
2167 		}
2168 
2169 		err = -EINVAL;
2170 		if (unlikely((int)req->tp_block_size <= 0))
2171 			goto out;
2172 		if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
2173 			goto out;
2174 		if (unlikely(req->tp_frame_size < po->tp_hdrlen +
2175 					po->tp_reserve))
2176 			goto out;
2177 		if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
2178 			goto out;
2179 
2180 		rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
2181 		if (unlikely(rb->frames_per_block <= 0))
2182 			goto out;
2183 		if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
2184 					req->tp_frame_nr))
2185 			goto out;
2186 
2187 		err = -ENOMEM;
2188 		order = get_order(req->tp_block_size);
2189 		pg_vec = alloc_pg_vec(req, order);
2190 		if (unlikely(!pg_vec))
2191 			goto out;
2192 	} else {
2193 		/* A request with zero blocks tears the ring down and must
2194 		 * not ask for any frames. */
2195 		err = -EINVAL;
2196 		if (unlikely(req->tp_frame_nr))
2197 			goto out;
2198 	}
2199 
2200 	lock_sock(sk);
2201 
2202 	/* Detach socket from network */
2203 	spin_lock(&po->bind_lock);
2204 	was_running = po->running;
2205 	num = po->num;
2206 	if (was_running) {
2207 		__dev_remove_pack(&po->prot_hook);
2208 		po->num = 0;
2209 		po->running = 0;
2210 		__sock_put(sk);
2211 	}
2212 	spin_unlock(&po->bind_lock);
2213 
2214 	synchronize_net();
2215 
2216 	err = -EBUSY;
2217 	mutex_lock(&po->pg_vec_lock);
2218 	if (closing || atomic_read(&po->mapped) == 0) {
2219 		err = 0;
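		/* XC() assigns b to a and evaluates to a's previous value,
		 * so the old ring, its page order and its length can be
		 * captured here and released after the locks are dropped.
		 */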
2220 #define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })
2221 		spin_lock_bh(&rb_queue->lock);
2222 		pg_vec = XC(rb->pg_vec, pg_vec);
2223 		rb->frame_max = (req->tp_frame_nr - 1);
2224 		rb->head = 0;
2225 		rb->frame_size = req->tp_frame_size;
2226 		spin_unlock_bh(&rb_queue->lock);
2227 
2228 		order = XC(rb->pg_vec_order, order);
2229 		req->tp_block_nr = XC(rb->pg_vec_len, req->tp_block_nr);
2230 
2231 		rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
2232 		po->prot_hook.func = (po->rx_ring.pg_vec) ?
2233 						tpacket_rcv : packet_rcv;
2234 		skb_queue_purge(rb_queue);
2235 #undef XC
2236 		if (atomic_read(&po->mapped))
2237 			pr_err("packet_mmap: vma is busy: %d\n",
2238 			       atomic_read(&po->mapped));
2239 	}
2240 	mutex_unlock(&po->pg_vec_lock);
2241 
2242 	spin_lock(&po->bind_lock);
2243 	if (was_running && !po->running) {
2244 		sock_hold(sk);
2245 		po->running = 1;
2246 		po->num = num;
2247 		dev_add_pack(&po->prot_hook);
2248 	}
2249 	spin_unlock(&po->bind_lock);
2250 
2251 	release_sock(sk);
2252 
2253 	if (pg_vec)
2254 		free_pg_vec(pg_vec, order, req->tp_block_nr);
2255 out:
2256 	return err;
2257 }
2258 
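/* Map the rx ring followed by the tx ring into a single VMA.  The
 * mapping must start at offset 0 and cover exactly the combined size of
 * both rings; every page of every block is inserted individually, and
 * po->mapped is bumped so the rings cannot be resized while mapped.
 */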
2259 static int packet_mmap(struct file *file, struct socket *sock,
2260 		struct vm_area_struct *vma)
2261 {
2262 	struct sock *sk = sock->sk;
2263 	struct packet_sock *po = pkt_sk(sk);
2264 	unsigned long size, expected_size;
2265 	struct packet_ring_buffer *rb;
2266 	unsigned long start;
2267 	int err = -EINVAL;
2268 	int i;
2269 
2270 	if (vma->vm_pgoff)
2271 		return -EINVAL;
2272 
2273 	mutex_lock(&po->pg_vec_lock);
2274 
2275 	expected_size = 0;
2276 	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
2277 		if (rb->pg_vec) {
2278 			expected_size += rb->pg_vec_len
2279 						* rb->pg_vec_pages
2280 						* PAGE_SIZE;
2281 		}
2282 	}
2283 
2284 	if (expected_size == 0)
2285 		goto out;
2286 
2287 	size = vma->vm_end - vma->vm_start;
2288 	if (size != expected_size)
2289 		goto out;
2290 
2291 	start = vma->vm_start;
2292 	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
2293 		if (rb->pg_vec == NULL)
2294 			continue;
2295 
2296 		for (i = 0; i < rb->pg_vec_len; i++) {
2297 			struct page *page = virt_to_page(rb->pg_vec[i]);
2298 			int pg_num;
2299 
2300 			for (pg_num = 0; pg_num < rb->pg_vec_pages;
2301 					pg_num++, page++) {
2302 				err = vm_insert_page(vma, start, page);
2303 				if (unlikely(err))
2304 					goto out;
2305 				start += PAGE_SIZE;
2306 			}
2307 		}
2308 	}
2309 
2310 	atomic_inc(&po->mapped);
2311 	vma->vm_ops = &packet_mmap_ops;
2312 	err = 0;
2313 
2314 out:
2315 	mutex_unlock(&po->pg_vec_lock);
2316 	return err;
2317 }
2318 #endif
2319 
2320 
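/* Legacy SOCK_PACKET sockets are given the restricted spkt ops (no
 * socket options, no ring mmap); PF_PACKET SOCK_RAW/SOCK_DGRAM sockets
 * use packet_ops below.
 */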
2321 static const struct proto_ops packet_ops_spkt = {
2322 	.family =	PF_PACKET,
2323 	.owner =	THIS_MODULE,
2324 	.release =	packet_release,
2325 	.bind =		packet_bind_spkt,
2326 	.connect =	sock_no_connect,
2327 	.socketpair =	sock_no_socketpair,
2328 	.accept =	sock_no_accept,
2329 	.getname =	packet_getname_spkt,
2330 	.poll =		datagram_poll,
2331 	.ioctl =	packet_ioctl,
2332 	.listen =	sock_no_listen,
2333 	.shutdown =	sock_no_shutdown,
2334 	.setsockopt =	sock_no_setsockopt,
2335 	.getsockopt =	sock_no_getsockopt,
2336 	.sendmsg =	packet_sendmsg_spkt,
2337 	.recvmsg =	packet_recvmsg,
2338 	.mmap =		sock_no_mmap,
2339 	.sendpage =	sock_no_sendpage,
2340 };
2341 
2342 static const struct proto_ops packet_ops = {
2343 	.family =	PF_PACKET,
2344 	.owner =	THIS_MODULE,
2345 	.release =	packet_release,
2346 	.bind =		packet_bind,
2347 	.connect =	sock_no_connect,
2348 	.socketpair =	sock_no_socketpair,
2349 	.accept =	sock_no_accept,
2350 	.getname =	packet_getname,
2351 	.poll =		packet_poll,
2352 	.ioctl =	packet_ioctl,
2353 	.listen =	sock_no_listen,
2354 	.shutdown =	sock_no_shutdown,
2355 	.setsockopt =	packet_setsockopt,
2356 	.getsockopt =	packet_getsockopt,
2357 	.sendmsg =	packet_sendmsg,
2358 	.recvmsg =	packet_recvmsg,
2359 	.mmap =		packet_mmap,
2360 	.sendpage =	sock_no_sendpage,
2361 };
2362 
2363 static struct net_proto_family packet_family_ops = {
2364 	.family =	PF_PACKET,
2365 	.create =	packet_create,
2366 	.owner	=	THIS_MODULE,
2367 };
2368 
2369 static struct notifier_block packet_netdev_notifier = {
2370 	.notifier_call =	packet_notifier,
2371 };
2372 
2373 #ifdef CONFIG_PROC_FS
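/* /proc/net/packet: one line per packet socket in the current network
 * namespace, walked under the per-namespace packet.sklist_lock.
 */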
2374 static inline struct sock *packet_seq_idx(struct net *net, loff_t off)
2375 {
2376 	struct sock *s;
2377 	struct hlist_node *node;
2378 
2379 	sk_for_each(s, node, &net->packet.sklist) {
2380 		if (!off--)
2381 			return s;
2382 	}
2383 	return NULL;
2384 }
2385 
2386 static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
2387 	__acquires(seq_file_net(seq)->packet.sklist_lock)
2388 {
2389 	struct net *net = seq_file_net(seq);
2390 	read_lock(&net->packet.sklist_lock);
2391 	return *pos ? packet_seq_idx(net, *pos - 1) : SEQ_START_TOKEN;
2392 }
2393 
2394 static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2395 {
2396 	struct net *net = seq_file_net(seq);
2397 	++*pos;
2398 	return (v == SEQ_START_TOKEN)
2399 		? sk_head(&net->packet.sklist)
2400 		: sk_next((struct sock *)v);
2401 }
2402 
2403 static void packet_seq_stop(struct seq_file *seq, void *v)
2404 	__releases(seq_file_net(seq)->packet.sklist_lock)
2405 {
2406 	struct net *net = seq_file_net(seq);
2407 	read_unlock(&net->packet.sklist_lock);
2408 }
2409 
2410 static int packet_seq_show(struct seq_file *seq, void *v)
2411 {
2412 	if (v == SEQ_START_TOKEN)
2413 		seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
2414 	else {
2415 		struct sock *s = v;
2416 		const struct packet_sock *po = pkt_sk(s);
2417 
2418 		seq_printf(seq,
2419 			   "%p %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
2420 			   s,
2421 			   atomic_read(&s->sk_refcnt),
2422 			   s->sk_type,
2423 			   ntohs(po->num),
2424 			   po->ifindex,
2425 			   po->running,
2426 			   atomic_read(&s->sk_rmem_alloc),
2427 			   sock_i_uid(s),
2428 			   sock_i_ino(s));
2429 	}
2430 
2431 	return 0;
2432 }
2433 
2434 static const struct seq_operations packet_seq_ops = {
2435 	.start	= packet_seq_start,
2436 	.next	= packet_seq_next,
2437 	.stop	= packet_seq_stop,
2438 	.show	= packet_seq_show,
2439 };
2440 
2441 static int packet_seq_open(struct inode *inode, struct file *file)
2442 {
2443 	return seq_open_net(inode, file, &packet_seq_ops,
2444 			    sizeof(struct seq_net_private));
2445 }
2446 
2447 static const struct file_operations packet_seq_fops = {
2448 	.owner		= THIS_MODULE,
2449 	.open		= packet_seq_open,
2450 	.read		= seq_read,
2451 	.llseek		= seq_lseek,
2452 	.release	= seq_release_net,
2453 };
2454 
2455 #endif
2456 
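/* Per-namespace setup: initialise the packet socket list and create the
 * /proc/net/packet entry; teardown only has to remove the proc entry.
 */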
2457 static int packet_net_init(struct net *net)
2458 {
2459 	rwlock_init(&net->packet.sklist_lock);
2460 	INIT_HLIST_HEAD(&net->packet.sklist);
2461 
2462 	if (!proc_net_fops_create(net, "packet", 0, &packet_seq_fops))
2463 		return -ENOMEM;
2464 
2465 	return 0;
2466 }
2467 
2468 static void packet_net_exit(struct net *net)
2469 {
2470 	proc_net_remove(net, "packet");
2471 }
2472 
2473 static struct pernet_operations packet_net_ops = {
2474 	.init = packet_net_init,
2475 	.exit = packet_net_exit,
2476 };
2477 
2478 
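/* Module load registers the proto first, then the PF_PACKET family, the
 * pernet operations and the netdevice notifier; unload tears them down
 * in reverse order.
 */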
2479 static void __exit packet_exit(void)
2480 {
2481 	unregister_netdevice_notifier(&packet_netdev_notifier);
2482 	unregister_pernet_subsys(&packet_net_ops);
2483 	sock_unregister(PF_PACKET);
2484 	proto_unregister(&packet_proto);
2485 }
2486 
2487 static int __init packet_init(void)
2488 {
2489 	int rc = proto_register(&packet_proto, 0);
2490 
2491 	if (rc != 0)
2492 		goto out;
2493 
2494 	sock_register(&packet_family_ops);
2495 	register_pernet_subsys(&packet_net_ops);
2496 	register_netdevice_notifier(&packet_netdev_notifier);
2497 out:
2498 	return rc;
2499 }
2500 
2501 module_init(packet_init);
2502 module_exit(packet_exit);
2503 MODULE_LICENSE("GPL");
2504 MODULE_ALIAS_NETPROTO(PF_PACKET);
2505