xref: /linux/net/packet/af_packet.c (revision 3252b11fc4790d046b93f300c898df2f7cd7c176)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		PACKET - implements raw packet sockets.
7  *
8  * Authors:	Ross Biro
9  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *
12  * Fixes:
13  *		Alan Cox	:	verify_area() now used correctly
14  *		Alan Cox	:	new skbuff lists, look ma no backlogs!
15  *		Alan Cox	:	tidied skbuff lists.
16  *		Alan Cox	:	Now uses generic datagram routines I
17  *					added. Also fixed the peek/read crash
18  *					from all old Linux datagram code.
19  *		Alan Cox	:	Uses the improved datagram code.
20  *		Alan Cox	:	Added NULL's for socket options.
21  *		Alan Cox	:	Re-commented the code.
22  *		Alan Cox	:	Use new kernel side addressing
23  *		Rob Janssen	:	Correct MTU usage.
24  *		Dave Platt	:	Counter leaks caused by incorrect
25  *					interrupt locking and some slightly
26  *					dubious gcc output. Can you read
27  *					compiler: it said _VOLATILE_
28  *	Richard Kooijman	:	Timestamp fixes.
29  *		Alan Cox	:	New buffers. Use sk->mac.raw.
30  *		Alan Cox	:	sendmsg/recvmsg support.
31  *		Alan Cox	:	Protocol setting support
32  *	Alexey Kuznetsov	:	Untied from IPv4 stack.
33  *	Cyrus Durgin		:	Fixed kerneld for kmod.
34  *	Michal Ostrowski        :       Module initialization cleanup.
35  *         Ulises Alonso        :       Frame number limit removal and
36  *                                      packet_set_ring memory leak.
37  *		Eric Biederman	:	Allow for > 8 byte hardware addresses.
38  *					The convention is that longer addresses
39  *					will simply extend the hardware address
40  *					byte arrays at the end of sockaddr_ll
41  *					and packet_mreq.
42  *		Johann Baudy	:	Added TX RING.
43  *
44  *		This program is free software; you can redistribute it and/or
45  *		modify it under the terms of the GNU General Public License
46  *		as published by the Free Software Foundation; either version
47  *		2 of the License, or (at your option) any later version.
48  *
49  */
50 
51 #include <linux/types.h>
52 #include <linux/mm.h>
53 #include <linux/capability.h>
54 #include <linux/fcntl.h>
55 #include <linux/socket.h>
56 #include <linux/in.h>
57 #include <linux/inet.h>
58 #include <linux/netdevice.h>
59 #include <linux/if_packet.h>
60 #include <linux/wireless.h>
61 #include <linux/kernel.h>
62 #include <linux/kmod.h>
63 #include <net/net_namespace.h>
64 #include <net/ip.h>
65 #include <net/protocol.h>
66 #include <linux/skbuff.h>
67 #include <net/sock.h>
68 #include <linux/errno.h>
69 #include <linux/timer.h>
70 #include <asm/system.h>
71 #include <asm/uaccess.h>
72 #include <asm/ioctls.h>
73 #include <asm/page.h>
74 #include <asm/cacheflush.h>
75 #include <asm/io.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/poll.h>
79 #include <linux/module.h>
80 #include <linux/init.h>
81 #include <linux/mutex.h>
82 #include <linux/if_vlan.h>
83 
84 #ifdef CONFIG_INET
85 #include <net/inet_common.h>
86 #endif
87 
88 /*
89    Assumptions:
90    - if a device has no dev->hard_header routine, it adds and removes the ll
91      header itself. In this case the ll header is invisible outside of the
92      device, but higher levels should still reserve dev->hard_header_len.
93      Some devices are clever enough to reallocate the skb when the header
94      will not fit into the reserved space (tunnel); others are silly
95      (PPP).
96    - a packet socket receives packets with the ll header already pulled,
97      so SOCK_RAW should push it back.
98 
99 On receive:
100 -----------
101 
102 Incoming, dev->hard_header!=NULL
103    mac_header -> ll header
104    data       -> data
105 
106 Outgoing, dev->hard_header!=NULL
107    mac_header -> ll header
108    data       -> ll header
109 
110 Incoming, dev->hard_header==NULL
111    mac_header -> UNKNOWN position. It is very likely that it points to the ll
112 		 header.  PPP does this, which is wrong, because it introduces
113 		 asymmetry between the rx and tx paths.
114    data       -> data
115 
116 Outgoing, dev->hard_header==NULL
117    mac_header -> data. ll header is still not built!
118    data       -> data
119 
120 In summary
121   If dev->hard_header==NULL we are unlikely to restore a sensible ll header.
122 
123 
124 On transmit:
125 ------------
126 
127 dev->hard_header != NULL
128    mac_header -> ll header
129    data       -> ll header
130 
131 dev->hard_header == NULL (ll header is added by device, we cannot control it)
132    mac_header -> data
133    data       -> data
134 
135    We should set nh.raw on output to the correct position;
136    the packet classifier depends on it.
137  */
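
/*
 * Illustrative user-space sketch (an assumption, not part of this file): the
 * layout rules above are what make SOCK_RAW deliver frames starting at the
 * link-layer header while SOCK_DGRAM delivers them starting at the network
 * header.  Assuming a plain Ethernet device; buffer sizes are placeholders.
 *
 *	int raw   = socket(AF_PACKET, SOCK_RAW,   htons(ETH_P_ALL));
 *	int dgram = socket(AF_PACKET, SOCK_DGRAM, htons(ETH_P_ALL));
 *	unsigned char buf[2048];
 *
 *	recv(raw, buf, sizeof(buf), 0);	  // buf[0] is the first byte of the
 *					  // Ethernet header (destination MAC)
 *	recv(dgram, buf, sizeof(buf), 0); // buf[0] is the first byte of the
 *					  // network header (e.g. IP version)
 */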
138 
139 /* Private packet socket structures. */
140 
141 struct packet_mclist {
142 	struct packet_mclist	*next;
143 	int			ifindex;
144 	int			count;
145 	unsigned short		type;
146 	unsigned short		alen;
147 	unsigned char		addr[MAX_ADDR_LEN];
148 };
149 /* identical to struct packet_mreq except it has
150  * a longer address field.
151  */
152 struct packet_mreq_max {
153 	int		mr_ifindex;
154 	unsigned short	mr_type;
155 	unsigned short	mr_alen;
156 	unsigned char	mr_address[MAX_ADDR_LEN];
157 };
158 
159 #ifdef CONFIG_PACKET_MMAP
160 static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
161 		int closing, int tx_ring);
162 
163 struct packet_ring_buffer {
164 	char			**pg_vec;
165 	unsigned int		head;
166 	unsigned int		frames_per_block;
167 	unsigned int		frame_size;
168 	unsigned int		frame_max;
169 
170 	unsigned int		pg_vec_order;
171 	unsigned int		pg_vec_pages;
172 	unsigned int		pg_vec_len;
173 
174 	atomic_t		pending;
175 };
176 
177 struct packet_sock;
178 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
179 #endif
180 
181 static void packet_flush_mclist(struct sock *sk);
182 
183 struct packet_sock {
184 	/* struct sock has to be the first member of packet_sock */
185 	struct sock		sk;
186 	struct tpacket_stats	stats;
187 #ifdef CONFIG_PACKET_MMAP
188 	struct packet_ring_buffer	rx_ring;
189 	struct packet_ring_buffer	tx_ring;
190 	int			copy_thresh;
191 #endif
192 	spinlock_t		bind_lock;
193 	struct mutex		pg_vec_lock;
194 	unsigned int		running:1,	/* prot_hook is attached*/
195 				auxdata:1,
196 				origdev:1;
197 	int			ifindex;	/* bound device		*/
198 	__be16			num;
199 	struct packet_mclist	*mclist;
200 #ifdef CONFIG_PACKET_MMAP
201 	atomic_t		mapped;
202 	enum tpacket_versions	tp_version;
203 	unsigned int		tp_hdrlen;
204 	unsigned int		tp_reserve;
205 	unsigned int		tp_loss:1;
206 #endif
207 	struct packet_type	prot_hook ____cacheline_aligned_in_smp;
208 };
209 
210 struct packet_skb_cb {
211 	unsigned int origlen;
212 	union {
213 		struct sockaddr_pkt pkt;
214 		struct sockaddr_ll ll;
215 	} sa;
216 };
217 
218 #define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))
219 
220 #ifdef CONFIG_PACKET_MMAP
221 
222 static void __packet_set_status(struct packet_sock *po, void *frame, int status)
223 {
224 	union {
225 		struct tpacket_hdr *h1;
226 		struct tpacket2_hdr *h2;
227 		void *raw;
228 	} h;
229 
230 	h.raw = frame;
231 	switch (po->tp_version) {
232 	case TPACKET_V1:
233 		h.h1->tp_status = status;
234 		flush_dcache_page(virt_to_page(&h.h1->tp_status));
235 		break;
236 	case TPACKET_V2:
237 		h.h2->tp_status = status;
238 		flush_dcache_page(virt_to_page(&h.h2->tp_status));
239 		break;
240 	default:
241 		pr_err("TPACKET version not supported\n");
242 		BUG();
243 	}
244 
245 	smp_wmb();
246 }
247 
248 static int __packet_get_status(struct packet_sock *po, void *frame)
249 {
250 	union {
251 		struct tpacket_hdr *h1;
252 		struct tpacket2_hdr *h2;
253 		void *raw;
254 	} h;
255 
256 	smp_rmb();
257 
258 	h.raw = frame;
259 	switch (po->tp_version) {
260 	case TPACKET_V1:
261 		flush_dcache_page(virt_to_page(&h.h1->tp_status));
262 		return h.h1->tp_status;
263 	case TPACKET_V2:
264 		flush_dcache_page(virt_to_page(&h.h2->tp_status));
265 		return h.h2->tp_status;
266 	default:
267 		pr_err("TPACKET version not supported\n");
268 		BUG();
269 		return 0;
270 	}
271 }
272 
273 static void *packet_lookup_frame(struct packet_sock *po,
274 		struct packet_ring_buffer *rb,
275 		unsigned int position,
276 		int status)
277 {
278 	unsigned int pg_vec_pos, frame_offset;
279 	union {
280 		struct tpacket_hdr *h1;
281 		struct tpacket2_hdr *h2;
282 		void *raw;
283 	} h;
284 
285 	pg_vec_pos = position / rb->frames_per_block;
286 	frame_offset = position % rb->frames_per_block;
287 
288 	h.raw = rb->pg_vec[pg_vec_pos] + (frame_offset * rb->frame_size);
289 
290 	if (status != __packet_get_status(po, h.raw))
291 		return NULL;
292 
293 	return h.raw;
294 }
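
/*
 * Worked example of the lookup arithmetic above (assumed numbers, for
 * illustration only): with tp_block_size = 4096 and tp_frame_size = 2048,
 * frames_per_block is 2.  Looking up position 5 then gives
 * pg_vec_pos = 5 / 2 = 2 and frame_offset = 5 % 2 = 1, i.e. the frame
 * header lives at pg_vec[2] + 1 * 2048, inside the third block.
 */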
295 
296 static inline void *packet_current_frame(struct packet_sock *po,
297 		struct packet_ring_buffer *rb,
298 		int status)
299 {
300 	return packet_lookup_frame(po, rb, rb->head, status);
301 }
302 
303 static inline void *packet_previous_frame(struct packet_sock *po,
304 		struct packet_ring_buffer *rb,
305 		int status)
306 {
307 	unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
308 	return packet_lookup_frame(po, rb, previous, status);
309 }
310 
311 static inline void packet_increment_head(struct packet_ring_buffer *buff)
312 {
313 	buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
314 }
315 
316 #endif
317 
318 static inline struct packet_sock *pkt_sk(struct sock *sk)
319 {
320 	return (struct packet_sock *)sk;
321 }
322 
323 static void packet_sock_destruct(struct sock *sk)
324 {
325 	WARN_ON(atomic_read(&sk->sk_rmem_alloc));
326 	WARN_ON(atomic_read(&sk->sk_wmem_alloc));
327 
328 	if (!sock_flag(sk, SOCK_DEAD)) {
329 		pr_err("Attempt to release alive packet socket: %p\n", sk);
330 		return;
331 	}
332 
333 	sk_refcnt_debug_dec(sk);
334 }
335 
336 
337 static const struct proto_ops packet_ops;
338 
339 static const struct proto_ops packet_ops_spkt;
340 
341 static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
342 			   struct packet_type *pt, struct net_device *orig_dev)
343 {
344 	struct sock *sk;
345 	struct sockaddr_pkt *spkt;
346 
347 	/*
348 	 *	When we registered the protocol we saved the socket in the data
349 	 *	field for just this event.
350 	 */
351 
352 	sk = pt->af_packet_priv;
353 
354 	/*
355 	 *	Yank back the headers [hope the device set this
356 	 *	right or kerboom...]
357 	 *
358 	 *	Incoming packets have ll header pulled,
359 	 *	push it back.
360 	 *
361 	 *	For outgoing ones skb->data == skb_mac_header(skb)
362 	 *	so that this procedure is a no-op.
363 	 */
364 
365 	if (skb->pkt_type == PACKET_LOOPBACK)
366 		goto out;
367 
368 	if (!net_eq(dev_net(dev), sock_net(sk)))
369 		goto out;
370 
371 	skb = skb_share_check(skb, GFP_ATOMIC);
372 	if (skb == NULL)
373 		goto oom;
374 
375 	/* drop any routing info */
376 	skb_dst_drop(skb);
377 
378 	/* drop conntrack reference */
379 	nf_reset(skb);
380 
381 	spkt = &PACKET_SKB_CB(skb)->sa.pkt;
382 
383 	skb_push(skb, skb->data - skb_mac_header(skb));
384 
385 	/*
386 	 *	The SOCK_PACKET socket receives _all_ frames.
387 	 */
388 
389 	spkt->spkt_family = dev->type;
390 	strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
391 	spkt->spkt_protocol = skb->protocol;
392 
393 	/*
394 	 *	Charge the memory to the socket. This is done specifically
395 	 *	to prevent sockets from using up all the memory.
396 	 */
397 
398 	if (sock_queue_rcv_skb(sk, skb) == 0)
399 		return 0;
400 
401 out:
402 	kfree_skb(skb);
403 oom:
404 	return 0;
405 }
406 
407 
408 /*
409  *	Output a raw packet to a device layer. This bypasses all the other
410  *	protocol layers and you must therefore supply it with a complete frame
411  */
412 
413 static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
414 			       struct msghdr *msg, size_t len)
415 {
416 	struct sock *sk = sock->sk;
417 	struct sockaddr_pkt *saddr = (struct sockaddr_pkt *)msg->msg_name;
418 	struct sk_buff *skb;
419 	struct net_device *dev;
420 	__be16 proto = 0;
421 	int err;
422 
423 	/*
424 	 *	Get and verify the address.
425 	 */
426 
427 	if (saddr) {
428 		if (msg->msg_namelen < sizeof(struct sockaddr))
429 			return -EINVAL;
430 		if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
431 			proto = saddr->spkt_protocol;
432 	} else
433 		return -ENOTCONN;	/* SOCK_PACKET must be sent giving an address */
434 
435 	/*
436 	 *	Find the device first to size check it
437 	 */
438 
439 	saddr->spkt_device[13] = 0;
440 	rcu_read_lock();
441 	dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
442 	err = -ENODEV;
443 	if (dev == NULL)
444 		goto out_unlock;
445 
446 	err = -ENETDOWN;
447 	if (!(dev->flags & IFF_UP))
448 		goto out_unlock;
449 
450 	/*
451 	 * You may not queue a frame bigger than the mtu. This is the lowest level
452 	 * raw protocol and you must do your own fragmentation at this level.
453 	 */
454 
455 	err = -EMSGSIZE;
456 	if (len > dev->mtu + dev->hard_header_len)
457 		goto out_unlock;
458 
459 	err = -ENOBUFS;
460 	skb = sock_wmalloc(sk, len + LL_RESERVED_SPACE(dev), 0, GFP_KERNEL);
461 
462 	/*
463 	 * If the write buffer is full, then tough. At this level the user
464 	 * gets to deal with the problem - do your own algorithmic backoffs.
465 	 * That's far more flexible.
466 	 */
467 
468 	if (skb == NULL)
469 		goto out_unlock;
470 
471 	/*
472 	 *	Fill it in
473 	 */
474 
475 	/* FIXME: Save some space for broken drivers that write a
476 	 * hard header at transmission time by themselves. PPP is the
477 	 * notable one here. This should really be fixed at the driver level.
478 	 */
479 	skb_reserve(skb, LL_RESERVED_SPACE(dev));
480 	skb_reset_network_header(skb);
481 
482 	/* Try to align data part correctly */
483 	if (dev->header_ops) {
484 		skb->data -= dev->hard_header_len;
485 		skb->tail -= dev->hard_header_len;
486 		if (len < dev->hard_header_len)
487 			skb_reset_network_header(skb);
488 	}
489 
490 	/* Returns -EFAULT on error */
491 	err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
492 	skb->protocol = proto;
493 	skb->dev = dev;
494 	skb->priority = sk->sk_priority;
495 	skb->mark = sk->sk_mark;
496 	if (err)
497 		goto out_free;
498 
499 	/*
500 	 *	Now send it
501 	 */
502 
503 	dev_queue_xmit(skb);
504 	rcu_read_unlock();
505 	return len;
506 
507 out_free:
508 	kfree_skb(skb);
509 out_unlock:
510 	rcu_read_unlock();
511 	return err;
512 }
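
/*
 * Illustrative user-space sketch (an assumption, not part of this file): a
 * SOCK_PACKET sender must name the device and hand over a complete link-layer
 * frame no larger than MTU plus hard header, matching the checks above.
 * "eth0" and the frame contents are placeholders.
 *
 *	struct sockaddr_pkt spkt = { .spkt_family = AF_PACKET };
 *	unsigned char frame[64];	// complete frame incl. Ethernet header
 *
 *	strncpy((char *)spkt.spkt_device, "eth0", sizeof(spkt.spkt_device));
 *	spkt.spkt_protocol = htons(ETH_P_IP);
 *	sendto(fd, frame, sizeof(frame), 0,
 *	       (struct sockaddr *)&spkt, sizeof(spkt));
 */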
513 
514 static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
515 				      unsigned int res)
516 {
517 	struct sk_filter *filter;
518 
519 	rcu_read_lock_bh();
520 	filter = rcu_dereference(sk->sk_filter);
521 	if (filter != NULL)
522 		res = sk_run_filter(skb, filter->insns, filter->len);
523 	rcu_read_unlock_bh();
524 
525 	return res;
526 }
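
/*
 * The filter consulted by run_filter() is the classic BPF program a user
 * attaches with SO_ATTACH_FILTER.  A minimal sketch of the user-space side,
 * assuming an already created packet socket fd (the single "accept
 * everything" instruction is just a placeholder):
 *
 *	struct sock_filter code[] = {
 *		BPF_STMT(BPF_RET | BPF_K, 0xffffffff),	// accept whole packet
 *	};
 *	struct sock_fprog prog = { .len = 1, .filter = code };
 *
 *	setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog));
 */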
527 
528 /*
529    This function does lazy skb cloning in the hope that most packets
530    are discarded by BPF.
531 
532    Note the tricky part: we DO mangle the shared skb! skb->data, skb->len
533    and skb->cb are mangled. It works because (and until) packets
534    falling here are owned by the current CPU. Output packets are cloned
535    by dev_queue_xmit_nit(), input packets are processed by net_bh
536    sequentially, so if we return the skb to its original state on exit,
537    we will not harm anyone.
538  */
539 
540 static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
541 		      struct packet_type *pt, struct net_device *orig_dev)
542 {
543 	struct sock *sk;
544 	struct sockaddr_ll *sll;
545 	struct packet_sock *po;
546 	u8 *skb_head = skb->data;
547 	int skb_len = skb->len;
548 	unsigned int snaplen, res;
549 
550 	if (skb->pkt_type == PACKET_LOOPBACK)
551 		goto drop;
552 
553 	sk = pt->af_packet_priv;
554 	po = pkt_sk(sk);
555 
556 	if (!net_eq(dev_net(dev), sock_net(sk)))
557 		goto drop;
558 
559 	skb->dev = dev;
560 
561 	if (dev->header_ops) {
562 		/* The device has an explicit notion of ll header,
563 		   exported to higher levels.
564 
565 		   Otherwise, the device hides the details of its frame
566 		   structure, so that the corresponding packet head is
567 		   never delivered to the user.
568 		 */
569 		if (sk->sk_type != SOCK_DGRAM)
570 			skb_push(skb, skb->data - skb_mac_header(skb));
571 		else if (skb->pkt_type == PACKET_OUTGOING) {
572 			/* Special case: outgoing packets have ll header at head */
573 			skb_pull(skb, skb_network_offset(skb));
574 		}
575 	}
576 
577 	snaplen = skb->len;
578 
579 	res = run_filter(skb, sk, snaplen);
580 	if (!res)
581 		goto drop_n_restore;
582 	if (snaplen > res)
583 		snaplen = res;
584 
585 	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
586 	    (unsigned)sk->sk_rcvbuf)
587 		goto drop_n_acct;
588 
589 	if (skb_shared(skb)) {
590 		struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
591 		if (nskb == NULL)
592 			goto drop_n_acct;
593 
594 		if (skb_head != skb->data) {
595 			skb->data = skb_head;
596 			skb->len = skb_len;
597 		}
598 		kfree_skb(skb);
599 		skb = nskb;
600 	}
601 
602 	BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
603 		     sizeof(skb->cb));
604 
605 	sll = &PACKET_SKB_CB(skb)->sa.ll;
606 	sll->sll_family = AF_PACKET;
607 	sll->sll_hatype = dev->type;
608 	sll->sll_protocol = skb->protocol;
609 	sll->sll_pkttype = skb->pkt_type;
610 	if (unlikely(po->origdev))
611 		sll->sll_ifindex = orig_dev->ifindex;
612 	else
613 		sll->sll_ifindex = dev->ifindex;
614 
615 	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
616 
617 	PACKET_SKB_CB(skb)->origlen = skb->len;
618 
619 	if (pskb_trim(skb, snaplen))
620 		goto drop_n_acct;
621 
622 	skb_set_owner_r(skb, sk);
623 	skb->dev = NULL;
624 	skb_dst_drop(skb);
625 
626 	/* drop conntrack reference */
627 	nf_reset(skb);
628 
629 	spin_lock(&sk->sk_receive_queue.lock);
630 	po->stats.tp_packets++;
631 	skb->dropcount = atomic_read(&sk->sk_drops);
632 	__skb_queue_tail(&sk->sk_receive_queue, skb);
633 	spin_unlock(&sk->sk_receive_queue.lock);
634 	sk->sk_data_ready(sk, skb->len);
635 	return 0;
636 
637 drop_n_acct:
638 	po->stats.tp_drops = atomic_inc_return(&sk->sk_drops);
639 
640 drop_n_restore:
641 	if (skb_head != skb->data && skb_shared(skb)) {
642 		skb->data = skb_head;
643 		skb->len = skb_len;
644 	}
645 drop:
646 	consume_skb(skb);
647 	return 0;
648 }
649 
650 #ifdef CONFIG_PACKET_MMAP
651 static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
652 		       struct packet_type *pt, struct net_device *orig_dev)
653 {
654 	struct sock *sk;
655 	struct packet_sock *po;
656 	struct sockaddr_ll *sll;
657 	union {
658 		struct tpacket_hdr *h1;
659 		struct tpacket2_hdr *h2;
660 		void *raw;
661 	} h;
662 	u8 *skb_head = skb->data;
663 	int skb_len = skb->len;
664 	unsigned int snaplen, res;
665 	unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
666 	unsigned short macoff, netoff, hdrlen;
667 	struct sk_buff *copy_skb = NULL;
668 	struct timeval tv;
669 	struct timespec ts;
670 
671 	if (skb->pkt_type == PACKET_LOOPBACK)
672 		goto drop;
673 
674 	sk = pt->af_packet_priv;
675 	po = pkt_sk(sk);
676 
677 	if (!net_eq(dev_net(dev), sock_net(sk)))
678 		goto drop;
679 
680 	if (dev->header_ops) {
681 		if (sk->sk_type != SOCK_DGRAM)
682 			skb_push(skb, skb->data - skb_mac_header(skb));
683 		else if (skb->pkt_type == PACKET_OUTGOING) {
684 			/* Special case: outgoing packets have ll header at head */
685 			skb_pull(skb, skb_network_offset(skb));
686 		}
687 	}
688 
689 	if (skb->ip_summed == CHECKSUM_PARTIAL)
690 		status |= TP_STATUS_CSUMNOTREADY;
691 
692 	snaplen = skb->len;
693 
694 	res = run_filter(skb, sk, snaplen);
695 	if (!res)
696 		goto drop_n_restore;
697 	if (snaplen > res)
698 		snaplen = res;
699 
700 	if (sk->sk_type == SOCK_DGRAM) {
701 		macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
702 				  po->tp_reserve;
703 	} else {
704 		unsigned maclen = skb_network_offset(skb);
705 		netoff = TPACKET_ALIGN(po->tp_hdrlen +
706 				       (maclen < 16 ? 16 : maclen)) +
707 			po->tp_reserve;
708 		macoff = netoff - maclen;
709 	}
710 
711 	if (macoff + snaplen > po->rx_ring.frame_size) {
712 		if (po->copy_thresh &&
713 		    atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
714 		    (unsigned)sk->sk_rcvbuf) {
715 			if (skb_shared(skb)) {
716 				copy_skb = skb_clone(skb, GFP_ATOMIC);
717 			} else {
718 				copy_skb = skb_get(skb);
719 				skb_head = skb->data;
720 			}
721 			if (copy_skb)
722 				skb_set_owner_r(copy_skb, sk);
723 		}
724 		snaplen = po->rx_ring.frame_size - macoff;
725 		if ((int)snaplen < 0)
726 			snaplen = 0;
727 	}
728 
729 	spin_lock(&sk->sk_receive_queue.lock);
730 	h.raw = packet_current_frame(po, &po->rx_ring, TP_STATUS_KERNEL);
731 	if (!h.raw)
732 		goto ring_is_full;
733 	packet_increment_head(&po->rx_ring);
734 	po->stats.tp_packets++;
735 	if (copy_skb) {
736 		status |= TP_STATUS_COPY;
737 		__skb_queue_tail(&sk->sk_receive_queue, copy_skb);
738 	}
739 	if (!po->stats.tp_drops)
740 		status &= ~TP_STATUS_LOSING;
741 	spin_unlock(&sk->sk_receive_queue.lock);
742 
743 	skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
744 
745 	switch (po->tp_version) {
746 	case TPACKET_V1:
747 		h.h1->tp_len = skb->len;
748 		h.h1->tp_snaplen = snaplen;
749 		h.h1->tp_mac = macoff;
750 		h.h1->tp_net = netoff;
751 		if (skb->tstamp.tv64)
752 			tv = ktime_to_timeval(skb->tstamp);
753 		else
754 			do_gettimeofday(&tv);
755 		h.h1->tp_sec = tv.tv_sec;
756 		h.h1->tp_usec = tv.tv_usec;
757 		hdrlen = sizeof(*h.h1);
758 		break;
759 	case TPACKET_V2:
760 		h.h2->tp_len = skb->len;
761 		h.h2->tp_snaplen = snaplen;
762 		h.h2->tp_mac = macoff;
763 		h.h2->tp_net = netoff;
764 		if (skb->tstamp.tv64)
765 			ts = ktime_to_timespec(skb->tstamp);
766 		else
767 			getnstimeofday(&ts);
768 		h.h2->tp_sec = ts.tv_sec;
769 		h.h2->tp_nsec = ts.tv_nsec;
770 		h.h2->tp_vlan_tci = vlan_tx_tag_get(skb);
771 		hdrlen = sizeof(*h.h2);
772 		break;
773 	default:
774 		BUG();
775 	}
776 
777 	sll = h.raw + TPACKET_ALIGN(hdrlen);
778 	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
779 	sll->sll_family = AF_PACKET;
780 	sll->sll_hatype = dev->type;
781 	sll->sll_protocol = skb->protocol;
782 	sll->sll_pkttype = skb->pkt_type;
783 	if (unlikely(po->origdev))
784 		sll->sll_ifindex = orig_dev->ifindex;
785 	else
786 		sll->sll_ifindex = dev->ifindex;
787 
788 	__packet_set_status(po, h.raw, status);
789 	smp_mb();
790 	{
791 		struct page *p_start, *p_end;
792 		u8 *h_end = h.raw + macoff + snaplen - 1;
793 
794 		p_start = virt_to_page(h.raw);
795 		p_end = virt_to_page(h_end);
796 		while (p_start <= p_end) {
797 			flush_dcache_page(p_start);
798 			p_start++;
799 		}
800 	}
801 
802 	sk->sk_data_ready(sk, 0);
803 
804 drop_n_restore:
805 	if (skb_head != skb->data && skb_shared(skb)) {
806 		skb->data = skb_head;
807 		skb->len = skb_len;
808 	}
809 drop:
810 	kfree_skb(skb);
811 	return 0;
812 
813 ring_is_full:
814 	po->stats.tp_drops++;
815 	spin_unlock(&sk->sk_receive_queue.lock);
816 
817 	sk->sk_data_ready(sk, 0);
818 	kfree_skb(copy_skb);
819 	goto drop_n_restore;
820 }
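
/*
 * A descriptive sketch of the resulting frame layout in the rx ring, as
 * assembled above (offsets are relative to the start of the frame; exact
 * values depend on tp_version, tp_reserve and the link-layer header length):
 *
 *	+0			struct tpacket_hdr / tpacket2_hdr
 *	+TPACKET_ALIGN(hdrlen)	struct sockaddr_ll (sender address)
 *	+tp_mac			start of the copied link-layer header
 *	+tp_net			start of the network header
 *
 * User space is expected to wait for TP_STATUS_USER in tp_status, read the
 * frame, and hand it back to the kernel by writing TP_STATUS_KERNEL.
 */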
821 
822 static void tpacket_destruct_skb(struct sk_buff *skb)
823 {
824 	struct packet_sock *po = pkt_sk(skb->sk);
825 	void *ph;
826 
827 	BUG_ON(skb == NULL);
828 
829 	if (likely(po->tx_ring.pg_vec)) {
830 		ph = skb_shinfo(skb)->destructor_arg;
831 		BUG_ON(__packet_get_status(po, ph) != TP_STATUS_SENDING);
832 		BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
833 		atomic_dec(&po->tx_ring.pending);
834 		__packet_set_status(po, ph, TP_STATUS_AVAILABLE);
835 	}
836 
837 	sock_wfree(skb);
838 }
839 
840 static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
841 		void *frame, struct net_device *dev, int size_max,
842 		__be16 proto, unsigned char *addr)
843 {
844 	union {
845 		struct tpacket_hdr *h1;
846 		struct tpacket2_hdr *h2;
847 		void *raw;
848 	} ph;
849 	int to_write, offset, len, tp_len, nr_frags, len_max;
850 	struct socket *sock = po->sk.sk_socket;
851 	struct page *page;
852 	void *data;
853 	int err;
854 
855 	ph.raw = frame;
856 
857 	skb->protocol = proto;
858 	skb->dev = dev;
859 	skb->priority = po->sk.sk_priority;
860 	skb->mark = po->sk.sk_mark;
861 	skb_shinfo(skb)->destructor_arg = ph.raw;
862 
863 	switch (po->tp_version) {
864 	case TPACKET_V2:
865 		tp_len = ph.h2->tp_len;
866 		break;
867 	default:
868 		tp_len = ph.h1->tp_len;
869 		break;
870 	}
871 	if (unlikely(tp_len > size_max)) {
872 		pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
873 		return -EMSGSIZE;
874 	}
875 
876 	skb_reserve(skb, LL_RESERVED_SPACE(dev));
877 	skb_reset_network_header(skb);
878 
879 	data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
880 	to_write = tp_len;
881 
882 	if (sock->type == SOCK_DGRAM) {
883 		err = dev_hard_header(skb, dev, ntohs(proto), addr,
884 				NULL, tp_len);
885 		if (unlikely(err < 0))
886 			return -EINVAL;
887 	} else if (dev->hard_header_len) {
888 		/* net device doesn't like empty head */
889 		if (unlikely(tp_len <= dev->hard_header_len)) {
890 			pr_err("packet size is too short (%d < %d)\n",
891 			       tp_len, dev->hard_header_len);
892 			return -EINVAL;
893 		}
894 
895 		skb_push(skb, dev->hard_header_len);
896 		err = skb_store_bits(skb, 0, data,
897 				dev->hard_header_len);
898 		if (unlikely(err))
899 			return err;
900 
901 		data += dev->hard_header_len;
902 		to_write -= dev->hard_header_len;
903 	}
904 
905 	err = -EFAULT;
906 	page = virt_to_page(data);
907 	offset = offset_in_page(data);
908 	len_max = PAGE_SIZE - offset;
909 	len = ((to_write > len_max) ? len_max : to_write);
910 
911 	skb->data_len = to_write;
912 	skb->len += to_write;
913 	skb->truesize += to_write;
914 	atomic_add(to_write, &po->sk.sk_wmem_alloc);
915 
916 	while (likely(to_write)) {
917 		nr_frags = skb_shinfo(skb)->nr_frags;
918 
919 		if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
920 			pr_err("Packet exceeds the number of skb frags (%lu)\n",
921 			       MAX_SKB_FRAGS);
922 			return -EFAULT;
923 		}
924 
925 		flush_dcache_page(page);
926 		get_page(page);
927 		skb_fill_page_desc(skb,
928 				nr_frags,
929 				page++, offset, len);
930 		to_write -= len;
931 		offset = 0;
932 		len_max = PAGE_SIZE;
933 		len = ((to_write > len_max) ? len_max : to_write);
934 	}
935 
936 	return tp_len;
937 }
938 
939 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
940 {
941 	struct socket *sock;
942 	struct sk_buff *skb;
943 	struct net_device *dev;
944 	__be16 proto;
945 	int ifindex, err, reserve = 0;
946 	void *ph;
947 	struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
948 	int tp_len, size_max;
949 	unsigned char *addr;
950 	int len_sum = 0;
951 	int status = 0;
952 
953 	sock = po->sk.sk_socket;
954 
955 	mutex_lock(&po->pg_vec_lock);
956 
957 	err = -EBUSY;
958 	if (saddr == NULL) {
959 		ifindex	= po->ifindex;
960 		proto	= po->num;
961 		addr	= NULL;
962 	} else {
963 		err = -EINVAL;
964 		if (msg->msg_namelen < sizeof(struct sockaddr_ll))
965 			goto out;
966 		if (msg->msg_namelen < (saddr->sll_halen
967 					+ offsetof(struct sockaddr_ll,
968 						sll_addr)))
969 			goto out;
970 		ifindex	= saddr->sll_ifindex;
971 		proto	= saddr->sll_protocol;
972 		addr	= saddr->sll_addr;
973 	}
974 
975 	dev = dev_get_by_index(sock_net(&po->sk), ifindex);
976 	err = -ENXIO;
977 	if (unlikely(dev == NULL))
978 		goto out;
979 
980 	reserve = dev->hard_header_len;
981 
982 	err = -ENETDOWN;
983 	if (unlikely(!(dev->flags & IFF_UP)))
984 		goto out_put;
985 
986 	size_max = po->tx_ring.frame_size
987 		- (po->tp_hdrlen - sizeof(struct sockaddr_ll));
988 
989 	if (size_max > dev->mtu + reserve)
990 		size_max = dev->mtu + reserve;
991 
992 	do {
993 		ph = packet_current_frame(po, &po->tx_ring,
994 				TP_STATUS_SEND_REQUEST);
995 
996 		if (unlikely(ph == NULL)) {
997 			schedule();
998 			continue;
999 		}
1000 
1001 		status = TP_STATUS_SEND_REQUEST;
1002 		skb = sock_alloc_send_skb(&po->sk,
1003 				LL_ALLOCATED_SPACE(dev)
1004 				+ sizeof(struct sockaddr_ll),
1005 				0, &err);
1006 
1007 		if (unlikely(skb == NULL))
1008 			goto out_status;
1009 
1010 		tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
1011 				addr);
1012 
1013 		if (unlikely(tp_len < 0)) {
1014 			if (po->tp_loss) {
1015 				__packet_set_status(po, ph,
1016 						TP_STATUS_AVAILABLE);
1017 				packet_increment_head(&po->tx_ring);
1018 				kfree_skb(skb);
1019 				continue;
1020 			} else {
1021 				status = TP_STATUS_WRONG_FORMAT;
1022 				err = tp_len;
1023 				goto out_status;
1024 			}
1025 		}
1026 
1027 		skb->destructor = tpacket_destruct_skb;
1028 		__packet_set_status(po, ph, TP_STATUS_SENDING);
1029 		atomic_inc(&po->tx_ring.pending);
1030 
1031 		status = TP_STATUS_SEND_REQUEST;
1032 		err = dev_queue_xmit(skb);
1033 		if (unlikely(err > 0 && (err = net_xmit_errno(err)) != 0))
1034 			goto out_xmit;
1035 		packet_increment_head(&po->tx_ring);
1036 		len_sum += tp_len;
1037 	} while (likely((ph != NULL) ||
1038 			((!(msg->msg_flags & MSG_DONTWAIT)) &&
1039 			 (atomic_read(&po->tx_ring.pending))))
1040 		);
1041 
1042 	err = len_sum;
1043 	goto out_put;
1044 
1045 out_xmit:
1046 	skb->destructor = sock_wfree;
1047 	atomic_dec(&po->tx_ring.pending);
1048 out_status:
1049 	__packet_set_status(po, ph, status);
1050 	kfree_skb(skb);
1051 out_put:
1052 	dev_put(dev);
1053 out:
1054 	mutex_unlock(&po->pg_vec_lock);
1055 	return err;
1056 }
1057 #endif
1058 
1059 static int packet_snd(struct socket *sock,
1060 			  struct msghdr *msg, size_t len)
1061 {
1062 	struct sock *sk = sock->sk;
1063 	struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
1064 	struct sk_buff *skb;
1065 	struct net_device *dev;
1066 	__be16 proto;
1067 	unsigned char *addr;
1068 	int ifindex, err, reserve = 0;
1069 
1070 	/*
1071 	 *	Get and verify the address.
1072 	 */
1073 
1074 	if (saddr == NULL) {
1075 		struct packet_sock *po = pkt_sk(sk);
1076 
1077 		ifindex	= po->ifindex;
1078 		proto	= po->num;
1079 		addr	= NULL;
1080 	} else {
1081 		err = -EINVAL;
1082 		if (msg->msg_namelen < sizeof(struct sockaddr_ll))
1083 			goto out;
1084 		if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
1085 			goto out;
1086 		ifindex	= saddr->sll_ifindex;
1087 		proto	= saddr->sll_protocol;
1088 		addr	= saddr->sll_addr;
1089 	}
1090 
1091 
1092 	dev = dev_get_by_index(sock_net(sk), ifindex);
1093 	err = -ENXIO;
1094 	if (dev == NULL)
1095 		goto out_unlock;
1096 	if (sock->type == SOCK_RAW)
1097 		reserve = dev->hard_header_len;
1098 
1099 	err = -ENETDOWN;
1100 	if (!(dev->flags & IFF_UP))
1101 		goto out_unlock;
1102 
1103 	err = -EMSGSIZE;
1104 	if (len > dev->mtu+reserve)
1105 		goto out_unlock;
1106 
1107 	skb = sock_alloc_send_skb(sk, len + LL_ALLOCATED_SPACE(dev),
1108 				msg->msg_flags & MSG_DONTWAIT, &err);
1109 	if (skb == NULL)
1110 		goto out_unlock;
1111 
1112 	skb_reserve(skb, LL_RESERVED_SPACE(dev));
1113 	skb_reset_network_header(skb);
1114 
1115 	err = -EINVAL;
1116 	if (sock->type == SOCK_DGRAM &&
1117 	    dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len) < 0)
1118 		goto out_free;
1119 
1120 	/* Returns -EFAULT on error */
1121 	err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
1122 	if (err)
1123 		goto out_free;
1124 
1125 	skb->protocol = proto;
1126 	skb->dev = dev;
1127 	skb->priority = sk->sk_priority;
1128 	skb->mark = sk->sk_mark;
1129 
1130 	/*
1131 	 *	Now send it
1132 	 */
1133 
1134 	err = dev_queue_xmit(skb);
1135 	if (err > 0 && (err = net_xmit_errno(err)) != 0)
1136 		goto out_unlock;
1137 
1138 	dev_put(dev);
1139 
1140 	return len;
1141 
1142 out_free:
1143 	kfree_skb(skb);
1144 out_unlock:
1145 	if (dev)
1146 		dev_put(dev);
1147 out:
1148 	return err;
1149 }
1150 
1151 static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
1152 		struct msghdr *msg, size_t len)
1153 {
1154 #ifdef CONFIG_PACKET_MMAP
1155 	struct sock *sk = sock->sk;
1156 	struct packet_sock *po = pkt_sk(sk);
1157 	if (po->tx_ring.pg_vec)
1158 		return tpacket_snd(po, msg);
1159 	else
1160 #endif
1161 		return packet_snd(sock, msg, len);
1162 }
1163 
1164 /*
1165  *	Close a PACKET socket. This is fairly simple. We immediately go
1166  *	to 'closed' state and remove our protocol entry in the device list.
1167  */
1168 
1169 static int packet_release(struct socket *sock)
1170 {
1171 	struct sock *sk = sock->sk;
1172 	struct packet_sock *po;
1173 	struct net *net;
1174 #ifdef CONFIG_PACKET_MMAP
1175 	struct tpacket_req req;
1176 #endif
1177 
1178 	if (!sk)
1179 		return 0;
1180 
1181 	net = sock_net(sk);
1182 	po = pkt_sk(sk);
1183 
1184 	write_lock_bh(&net->packet.sklist_lock);
1185 	sk_del_node_init(sk);
1186 	sock_prot_inuse_add(net, sk->sk_prot, -1);
1187 	write_unlock_bh(&net->packet.sklist_lock);
1188 
1189 	/*
1190 	 *	Unhook packet receive handler.
1191 	 */
1192 
1193 	if (po->running) {
1194 		/*
1195 		 *	Remove the protocol hook
1196 		 */
1197 		dev_remove_pack(&po->prot_hook);
1198 		po->running = 0;
1199 		po->num = 0;
1200 		__sock_put(sk);
1201 	}
1202 
1203 	packet_flush_mclist(sk);
1204 
1205 #ifdef CONFIG_PACKET_MMAP
1206 	memset(&req, 0, sizeof(req));
1207 
1208 	if (po->rx_ring.pg_vec)
1209 		packet_set_ring(sk, &req, 1, 0);
1210 
1211 	if (po->tx_ring.pg_vec)
1212 		packet_set_ring(sk, &req, 1, 1);
1213 #endif
1214 
1215 	/*
1216 	 *	Now the socket is dead. No more input will appear.
1217 	 */
1218 
1219 	sock_orphan(sk);
1220 	sock->sk = NULL;
1221 
1222 	/* Purge queues */
1223 
1224 	skb_queue_purge(&sk->sk_receive_queue);
1225 	sk_refcnt_debug_release(sk);
1226 
1227 	sock_put(sk);
1228 	return 0;
1229 }
1230 
1231 /*
1232  *	Attach a packet hook.
1233  */
1234 
1235 static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
1236 {
1237 	struct packet_sock *po = pkt_sk(sk);
1238 	/*
1239 	 *	Detach an existing hook if present.
1240 	 */
1241 
1242 	lock_sock(sk);
1243 
1244 	spin_lock(&po->bind_lock);
1245 	if (po->running) {
1246 		__sock_put(sk);
1247 		po->running = 0;
1248 		po->num = 0;
1249 		spin_unlock(&po->bind_lock);
1250 		dev_remove_pack(&po->prot_hook);
1251 		spin_lock(&po->bind_lock);
1252 	}
1253 
1254 	po->num = protocol;
1255 	po->prot_hook.type = protocol;
1256 	po->prot_hook.dev = dev;
1257 
1258 	po->ifindex = dev ? dev->ifindex : 0;
1259 
1260 	if (protocol == 0)
1261 		goto out_unlock;
1262 
1263 	if (!dev || (dev->flags & IFF_UP)) {
1264 		dev_add_pack(&po->prot_hook);
1265 		sock_hold(sk);
1266 		po->running = 1;
1267 	} else {
1268 		sk->sk_err = ENETDOWN;
1269 		if (!sock_flag(sk, SOCK_DEAD))
1270 			sk->sk_error_report(sk);
1271 	}
1272 
1273 out_unlock:
1274 	spin_unlock(&po->bind_lock);
1275 	release_sock(sk);
1276 	return 0;
1277 }
1278 
1279 /*
1280  *	Bind a packet socket to a device
1281  */
1282 
1283 static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
1284 			    int addr_len)
1285 {
1286 	struct sock *sk = sock->sk;
1287 	char name[15];
1288 	struct net_device *dev;
1289 	int err = -ENODEV;
1290 
1291 	/*
1292 	 *	Check legality
1293 	 */
1294 
1295 	if (addr_len != sizeof(struct sockaddr))
1296 		return -EINVAL;
1297 	strlcpy(name, uaddr->sa_data, sizeof(name));
1298 
1299 	dev = dev_get_by_name(sock_net(sk), name);
1300 	if (dev) {
1301 		err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
1302 		dev_put(dev);
1303 	}
1304 	return err;
1305 }
1306 
1307 static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1308 {
1309 	struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
1310 	struct sock *sk = sock->sk;
1311 	struct net_device *dev = NULL;
1312 	int err;
1313 
1314 
1315 	/*
1316 	 *	Check legality
1317 	 */
1318 
1319 	if (addr_len < sizeof(struct sockaddr_ll))
1320 		return -EINVAL;
1321 	if (sll->sll_family != AF_PACKET)
1322 		return -EINVAL;
1323 
1324 	if (sll->sll_ifindex) {
1325 		err = -ENODEV;
1326 		dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
1327 		if (dev == NULL)
1328 			goto out;
1329 	}
1330 	err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
1331 	if (dev)
1332 		dev_put(dev);
1333 
1334 out:
1335 	return err;
1336 }
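
/*
 * Illustrative user-space sketch (an assumption, not part of this file):
 * binding a packet socket to one interface and one protocol, matching the
 * checks in packet_bind() above.  if_nametoindex("eth0") is a placeholder
 * lookup for the interface index.
 *
 *	struct sockaddr_ll sll = {
 *		.sll_family   = AF_PACKET,
 *		.sll_protocol = htons(ETH_P_IP),
 *		.sll_ifindex  = if_nametoindex("eth0"),
 *	};
 *
 *	bind(fd, (struct sockaddr *)&sll, sizeof(sll));
 */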
1337 
1338 static struct proto packet_proto = {
1339 	.name	  = "PACKET",
1340 	.owner	  = THIS_MODULE,
1341 	.obj_size = sizeof(struct packet_sock),
1342 };
1343 
1344 /*
1345  *	Create a packet of type SOCK_PACKET.
1346  */
1347 
1348 static int packet_create(struct net *net, struct socket *sock, int protocol,
1349 			 int kern)
1350 {
1351 	struct sock *sk;
1352 	struct packet_sock *po;
1353 	__be16 proto = (__force __be16)protocol; /* weird, but documented */
1354 	int err;
1355 
1356 	if (!capable(CAP_NET_RAW))
1357 		return -EPERM;
1358 	if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
1359 	    sock->type != SOCK_PACKET)
1360 		return -ESOCKTNOSUPPORT;
1361 
1362 	sock->state = SS_UNCONNECTED;
1363 
1364 	err = -ENOBUFS;
1365 	sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
1366 	if (sk == NULL)
1367 		goto out;
1368 
1369 	sock->ops = &packet_ops;
1370 	if (sock->type == SOCK_PACKET)
1371 		sock->ops = &packet_ops_spkt;
1372 
1373 	sock_init_data(sock, sk);
1374 
1375 	po = pkt_sk(sk);
1376 	sk->sk_family = PF_PACKET;
1377 	po->num = proto;
1378 
1379 	sk->sk_destruct = packet_sock_destruct;
1380 	sk_refcnt_debug_inc(sk);
1381 
1382 	/*
1383 	 *	Attach a protocol block
1384 	 */
1385 
1386 	spin_lock_init(&po->bind_lock);
1387 	mutex_init(&po->pg_vec_lock);
1388 	po->prot_hook.func = packet_rcv;
1389 
1390 	if (sock->type == SOCK_PACKET)
1391 		po->prot_hook.func = packet_rcv_spkt;
1392 
1393 	po->prot_hook.af_packet_priv = sk;
1394 
1395 	if (proto) {
1396 		po->prot_hook.type = proto;
1397 		dev_add_pack(&po->prot_hook);
1398 		sock_hold(sk);
1399 		po->running = 1;
1400 	}
1401 
1402 	write_lock_bh(&net->packet.sklist_lock);
1403 	sk_add_node(sk, &net->packet.sklist);
1404 	sock_prot_inuse_add(net, &packet_proto, 1);
1405 	write_unlock_bh(&net->packet.sklist_lock);
1406 	return 0;
1407 out:
1408 	return err;
1409 }
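
/*
 * Illustrative user-space sketch (an assumption, not part of this file): the
 * three socket types accepted by packet_create().  CAP_NET_RAW is required,
 * and the protocol argument is given in network byte order.
 *
 *	int raw   = socket(AF_PACKET, SOCK_RAW,    htons(ETH_P_ALL));
 *	int dgram = socket(AF_PACKET, SOCK_DGRAM,  htons(ETH_P_ALL));
 *	int spkt  = socket(AF_PACKET, SOCK_PACKET, htons(ETH_P_ALL));
 */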
1410 
1411 /*
1412  *	Pull a packet from our receive queue and hand it to the user.
1413  *	If necessary we block.
1414  */
1415 
1416 static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
1417 			  struct msghdr *msg, size_t len, int flags)
1418 {
1419 	struct sock *sk = sock->sk;
1420 	struct sk_buff *skb;
1421 	int copied, err;
1422 	struct sockaddr_ll *sll;
1423 
1424 	err = -EINVAL;
1425 	if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT))
1426 		goto out;
1427 
1428 #if 0
1429 	/* What error should we return now? EUNATTACH? */
1430 	if (pkt_sk(sk)->ifindex < 0)
1431 		return -ENODEV;
1432 #endif
1433 
1434 	/*
1435 	 *	Call the generic datagram receiver. This handles all sorts
1436 	 *	of horrible races and re-entrancy so we can forget about it
1437 	 *	in the protocol layers.
1438 	 *
1439 	 *	Now it will return ENETDOWN if the device has just gone down,
1440 	 *	but then it will block.
1441 	 */
1442 
1443 	skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
1444 
1445 	/*
1446 	 *	An error occurred, so return it. Because skb_recv_datagram()
1447 	 *	handles the blocking, we don't need to see or worry about
1448 	 *	blocking retries.
1449 	 */
1450 
1451 	if (skb == NULL)
1452 		goto out;
1453 
1454 	/*
1455 	 *	If the address length field is there to be filled in, we fill
1456 	 *	it in now.
1457 	 */
1458 
1459 	sll = &PACKET_SKB_CB(skb)->sa.ll;
1460 	if (sock->type == SOCK_PACKET)
1461 		msg->msg_namelen = sizeof(struct sockaddr_pkt);
1462 	else
1463 		msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);
1464 
1465 	/*
1466 	 *	You lose any data beyond the buffer you gave. If it worries a
1467 	 *	user program, they can ask the device for its MTU anyway.
1468 	 */
1469 
1470 	copied = skb->len;
1471 	if (copied > len) {
1472 		copied = len;
1473 		msg->msg_flags |= MSG_TRUNC;
1474 	}
1475 
1476 	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
1477 	if (err)
1478 		goto out_free;
1479 
1480 	sock_recv_ts_and_drops(msg, sk, skb);
1481 
1482 	if (msg->msg_name)
1483 		memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
1484 		       msg->msg_namelen);
1485 
1486 	if (pkt_sk(sk)->auxdata) {
1487 		struct tpacket_auxdata aux;
1488 
1489 		aux.tp_status = TP_STATUS_USER;
1490 		if (skb->ip_summed == CHECKSUM_PARTIAL)
1491 			aux.tp_status |= TP_STATUS_CSUMNOTREADY;
1492 		aux.tp_len = PACKET_SKB_CB(skb)->origlen;
1493 		aux.tp_snaplen = skb->len;
1494 		aux.tp_mac = 0;
1495 		aux.tp_net = skb_network_offset(skb);
1496 		aux.tp_vlan_tci = vlan_tx_tag_get(skb);
1497 
1498 		put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
1499 	}
1500 
1501 	/*
1502 	 *	Free or return the buffer as appropriate. Again this
1503 	 *	hides all the races and re-entrancy issues from us.
1504 	 */
1505 	err = (flags&MSG_TRUNC) ? skb->len : copied;
1506 
1507 out_free:
1508 	skb_free_datagram(sk, skb);
1509 out:
1510 	return err;
1511 }
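
/*
 * Illustrative user-space sketch (an assumption, not part of this file): once
 * PACKET_AUXDATA has been enabled with setsockopt(), the metadata filled in
 * above arrives as a control message alongside each packet.
 *
 *	char data[2048], ctrl[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
 *	struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
 *	struct msghdr msg = {
 *		.msg_iov = &iov, .msg_iovlen = 1,
 *		.msg_control = ctrl, .msg_controllen = sizeof(ctrl),
 *	};
 *	struct cmsghdr *cmsg;
 *
 *	recvmsg(fd, &msg, 0);
 *	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg))
 *		if (cmsg->cmsg_level == SOL_PACKET &&
 *		    cmsg->cmsg_type == PACKET_AUXDATA) {
 *			struct tpacket_auxdata *aux = (void *)CMSG_DATA(cmsg);
 *			// aux->tp_len is the original length, aux->tp_snaplen
 *			// is how much was actually captured
 *		}
 */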
1512 
1513 static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
1514 			       int *uaddr_len, int peer)
1515 {
1516 	struct net_device *dev;
1517 	struct sock *sk	= sock->sk;
1518 
1519 	if (peer)
1520 		return -EOPNOTSUPP;
1521 
1522 	uaddr->sa_family = AF_PACKET;
1523 	rcu_read_lock();
1524 	dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
1525 	if (dev)
1526 		strlcpy(uaddr->sa_data, dev->name, 15);
1527 	else
1528 		memset(uaddr->sa_data, 0, 14);
1529 	rcu_read_unlock();
1530 	*uaddr_len = sizeof(*uaddr);
1531 
1532 	return 0;
1533 }
1534 
1535 static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
1536 			  int *uaddr_len, int peer)
1537 {
1538 	struct net_device *dev;
1539 	struct sock *sk = sock->sk;
1540 	struct packet_sock *po = pkt_sk(sk);
1541 	DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
1542 
1543 	if (peer)
1544 		return -EOPNOTSUPP;
1545 
1546 	sll->sll_family = AF_PACKET;
1547 	sll->sll_ifindex = po->ifindex;
1548 	sll->sll_protocol = po->num;
1549 	rcu_read_lock();
1550 	dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
1551 	if (dev) {
1552 		sll->sll_hatype = dev->type;
1553 		sll->sll_halen = dev->addr_len;
1554 		memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1555 	} else {
1556 		sll->sll_hatype = 0;	/* Bad: we have no ARPHRD_UNSPEC */
1557 		sll->sll_halen = 0;
1558 	}
1559 	rcu_read_unlock();
1560 	*uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1561 
1562 	return 0;
1563 }
1564 
1565 static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
1566 			 int what)
1567 {
1568 	switch (i->type) {
1569 	case PACKET_MR_MULTICAST:
1570 		if (what > 0)
1571 			return dev_mc_add(dev, i->addr, i->alen, 0);
1572 		else
1573 			return dev_mc_delete(dev, i->addr, i->alen, 0);
1574 		break;
1575 	case PACKET_MR_PROMISC:
1576 		return dev_set_promiscuity(dev, what);
1577 		break;
1578 	case PACKET_MR_ALLMULTI:
1579 		return dev_set_allmulti(dev, what);
1580 		break;
1581 	case PACKET_MR_UNICAST:
1582 		if (what > 0)
1583 			return dev_unicast_add(dev, i->addr);
1584 		else
1585 			return dev_unicast_delete(dev, i->addr);
1586 		break;
1587 	default:
1588 		break;
1589 	}
1590 	return 0;
1591 }
1592 
1593 static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
1594 {
1595 	for ( ; i; i = i->next) {
1596 		if (i->ifindex == dev->ifindex)
1597 			packet_dev_mc(dev, i, what);
1598 	}
1599 }
1600 
1601 static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1602 {
1603 	struct packet_sock *po = pkt_sk(sk);
1604 	struct packet_mclist *ml, *i;
1605 	struct net_device *dev;
1606 	int err;
1607 
1608 	rtnl_lock();
1609 
1610 	err = -ENODEV;
1611 	dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1612 	if (!dev)
1613 		goto done;
1614 
1615 	err = -EINVAL;
1616 	if (mreq->mr_alen > dev->addr_len)
1617 		goto done;
1618 
1619 	err = -ENOBUFS;
1620 	i = kmalloc(sizeof(*i), GFP_KERNEL);
1621 	if (i == NULL)
1622 		goto done;
1623 
1624 	err = 0;
1625 	for (ml = po->mclist; ml; ml = ml->next) {
1626 		if (ml->ifindex == mreq->mr_ifindex &&
1627 		    ml->type == mreq->mr_type &&
1628 		    ml->alen == mreq->mr_alen &&
1629 		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1630 			ml->count++;
1631 			/* Free the new element ... */
1632 			kfree(i);
1633 			goto done;
1634 		}
1635 	}
1636 
1637 	i->type = mreq->mr_type;
1638 	i->ifindex = mreq->mr_ifindex;
1639 	i->alen = mreq->mr_alen;
1640 	memcpy(i->addr, mreq->mr_address, i->alen);
1641 	i->count = 1;
1642 	i->next = po->mclist;
1643 	po->mclist = i;
1644 	err = packet_dev_mc(dev, i, 1);
1645 	if (err) {
1646 		po->mclist = i->next;
1647 		kfree(i);
1648 	}
1649 
1650 done:
1651 	rtnl_unlock();
1652 	return err;
1653 }
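
/*
 * Illustrative user-space sketch (an assumption, not part of this file):
 * requesting promiscuous mode on one interface via the membership mechanism
 * handled above, instead of flipping IFF_PROMISC directly.
 *
 *	struct packet_mreq mreq = {
 *		.mr_ifindex = if_nametoindex("eth0"),	// placeholder name
 *		.mr_type    = PACKET_MR_PROMISC,
 *	};
 *
 *	setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP, &mreq, sizeof(mreq));
 */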
1654 
1655 static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1656 {
1657 	struct packet_mclist *ml, **mlp;
1658 
1659 	rtnl_lock();
1660 
1661 	for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
1662 		if (ml->ifindex == mreq->mr_ifindex &&
1663 		    ml->type == mreq->mr_type &&
1664 		    ml->alen == mreq->mr_alen &&
1665 		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1666 			if (--ml->count == 0) {
1667 				struct net_device *dev;
1668 				*mlp = ml->next;
1669 				dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
1670 				if (dev)
1671 					packet_dev_mc(dev, ml, -1);
1672 				kfree(ml);
1673 			}
1674 			rtnl_unlock();
1675 			return 0;
1676 		}
1677 	}
1678 	rtnl_unlock();
1679 	return -EADDRNOTAVAIL;
1680 }
1681 
1682 static void packet_flush_mclist(struct sock *sk)
1683 {
1684 	struct packet_sock *po = pkt_sk(sk);
1685 	struct packet_mclist *ml;
1686 
1687 	if (!po->mclist)
1688 		return;
1689 
1690 	rtnl_lock();
1691 	while ((ml = po->mclist) != NULL) {
1692 		struct net_device *dev;
1693 
1694 		po->mclist = ml->next;
1695 		dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
1696 		if (dev != NULL)
1697 			packet_dev_mc(dev, ml, -1);
1698 		kfree(ml);
1699 	}
1700 	rtnl_unlock();
1701 }
1702 
1703 static int
1704 packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
1705 {
1706 	struct sock *sk = sock->sk;
1707 	struct packet_sock *po = pkt_sk(sk);
1708 	int ret;
1709 
1710 	if (level != SOL_PACKET)
1711 		return -ENOPROTOOPT;
1712 
1713 	switch (optname) {
1714 	case PACKET_ADD_MEMBERSHIP:
1715 	case PACKET_DROP_MEMBERSHIP:
1716 	{
1717 		struct packet_mreq_max mreq;
1718 		int len = optlen;
1719 		memset(&mreq, 0, sizeof(mreq));
1720 		if (len < sizeof(struct packet_mreq))
1721 			return -EINVAL;
1722 		if (len > sizeof(mreq))
1723 			len = sizeof(mreq);
1724 		if (copy_from_user(&mreq, optval, len))
1725 			return -EFAULT;
1726 		if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
1727 			return -EINVAL;
1728 		if (optname == PACKET_ADD_MEMBERSHIP)
1729 			ret = packet_mc_add(sk, &mreq);
1730 		else
1731 			ret = packet_mc_drop(sk, &mreq);
1732 		return ret;
1733 	}
1734 
1735 #ifdef CONFIG_PACKET_MMAP
1736 	case PACKET_RX_RING:
1737 	case PACKET_TX_RING:
1738 	{
1739 		struct tpacket_req req;
1740 
1741 		if (optlen < sizeof(req))
1742 			return -EINVAL;
1743 		if (copy_from_user(&req, optval, sizeof(req)))
1744 			return -EFAULT;
1745 		return packet_set_ring(sk, &req, 0, optname == PACKET_TX_RING);
1746 	}
1747 	case PACKET_COPY_THRESH:
1748 	{
1749 		int val;
1750 
1751 		if (optlen != sizeof(val))
1752 			return -EINVAL;
1753 		if (copy_from_user(&val, optval, sizeof(val)))
1754 			return -EFAULT;
1755 
1756 		pkt_sk(sk)->copy_thresh = val;
1757 		return 0;
1758 	}
1759 	case PACKET_VERSION:
1760 	{
1761 		int val;
1762 
1763 		if (optlen != sizeof(val))
1764 			return -EINVAL;
1765 		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1766 			return -EBUSY;
1767 		if (copy_from_user(&val, optval, sizeof(val)))
1768 			return -EFAULT;
1769 		switch (val) {
1770 		case TPACKET_V1:
1771 		case TPACKET_V2:
1772 			po->tp_version = val;
1773 			return 0;
1774 		default:
1775 			return -EINVAL;
1776 		}
1777 	}
1778 	case PACKET_RESERVE:
1779 	{
1780 		unsigned int val;
1781 
1782 		if (optlen != sizeof(val))
1783 			return -EINVAL;
1784 		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1785 			return -EBUSY;
1786 		if (copy_from_user(&val, optval, sizeof(val)))
1787 			return -EFAULT;
1788 		po->tp_reserve = val;
1789 		return 0;
1790 	}
1791 	case PACKET_LOSS:
1792 	{
1793 		unsigned int val;
1794 
1795 		if (optlen != sizeof(val))
1796 			return -EINVAL;
1797 		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1798 			return -EBUSY;
1799 		if (copy_from_user(&val, optval, sizeof(val)))
1800 			return -EFAULT;
1801 		po->tp_loss = !!val;
1802 		return 0;
1803 	}
1804 #endif
1805 	case PACKET_AUXDATA:
1806 	{
1807 		int val;
1808 
1809 		if (optlen < sizeof(val))
1810 			return -EINVAL;
1811 		if (copy_from_user(&val, optval, sizeof(val)))
1812 			return -EFAULT;
1813 
1814 		po->auxdata = !!val;
1815 		return 0;
1816 	}
1817 	case PACKET_ORIGDEV:
1818 	{
1819 		int val;
1820 
1821 		if (optlen < sizeof(val))
1822 			return -EINVAL;
1823 		if (copy_from_user(&val, optval, sizeof(val)))
1824 			return -EFAULT;
1825 
1826 		po->origdev = !!val;
1827 		return 0;
1828 	}
1829 	default:
1830 		return -ENOPROTOOPT;
1831 	}
1832 }
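
/*
 * Illustrative user-space sketch (an assumption, not part of this file):
 * setting up a TPACKET_V2 receive ring with the options handled above and
 * mapping it.  The sizes are placeholders; tp_block_size must be a multiple
 * of the page size and tp_frame_nr must equal frames-per-block times
 * tp_block_nr, as enforced by packet_set_ring().
 *
 *	int ver = TPACKET_V2;
 *	struct tpacket_req req = {
 *		.tp_block_size = 4096,
 *		.tp_block_nr   = 64,
 *		.tp_frame_size = 2048,
 *		.tp_frame_nr   = 128,	// (4096 / 2048) * 64
 *	};
 *	void *ring;
 *
 *	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *	ring = mmap(NULL, (size_t)req.tp_block_size * req.tp_block_nr,
 *		    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 */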
1833 
1834 static int packet_getsockopt(struct socket *sock, int level, int optname,
1835 			     char __user *optval, int __user *optlen)
1836 {
1837 	int len;
1838 	int val;
1839 	struct sock *sk = sock->sk;
1840 	struct packet_sock *po = pkt_sk(sk);
1841 	void *data;
1842 	struct tpacket_stats st;
1843 
1844 	if (level != SOL_PACKET)
1845 		return -ENOPROTOOPT;
1846 
1847 	if (get_user(len, optlen))
1848 		return -EFAULT;
1849 
1850 	if (len < 0)
1851 		return -EINVAL;
1852 
1853 	switch (optname) {
1854 	case PACKET_STATISTICS:
1855 		if (len > sizeof(struct tpacket_stats))
1856 			len = sizeof(struct tpacket_stats);
1857 		spin_lock_bh(&sk->sk_receive_queue.lock);
1858 		st = po->stats;
1859 		memset(&po->stats, 0, sizeof(st));
1860 		spin_unlock_bh(&sk->sk_receive_queue.lock);
1861 		st.tp_packets += st.tp_drops;
1862 
1863 		data = &st;
1864 		break;
1865 	case PACKET_AUXDATA:
1866 		if (len > sizeof(int))
1867 			len = sizeof(int);
1868 		val = po->auxdata;
1869 
1870 		data = &val;
1871 		break;
1872 	case PACKET_ORIGDEV:
1873 		if (len > sizeof(int))
1874 			len = sizeof(int);
1875 		val = po->origdev;
1876 
1877 		data = &val;
1878 		break;
1879 #ifdef CONFIG_PACKET_MMAP
1880 	case PACKET_VERSION:
1881 		if (len > sizeof(int))
1882 			len = sizeof(int);
1883 		val = po->tp_version;
1884 		data = &val;
1885 		break;
1886 	case PACKET_HDRLEN:
1887 		if (len > sizeof(int))
1888 			len = sizeof(int);
1889 		if (copy_from_user(&val, optval, len))
1890 			return -EFAULT;
1891 		switch (val) {
1892 		case TPACKET_V1:
1893 			val = sizeof(struct tpacket_hdr);
1894 			break;
1895 		case TPACKET_V2:
1896 			val = sizeof(struct tpacket2_hdr);
1897 			break;
1898 		default:
1899 			return -EINVAL;
1900 		}
1901 		data = &val;
1902 		break;
1903 	case PACKET_RESERVE:
1904 		if (len > sizeof(unsigned int))
1905 			len = sizeof(unsigned int);
1906 		val = po->tp_reserve;
1907 		data = &val;
1908 		break;
1909 	case PACKET_LOSS:
1910 		if (len > sizeof(unsigned int))
1911 			len = sizeof(unsigned int);
1912 		val = po->tp_loss;
1913 		data = &val;
1914 		break;
1915 #endif
1916 	default:
1917 		return -ENOPROTOOPT;
1918 	}
1919 
1920 	if (put_user(len, optlen))
1921 		return -EFAULT;
1922 	if (copy_to_user(optval, data, len))
1923 		return -EFAULT;
1924 	return 0;
1925 }
1926 
1927 
1928 static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
1929 {
1930 	struct sock *sk;
1931 	struct hlist_node *node;
1932 	struct net_device *dev = data;
1933 	struct net *net = dev_net(dev);
1934 
1935 	read_lock(&net->packet.sklist_lock);
1936 	sk_for_each(sk, node, &net->packet.sklist) {
1937 		struct packet_sock *po = pkt_sk(sk);
1938 
1939 		switch (msg) {
1940 		case NETDEV_UNREGISTER:
1941 			if (po->mclist)
1942 				packet_dev_mclist(dev, po->mclist, -1);
1943 			/* fallthrough */
1944 
1945 		case NETDEV_DOWN:
1946 			if (dev->ifindex == po->ifindex) {
1947 				spin_lock(&po->bind_lock);
1948 				if (po->running) {
1949 					__dev_remove_pack(&po->prot_hook);
1950 					__sock_put(sk);
1951 					po->running = 0;
1952 					sk->sk_err = ENETDOWN;
1953 					if (!sock_flag(sk, SOCK_DEAD))
1954 						sk->sk_error_report(sk);
1955 				}
1956 				if (msg == NETDEV_UNREGISTER) {
1957 					po->ifindex = -1;
1958 					po->prot_hook.dev = NULL;
1959 				}
1960 				spin_unlock(&po->bind_lock);
1961 			}
1962 			break;
1963 		case NETDEV_UP:
1964 			spin_lock(&po->bind_lock);
1965 			if (dev->ifindex == po->ifindex && po->num &&
1966 			    !po->running) {
1967 				dev_add_pack(&po->prot_hook);
1968 				sock_hold(sk);
1969 				po->running = 1;
1970 			}
1971 			spin_unlock(&po->bind_lock);
1972 			break;
1973 		}
1974 	}
1975 	read_unlock(&net->packet.sklist_lock);
1976 	return NOTIFY_DONE;
1977 }
1978 
1979 
1980 static int packet_ioctl(struct socket *sock, unsigned int cmd,
1981 			unsigned long arg)
1982 {
1983 	struct sock *sk = sock->sk;
1984 
1985 	switch (cmd) {
1986 	case SIOCOUTQ:
1987 	{
1988 		int amount = sk_wmem_alloc_get(sk);
1989 
1990 		return put_user(amount, (int __user *)arg);
1991 	}
1992 	case SIOCINQ:
1993 	{
1994 		struct sk_buff *skb;
1995 		int amount = 0;
1996 
1997 		spin_lock_bh(&sk->sk_receive_queue.lock);
1998 		skb = skb_peek(&sk->sk_receive_queue);
1999 		if (skb)
2000 			amount = skb->len;
2001 		spin_unlock_bh(&sk->sk_receive_queue.lock);
2002 		return put_user(amount, (int __user *)arg);
2003 	}
2004 	case SIOCGSTAMP:
2005 		return sock_get_timestamp(sk, (struct timeval __user *)arg);
2006 	case SIOCGSTAMPNS:
2007 		return sock_get_timestampns(sk, (struct timespec __user *)arg);
2008 
2009 #ifdef CONFIG_INET
2010 	case SIOCADDRT:
2011 	case SIOCDELRT:
2012 	case SIOCDARP:
2013 	case SIOCGARP:
2014 	case SIOCSARP:
2015 	case SIOCGIFADDR:
2016 	case SIOCSIFADDR:
2017 	case SIOCGIFBRDADDR:
2018 	case SIOCSIFBRDADDR:
2019 	case SIOCGIFNETMASK:
2020 	case SIOCSIFNETMASK:
2021 	case SIOCGIFDSTADDR:
2022 	case SIOCSIFDSTADDR:
2023 	case SIOCSIFFLAGS:
2024 		if (!net_eq(sock_net(sk), &init_net))
2025 			return -ENOIOCTLCMD;
2026 		return inet_dgram_ops.ioctl(sock, cmd, arg);
2027 #endif
2028 
2029 	default:
2030 		return -ENOIOCTLCMD;
2031 	}
2032 	return 0;
2033 }
2034 
2035 #ifndef CONFIG_PACKET_MMAP
2036 #define packet_mmap sock_no_mmap
2037 #define packet_poll datagram_poll
2038 #else
2039 
2040 static unsigned int packet_poll(struct file *file, struct socket *sock,
2041 				poll_table *wait)
2042 {
2043 	struct sock *sk = sock->sk;
2044 	struct packet_sock *po = pkt_sk(sk);
2045 	unsigned int mask = datagram_poll(file, sock, wait);
2046 
2047 	spin_lock_bh(&sk->sk_receive_queue.lock);
2048 	if (po->rx_ring.pg_vec) {
2049 		if (!packet_previous_frame(po, &po->rx_ring, TP_STATUS_KERNEL))
2050 			mask |= POLLIN | POLLRDNORM;
2051 	}
2052 	spin_unlock_bh(&sk->sk_receive_queue.lock);
2053 	spin_lock_bh(&sk->sk_write_queue.lock);
2054 	if (po->tx_ring.pg_vec) {
2055 		if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
2056 			mask |= POLLOUT | POLLWRNORM;
2057 	}
2058 	spin_unlock_bh(&sk->sk_write_queue.lock);
2059 	return mask;
2060 }
2061 
2062 
2063 /* Dirty? Well, I still have not learned a better way to account
2064  * for user mmaps.
2065  */
2066 
2067 static void packet_mm_open(struct vm_area_struct *vma)
2068 {
2069 	struct file *file = vma->vm_file;
2070 	struct socket *sock = file->private_data;
2071 	struct sock *sk = sock->sk;
2072 
2073 	if (sk)
2074 		atomic_inc(&pkt_sk(sk)->mapped);
2075 }
2076 
2077 static void packet_mm_close(struct vm_area_struct *vma)
2078 {
2079 	struct file *file = vma->vm_file;
2080 	struct socket *sock = file->private_data;
2081 	struct sock *sk = sock->sk;
2082 
2083 	if (sk)
2084 		atomic_dec(&pkt_sk(sk)->mapped);
2085 }
2086 
2087 static const struct vm_operations_struct packet_mmap_ops = {
2088 	.open	=	packet_mm_open,
2089 	.close	=	packet_mm_close,
2090 };
2091 
2092 static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
2093 {
2094 	int i;
2095 
2096 	for (i = 0; i < len; i++) {
2097 		if (likely(pg_vec[i]))
2098 			free_pages((unsigned long) pg_vec[i], order);
2099 	}
2100 	kfree(pg_vec);
2101 }
2102 
2103 static inline char *alloc_one_pg_vec_page(unsigned long order)
2104 {
2105 	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | __GFP_NOWARN;
2106 
2107 	return (char *) __get_free_pages(gfp_flags, order);
2108 }
2109 
2110 static char **alloc_pg_vec(struct tpacket_req *req, int order)
2111 {
2112 	unsigned int block_nr = req->tp_block_nr;
2113 	char **pg_vec;
2114 	int i;
2115 
2116 	pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL);
2117 	if (unlikely(!pg_vec))
2118 		goto out;
2119 
2120 	for (i = 0; i < block_nr; i++) {
2121 		pg_vec[i] = alloc_one_pg_vec_page(order);
2122 		if (unlikely(!pg_vec[i]))
2123 			goto out_free_pgvec;
2124 	}
2125 
2126 out:
2127 	return pg_vec;
2128 
2129 out_free_pgvec:
2130 	free_pg_vec(pg_vec, order, block_nr);
2131 	pg_vec = NULL;
2132 	goto out;
2133 }
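/*
 * Each ring block is a single physically contiguous allocation of 2^order
 * pages, and pg_vec is simply the array of per-block base addresses.  For
 * example, tp_block_size = 16384 with 4 KiB pages gives get_order(16384) = 2,
 * i.e. four pages per block.
 */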
2134 
2135 static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
2136 		int closing, int tx_ring)
2137 {
2138 	char **pg_vec = NULL;
2139 	struct packet_sock *po = pkt_sk(sk);
2140 	int was_running, order = 0;
2141 	struct packet_ring_buffer *rb;
2142 	struct sk_buff_head *rb_queue;
2143 	__be16 num;
2144 	int err;
2145 
2146 	rb = tx_ring ? &po->tx_ring : &po->rx_ring;
2147 	rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
2148 
2149 	err = -EBUSY;
2150 	if (!closing) {
2151 		if (atomic_read(&po->mapped))
2152 			goto out;
2153 		if (atomic_read(&rb->pending))
2154 			goto out;
2155 	}
2156 
2157 	if (req->tp_block_nr) {
2158 		/* Sanity tests and some calculations */
2159 		err = -EBUSY;
2160 		if (unlikely(rb->pg_vec))
2161 			goto out;
2162 
2163 		switch (po->tp_version) {
2164 		case TPACKET_V1:
2165 			po->tp_hdrlen = TPACKET_HDRLEN;
2166 			break;
2167 		case TPACKET_V2:
2168 			po->tp_hdrlen = TPACKET2_HDRLEN;
2169 			break;
2170 		}
2171 
2172 		err = -EINVAL;
2173 		if (unlikely((int)req->tp_block_size <= 0))
2174 			goto out;
2175 		if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
2176 			goto out;
2177 		if (unlikely(req->tp_frame_size < po->tp_hdrlen +
2178 					po->tp_reserve))
2179 			goto out;
2180 		if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
2181 			goto out;
2182 
2183 		rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
2184 		if (unlikely(rb->frames_per_block <= 0))
2185 			goto out;
2186 		if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
2187 					req->tp_frame_nr))
2188 			goto out;
2189 
2190 		err = -ENOMEM;
2191 		order = get_order(req->tp_block_size);
2192 		pg_vec = alloc_pg_vec(req, order);
2193 		if (unlikely(!pg_vec))
2194 			goto out;
2195 	}
2196 	/* Done with the allocation path */
2197 	else {
2198 		err = -EINVAL;
2199 		if (unlikely(req->tp_frame_nr))
2200 			goto out;
2201 	}
2202 
2203 	lock_sock(sk);
2204 
2205 	/* Detach socket from network */
2206 	spin_lock(&po->bind_lock);
2207 	was_running = po->running;
2208 	num = po->num;
2209 	if (was_running) {
2210 		__dev_remove_pack(&po->prot_hook);
2211 		po->num = 0;
2212 		po->running = 0;
2213 		__sock_put(sk);
2214 	}
2215 	spin_unlock(&po->bind_lock);
2216 
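	/*
	 * Let any receive path still running with the old prot_hook finish
	 * before the ring is swapped out below.
	 */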
2217 	synchronize_net();
2218 
2219 	err = -EBUSY;
2220 	mutex_lock(&po->pg_vec_lock);
2221 	if (closing || atomic_read(&po->mapped) == 0) {
2222 		err = 0;
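		/* XC(a, b): store b in a and return a's previous value (swap helper). */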
2223 #define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })
2224 		spin_lock_bh(&rb_queue->lock);
2225 		pg_vec = XC(rb->pg_vec, pg_vec);
2226 		rb->frame_max = (req->tp_frame_nr - 1);
2227 		rb->head = 0;
2228 		rb->frame_size = req->tp_frame_size;
2229 		spin_unlock_bh(&rb_queue->lock);
2230 
2231 		order = XC(rb->pg_vec_order, order);
2232 		req->tp_block_nr = XC(rb->pg_vec_len, req->tp_block_nr);
2233 
2234 		rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
2235 		po->prot_hook.func = (po->rx_ring.pg_vec) ?
2236 						tpacket_rcv : packet_rcv;
2237 		skb_queue_purge(rb_queue);
2238 #undef XC
2239 		if (atomic_read(&po->mapped))
2240 			pr_err("packet_mmap: vma is busy: %d\n",
2241 			       atomic_read(&po->mapped));
2242 	}
2243 	mutex_unlock(&po->pg_vec_lock);
2244 
2245 	spin_lock(&po->bind_lock);
2246 	if (was_running && !po->running) {
2247 		sock_hold(sk);
2248 		po->running = 1;
2249 		po->num = num;
2250 		dev_add_pack(&po->prot_hook);
2251 	}
2252 	spin_unlock(&po->bind_lock);
2253 
2254 	release_sock(sk);
2255 
2256 	if (pg_vec)
2257 		free_pg_vec(pg_vec, order, req->tp_block_nr);
2258 out:
2259 	return err;
2260 }
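/*
 * Example of a tpacket_req that satisfies the geometry checks above
 * (illustrative numbers, 4 KiB pages assumed): the block size is a multiple
 * of PAGE_SIZE, the frame size is TPACKET_ALIGNMENT-aligned and large enough
 * for the frame header, and frames_per_block (4096 / 2048 = 2) times
 * tp_block_nr equals tp_frame_nr.
 *
 *	struct tpacket_req req = {
 *		.tp_block_size	= 4096,
 *		.tp_frame_size	= 2048,
 *		.tp_block_nr	= 64,
 *		.tp_frame_nr	= 128,
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 */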
2261 
2262 static int packet_mmap(struct file *file, struct socket *sock,
2263 		struct vm_area_struct *vma)
2264 {
2265 	struct sock *sk = sock->sk;
2266 	struct packet_sock *po = pkt_sk(sk);
2267 	unsigned long size, expected_size;
2268 	struct packet_ring_buffer *rb;
2269 	unsigned long start;
2270 	int err = -EINVAL;
2271 	int i;
2272 
2273 	if (vma->vm_pgoff)
2274 		return -EINVAL;
2275 
2276 	mutex_lock(&po->pg_vec_lock);
2277 
2278 	expected_size = 0;
2279 	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
2280 		if (rb->pg_vec) {
2281 			expected_size += rb->pg_vec_len
2282 						* rb->pg_vec_pages
2283 						* PAGE_SIZE;
2284 		}
2285 	}
2286 
2287 	if (expected_size == 0)
2288 		goto out;
2289 
2290 	size = vma->vm_end - vma->vm_start;
2291 	if (size != expected_size)
2292 		goto out;
2293 
2294 	start = vma->vm_start;
2295 	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
2296 		if (rb->pg_vec == NULL)
2297 			continue;
2298 
2299 		for (i = 0; i < rb->pg_vec_len; i++) {
2300 			struct page *page = virt_to_page(rb->pg_vec[i]);
2301 			int pg_num;
2302 
2303 			for (pg_num = 0; pg_num < rb->pg_vec_pages;
2304 					pg_num++, page++) {
2305 				err = vm_insert_page(vma, start, page);
2306 				if (unlikely(err))
2307 					goto out;
2308 				start += PAGE_SIZE;
2309 			}
2310 		}
2311 	}
2312 
2313 	atomic_inc(&po->mapped);
2314 	vma->vm_ops = &packet_mmap_ops;
2315 	err = 0;
2316 
2317 out:
2318 	mutex_unlock(&po->pg_vec_lock);
2319 	return err;
2320 }
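/*
 * Userspace maps whatever rings are configured with a single mmap() at
 * offset 0 whose length must equal the combined ring size; the RX ring
 * comes first, immediately followed by the TX ring (illustrative sketch,
 * rx_req/tx_req being the tpacket_req structures used to size the rings):
 *
 *	size_t rx_size = (size_t)rx_req.tp_block_size * rx_req.tp_block_nr;
 *	size_t tx_size = (size_t)tx_req.tp_block_size * tx_req.tp_block_nr;
 *	void *ring = mmap(NULL, rx_size + tx_size, PROT_READ | PROT_WRITE,
 *			  MAP_SHARED, fd, 0);
 */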
2321 #endif
2322 
2323 
2324 static const struct proto_ops packet_ops_spkt = {
2325 	.family =	PF_PACKET,
2326 	.owner =	THIS_MODULE,
2327 	.release =	packet_release,
2328 	.bind =		packet_bind_spkt,
2329 	.connect =	sock_no_connect,
2330 	.socketpair =	sock_no_socketpair,
2331 	.accept =	sock_no_accept,
2332 	.getname =	packet_getname_spkt,
2333 	.poll =		datagram_poll,
2334 	.ioctl =	packet_ioctl,
2335 	.listen =	sock_no_listen,
2336 	.shutdown =	sock_no_shutdown,
2337 	.setsockopt =	sock_no_setsockopt,
2338 	.getsockopt =	sock_no_getsockopt,
2339 	.sendmsg =	packet_sendmsg_spkt,
2340 	.recvmsg =	packet_recvmsg,
2341 	.mmap =		sock_no_mmap,
2342 	.sendpage =	sock_no_sendpage,
2343 };
2344 
2345 static const struct proto_ops packet_ops = {
2346 	.family =	PF_PACKET,
2347 	.owner =	THIS_MODULE,
2348 	.release =	packet_release,
2349 	.bind =		packet_bind,
2350 	.connect =	sock_no_connect,
2351 	.socketpair =	sock_no_socketpair,
2352 	.accept =	sock_no_accept,
2353 	.getname =	packet_getname,
2354 	.poll =		packet_poll,
2355 	.ioctl =	packet_ioctl,
2356 	.listen =	sock_no_listen,
2357 	.shutdown =	sock_no_shutdown,
2358 	.setsockopt =	packet_setsockopt,
2359 	.getsockopt =	packet_getsockopt,
2360 	.sendmsg =	packet_sendmsg,
2361 	.recvmsg =	packet_recvmsg,
2362 	.mmap =		packet_mmap,
2363 	.sendpage =	sock_no_sendpage,
2364 };
2365 
2366 static const struct net_proto_family packet_family_ops = {
2367 	.family =	PF_PACKET,
2368 	.create =	packet_create,
2369 	.owner	=	THIS_MODULE,
2370 };
2371 
2372 static struct notifier_block packet_netdev_notifier = {
2373 	.notifier_call =	packet_notifier,
2374 };
2375 
2376 #ifdef CONFIG_PROC_FS
2377 static inline struct sock *packet_seq_idx(struct net *net, loff_t off)
2378 {
2379 	struct sock *s;
2380 	struct hlist_node *node;
2381 
2382 	sk_for_each(s, node, &net->packet.sklist) {
2383 		if (!off--)
2384 			return s;
2385 	}
2386 	return NULL;
2387 }
2388 
2389 static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
2390 	__acquires(seq_file_net(seq)->packet.sklist_lock)
2391 {
2392 	struct net *net = seq_file_net(seq);
2393 	read_lock(&net->packet.sklist_lock);
2394 	return *pos ? packet_seq_idx(net, *pos - 1) : SEQ_START_TOKEN;
2395 }
2396 
2397 static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2398 {
2399 	struct net *net = seq_file_net(seq);
2400 	++*pos;
2401 	return (v == SEQ_START_TOKEN)
2402 		? sk_head(&net->packet.sklist)
2403 		: sk_next((struct sock *)v);
2404 }
2405 
2406 static void packet_seq_stop(struct seq_file *seq, void *v)
2407 	__releases(seq_file_net(seq)->packet.sklist_lock)
2408 {
2409 	struct net *net = seq_file_net(seq);
2410 	read_unlock(&net->packet.sklist_lock);
2411 }
2412 
2413 static int packet_seq_show(struct seq_file *seq, void *v)
2414 {
2415 	if (v == SEQ_START_TOKEN)
2416 		seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
2417 	else {
2418 		struct sock *s = v;
2419 		const struct packet_sock *po = pkt_sk(s);
2420 
2421 		seq_printf(seq,
2422 			   "%p %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
2423 			   s,
2424 			   atomic_read(&s->sk_refcnt),
2425 			   s->sk_type,
2426 			   ntohs(po->num),
2427 			   po->ifindex,
2428 			   po->running,
2429 			   atomic_read(&s->sk_rmem_alloc),
2430 			   sock_i_uid(s),
2431 			   sock_i_ino(s));
2432 	}
2433 
2434 	return 0;
2435 }
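/*
 * The table built above is exposed as /proc/net/packet (registered in
 * packet_net_init() below); "cat /proc/net/packet" prints one line per
 * packet socket in the current network namespace.
 */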
2436 
2437 static const struct seq_operations packet_seq_ops = {
2438 	.start	= packet_seq_start,
2439 	.next	= packet_seq_next,
2440 	.stop	= packet_seq_stop,
2441 	.show	= packet_seq_show,
2442 };
2443 
2444 static int packet_seq_open(struct inode *inode, struct file *file)
2445 {
2446 	return seq_open_net(inode, file, &packet_seq_ops,
2447 			    sizeof(struct seq_net_private));
2448 }
2449 
2450 static const struct file_operations packet_seq_fops = {
2451 	.owner		= THIS_MODULE,
2452 	.open		= packet_seq_open,
2453 	.read		= seq_read,
2454 	.llseek		= seq_lseek,
2455 	.release	= seq_release_net,
2456 };
2457 
2458 #endif
2459 
2460 static int packet_net_init(struct net *net)
2461 {
2462 	rwlock_init(&net->packet.sklist_lock);
2463 	INIT_HLIST_HEAD(&net->packet.sklist);
2464 
2465 	if (!proc_net_fops_create(net, "packet", 0, &packet_seq_fops))
2466 		return -ENOMEM;
2467 
2468 	return 0;
2469 }
2470 
2471 static void packet_net_exit(struct net *net)
2472 {
2473 	proc_net_remove(net, "packet");
2474 }
2475 
2476 static struct pernet_operations packet_net_ops = {
2477 	.init = packet_net_init,
2478 	.exit = packet_net_exit,
2479 };
2480 
2481 
2482 static void __exit packet_exit(void)
2483 {
2484 	unregister_netdevice_notifier(&packet_netdev_notifier);
2485 	unregister_pernet_subsys(&packet_net_ops);
2486 	sock_unregister(PF_PACKET);
2487 	proto_unregister(&packet_proto);
2488 }
2489 
2490 static int __init packet_init(void)
2491 {
2492 	int rc = proto_register(&packet_proto, 0);
2493 
2494 	if (rc != 0)
2495 		goto out;
2496 
2497 	sock_register(&packet_family_ops);
2498 	register_pernet_subsys(&packet_net_ops);
2499 	register_netdevice_notifier(&packet_netdev_notifier);
2500 out:
2501 	return rc;
2502 }
2503 
2504 module_init(packet_init);
2505 module_exit(packet_exit);
2506 MODULE_LICENSE("GPL");
2507 MODULE_ALIAS_NETPROTO(PF_PACKET);
2508