/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		PACKET - implements raw packet sockets.
 *
 * Version:	$Id: af_packet.c,v 1.61 2002/02/08 03:57:19 davem Exp $
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *
 * Fixes:
 *		Alan Cox	:	verify_area() now used correctly
 *		Alan Cox	:	new skbuff lists, look ma no backlogs!
 *		Alan Cox	:	tidied skbuff lists.
 *		Alan Cox	:	Now uses generic datagram routines I
 *					added. Also fixed the peek/read crash
 *					from all old Linux datagram code.
 *		Alan Cox	:	Uses the improved datagram code.
 *		Alan Cox	:	Added NULL's for socket options.
 *		Alan Cox	:	Re-commented the code.
 *		Alan Cox	:	Use new kernel side addressing
 *		Rob Janssen	:	Correct MTU usage.
 *		Dave Platt	:	Counter leaks caused by incorrect
 *					interrupt locking and some slightly
 *					dubious gcc output. Can you read
 *					compiler: it said _VOLATILE_
 *	Richard Kooijman	:	Timestamp fixes.
 *		Alan Cox	:	New buffers. Use sk->mac.raw.
 *		Alan Cox	:	sendmsg/recvmsg support.
 *		Alan Cox	:	Protocol setting support
 *	Alexey Kuznetsov	:	Untied from IPv4 stack.
 *	Cyrus Durgin		:	Fixed kerneld for kmod.
 *	Michal Ostrowski	:	Module initialization cleanup.
 *	Ulises Alonso		:	Frame number limit removal and
 *					packet_set_ring memory leak.
 *		Eric Biederman	:	Allow for > 8 byte hardware addresses.
 *					The convention is that longer addresses
 *					will simply extend the hardware address
 *					byte arrays at the end of sockaddr_ll
 *					and packet_mreq.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 */

#include <linux/types.h>
#include <linux/mm.h>
#include <linux/capability.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/wireless.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/ioctls.h>
#include <asm/page.h>
#include <asm/cacheflush.h>
#include <asm/io.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/module.h>
#include <linux/init.h>

#ifdef CONFIG_INET
#include <net/inet_common.h>
#endif

/*
   Assumptions:
   - if device has no dev->hard_header routine, it adds and removes the ll
     header inside itself. In this case the ll header is invisible outside
     of the device, but higher levels should still reserve
     dev->hard_header_len.
     Some devices are clever enough to reallocate the skb when the header
     does not fit into the reserved space (tunnels); others are not (PPP).
   - a packet socket receives packets with the ll header pulled,
     so SOCK_RAW should push it back.

On receive:
-----------

Incoming, dev->hard_header!=NULL
   mac_header -> ll header
   data       -> data

Outgoing, dev->hard_header!=NULL
   mac_header -> ll header
   data       -> ll header

Incoming, dev->hard_header==NULL
   mac_header -> UNKNOWN position. It very likely points to the ll header.
		 PPP does this, which is wrong, because it introduces
		 asymmetry between the rx and tx paths.
   data       -> data

Outgoing, dev->hard_header==NULL
   mac_header -> data. ll header is still not built!
   data       -> data

Summary
  If dev->hard_header==NULL we are unlikely to restore a sensible ll header.


On transmit:
------------

dev->hard_header != NULL
   mac_header -> ll header
   data       -> ll header

dev->hard_header == NULL (ll header is added by device, we cannot control it)
   mac_header -> data
   data       -> data

   We should set nh.raw (the network header) to the correct position on
   output; the packet classifier depends on it.
 */
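/*
 * The user-visible consequence of the layout above: a SOCK_RAW packet
 * socket sees the ll header at the start of the buffer, while SOCK_DGRAM
 * gets only the payload, with the link level described by a sockaddr_ll.
 * A minimal userspace sketch of both receive flavours (error handling
 * omitted; assumes an Ethernet device):
 *
 *	#include <sys/socket.h>
 *	#include <netpacket/packet.h>
 *	#include <netinet/in.h>
 *	#include <linux/if_ether.h>
 *
 *	int raw = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *	int dgram = socket(PF_PACKET, SOCK_DGRAM, htons(ETH_P_ALL));
 *	unsigned char buf[2048];
 *	struct sockaddr_ll sll;
 *	socklen_t slen = sizeof(sll);
 *
 *	recv(raw, buf, sizeof(buf), 0);		// buf[0..13]: Ethernet header
 *	recvfrom(dgram, buf, sizeof(buf), 0,	// payload only; header info
 *		 (struct sockaddr *)&sll, &slen);	// arrives in sll
 */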

/* List of all packet sockets. */
static HLIST_HEAD(packet_sklist);
static DEFINE_RWLOCK(packet_sklist_lock);

/* Private packet socket structures. */

struct packet_mclist
{
	struct packet_mclist	*next;
	int			ifindex;
	int			count;
	unsigned short		type;
	unsigned short		alen;
	unsigned char		addr[MAX_ADDR_LEN];
};
/* identical to struct packet_mreq except it has
 * a longer address field.
 */
struct packet_mreq_max
{
	int		mr_ifindex;
	unsigned short	mr_type;
	unsigned short	mr_alen;
	unsigned char	mr_address[MAX_ADDR_LEN];
};
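/*
 * Userspace fills the public struct packet_mreq (this _max variant only
 * widens the address field for the kernel's benefit).  A minimal sketch
 * of entering promiscuous mode on a hypothetical interface index 2:
 *
 *	#include <sys/socket.h>
 *	#include <netpacket/packet.h>
 *	#include <string.h>
 *
 *	struct packet_mreq mreq;
 *	memset(&mreq, 0, sizeof(mreq));
 *	mreq.mr_ifindex = 2;			// assumed ifindex
 *	mreq.mr_type = PACKET_MR_PROMISC;	// no address needed here
 *	setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP,
 *		   &mreq, sizeof(mreq));
 */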

#ifdef CONFIG_PACKET_MMAP
static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing);
#endif

static void packet_flush_mclist(struct sock *sk);

struct packet_sock {
	/* struct sock has to be the first member of packet_sock */
	struct sock		sk;
	struct tpacket_stats	stats;
#ifdef CONFIG_PACKET_MMAP
	char *			*pg_vec;
	unsigned int		head;
	unsigned int		frames_per_block;
	unsigned int		frame_size;
	unsigned int		frame_max;
	int			copy_thresh;
#endif
	struct packet_type	prot_hook;
	spinlock_t		bind_lock;
	unsigned int		running:1,	/* prot_hook is attached */
				auxdata:1,
				origdev:1;
	int			ifindex;	/* bound device		*/
	__be16			num;
	struct packet_mclist	*mclist;
#ifdef CONFIG_PACKET_MMAP
	atomic_t		mapped;
	unsigned int		pg_vec_order;
	unsigned int		pg_vec_pages;
	unsigned int		pg_vec_len;
#endif
};

struct packet_skb_cb {
	unsigned int origlen;
	union {
		struct sockaddr_pkt pkt;
		struct sockaddr_ll ll;
	} sa;
};

#define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))

#ifdef CONFIG_PACKET_MMAP

static inline struct tpacket_hdr *packet_lookup_frame(struct packet_sock *po, unsigned int position)
{
	unsigned int pg_vec_pos, frame_offset;

	pg_vec_pos = position / po->frames_per_block;
	frame_offset = position % po->frames_per_block;

	return (struct tpacket_hdr *)(po->pg_vec[pg_vec_pos] + (frame_offset * po->frame_size));
}
#endif
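/*
 * A worked example of the lookup above, assuming a ring configured with
 * tp_block_size = 8192 and tp_frame_size = 2048 (so frames_per_block = 4):
 * frame number 9 lives in block 9 / 4 = 2, at offset (9 % 4) * 2048 = 2048
 * bytes into that block, i.e. at pg_vec[2] + 2048.
 */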

static inline struct packet_sock *pkt_sk(struct sock *sk)
{
	return (struct packet_sock *)sk;
}

static void packet_sock_destruct(struct sock *sk)
{
	BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc));
	BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc));

	if (!sock_flag(sk, SOCK_DEAD)) {
		printk(KERN_ERR "Attempt to release alive packet socket: %p\n", sk);
		return;
	}

	sk_refcnt_debug_dec(sk);
}


static const struct proto_ops packet_ops;

static const struct proto_ops packet_ops_spkt;

static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
	struct sock *sk;
	struct sockaddr_pkt *spkt;

	if (dev->nd_net != &init_net)
		goto out;

	/*
	 *	When we registered the protocol we saved the socket in the data
	 *	field for just this event.
	 */

	sk = pt->af_packet_priv;

	/*
	 *	Yank back the headers [hope the device set this
	 *	right or kerboom...]
	 *
	 *	Incoming packets have the ll header pulled;
	 *	push it back.
	 *
	 *	For outgoing ones skb->data == skb_mac_header(skb),
	 *	so this procedure is a no-op.
	 */

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto out;

	if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
		goto oom;

	/* drop any routing info */
	dst_release(skb->dst);
	skb->dst = NULL;

	/* drop conntrack reference */
	nf_reset(skb);

	spkt = &PACKET_SKB_CB(skb)->sa.pkt;

	skb_push(skb, skb->data - skb_mac_header(skb));

	/*
	 *	The SOCK_PACKET socket receives _all_ frames.
	 */

	spkt->spkt_family = dev->type;
	strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
	spkt->spkt_protocol = skb->protocol;

	/*
	 *	Charge the memory to the socket. This is done specifically
	 *	to prevent sockets using all the memory up.
	 */

	if (sock_queue_rcv_skb(sk, skb) == 0)
		return 0;

out:
	kfree_skb(skb);
oom:
	return 0;
}


/*
 *	Output a raw packet to a device layer. This bypasses all the other
 *	protocol layers and you must therefore supply it with a complete frame.
 */
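/*
 * A userspace sketch of this legacy transmit path: SOCK_PACKET sends
 * name the device in a sockaddr_pkt, and the buffer must already be a
 * complete frame (device name, ethertype and frame contents below are
 * assumptions):
 *
 *	#include <sys/socket.h>
 *	#include <netpacket/packet.h>
 *	#include <netinet/in.h>
 *	#include <linux/if_ether.h>
 *	#include <string.h>
 *
 *	int fd = socket(AF_PACKET, SOCK_PACKET, htons(ETH_P_ALL));
 *	unsigned char frame[ETH_ZLEN];		// hypothetical pre-built frame
 *	struct sockaddr_pkt spkt;
 *
 *	memset(&spkt, 0, sizeof(spkt));
 *	strncpy((char *)spkt.spkt_device, "eth0", sizeof(spkt.spkt_device));
 *	spkt.spkt_protocol = htons(0x88b5);	// experimental ethertype
 *	sendto(fd, frame, sizeof(frame), 0,
 *	       (struct sockaddr *)&spkt, sizeof(spkt));
 */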

static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
			       struct msghdr *msg, size_t len)
{
	struct sock *sk = sock->sk;
	struct sockaddr_pkt *saddr = (struct sockaddr_pkt *)msg->msg_name;
	struct sk_buff *skb;
	struct net_device *dev;
	__be16 proto = 0;
	int err;

	/*
	 *	Get and verify the address.
	 */

	if (saddr)
	{
		if (msg->msg_namelen < sizeof(struct sockaddr))
			return(-EINVAL);
		if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
			proto = saddr->spkt_protocol;
	}
	else
		return(-ENOTCONN);	/* SOCK_PACKET must be sent giving an address */

	/*
	 *	Find the device first to size check it
	 */

	saddr->spkt_device[13] = 0;
	dev = dev_get_by_name(&init_net, saddr->spkt_device);
	err = -ENODEV;
	if (dev == NULL)
		goto out_unlock;

	err = -ENETDOWN;
	if (!(dev->flags & IFF_UP))
		goto out_unlock;

	/*
	 *	You may not queue a frame bigger than the mtu. This is the lowest level
	 *	raw protocol and you must do your own fragmentation at this level.
	 */

	err = -EMSGSIZE;
	if (len > dev->mtu + dev->hard_header_len)
		goto out_unlock;

	err = -ENOBUFS;
	skb = sock_wmalloc(sk, len + LL_RESERVED_SPACE(dev), 0, GFP_KERNEL);

	/*
	 *	If the write buffer is full, then tough. At this level the user gets to
	 *	deal with the problem - do your own algorithmic backoffs. That's far
	 *	more flexible.
	 */

	if (skb == NULL)
		goto out_unlock;

	/*
	 *	Fill it in
	 */

	/* FIXME: Save some space for broken drivers that write a
	 * hard header at transmission time by themselves. PPP is the
	 * notable one here. This should really be fixed at the driver level.
	 */
	skb_reserve(skb, LL_RESERVED_SPACE(dev));
	skb_reset_network_header(skb);

	/* Try to align data part correctly */
	if (dev->header_ops) {
		skb->data -= dev->hard_header_len;
		skb->tail -= dev->hard_header_len;
		if (len < dev->hard_header_len)
			skb_reset_network_header(skb);
	}

	/* Returns -EFAULT on error */
	err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
	skb->protocol = proto;
	skb->dev = dev;
	skb->priority = sk->sk_priority;
	if (err)
		goto out_free;

	/*
	 *	Now send it
	 */

	dev_queue_xmit(skb);
	dev_put(dev);
	return(len);

out_free:
	kfree_skb(skb);
out_unlock:
	if (dev)
		dev_put(dev);
	return err;
}

static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
				      unsigned int res)
{
	struct sk_filter *filter;

	rcu_read_lock_bh();
	filter = rcu_dereference(sk->sk_filter);
	if (filter != NULL)
		res = sk_run_filter(skb, filter->insns, filter->len);
	rcu_read_unlock_bh();

	return res;
}
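/*
 * The filter consulted here is the classic BPF program a user attaches
 * with SO_ATTACH_FILTER; a return value of 0 drops the packet, a larger
 * value caps the snaplen.  A minimal userspace sketch that accepts every
 * packet, truncated to 0xffff bytes:
 *
 *	#include <sys/socket.h>
 *	#include <linux/filter.h>
 *
 *	struct sock_filter code[] = {
 *		{ BPF_RET | BPF_K, 0, 0, 0xffff },	// accept, snaplen 0xffff
 *	};
 *	struct sock_fprog fprog = {
 *		.len = 1,
 *		.filter = code,
 *	};
 *	setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER,
 *		   &fprog, sizeof(fprog));
 */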

/*
   This function does lazy skb cloning, in the hope that most packets
   are discarded by BPF.

   Note the tricky part: we DO mangle a shared skb! skb->data, skb->len
   and skb->cb are mangled. It works because (and until) packets
   falling here are owned by the current CPU. Output packets are cloned
   by dev_queue_xmit_nit(), input packets are processed by net_bh
   sequentially, so that if we return the skb to its original state on
   exit, we will not harm anyone.
 */

static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
	struct sock *sk;
	struct sockaddr_ll *sll;
	struct packet_sock *po;
	u8 *skb_head = skb->data;
	int skb_len = skb->len;
	unsigned int snaplen, res;

	if (dev->nd_net != &init_net)
		goto drop;

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto drop;

	sk = pt->af_packet_priv;
	po = pkt_sk(sk);

	skb->dev = dev;

	if (dev->header_ops) {
		/* The device has an explicit notion of ll header,
		   exported to higher levels.

		   Otherwise, the device hides the details of its frame
		   structure, so that the corresponding packet head is
		   never delivered to the user.
		 */
		if (sk->sk_type != SOCK_DGRAM)
			skb_push(skb, skb->data - skb_mac_header(skb));
		else if (skb->pkt_type == PACKET_OUTGOING) {
			/* Special case: outgoing packets have ll header at head */
			skb_pull(skb, skb_network_offset(skb));
		}
	}

	snaplen = skb->len;

	res = run_filter(skb, sk, snaplen);
	if (!res)
		goto drop_n_restore;
	if (snaplen > res)
		snaplen = res;

	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
	    (unsigned)sk->sk_rcvbuf)
		goto drop_n_acct;

	if (skb_shared(skb)) {
		struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
		if (nskb == NULL)
			goto drop_n_acct;

		if (skb_head != skb->data) {
			skb->data = skb_head;
			skb->len = skb_len;
		}
		kfree_skb(skb);
		skb = nskb;
	}

	BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
		     sizeof(skb->cb));

	sll = &PACKET_SKB_CB(skb)->sa.ll;
	sll->sll_family = AF_PACKET;
	sll->sll_hatype = dev->type;
	sll->sll_protocol = skb->protocol;
	sll->sll_pkttype = skb->pkt_type;
	if (unlikely(po->origdev))
		sll->sll_ifindex = orig_dev->ifindex;
	else
		sll->sll_ifindex = dev->ifindex;

	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);

	PACKET_SKB_CB(skb)->origlen = skb->len;

	if (pskb_trim(skb, snaplen))
		goto drop_n_acct;

	skb_set_owner_r(skb, sk);
	skb->dev = NULL;
	dst_release(skb->dst);
	skb->dst = NULL;

	/* drop conntrack reference */
	nf_reset(skb);

	spin_lock(&sk->sk_receive_queue.lock);
	po->stats.tp_packets++;
	__skb_queue_tail(&sk->sk_receive_queue, skb);
	spin_unlock(&sk->sk_receive_queue.lock);
	sk->sk_data_ready(sk, skb->len);
	return 0;

drop_n_acct:
	spin_lock(&sk->sk_receive_queue.lock);
	po->stats.tp_drops++;
	spin_unlock(&sk->sk_receive_queue.lock);

drop_n_restore:
	if (skb_head != skb->data && skb_shared(skb)) {
		skb->data = skb_head;
		skb->len = skb_len;
	}
drop:
	kfree_skb(skb);
	return 0;
}

#ifdef CONFIG_PACKET_MMAP
static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
	struct sock *sk;
	struct packet_sock *po;
	struct sockaddr_ll *sll;
	struct tpacket_hdr *h;
	u8 *skb_head = skb->data;
	int skb_len = skb->len;
	unsigned int snaplen, res;
	unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
	unsigned short macoff, netoff;
	struct sk_buff *copy_skb = NULL;
	struct timeval tv;

	if (dev->nd_net != &init_net)
		goto drop;

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto drop;

	sk = pt->af_packet_priv;
	po = pkt_sk(sk);

	if (dev->header_ops) {
		if (sk->sk_type != SOCK_DGRAM)
			skb_push(skb, skb->data - skb_mac_header(skb));
		else if (skb->pkt_type == PACKET_OUTGOING) {
			/* Special case: outgoing packets have ll header at head */
			skb_pull(skb, skb_network_offset(skb));
		}
	}

	if (skb->ip_summed == CHECKSUM_PARTIAL)
		status |= TP_STATUS_CSUMNOTREADY;

	snaplen = skb->len;

	res = run_filter(skb, sk, snaplen);
	if (!res)
		goto drop_n_restore;
	if (snaplen > res)
		snaplen = res;

	if (sk->sk_type == SOCK_DGRAM) {
		macoff = netoff = TPACKET_ALIGN(TPACKET_HDRLEN) + 16;
	} else {
		unsigned maclen = skb_network_offset(skb);
		netoff = TPACKET_ALIGN(TPACKET_HDRLEN + (maclen < 16 ? 16 : maclen));
		macoff = netoff - maclen;
	}

	if (macoff + snaplen > po->frame_size) {
		if (po->copy_thresh &&
		    atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
		    (unsigned)sk->sk_rcvbuf) {
			if (skb_shared(skb)) {
				copy_skb = skb_clone(skb, GFP_ATOMIC);
			} else {
				copy_skb = skb_get(skb);
				skb_head = skb->data;
			}
			if (copy_skb)
				skb_set_owner_r(copy_skb, sk);
		}
		snaplen = po->frame_size - macoff;
		if ((int)snaplen < 0)
			snaplen = 0;
	}

	spin_lock(&sk->sk_receive_queue.lock);
	h = packet_lookup_frame(po, po->head);

	if (h->tp_status)
		goto ring_is_full;
	po->head = po->head != po->frame_max ? po->head + 1 : 0;
	po->stats.tp_packets++;
	if (copy_skb) {
		status |= TP_STATUS_COPY;
		__skb_queue_tail(&sk->sk_receive_queue, copy_skb);
	}
	if (!po->stats.tp_drops)
		status &= ~TP_STATUS_LOSING;
	spin_unlock(&sk->sk_receive_queue.lock);

	skb_copy_bits(skb, 0, (u8*)h + macoff, snaplen);

	h->tp_len = skb->len;
	h->tp_snaplen = snaplen;
	h->tp_mac = macoff;
	h->tp_net = netoff;
	if (skb->tstamp.tv64)
		tv = ktime_to_timeval(skb->tstamp);
	else
		do_gettimeofday(&tv);
	h->tp_sec = tv.tv_sec;
	h->tp_usec = tv.tv_usec;

	sll = (struct sockaddr_ll*)((u8*)h + TPACKET_ALIGN(sizeof(*h)));
	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
	sll->sll_family = AF_PACKET;
	sll->sll_hatype = dev->type;
	sll->sll_protocol = skb->protocol;
	sll->sll_pkttype = skb->pkt_type;
	if (unlikely(po->origdev))
		sll->sll_ifindex = orig_dev->ifindex;
	else
		sll->sll_ifindex = dev->ifindex;

	h->tp_status = status;
	smp_mb();

	{
		struct page *p_start, *p_end;
		u8 *h_end = (u8 *)h + macoff + snaplen - 1;

		p_start = virt_to_page(h);
		p_end = virt_to_page(h_end);
		while (p_start <= p_end) {
			flush_dcache_page(p_start);
			p_start++;
		}
	}

	sk->sk_data_ready(sk, 0);

drop_n_restore:
	if (skb_head != skb->data && skb_shared(skb)) {
		skb->data = skb_head;
		skb->len = skb_len;
	}
drop:
	kfree_skb(skb);
	return 0;

ring_is_full:
	po->stats.tp_drops++;
	spin_unlock(&sk->sk_receive_queue.lock);

	sk->sk_data_ready(sk, 0);
	if (copy_skb)
		kfree_skb(copy_skb);
	goto drop_n_restore;
}

#endif


static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
			  struct msghdr *msg, size_t len)
{
	struct sock *sk = sock->sk;
	struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
	struct sk_buff *skb;
	struct net_device *dev;
	__be16 proto;
	unsigned char *addr;
	int ifindex, err, reserve = 0;

	/*
	 *	Get and verify the address.
	 */

	if (saddr == NULL) {
		struct packet_sock *po = pkt_sk(sk);

		ifindex	= po->ifindex;
		proto	= po->num;
		addr	= NULL;
	} else {
		err = -EINVAL;
		if (msg->msg_namelen < sizeof(struct sockaddr_ll))
			goto out;
		if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
			goto out;
		ifindex	= saddr->sll_ifindex;
		proto	= saddr->sll_protocol;
		addr	= saddr->sll_addr;
	}


	dev = dev_get_by_index(&init_net, ifindex);
	err = -ENXIO;
	if (dev == NULL)
		goto out_unlock;
	if (sock->type == SOCK_RAW)
		reserve = dev->hard_header_len;

	err = -ENETDOWN;
	if (!(dev->flags & IFF_UP))
		goto out_unlock;

	err = -EMSGSIZE;
	if (len > dev->mtu + reserve)
		goto out_unlock;

	skb = sock_alloc_send_skb(sk, len + LL_RESERVED_SPACE(dev),
				msg->msg_flags & MSG_DONTWAIT, &err);
	if (skb == NULL)
		goto out_unlock;

	skb_reserve(skb, LL_RESERVED_SPACE(dev));
	skb_reset_network_header(skb);

	err = -EINVAL;
	if (sock->type == SOCK_DGRAM &&
	    dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len) < 0)
		goto out_free;

	/* Returns -EFAULT on error */
	err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
	if (err)
		goto out_free;

	skb->protocol = proto;
	skb->dev = dev;
	skb->priority = sk->sk_priority;

	/*
	 *	Now send it
	 */

	err = dev_queue_xmit(skb);
	if (err > 0 && (err = net_xmit_errno(err)) != 0)
		goto out_unlock;

	dev_put(dev);

	return(len);

out_free:
	kfree_skb(skb);
out_unlock:
	if (dev)
		dev_put(dev);
out:
	return err;
}
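/*
 * A sketch of the SOCK_DGRAM transmit flavour from userspace: the kernel
 * builds the ll header from the sockaddr_ll, so only the payload is
 * supplied (ifindex, ethertype, destination MAC and the payload buffer
 * below are assumptions):
 *
 *	#include <sys/socket.h>
 *	#include <netpacket/packet.h>
 *	#include <netinet/in.h>
 *	#include <string.h>
 *
 *	struct sockaddr_ll sll;
 *	unsigned char dst[6] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
 *
 *	memset(&sll, 0, sizeof(sll));
 *	sll.sll_family = AF_PACKET;
 *	sll.sll_ifindex = 2;			// assumed ifindex
 *	sll.sll_protocol = htons(0x88b5);	// experimental ethertype
 *	sll.sll_halen = 6;
 *	memcpy(sll.sll_addr, dst, 6);
 *	sendto(fd, payload, payload_len, 0,	// payload is hypothetical
 *	       (struct sockaddr *)&sll, sizeof(sll));
 */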

/*
 *	Close a PACKET socket. This is fairly simple. We immediately go
 *	to 'closed' state and remove our protocol entry in the device list.
 */

static int packet_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po;

	if (!sk)
		return 0;

	po = pkt_sk(sk);

	write_lock_bh(&packet_sklist_lock);
	sk_del_node_init(sk);
	write_unlock_bh(&packet_sklist_lock);

	/*
	 *	Unhook packet receive handler.
	 */

	if (po->running) {
		/*
		 *	Remove the protocol hook
		 */
		dev_remove_pack(&po->prot_hook);
		po->running = 0;
		po->num = 0;
		__sock_put(sk);
	}

	packet_flush_mclist(sk);

#ifdef CONFIG_PACKET_MMAP
	if (po->pg_vec) {
		struct tpacket_req req;
		memset(&req, 0, sizeof(req));
		packet_set_ring(sk, &req, 1);
	}
#endif

	/*
	 *	Now the socket is dead. No more input will appear.
	 */

	sock_orphan(sk);
	sock->sk = NULL;

	/* Purge queues */

	skb_queue_purge(&sk->sk_receive_queue);
	sk_refcnt_debug_release(sk);

	sock_put(sk);
	return 0;
}

/*
 *	Attach a packet hook.
 */

static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
{
	struct packet_sock *po = pkt_sk(sk);
	/*
	 *	Detach an existing hook if present.
	 */

	lock_sock(sk);

	spin_lock(&po->bind_lock);
	if (po->running) {
		__sock_put(sk);
		po->running = 0;
		po->num = 0;
		spin_unlock(&po->bind_lock);
		dev_remove_pack(&po->prot_hook);
		spin_lock(&po->bind_lock);
	}

	po->num = protocol;
	po->prot_hook.type = protocol;
	po->prot_hook.dev = dev;

	po->ifindex = dev ? dev->ifindex : 0;

	if (protocol == 0)
		goto out_unlock;

	if (!dev || (dev->flags & IFF_UP)) {
		dev_add_pack(&po->prot_hook);
		sock_hold(sk);
		po->running = 1;
	} else {
		sk->sk_err = ENETDOWN;
		if (!sock_flag(sk, SOCK_DEAD))
			sk->sk_error_report(sk);
	}

out_unlock:
	spin_unlock(&po->bind_lock);
	release_sock(sk);
	return 0;
}

/*
 *	Bind a packet socket to a device
 */

static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
	struct sock *sk = sock->sk;
	char name[15];
	struct net_device *dev;
	int err = -ENODEV;

	/*
	 *	Check legality
	 */

	if (addr_len != sizeof(struct sockaddr))
		return -EINVAL;
	strlcpy(name, uaddr->sa_data, sizeof(name));

	dev = dev_get_by_name(&init_net, name);
	if (dev) {
		err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
		dev_put(dev);
	}
	return err;
}

static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr;
	struct sock *sk = sock->sk;
	struct net_device *dev = NULL;
	int err;


	/*
	 *	Check legality
	 */

	if (addr_len < sizeof(struct sockaddr_ll))
		return -EINVAL;
	if (sll->sll_family != AF_PACKET)
		return -EINVAL;

	if (sll->sll_ifindex) {
		err = -ENODEV;
		dev = dev_get_by_index(&init_net, sll->sll_ifindex);
		if (dev == NULL)
			goto out;
	}
	err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
	if (dev)
		dev_put(dev);

out:
	return err;
}
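/*
 * From userspace, binding selects the interface and, optionally, a new
 * protocol (sll_protocol == 0 keeps the current one, per the ?: above).
 * A sketch binding to a hypothetical ifindex 2 for all protocols:
 *
 *	#include <sys/socket.h>
 *	#include <netpacket/packet.h>
 *	#include <netinet/in.h>
 *	#include <linux/if_ether.h>
 *	#include <string.h>
 *
 *	struct sockaddr_ll sll;
 *	memset(&sll, 0, sizeof(sll));
 *	sll.sll_family = AF_PACKET;
 *	sll.sll_protocol = htons(ETH_P_ALL);
 *	sll.sll_ifindex = 2;			// assumed ifindex
 *	bind(fd, (struct sockaddr *)&sll, sizeof(sll));
 */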

static struct proto packet_proto = {
	.name	  = "PACKET",
	.owner	  = THIS_MODULE,
	.obj_size = sizeof(struct packet_sock),
};

/*
 *	Create a packet socket of type SOCK_PACKET, SOCK_RAW or SOCK_DGRAM.
 */

static int packet_create(struct net *net, struct socket *sock, int protocol)
{
	struct sock *sk;
	struct packet_sock *po;
	__be16 proto = (__force __be16)protocol; /* weird, but documented */
	int err;

	if (net != &init_net)
		return -EAFNOSUPPORT;

	if (!capable(CAP_NET_RAW))
		return -EPERM;
	if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
	    sock->type != SOCK_PACKET)
		return -ESOCKTNOSUPPORT;

	sock->state = SS_UNCONNECTED;

	err = -ENOBUFS;
	sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
	if (sk == NULL)
		goto out;

	sock->ops = &packet_ops;
	if (sock->type == SOCK_PACKET)
		sock->ops = &packet_ops_spkt;

	sock_init_data(sock, sk);

	po = pkt_sk(sk);
	sk->sk_family = PF_PACKET;
	po->num = proto;

	sk->sk_destruct = packet_sock_destruct;
	sk_refcnt_debug_inc(sk);

	/*
	 *	Attach a protocol block
	 */

	spin_lock_init(&po->bind_lock);
	po->prot_hook.func = packet_rcv;

	if (sock->type == SOCK_PACKET)
		po->prot_hook.func = packet_rcv_spkt;

	po->prot_hook.af_packet_priv = sk;

	if (proto) {
		po->prot_hook.type = proto;
		dev_add_pack(&po->prot_hook);
		sock_hold(sk);
		po->running = 1;
	}

	write_lock_bh(&packet_sklist_lock);
	sk_add_node(sk, &packet_sklist);
	write_unlock_bh(&packet_sklist_lock);
	return(0);
out:
	return err;
}
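/*
 * Note the "weird, but documented" part above: the protocol passed to
 * socket(2) is already big-endian.  A sketch of creating the three
 * supported flavours (all require CAP_NET_RAW):
 *
 *	#include <sys/socket.h>
 *	#include <netinet/in.h>
 *	#include <linux/if_ether.h>
 *
 *	int raw   = socket(PF_PACKET, SOCK_RAW,    htons(ETH_P_ALL));
 *	int dgram = socket(PF_PACKET, SOCK_DGRAM,  htons(ETH_P_IP));
 *	int spkt  = socket(PF_PACKET, SOCK_PACKET, htons(ETH_P_ALL)); // obsolete
 */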

/*
 *	Pull a packet from our receive queue and hand it to the user.
 *	If necessary we block.
 */

static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
			  struct msghdr *msg, size_t len, int flags)
{
	struct sock *sk = sock->sk;
	struct sk_buff *skb;
	int copied, err;
	struct sockaddr_ll *sll;

	err = -EINVAL;
	if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT))
		goto out;

#if 0
	/* What error should we return now? EUNATTACH? */
	if (pkt_sk(sk)->ifindex < 0)
		return -ENODEV;
#endif

	/*
	 *	Call the generic datagram receiver. This handles all sorts
	 *	of horrible races and re-entrancy so we can forget about it
	 *	in the protocol layers.
	 *
	 *	Now it will return ENETDOWN if the device has just gone down,
	 *	but then it will block.
	 */

	skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);

	/*
	 *	An error occurred so return it. Because skb_recv_datagram()
	 *	handles the blocking we don't need to see or worry about
	 *	blocking retries.
	 */

	if (skb == NULL)
		goto out;

	/*
	 *	If the address length field is there to be filled in, we fill
	 *	it in now.
	 */

	sll = &PACKET_SKB_CB(skb)->sa.ll;
	if (sock->type == SOCK_PACKET)
		msg->msg_namelen = sizeof(struct sockaddr_pkt);
	else
		msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);

	/*
	 *	You lose any data beyond the buffer you gave. If it worries
	 *	a user program they can ask the device for its MTU anyway.
	 */

	copied = skb->len;
	if (copied > len)
	{
		copied = len;
		msg->msg_flags |= MSG_TRUNC;
	}

	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
	if (err)
		goto out_free;

	sock_recv_timestamp(msg, sk, skb);

	if (msg->msg_name)
		memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
		       msg->msg_namelen);

	if (pkt_sk(sk)->auxdata) {
		struct tpacket_auxdata aux;

		aux.tp_status = TP_STATUS_USER;
		if (skb->ip_summed == CHECKSUM_PARTIAL)
			aux.tp_status |= TP_STATUS_CSUMNOTREADY;
		aux.tp_len = PACKET_SKB_CB(skb)->origlen;
		aux.tp_snaplen = skb->len;
		aux.tp_mac = 0;
		aux.tp_net = skb_network_offset(skb);

		put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
	}

	/*
	 *	Free or return the buffer as appropriate. Again this
	 *	hides all the races and re-entrancy issues from us.
	 */
	err = (flags & MSG_TRUNC) ? skb->len : copied;

out_free:
	skb_free_datagram(sk, skb);
out:
	return err;
}
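/*
 * When PACKET_AUXDATA is enabled (see packet_setsockopt() below), each
 * recvmsg() carries a tpacket_auxdata control message.  A hedged sketch
 * of fishing it out:
 *
 *	#include <sys/socket.h>
 *	#include <linux/if_packet.h>
 *
 *	struct msghdr msg;	// assumed already filled and received
 *	struct cmsghdr *cmsg;
 *
 *	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg;
 *	     cmsg = CMSG_NXTHDR(&msg, cmsg)) {
 *		if (cmsg->cmsg_level == SOL_PACKET &&
 *		    cmsg->cmsg_type == PACKET_AUXDATA) {
 *			struct tpacket_auxdata *aux =
 *				(struct tpacket_auxdata *)CMSG_DATA(cmsg);
 *			// aux->tp_len is the original, untruncated length
 *		}
 *	}
 */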

static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
			       int *uaddr_len, int peer)
{
	struct net_device *dev;
	struct sock *sk = sock->sk;

	if (peer)
		return -EOPNOTSUPP;

	uaddr->sa_family = AF_PACKET;
	dev = dev_get_by_index(&init_net, pkt_sk(sk)->ifindex);
	if (dev) {
		strlcpy(uaddr->sa_data, dev->name, 15);
		dev_put(dev);
	} else
		memset(uaddr->sa_data, 0, 14);
	*uaddr_len = sizeof(*uaddr);

	return 0;
}

static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
			  int *uaddr_len, int peer)
{
	struct net_device *dev;
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr;

	if (peer)
		return -EOPNOTSUPP;

	sll->sll_family = AF_PACKET;
	sll->sll_ifindex = po->ifindex;
	sll->sll_protocol = po->num;
	dev = dev_get_by_index(&init_net, po->ifindex);
	if (dev) {
		sll->sll_hatype = dev->type;
		sll->sll_halen = dev->addr_len;
		memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
		dev_put(dev);
	} else {
		sll->sll_hatype = 0;	/* Bad: we have no ARPHRD_UNSPEC */
		sll->sll_halen = 0;
	}
	*uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;

	return 0;
}

static void packet_dev_mc(struct net_device *dev, struct packet_mclist *i, int what)
{
	switch (i->type) {
	case PACKET_MR_MULTICAST:
		if (what > 0)
			dev_mc_add(dev, i->addr, i->alen, 0);
		else
			dev_mc_delete(dev, i->addr, i->alen, 0);
		break;
	case PACKET_MR_PROMISC:
		dev_set_promiscuity(dev, what);
		break;
	case PACKET_MR_ALLMULTI:
		dev_set_allmulti(dev, what);
		break;
	default:;
	}
}

static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
{
	for ( ; i; i = i->next) {
		if (i->ifindex == dev->ifindex)
			packet_dev_mc(dev, i, what);
	}
}

static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
{
	struct packet_sock *po = pkt_sk(sk);
	struct packet_mclist *ml, *i;
	struct net_device *dev;
	int err;

	rtnl_lock();

	err = -ENODEV;
	dev = __dev_get_by_index(&init_net, mreq->mr_ifindex);
	if (!dev)
		goto done;

	err = -EINVAL;
	if (mreq->mr_alen > dev->addr_len)
		goto done;

	err = -ENOBUFS;
	i = kmalloc(sizeof(*i), GFP_KERNEL);
	if (i == NULL)
		goto done;

	err = 0;
	for (ml = po->mclist; ml; ml = ml->next) {
		if (ml->ifindex == mreq->mr_ifindex &&
		    ml->type == mreq->mr_type &&
		    ml->alen == mreq->mr_alen &&
		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
			ml->count++;
			/* Free the new element ... */
			kfree(i);
			goto done;
		}
	}

	i->type = mreq->mr_type;
	i->ifindex = mreq->mr_ifindex;
	i->alen = mreq->mr_alen;
	memcpy(i->addr, mreq->mr_address, i->alen);
	i->count = 1;
	i->next = po->mclist;
	po->mclist = i;
	packet_dev_mc(dev, i, +1);

done:
	rtnl_unlock();
	return err;
}

static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
{
	struct packet_mclist *ml, **mlp;

	rtnl_lock();

	for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
		if (ml->ifindex == mreq->mr_ifindex &&
		    ml->type == mreq->mr_type &&
		    ml->alen == mreq->mr_alen &&
		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
			if (--ml->count == 0) {
				struct net_device *dev;
				*mlp = ml->next;
				dev = dev_get_by_index(&init_net, ml->ifindex);
				if (dev) {
					packet_dev_mc(dev, ml, -1);
					dev_put(dev);
				}
				kfree(ml);
			}
			rtnl_unlock();
			return 0;
		}
	}
	rtnl_unlock();
	return -EADDRNOTAVAIL;
}

static void packet_flush_mclist(struct sock *sk)
{
	struct packet_sock *po = pkt_sk(sk);
	struct packet_mclist *ml;

	if (!po->mclist)
		return;

	rtnl_lock();
	while ((ml = po->mclist) != NULL) {
		struct net_device *dev;

		po->mclist = ml->next;
		if ((dev = dev_get_by_index(&init_net, ml->ifindex)) != NULL) {
			packet_dev_mc(dev, ml, -1);
			dev_put(dev);
		}
		kfree(ml);
	}
	rtnl_unlock();
}

static int
packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	int ret;

	if (level != SOL_PACKET)
		return -ENOPROTOOPT;

	switch (optname) {
	case PACKET_ADD_MEMBERSHIP:
	case PACKET_DROP_MEMBERSHIP:
	{
		struct packet_mreq_max mreq;
		int len = optlen;
		memset(&mreq, 0, sizeof(mreq));
		if (len < sizeof(struct packet_mreq))
			return -EINVAL;
		if (len > sizeof(mreq))
			len = sizeof(mreq);
		if (copy_from_user(&mreq, optval, len))
			return -EFAULT;
		if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
			return -EINVAL;
		if (optname == PACKET_ADD_MEMBERSHIP)
			ret = packet_mc_add(sk, &mreq);
		else
			ret = packet_mc_drop(sk, &mreq);
		return ret;
	}

#ifdef CONFIG_PACKET_MMAP
	case PACKET_RX_RING:
	{
		struct tpacket_req req;

		if (optlen < sizeof(req))
			return -EINVAL;
		if (copy_from_user(&req, optval, sizeof(req)))
			return -EFAULT;
		return packet_set_ring(sk, &req, 0);
	}
	case PACKET_COPY_THRESH:
	{
		int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		pkt_sk(sk)->copy_thresh = val;
		return 0;
	}
#endif
	case PACKET_AUXDATA:
	{
		int val;

		if (optlen < sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		po->auxdata = !!val;
		return 0;
	}
	case PACKET_ORIGDEV:
	{
		int val;

		if (optlen < sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		po->origdev = !!val;
		return 0;
	}
	default:
		return -ENOPROTOOPT;
	}
}
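/*
 * Enabling the boolean options above from userspace is a one-liner each;
 * a sketch:
 *
 *	#include <sys/socket.h>
 *	#include <linux/if_packet.h>
 *
 *	int one = 1;
 *	setsockopt(fd, SOL_PACKET, PACKET_AUXDATA, &one, sizeof(one));
 *	setsockopt(fd, SOL_PACKET, PACKET_ORIGDEV, &one, sizeof(one));
 */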

static int packet_getsockopt(struct socket *sock, int level, int optname,
			     char __user *optval, int __user *optlen)
{
	int len;
	int val;
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	void *data;
	struct tpacket_stats st;

	if (level != SOL_PACKET)
		return -ENOPROTOOPT;

	if (get_user(len, optlen))
		return -EFAULT;

	if (len < 0)
		return -EINVAL;

	switch (optname) {
	case PACKET_STATISTICS:
		if (len > sizeof(struct tpacket_stats))
			len = sizeof(struct tpacket_stats);
		spin_lock_bh(&sk->sk_receive_queue.lock);
		st = po->stats;
		memset(&po->stats, 0, sizeof(st));
		spin_unlock_bh(&sk->sk_receive_queue.lock);
		st.tp_packets += st.tp_drops;

		data = &st;
		break;
	case PACKET_AUXDATA:
		if (len > sizeof(int))
			len = sizeof(int);
		val = po->auxdata;

		data = &val;
		break;
	case PACKET_ORIGDEV:
		if (len > sizeof(int))
			len = sizeof(int);
		val = po->origdev;

		data = &val;
		break;
	default:
		return -ENOPROTOOPT;
	}

	if (put_user(len, optlen))
		return -EFAULT;
	if (copy_to_user(optval, data, len))
		return -EFAULT;
	return 0;
}
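/*
 * Note that reading PACKET_STATISTICS also resets the counters, and that
 * tp_packets includes the drops.  A sketch:
 *
 *	#include <sys/socket.h>
 *	#include <linux/if_packet.h>
 *
 *	struct tpacket_stats st;
 *	socklen_t slen = sizeof(st);
 *	getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &st, &slen);
 *	// st.tp_packets = seen (incl. dropped), st.tp_drops = dropped
 */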


static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
{
	struct sock *sk;
	struct hlist_node *node;
	struct net_device *dev = data;

	if (dev->nd_net != &init_net)
		return NOTIFY_DONE;

	read_lock(&packet_sklist_lock);
	sk_for_each(sk, node, &packet_sklist) {
		struct packet_sock *po = pkt_sk(sk);

		switch (msg) {
		case NETDEV_UNREGISTER:
			if (po->mclist)
				packet_dev_mclist(dev, po->mclist, -1);
			/* fallthrough */

		case NETDEV_DOWN:
			if (dev->ifindex == po->ifindex) {
				spin_lock(&po->bind_lock);
				if (po->running) {
					__dev_remove_pack(&po->prot_hook);
					__sock_put(sk);
					po->running = 0;
					sk->sk_err = ENETDOWN;
					if (!sock_flag(sk, SOCK_DEAD))
						sk->sk_error_report(sk);
				}
				if (msg == NETDEV_UNREGISTER) {
					po->ifindex = -1;
					po->prot_hook.dev = NULL;
				}
				spin_unlock(&po->bind_lock);
			}
			break;
		case NETDEV_UP:
			spin_lock(&po->bind_lock);
			if (dev->ifindex == po->ifindex && po->num &&
			    !po->running) {
				dev_add_pack(&po->prot_hook);
				sock_hold(sk);
				po->running = 1;
			}
			spin_unlock(&po->bind_lock);
			break;
		}
	}
	read_unlock(&packet_sklist_lock);
	return NOTIFY_DONE;
}


static int packet_ioctl(struct socket *sock, unsigned int cmd,
			unsigned long arg)
{
	struct sock *sk = sock->sk;

	switch (cmd) {
		case SIOCOUTQ:
		{
			int amount = atomic_read(&sk->sk_wmem_alloc);
			return put_user(amount, (int __user *)arg);
		}
		case SIOCINQ:
		{
			struct sk_buff *skb;
			int amount = 0;

			spin_lock_bh(&sk->sk_receive_queue.lock);
			skb = skb_peek(&sk->sk_receive_queue);
			if (skb)
				amount = skb->len;
			spin_unlock_bh(&sk->sk_receive_queue.lock);
			return put_user(amount, (int __user *)arg);
		}
		case SIOCGSTAMP:
			return sock_get_timestamp(sk, (struct timeval __user *)arg);
		case SIOCGSTAMPNS:
			return sock_get_timestampns(sk, (struct timespec __user *)arg);

#ifdef CONFIG_INET
		case SIOCADDRT:
		case SIOCDELRT:
		case SIOCDARP:
		case SIOCGARP:
		case SIOCSARP:
		case SIOCGIFADDR:
		case SIOCSIFADDR:
		case SIOCGIFBRDADDR:
		case SIOCSIFBRDADDR:
		case SIOCGIFNETMASK:
		case SIOCSIFNETMASK:
		case SIOCGIFDSTADDR:
		case SIOCSIFDSTADDR:
		case SIOCSIFFLAGS:
			return inet_dgram_ops.ioctl(sock, cmd, arg);
#endif

		default:
			return -ENOIOCTLCMD;
	}
	return 0;
}

#ifndef CONFIG_PACKET_MMAP
#define packet_mmap sock_no_mmap
#define packet_poll datagram_poll
#else

static unsigned int packet_poll(struct file *file, struct socket *sock,
				poll_table *wait)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	unsigned int mask = datagram_poll(file, sock, wait);

	spin_lock_bh(&sk->sk_receive_queue.lock);
	if (po->pg_vec) {
		unsigned last = po->head ? po->head - 1 : po->frame_max;
		struct tpacket_hdr *h;

		h = packet_lookup_frame(po, last);

		if (h->tp_status)
			mask |= POLLIN | POLLRDNORM;
	}
	spin_unlock_bh(&sk->sk_receive_queue.lock);
	return mask;
}


/* Dirty? Well, I still have not found a better way to account
 * for user mmaps.
 */

static void packet_mm_open(struct vm_area_struct *vma)
{
	struct file *file = vma->vm_file;
	struct socket *sock = file->private_data;
	struct sock *sk = sock->sk;

	if (sk)
		atomic_inc(&pkt_sk(sk)->mapped);
}

static void packet_mm_close(struct vm_area_struct *vma)
{
	struct file *file = vma->vm_file;
	struct socket *sock = file->private_data;
	struct sock *sk = sock->sk;

	if (sk)
		atomic_dec(&pkt_sk(sk)->mapped);
}

static struct vm_operations_struct packet_mmap_ops = {
	.open	= packet_mm_open,
	.close	= packet_mm_close,
};

static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
{
	int i;

	for (i = 0; i < len; i++) {
		if (likely(pg_vec[i]))
			free_pages((unsigned long) pg_vec[i], order);
	}
	kfree(pg_vec);
}

static inline char *alloc_one_pg_vec_page(unsigned long order)
{
	return (char *) __get_free_pages(GFP_KERNEL | __GFP_COMP | __GFP_ZERO,
					 order);
}

static char **alloc_pg_vec(struct tpacket_req *req, int order)
{
	unsigned int block_nr = req->tp_block_nr;
	char **pg_vec;
	int i;

	pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL);
	if (unlikely(!pg_vec))
		goto out;

	for (i = 0; i < block_nr; i++) {
		pg_vec[i] = alloc_one_pg_vec_page(order);
		if (unlikely(!pg_vec[i]))
			goto out_free_pgvec;
	}

out:
	return pg_vec;

out_free_pgvec:
	free_pg_vec(pg_vec, order, block_nr);
	pg_vec = NULL;
	goto out;
}

static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing)
{
	char **pg_vec = NULL;
	struct packet_sock *po = pkt_sk(sk);
	int was_running, order = 0;
	__be16 num;
	int err = 0;

	if (req->tp_block_nr) {
		int i, l;

		/* Sanity tests and some calculations */

		if (unlikely(po->pg_vec))
			return -EBUSY;

		if (unlikely((int)req->tp_block_size <= 0))
			return -EINVAL;
		if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
			return -EINVAL;
		if (unlikely(req->tp_frame_size < TPACKET_HDRLEN))
			return -EINVAL;
		if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
			return -EINVAL;

		po->frames_per_block = req->tp_block_size/req->tp_frame_size;
		if (unlikely(po->frames_per_block <= 0))
			return -EINVAL;
		if (unlikely((po->frames_per_block * req->tp_block_nr) !=
			     req->tp_frame_nr))
			return -EINVAL;

		err = -ENOMEM;
		order = get_order(req->tp_block_size);
		pg_vec = alloc_pg_vec(req, order);
		if (unlikely(!pg_vec))
			goto out;

		l = 0;
		for (i = 0; i < req->tp_block_nr; i++) {
			char *ptr = pg_vec[i];
			struct tpacket_hdr *header;
			int k;

			for (k = 0; k < po->frames_per_block; k++) {
				header = (struct tpacket_hdr *) ptr;
				header->tp_status = TP_STATUS_KERNEL;
				ptr += req->tp_frame_size;
			}
		}
		/* Done */
	} else {
		if (unlikely(req->tp_frame_nr))
			return -EINVAL;
	}

	lock_sock(sk);

	/* Detach socket from network */
	spin_lock(&po->bind_lock);
	was_running = po->running;
	num = po->num;
	if (was_running) {
		__dev_remove_pack(&po->prot_hook);
		po->num = 0;
		po->running = 0;
		__sock_put(sk);
	}
	spin_unlock(&po->bind_lock);

	synchronize_net();

	err = -EBUSY;
	if (closing || atomic_read(&po->mapped) == 0) {
		err = 0;
#define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })

		spin_lock_bh(&sk->sk_receive_queue.lock);
		pg_vec = XC(po->pg_vec, pg_vec);
		po->frame_max = (req->tp_frame_nr - 1);
		po->head = 0;
		po->frame_size = req->tp_frame_size;
		spin_unlock_bh(&sk->sk_receive_queue.lock);

		order = XC(po->pg_vec_order, order);
		req->tp_block_nr = XC(po->pg_vec_len, req->tp_block_nr);

		po->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
		po->prot_hook.func = po->pg_vec ? tpacket_rcv : packet_rcv;
		skb_queue_purge(&sk->sk_receive_queue);
#undef XC
		if (atomic_read(&po->mapped))
			printk(KERN_DEBUG "packet_mmap: vma is busy: %d\n", atomic_read(&po->mapped));
	}

	spin_lock(&po->bind_lock);
	if (was_running && !po->running) {
		sock_hold(sk);
		po->running = 1;
		po->num = num;
		dev_add_pack(&po->prot_hook);
	}
	spin_unlock(&po->bind_lock);

	release_sock(sk);

	if (pg_vec)
		free_pg_vec(pg_vec, order, req->tp_block_nr);
out:
	return err;
}

static int packet_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	unsigned long size;
	unsigned long start;
	int err = -EINVAL;
	int i;

	if (vma->vm_pgoff)
		return -EINVAL;

	size = vma->vm_end - vma->vm_start;

	lock_sock(sk);
	if (po->pg_vec == NULL)
		goto out;
	if (size != po->pg_vec_len*po->pg_vec_pages*PAGE_SIZE)
		goto out;

	start = vma->vm_start;
	for (i = 0; i < po->pg_vec_len; i++) {
		struct page *page = virt_to_page(po->pg_vec[i]);
		int pg_num;

		for (pg_num = 0; pg_num < po->pg_vec_pages; pg_num++, page++) {
			err = vm_insert_page(vma, start, page);
			if (unlikely(err))
				goto out;
			start += PAGE_SIZE;
		}
	}
	atomic_inc(&po->mapped);
	vma->vm_ops = &packet_mmap_ops;
	err = 0;

out:
	release_sock(sk);
	return err;
}
#endif
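/*
 * Putting PACKET_RX_RING and mmap() together from userspace; a hedged
 * sketch (the sizes are assumptions and must satisfy the sanity tests
 * in packet_set_ring() above):
 *
 *	#include <sys/socket.h>
 *	#include <sys/mman.h>
 *	#include <linux/if_packet.h>
 *
 *	struct tpacket_req req = {
 *		.tp_block_size = 8192,	// multiple of PAGE_SIZE
 *		.tp_block_nr   = 16,
 *		.tp_frame_size = 2048,	// multiple of TPACKET_ALIGNMENT
 *		.tp_frame_nr   = 64,	// block_nr * (block_size/frame_size)
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *
 *	size_t len = (size_t)req.tp_block_size * req.tp_block_nr;
 *	void *ring = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			  MAP_SHARED, fd, 0);
 *
 *	// Frame i starts at ring + i * tp_frame_size; it is ready for
 *	// reading when its tpacket_hdr.tp_status has TP_STATUS_USER set,
 *	// and is handed back by storing TP_STATUS_KERNEL again.
 */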


static const struct proto_ops packet_ops_spkt = {
	.family =	PF_PACKET,
	.owner =	THIS_MODULE,
	.release =	packet_release,
	.bind =		packet_bind_spkt,
	.connect =	sock_no_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	packet_getname_spkt,
	.poll =		datagram_poll,
	.ioctl =	packet_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	packet_sendmsg_spkt,
	.recvmsg =	packet_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
};

static const struct proto_ops packet_ops = {
	.family =	PF_PACKET,
	.owner =	THIS_MODULE,
	.release =	packet_release,
	.bind =		packet_bind,
	.connect =	sock_no_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	packet_getname,
	.poll =		packet_poll,
	.ioctl =	packet_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
	.setsockopt =	packet_setsockopt,
	.getsockopt =	packet_getsockopt,
	.sendmsg =	packet_sendmsg,
	.recvmsg =	packet_recvmsg,
	.mmap =		packet_mmap,
	.sendpage =	sock_no_sendpage,
};

static struct net_proto_family packet_family_ops = {
	.family =	PF_PACKET,
	.create =	packet_create,
	.owner	=	THIS_MODULE,
};

static struct notifier_block packet_netdev_notifier = {
	.notifier_call = packet_notifier,
};

#ifdef CONFIG_PROC_FS
static inline struct sock *packet_seq_idx(loff_t off)
{
	struct sock *s;
	struct hlist_node *node;

	sk_for_each(s, node, &packet_sklist) {
		if (!off--)
			return s;
	}
	return NULL;
}

static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
{
	read_lock(&packet_sklist_lock);
	return *pos ? packet_seq_idx(*pos - 1) : SEQ_START_TOKEN;
}

static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;
	return  (v == SEQ_START_TOKEN)
		? sk_head(&packet_sklist)
		: sk_next((struct sock *)v);
}

static void packet_seq_stop(struct seq_file *seq, void *v)
{
	read_unlock(&packet_sklist_lock);
}

static int packet_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
	else {
		struct sock *s = v;
		const struct packet_sock *po = pkt_sk(s);

		seq_printf(seq,
			   "%p %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
			   s,
			   atomic_read(&s->sk_refcnt),
			   s->sk_type,
			   ntohs(po->num),
			   po->ifindex,
			   po->running,
			   atomic_read(&s->sk_rmem_alloc),
			   sock_i_uid(s),
			   sock_i_ino(s));
	}

	return 0;
}

static const struct seq_operations packet_seq_ops = {
	.start	= packet_seq_start,
	.next	= packet_seq_next,
	.stop	= packet_seq_stop,
	.show	= packet_seq_show,
};

static int packet_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &packet_seq_ops);
}

static const struct file_operations packet_seq_fops = {
	.owner		= THIS_MODULE,
	.open		= packet_seq_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};

#endif

static void __exit packet_exit(void)
{
	proc_net_remove(&init_net, "packet");
	unregister_netdevice_notifier(&packet_netdev_notifier);
	sock_unregister(PF_PACKET);
	proto_unregister(&packet_proto);
}

static int __init packet_init(void)
{
	int rc = proto_register(&packet_proto, 0);

	if (rc != 0)
		goto out;

	sock_register(&packet_family_ops);
	register_netdevice_notifier(&packet_netdev_notifier);
	proc_net_fops_create(&init_net, "packet", 0, &packet_seq_fops);
out:
	return rc;
}

module_init(packet_init);
module_exit(packet_exit);
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_PACKET);