xref: /linux/net/netlink/af_netlink.c (revision 2d87650a3bf1b80f7d0d150ee1af3f8a89e5b7aa)
1 /*
2  * NETLINK      Kernel-user communication protocol.
3  *
4  * 		Authors:	Alan Cox <alan@lxorguk.ukuu.org.uk>
5  * 				Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
6  * 				Patrick McHardy <kaber@trash.net>
7  *
8  *		This program is free software; you can redistribute it and/or
9  *		modify it under the terms of the GNU General Public License
10  *		as published by the Free Software Foundation; either version
11  *		2 of the License, or (at your option) any later version.
12  *
13  * Tue Jun 26 14:36:48 MEST 2001 Herbert "herp" Rosmanith
14  *                               added netlink_proto_exit
15  * Tue Jan 22 18:32:44 BRST 2002 Arnaldo C. de Melo <acme@conectiva.com.br>
16  * 				 use nlk_sk, as sk->protinfo is on a diet 8)
17  * Fri Jul 22 19:51:12 MEST 2005 Harald Welte <laforge@gnumonks.org>
18  * 				 - inc module use count of module that owns
19  * 				   the kernel socket in case userspace opens
20  * 				   socket of same protocol
21  * 				 - remove all module support, since netlink is
22  * 				   mandatory if CONFIG_NET=y these days
23  */
24 
25 #include <linux/module.h>
26 
27 #include <linux/capability.h>
28 #include <linux/kernel.h>
29 #include <linux/init.h>
30 #include <linux/signal.h>
31 #include <linux/sched.h>
32 #include <linux/errno.h>
33 #include <linux/string.h>
34 #include <linux/stat.h>
35 #include <linux/socket.h>
36 #include <linux/un.h>
37 #include <linux/fcntl.h>
38 #include <linux/termios.h>
39 #include <linux/sockios.h>
40 #include <linux/net.h>
41 #include <linux/fs.h>
42 #include <linux/slab.h>
43 #include <asm/uaccess.h>
44 #include <linux/skbuff.h>
45 #include <linux/netdevice.h>
46 #include <linux/rtnetlink.h>
47 #include <linux/proc_fs.h>
48 #include <linux/seq_file.h>
49 #include <linux/notifier.h>
50 #include <linux/security.h>
51 #include <linux/jhash.h>
52 #include <linux/jiffies.h>
53 #include <linux/random.h>
54 #include <linux/bitops.h>
55 #include <linux/mm.h>
56 #include <linux/types.h>
57 #include <linux/audit.h>
58 #include <linux/mutex.h>
59 #include <linux/vmalloc.h>
60 #include <linux/if_arp.h>
61 #include <asm/cacheflush.h>
62 
63 #include <net/net_namespace.h>
64 #include <net/sock.h>
65 #include <net/scm.h>
66 #include <net/netlink.h>
67 
68 #include "af_netlink.h"
69 
70 struct listeners {
71 	struct rcu_head		rcu;
72 	unsigned long		masks[0];
73 };
74 
75 /* state bits */
76 #define NETLINK_CONGESTED	0x0
77 
78 /* flags */
79 #define NETLINK_KERNEL_SOCKET	0x1
80 #define NETLINK_RECV_PKTINFO	0x2
81 #define NETLINK_BROADCAST_SEND_ERROR	0x4
82 #define NETLINK_RECV_NO_ENOBUFS	0x8
83 
84 static inline int netlink_is_kernel(struct sock *sk)
85 {
86 	return nlk_sk(sk)->flags & NETLINK_KERNEL_SOCKET;
87 }
88 
89 struct netlink_table *nl_table;
90 EXPORT_SYMBOL_GPL(nl_table);
91 
92 static DECLARE_WAIT_QUEUE_HEAD(nl_table_wait);
93 
94 static int netlink_dump(struct sock *sk);
95 static void netlink_skb_destructor(struct sk_buff *skb);
96 
97 DEFINE_RWLOCK(nl_table_lock);
98 EXPORT_SYMBOL_GPL(nl_table_lock);
99 static atomic_t nl_table_users = ATOMIC_INIT(0);
100 
/* Dereference an RCU-protected pointer from a context that holds
 * nl_table_lock.  The expansion deliberately has no trailing semicolon so
 * the macro can be used inside a larger expression as well as in a plain
 * assignment statement.
 */
#define nl_deref_protected(X) rcu_dereference_protected(X, lockdep_is_held(&nl_table_lock))
102 
103 static ATOMIC_NOTIFIER_HEAD(netlink_chain);
104 
105 static DEFINE_SPINLOCK(netlink_tap_lock);
106 static struct list_head netlink_tap_all __read_mostly;
107 
108 static inline u32 netlink_group_mask(u32 group)
109 {
110 	return group ? 1 << (group - 1) : 0;
111 }
112 
113 static inline struct hlist_head *nl_portid_hashfn(struct nl_portid_hash *hash, u32 portid)
114 {
115 	return &hash->table[jhash_1word(portid, hash->rnd) & hash->mask];
116 }
117 
118 int netlink_add_tap(struct netlink_tap *nt)
119 {
120 	if (unlikely(nt->dev->type != ARPHRD_NETLINK))
121 		return -EINVAL;
122 
123 	spin_lock(&netlink_tap_lock);
124 	list_add_rcu(&nt->list, &netlink_tap_all);
125 	spin_unlock(&netlink_tap_lock);
126 
127 	if (nt->module)
128 		__module_get(nt->module);
129 
130 	return 0;
131 }
132 EXPORT_SYMBOL_GPL(netlink_add_tap);
133 
134 int __netlink_remove_tap(struct netlink_tap *nt)
135 {
136 	bool found = false;
137 	struct netlink_tap *tmp;
138 
139 	spin_lock(&netlink_tap_lock);
140 
141 	list_for_each_entry(tmp, &netlink_tap_all, list) {
142 		if (nt == tmp) {
143 			list_del_rcu(&nt->list);
144 			found = true;
145 			goto out;
146 		}
147 	}
148 
149 	pr_warn("__netlink_remove_tap: %p not found\n", nt);
150 out:
151 	spin_unlock(&netlink_tap_lock);
152 
153 	if (found && nt->module)
154 		module_put(nt->module);
155 
156 	return found ? 0 : -ENODEV;
157 }
158 EXPORT_SYMBOL_GPL(__netlink_remove_tap);
159 
/* Remove @nt from the tap list, then call synchronize_net().
 *
 * synchronize_net() runs whether or not the tap was found; the return
 * value is that of __netlink_remove_tap().
 */
int netlink_remove_tap(struct netlink_tap *nt)
{
	const int err = __netlink_remove_tap(nt);

	synchronize_net();

	return err;
}
EXPORT_SYMBOL_GPL(netlink_remove_tap);
170 
171 static bool netlink_filter_tap(const struct sk_buff *skb)
172 {
173 	struct sock *sk = skb->sk;
174 	bool pass = false;
175 
176 	/* We take the more conservative approach and
177 	 * whitelist socket protocols that may pass.
178 	 */
179 	switch (sk->sk_protocol) {
180 	case NETLINK_ROUTE:
181 	case NETLINK_USERSOCK:
182 	case NETLINK_SOCK_DIAG:
183 	case NETLINK_NFLOG:
184 	case NETLINK_XFRM:
185 	case NETLINK_FIB_LOOKUP:
186 	case NETLINK_NETFILTER:
187 	case NETLINK_GENERIC:
188 		pass = true;
189 		break;
190 	}
191 
192 	return pass;
193 }
194 
195 static int __netlink_deliver_tap_skb(struct sk_buff *skb,
196 				     struct net_device *dev)
197 {
198 	struct sk_buff *nskb;
199 	struct sock *sk = skb->sk;
200 	int ret = -ENOMEM;
201 
202 	dev_hold(dev);
203 	nskb = skb_clone(skb, GFP_ATOMIC);
204 	if (nskb) {
205 		nskb->dev = dev;
206 		nskb->protocol = htons((u16) sk->sk_protocol);
207 		nskb->pkt_type = netlink_is_kernel(sk) ?
208 				 PACKET_KERNEL : PACKET_USER;
209 
210 		ret = dev_queue_xmit(nskb);
211 		if (unlikely(ret > 0))
212 			ret = net_xmit_errno(ret);
213 	}
214 
215 	dev_put(dev);
216 	return ret;
217 }
218 
219 static void __netlink_deliver_tap(struct sk_buff *skb)
220 {
221 	int ret;
222 	struct netlink_tap *tmp;
223 
224 	if (!netlink_filter_tap(skb))
225 		return;
226 
227 	list_for_each_entry_rcu(tmp, &netlink_tap_all, list) {
228 		ret = __netlink_deliver_tap_skb(skb, tmp->dev);
229 		if (unlikely(ret))
230 			break;
231 	}
232 }
233 
234 static void netlink_deliver_tap(struct sk_buff *skb)
235 {
236 	rcu_read_lock();
237 
238 	if (unlikely(!list_empty(&netlink_tap_all)))
239 		__netlink_deliver_tap(skb);
240 
241 	rcu_read_unlock();
242 }
243 
/* Mirror @skb to the taps unless both endpoints are kernel sockets. */
static void netlink_deliver_tap_kernel(struct sock *dst, struct sock *src,
				       struct sk_buff *skb)
{
	if (netlink_is_kernel(dst) && netlink_is_kernel(src))
		return;

	netlink_deliver_tap(skb);
}
250 
251 static void netlink_overrun(struct sock *sk)
252 {
253 	struct netlink_sock *nlk = nlk_sk(sk);
254 
255 	if (!(nlk->flags & NETLINK_RECV_NO_ENOBUFS)) {
256 		if (!test_and_set_bit(NETLINK_CONGESTED, &nlk_sk(sk)->state)) {
257 			sk->sk_err = ENOBUFS;
258 			sk->sk_error_report(sk);
259 		}
260 	}
261 	atomic_inc(&sk->sk_drops);
262 }
263 
264 static void netlink_rcv_wake(struct sock *sk)
265 {
266 	struct netlink_sock *nlk = nlk_sk(sk);
267 
268 	if (skb_queue_empty(&sk->sk_receive_queue))
269 		clear_bit(NETLINK_CONGESTED, &nlk->state);
270 	if (!test_bit(NETLINK_CONGESTED, &nlk->state))
271 		wake_up_interruptible(&nlk->wait);
272 }
273 
274 #ifdef CONFIG_NETLINK_MMAP
275 static bool netlink_skb_is_mmaped(const struct sk_buff *skb)
276 {
277 	return NETLINK_CB(skb).flags & NETLINK_SKB_MMAPED;
278 }
279 
280 static bool netlink_rx_is_mmaped(struct sock *sk)
281 {
282 	return nlk_sk(sk)->rx_ring.pg_vec != NULL;
283 }
284 
285 static bool netlink_tx_is_mmaped(struct sock *sk)
286 {
287 	return nlk_sk(sk)->tx_ring.pg_vec != NULL;
288 }
289 
290 static __pure struct page *pgvec_to_page(const void *addr)
291 {
292 	if (is_vmalloc_addr(addr))
293 		return vmalloc_to_page(addr);
294 	else
295 		return virt_to_page(addr);
296 }
297 
298 static void free_pg_vec(void **pg_vec, unsigned int order, unsigned int len)
299 {
300 	unsigned int i;
301 
302 	for (i = 0; i < len; i++) {
303 		if (pg_vec[i] != NULL) {
304 			if (is_vmalloc_addr(pg_vec[i]))
305 				vfree(pg_vec[i]);
306 			else
307 				free_pages((unsigned long)pg_vec[i], order);
308 		}
309 	}
310 	kfree(pg_vec);
311 }
312 
313 static void *alloc_one_pg_vec_page(unsigned long order)
314 {
315 	void *buffer;
316 	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO |
317 			  __GFP_NOWARN | __GFP_NORETRY;
318 
319 	buffer = (void *)__get_free_pages(gfp_flags, order);
320 	if (buffer != NULL)
321 		return buffer;
322 
323 	buffer = vzalloc((1 << order) * PAGE_SIZE);
324 	if (buffer != NULL)
325 		return buffer;
326 
327 	gfp_flags &= ~__GFP_NORETRY;
328 	return (void *)__get_free_pages(gfp_flags, order);
329 }
330 
331 static void **alloc_pg_vec(struct netlink_sock *nlk,
332 			   struct nl_mmap_req *req, unsigned int order)
333 {
334 	unsigned int block_nr = req->nm_block_nr;
335 	unsigned int i;
336 	void **pg_vec;
337 
338 	pg_vec = kcalloc(block_nr, sizeof(void *), GFP_KERNEL);
339 	if (pg_vec == NULL)
340 		return NULL;
341 
342 	for (i = 0; i < block_nr; i++) {
343 		pg_vec[i] = alloc_one_pg_vec_page(order);
344 		if (pg_vec[i] == NULL)
345 			goto err1;
346 	}
347 
348 	return pg_vec;
349 err1:
350 	free_pg_vec(pg_vec, order, block_nr);
351 	return NULL;
352 }
353 
354 static int netlink_set_ring(struct sock *sk, struct nl_mmap_req *req,
355 			    bool closing, bool tx_ring)
356 {
357 	struct netlink_sock *nlk = nlk_sk(sk);
358 	struct netlink_ring *ring;
359 	struct sk_buff_head *queue;
360 	void **pg_vec = NULL;
361 	unsigned int order = 0;
362 	int err;
363 
364 	ring  = tx_ring ? &nlk->tx_ring : &nlk->rx_ring;
365 	queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
366 
367 	if (!closing) {
368 		if (atomic_read(&nlk->mapped))
369 			return -EBUSY;
370 		if (atomic_read(&ring->pending))
371 			return -EBUSY;
372 	}
373 
374 	if (req->nm_block_nr) {
375 		if (ring->pg_vec != NULL)
376 			return -EBUSY;
377 
378 		if ((int)req->nm_block_size <= 0)
379 			return -EINVAL;
380 		if (!IS_ALIGNED(req->nm_block_size, PAGE_SIZE))
381 			return -EINVAL;
382 		if (req->nm_frame_size < NL_MMAP_HDRLEN)
383 			return -EINVAL;
384 		if (!IS_ALIGNED(req->nm_frame_size, NL_MMAP_MSG_ALIGNMENT))
385 			return -EINVAL;
386 
387 		ring->frames_per_block = req->nm_block_size /
388 					 req->nm_frame_size;
389 		if (ring->frames_per_block == 0)
390 			return -EINVAL;
391 		if (ring->frames_per_block * req->nm_block_nr !=
392 		    req->nm_frame_nr)
393 			return -EINVAL;
394 
395 		order = get_order(req->nm_block_size);
396 		pg_vec = alloc_pg_vec(nlk, req, order);
397 		if (pg_vec == NULL)
398 			return -ENOMEM;
399 	} else {
400 		if (req->nm_frame_nr)
401 			return -EINVAL;
402 	}
403 
404 	err = -EBUSY;
405 	mutex_lock(&nlk->pg_vec_lock);
406 	if (closing || atomic_read(&nlk->mapped) == 0) {
407 		err = 0;
408 		spin_lock_bh(&queue->lock);
409 
410 		ring->frame_max		= req->nm_frame_nr - 1;
411 		ring->head		= 0;
412 		ring->frame_size	= req->nm_frame_size;
413 		ring->pg_vec_pages	= req->nm_block_size / PAGE_SIZE;
414 
415 		swap(ring->pg_vec_len, req->nm_block_nr);
416 		swap(ring->pg_vec_order, order);
417 		swap(ring->pg_vec, pg_vec);
418 
419 		__skb_queue_purge(queue);
420 		spin_unlock_bh(&queue->lock);
421 
422 		WARN_ON(atomic_read(&nlk->mapped));
423 	}
424 	mutex_unlock(&nlk->pg_vec_lock);
425 
426 	if (pg_vec)
427 		free_pg_vec(pg_vec, order, req->nm_block_nr);
428 	return err;
429 }
430 
431 static void netlink_mm_open(struct vm_area_struct *vma)
432 {
433 	struct file *file = vma->vm_file;
434 	struct socket *sock = file->private_data;
435 	struct sock *sk = sock->sk;
436 
437 	if (sk)
438 		atomic_inc(&nlk_sk(sk)->mapped);
439 }
440 
441 static void netlink_mm_close(struct vm_area_struct *vma)
442 {
443 	struct file *file = vma->vm_file;
444 	struct socket *sock = file->private_data;
445 	struct sock *sk = sock->sk;
446 
447 	if (sk)
448 		atomic_dec(&nlk_sk(sk)->mapped);
449 }
450 
451 static const struct vm_operations_struct netlink_mmap_ops = {
452 	.open	= netlink_mm_open,
453 	.close	= netlink_mm_close,
454 };
455 
456 static int netlink_mmap(struct file *file, struct socket *sock,
457 			struct vm_area_struct *vma)
458 {
459 	struct sock *sk = sock->sk;
460 	struct netlink_sock *nlk = nlk_sk(sk);
461 	struct netlink_ring *ring;
462 	unsigned long start, size, expected;
463 	unsigned int i;
464 	int err = -EINVAL;
465 
466 	if (vma->vm_pgoff)
467 		return -EINVAL;
468 
469 	mutex_lock(&nlk->pg_vec_lock);
470 
471 	expected = 0;
472 	for (ring = &nlk->rx_ring; ring <= &nlk->tx_ring; ring++) {
473 		if (ring->pg_vec == NULL)
474 			continue;
475 		expected += ring->pg_vec_len * ring->pg_vec_pages * PAGE_SIZE;
476 	}
477 
478 	if (expected == 0)
479 		goto out;
480 
481 	size = vma->vm_end - vma->vm_start;
482 	if (size != expected)
483 		goto out;
484 
485 	start = vma->vm_start;
486 	for (ring = &nlk->rx_ring; ring <= &nlk->tx_ring; ring++) {
487 		if (ring->pg_vec == NULL)
488 			continue;
489 
490 		for (i = 0; i < ring->pg_vec_len; i++) {
491 			struct page *page;
492 			void *kaddr = ring->pg_vec[i];
493 			unsigned int pg_num;
494 
495 			for (pg_num = 0; pg_num < ring->pg_vec_pages; pg_num++) {
496 				page = pgvec_to_page(kaddr);
497 				err = vm_insert_page(vma, start, page);
498 				if (err < 0)
499 					goto out;
500 				start += PAGE_SIZE;
501 				kaddr += PAGE_SIZE;
502 			}
503 		}
504 	}
505 
506 	atomic_inc(&nlk->mapped);
507 	vma->vm_ops = &netlink_mmap_ops;
508 	err = 0;
509 out:
510 	mutex_unlock(&nlk->pg_vec_lock);
511 	return err;
512 }
513 
/* Flush the data cache for every page of a frame except the first one.
 *
 * The frame covers NL_MMAP_HDRLEN + hdr->nm_len bytes starting at @hdr.
 * Compiled to nothing unless ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1.
 *
 * The walk is done on byte addresses: advancing @hdr itself would scale
 * by sizeof(*hdr), and stepping a struct page pointer would assume that
 * consecutive pages of the frame have consecutive page structs, which is
 * not the case for blocks that pgvec_to_page() resolves through
 * vmalloc_to_page().
 */
static void netlink_frame_flush_dcache(const struct nl_mmap_hdr *hdr)
{
#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
	const void *frame = hdr;
	unsigned long addr, last;

	/* First page is flushed through netlink_{get,set}_status */
	addr = ((unsigned long)frame & PAGE_MASK) + PAGE_SIZE;
	/* Address of the last byte of the frame's payload. */
	last = (unsigned long)frame + NL_MMAP_HDRLEN + hdr->nm_len - 1;

	for (; addr <= last; addr += PAGE_SIZE)
		flush_dcache_page(pgvec_to_page((const void *)addr));
#endif
}
528 
529 static enum nl_mmap_status netlink_get_status(const struct nl_mmap_hdr *hdr)
530 {
531 	smp_rmb();
532 	flush_dcache_page(pgvec_to_page(hdr));
533 	return hdr->nm_status;
534 }
535 
536 static void netlink_set_status(struct nl_mmap_hdr *hdr,
537 			       enum nl_mmap_status status)
538 {
539 	hdr->nm_status = status;
540 	flush_dcache_page(pgvec_to_page(hdr));
541 	smp_wmb();
542 }
543 
544 static struct nl_mmap_hdr *
545 __netlink_lookup_frame(const struct netlink_ring *ring, unsigned int pos)
546 {
547 	unsigned int pg_vec_pos, frame_off;
548 
549 	pg_vec_pos = pos / ring->frames_per_block;
550 	frame_off  = pos % ring->frames_per_block;
551 
552 	return ring->pg_vec[pg_vec_pos] + (frame_off * ring->frame_size);
553 }
554 
555 static struct nl_mmap_hdr *
556 netlink_lookup_frame(const struct netlink_ring *ring, unsigned int pos,
557 		     enum nl_mmap_status status)
558 {
559 	struct nl_mmap_hdr *hdr;
560 
561 	hdr = __netlink_lookup_frame(ring, pos);
562 	if (netlink_get_status(hdr) != status)
563 		return NULL;
564 
565 	return hdr;
566 }
567 
568 static struct nl_mmap_hdr *
569 netlink_current_frame(const struct netlink_ring *ring,
570 		      enum nl_mmap_status status)
571 {
572 	return netlink_lookup_frame(ring, ring->head, status);
573 }
574 
575 static struct nl_mmap_hdr *
576 netlink_previous_frame(const struct netlink_ring *ring,
577 		       enum nl_mmap_status status)
578 {
579 	unsigned int prev;
580 
581 	prev = ring->head ? ring->head - 1 : ring->frame_max;
582 	return netlink_lookup_frame(ring, prev, status);
583 }
584 
585 static void netlink_increment_head(struct netlink_ring *ring)
586 {
587 	ring->head = ring->head != ring->frame_max ? ring->head + 1 : 0;
588 }
589 
590 static void netlink_forward_ring(struct netlink_ring *ring)
591 {
592 	unsigned int head = ring->head, pos = head;
593 	const struct nl_mmap_hdr *hdr;
594 
595 	do {
596 		hdr = __netlink_lookup_frame(ring, pos);
597 		if (hdr->nm_status == NL_MMAP_STATUS_UNUSED)
598 			break;
599 		if (hdr->nm_status != NL_MMAP_STATUS_SKIP)
600 			break;
601 		netlink_increment_head(ring);
602 	} while (ring->head != head);
603 }
604 
605 static bool netlink_dump_space(struct netlink_sock *nlk)
606 {
607 	struct netlink_ring *ring = &nlk->rx_ring;
608 	struct nl_mmap_hdr *hdr;
609 	unsigned int n;
610 
611 	hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED);
612 	if (hdr == NULL)
613 		return false;
614 
615 	n = ring->head + ring->frame_max / 2;
616 	if (n > ring->frame_max)
617 		n -= ring->frame_max;
618 
619 	hdr = __netlink_lookup_frame(ring, n);
620 
621 	return hdr->nm_status == NL_MMAP_STATUS_UNUSED;
622 }
623 
624 static unsigned int netlink_poll(struct file *file, struct socket *sock,
625 				 poll_table *wait)
626 {
627 	struct sock *sk = sock->sk;
628 	struct netlink_sock *nlk = nlk_sk(sk);
629 	unsigned int mask;
630 	int err;
631 
632 	if (nlk->rx_ring.pg_vec != NULL) {
633 		/* Memory mapped sockets don't call recvmsg(), so flow control
634 		 * for dumps is performed here. A dump is allowed to continue
635 		 * if at least half the ring is unused.
636 		 */
637 		while (nlk->cb_running && netlink_dump_space(nlk)) {
638 			err = netlink_dump(sk);
639 			if (err < 0) {
640 				sk->sk_err = err;
641 				sk->sk_error_report(sk);
642 				break;
643 			}
644 		}
645 		netlink_rcv_wake(sk);
646 	}
647 
648 	mask = datagram_poll(file, sock, wait);
649 
650 	spin_lock_bh(&sk->sk_receive_queue.lock);
651 	if (nlk->rx_ring.pg_vec) {
652 		netlink_forward_ring(&nlk->rx_ring);
653 		if (!netlink_previous_frame(&nlk->rx_ring, NL_MMAP_STATUS_UNUSED))
654 			mask |= POLLIN | POLLRDNORM;
655 	}
656 	spin_unlock_bh(&sk->sk_receive_queue.lock);
657 
658 	spin_lock_bh(&sk->sk_write_queue.lock);
659 	if (nlk->tx_ring.pg_vec) {
660 		if (netlink_current_frame(&nlk->tx_ring, NL_MMAP_STATUS_UNUSED))
661 			mask |= POLLOUT | POLLWRNORM;
662 	}
663 	spin_unlock_bh(&sk->sk_write_queue.lock);
664 
665 	return mask;
666 }
667 
668 static struct nl_mmap_hdr *netlink_mmap_hdr(struct sk_buff *skb)
669 {
670 	return (struct nl_mmap_hdr *)(skb->head - NL_MMAP_HDRLEN);
671 }
672 
673 static void netlink_ring_setup_skb(struct sk_buff *skb, struct sock *sk,
674 				   struct netlink_ring *ring,
675 				   struct nl_mmap_hdr *hdr)
676 {
677 	unsigned int size;
678 	void *data;
679 
680 	size = ring->frame_size - NL_MMAP_HDRLEN;
681 	data = (void *)hdr + NL_MMAP_HDRLEN;
682 
683 	skb->head	= data;
684 	skb->data	= data;
685 	skb_reset_tail_pointer(skb);
686 	skb->end	= skb->tail + size;
687 	skb->len	= 0;
688 
689 	skb->destructor	= netlink_skb_destructor;
690 	NETLINK_CB(skb).flags |= NETLINK_SKB_MMAPED;
691 	NETLINK_CB(skb).sk = sk;
692 }
693 
694 static int netlink_mmap_sendmsg(struct sock *sk, struct msghdr *msg,
695 				u32 dst_portid, u32 dst_group,
696 				struct sock_iocb *siocb)
697 {
698 	struct netlink_sock *nlk = nlk_sk(sk);
699 	struct netlink_ring *ring;
700 	struct nl_mmap_hdr *hdr;
701 	struct sk_buff *skb;
702 	unsigned int maxlen;
703 	bool excl = true;
704 	int err = 0, len = 0;
705 
706 	/* Netlink messages are validated by the receiver before processing.
707 	 * In order to avoid userspace changing the contents of the message
708 	 * after validation, the socket and the ring may only be used by a
709 	 * single process, otherwise we fall back to copying.
710 	 */
711 	if (atomic_long_read(&sk->sk_socket->file->f_count) > 2 ||
712 	    atomic_read(&nlk->mapped) > 1)
713 		excl = false;
714 
715 	mutex_lock(&nlk->pg_vec_lock);
716 
717 	ring   = &nlk->tx_ring;
718 	maxlen = ring->frame_size - NL_MMAP_HDRLEN;
719 
720 	do {
721 		hdr = netlink_current_frame(ring, NL_MMAP_STATUS_VALID);
722 		if (hdr == NULL) {
723 			if (!(msg->msg_flags & MSG_DONTWAIT) &&
724 			    atomic_read(&nlk->tx_ring.pending))
725 				schedule();
726 			continue;
727 		}
728 		if (hdr->nm_len > maxlen) {
729 			err = -EINVAL;
730 			goto out;
731 		}
732 
733 		netlink_frame_flush_dcache(hdr);
734 
735 		if (likely(dst_portid == 0 && dst_group == 0 && excl)) {
736 			skb = alloc_skb_head(GFP_KERNEL);
737 			if (skb == NULL) {
738 				err = -ENOBUFS;
739 				goto out;
740 			}
741 			sock_hold(sk);
742 			netlink_ring_setup_skb(skb, sk, ring, hdr);
743 			NETLINK_CB(skb).flags |= NETLINK_SKB_TX;
744 			__skb_put(skb, hdr->nm_len);
745 			netlink_set_status(hdr, NL_MMAP_STATUS_RESERVED);
746 			atomic_inc(&ring->pending);
747 		} else {
748 			skb = alloc_skb(hdr->nm_len, GFP_KERNEL);
749 			if (skb == NULL) {
750 				err = -ENOBUFS;
751 				goto out;
752 			}
753 			__skb_put(skb, hdr->nm_len);
754 			memcpy(skb->data, (void *)hdr + NL_MMAP_HDRLEN, hdr->nm_len);
755 			netlink_set_status(hdr, NL_MMAP_STATUS_UNUSED);
756 		}
757 
758 		netlink_increment_head(ring);
759 
760 		NETLINK_CB(skb).portid	  = nlk->portid;
761 		NETLINK_CB(skb).dst_group = dst_group;
762 		NETLINK_CB(skb).creds	  = siocb->scm->creds;
763 
764 		err = security_netlink_send(sk, skb);
765 		if (err) {
766 			kfree_skb(skb);
767 			goto out;
768 		}
769 
770 		if (unlikely(dst_group)) {
771 			atomic_inc(&skb->users);
772 			netlink_broadcast(sk, skb, dst_portid, dst_group,
773 					  GFP_KERNEL);
774 		}
775 		err = netlink_unicast(sk, skb, dst_portid,
776 				      msg->msg_flags & MSG_DONTWAIT);
777 		if (err < 0)
778 			goto out;
779 		len += err;
780 
781 	} while (hdr != NULL ||
782 		 (!(msg->msg_flags & MSG_DONTWAIT) &&
783 		  atomic_read(&nlk->tx_ring.pending)));
784 
785 	if (len > 0)
786 		err = len;
787 out:
788 	mutex_unlock(&nlk->pg_vec_lock);
789 	return err;
790 }
791 
792 static void netlink_queue_mmaped_skb(struct sock *sk, struct sk_buff *skb)
793 {
794 	struct nl_mmap_hdr *hdr;
795 
796 	hdr = netlink_mmap_hdr(skb);
797 	hdr->nm_len	= skb->len;
798 	hdr->nm_group	= NETLINK_CB(skb).dst_group;
799 	hdr->nm_pid	= NETLINK_CB(skb).creds.pid;
800 	hdr->nm_uid	= from_kuid(sk_user_ns(sk), NETLINK_CB(skb).creds.uid);
801 	hdr->nm_gid	= from_kgid(sk_user_ns(sk), NETLINK_CB(skb).creds.gid);
802 	netlink_frame_flush_dcache(hdr);
803 	netlink_set_status(hdr, NL_MMAP_STATUS_VALID);
804 
805 	NETLINK_CB(skb).flags |= NETLINK_SKB_DELIVERED;
806 	kfree_skb(skb);
807 }
808 
809 static void netlink_ring_set_copied(struct sock *sk, struct sk_buff *skb)
810 {
811 	struct netlink_sock *nlk = nlk_sk(sk);
812 	struct netlink_ring *ring = &nlk->rx_ring;
813 	struct nl_mmap_hdr *hdr;
814 
815 	spin_lock_bh(&sk->sk_receive_queue.lock);
816 	hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED);
817 	if (hdr == NULL) {
818 		spin_unlock_bh(&sk->sk_receive_queue.lock);
819 		kfree_skb(skb);
820 		netlink_overrun(sk);
821 		return;
822 	}
823 	netlink_increment_head(ring);
824 	__skb_queue_tail(&sk->sk_receive_queue, skb);
825 	spin_unlock_bh(&sk->sk_receive_queue.lock);
826 
827 	hdr->nm_len	= skb->len;
828 	hdr->nm_group	= NETLINK_CB(skb).dst_group;
829 	hdr->nm_pid	= NETLINK_CB(skb).creds.pid;
830 	hdr->nm_uid	= from_kuid(sk_user_ns(sk), NETLINK_CB(skb).creds.uid);
831 	hdr->nm_gid	= from_kgid(sk_user_ns(sk), NETLINK_CB(skb).creds.gid);
832 	netlink_set_status(hdr, NL_MMAP_STATUS_COPY);
833 }
834 
835 #else /* CONFIG_NETLINK_MMAP */
836 #define netlink_skb_is_mmaped(skb)	false
837 #define netlink_rx_is_mmaped(sk)	false
838 #define netlink_tx_is_mmaped(sk)	false
839 #define netlink_mmap			sock_no_mmap
840 #define netlink_poll			datagram_poll
841 #define netlink_mmap_sendmsg(sk, msg, dst_portid, dst_group, siocb)	0
842 #endif /* CONFIG_NETLINK_MMAP */
843 
844 static void netlink_skb_destructor(struct sk_buff *skb)
845 {
846 #ifdef CONFIG_NETLINK_MMAP
847 	struct nl_mmap_hdr *hdr;
848 	struct netlink_ring *ring;
849 	struct sock *sk;
850 
851 	/* If a packet from the kernel to userspace was freed because of an
852 	 * error without being delivered to userspace, the kernel must reset
853 	 * the status. In the direction userspace to kernel, the status is
854 	 * always reset here after the packet was processed and freed.
855 	 */
856 	if (netlink_skb_is_mmaped(skb)) {
857 		hdr = netlink_mmap_hdr(skb);
858 		sk = NETLINK_CB(skb).sk;
859 
860 		if (NETLINK_CB(skb).flags & NETLINK_SKB_TX) {
861 			netlink_set_status(hdr, NL_MMAP_STATUS_UNUSED);
862 			ring = &nlk_sk(sk)->tx_ring;
863 		} else {
864 			if (!(NETLINK_CB(skb).flags & NETLINK_SKB_DELIVERED)) {
865 				hdr->nm_len = 0;
866 				netlink_set_status(hdr, NL_MMAP_STATUS_VALID);
867 			}
868 			ring = &nlk_sk(sk)->rx_ring;
869 		}
870 
871 		WARN_ON(atomic_read(&ring->pending) == 0);
872 		atomic_dec(&ring->pending);
873 		sock_put(sk);
874 
875 		skb->head = NULL;
876 	}
877 #endif
878 	if (is_vmalloc_addr(skb->head)) {
879 		if (!skb->cloned ||
880 		    !atomic_dec_return(&(skb_shinfo(skb)->dataref)))
881 			vfree(skb->head);
882 
883 		skb->head = NULL;
884 	}
885 	if (skb->sk != NULL)
886 		sock_rfree(skb);
887 }
888 
889 static void netlink_skb_set_owner_r(struct sk_buff *skb, struct sock *sk)
890 {
891 	WARN_ON(skb->sk != NULL);
892 	skb->sk = sk;
893 	skb->destructor = netlink_skb_destructor;
894 	atomic_add(skb->truesize, &sk->sk_rmem_alloc);
895 	sk_mem_charge(sk, skb->truesize);
896 }
897 
898 static void netlink_sock_destruct(struct sock *sk)
899 {
900 	struct netlink_sock *nlk = nlk_sk(sk);
901 
902 	if (nlk->cb_running) {
903 		if (nlk->cb.done)
904 			nlk->cb.done(&nlk->cb);
905 
906 		module_put(nlk->cb.module);
907 		kfree_skb(nlk->cb.skb);
908 	}
909 
910 	skb_queue_purge(&sk->sk_receive_queue);
911 #ifdef CONFIG_NETLINK_MMAP
912 	if (1) {
913 		struct nl_mmap_req req;
914 
915 		memset(&req, 0, sizeof(req));
916 		if (nlk->rx_ring.pg_vec)
917 			netlink_set_ring(sk, &req, true, false);
918 		memset(&req, 0, sizeof(req));
919 		if (nlk->tx_ring.pg_vec)
920 			netlink_set_ring(sk, &req, true, true);
921 	}
922 #endif /* CONFIG_NETLINK_MMAP */
923 
924 	if (!sock_flag(sk, SOCK_DEAD)) {
925 		printk(KERN_ERR "Freeing alive netlink socket %p\n", sk);
926 		return;
927 	}
928 
929 	WARN_ON(atomic_read(&sk->sk_rmem_alloc));
930 	WARN_ON(atomic_read(&sk->sk_wmem_alloc));
931 	WARN_ON(nlk_sk(sk)->groups);
932 }
933 
934 /* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it is _very_ bad on
935  * SMP. Look, when several writers sleep and reader wakes them up, all but one
936  * immediately hit write lock and grab all the cpus. Exclusive sleep solves
937  * this, _but_ remember, it adds useless work on UP machines.
938  */
939 
940 void netlink_table_grab(void)
941 	__acquires(nl_table_lock)
942 {
943 	might_sleep();
944 
945 	write_lock_irq(&nl_table_lock);
946 
947 	if (atomic_read(&nl_table_users)) {
948 		DECLARE_WAITQUEUE(wait, current);
949 
950 		add_wait_queue_exclusive(&nl_table_wait, &wait);
951 		for (;;) {
952 			set_current_state(TASK_UNINTERRUPTIBLE);
953 			if (atomic_read(&nl_table_users) == 0)
954 				break;
955 			write_unlock_irq(&nl_table_lock);
956 			schedule();
957 			write_lock_irq(&nl_table_lock);
958 		}
959 
960 		__set_current_state(TASK_RUNNING);
961 		remove_wait_queue(&nl_table_wait, &wait);
962 	}
963 }
964 
965 void netlink_table_ungrab(void)
966 	__releases(nl_table_lock)
967 {
968 	write_unlock_irq(&nl_table_lock);
969 	wake_up(&nl_table_wait);
970 }
971 
972 static inline void
973 netlink_lock_table(void)
974 {
975 	/* read_lock() synchronizes us to netlink_table_grab */
976 
977 	read_lock(&nl_table_lock);
978 	atomic_inc(&nl_table_users);
979 	read_unlock(&nl_table_lock);
980 }
981 
982 static inline void
983 netlink_unlock_table(void)
984 {
985 	if (atomic_dec_and_test(&nl_table_users))
986 		wake_up(&nl_table_wait);
987 }
988 
989 static bool netlink_compare(struct net *net, struct sock *sk)
990 {
991 	return net_eq(sock_net(sk), net);
992 }
993 
994 static struct sock *netlink_lookup(struct net *net, int protocol, u32 portid)
995 {
996 	struct netlink_table *table = &nl_table[protocol];
997 	struct nl_portid_hash *hash = &table->hash;
998 	struct hlist_head *head;
999 	struct sock *sk;
1000 
1001 	read_lock(&nl_table_lock);
1002 	head = nl_portid_hashfn(hash, portid);
1003 	sk_for_each(sk, head) {
1004 		if (table->compare(net, sk) &&
1005 		    (nlk_sk(sk)->portid == portid)) {
1006 			sock_hold(sk);
1007 			goto found;
1008 		}
1009 	}
1010 	sk = NULL;
1011 found:
1012 	read_unlock(&nl_table_lock);
1013 	return sk;
1014 }
1015 
1016 static struct hlist_head *nl_portid_hash_zalloc(size_t size)
1017 {
1018 	if (size <= PAGE_SIZE)
1019 		return kzalloc(size, GFP_ATOMIC);
1020 	else
1021 		return (struct hlist_head *)
1022 			__get_free_pages(GFP_ATOMIC | __GFP_ZERO,
1023 					 get_order(size));
1024 }
1025 
1026 static void nl_portid_hash_free(struct hlist_head *table, size_t size)
1027 {
1028 	if (size <= PAGE_SIZE)
1029 		kfree(table);
1030 	else
1031 		free_pages((unsigned long)table, get_order(size));
1032 }
1033 
1034 static int nl_portid_hash_rehash(struct nl_portid_hash *hash, int grow)
1035 {
1036 	unsigned int omask, mask, shift;
1037 	size_t osize, size;
1038 	struct hlist_head *otable, *table;
1039 	int i;
1040 
1041 	omask = mask = hash->mask;
1042 	osize = size = (mask + 1) * sizeof(*table);
1043 	shift = hash->shift;
1044 
1045 	if (grow) {
1046 		if (++shift > hash->max_shift)
1047 			return 0;
1048 		mask = mask * 2 + 1;
1049 		size *= 2;
1050 	}
1051 
1052 	table = nl_portid_hash_zalloc(size);
1053 	if (!table)
1054 		return 0;
1055 
1056 	otable = hash->table;
1057 	hash->table = table;
1058 	hash->mask = mask;
1059 	hash->shift = shift;
1060 	get_random_bytes(&hash->rnd, sizeof(hash->rnd));
1061 
1062 	for (i = 0; i <= omask; i++) {
1063 		struct sock *sk;
1064 		struct hlist_node *tmp;
1065 
1066 		sk_for_each_safe(sk, tmp, &otable[i])
1067 			__sk_add_node(sk, nl_portid_hashfn(hash, nlk_sk(sk)->portid));
1068 	}
1069 
1070 	nl_portid_hash_free(otable, osize);
1071 	hash->rehash_time = jiffies + 10 * 60 * HZ;
1072 	return 1;
1073 }
1074 
1075 static inline int nl_portid_hash_dilute(struct nl_portid_hash *hash, int len)
1076 {
1077 	int avg = hash->entries >> hash->shift;
1078 
1079 	if (unlikely(avg > 1) && nl_portid_hash_rehash(hash, 1))
1080 		return 1;
1081 
1082 	if (unlikely(len > avg) && time_after(jiffies, hash->rehash_time)) {
1083 		nl_portid_hash_rehash(hash, 0);
1084 		return 1;
1085 	}
1086 
1087 	return 0;
1088 }
1089 
1090 static const struct proto_ops netlink_ops;
1091 
1092 static void
1093 netlink_update_listeners(struct sock *sk)
1094 {
1095 	struct netlink_table *tbl = &nl_table[sk->sk_protocol];
1096 	unsigned long mask;
1097 	unsigned int i;
1098 	struct listeners *listeners;
1099 
1100 	listeners = nl_deref_protected(tbl->listeners);
1101 	if (!listeners)
1102 		return;
1103 
1104 	for (i = 0; i < NLGRPLONGS(tbl->groups); i++) {
1105 		mask = 0;
1106 		sk_for_each_bound(sk, &tbl->mc_list) {
1107 			if (i < NLGRPLONGS(nlk_sk(sk)->ngroups))
1108 				mask |= nlk_sk(sk)->groups[i];
1109 		}
1110 		listeners->masks[i] = mask;
1111 	}
1112 	/* this function is only called with the netlink table "grabbed", which
1113 	 * makes sure updates are visible before bind or setsockopt return. */
1114 }
1115 
1116 static int netlink_insert(struct sock *sk, struct net *net, u32 portid)
1117 {
1118 	struct netlink_table *table = &nl_table[sk->sk_protocol];
1119 	struct nl_portid_hash *hash = &table->hash;
1120 	struct hlist_head *head;
1121 	int err = -EADDRINUSE;
1122 	struct sock *osk;
1123 	int len;
1124 
1125 	netlink_table_grab();
1126 	head = nl_portid_hashfn(hash, portid);
1127 	len = 0;
1128 	sk_for_each(osk, head) {
1129 		if (table->compare(net, osk) &&
1130 		    (nlk_sk(osk)->portid == portid))
1131 			break;
1132 		len++;
1133 	}
1134 	if (osk)
1135 		goto err;
1136 
1137 	err = -EBUSY;
1138 	if (nlk_sk(sk)->portid)
1139 		goto err;
1140 
1141 	err = -ENOMEM;
1142 	if (BITS_PER_LONG > 32 && unlikely(hash->entries >= UINT_MAX))
1143 		goto err;
1144 
1145 	if (len && nl_portid_hash_dilute(hash, len))
1146 		head = nl_portid_hashfn(hash, portid);
1147 	hash->entries++;
1148 	nlk_sk(sk)->portid = portid;
1149 	sk_add_node(sk, head);
1150 	err = 0;
1151 
1152 err:
1153 	netlink_table_ungrab();
1154 	return err;
1155 }
1156 
1157 static void netlink_remove(struct sock *sk)
1158 {
1159 	netlink_table_grab();
1160 	if (sk_del_node_init(sk))
1161 		nl_table[sk->sk_protocol].hash.entries--;
1162 	if (nlk_sk(sk)->subscriptions)
1163 		__sk_del_bind_node(sk);
1164 	netlink_table_ungrab();
1165 }
1166 
/*
 * Protocol descriptor shared by all netlink sockets. It is handed to
 * sk_alloc() in __netlink_create() and to sock_prot_inuse_add() for usage
 * accounting; obj_size is the size of struct netlink_sock.
 */
static struct proto netlink_proto = {
	.name	  = "NETLINK",
	.owner	  = THIS_MODULE,
	.obj_size = sizeof(struct netlink_sock),
};
1172 
1173 static int __netlink_create(struct net *net, struct socket *sock,
1174 			    struct mutex *cb_mutex, int protocol)
1175 {
1176 	struct sock *sk;
1177 	struct netlink_sock *nlk;
1178 
1179 	sock->ops = &netlink_ops;
1180 
1181 	sk = sk_alloc(net, PF_NETLINK, GFP_KERNEL, &netlink_proto);
1182 	if (!sk)
1183 		return -ENOMEM;
1184 
1185 	sock_init_data(sock, sk);
1186 
1187 	nlk = nlk_sk(sk);
1188 	if (cb_mutex) {
1189 		nlk->cb_mutex = cb_mutex;
1190 	} else {
1191 		nlk->cb_mutex = &nlk->cb_def_mutex;
1192 		mutex_init(nlk->cb_mutex);
1193 	}
1194 	init_waitqueue_head(&nlk->wait);
1195 #ifdef CONFIG_NETLINK_MMAP
1196 	mutex_init(&nlk->pg_vec_lock);
1197 #endif
1198 
1199 	sk->sk_destruct = netlink_sock_destruct;
1200 	sk->sk_protocol = protocol;
1201 	return 0;
1202 }
1203 
1204 static int netlink_create(struct net *net, struct socket *sock, int protocol,
1205 			  int kern)
1206 {
1207 	struct module *module = NULL;
1208 	struct mutex *cb_mutex;
1209 	struct netlink_sock *nlk;
1210 	void (*bind)(int group);
1211 	int err = 0;
1212 
1213 	sock->state = SS_UNCONNECTED;
1214 
1215 	if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM)
1216 		return -ESOCKTNOSUPPORT;
1217 
1218 	if (protocol < 0 || protocol >= MAX_LINKS)
1219 		return -EPROTONOSUPPORT;
1220 
1221 	netlink_lock_table();
1222 #ifdef CONFIG_MODULES
1223 	if (!nl_table[protocol].registered) {
1224 		netlink_unlock_table();
1225 		request_module("net-pf-%d-proto-%d", PF_NETLINK, protocol);
1226 		netlink_lock_table();
1227 	}
1228 #endif
1229 	if (nl_table[protocol].registered &&
1230 	    try_module_get(nl_table[protocol].module))
1231 		module = nl_table[protocol].module;
1232 	else
1233 		err = -EPROTONOSUPPORT;
1234 	cb_mutex = nl_table[protocol].cb_mutex;
1235 	bind = nl_table[protocol].bind;
1236 	netlink_unlock_table();
1237 
1238 	if (err < 0)
1239 		goto out;
1240 
1241 	err = __netlink_create(net, sock, cb_mutex, protocol);
1242 	if (err < 0)
1243 		goto out_module;
1244 
1245 	local_bh_disable();
1246 	sock_prot_inuse_add(net, &netlink_proto, 1);
1247 	local_bh_enable();
1248 
1249 	nlk = nlk_sk(sock->sk);
1250 	nlk->module = module;
1251 	nlk->netlink_bind = bind;
1252 out:
1253 	return err;
1254 
1255 out_module:
1256 	module_put(module);
1257 	goto out;
1258 }
1259 
/*
 * Release a netlink socket: unlink it from the protocol tables, notify the
 * netlink notifier chain, and drop the resources held on its behalf.
 * Always returns 0.
 */
static int netlink_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct netlink_sock *nlk;

	/* No struct sock attached: nothing to tear down. */
	if (!sk)
		return 0;

	netlink_remove(sk);
	sock_orphan(sk);
	nlk = nlk_sk(sk);

	/*
	 * OK. Socket is unlinked, any packets that arrive now
	 * will be purged.
	 */

	sock->sk = NULL;
	/* Wake every task sleeping on this socket's wait queue. */
	wake_up_interruptible_all(&nlk->wait);

	skb_queue_purge(&sk->sk_write_queue);

	/* Only sockets that were bound to a portid are announced. */
	if (nlk->portid) {
		struct netlink_notify n = {
						.net = sock_net(sk),
						.protocol = sk->sk_protocol,
						.portid = nlk->portid,
					  };
		atomic_notifier_call_chain(&netlink_chain,
				NETLINK_URELEASE, &n);
	}

	/* Drop the module reference stored in nlk->module at creation. */
	module_put(nlk->module);

	netlink_table_grab();
	if (netlink_is_kernel(sk)) {
		BUG_ON(nl_table[sk->sk_protocol].registered == 0);
		/* Last kernel socket of this protocol: reset the table entry. */
		if (--nl_table[sk->sk_protocol].registered == 0) {
			struct listeners *old;

			/* Unpublish the listeners bitmap; free it via RCU. */
			old = nl_deref_protected(nl_table[sk->sk_protocol].listeners);
			RCU_INIT_POINTER(nl_table[sk->sk_protocol].listeners, NULL);
			kfree_rcu(old, rcu);
			nl_table[sk->sk_protocol].module = NULL;
			nl_table[sk->sk_protocol].bind = NULL;
			nl_table[sk->sk_protocol].flags = 0;
			/* Already zero after the decrement above. */
			nl_table[sk->sk_protocol].registered = 0;
		}
	} else if (nlk->subscriptions) {
		/*
		 * netlink_remove() took this socket off the multicast list,
		 * so recompute the listener masks without it.
		 */
		netlink_update_listeners(sk);
	}
	netlink_table_ungrab();

	kfree(nlk->groups);
	nlk->groups = NULL;

	local_bh_disable();
	sock_prot_inuse_add(sock_net(sk), &netlink_proto, -1);
	local_bh_enable();
	sock_put(sk);
	return 0;
}
1322 
1323 static int netlink_autobind(struct socket *sock)
1324 {
1325 	struct sock *sk = sock->sk;
1326 	struct net *net = sock_net(sk);
1327 	struct netlink_table *table = &nl_table[sk->sk_protocol];
1328 	struct nl_portid_hash *hash = &table->hash;
1329 	struct hlist_head *head;
1330 	struct sock *osk;
1331 	s32 portid = task_tgid_vnr(current);
1332 	int err;
1333 	static s32 rover = -4097;
1334 
1335 retry:
1336 	cond_resched();
1337 	netlink_table_grab();
1338 	head = nl_portid_hashfn(hash, portid);
1339 	sk_for_each(osk, head) {
1340 		if (!table->compare(net, osk))
1341 			continue;
1342 		if (nlk_sk(osk)->portid == portid) {
1343 			/* Bind collision, search negative portid values. */
1344 			portid = rover--;
1345 			if (rover > -4097)
1346 				rover = -4097;
1347 			netlink_table_ungrab();
1348 			goto retry;
1349 		}
1350 	}
1351 	netlink_table_ungrab();
1352 
1353 	err = netlink_insert(sk, net, portid);
1354 	if (err == -EADDRINUSE)
1355 		goto retry;
1356 
1357 	/* If 2 threads race to autobind, that is fine.  */
1358 	if (err == -EBUSY)
1359 		err = 0;
1360 
1361 	return err;
1362 }
1363 
1364 static inline int netlink_capable(const struct socket *sock, unsigned int flag)
1365 {
1366 	return (nl_table[sock->sk->sk_protocol].flags & flag) ||
1367 		ns_capable(sock_net(sock->sk)->user_ns, CAP_NET_ADMIN);
1368 }
1369 
1370 static void
1371 netlink_update_subscriptions(struct sock *sk, unsigned int subscriptions)
1372 {
1373 	struct netlink_sock *nlk = nlk_sk(sk);
1374 
1375 	if (nlk->subscriptions && !subscriptions)
1376 		__sk_del_bind_node(sk);
1377 	else if (!nlk->subscriptions && subscriptions)
1378 		sk_add_bind_node(sk, &nl_table[sk->sk_protocol].mc_list);
1379 	nlk->subscriptions = subscriptions;
1380 }
1381 
1382 static int netlink_realloc_groups(struct sock *sk)
1383 {
1384 	struct netlink_sock *nlk = nlk_sk(sk);
1385 	unsigned int groups;
1386 	unsigned long *new_groups;
1387 	int err = 0;
1388 
1389 	netlink_table_grab();
1390 
1391 	groups = nl_table[sk->sk_protocol].groups;
1392 	if (!nl_table[sk->sk_protocol].registered) {
1393 		err = -ENOENT;
1394 		goto out_unlock;
1395 	}
1396 
1397 	if (nlk->ngroups >= groups)
1398 		goto out_unlock;
1399 
1400 	new_groups = krealloc(nlk->groups, NLGRPSZ(groups), GFP_ATOMIC);
1401 	if (new_groups == NULL) {
1402 		err = -ENOMEM;
1403 		goto out_unlock;
1404 	}
1405 	memset((char *)new_groups + NLGRPSZ(nlk->ngroups), 0,
1406 	       NLGRPSZ(groups) - NLGRPSZ(nlk->ngroups));
1407 
1408 	nlk->groups = new_groups;
1409 	nlk->ngroups = groups;
1410  out_unlock:
1411 	netlink_table_ungrab();
1412 	return err;
1413 }
1414 
1415 static int netlink_bind(struct socket *sock, struct sockaddr *addr,
1416 			int addr_len)
1417 {
1418 	struct sock *sk = sock->sk;
1419 	struct net *net = sock_net(sk);
1420 	struct netlink_sock *nlk = nlk_sk(sk);
1421 	struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr;
1422 	int err;
1423 
1424 	if (addr_len < sizeof(struct sockaddr_nl))
1425 		return -EINVAL;
1426 
1427 	if (nladdr->nl_family != AF_NETLINK)
1428 		return -EINVAL;
1429 
1430 	/* Only superuser is allowed to listen multicasts */
1431 	if (nladdr->nl_groups) {
1432 		if (!netlink_capable(sock, NL_CFG_F_NONROOT_RECV))
1433 			return -EPERM;
1434 		err = netlink_realloc_groups(sk);
1435 		if (err)
1436 			return err;
1437 	}
1438 
1439 	if (nlk->portid) {
1440 		if (nladdr->nl_pid != nlk->portid)
1441 			return -EINVAL;
1442 	} else {
1443 		err = nladdr->nl_pid ?
1444 			netlink_insert(sk, net, nladdr->nl_pid) :
1445 			netlink_autobind(sock);
1446 		if (err)
1447 			return err;
1448 	}
1449 
1450 	if (!nladdr->nl_groups && (nlk->groups == NULL || !(u32)nlk->groups[0]))
1451 		return 0;
1452 
1453 	netlink_table_grab();
1454 	netlink_update_subscriptions(sk, nlk->subscriptions +
1455 					 hweight32(nladdr->nl_groups) -
1456 					 hweight32(nlk->groups[0]));
1457 	nlk->groups[0] = (nlk->groups[0] & ~0xffffffffUL) | nladdr->nl_groups;
1458 	netlink_update_listeners(sk);
1459 	netlink_table_ungrab();
1460 
1461 	if (nlk->netlink_bind && nlk->groups[0]) {
1462 		int i;
1463 
1464 		for (i=0; i<nlk->ngroups; i++) {
1465 			if (test_bit(i, nlk->groups))
1466 				nlk->netlink_bind(i);
1467 		}
1468 	}
1469 
1470 	return 0;
1471 }
1472 
1473 static int netlink_connect(struct socket *sock, struct sockaddr *addr,
1474 			   int alen, int flags)
1475 {
1476 	int err = 0;
1477 	struct sock *sk = sock->sk;
1478 	struct netlink_sock *nlk = nlk_sk(sk);
1479 	struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr;
1480 
1481 	if (alen < sizeof(addr->sa_family))
1482 		return -EINVAL;
1483 
1484 	if (addr->sa_family == AF_UNSPEC) {
1485 		sk->sk_state	= NETLINK_UNCONNECTED;
1486 		nlk->dst_portid	= 0;
1487 		nlk->dst_group  = 0;
1488 		return 0;
1489 	}
1490 	if (addr->sa_family != AF_NETLINK)
1491 		return -EINVAL;
1492 
1493 	/* Only superuser is allowed to send multicasts */
1494 	if (nladdr->nl_groups && !netlink_capable(sock, NL_CFG_F_NONROOT_SEND))
1495 		return -EPERM;
1496 
1497 	if (!nlk->portid)
1498 		err = netlink_autobind(sock);
1499 
1500 	if (err == 0) {
1501 		sk->sk_state	= NETLINK_CONNECTED;
1502 		nlk->dst_portid = nladdr->nl_pid;
1503 		nlk->dst_group  = ffs(nladdr->nl_groups);
1504 	}
1505 
1506 	return err;
1507 }
1508 
1509 static int netlink_getname(struct socket *sock, struct sockaddr *addr,
1510 			   int *addr_len, int peer)
1511 {
1512 	struct sock *sk = sock->sk;
1513 	struct netlink_sock *nlk = nlk_sk(sk);
1514 	DECLARE_SOCKADDR(struct sockaddr_nl *, nladdr, addr);
1515 
1516 	nladdr->nl_family = AF_NETLINK;
1517 	nladdr->nl_pad = 0;
1518 	*addr_len = sizeof(*nladdr);
1519 
1520 	if (peer) {
1521 		nladdr->nl_pid = nlk->dst_portid;
1522 		nladdr->nl_groups = netlink_group_mask(nlk->dst_group);
1523 	} else {
1524 		nladdr->nl_pid = nlk->portid;
1525 		nladdr->nl_groups = nlk->groups ? nlk->groups[0] : 0;
1526 	}
1527 	return 0;
1528 }
1529 
1530 static struct sock *netlink_getsockbyportid(struct sock *ssk, u32 portid)
1531 {
1532 	struct sock *sock;
1533 	struct netlink_sock *nlk;
1534 
1535 	sock = netlink_lookup(sock_net(ssk), ssk->sk_protocol, portid);
1536 	if (!sock)
1537 		return ERR_PTR(-ECONNREFUSED);
1538 
1539 	/* Don't bother queuing skb if kernel socket has no input function */
1540 	nlk = nlk_sk(sock);
1541 	if (sock->sk_state == NETLINK_CONNECTED &&
1542 	    nlk->dst_portid != nlk_sk(ssk)->portid) {
1543 		sock_put(sock);
1544 		return ERR_PTR(-ECONNREFUSED);
1545 	}
1546 	return sock;
1547 }
1548 
1549 struct sock *netlink_getsockbyfilp(struct file *filp)
1550 {
1551 	struct inode *inode = file_inode(filp);
1552 	struct sock *sock;
1553 
1554 	if (!S_ISSOCK(inode->i_mode))
1555 		return ERR_PTR(-ENOTSOCK);
1556 
1557 	sock = SOCKET_I(inode)->sk;
1558 	if (sock->sk_family != AF_NETLINK)
1559 		return ERR_PTR(-EINVAL);
1560 
1561 	sock_hold(sock);
1562 	return sock;
1563 }
1564 
1565 static struct sk_buff *netlink_alloc_large_skb(unsigned int size,
1566 					       int broadcast)
1567 {
1568 	struct sk_buff *skb;
1569 	void *data;
1570 
1571 	if (size <= NLMSG_GOODSIZE || broadcast)
1572 		return alloc_skb(size, GFP_KERNEL);
1573 
1574 	size = SKB_DATA_ALIGN(size) +
1575 	       SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
1576 
1577 	data = vmalloc(size);
1578 	if (data == NULL)
1579 		return NULL;
1580 
1581 	skb = build_skb(data, size);
1582 	if (skb == NULL)
1583 		vfree(data);
1584 	else {
1585 		skb->head_frag = 0;
1586 		skb->destructor = netlink_skb_destructor;
1587 	}
1588 
1589 	return skb;
1590 }
1591 
/*
 * Attach a skb to a netlink socket.
 * The caller must hold a reference to the destination socket. On error, the
 * reference is dropped. The skb is not sent to the destination; only the
 * error checks are performed and memory in the queue is reserved.
 * Return values:
 * < 0: error. skb freed, reference to sock dropped.
 * 0: continue
 * 1: repeat lookup - reference dropped while waiting for socket memory.
 *
 * @timeo is the remaining send timeout; it is updated when the function
 * sleeps. @ssk is the sending socket and may be NULL.
 */
int netlink_attachskb(struct sock *sk, struct sk_buff *skb,
		      long *timeo, struct sock *ssk)
{
	struct netlink_sock *nlk;

	nlk = nlk_sk(sk);

	/*
	 * Receiver is over its buffer limit or flagged congested. Memory
	 * mapped skbs skip this check entirely.
	 */
	if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
	     test_bit(NETLINK_CONGESTED, &nlk->state)) &&
	    !netlink_skb_is_mmaped(skb)) {
		DECLARE_WAITQUEUE(wait, current);
		/* No time left to wait: fail now. */
		if (!*timeo) {
			/* Overrun is reported only for kernel/unknown senders. */
			if (!ssk || netlink_is_kernel(ssk))
				netlink_overrun(sk);
			sock_put(sk);
			kfree_skb(skb);
			return -EAGAIN;
		}

		__set_current_state(TASK_INTERRUPTIBLE);
		add_wait_queue(&nlk->wait, &wait);

		/*
		 * Re-check after queueing on the wait queue, and sleep only if
		 * the socket is still congested and has not been marked dead.
		 */
		if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
		     test_bit(NETLINK_CONGESTED, &nlk->state)) &&
		    !sock_flag(sk, SOCK_DEAD))
			*timeo = schedule_timeout(*timeo);

		__set_current_state(TASK_RUNNING);
		remove_wait_queue(&nlk->wait, &wait);
		/* The reference is dropped; the caller must look sk up again. */
		sock_put(sk);

		if (signal_pending(current)) {
			kfree_skb(skb);
			return sock_intr_errno(*timeo);
		}
		return 1;
	}
	/* Room available: charge the skb to the receiver. */
	netlink_skb_set_owner_r(skb, sk);
	return 0;
}
1642 
/*
 * Hand @skb to @sk and signal that data is ready.
 *
 * The skb is first offered to netlink_deliver_tap(). With
 * CONFIG_NETLINK_MMAP it is then delivered through the ring helpers when
 * either the skb or the receiver is memory mapped; in every other case it
 * is appended to the receive queue. Does not drop the caller's reference
 * on @sk.
 *
 * Returns the length the skb had on entry.
 */
static int __netlink_sendskb(struct sock *sk, struct sk_buff *skb)
{
	/* Sample the length first; the skb is handed off below. */
	int len = skb->len;

	netlink_deliver_tap(skb);

#ifdef CONFIG_NETLINK_MMAP
	if (netlink_skb_is_mmaped(skb))
		netlink_queue_mmaped_skb(sk, skb);
	else if (netlink_rx_is_mmaped(sk))
		netlink_ring_set_copied(sk, skb);
	else
#endif /* CONFIG_NETLINK_MMAP */
		/* Body of the trailing "else" above when MMAP is enabled. */
		skb_queue_tail(&sk->sk_receive_queue, skb);
	sk->sk_data_ready(sk, len);
	return len;
}
1660 
/*
 * Deliver @skb to @sk via __netlink_sendskb() and then release the
 * caller's reference on @sk. Returns the length of the delivered skb.
 */
int netlink_sendskb(struct sock *sk, struct sk_buff *skb)
{
	int delivered = __netlink_sendskb(sk, skb);

	sock_put(sk);

	return delivered;
}
1668 
/*
 * Abandon a delivery: free @skb and release the caller's reference on @sk.
 * The skb is freed before the socket reference is dropped.
 */
void netlink_detachskb(struct sock *sk, struct sk_buff *skb)
{
	kfree_skb(skb);

	sock_put(sk);
}
1674 
1675 static struct sk_buff *netlink_trim(struct sk_buff *skb, gfp_t allocation)
1676 {
1677 	int delta;
1678 
1679 	WARN_ON(skb->sk != NULL);
1680 	if (netlink_skb_is_mmaped(skb))
1681 		return skb;
1682 
1683 	delta = skb->end - skb->tail;
1684 	if (is_vmalloc_addr(skb->head) || delta * 2 < skb->truesize)
1685 		return skb;
1686 
1687 	if (skb_shared(skb)) {
1688 		struct sk_buff *nskb = skb_clone(skb, allocation);
1689 		if (!nskb)
1690 			return skb;
1691 		consume_skb(skb);
1692 		skb = nskb;
1693 	}
1694 
1695 	if (!pskb_expand_head(skb, 0, -delta, allocation))
1696 		skb->truesize -= delta;
1697 
1698 	return skb;
1699 }
1700 
/*
 * Deliver @skb from @ssk to the kernel-side socket @sk by calling its
 * receive callback directly.
 *
 * Consumes both the skb and the caller's reference on @sk. Returns the
 * skb length on delivery, or -ECONNREFUSED when @sk has no receive
 * callback.
 */
static int netlink_unicast_kernel(struct sock *sk, struct sk_buff *skb,
				  struct sock *ssk)
{
	struct netlink_sock *nlk = nlk_sk(sk);
	int ret;

	if (nlk->netlink_rcv == NULL) {
		kfree_skb(skb);
		ret = -ECONNREFUSED;
	} else {
		/* Read the length before the callback gets the skb. */
		ret = skb->len;
		netlink_skb_set_owner_r(skb, sk);
		NETLINK_CB(skb).sk = ssk;
		netlink_deliver_tap_kernel(sk, ssk, skb);
		nlk->netlink_rcv(skb);
		consume_skb(skb);
	}

	sock_put(sk);
	return ret;
}
1721 
1722 int netlink_unicast(struct sock *ssk, struct sk_buff *skb,
1723 		    u32 portid, int nonblock)
1724 {
1725 	struct sock *sk;
1726 	int err;
1727 	long timeo;
1728 
1729 	skb = netlink_trim(skb, gfp_any());
1730 
1731 	timeo = sock_sndtimeo(ssk, nonblock);
1732 retry:
1733 	sk = netlink_getsockbyportid(ssk, portid);
1734 	if (IS_ERR(sk)) {
1735 		kfree_skb(skb);
1736 		return PTR_ERR(sk);
1737 	}
1738 	if (netlink_is_kernel(sk))
1739 		return netlink_unicast_kernel(sk, skb, ssk);
1740 
1741 	if (sk_filter(sk, skb)) {
1742 		err = skb->len;
1743 		kfree_skb(skb);
1744 		sock_put(sk);
1745 		return err;
1746 	}
1747 
1748 	err = netlink_attachskb(sk, skb, &timeo, ssk);
1749 	if (err == 1)
1750 		goto retry;
1751 	if (err)
1752 		return err;
1753 
1754 	return netlink_sendskb(sk, skb);
1755 }
1756 EXPORT_SYMBOL(netlink_unicast);
1757 
/*
 * Allocate an skb for a message of @size bytes destined for @dst_portid.
 *
 * With CONFIG_NETLINK_MMAP, if the destination socket has an RX ring whose
 * frames are large enough, the skb is set up on the next unused ring frame.
 * If the ring has no unused frame, or the skb head cannot be allocated,
 * NULL is returned. In every other case (no such socket, no ring, frame
 * too small, or MMAP not configured) a regular alloc_skb() is used.
 */
struct sk_buff *netlink_alloc_skb(struct sock *ssk, unsigned int size,
				  u32 dst_portid, gfp_t gfp_mask)
{
#ifdef CONFIG_NETLINK_MMAP
	struct sock *sk = NULL;
	struct sk_buff *skb;
	struct netlink_ring *ring;
	struct nl_mmap_hdr *hdr;
	unsigned int maxlen;

	sk = netlink_getsockbyportid(ssk, dst_portid);
	if (IS_ERR(sk))
		goto out;

	ring = &nlk_sk(sk)->rx_ring;
	/* fast-path without atomic ops for common case: non-mmaped receiver */
	if (ring->pg_vec == NULL)
		goto out_put;

	skb = alloc_skb_head(gfp_mask);
	if (skb == NULL)
		goto err1;

	spin_lock_bh(&sk->sk_receive_queue.lock);
	/* check again under lock */
	if (ring->pg_vec == NULL)
		goto out_free;

	/* Payload capacity of one ring frame. */
	maxlen = ring->frame_size - NL_MMAP_HDRLEN;
	if (maxlen < size)
		goto out_free;

	netlink_forward_ring(ring);
	hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED);
	if (hdr == NULL)
		goto err2;
	netlink_ring_setup_skb(skb, sk, ring, hdr);
	netlink_set_status(hdr, NL_MMAP_STATUS_RESERVED);
	atomic_inc(&ring->pending);
	netlink_increment_head(ring);

	spin_unlock_bh(&sk->sk_receive_queue.lock);
	/*
	 * NOTE(review): the reference on sk taken by the lookup is not
	 * dropped on this path; presumably it is owned by the skb from here
	 * on. Confirm against netlink_ring_setup_skb() and the destructor.
	 */
	return skb;

	/* Ring is full: report an overrun and fail the allocation. */
err2:
	kfree_skb(skb);
	spin_unlock_bh(&sk->sk_receive_queue.lock);
	netlink_overrun(sk);
err1:
	sock_put(sk);
	return NULL;

	/* Ring unusable for this message: fall back to a regular skb. */
out_free:
	kfree_skb(skb);
	spin_unlock_bh(&sk->sk_receive_queue.lock);
out_put:
	sock_put(sk);
out:
#endif
	return alloc_skb(size, gfp_mask);
}
EXPORT_SYMBOL_GPL(netlink_alloc_skb);
1820 
1821 int netlink_has_listeners(struct sock *sk, unsigned int group)
1822 {
1823 	int res = 0;
1824 	struct listeners *listeners;
1825 
1826 	BUG_ON(!netlink_is_kernel(sk));
1827 
1828 	rcu_read_lock();
1829 	listeners = rcu_dereference(nl_table[sk->sk_protocol].listeners);
1830 
1831 	if (listeners && group - 1 < nl_table[sk->sk_protocol].groups)
1832 		res = test_bit(group - 1, listeners->masks);
1833 
1834 	rcu_read_unlock();
1835 
1836 	return res;
1837 }
1838 EXPORT_SYMBOL_GPL(netlink_has_listeners);
1839 
1840 static int netlink_broadcast_deliver(struct sock *sk, struct sk_buff *skb)
1841 {
1842 	struct netlink_sock *nlk = nlk_sk(sk);
1843 
1844 	if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
1845 	    !test_bit(NETLINK_CONGESTED, &nlk->state)) {
1846 		netlink_skb_set_owner_r(skb, sk);
1847 		__netlink_sendskb(sk, skb);
1848 		return atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1);
1849 	}
1850 	return -1;
1851 }
1852 
/*
 * State shared between netlink_broadcast_filtered() and the per-socket
 * helper do_one_broadcast() while one message is being broadcast.
 */
struct netlink_broadcast_data {
	struct sock *exclude_sk;	/* sender; never delivered to */
	struct net *net;		/* only sockets in this netns receive */
	u32 portid;			/* sockets bound to this portid are skipped */
	u32 group;			/* destination group number (1-based) */
	int failure;			/* set once cloning the skb has failed */
	int delivery_failure;		/* a NETLINK_BROADCAST_SEND_ERROR socket missed it */
	int congested;			/* OR of netlink_broadcast_deliver() results */
	int delivered;			/* at least one socket got the message */
	gfp_t allocation;		/* allocation flags used for skb_clone() */
	struct sk_buff *skb, *skb2;	/* original skb; copy pending delivery */
	/* optional filter; a non-zero return skips delivery to dsk */
	int (*tx_filter)(struct sock *dsk, struct sk_buff *skb, void *data);
	void *tx_data;			/* opaque argument passed to tx_filter */
};
1867 
/*
 * Deliver the broadcast described by @p to one candidate socket @sk.
 *
 * p->skb2 holds the copy that is about to be delivered. It is created on
 * demand and set back to NULL once a socket has taken or rejected it; when
 * delivery fails because the receiver is full, it is kept for the next
 * candidate. Always returns 0; results are reported through @p.
 */
static int do_one_broadcast(struct sock *sk,
				   struct netlink_broadcast_data *p)
{
	struct netlink_sock *nlk = nlk_sk(sk);
	int val;

	/* Never echo the message back to the sender. */
	if (p->exclude_sk == sk)
		goto out;

	/* Skip the excluded portid and sockets not subscribed to the group. */
	if (nlk->portid == p->portid || p->group - 1 >= nlk->ngroups ||
	    !test_bit(p->group - 1, nlk->groups))
		goto out;

	if (!net_eq(sock_net(sk), p->net))
		goto out;

	/* An earlier clone failed: every remaining listener is overrun. */
	if (p->failure) {
		netlink_overrun(sk);
		goto out;
	}

	sock_hold(sk);
	if (p->skb2 == NULL) {
		if (skb_shared(p->skb)) {
			p->skb2 = skb_clone(p->skb, p->allocation);
		} else {
			p->skb2 = skb_get(p->skb);
			/*
			 * skb ownership may have been set when
			 * delivered to a previous socket.
			 */
			skb_orphan(p->skb2);
		}
	}
	if (p->skb2 == NULL) {
		netlink_overrun(sk);
		/* Clone failed. Notify ALL listeners. */
		p->failure = 1;
		if (nlk->flags & NETLINK_BROADCAST_SEND_ERROR)
			p->delivery_failure = 1;
	} else if (p->tx_filter && p->tx_filter(sk, p->skb2, p->tx_data)) {
		/* Rejected by the caller's filter: drop this copy. */
		kfree_skb(p->skb2);
		p->skb2 = NULL;
	} else if (sk_filter(sk, p->skb2)) {
		/* Rejected by the receiver's socket filter: drop this copy. */
		kfree_skb(p->skb2);
		p->skb2 = NULL;
	} else if ((val = netlink_broadcast_deliver(sk, p->skb2)) < 0) {
		/* Receiver full: skb2 is kept for the next candidate. */
		netlink_overrun(sk);
		if (nlk->flags & NETLINK_BROADCAST_SEND_ERROR)
			p->delivery_failure = 1;
	} else {
		/* Delivered: the copy now belongs to the receiver. */
		p->congested |= val;
		p->delivered = 1;
		p->skb2 = NULL;
	}
	sock_put(sk);

out:
	return 0;
}
1928 
1929 int netlink_broadcast_filtered(struct sock *ssk, struct sk_buff *skb, u32 portid,
1930 	u32 group, gfp_t allocation,
1931 	int (*filter)(struct sock *dsk, struct sk_buff *skb, void *data),
1932 	void *filter_data)
1933 {
1934 	struct net *net = sock_net(ssk);
1935 	struct netlink_broadcast_data info;
1936 	struct sock *sk;
1937 
1938 	skb = netlink_trim(skb, allocation);
1939 
1940 	info.exclude_sk = ssk;
1941 	info.net = net;
1942 	info.portid = portid;
1943 	info.group = group;
1944 	info.failure = 0;
1945 	info.delivery_failure = 0;
1946 	info.congested = 0;
1947 	info.delivered = 0;
1948 	info.allocation = allocation;
1949 	info.skb = skb;
1950 	info.skb2 = NULL;
1951 	info.tx_filter = filter;
1952 	info.tx_data = filter_data;
1953 
1954 	/* While we sleep in clone, do not allow to change socket list */
1955 
1956 	netlink_lock_table();
1957 
1958 	sk_for_each_bound(sk, &nl_table[ssk->sk_protocol].mc_list)
1959 		do_one_broadcast(sk, &info);
1960 
1961 	consume_skb(skb);
1962 
1963 	netlink_unlock_table();
1964 
1965 	if (info.delivery_failure) {
1966 		kfree_skb(info.skb2);
1967 		return -ENOBUFS;
1968 	}
1969 	consume_skb(info.skb2);
1970 
1971 	if (info.delivered) {
1972 		if (info.congested && (allocation & __GFP_WAIT))
1973 			yield();
1974 		return 0;
1975 	}
1976 	return -ESRCH;
1977 }
1978 EXPORT_SYMBOL(netlink_broadcast_filtered);
1979 
1980 int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 portid,
1981 		      u32 group, gfp_t allocation)
1982 {
1983 	return netlink_broadcast_filtered(ssk, skb, portid, group, allocation,
1984 		NULL, NULL);
1985 }
1986 EXPORT_SYMBOL(netlink_broadcast);
1987 
/* Arguments passed from netlink_set_err() to do_one_set_err(). */
struct netlink_set_err_data {
	struct sock *exclude_sk;	/* reporting socket; skipped, defines the netns */
	u32 portid;			/* sockets bound to this portid are skipped */
	u32 group;			/* group number (1-based) whose listeners are notified */
	int code;			/* positive errno value stored in sk->sk_err */
};
1994 
1995 static int do_one_set_err(struct sock *sk, struct netlink_set_err_data *p)
1996 {
1997 	struct netlink_sock *nlk = nlk_sk(sk);
1998 	int ret = 0;
1999 
2000 	if (sk == p->exclude_sk)
2001 		goto out;
2002 
2003 	if (!net_eq(sock_net(sk), sock_net(p->exclude_sk)))
2004 		goto out;
2005 
2006 	if (nlk->portid == p->portid || p->group - 1 >= nlk->ngroups ||
2007 	    !test_bit(p->group - 1, nlk->groups))
2008 		goto out;
2009 
2010 	if (p->code == ENOBUFS && nlk->flags & NETLINK_RECV_NO_ENOBUFS) {
2011 		ret = 1;
2012 		goto out;
2013 	}
2014 
2015 	sk->sk_err = p->code;
2016 	sk->sk_error_report(sk);
2017 out:
2018 	return ret;
2019 }
2020 
2021 /**
2022  * netlink_set_err - report error to broadcast listeners
2023  * @ssk: the kernel netlink socket, as returned by netlink_kernel_create()
2024  * @portid: the PORTID of a process that we want to skip (if any)
2025  * @group: the broadcast group that will notice the error
2026  * @code: error code, must be negative (as usual in kernelspace)
2027  *
2028  * This function returns the number of broadcast listeners that have set the
2029  * NETLINK_RECV_NO_ENOBUFS socket option.
2030  */
2031 int netlink_set_err(struct sock *ssk, u32 portid, u32 group, int code)
2032 {
2033 	struct netlink_set_err_data info;
2034 	struct sock *sk;
2035 	int ret = 0;
2036 
2037 	info.exclude_sk = ssk;
2038 	info.portid = portid;
2039 	info.group = group;
2040 	/* sk->sk_err wants a positive error value */
2041 	info.code = -code;
2042 
2043 	read_lock(&nl_table_lock);
2044 
2045 	sk_for_each_bound(sk, &nl_table[ssk->sk_protocol].mc_list)
2046 		ret += do_one_set_err(sk, &info);
2047 
2048 	read_unlock(&nl_table_lock);
2049 	return ret;
2050 }
2051 EXPORT_SYMBOL(netlink_set_err);
2052 
2053 /* must be called with netlink table grabbed */
2054 static void netlink_update_socket_mc(struct netlink_sock *nlk,
2055 				     unsigned int group,
2056 				     int is_new)
2057 {
2058 	int old, new = !!is_new, subscriptions;
2059 
2060 	old = test_bit(group - 1, nlk->groups);
2061 	subscriptions = nlk->subscriptions - old + new;
2062 	if (new)
2063 		__set_bit(group - 1, nlk->groups);
2064 	else
2065 		__clear_bit(group - 1, nlk->groups);
2066 	netlink_update_subscriptions(&nlk->sk, subscriptions);
2067 	netlink_update_listeners(&nlk->sk);
2068 }
2069 
/*
 * setsockopt() for SOL_NETLINK.
 *
 * Integer options (NETLINK_PKTINFO, NETLINK_ADD_MEMBERSHIP,
 * NETLINK_DROP_MEMBERSHIP, NETLINK_BROADCAST_ERROR, NETLINK_NO_ENOBUFS)
 * read an unsigned int from @optval. The ring options read a struct
 * nl_mmap_req instead and exist only with CONFIG_NETLINK_MMAP.
 *
 * Returns 0 on success, -ENOPROTOOPT for an unknown level or option,
 * -EFAULT if user memory cannot be read, -EPERM when the caller lacks the
 * required privilege, -EINVAL for an invalid group or short ring request,
 * or the error from the helper that was called.
 */
static int netlink_setsockopt(struct socket *sock, int level, int optname,
			      char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	struct netlink_sock *nlk = nlk_sk(sk);
	unsigned int val = 0;
	int err;

	if (level != SOL_NETLINK)
		return -ENOPROTOOPT;

	/*
	 * Fetch the integer argument for the non-ring options. If optlen is
	 * smaller than an int, nothing is read and val stays 0.
	 */
	if (optname != NETLINK_RX_RING && optname != NETLINK_TX_RING &&
	    optlen >= sizeof(int) &&
	    get_user(val, (unsigned int __user *)optval))
		return -EFAULT;

	switch (optname) {
	case NETLINK_PKTINFO:
		if (val)
			nlk->flags |= NETLINK_RECV_PKTINFO;
		else
			nlk->flags &= ~NETLINK_RECV_PKTINFO;
		err = 0;
		break;
	case NETLINK_ADD_MEMBERSHIP:
	case NETLINK_DROP_MEMBERSHIP: {
		if (!netlink_capable(sock, NL_CFG_F_NONROOT_RECV))
			return -EPERM;
		err = netlink_realloc_groups(sk);
		if (err)
			return err;
		/* val is a 1-based group number; 0 is not a valid group. */
		if (!val || val - 1 >= nlk->ngroups)
			return -EINVAL;
		netlink_table_grab();
		netlink_update_socket_mc(nlk, val,
					 optname == NETLINK_ADD_MEMBERSHIP);
		netlink_table_ungrab();

		/*
		 * NOTE(review): the bind callback runs for both ADD and DROP;
		 * confirm that this is intended for DROP.
		 */
		if (nlk->netlink_bind)
			nlk->netlink_bind(val);

		err = 0;
		break;
	}
	case NETLINK_BROADCAST_ERROR:
		if (val)
			nlk->flags |= NETLINK_BROADCAST_SEND_ERROR;
		else
			nlk->flags &= ~NETLINK_BROADCAST_SEND_ERROR;
		err = 0;
		break;
	case NETLINK_NO_ENOBUFS:
		if (val) {
			nlk->flags |= NETLINK_RECV_NO_ENOBUFS;
			/* Clear the congestion mark and wake blocked senders. */
			clear_bit(NETLINK_CONGESTED, &nlk->state);
			wake_up_interruptible(&nlk->wait);
		} else {
			nlk->flags &= ~NETLINK_RECV_NO_ENOBUFS;
		}
		err = 0;
		break;
#ifdef CONFIG_NETLINK_MMAP
	case NETLINK_RX_RING:
	case NETLINK_TX_RING: {
		struct nl_mmap_req req;

		/* Rings might consume more memory than queue limits, require
		 * CAP_NET_ADMIN.
		 */
		if (!capable(CAP_NET_ADMIN))
			return -EPERM;
		if (optlen < sizeof(req))
			return -EINVAL;
		if (copy_from_user(&req, optval, sizeof(req)))
			return -EFAULT;
		err = netlink_set_ring(sk, &req, false,
				       optname == NETLINK_TX_RING);
		break;
	}
#endif /* CONFIG_NETLINK_MMAP */
	default:
		err = -ENOPROTOOPT;
	}
	return err;
}
2155 
2156 static int netlink_getsockopt(struct socket *sock, int level, int optname,
2157 			      char __user *optval, int __user *optlen)
2158 {
2159 	struct sock *sk = sock->sk;
2160 	struct netlink_sock *nlk = nlk_sk(sk);
2161 	int len, val, err;
2162 
2163 	if (level != SOL_NETLINK)
2164 		return -ENOPROTOOPT;
2165 
2166 	if (get_user(len, optlen))
2167 		return -EFAULT;
2168 	if (len < 0)
2169 		return -EINVAL;
2170 
2171 	switch (optname) {
2172 	case NETLINK_PKTINFO:
2173 		if (len < sizeof(int))
2174 			return -EINVAL;
2175 		len = sizeof(int);
2176 		val = nlk->flags & NETLINK_RECV_PKTINFO ? 1 : 0;
2177 		if (put_user(len, optlen) ||
2178 		    put_user(val, optval))
2179 			return -EFAULT;
2180 		err = 0;
2181 		break;
2182 	case NETLINK_BROADCAST_ERROR:
2183 		if (len < sizeof(int))
2184 			return -EINVAL;
2185 		len = sizeof(int);
2186 		val = nlk->flags & NETLINK_BROADCAST_SEND_ERROR ? 1 : 0;
2187 		if (put_user(len, optlen) ||
2188 		    put_user(val, optval))
2189 			return -EFAULT;
2190 		err = 0;
2191 		break;
2192 	case NETLINK_NO_ENOBUFS:
2193 		if (len < sizeof(int))
2194 			return -EINVAL;
2195 		len = sizeof(int);
2196 		val = nlk->flags & NETLINK_RECV_NO_ENOBUFS ? 1 : 0;
2197 		if (put_user(len, optlen) ||
2198 		    put_user(val, optval))
2199 			return -EFAULT;
2200 		err = 0;
2201 		break;
2202 	default:
2203 		err = -ENOPROTOOPT;
2204 	}
2205 	return err;
2206 }
2207 
2208 static void netlink_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb)
2209 {
2210 	struct nl_pktinfo info;
2211 
2212 	info.group = NETLINK_CB(skb).dst_group;
2213 	put_cmsg(msg, SOL_NETLINK, NETLINK_PKTINFO, sizeof(info), &info);
2214 }
2215 
2216 static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock,
2217 			   struct msghdr *msg, size_t len)
2218 {
2219 	struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
2220 	struct sock *sk = sock->sk;
2221 	struct netlink_sock *nlk = nlk_sk(sk);
2222 	struct sockaddr_nl *addr = msg->msg_name;
2223 	u32 dst_portid;
2224 	u32 dst_group;
2225 	struct sk_buff *skb;
2226 	int err;
2227 	struct scm_cookie scm;
2228 
2229 	if (msg->msg_flags&MSG_OOB)
2230 		return -EOPNOTSUPP;
2231 
2232 	if (NULL == siocb->scm)
2233 		siocb->scm = &scm;
2234 
2235 	err = scm_send(sock, msg, siocb->scm, true);
2236 	if (err < 0)
2237 		return err;
2238 
2239 	if (msg->msg_namelen) {
2240 		err = -EINVAL;
2241 		if (addr->nl_family != AF_NETLINK)
2242 			goto out;
2243 		dst_portid = addr->nl_pid;
2244 		dst_group = ffs(addr->nl_groups);
2245 		err =  -EPERM;
2246 		if ((dst_group || dst_portid) &&
2247 		    !netlink_capable(sock, NL_CFG_F_NONROOT_SEND))
2248 			goto out;
2249 	} else {
2250 		dst_portid = nlk->dst_portid;
2251 		dst_group = nlk->dst_group;
2252 	}
2253 
2254 	if (!nlk->portid) {
2255 		err = netlink_autobind(sock);
2256 		if (err)
2257 			goto out;
2258 	}
2259 
2260 	if (netlink_tx_is_mmaped(sk) &&
2261 	    msg->msg_iov->iov_base == NULL) {
2262 		err = netlink_mmap_sendmsg(sk, msg, dst_portid, dst_group,
2263 					   siocb);
2264 		goto out;
2265 	}
2266 
2267 	err = -EMSGSIZE;
2268 	if (len > sk->sk_sndbuf - 32)
2269 		goto out;
2270 	err = -ENOBUFS;
2271 	skb = netlink_alloc_large_skb(len, dst_group);
2272 	if (skb == NULL)
2273 		goto out;
2274 
2275 	NETLINK_CB(skb).portid	= nlk->portid;
2276 	NETLINK_CB(skb).dst_group = dst_group;
2277 	NETLINK_CB(skb).creds	= siocb->scm->creds;
2278 
2279 	err = -EFAULT;
2280 	if (memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len)) {
2281 		kfree_skb(skb);
2282 		goto out;
2283 	}
2284 
2285 	err = security_netlink_send(sk, skb);
2286 	if (err) {
2287 		kfree_skb(skb);
2288 		goto out;
2289 	}
2290 
2291 	if (dst_group) {
2292 		atomic_inc(&skb->users);
2293 		netlink_broadcast(sk, skb, dst_portid, dst_group, GFP_KERNEL);
2294 	}
2295 	err = netlink_unicast(sk, skb, dst_portid, msg->msg_flags&MSG_DONTWAIT);
2296 
2297 out:
2298 	scm_destroy(siocb->scm);
2299 	return err;
2300 }
2301 
2302 static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock,
2303 			   struct msghdr *msg, size_t len,
2304 			   int flags)
2305 {
2306 	struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
2307 	struct scm_cookie scm;
2308 	struct sock *sk = sock->sk;
2309 	struct netlink_sock *nlk = nlk_sk(sk);
2310 	int noblock = flags&MSG_DONTWAIT;
2311 	size_t copied;
2312 	struct sk_buff *skb, *data_skb;
2313 	int err, ret;
2314 
2315 	if (flags&MSG_OOB)
2316 		return -EOPNOTSUPP;
2317 
2318 	copied = 0;
2319 
2320 	skb = skb_recv_datagram(sk, flags, noblock, &err);
2321 	if (skb == NULL)
2322 		goto out;
2323 
2324 	data_skb = skb;
2325 
2326 #ifdef CONFIG_COMPAT_NETLINK_MESSAGES
2327 	if (unlikely(skb_shinfo(skb)->frag_list)) {
2328 		/*
2329 		 * If this skb has a frag_list, then here that means that we
2330 		 * will have to use the frag_list skb's data for compat tasks
2331 		 * and the regular skb's data for normal (non-compat) tasks.
2332 		 *
2333 		 * If we need to send the compat skb, assign it to the
2334 		 * 'data_skb' variable so that it will be used below for data
2335 		 * copying. We keep 'skb' for everything else, including
2336 		 * freeing both later.
2337 		 */
2338 		if (flags & MSG_CMSG_COMPAT)
2339 			data_skb = skb_shinfo(skb)->frag_list;
2340 	}
2341 #endif
2342 
2343 	copied = data_skb->len;
2344 	if (len < copied) {
2345 		msg->msg_flags |= MSG_TRUNC;
2346 		copied = len;
2347 	}
2348 
2349 	skb_reset_transport_header(data_skb);
2350 	err = skb_copy_datagram_iovec(data_skb, 0, msg->msg_iov, copied);
2351 
2352 	if (msg->msg_name) {
2353 		struct sockaddr_nl *addr = (struct sockaddr_nl *)msg->msg_name;
2354 		addr->nl_family = AF_NETLINK;
2355 		addr->nl_pad    = 0;
2356 		addr->nl_pid	= NETLINK_CB(skb).portid;
2357 		addr->nl_groups	= netlink_group_mask(NETLINK_CB(skb).dst_group);
2358 		msg->msg_namelen = sizeof(*addr);
2359 	}
2360 
2361 	if (nlk->flags & NETLINK_RECV_PKTINFO)
2362 		netlink_cmsg_recv_pktinfo(msg, skb);
2363 
2364 	if (NULL == siocb->scm) {
2365 		memset(&scm, 0, sizeof(scm));
2366 		siocb->scm = &scm;
2367 	}
2368 	siocb->scm->creds = *NETLINK_CREDS(skb);
2369 	if (flags & MSG_TRUNC)
2370 		copied = data_skb->len;
2371 
2372 	skb_free_datagram(sk, skb);
2373 
2374 	if (nlk->cb_running &&
2375 	    atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf / 2) {
2376 		ret = netlink_dump(sk);
2377 		if (ret) {
2378 			sk->sk_err = ret;
2379 			sk->sk_error_report(sk);
2380 		}
2381 	}
2382 
2383 	scm_recv(sock, msg, siocb->scm, flags);
2384 out:
2385 	netlink_rcv_wake(sk);
2386 	return err ? : copied;
2387 }
2388 
/*
 * sk_data_ready handler installed on sockets created by
 * __netlink_kernel_create().  Reaching it is treated as a fatal bug.
 */
static void netlink_data_ready(struct sock *sk, int len)
{
	BUG();
}
2393 
2394 /*
2395  *	We export these functions to other modules. They provide a
2396  *	complete set of kernel non-blocking support for message
2397  *	queueing.
2398  */
2399 
2400 struct sock *
2401 __netlink_kernel_create(struct net *net, int unit, struct module *module,
2402 			struct netlink_kernel_cfg *cfg)
2403 {
2404 	struct socket *sock;
2405 	struct sock *sk;
2406 	struct netlink_sock *nlk;
2407 	struct listeners *listeners = NULL;
2408 	struct mutex *cb_mutex = cfg ? cfg->cb_mutex : NULL;
2409 	unsigned int groups;
2410 
2411 	BUG_ON(!nl_table);
2412 
2413 	if (unit < 0 || unit >= MAX_LINKS)
2414 		return NULL;
2415 
2416 	if (sock_create_lite(PF_NETLINK, SOCK_DGRAM, unit, &sock))
2417 		return NULL;
2418 
2419 	/*
2420 	 * We have to just have a reference on the net from sk, but don't
2421 	 * get_net it. Besides, we cannot get and then put the net here.
2422 	 * So we create one inside init_net and the move it to net.
2423 	 */
2424 
2425 	if (__netlink_create(&init_net, sock, cb_mutex, unit) < 0)
2426 		goto out_sock_release_nosk;
2427 
2428 	sk = sock->sk;
2429 	sk_change_net(sk, net);
2430 
2431 	if (!cfg || cfg->groups < 32)
2432 		groups = 32;
2433 	else
2434 		groups = cfg->groups;
2435 
2436 	listeners = kzalloc(sizeof(*listeners) + NLGRPSZ(groups), GFP_KERNEL);
2437 	if (!listeners)
2438 		goto out_sock_release;
2439 
2440 	sk->sk_data_ready = netlink_data_ready;
2441 	if (cfg && cfg->input)
2442 		nlk_sk(sk)->netlink_rcv = cfg->input;
2443 
2444 	if (netlink_insert(sk, net, 0))
2445 		goto out_sock_release;
2446 
2447 	nlk = nlk_sk(sk);
2448 	nlk->flags |= NETLINK_KERNEL_SOCKET;
2449 
2450 	netlink_table_grab();
2451 	if (!nl_table[unit].registered) {
2452 		nl_table[unit].groups = groups;
2453 		rcu_assign_pointer(nl_table[unit].listeners, listeners);
2454 		nl_table[unit].cb_mutex = cb_mutex;
2455 		nl_table[unit].module = module;
2456 		if (cfg) {
2457 			nl_table[unit].bind = cfg->bind;
2458 			nl_table[unit].flags = cfg->flags;
2459 			if (cfg->compare)
2460 				nl_table[unit].compare = cfg->compare;
2461 		}
2462 		nl_table[unit].registered = 1;
2463 	} else {
2464 		kfree(listeners);
2465 		nl_table[unit].registered++;
2466 	}
2467 	netlink_table_ungrab();
2468 	return sk;
2469 
2470 out_sock_release:
2471 	kfree(listeners);
2472 	netlink_kernel_release(sk);
2473 	return NULL;
2474 
2475 out_sock_release_nosk:
2476 	sock_release(sock);
2477 	return NULL;
2478 }
2479 EXPORT_SYMBOL(__netlink_kernel_create);
2480 
/* Release a kernel netlink socket; thin wrapper around sk_release_kernel(). */
void netlink_kernel_release(struct sock *sk)
{
	sk_release_kernel(sk);
}
EXPORT_SYMBOL(netlink_kernel_release);
2487 
2488 int __netlink_change_ngroups(struct sock *sk, unsigned int groups)
2489 {
2490 	struct listeners *new, *old;
2491 	struct netlink_table *tbl = &nl_table[sk->sk_protocol];
2492 
2493 	if (groups < 32)
2494 		groups = 32;
2495 
2496 	if (NLGRPSZ(tbl->groups) < NLGRPSZ(groups)) {
2497 		new = kzalloc(sizeof(*new) + NLGRPSZ(groups), GFP_ATOMIC);
2498 		if (!new)
2499 			return -ENOMEM;
2500 		old = nl_deref_protected(tbl->listeners);
2501 		memcpy(new->masks, old->masks, NLGRPSZ(tbl->groups));
2502 		rcu_assign_pointer(tbl->listeners, new);
2503 
2504 		kfree_rcu(old, rcu);
2505 	}
2506 	tbl->groups = groups;
2507 
2508 	return 0;
2509 }
2510 
/**
 * netlink_change_ngroups - change number of multicast groups
 *
 * Changes how many multicast groups are available on a netlink family.
 * The count cannot be lowered below 32.  Reducing the count does not
 * implicitly call netlink_clear_multicast_users().
 *
 * Takes the netlink table lock around __netlink_change_ngroups() and
 * returns that function's result.
 *
 * @sk: The kernel netlink socket, as returned by netlink_kernel_create().
 * @groups: The new number of groups.
 */
int netlink_change_ngroups(struct sock *sk, unsigned int groups)
{
	int ret;

	netlink_table_grab();
	ret = __netlink_change_ngroups(sk, groups);
	netlink_table_ungrab();
	return ret;
}
2533 
2534 void __netlink_clear_multicast_users(struct sock *ksk, unsigned int group)
2535 {
2536 	struct sock *sk;
2537 	struct netlink_table *tbl = &nl_table[ksk->sk_protocol];
2538 
2539 	sk_for_each_bound(sk, &tbl->mc_list)
2540 		netlink_update_socket_mc(nlk_sk(sk), group, 0);
2541 }
2542 
/**
 * netlink_clear_multicast_users - kick off multicast listeners
 *
 * Removes every listener from the given group.  The netlink table lock is
 * taken around the call to __netlink_clear_multicast_users().
 *
 * @ksk: The kernel netlink socket, as returned by
 *	netlink_kernel_create().
 * @group: The multicast group to clear.
 */
void netlink_clear_multicast_users(struct sock *ksk, unsigned int group)
{
	netlink_table_grab();

	__netlink_clear_multicast_users(ksk, group);

	netlink_table_ungrab();
}
2557 
2558 struct nlmsghdr *
2559 __nlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, int type, int len, int flags)
2560 {
2561 	struct nlmsghdr *nlh;
2562 	int size = nlmsg_msg_size(len);
2563 
2564 	nlh = (struct nlmsghdr*)skb_put(skb, NLMSG_ALIGN(size));
2565 	nlh->nlmsg_type = type;
2566 	nlh->nlmsg_len = size;
2567 	nlh->nlmsg_flags = flags;
2568 	nlh->nlmsg_pid = portid;
2569 	nlh->nlmsg_seq = seq;
2570 	if (!__builtin_constant_p(size) || NLMSG_ALIGN(size) - size != 0)
2571 		memset(nlmsg_data(nlh) + len, 0, NLMSG_ALIGN(size) - size);
2572 	return nlh;
2573 }
2574 EXPORT_SYMBOL(__nlmsg_put);
2575 
2576 /*
2577  * It looks a bit ugly.
2578  * It would be better to create kernel thread.
2579  */
2580 
2581 static int netlink_dump(struct sock *sk)
2582 {
2583 	struct netlink_sock *nlk = nlk_sk(sk);
2584 	struct netlink_callback *cb;
2585 	struct sk_buff *skb = NULL;
2586 	struct nlmsghdr *nlh;
2587 	int len, err = -ENOBUFS;
2588 	int alloc_size;
2589 
2590 	mutex_lock(nlk->cb_mutex);
2591 	if (!nlk->cb_running) {
2592 		err = -EINVAL;
2593 		goto errout_skb;
2594 	}
2595 
2596 	cb = &nlk->cb;
2597 	alloc_size = max_t(int, cb->min_dump_alloc, NLMSG_GOODSIZE);
2598 
2599 	if (!netlink_rx_is_mmaped(sk) &&
2600 	    atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
2601 		goto errout_skb;
2602 	skb = netlink_alloc_skb(sk, alloc_size, nlk->portid, GFP_KERNEL);
2603 	if (!skb)
2604 		goto errout_skb;
2605 	netlink_skb_set_owner_r(skb, sk);
2606 
2607 	len = cb->dump(skb, cb);
2608 
2609 	if (len > 0) {
2610 		mutex_unlock(nlk->cb_mutex);
2611 
2612 		if (sk_filter(sk, skb))
2613 			kfree_skb(skb);
2614 		else
2615 			__netlink_sendskb(sk, skb);
2616 		return 0;
2617 	}
2618 
2619 	nlh = nlmsg_put_answer(skb, cb, NLMSG_DONE, sizeof(len), NLM_F_MULTI);
2620 	if (!nlh)
2621 		goto errout_skb;
2622 
2623 	nl_dump_check_consistent(cb, nlh);
2624 
2625 	memcpy(nlmsg_data(nlh), &len, sizeof(len));
2626 
2627 	if (sk_filter(sk, skb))
2628 		kfree_skb(skb);
2629 	else
2630 		__netlink_sendskb(sk, skb);
2631 
2632 	if (cb->done)
2633 		cb->done(cb);
2634 
2635 	nlk->cb_running = false;
2636 	mutex_unlock(nlk->cb_mutex);
2637 	module_put(cb->module);
2638 	consume_skb(cb->skb);
2639 	return 0;
2640 
2641 errout_skb:
2642 	mutex_unlock(nlk->cb_mutex);
2643 	kfree_skb(skb);
2644 	return err;
2645 }
2646 
/*
 * __netlink_dump_start - set up a dump on the requesting socket and run
 * its first part
 * @ssk: kernel socket the request arrived on
 * @skb: request skb; a reference (or a copy, if mmaped) is kept in the cb
 * @nlh: request header, stored in the callback
 * @control: dump/done callbacks, private data, owning module, min alloc size
 *
 * Returns -EINTR when the dump was started (tells the caller not to send
 * an ACK), -ENOBUFS if the mmaped skb cannot be copied, -ECONNREFUSED if
 * the requesting socket is not found, -EBUSY if it already has a dump
 * running, -EPROTONOSUPPORT if the module reference cannot be taken, or
 * the error returned by netlink_dump().
 */
int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
			 const struct nlmsghdr *nlh,
			 struct netlink_dump_control *control)
{
	struct netlink_callback *cb;
	struct netlink_sock *nlk;
	struct sock *sk;
	int ret;

	/* Memory mapped dump requests need to be copied to avoid looping
	 * on the pending state in netlink_mmap_sendmsg() while the CB holds
	 * a reference to the skb.
	 */
	if (!netlink_skb_is_mmaped(skb)) {
		atomic_inc(&skb->users);
	} else {
		skb = skb_copy(skb, GFP_KERNEL);
		if (skb == NULL)
			return -ENOBUFS;
	}

	sk = netlink_lookup(sock_net(ssk), ssk->sk_protocol,
			    NETLINK_CB(skb).portid);
	if (sk == NULL) {
		ret = -ECONNREFUSED;
		goto error_free;
	}

	nlk = nlk_sk(sk);
	mutex_lock(nlk->cb_mutex);

	/* Only one dump may be in progress per socket. */
	if (nlk->cb_running) {
		ret = -EBUSY;
		goto error_unlock;
	}

	/* Pin the module that cb->dump belongs to. */
	if (!try_module_get(control->module)) {
		ret = -EPROTONOSUPPORT;
		goto error_unlock;
	}

	cb = &nlk->cb;
	memset(cb, 0, sizeof(*cb));
	cb->skb = skb;
	cb->nlh = nlh;
	cb->dump = control->dump;
	cb->done = control->done;
	cb->data = control->data;
	cb->module = control->module;
	cb->min_dump_alloc = control->min_dump_alloc;

	nlk->cb_running = true;

	mutex_unlock(nlk->cb_mutex);

	ret = netlink_dump(sk);
	sock_put(sk);

	/* We successfully started a dump, by returning -EINTR we
	 * signal not to send ACK even if it was requested.
	 */
	return ret ? ret : -EINTR;

error_unlock:
	sock_put(sk);
	mutex_unlock(nlk->cb_mutex);
error_free:
	kfree_skb(skb);
	return ret;
}
EXPORT_SYMBOL(__netlink_dump_start);
2719 
2720 void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err)
2721 {
2722 	struct sk_buff *skb;
2723 	struct nlmsghdr *rep;
2724 	struct nlmsgerr *errmsg;
2725 	size_t payload = sizeof(*errmsg);
2726 
2727 	/* error messages get the original request appened */
2728 	if (err)
2729 		payload += nlmsg_len(nlh);
2730 
2731 	skb = netlink_alloc_skb(in_skb->sk, nlmsg_total_size(payload),
2732 				NETLINK_CB(in_skb).portid, GFP_KERNEL);
2733 	if (!skb) {
2734 		struct sock *sk;
2735 
2736 		sk = netlink_lookup(sock_net(in_skb->sk),
2737 				    in_skb->sk->sk_protocol,
2738 				    NETLINK_CB(in_skb).portid);
2739 		if (sk) {
2740 			sk->sk_err = ENOBUFS;
2741 			sk->sk_error_report(sk);
2742 			sock_put(sk);
2743 		}
2744 		return;
2745 	}
2746 
2747 	rep = __nlmsg_put(skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
2748 			  NLMSG_ERROR, payload, 0);
2749 	errmsg = nlmsg_data(rep);
2750 	errmsg->error = err;
2751 	memcpy(&errmsg->msg, nlh, err ? nlh->nlmsg_len : sizeof(*nlh));
2752 	netlink_unicast(in_skb->sk, skb, NETLINK_CB(in_skb).portid, MSG_DONTWAIT);
2753 }
2754 EXPORT_SYMBOL(netlink_ack);
2755 
2756 int netlink_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *,
2757 						     struct nlmsghdr *))
2758 {
2759 	struct nlmsghdr *nlh;
2760 	int err;
2761 
2762 	while (skb->len >= nlmsg_total_size(0)) {
2763 		int msglen;
2764 
2765 		nlh = nlmsg_hdr(skb);
2766 		err = 0;
2767 
2768 		if (nlh->nlmsg_len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len)
2769 			return 0;
2770 
2771 		/* Only requests are handled by the kernel */
2772 		if (!(nlh->nlmsg_flags & NLM_F_REQUEST))
2773 			goto ack;
2774 
2775 		/* Skip control messages */
2776 		if (nlh->nlmsg_type < NLMSG_MIN_TYPE)
2777 			goto ack;
2778 
2779 		err = cb(skb, nlh);
2780 		if (err == -EINTR)
2781 			goto skip;
2782 
2783 ack:
2784 		if (nlh->nlmsg_flags & NLM_F_ACK || err)
2785 			netlink_ack(skb, nlh, err);
2786 
2787 skip:
2788 		msglen = NLMSG_ALIGN(nlh->nlmsg_len);
2789 		if (msglen > skb->len)
2790 			msglen = skb->len;
2791 		skb_pull(skb, msglen);
2792 	}
2793 
2794 	return 0;
2795 }
2796 EXPORT_SYMBOL(netlink_rcv_skb);
2797 
2798 /**
2799  * nlmsg_notify - send a notification netlink message
2800  * @sk: netlink socket to use
2801  * @skb: notification message
2802  * @portid: destination netlink portid for reports or 0
2803  * @group: destination multicast group or 0
2804  * @report: 1 to report back, 0 to disable
2805  * @flags: allocation flags
2806  */
2807 int nlmsg_notify(struct sock *sk, struct sk_buff *skb, u32 portid,
2808 		 unsigned int group, int report, gfp_t flags)
2809 {
2810 	int err = 0;
2811 
2812 	if (group) {
2813 		int exclude_portid = 0;
2814 
2815 		if (report) {
2816 			atomic_inc(&skb->users);
2817 			exclude_portid = portid;
2818 		}
2819 
2820 		/* errors reported via destination sk->sk_err, but propagate
2821 		 * delivery errors if NETLINK_BROADCAST_ERROR flag is set */
2822 		err = nlmsg_multicast(sk, skb, exclude_portid, group, flags);
2823 	}
2824 
2825 	if (report) {
2826 		int err2;
2827 
2828 		err2 = nlmsg_unicast(sk, skb, portid);
2829 		if (!err || err == -ESRCH)
2830 			err = err2;
2831 	}
2832 
2833 	return err;
2834 }
2835 EXPORT_SYMBOL(nlmsg_notify);
2836 
2837 #ifdef CONFIG_PROC_FS
2838 struct nl_seq_iter {
2839 	struct seq_net_private p;
2840 	int link;
2841 	int hash_idx;
2842 };
2843 
2844 static struct sock *netlink_seq_socket_idx(struct seq_file *seq, loff_t pos)
2845 {
2846 	struct nl_seq_iter *iter = seq->private;
2847 	int i, j;
2848 	struct sock *s;
2849 	loff_t off = 0;
2850 
2851 	for (i = 0; i < MAX_LINKS; i++) {
2852 		struct nl_portid_hash *hash = &nl_table[i].hash;
2853 
2854 		for (j = 0; j <= hash->mask; j++) {
2855 			sk_for_each(s, &hash->table[j]) {
2856 				if (sock_net(s) != seq_file_net(seq))
2857 					continue;
2858 				if (off == pos) {
2859 					iter->link = i;
2860 					iter->hash_idx = j;
2861 					return s;
2862 				}
2863 				++off;
2864 			}
2865 		}
2866 	}
2867 	return NULL;
2868 }
2869 
2870 static void *netlink_seq_start(struct seq_file *seq, loff_t *pos)
2871 	__acquires(nl_table_lock)
2872 {
2873 	read_lock(&nl_table_lock);
2874 	return *pos ? netlink_seq_socket_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2875 }
2876 
2877 static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2878 {
2879 	struct sock *s;
2880 	struct nl_seq_iter *iter;
2881 	struct net *net;
2882 	int i, j;
2883 
2884 	++*pos;
2885 
2886 	if (v == SEQ_START_TOKEN)
2887 		return netlink_seq_socket_idx(seq, 0);
2888 
2889 	net = seq_file_net(seq);
2890 	iter = seq->private;
2891 	s = v;
2892 	do {
2893 		s = sk_next(s);
2894 	} while (s && !nl_table[s->sk_protocol].compare(net, s));
2895 	if (s)
2896 		return s;
2897 
2898 	i = iter->link;
2899 	j = iter->hash_idx + 1;
2900 
2901 	do {
2902 		struct nl_portid_hash *hash = &nl_table[i].hash;
2903 
2904 		for (; j <= hash->mask; j++) {
2905 			s = sk_head(&hash->table[j]);
2906 
2907 			while (s && !nl_table[s->sk_protocol].compare(net, s))
2908 				s = sk_next(s);
2909 			if (s) {
2910 				iter->link = i;
2911 				iter->hash_idx = j;
2912 				return s;
2913 			}
2914 		}
2915 
2916 		j = 0;
2917 	} while (++i < MAX_LINKS);
2918 
2919 	return NULL;
2920 }
2921 
2922 static void netlink_seq_stop(struct seq_file *seq, void *v)
2923 	__releases(nl_table_lock)
2924 {
2925 	read_unlock(&nl_table_lock);
2926 }
2927 
2928 
2929 static int netlink_seq_show(struct seq_file *seq, void *v)
2930 {
2931 	if (v == SEQ_START_TOKEN) {
2932 		seq_puts(seq,
2933 			 "sk       Eth Pid    Groups   "
2934 			 "Rmem     Wmem     Dump     Locks     Drops     Inode\n");
2935 	} else {
2936 		struct sock *s = v;
2937 		struct netlink_sock *nlk = nlk_sk(s);
2938 
2939 		seq_printf(seq, "%pK %-3d %-6u %08x %-8d %-8d %d %-8d %-8d %-8lu\n",
2940 			   s,
2941 			   s->sk_protocol,
2942 			   nlk->portid,
2943 			   nlk->groups ? (u32)nlk->groups[0] : 0,
2944 			   sk_rmem_alloc_get(s),
2945 			   sk_wmem_alloc_get(s),
2946 			   nlk->cb_running,
2947 			   atomic_read(&s->sk_refcnt),
2948 			   atomic_read(&s->sk_drops),
2949 			   sock_i_ino(s)
2950 			);
2951 
2952 	}
2953 	return 0;
2954 }
2955 
2956 static const struct seq_operations netlink_seq_ops = {
2957 	.start  = netlink_seq_start,
2958 	.next   = netlink_seq_next,
2959 	.stop   = netlink_seq_stop,
2960 	.show   = netlink_seq_show,
2961 };
2962 
2963 
2964 static int netlink_seq_open(struct inode *inode, struct file *file)
2965 {
2966 	return seq_open_net(inode, file, &netlink_seq_ops,
2967 				sizeof(struct nl_seq_iter));
2968 }
2969 
2970 static const struct file_operations netlink_seq_fops = {
2971 	.owner		= THIS_MODULE,
2972 	.open		= netlink_seq_open,
2973 	.read		= seq_read,
2974 	.llseek		= seq_lseek,
2975 	.release	= seq_release_net,
2976 };
2977 
2978 #endif
2979 
2980 int netlink_register_notifier(struct notifier_block *nb)
2981 {
2982 	return atomic_notifier_chain_register(&netlink_chain, nb);
2983 }
2984 EXPORT_SYMBOL(netlink_register_notifier);
2985 
2986 int netlink_unregister_notifier(struct notifier_block *nb)
2987 {
2988 	return atomic_notifier_chain_unregister(&netlink_chain, nb);
2989 }
2990 EXPORT_SYMBOL(netlink_unregister_notifier);
2991 
2992 static const struct proto_ops netlink_ops = {
2993 	.family =	PF_NETLINK,
2994 	.owner =	THIS_MODULE,
2995 	.release =	netlink_release,
2996 	.bind =		netlink_bind,
2997 	.connect =	netlink_connect,
2998 	.socketpair =	sock_no_socketpair,
2999 	.accept =	sock_no_accept,
3000 	.getname =	netlink_getname,
3001 	.poll =		netlink_poll,
3002 	.ioctl =	sock_no_ioctl,
3003 	.listen =	sock_no_listen,
3004 	.shutdown =	sock_no_shutdown,
3005 	.setsockopt =	netlink_setsockopt,
3006 	.getsockopt =	netlink_getsockopt,
3007 	.sendmsg =	netlink_sendmsg,
3008 	.recvmsg =	netlink_recvmsg,
3009 	.mmap =		netlink_mmap,
3010 	.sendpage =	sock_no_sendpage,
3011 };
3012 
3013 static const struct net_proto_family netlink_family_ops = {
3014 	.family = PF_NETLINK,
3015 	.create = netlink_create,
3016 	.owner	= THIS_MODULE,	/* for consistency 8) */
3017 };
3018 
3019 static int __net_init netlink_net_init(struct net *net)
3020 {
3021 #ifdef CONFIG_PROC_FS
3022 	if (!proc_create("netlink", 0, net->proc_net, &netlink_seq_fops))
3023 		return -ENOMEM;
3024 #endif
3025 	return 0;
3026 }
3027 
3028 static void __net_exit netlink_net_exit(struct net *net)
3029 {
3030 #ifdef CONFIG_PROC_FS
3031 	remove_proc_entry("netlink", net->proc_net);
3032 #endif
3033 }
3034 
3035 static void __init netlink_add_usersock_entry(void)
3036 {
3037 	struct listeners *listeners;
3038 	int groups = 32;
3039 
3040 	listeners = kzalloc(sizeof(*listeners) + NLGRPSZ(groups), GFP_KERNEL);
3041 	if (!listeners)
3042 		panic("netlink_add_usersock_entry: Cannot allocate listeners\n");
3043 
3044 	netlink_table_grab();
3045 
3046 	nl_table[NETLINK_USERSOCK].groups = groups;
3047 	rcu_assign_pointer(nl_table[NETLINK_USERSOCK].listeners, listeners);
3048 	nl_table[NETLINK_USERSOCK].module = THIS_MODULE;
3049 	nl_table[NETLINK_USERSOCK].registered = 1;
3050 	nl_table[NETLINK_USERSOCK].flags = NL_CFG_F_NONROOT_SEND;
3051 
3052 	netlink_table_ungrab();
3053 }
3054 
3055 static struct pernet_operations __net_initdata netlink_net_ops = {
3056 	.init = netlink_net_init,
3057 	.exit = netlink_net_exit,
3058 };
3059 
3060 static int __init netlink_proto_init(void)
3061 {
3062 	int i;
3063 	unsigned long limit;
3064 	unsigned int order;
3065 	int err = proto_register(&netlink_proto, 0);
3066 
3067 	if (err != 0)
3068 		goto out;
3069 
3070 	BUILD_BUG_ON(sizeof(struct netlink_skb_parms) > FIELD_SIZEOF(struct sk_buff, cb));
3071 
3072 	nl_table = kcalloc(MAX_LINKS, sizeof(*nl_table), GFP_KERNEL);
3073 	if (!nl_table)
3074 		goto panic;
3075 
3076 	if (totalram_pages >= (128 * 1024))
3077 		limit = totalram_pages >> (21 - PAGE_SHIFT);
3078 	else
3079 		limit = totalram_pages >> (23 - PAGE_SHIFT);
3080 
3081 	order = get_bitmask_order(limit) - 1 + PAGE_SHIFT;
3082 	limit = (1UL << order) / sizeof(struct hlist_head);
3083 	order = get_bitmask_order(min(limit, (unsigned long)UINT_MAX)) - 1;
3084 
3085 	for (i = 0; i < MAX_LINKS; i++) {
3086 		struct nl_portid_hash *hash = &nl_table[i].hash;
3087 
3088 		hash->table = nl_portid_hash_zalloc(1 * sizeof(*hash->table));
3089 		if (!hash->table) {
3090 			while (i-- > 0)
3091 				nl_portid_hash_free(nl_table[i].hash.table,
3092 						 1 * sizeof(*hash->table));
3093 			kfree(nl_table);
3094 			goto panic;
3095 		}
3096 		hash->max_shift = order;
3097 		hash->shift = 0;
3098 		hash->mask = 0;
3099 		hash->rehash_time = jiffies;
3100 
3101 		nl_table[i].compare = netlink_compare;
3102 	}
3103 
3104 	INIT_LIST_HEAD(&netlink_tap_all);
3105 
3106 	netlink_add_usersock_entry();
3107 
3108 	sock_register(&netlink_family_ops);
3109 	register_pernet_subsys(&netlink_net_ops);
3110 	/* The netlink device handler may be needed early. */
3111 	rtnetlink_init();
3112 out:
3113 	return err;
3114 panic:
3115 	panic("netlink_init: Cannot allocate nl_table\n");
3116 }
3117 
3118 core_initcall(netlink_proto_init);
3119