xref: /linux/net/core/sock.c (revision d96fc832bcb6269d96e33d506f33033d7ed08598)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Generic socket support routines. Memory allocators, socket lock/release
7  *		handler for protocols to use and generic option handler.
8  *
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Florian La Roche, <flla@stud.uni-sb.de>
13  *		Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *		Alan Cox	: 	Numerous verify_area() problems
17  *		Alan Cox	:	Connecting on a connecting socket
18  *					now returns an error for tcp.
19  *		Alan Cox	:	sock->protocol is set correctly.
20  *					and is not sometimes left as 0.
21  *		Alan Cox	:	connect handles icmp errors on a
22  *					connect properly. Unfortunately there
23  *					is a restart syscall nasty there. I
24  *					can't match BSD without hacking the C
25  *					library. Ideas urgently sought!
26  *		Alan Cox	:	Disallow bind() to addresses that are
27  *					not ours - especially broadcast ones!!
28  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30  *					instead they leave that for the DESTROY timer.
31  *		Alan Cox	:	Clean up error flag in accept
32  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33  *					was buggy. Put a remove_sock() in the handler
34  *					for memory when we hit 0. Also altered the timer
35  *					code. The ACK stuff can wait and needs major
36  *					TCP layer surgery.
37  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38  *					and fixed timer/inet_bh race.
39  *		Alan Cox	:	Added zapped flag for TCP
40  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47  *	Pauline Middelink	:	identd support
48  *		Alan Cox	:	Fixed connect() taking signals I think.
49  *		Alan Cox	:	SO_LINGER supported
50  *		Alan Cox	:	Error reporting fixes
51  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52  *		Alan Cox	:	inet sockets don't set sk->type!
53  *		Alan Cox	:	Split socket option code
54  *		Alan Cox	:	Callbacks
55  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56  *		Alex		:	Removed restriction on inet fioctl
57  *		Alan Cox	:	Splitting INET from NET core
58  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60  *		Alan Cox	:	Split IP from generic code
61  *		Alan Cox	:	New kfree_skbmem()
62  *		Alan Cox	:	Make SO_DEBUG superuser only.
63  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64  *					(compatibility fix)
65  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66  *		Alan Cox	:	Allocator for a socket is settable.
67  *		Alan Cox	:	SO_ERROR includes soft errors.
68  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69  *		Alan Cox	: 	Generic socket allocation to make hooks
70  *					easier (suggested by Craig Metz).
71  *		Michael Pall	:	SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79  *		Andi Kleen	:	Fix write_space callback
80  *		Chris Evans	:	Security fixes - signedness again
81  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  *
85  *
86  *		This program is free software; you can redistribute it and/or
87  *		modify it under the terms of the GNU General Public License
88  *		as published by the Free Software Foundation; either version
89  *		2 of the License, or (at your option) any later version.
90  */
91 
92 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
93 
94 #include <linux/capability.h>
95 #include <linux/errno.h>
96 #include <linux/errqueue.h>
97 #include <linux/types.h>
98 #include <linux/socket.h>
99 #include <linux/in.h>
100 #include <linux/kernel.h>
101 #include <linux/module.h>
102 #include <linux/proc_fs.h>
103 #include <linux/seq_file.h>
104 #include <linux/sched.h>
105 #include <linux/sched/mm.h>
106 #include <linux/timer.h>
107 #include <linux/string.h>
108 #include <linux/sockios.h>
109 #include <linux/net.h>
110 #include <linux/mm.h>
111 #include <linux/slab.h>
112 #include <linux/interrupt.h>
113 #include <linux/poll.h>
114 #include <linux/tcp.h>
115 #include <linux/init.h>
116 #include <linux/highmem.h>
117 #include <linux/user_namespace.h>
118 #include <linux/static_key.h>
119 #include <linux/memcontrol.h>
120 #include <linux/prefetch.h>
121 
122 #include <linux/uaccess.h>
123 
124 #include <linux/netdevice.h>
125 #include <net/protocol.h>
126 #include <linux/skbuff.h>
127 #include <net/net_namespace.h>
128 #include <net/request_sock.h>
129 #include <net/sock.h>
130 #include <linux/net_tstamp.h>
131 #include <net/xfrm.h>
132 #include <linux/ipsec.h>
133 #include <net/cls_cgroup.h>
134 #include <net/netprio_cgroup.h>
135 #include <linux/sock_diag.h>
136 
137 #include <linux/filter.h>
138 #include <net/sock_reuseport.h>
139 
140 #include <trace/events/sock.h>
141 
142 #include <net/tcp.h>
143 #include <net/busy_poll.h>
144 
145 static DEFINE_MUTEX(proto_list_mutex);
146 static LIST_HEAD(proto_list);
147 
148 static void sock_inuse_add(struct net *net, int val);
149 
150 /**
151  * sk_ns_capable - General socket capability test
152  * @sk: Socket to use a capability on or through
153  * @user_ns: The user namespace of the capability to use
154  * @cap: The capability to use
155  *
156  * Test to see if the opener of the socket had the capability @cap when
157  * the socket was created and whether the current process has it in the
158  * user namespace @user_ns.
159  */
160 bool sk_ns_capable(const struct sock *sk,
161 		   struct user_namespace *user_ns, int cap)
162 {
163 	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
164 		ns_capable(user_ns, cap);
165 }
166 EXPORT_SYMBOL(sk_ns_capable);
167 
168 /**
169  * sk_capable - Socket global capability test
170  * @sk: Socket to use a capability on or through
171  * @cap: The global capability to use
172  *
173  * Test to see if the opener of the socket had the capability @cap when
174  * the socket was created and whether the current process has it in all
175  * user namespaces.
176  */
177 bool sk_capable(const struct sock *sk, int cap)
178 {
179 	return sk_ns_capable(sk, &init_user_ns, cap);
180 }
181 EXPORT_SYMBOL(sk_capable);
182 
183 /**
184  * sk_net_capable - Network namespace socket capability test
185  * @sk: Socket to use a capability on or through
186  * @cap: The capability to use
187  *
188  * Test to see if the opener of the socket had the capability @cap when the
189  * socket was created and whether the current process has it over the network
190  * namespace the socket is a member of.
191  */
192 bool sk_net_capable(const struct sock *sk, int cap)
193 {
194 	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
195 }
196 EXPORT_SYMBOL(sk_net_capable);
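/* Editorial sketch (not part of the original file): a minimal example of how a
 * protocol-private setsockopt handler might use the capability helpers above.
 * The surrounding function and option are hypothetical; sk_net_capable() and
 * CAP_NET_ADMIN are the real kernel symbols being demonstrated.
 *
 *	static int myproto_set_priv_opt(struct sock *sk, int val)
 *	{
 *		// Require CAP_NET_ADMIN in the socket's network namespace,
 *		// checked against the opener's credentials as well.
 *		if (!sk_net_capable(sk, CAP_NET_ADMIN))
 *			return -EPERM;
 *
 *		// ... apply the privileged option here ...
 *		return 0;
 *	}
 */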
197 
198 /*
199  * Each address family might have different locking rules, so we have
200  * one slock key per address family and separate keys for internal and
201  * userspace sockets.
202  */
203 static struct lock_class_key af_family_keys[AF_MAX];
204 static struct lock_class_key af_family_kern_keys[AF_MAX];
205 static struct lock_class_key af_family_slock_keys[AF_MAX];
206 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
207 
208 /*
209  * Make lock validator output more readable. (We pre-construct these
210  * strings at build time, so that runtime initialization of socket
211  * locks is fast):
212  */
213 
214 #define _sock_locks(x)						  \
215   x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
216   x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
217   x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
218   x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
219   x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
220   x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
221   x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
222   x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
223   x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
224   x "27"       ,	x "28"          ,	x "AF_CAN"      , \
225   x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
226   x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
227   x "AF_IEEE802154",	x "AF_CAIF"	,	x "AF_ALG"      , \
228   x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
229   x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_MAX"
230 
231 static const char *const af_family_key_strings[AF_MAX+1] = {
232 	_sock_locks("sk_lock-")
233 };
234 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
235 	_sock_locks("slock-")
236 };
237 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
238 	_sock_locks("clock-")
239 };
240 
241 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
242 	_sock_locks("k-sk_lock-")
243 };
244 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
245 	_sock_locks("k-slock-")
246 };
247 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
248 	_sock_locks("k-clock-")
249 };
250 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
251   "rlock-AF_UNSPEC", "rlock-AF_UNIX"     , "rlock-AF_INET"     ,
252   "rlock-AF_AX25"  , "rlock-AF_IPX"      , "rlock-AF_APPLETALK",
253   "rlock-AF_NETROM", "rlock-AF_BRIDGE"   , "rlock-AF_ATMPVC"   ,
254   "rlock-AF_X25"   , "rlock-AF_INET6"    , "rlock-AF_ROSE"     ,
255   "rlock-AF_DECnet", "rlock-AF_NETBEUI"  , "rlock-AF_SECURITY" ,
256   "rlock-AF_KEY"   , "rlock-AF_NETLINK"  , "rlock-AF_PACKET"   ,
257   "rlock-AF_ASH"   , "rlock-AF_ECONET"   , "rlock-AF_ATMSVC"   ,
258   "rlock-AF_RDS"   , "rlock-AF_SNA"      , "rlock-AF_IRDA"     ,
259   "rlock-AF_PPPOX" , "rlock-AF_WANPIPE"  , "rlock-AF_LLC"      ,
260   "rlock-27"       , "rlock-28"          , "rlock-AF_CAN"      ,
261   "rlock-AF_TIPC"  , "rlock-AF_BLUETOOTH", "rlock-AF_IUCV"     ,
262   "rlock-AF_RXRPC" , "rlock-AF_ISDN"     , "rlock-AF_PHONET"   ,
263   "rlock-AF_IEEE802154", "rlock-AF_CAIF" , "rlock-AF_ALG"      ,
264   "rlock-AF_NFC"   , "rlock-AF_VSOCK"    , "rlock-AF_KCM"      ,
265   "rlock-AF_QIPCRTR", "rlock-AF_SMC"     , "rlock-AF_MAX"
266 };
267 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
268   "wlock-AF_UNSPEC", "wlock-AF_UNIX"     , "wlock-AF_INET"     ,
269   "wlock-AF_AX25"  , "wlock-AF_IPX"      , "wlock-AF_APPLETALK",
270   "wlock-AF_NETROM", "wlock-AF_BRIDGE"   , "wlock-AF_ATMPVC"   ,
271   "wlock-AF_X25"   , "wlock-AF_INET6"    , "wlock-AF_ROSE"     ,
272   "wlock-AF_DECnet", "wlock-AF_NETBEUI"  , "wlock-AF_SECURITY" ,
273   "wlock-AF_KEY"   , "wlock-AF_NETLINK"  , "wlock-AF_PACKET"   ,
274   "wlock-AF_ASH"   , "wlock-AF_ECONET"   , "wlock-AF_ATMSVC"   ,
275   "wlock-AF_RDS"   , "wlock-AF_SNA"      , "wlock-AF_IRDA"     ,
276   "wlock-AF_PPPOX" , "wlock-AF_WANPIPE"  , "wlock-AF_LLC"      ,
277   "wlock-27"       , "wlock-28"          , "wlock-AF_CAN"      ,
278   "wlock-AF_TIPC"  , "wlock-AF_BLUETOOTH", "wlock-AF_IUCV"     ,
279   "wlock-AF_RXRPC" , "wlock-AF_ISDN"     , "wlock-AF_PHONET"   ,
280   "wlock-AF_IEEE802154", "wlock-AF_CAIF" , "wlock-AF_ALG"      ,
281   "wlock-AF_NFC"   , "wlock-AF_VSOCK"    , "wlock-AF_KCM"      ,
282   "wlock-AF_QIPCRTR", "wlock-AF_SMC"     , "wlock-AF_MAX"
283 };
284 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
285   "elock-AF_UNSPEC", "elock-AF_UNIX"     , "elock-AF_INET"     ,
286   "elock-AF_AX25"  , "elock-AF_IPX"      , "elock-AF_APPLETALK",
287   "elock-AF_NETROM", "elock-AF_BRIDGE"   , "elock-AF_ATMPVC"   ,
288   "elock-AF_X25"   , "elock-AF_INET6"    , "elock-AF_ROSE"     ,
289   "elock-AF_DECnet", "elock-AF_NETBEUI"  , "elock-AF_SECURITY" ,
290   "elock-AF_KEY"   , "elock-AF_NETLINK"  , "elock-AF_PACKET"   ,
291   "elock-AF_ASH"   , "elock-AF_ECONET"   , "elock-AF_ATMSVC"   ,
292   "elock-AF_RDS"   , "elock-AF_SNA"      , "elock-AF_IRDA"     ,
293   "elock-AF_PPPOX" , "elock-AF_WANPIPE"  , "elock-AF_LLC"      ,
294   "elock-27"       , "elock-28"          , "elock-AF_CAN"      ,
295   "elock-AF_TIPC"  , "elock-AF_BLUETOOTH", "elock-AF_IUCV"     ,
296   "elock-AF_RXRPC" , "elock-AF_ISDN"     , "elock-AF_PHONET"   ,
297   "elock-AF_IEEE802154", "elock-AF_CAIF" , "elock-AF_ALG"      ,
298   "elock-AF_NFC"   , "elock-AF_VSOCK"    , "elock-AF_KCM"      ,
299   "elock-AF_QIPCRTR", "elock-AF_SMC"     , "elock-AF_MAX"
300 };
301 
302 /*
303  * sk_callback_lock and sk queues locking rules are per-address-family,
304  * so split the lock classes by using a per-AF key:
305  */
306 static struct lock_class_key af_callback_keys[AF_MAX];
307 static struct lock_class_key af_rlock_keys[AF_MAX];
308 static struct lock_class_key af_wlock_keys[AF_MAX];
309 static struct lock_class_key af_elock_keys[AF_MAX];
310 static struct lock_class_key af_kern_callback_keys[AF_MAX];
311 
312 /* Run time adjustable parameters. */
313 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
314 EXPORT_SYMBOL(sysctl_wmem_max);
315 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
316 EXPORT_SYMBOL(sysctl_rmem_max);
317 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
318 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
319 
320 /* Maximal space eaten by iovec or ancillary data plus some space */
321 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
322 EXPORT_SYMBOL(sysctl_optmem_max);
323 
324 int sysctl_tstamp_allow_data __read_mostly = 1;
325 
326 struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
327 EXPORT_SYMBOL_GPL(memalloc_socks);
328 
329 /**
330  * sk_set_memalloc - sets %SOCK_MEMALLOC
331  * @sk: socket to set it on
332  *
333  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
334  * It's the responsibility of the admin to adjust min_free_kbytes
335  * to meet the requirements.
336  */
337 void sk_set_memalloc(struct sock *sk)
338 {
339 	sock_set_flag(sk, SOCK_MEMALLOC);
340 	sk->sk_allocation |= __GFP_MEMALLOC;
341 	static_key_slow_inc(&memalloc_socks);
342 }
343 EXPORT_SYMBOL_GPL(sk_set_memalloc);
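/* Editorial sketch (not part of the original file): a kernel user that backs
 * swap over the network (an nbd/NFS-style transport, for instance) would flag
 * its socket so writeout can dip into the emergency reserves. The two functions
 * below are hypothetical wrappers; sk_set_memalloc()/sk_clear_memalloc() are
 * the real entry points.
 *
 *	static void my_swap_transport_online(struct socket *sock)
 *	{
 *		sk_set_memalloc(sock->sk);	// allocations may use __GFP_MEMALLOC
 *	}
 *
 *	static void my_swap_transport_offline(struct socket *sock)
 *	{
 *		sk_clear_memalloc(sock->sk);	// reclaim reserves, obey rmem limits again
 *	}
 */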
344 
345 void sk_clear_memalloc(struct sock *sk)
346 {
347 	sock_reset_flag(sk, SOCK_MEMALLOC);
348 	sk->sk_allocation &= ~__GFP_MEMALLOC;
349 	static_key_slow_dec(&memalloc_socks);
350 
351 	/*
352 	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
353 	 * progress of swapping. SOCK_MEMALLOC may be cleared while
354 	 * it has rmem allocations due to the last swapfile being deactivated
355 	 * but there is a risk that the socket is unusable due to exceeding
356 	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
357 	 */
358 	sk_mem_reclaim(sk);
359 }
360 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
361 
362 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
363 {
364 	int ret;
365 	unsigned int noreclaim_flag;
366 
367 	/* these should have been dropped before queueing */
368 	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
369 
370 	noreclaim_flag = memalloc_noreclaim_save();
371 	ret = sk->sk_backlog_rcv(sk, skb);
372 	memalloc_noreclaim_restore(noreclaim_flag);
373 
374 	return ret;
375 }
376 EXPORT_SYMBOL(__sk_backlog_rcv);
377 
378 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
379 {
380 	struct timeval tv;
381 
382 	if (optlen < sizeof(tv))
383 		return -EINVAL;
384 	if (copy_from_user(&tv, optval, sizeof(tv)))
385 		return -EFAULT;
386 	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
387 		return -EDOM;
388 
389 	if (tv.tv_sec < 0) {
390 		static int warned __read_mostly;
391 
392 		*timeo_p = 0;
393 		if (warned < 10 && net_ratelimit()) {
394 			warned++;
395 			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
396 				__func__, current->comm, task_pid_nr(current));
397 		}
398 		return 0;
399 	}
400 	*timeo_p = MAX_SCHEDULE_TIMEOUT;
401 	if (tv.tv_sec == 0 && tv.tv_usec == 0)
402 		return 0;
403 	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
404 		*timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC / HZ);
405 	return 0;
406 }
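/* Editorial sketch (not part of the original file): the user-space side of the
 * timeval-to-jiffies conversion above. A zero timeval selects
 * MAX_SCHEDULE_TIMEOUT (block forever); a negative tv_sec is clamped to a zero
 * timeout with the rate-limited warning printed above.
 *
 *	struct timeval tv = { .tv_sec = 2, .tv_usec = 500000 };
 *
 *	// 2.5s receive timeout; recv() then fails with EAGAIN/EWOULDBLOCK
 *	// once the timeout expires without data.
 *	setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
 */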
407 
408 static void sock_warn_obsolete_bsdism(const char *name)
409 {
410 	static int warned;
411 	static char warncomm[TASK_COMM_LEN];
412 	if (strcmp(warncomm, current->comm) && warned < 5) {
413 		strcpy(warncomm,  current->comm);
414 		pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
415 			warncomm, name);
416 		warned++;
417 	}
418 }
419 
420 static bool sock_needs_netstamp(const struct sock *sk)
421 {
422 	switch (sk->sk_family) {
423 	case AF_UNSPEC:
424 	case AF_UNIX:
425 		return false;
426 	default:
427 		return true;
428 	}
429 }
430 
431 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
432 {
433 	if (sk->sk_flags & flags) {
434 		sk->sk_flags &= ~flags;
435 		if (sock_needs_netstamp(sk) &&
436 		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
437 			net_disable_timestamp();
438 	}
439 }
440 
441 
442 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
443 {
444 	unsigned long flags;
445 	struct sk_buff_head *list = &sk->sk_receive_queue;
446 
447 	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
448 		atomic_inc(&sk->sk_drops);
449 		trace_sock_rcvqueue_full(sk, skb);
450 		return -ENOMEM;
451 	}
452 
453 	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
454 		atomic_inc(&sk->sk_drops);
455 		return -ENOBUFS;
456 	}
457 
458 	skb->dev = NULL;
459 	skb_set_owner_r(skb, sk);
460 
461 	/* We escape from the RCU protected region, so make sure we don't leak
462 	 * a non-refcounted dst.
463 	 */
464 	skb_dst_force(skb);
465 
466 	spin_lock_irqsave(&list->lock, flags);
467 	sock_skb_set_dropcount(sk, skb);
468 	__skb_queue_tail(list, skb);
469 	spin_unlock_irqrestore(&list->lock, flags);
470 
471 	if (!sock_flag(sk, SOCK_DEAD))
472 		sk->sk_data_ready(sk);
473 	return 0;
474 }
475 EXPORT_SYMBOL(__sock_queue_rcv_skb);
476 
477 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
478 {
479 	int err;
480 
481 	err = sk_filter(sk, skb);
482 	if (err)
483 		return err;
484 
485 	return __sock_queue_rcv_skb(sk, skb);
486 }
487 EXPORT_SYMBOL(sock_queue_rcv_skb);
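/* Editorial sketch (not part of the original file): the typical shape of a
 * datagram protocol's receive path built on the helper above. The function name
 * is hypothetical; sock_queue_rcv_skb() applies the socket filter, charges
 * receive memory and wakes the reader via sk_data_ready().
 *
 *	static int myproto_rcv(struct sock *sk, struct sk_buff *skb)
 *	{
 *		int err = sock_queue_rcv_skb(sk, skb);
 *
 *		if (err < 0) {
 *			kfree_skb(skb);	// -ENOMEM/-ENOBUFS: queue full or filter drop
 *			return err;
 *		}
 *		return 0;
 *	}
 */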
488 
489 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
490 		     const int nested, unsigned int trim_cap, bool refcounted)
491 {
492 	int rc = NET_RX_SUCCESS;
493 
494 	if (sk_filter_trim_cap(sk, skb, trim_cap))
495 		goto discard_and_relse;
496 
497 	skb->dev = NULL;
498 
499 	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
500 		atomic_inc(&sk->sk_drops);
501 		goto discard_and_relse;
502 	}
503 	if (nested)
504 		bh_lock_sock_nested(sk);
505 	else
506 		bh_lock_sock(sk);
507 	if (!sock_owned_by_user(sk)) {
508 		/*
509 		 * trylock + unlock semantics:
510 		 */
511 		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
512 
513 		rc = sk_backlog_rcv(sk, skb);
514 
515 		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
516 	} else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
517 		bh_unlock_sock(sk);
518 		atomic_inc(&sk->sk_drops);
519 		goto discard_and_relse;
520 	}
521 
522 	bh_unlock_sock(sk);
523 out:
524 	if (refcounted)
525 		sock_put(sk);
526 	return rc;
527 discard_and_relse:
528 	kfree_skb(skb);
529 	goto out;
530 }
531 EXPORT_SYMBOL(__sk_receive_skb);
532 
533 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
534 {
535 	struct dst_entry *dst = __sk_dst_get(sk);
536 
537 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
538 		sk_tx_queue_clear(sk);
539 		sk->sk_dst_pending_confirm = 0;
540 		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
541 		dst_release(dst);
542 		return NULL;
543 	}
544 
545 	return dst;
546 }
547 EXPORT_SYMBOL(__sk_dst_check);
548 
549 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
550 {
551 	struct dst_entry *dst = sk_dst_get(sk);
552 
553 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
554 		sk_dst_reset(sk);
555 		dst_release(dst);
556 		return NULL;
557 	}
558 
559 	return dst;
560 }
561 EXPORT_SYMBOL(sk_dst_check);
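/* Editorial sketch (not part of the original file): the usual pattern on a
 * transmit path. If the cached route has been obsoleted, sk_dst_check() drops
 * it and returns NULL, and the caller re-resolves. The re-lookup function below
 * is hypothetical; real protocols call their own route resolver and cache the
 * result with sk_dst_set().
 *
 *	struct dst_entry *dst = sk_dst_check(sk, 0);
 *
 *	if (!dst)
 *		dst = myproto_reroute(sk);	// hypothetical route re-lookup
 */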
562 
563 static int sock_setbindtodevice(struct sock *sk, char __user *optval,
564 				int optlen)
565 {
566 	int ret = -ENOPROTOOPT;
567 #ifdef CONFIG_NETDEVICES
568 	struct net *net = sock_net(sk);
569 	char devname[IFNAMSIZ];
570 	int index;
571 
572 	/* Sorry... */
573 	ret = -EPERM;
574 	if (!ns_capable(net->user_ns, CAP_NET_RAW))
575 		goto out;
576 
577 	ret = -EINVAL;
578 	if (optlen < 0)
579 		goto out;
580 
581 	/* Bind this socket to a particular device like "eth0",
582 	 * as specified in the passed interface name. If the
583 	 * name is "" or the option length is zero the socket
584 	 * is not bound.
585 	 */
586 	if (optlen > IFNAMSIZ - 1)
587 		optlen = IFNAMSIZ - 1;
588 	memset(devname, 0, sizeof(devname));
589 
590 	ret = -EFAULT;
591 	if (copy_from_user(devname, optval, optlen))
592 		goto out;
593 
594 	index = 0;
595 	if (devname[0] != '\0') {
596 		struct net_device *dev;
597 
598 		rcu_read_lock();
599 		dev = dev_get_by_name_rcu(net, devname);
600 		if (dev)
601 			index = dev->ifindex;
602 		rcu_read_unlock();
603 		ret = -ENODEV;
604 		if (!dev)
605 			goto out;
606 	}
607 
608 	lock_sock(sk);
609 	sk->sk_bound_dev_if = index;
610 	sk_dst_reset(sk);
611 	release_sock(sk);
612 
613 	ret = 0;
614 
615 out:
616 #endif
617 
618 	return ret;
619 }
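/* Editorial sketch (not part of the original file): the user-space view of the
 * option handled above. Binding requires CAP_NET_RAW; an empty name (or a zero
 * option length) removes the binding.
 *
 *	// Restrict the socket to traffic on "eth0".
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "eth0", strlen("eth0"));
 *
 *	// Unbind again.
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "", 0);
 */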
620 
621 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
622 				int __user *optlen, int len)
623 {
624 	int ret = -ENOPROTOOPT;
625 #ifdef CONFIG_NETDEVICES
626 	struct net *net = sock_net(sk);
627 	char devname[IFNAMSIZ];
628 
629 	if (sk->sk_bound_dev_if == 0) {
630 		len = 0;
631 		goto zero;
632 	}
633 
634 	ret = -EINVAL;
635 	if (len < IFNAMSIZ)
636 		goto out;
637 
638 	ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
639 	if (ret)
640 		goto out;
641 
642 	len = strlen(devname) + 1;
643 
644 	ret = -EFAULT;
645 	if (copy_to_user(optval, devname, len))
646 		goto out;
647 
648 zero:
649 	ret = -EFAULT;
650 	if (put_user(len, optlen))
651 		goto out;
652 
653 	ret = 0;
654 
655 out:
656 #endif
657 
658 	return ret;
659 }
660 
661 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
662 {
663 	if (valbool)
664 		sock_set_flag(sk, bit);
665 	else
666 		sock_reset_flag(sk, bit);
667 }
668 
669 bool sk_mc_loop(struct sock *sk)
670 {
671 	if (dev_recursion_level())
672 		return false;
673 	if (!sk)
674 		return true;
675 	switch (sk->sk_family) {
676 	case AF_INET:
677 		return inet_sk(sk)->mc_loop;
678 #if IS_ENABLED(CONFIG_IPV6)
679 	case AF_INET6:
680 		return inet6_sk(sk)->mc_loop;
681 #endif
682 	}
683 	WARN_ON(1);
684 	return true;
685 }
686 EXPORT_SYMBOL(sk_mc_loop);
687 
688 /*
689  *	This is meant for all protocols to use and covers goings on
690  *	at the socket level. Everything here is generic.
691  */
692 
693 int sock_setsockopt(struct socket *sock, int level, int optname,
694 		    char __user *optval, unsigned int optlen)
695 {
696 	struct sock *sk = sock->sk;
697 	int val;
698 	int valbool;
699 	struct linger ling;
700 	int ret = 0;
701 
702 	/*
703 	 *	Options without arguments
704 	 */
705 
706 	if (optname == SO_BINDTODEVICE)
707 		return sock_setbindtodevice(sk, optval, optlen);
708 
709 	if (optlen < sizeof(int))
710 		return -EINVAL;
711 
712 	if (get_user(val, (int __user *)optval))
713 		return -EFAULT;
714 
715 	valbool = val ? 1 : 0;
716 
717 	lock_sock(sk);
718 
719 	switch (optname) {
720 	case SO_DEBUG:
721 		if (val && !capable(CAP_NET_ADMIN))
722 			ret = -EACCES;
723 		else
724 			sock_valbool_flag(sk, SOCK_DBG, valbool);
725 		break;
726 	case SO_REUSEADDR:
727 		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
728 		break;
729 	case SO_REUSEPORT:
730 		sk->sk_reuseport = valbool;
731 		break;
732 	case SO_TYPE:
733 	case SO_PROTOCOL:
734 	case SO_DOMAIN:
735 	case SO_ERROR:
736 		ret = -ENOPROTOOPT;
737 		break;
738 	case SO_DONTROUTE:
739 		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
740 		break;
741 	case SO_BROADCAST:
742 		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
743 		break;
744 	case SO_SNDBUF:
745 		/* Don't error on this; BSD doesn't, and if you think
746 		 * about it this is right. Otherwise apps have to
747 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
748 		 * are treated in BSD as hints.
749 		 */
750 		val = min_t(u32, val, sysctl_wmem_max);
751 set_sndbuf:
752 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
753 		sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
754 		/* Wake up sending tasks if we upped the value. */
755 		sk->sk_write_space(sk);
756 		break;
757 
758 	case SO_SNDBUFFORCE:
759 		if (!capable(CAP_NET_ADMIN)) {
760 			ret = -EPERM;
761 			break;
762 		}
763 		goto set_sndbuf;
764 
765 	case SO_RCVBUF:
766 		/* Don't error on this; BSD doesn't, and if you think
767 		 * about it this is right. Otherwise apps have to
768 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
769 		 * are treated in BSD as hints.
770 		 */
771 		val = min_t(u32, val, sysctl_rmem_max);
772 set_rcvbuf:
773 		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
774 		/*
775 		 * We double it on the way in to account for
776 		 * "struct sk_buff" etc. overhead.   Applications
777 		 * assume that the SO_RCVBUF setting they make will
778 		 * allow that much actual data to be received on that
779 		 * socket.
780 		 *
781 		 * Applications are unaware that "struct sk_buff" and
782 		 * other overheads allocate from the receive buffer
783 		 * during socket buffer allocation.
784 		 *
785 		 * And after considering the possible alternatives,
786 		 * returning the value we actually used in getsockopt
787 		 * is the most desirable behavior.
788 		 */
789 		sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
790 		break;
791 
792 	case SO_RCVBUFFORCE:
793 		if (!capable(CAP_NET_ADMIN)) {
794 			ret = -EPERM;
795 			break;
796 		}
797 		goto set_rcvbuf;
798 
799 	case SO_KEEPALIVE:
800 		if (sk->sk_prot->keepalive)
801 			sk->sk_prot->keepalive(sk, valbool);
802 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
803 		break;
804 
805 	case SO_OOBINLINE:
806 		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
807 		break;
808 
809 	case SO_NO_CHECK:
810 		sk->sk_no_check_tx = valbool;
811 		break;
812 
813 	case SO_PRIORITY:
814 		if ((val >= 0 && val <= 6) ||
815 		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
816 			sk->sk_priority = val;
817 		else
818 			ret = -EPERM;
819 		break;
820 
821 	case SO_LINGER:
822 		if (optlen < sizeof(ling)) {
823 			ret = -EINVAL;	/* 1003.1g */
824 			break;
825 		}
826 		if (copy_from_user(&ling, optval, sizeof(ling))) {
827 			ret = -EFAULT;
828 			break;
829 		}
830 		if (!ling.l_onoff)
831 			sock_reset_flag(sk, SOCK_LINGER);
832 		else {
833 #if (BITS_PER_LONG == 32)
834 			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
835 				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
836 			else
837 #endif
838 				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
839 			sock_set_flag(sk, SOCK_LINGER);
840 		}
841 		break;
842 
843 	case SO_BSDCOMPAT:
844 		sock_warn_obsolete_bsdism("setsockopt");
845 		break;
846 
847 	case SO_PASSCRED:
848 		if (valbool)
849 			set_bit(SOCK_PASSCRED, &sock->flags);
850 		else
851 			clear_bit(SOCK_PASSCRED, &sock->flags);
852 		break;
853 
854 	case SO_TIMESTAMP:
855 	case SO_TIMESTAMPNS:
856 		if (valbool)  {
857 			if (optname == SO_TIMESTAMP)
858 				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
859 			else
860 				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
861 			sock_set_flag(sk, SOCK_RCVTSTAMP);
862 			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
863 		} else {
864 			sock_reset_flag(sk, SOCK_RCVTSTAMP);
865 			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
866 		}
867 		break;
868 
869 	case SO_TIMESTAMPING:
870 		if (val & ~SOF_TIMESTAMPING_MASK) {
871 			ret = -EINVAL;
872 			break;
873 		}
874 
875 		if (val & SOF_TIMESTAMPING_OPT_ID &&
876 		    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
877 			if (sk->sk_protocol == IPPROTO_TCP &&
878 			    sk->sk_type == SOCK_STREAM) {
879 				if ((1 << sk->sk_state) &
880 				    (TCPF_CLOSE | TCPF_LISTEN)) {
881 					ret = -EINVAL;
882 					break;
883 				}
884 				sk->sk_tskey = tcp_sk(sk)->snd_una;
885 			} else {
886 				sk->sk_tskey = 0;
887 			}
888 		}
889 
890 		if (val & SOF_TIMESTAMPING_OPT_STATS &&
891 		    !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
892 			ret = -EINVAL;
893 			break;
894 		}
895 
896 		sk->sk_tsflags = val;
897 		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
898 			sock_enable_timestamp(sk,
899 					      SOCK_TIMESTAMPING_RX_SOFTWARE);
900 		else
901 			sock_disable_timestamp(sk,
902 					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
903 		break;
904 
905 	case SO_RCVLOWAT:
906 		if (val < 0)
907 			val = INT_MAX;
908 		sk->sk_rcvlowat = val ? : 1;
909 		break;
910 
911 	case SO_RCVTIMEO:
912 		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
913 		break;
914 
915 	case SO_SNDTIMEO:
916 		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
917 		break;
918 
919 	case SO_ATTACH_FILTER:
920 		ret = -EINVAL;
921 		if (optlen == sizeof(struct sock_fprog)) {
922 			struct sock_fprog fprog;
923 
924 			ret = -EFAULT;
925 			if (copy_from_user(&fprog, optval, sizeof(fprog)))
926 				break;
927 
928 			ret = sk_attach_filter(&fprog, sk);
929 		}
930 		break;
931 
932 	case SO_ATTACH_BPF:
933 		ret = -EINVAL;
934 		if (optlen == sizeof(u32)) {
935 			u32 ufd;
936 
937 			ret = -EFAULT;
938 			if (copy_from_user(&ufd, optval, sizeof(ufd)))
939 				break;
940 
941 			ret = sk_attach_bpf(ufd, sk);
942 		}
943 		break;
944 
945 	case SO_ATTACH_REUSEPORT_CBPF:
946 		ret = -EINVAL;
947 		if (optlen == sizeof(struct sock_fprog)) {
948 			struct sock_fprog fprog;
949 
950 			ret = -EFAULT;
951 			if (copy_from_user(&fprog, optval, sizeof(fprog)))
952 				break;
953 
954 			ret = sk_reuseport_attach_filter(&fprog, sk);
955 		}
956 		break;
957 
958 	case SO_ATTACH_REUSEPORT_EBPF:
959 		ret = -EINVAL;
960 		if (optlen == sizeof(u32)) {
961 			u32 ufd;
962 
963 			ret = -EFAULT;
964 			if (copy_from_user(&ufd, optval, sizeof(ufd)))
965 				break;
966 
967 			ret = sk_reuseport_attach_bpf(ufd, sk);
968 		}
969 		break;
970 
971 	case SO_DETACH_FILTER:
972 		ret = sk_detach_filter(sk);
973 		break;
974 
975 	case SO_LOCK_FILTER:
976 		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
977 			ret = -EPERM;
978 		else
979 			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
980 		break;
981 
982 	case SO_PASSSEC:
983 		if (valbool)
984 			set_bit(SOCK_PASSSEC, &sock->flags);
985 		else
986 			clear_bit(SOCK_PASSSEC, &sock->flags);
987 		break;
988 	case SO_MARK:
989 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
990 			ret = -EPERM;
991 		else
992 			sk->sk_mark = val;
993 		break;
994 
995 	case SO_RXQ_OVFL:
996 		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
997 		break;
998 
999 	case SO_WIFI_STATUS:
1000 		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1001 		break;
1002 
1003 	case SO_PEEK_OFF:
1004 		if (sock->ops->set_peek_off)
1005 			ret = sock->ops->set_peek_off(sk, val);
1006 		else
1007 			ret = -EOPNOTSUPP;
1008 		break;
1009 
1010 	case SO_NOFCS:
1011 		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1012 		break;
1013 
1014 	case SO_SELECT_ERR_QUEUE:
1015 		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1016 		break;
1017 
1018 #ifdef CONFIG_NET_RX_BUSY_POLL
1019 	case SO_BUSY_POLL:
1020 		/* allow unprivileged users to decrease the value */
1021 		if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
1022 			ret = -EPERM;
1023 		else {
1024 			if (val < 0)
1025 				ret = -EINVAL;
1026 			else
1027 				sk->sk_ll_usec = val;
1028 		}
1029 		break;
1030 #endif
1031 
1032 	case SO_MAX_PACING_RATE:
1033 		if (val != ~0U)
1034 			cmpxchg(&sk->sk_pacing_status,
1035 				SK_PACING_NONE,
1036 				SK_PACING_NEEDED);
1037 		sk->sk_max_pacing_rate = val;
1038 		sk->sk_pacing_rate = min(sk->sk_pacing_rate,
1039 					 sk->sk_max_pacing_rate);
1040 		break;
1041 
1042 	case SO_INCOMING_CPU:
1043 		sk->sk_incoming_cpu = val;
1044 		break;
1045 
1046 	case SO_CNX_ADVICE:
1047 		if (val == 1)
1048 			dst_negative_advice(sk);
1049 		break;
1050 
1051 	case SO_ZEROCOPY:
1052 		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1053 			if (sk->sk_protocol != IPPROTO_TCP)
1054 				ret = -ENOTSUPP;
1055 			else if (sk->sk_state != TCP_CLOSE)
1056 				ret = -EBUSY;
1057 		} else if (sk->sk_family != PF_RDS) {
1058 			ret = -ENOTSUPP;
1059 		}
1060 		if (!ret) {
1061 			if (val < 0 || val > 1)
1062 				ret = -EINVAL;
1063 			else
1064 				sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1065 			break;
1066 		}
1067 	default:
1068 		ret = -ENOPROTOOPT;
1069 		break;
1070 	}
1071 	release_sock(sk);
1072 	return ret;
1073 }
1074 EXPORT_SYMBOL(sock_setsockopt);
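/* Editorial sketch (not part of the original file): generic SOL_SOCKET options
 * set from user space end up in sock_setsockopt() above. Note the SO_RCVBUF
 * doubling: getsockopt() reports twice the requested value to account for
 * struct sk_buff overhead.
 *
 *	int one = 1, rcvbuf = 128 * 1024;
 *	socklen_t len = sizeof(rcvbuf);
 *
 *	setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcvbuf, sizeof(rcvbuf));
 *
 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcvbuf, &len);
 *	// rcvbuf now reads back doubled (and capped by sysctl_rmem_max).
 */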
1075 
1076 
1077 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1078 			  struct ucred *ucred)
1079 {
1080 	ucred->pid = pid_vnr(pid);
1081 	ucred->uid = ucred->gid = -1;
1082 	if (cred) {
1083 		struct user_namespace *current_ns = current_user_ns();
1084 
1085 		ucred->uid = from_kuid_munged(current_ns, cred->euid);
1086 		ucred->gid = from_kgid_munged(current_ns, cred->egid);
1087 	}
1088 }
1089 
1090 static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1091 {
1092 	struct user_namespace *user_ns = current_user_ns();
1093 	int i;
1094 
1095 	for (i = 0; i < src->ngroups; i++)
1096 		if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1097 			return -EFAULT;
1098 
1099 	return 0;
1100 }
1101 
1102 int sock_getsockopt(struct socket *sock, int level, int optname,
1103 		    char __user *optval, int __user *optlen)
1104 {
1105 	struct sock *sk = sock->sk;
1106 
1107 	union {
1108 		int val;
1109 		u64 val64;
1110 		struct linger ling;
1111 		struct timeval tm;
1112 	} v;
1113 
1114 	int lv = sizeof(int);
1115 	int len;
1116 
1117 	if (get_user(len, optlen))
1118 		return -EFAULT;
1119 	if (len < 0)
1120 		return -EINVAL;
1121 
1122 	memset(&v, 0, sizeof(v));
1123 
1124 	switch (optname) {
1125 	case SO_DEBUG:
1126 		v.val = sock_flag(sk, SOCK_DBG);
1127 		break;
1128 
1129 	case SO_DONTROUTE:
1130 		v.val = sock_flag(sk, SOCK_LOCALROUTE);
1131 		break;
1132 
1133 	case SO_BROADCAST:
1134 		v.val = sock_flag(sk, SOCK_BROADCAST);
1135 		break;
1136 
1137 	case SO_SNDBUF:
1138 		v.val = sk->sk_sndbuf;
1139 		break;
1140 
1141 	case SO_RCVBUF:
1142 		v.val = sk->sk_rcvbuf;
1143 		break;
1144 
1145 	case SO_REUSEADDR:
1146 		v.val = sk->sk_reuse;
1147 		break;
1148 
1149 	case SO_REUSEPORT:
1150 		v.val = sk->sk_reuseport;
1151 		break;
1152 
1153 	case SO_KEEPALIVE:
1154 		v.val = sock_flag(sk, SOCK_KEEPOPEN);
1155 		break;
1156 
1157 	case SO_TYPE:
1158 		v.val = sk->sk_type;
1159 		break;
1160 
1161 	case SO_PROTOCOL:
1162 		v.val = sk->sk_protocol;
1163 		break;
1164 
1165 	case SO_DOMAIN:
1166 		v.val = sk->sk_family;
1167 		break;
1168 
1169 	case SO_ERROR:
1170 		v.val = -sock_error(sk);
1171 		if (v.val == 0)
1172 			v.val = xchg(&sk->sk_err_soft, 0);
1173 		break;
1174 
1175 	case SO_OOBINLINE:
1176 		v.val = sock_flag(sk, SOCK_URGINLINE);
1177 		break;
1178 
1179 	case SO_NO_CHECK:
1180 		v.val = sk->sk_no_check_tx;
1181 		break;
1182 
1183 	case SO_PRIORITY:
1184 		v.val = sk->sk_priority;
1185 		break;
1186 
1187 	case SO_LINGER:
1188 		lv		= sizeof(v.ling);
1189 		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
1190 		v.ling.l_linger	= sk->sk_lingertime / HZ;
1191 		break;
1192 
1193 	case SO_BSDCOMPAT:
1194 		sock_warn_obsolete_bsdism("getsockopt");
1195 		break;
1196 
1197 	case SO_TIMESTAMP:
1198 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1199 				!sock_flag(sk, SOCK_RCVTSTAMPNS);
1200 		break;
1201 
1202 	case SO_TIMESTAMPNS:
1203 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1204 		break;
1205 
1206 	case SO_TIMESTAMPING:
1207 		v.val = sk->sk_tsflags;
1208 		break;
1209 
1210 	case SO_RCVTIMEO:
1211 		lv = sizeof(struct timeval);
1212 		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1213 			v.tm.tv_sec = 0;
1214 			v.tm.tv_usec = 0;
1215 		} else {
1216 			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1217 			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * USEC_PER_SEC) / HZ;
1218 		}
1219 		break;
1220 
1221 	case SO_SNDTIMEO:
1222 		lv = sizeof(struct timeval);
1223 		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1224 			v.tm.tv_sec = 0;
1225 			v.tm.tv_usec = 0;
1226 		} else {
1227 			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1228 			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * USEC_PER_SEC) / HZ;
1229 		}
1230 		break;
1231 
1232 	case SO_RCVLOWAT:
1233 		v.val = sk->sk_rcvlowat;
1234 		break;
1235 
1236 	case SO_SNDLOWAT:
1237 		v.val = 1;
1238 		break;
1239 
1240 	case SO_PASSCRED:
1241 		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1242 		break;
1243 
1244 	case SO_PEERCRED:
1245 	{
1246 		struct ucred peercred;
1247 		if (len > sizeof(peercred))
1248 			len = sizeof(peercred);
1249 		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1250 		if (copy_to_user(optval, &peercred, len))
1251 			return -EFAULT;
1252 		goto lenout;
1253 	}
1254 
1255 	case SO_PEERGROUPS:
1256 	{
1257 		int ret, n;
1258 
1259 		if (!sk->sk_peer_cred)
1260 			return -ENODATA;
1261 
1262 		n = sk->sk_peer_cred->group_info->ngroups;
1263 		if (len < n * sizeof(gid_t)) {
1264 			len = n * sizeof(gid_t);
1265 			return put_user(len, optlen) ? -EFAULT : -ERANGE;
1266 		}
1267 		len = n * sizeof(gid_t);
1268 
1269 		ret = groups_to_user((gid_t __user *)optval,
1270 				     sk->sk_peer_cred->group_info);
1271 		if (ret)
1272 			return ret;
1273 		goto lenout;
1274 	}
1275 
1276 	case SO_PEERNAME:
1277 	{
1278 		char address[128];
1279 
1280 		lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
1281 		if (lv < 0)
1282 			return -ENOTCONN;
1283 		if (lv < len)
1284 			return -EINVAL;
1285 		if (copy_to_user(optval, address, len))
1286 			return -EFAULT;
1287 		goto lenout;
1288 	}
1289 
1290 	/* Dubious BSD thing... Probably nobody even uses it, but
1291 	 * the UNIX standard wants it for whatever reason... -DaveM
1292 	 */
1293 	case SO_ACCEPTCONN:
1294 		v.val = sk->sk_state == TCP_LISTEN;
1295 		break;
1296 
1297 	case SO_PASSSEC:
1298 		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1299 		break;
1300 
1301 	case SO_PEERSEC:
1302 		return security_socket_getpeersec_stream(sock, optval, optlen, len);
1303 
1304 	case SO_MARK:
1305 		v.val = sk->sk_mark;
1306 		break;
1307 
1308 	case SO_RXQ_OVFL:
1309 		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1310 		break;
1311 
1312 	case SO_WIFI_STATUS:
1313 		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1314 		break;
1315 
1316 	case SO_PEEK_OFF:
1317 		if (!sock->ops->set_peek_off)
1318 			return -EOPNOTSUPP;
1319 
1320 		v.val = sk->sk_peek_off;
1321 		break;
1322 	case SO_NOFCS:
1323 		v.val = sock_flag(sk, SOCK_NOFCS);
1324 		break;
1325 
1326 	case SO_BINDTODEVICE:
1327 		return sock_getbindtodevice(sk, optval, optlen, len);
1328 
1329 	case SO_GET_FILTER:
1330 		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1331 		if (len < 0)
1332 			return len;
1333 
1334 		goto lenout;
1335 
1336 	case SO_LOCK_FILTER:
1337 		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1338 		break;
1339 
1340 	case SO_BPF_EXTENSIONS:
1341 		v.val = bpf_tell_extensions();
1342 		break;
1343 
1344 	case SO_SELECT_ERR_QUEUE:
1345 		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1346 		break;
1347 
1348 #ifdef CONFIG_NET_RX_BUSY_POLL
1349 	case SO_BUSY_POLL:
1350 		v.val = sk->sk_ll_usec;
1351 		break;
1352 #endif
1353 
1354 	case SO_MAX_PACING_RATE:
1355 		v.val = sk->sk_max_pacing_rate;
1356 		break;
1357 
1358 	case SO_INCOMING_CPU:
1359 		v.val = sk->sk_incoming_cpu;
1360 		break;
1361 
1362 	case SO_MEMINFO:
1363 	{
1364 		u32 meminfo[SK_MEMINFO_VARS];
1365 
1366 		if (get_user(len, optlen))
1367 			return -EFAULT;
1368 
1369 		sk_get_meminfo(sk, meminfo);
1370 
1371 		len = min_t(unsigned int, len, sizeof(meminfo));
1372 		if (copy_to_user(optval, &meminfo, len))
1373 			return -EFAULT;
1374 
1375 		goto lenout;
1376 	}
1377 
1378 #ifdef CONFIG_NET_RX_BUSY_POLL
1379 	case SO_INCOMING_NAPI_ID:
1380 		v.val = READ_ONCE(sk->sk_napi_id);
1381 
1382 		/* aggregate non-NAPI IDs down to 0 */
1383 		if (v.val < MIN_NAPI_ID)
1384 			v.val = 0;
1385 
1386 		break;
1387 #endif
1388 
1389 	case SO_COOKIE:
1390 		lv = sizeof(u64);
1391 		if (len < lv)
1392 			return -EINVAL;
1393 		v.val64 = sock_gen_cookie(sk);
1394 		break;
1395 
1396 	case SO_ZEROCOPY:
1397 		v.val = sock_flag(sk, SOCK_ZEROCOPY);
1398 		break;
1399 
1400 	default:
1401 		/* We implement SO_SNDLOWAT etc. to not be settable
1402 		 * (1003.1g 7).
1403 		 */
1404 		return -ENOPROTOOPT;
1405 	}
1406 
1407 	if (len > lv)
1408 		len = lv;
1409 	if (copy_to_user(optval, &v, len))
1410 		return -EFAULT;
1411 lenout:
1412 	if (put_user(len, optlen))
1413 		return -EFAULT;
1414 	return 0;
1415 }
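/* Editorial sketch (not part of the original file): reading one of the
 * read-only options handled above from user space.
 *
 *	uint64_t cookie;
 *	socklen_t len = sizeof(cookie);
 *
 *	// Per-socket 64-bit cookie, also reported via sock_diag.
 *	if (getsockopt(fd, SOL_SOCKET, SO_COOKIE, &cookie, &len) == 0)
 *		printf("socket cookie: %llu\n", (unsigned long long)cookie);
 */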
1416 
1417 /*
1418  * Initialize an sk_lock.
1419  *
1420  * (We also register the sk_lock with the lock validator.)
1421  */
1422 static inline void sock_lock_init(struct sock *sk)
1423 {
1424 	if (sk->sk_kern_sock)
1425 		sock_lock_init_class_and_name(
1426 			sk,
1427 			af_family_kern_slock_key_strings[sk->sk_family],
1428 			af_family_kern_slock_keys + sk->sk_family,
1429 			af_family_kern_key_strings[sk->sk_family],
1430 			af_family_kern_keys + sk->sk_family);
1431 	else
1432 		sock_lock_init_class_and_name(
1433 			sk,
1434 			af_family_slock_key_strings[sk->sk_family],
1435 			af_family_slock_keys + sk->sk_family,
1436 			af_family_key_strings[sk->sk_family],
1437 			af_family_keys + sk->sk_family);
1438 }
1439 
1440 /*
1441  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1442  * even temporarily, because of RCU lookups. sk_node should also be left as is.
1443  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end.
1444  */
1445 static void sock_copy(struct sock *nsk, const struct sock *osk)
1446 {
1447 #ifdef CONFIG_SECURITY_NETWORK
1448 	void *sptr = nsk->sk_security;
1449 #endif
1450 	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1451 
1452 	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1453 	       osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1454 
1455 #ifdef CONFIG_SECURITY_NETWORK
1456 	nsk->sk_security = sptr;
1457 	security_sk_clone(osk, nsk);
1458 #endif
1459 }
1460 
1461 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1462 		int family)
1463 {
1464 	struct sock *sk;
1465 	struct kmem_cache *slab;
1466 
1467 	slab = prot->slab;
1468 	if (slab != NULL) {
1469 		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1470 		if (!sk)
1471 			return sk;
1472 		if (priority & __GFP_ZERO)
1473 			sk_prot_clear_nulls(sk, prot->obj_size);
1474 	} else
1475 		sk = kmalloc(prot->obj_size, priority);
1476 
1477 	if (sk != NULL) {
1478 		if (security_sk_alloc(sk, family, priority))
1479 			goto out_free;
1480 
1481 		if (!try_module_get(prot->owner))
1482 			goto out_free_sec;
1483 		sk_tx_queue_clear(sk);
1484 	}
1485 
1486 	return sk;
1487 
1488 out_free_sec:
1489 	security_sk_free(sk);
1490 out_free:
1491 	if (slab != NULL)
1492 		kmem_cache_free(slab, sk);
1493 	else
1494 		kfree(sk);
1495 	return NULL;
1496 }
1497 
1498 static void sk_prot_free(struct proto *prot, struct sock *sk)
1499 {
1500 	struct kmem_cache *slab;
1501 	struct module *owner;
1502 
1503 	owner = prot->owner;
1504 	slab = prot->slab;
1505 
1506 	cgroup_sk_free(&sk->sk_cgrp_data);
1507 	mem_cgroup_sk_free(sk);
1508 	security_sk_free(sk);
1509 	if (slab != NULL)
1510 		kmem_cache_free(slab, sk);
1511 	else
1512 		kfree(sk);
1513 	module_put(owner);
1514 }
1515 
1516 /**
1517  *	sk_alloc - All socket objects are allocated here
1518  *	@net: the applicable net namespace
1519  *	@family: protocol family
1520  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1521  *	@prot: struct proto associated with this new sock instance
1522  *	@kern: is this to be a kernel socket?
1523  */
1524 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1525 		      struct proto *prot, int kern)
1526 {
1527 	struct sock *sk;
1528 
1529 	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1530 	if (sk) {
1531 		sk->sk_family = family;
1532 		/*
1533 		 * See comment in struct sock definition to understand
1534 		 * why we need sk_prot_creator -acme
1535 		 */
1536 		sk->sk_prot = sk->sk_prot_creator = prot;
1537 		sk->sk_kern_sock = kern;
1538 		sock_lock_init(sk);
1539 		sk->sk_net_refcnt = kern ? 0 : 1;
1540 		if (likely(sk->sk_net_refcnt)) {
1541 			get_net(net);
1542 			sock_inuse_add(net, 1);
1543 		}
1544 
1545 		sock_net_set(sk, net);
1546 		refcount_set(&sk->sk_wmem_alloc, 1);
1547 
1548 		mem_cgroup_sk_alloc(sk);
1549 		cgroup_sk_alloc(&sk->sk_cgrp_data);
1550 		sock_update_classid(&sk->sk_cgrp_data);
1551 		sock_update_netprioidx(&sk->sk_cgrp_data);
1552 	}
1553 
1554 	return sk;
1555 }
1556 EXPORT_SYMBOL(sk_alloc);
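/* Editorial sketch (not part of the original file): the usual shape of a
 * protocol family's .create() handler built on sk_alloc(). The family constant,
 * proto instance and function name are hypothetical; sock_init_data() is the
 * real companion that ties the sock to its socket and sets the defaults.
 *
 *	static int myproto_create(struct net *net, struct socket *sock,
 *				  int protocol, int kern)
 *	{
 *		struct sock *sk;
 *
 *		sk = sk_alloc(net, PF_MYPROTO, GFP_KERNEL, &myproto_proto, kern);
 *		if (!sk)
 *			return -ENOMEM;
 *
 *		sock_init_data(sock, sk);
 *		sk->sk_protocol = protocol;
 *		return 0;
 *	}
 */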
1557 
1558 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
1559  * grace period. This is the case for UDP sockets and TCP listeners.
1560  */
1561 static void __sk_destruct(struct rcu_head *head)
1562 {
1563 	struct sock *sk = container_of(head, struct sock, sk_rcu);
1564 	struct sk_filter *filter;
1565 
1566 	if (sk->sk_destruct)
1567 		sk->sk_destruct(sk);
1568 
1569 	filter = rcu_dereference_check(sk->sk_filter,
1570 				       refcount_read(&sk->sk_wmem_alloc) == 0);
1571 	if (filter) {
1572 		sk_filter_uncharge(sk, filter);
1573 		RCU_INIT_POINTER(sk->sk_filter, NULL);
1574 	}
1575 	if (rcu_access_pointer(sk->sk_reuseport_cb))
1576 		reuseport_detach_sock(sk);
1577 
1578 	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1579 
1580 	if (atomic_read(&sk->sk_omem_alloc))
1581 		pr_debug("%s: optmem leakage (%d bytes) detected\n",
1582 			 __func__, atomic_read(&sk->sk_omem_alloc));
1583 
1584 	if (sk->sk_frag.page) {
1585 		put_page(sk->sk_frag.page);
1586 		sk->sk_frag.page = NULL;
1587 	}
1588 
1589 	if (sk->sk_peer_cred)
1590 		put_cred(sk->sk_peer_cred);
1591 	put_pid(sk->sk_peer_pid);
1592 	if (likely(sk->sk_net_refcnt))
1593 		put_net(sock_net(sk));
1594 	sk_prot_free(sk->sk_prot_creator, sk);
1595 }
1596 
1597 void sk_destruct(struct sock *sk)
1598 {
1599 	if (sock_flag(sk, SOCK_RCU_FREE))
1600 		call_rcu(&sk->sk_rcu, __sk_destruct);
1601 	else
1602 		__sk_destruct(&sk->sk_rcu);
1603 }
1604 
1605 static void __sk_free(struct sock *sk)
1606 {
1607 	if (likely(sk->sk_net_refcnt))
1608 		sock_inuse_add(sock_net(sk), -1);
1609 
1610 	if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt))
1611 		sock_diag_broadcast_destroy(sk);
1612 	else
1613 		sk_destruct(sk);
1614 }
1615 
1616 void sk_free(struct sock *sk)
1617 {
1618 	/*
1619 	 * We subtract one from sk_wmem_alloc so we can tell whether
1620 	 * some packets are still in some tx queue.
1621 	 * If it is not zero, sock_wfree() will call __sk_free(sk) later.
1622 	 */
1623 	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
1624 		__sk_free(sk);
1625 }
1626 EXPORT_SYMBOL(sk_free);
1627 
1628 static void sk_init_common(struct sock *sk)
1629 {
1630 	skb_queue_head_init(&sk->sk_receive_queue);
1631 	skb_queue_head_init(&sk->sk_write_queue);
1632 	skb_queue_head_init(&sk->sk_error_queue);
1633 
1634 	rwlock_init(&sk->sk_callback_lock);
1635 	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
1636 			af_rlock_keys + sk->sk_family,
1637 			af_family_rlock_key_strings[sk->sk_family]);
1638 	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
1639 			af_wlock_keys + sk->sk_family,
1640 			af_family_wlock_key_strings[sk->sk_family]);
1641 	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
1642 			af_elock_keys + sk->sk_family,
1643 			af_family_elock_key_strings[sk->sk_family]);
1644 	lockdep_set_class_and_name(&sk->sk_callback_lock,
1645 			af_callback_keys + sk->sk_family,
1646 			af_family_clock_key_strings[sk->sk_family]);
1647 }
1648 
1649 /**
1650  *	sk_clone_lock - clone a socket, and lock its clone
1651  *	@sk: the socket to clone
1652  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1653  *
1654  *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1655  */
1656 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1657 {
1658 	struct sock *newsk;
1659 	bool is_charged = true;
1660 
1661 	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1662 	if (newsk != NULL) {
1663 		struct sk_filter *filter;
1664 
1665 		sock_copy(newsk, sk);
1666 
1667 		newsk->sk_prot_creator = sk->sk_prot;
1668 
1669 		/* SANITY */
1670 		if (likely(newsk->sk_net_refcnt))
1671 			get_net(sock_net(newsk));
1672 		sk_node_init(&newsk->sk_node);
1673 		sock_lock_init(newsk);
1674 		bh_lock_sock(newsk);
1675 		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
1676 		newsk->sk_backlog.len = 0;
1677 
1678 		atomic_set(&newsk->sk_rmem_alloc, 0);
1679 		/*
1680 		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1681 		 */
1682 		refcount_set(&newsk->sk_wmem_alloc, 1);
1683 		atomic_set(&newsk->sk_omem_alloc, 0);
1684 		sk_init_common(newsk);
1685 
1686 		newsk->sk_dst_cache	= NULL;
1687 		newsk->sk_dst_pending_confirm = 0;
1688 		newsk->sk_wmem_queued	= 0;
1689 		newsk->sk_forward_alloc = 0;
1690 		atomic_set(&newsk->sk_drops, 0);
1691 		newsk->sk_send_head	= NULL;
1692 		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1693 		atomic_set(&newsk->sk_zckey, 0);
1694 
1695 		sock_reset_flag(newsk, SOCK_DONE);
1696 		mem_cgroup_sk_alloc(newsk);
1697 		cgroup_sk_alloc(&newsk->sk_cgrp_data);
1698 
1699 		rcu_read_lock();
1700 		filter = rcu_dereference(sk->sk_filter);
1701 		if (filter != NULL)
1702 			/* Though it's an empty new sock, the charging may fail
1703 			 * if sysctl_optmem_max was changed between creation of
1704 			 * the original socket and cloning.
1705 			 */
1706 			is_charged = sk_filter_charge(newsk, filter);
1707 		RCU_INIT_POINTER(newsk->sk_filter, filter);
1708 		rcu_read_unlock();
1709 
1710 		if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1711 			/* We need to make sure that we don't uncharge the new
1712 			 * socket if we couldn't charge it in the first place
1713 			 * as otherwise we uncharge the parent's filter.
1714 			 */
1715 			if (!is_charged)
1716 				RCU_INIT_POINTER(newsk->sk_filter, NULL);
1717 			sk_free_unlock_clone(newsk);
1718 			newsk = NULL;
1719 			goto out;
1720 		}
1721 		RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
1722 
1723 		newsk->sk_err	   = 0;
1724 		newsk->sk_err_soft = 0;
1725 		newsk->sk_priority = 0;
1726 		newsk->sk_incoming_cpu = raw_smp_processor_id();
1727 		atomic64_set(&newsk->sk_cookie, 0);
1728 		if (likely(newsk->sk_net_refcnt))
1729 			sock_inuse_add(sock_net(newsk), 1);
1730 
1731 		/*
1732 		 * Before updating sk_refcnt, we must commit prior changes to memory
1733 		 * (Documentation/RCU/rculist_nulls.txt for details)
1734 		 */
1735 		smp_wmb();
1736 		refcount_set(&newsk->sk_refcnt, 2);
1737 
1738 		/*
1739 		 * Increment the counter in the same struct proto as the master
1740 		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1741 		 * is the same as sk->sk_prot->socks, as this field was copied
1742 		 * with memcpy).
1743 		 *
1744 		 * This _changes_ the previous behaviour, where
1745 		 * tcp_create_openreq_child always was incrementing the
1746 		 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
1747 		 * to be taken into account in all callers. -acme
1748 		 */
1749 		sk_refcnt_debug_inc(newsk);
1750 		sk_set_socket(newsk, NULL);
1751 		newsk->sk_wq = NULL;
1752 
1753 		if (newsk->sk_prot->sockets_allocated)
1754 			sk_sockets_allocated_inc(newsk);
1755 
1756 		if (sock_needs_netstamp(sk) &&
1757 		    newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1758 			net_enable_timestamp();
1759 	}
1760 out:
1761 	return newsk;
1762 }
1763 EXPORT_SYMBOL_GPL(sk_clone_lock);
1764 
1765 void sk_free_unlock_clone(struct sock *sk)
1766 {
1767 	/* It is still a raw copy of the parent, so invalidate
1768 	 * the destructor and do a plain sk_free(). */
1769 	sk->sk_destruct = NULL;
1770 	bh_unlock_sock(sk);
1771 	sk_free(sk);
1772 }
1773 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
1774 
1775 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1776 {
1777 	u32 max_segs = 1;
1778 
1779 	sk_dst_set(sk, dst);
1780 	sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
1781 	if (sk->sk_route_caps & NETIF_F_GSO)
1782 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1783 	sk->sk_route_caps &= ~sk->sk_route_nocaps;
1784 	if (sk_can_gso(sk)) {
1785 		if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
1786 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1787 		} else {
1788 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1789 			sk->sk_gso_max_size = dst->dev->gso_max_size;
1790 			max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
1791 		}
1792 	}
1793 	sk->sk_gso_max_segs = max_segs;
1794 }
1795 EXPORT_SYMBOL_GPL(sk_setup_caps);
1796 
1797 /*
1798  *	Simple resource managers for sockets.
1799  */
1800 
1801 
1802 /*
1803  * Write buffer destructor automatically called from kfree_skb.
1804  */
1805 void sock_wfree(struct sk_buff *skb)
1806 {
1807 	struct sock *sk = skb->sk;
1808 	unsigned int len = skb->truesize;
1809 
1810 	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1811 		/*
1812 		 * Keep a reference on sk_wmem_alloc; it will be released
1813 		 * after the sk_write_space() call.
1814 		 */
1815 		WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
1816 		sk->sk_write_space(sk);
1817 		len = 1;
1818 	}
1819 	/*
1820 	 * If sk_wmem_alloc reaches 0, we must finish what sk_free()
1821 	 * could not do because of in-flight packets.
1822 	 */
1823 	if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
1824 		__sk_free(sk);
1825 }
1826 EXPORT_SYMBOL(sock_wfree);
1827 
1828 /* This variant of sock_wfree() is used by TCP,
1829  * since it sets SOCK_USE_WRITE_QUEUE.
1830  */
1831 void __sock_wfree(struct sk_buff *skb)
1832 {
1833 	struct sock *sk = skb->sk;
1834 
1835 	if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
1836 		__sk_free(sk);
1837 }
1838 
1839 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
1840 {
1841 	skb_orphan(skb);
1842 	skb->sk = sk;
1843 #ifdef CONFIG_INET
1844 	if (unlikely(!sk_fullsock(sk))) {
1845 		skb->destructor = sock_edemux;
1846 		sock_hold(sk);
1847 		return;
1848 	}
1849 #endif
1850 	skb->destructor = sock_wfree;
1851 	skb_set_hash_from_sk(skb, sk);
1852 	/*
1853 	 * We used to take a refcount on sk, but the following operation
1854 	 * is enough to guarantee sk_free() won't free this sock until
1855 	 * all in-flight packets are completed.
1856 	 */
1857 	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
1858 }
1859 EXPORT_SYMBOL(skb_set_owner_w);
1860 
1861 /* This helper is used by netem, as it can hold packets in its
1862  * delay queue. We want to allow the owner socket to send more
1863  * packets, as if they were already TX completed by a typical driver.
1864  * But we also want to keep skb->sk set because some packet schedulers
1865  * rely on it (sch_fq for example).
1866  */
1867 void skb_orphan_partial(struct sk_buff *skb)
1868 {
1869 	if (skb_is_tcp_pure_ack(skb))
1870 		return;
1871 
1872 	if (skb->destructor == sock_wfree
1873 #ifdef CONFIG_INET
1874 	    || skb->destructor == tcp_wfree
1875 #endif
1876 		) {
1877 		struct sock *sk = skb->sk;
1878 
1879 		if (refcount_inc_not_zero(&sk->sk_refcnt)) {
1880 			WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc));
1881 			skb->destructor = sock_efree;
1882 		}
1883 	} else {
1884 		skb_orphan(skb);
1885 	}
1886 }
1887 EXPORT_SYMBOL(skb_orphan_partial);
1888 
1889 /*
1890  * Read buffer destructor automatically called from kfree_skb.
1891  */
1892 void sock_rfree(struct sk_buff *skb)
1893 {
1894 	struct sock *sk = skb->sk;
1895 	unsigned int len = skb->truesize;
1896 
1897 	atomic_sub(len, &sk->sk_rmem_alloc);
1898 	sk_mem_uncharge(sk, len);
1899 }
1900 EXPORT_SYMBOL(sock_rfree);
1901 
1902 /*
1903  * Buffer destructor for skbs that are not used directly in read or write
1904  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
1905  */
1906 void sock_efree(struct sk_buff *skb)
1907 {
1908 	sock_put(skb->sk);
1909 }
1910 EXPORT_SYMBOL(sock_efree);
1911 
1912 kuid_t sock_i_uid(struct sock *sk)
1913 {
1914 	kuid_t uid;
1915 
1916 	read_lock_bh(&sk->sk_callback_lock);
1917 	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1918 	read_unlock_bh(&sk->sk_callback_lock);
1919 	return uid;
1920 }
1921 EXPORT_SYMBOL(sock_i_uid);
1922 
1923 unsigned long sock_i_ino(struct sock *sk)
1924 {
1925 	unsigned long ino;
1926 
1927 	read_lock_bh(&sk->sk_callback_lock);
1928 	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1929 	read_unlock_bh(&sk->sk_callback_lock);
1930 	return ino;
1931 }
1932 EXPORT_SYMBOL(sock_i_ino);
1933 
1934 /*
1935  * Allocate a skb from the socket's send buffer.
1936  */
1937 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1938 			     gfp_t priority)
1939 {
1940 	if (force || refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1941 		struct sk_buff *skb = alloc_skb(size, priority);
1942 		if (skb) {
1943 			skb_set_owner_w(skb, sk);
1944 			return skb;
1945 		}
1946 	}
1947 	return NULL;
1948 }
1949 EXPORT_SYMBOL(sock_wmalloc);
1950 
1951 static void sock_ofree(struct sk_buff *skb)
1952 {
1953 	struct sock *sk = skb->sk;
1954 
1955 	atomic_sub(skb->truesize, &sk->sk_omem_alloc);
1956 }
1957 
1958 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
1959 			     gfp_t priority)
1960 {
1961 	struct sk_buff *skb;
1962 
1963 	/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
1964 	if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
1965 	    sysctl_optmem_max)
1966 		return NULL;
1967 
1968 	skb = alloc_skb(size, priority);
1969 	if (!skb)
1970 		return NULL;
1971 
1972 	atomic_add(skb->truesize, &sk->sk_omem_alloc);
1973 	skb->sk = sk;
1974 	skb->destructor = sock_ofree;
1975 	return skb;
1976 }
1977 
1978 /*
1979  * Allocate a memory block from the socket's option memory buffer.
1980  */
1981 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1982 {
1983 	if ((unsigned int)size <= sysctl_optmem_max &&
1984 	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1985 		void *mem;
1986 		/* First do the add, to avoid the race if kmalloc
1987 		 * might sleep.
1988 		 */
1989 		atomic_add(size, &sk->sk_omem_alloc);
1990 		mem = kmalloc(size, priority);
1991 		if (mem)
1992 			return mem;
1993 		atomic_sub(size, &sk->sk_omem_alloc);
1994 	}
1995 	return NULL;
1996 }
1997 EXPORT_SYMBOL(sock_kmalloc);
1998 
1999 /* Free an option memory block. Note: we actually want the inline
2000  * here as this allows gcc to detect the nullify and fold away the
2001  * condition entirely.
2002  */
2003 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2004 				  const bool nullify)
2005 {
2006 	if (WARN_ON_ONCE(!mem))
2007 		return;
2008 	if (nullify)
2009 		kzfree(mem);
2010 	else
2011 		kfree(mem);
2012 	atomic_sub(size, &sk->sk_omem_alloc);
2013 }
2014 
2015 void sock_kfree_s(struct sock *sk, void *mem, int size)
2016 {
2017 	__sock_kfree_s(sk, mem, size, false);
2018 }
2019 EXPORT_SYMBOL(sock_kfree_s);
2020 
2021 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2022 {
2023 	__sock_kfree_s(sk, mem, size, true);
2024 }
2025 EXPORT_SYMBOL(sock_kzfree_s);
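
/*
 * Example (illustrative sketch only, guarded out): a hypothetical
 * setsockopt() helper, example_set_key(), copies a user blob through
 * option memory.  sock_kmalloc() charges sk_omem_alloc (bounded by
 * optmem_max) and sock_kzfree_s() zeroes the sensitive data before
 * returning the charge.
 */
#if 0
static int example_set_key(struct sock *sk, const void __user *uptr, int len)
{
	void *key;
	int err = 0;

	key = sock_kmalloc(sk, len, GFP_KERNEL);
	if (!key)
		return -ENOBUFS;

	if (copy_from_user(key, uptr, len))
		err = -EFAULT;
	/* ... otherwise install the key ... */

	sock_kzfree_s(sk, key, len);
	return err;
}
#endif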
2026 
2027 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2028    I think these locks should be removed for datagram sockets.
2029  */
2030 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2031 {
2032 	DEFINE_WAIT(wait);
2033 
2034 	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2035 	for (;;) {
2036 		if (!timeo)
2037 			break;
2038 		if (signal_pending(current))
2039 			break;
2040 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2041 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2042 		if (refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
2043 			break;
2044 		if (sk->sk_shutdown & SEND_SHUTDOWN)
2045 			break;
2046 		if (sk->sk_err)
2047 			break;
2048 		timeo = schedule_timeout(timeo);
2049 	}
2050 	finish_wait(sk_sleep(sk), &wait);
2051 	return timeo;
2052 }
2053 
2054 
2055 /*
2056  *	Generic send/receive buffer handlers
2057  */
2058 
2059 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2060 				     unsigned long data_len, int noblock,
2061 				     int *errcode, int max_page_order)
2062 {
2063 	struct sk_buff *skb;
2064 	long timeo;
2065 	int err;
2066 
2067 	timeo = sock_sndtimeo(sk, noblock);
2068 	for (;;) {
2069 		err = sock_error(sk);
2070 		if (err != 0)
2071 			goto failure;
2072 
2073 		err = -EPIPE;
2074 		if (sk->sk_shutdown & SEND_SHUTDOWN)
2075 			goto failure;
2076 
2077 		if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
2078 			break;
2079 
2080 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2081 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2082 		err = -EAGAIN;
2083 		if (!timeo)
2084 			goto failure;
2085 		if (signal_pending(current))
2086 			goto interrupted;
2087 		timeo = sock_wait_for_wmem(sk, timeo);
2088 	}
2089 	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2090 				   errcode, sk->sk_allocation);
2091 	if (skb)
2092 		skb_set_owner_w(skb, sk);
2093 	return skb;
2094 
2095 interrupted:
2096 	err = sock_intr_errno(timeo);
2097 failure:
2098 	*errcode = err;
2099 	return NULL;
2100 }
2101 EXPORT_SYMBOL(sock_alloc_send_pskb);
2102 
2103 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
2104 				    int noblock, int *errcode)
2105 {
2106 	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
2107 }
2108 EXPORT_SYMBOL(sock_alloc_send_skb);
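
/*
 * Example (illustrative sketch only, guarded out): a hypothetical datagram
 * example_sendmsg() showing the common calling convention.  The allocation
 * blocks, subject to sk_sndtimeo, until send buffer space is available,
 * and the resulting skb is already charged to the socket via
 * skb_set_owner_w().
 */
#if 0
static int example_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
	struct sk_buff *skb;
	int err;

	skb = sock_alloc_send_skb(sk, len + MAX_HEADER,
				  msg->msg_flags & MSG_DONTWAIT, &err);
	if (!skb)
		return err;

	err = memcpy_from_msg(skb_put(skb, len), msg, len);
	if (err) {
		kfree_skb(skb);
		return err;
	}
	/* ... build headers and transmit ... */
	return len;
}
#endif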
2109 
2110 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2111 		     struct sockcm_cookie *sockc)
2112 {
2113 	u32 tsflags;
2114 
2115 	switch (cmsg->cmsg_type) {
2116 	case SO_MARK:
2117 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2118 			return -EPERM;
2119 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2120 			return -EINVAL;
2121 		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2122 		break;
2123 	case SO_TIMESTAMPING:
2124 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2125 			return -EINVAL;
2126 
2127 		tsflags = *(u32 *)CMSG_DATA(cmsg);
2128 		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2129 			return -EINVAL;
2130 
2131 		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2132 		sockc->tsflags |= tsflags;
2133 		break;
2134 	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2135 	case SCM_RIGHTS:
2136 	case SCM_CREDENTIALS:
2137 		break;
2138 	default:
2139 		return -EINVAL;
2140 	}
2141 	return 0;
2142 }
2143 EXPORT_SYMBOL(__sock_cmsg_send);
2144 
2145 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2146 		   struct sockcm_cookie *sockc)
2147 {
2148 	struct cmsghdr *cmsg;
2149 	int ret;
2150 
2151 	for_each_cmsghdr(cmsg, msg) {
2152 		if (!CMSG_OK(msg, cmsg))
2153 			return -EINVAL;
2154 		if (cmsg->cmsg_level != SOL_SOCKET)
2155 			continue;
2156 		ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2157 		if (ret)
2158 			return ret;
2159 	}
2160 	return 0;
2161 }
2162 EXPORT_SYMBOL(sock_cmsg_send);
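
/*
 * Example (illustrative sketch only, guarded out): how a protocol's
 * sendmsg() typically seeds a sockcm_cookie with the socket defaults and
 * lets sock_cmsg_send() override them from SOL_SOCKET control messages
 * (SO_MARK, SO_TIMESTAMPING).
 */
#if 0
static int example_parse_cmsgs(struct sock *sk, struct msghdr *msg)
{
	struct sockcm_cookie sockc = { .tsflags = sk->sk_tsflags };
	int err;

	if (msg->msg_controllen) {
		err = sock_cmsg_send(sk, msg, &sockc);
		if (err)
			return err;
	}
	/* sockc.mark and sockc.tsflags are now ready for skb setup */
	return 0;
}
#endif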
2163 
2164 static void sk_enter_memory_pressure(struct sock *sk)
2165 {
2166 	if (!sk->sk_prot->enter_memory_pressure)
2167 		return;
2168 
2169 	sk->sk_prot->enter_memory_pressure(sk);
2170 }
2171 
2172 static void sk_leave_memory_pressure(struct sock *sk)
2173 {
2174 	if (sk->sk_prot->leave_memory_pressure) {
2175 		sk->sk_prot->leave_memory_pressure(sk);
2176 	} else {
2177 		unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2178 
2179 		if (memory_pressure && *memory_pressure)
2180 			*memory_pressure = 0;
2181 	}
2182 }
2183 
2184 /* On 32bit arches, an skb frag is limited to 2^15 */
2185 #define SKB_FRAG_PAGE_ORDER	get_order(32768)
2186 
2187 /**
2188  * skb_page_frag_refill - check that a page_frag contains enough room
2189  * @sz: minimum size of the fragment we want to get
2190  * @pfrag: pointer to page_frag
2191  * @gfp: priority for memory allocation
2192  *
2193  * Note: While this allocator tries to use high order pages, there is
2194  * no guarantee that allocations succeed. Therefore, @sz MUST be
2195  * less than or equal to PAGE_SIZE.
2196  */
2197 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2198 {
2199 	if (pfrag->page) {
2200 		if (page_ref_count(pfrag->page) == 1) {
2201 			pfrag->offset = 0;
2202 			return true;
2203 		}
2204 		if (pfrag->offset + sz <= pfrag->size)
2205 			return true;
2206 		put_page(pfrag->page);
2207 	}
2208 
2209 	pfrag->offset = 0;
2210 	if (SKB_FRAG_PAGE_ORDER) {
2211 		/* Avoid direct reclaim but allow kswapd to wake */
2212 		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2213 					  __GFP_COMP | __GFP_NOWARN |
2214 					  __GFP_NORETRY,
2215 					  SKB_FRAG_PAGE_ORDER);
2216 		if (likely(pfrag->page)) {
2217 			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2218 			return true;
2219 		}
2220 	}
2221 	pfrag->page = alloc_page(gfp);
2222 	if (likely(pfrag->page)) {
2223 		pfrag->size = PAGE_SIZE;
2224 		return true;
2225 	}
2226 	return false;
2227 }
2228 EXPORT_SYMBOL(skb_page_frag_refill);
2229 
2230 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2231 {
2232 	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2233 		return true;
2234 
2235 	sk_enter_memory_pressure(sk);
2236 	sk_stream_moderate_sndbuf(sk);
2237 	return false;
2238 }
2239 EXPORT_SYMBOL(sk_page_frag_refill);
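
/*
 * Example (illustrative sketch only, guarded out): copying user data into
 * the per-socket (or per-task) page fragment.  example_copy_to_frag() is a
 * hypothetical helper; real users such as TCP additionally attach the page
 * to an skb frag, and they do all of this under the socket lock.
 */
#if 0
static int example_copy_to_frag(struct sock *sk, struct msghdr *msg, int copy)
{
	struct page_frag *pfrag = sk_page_frag(sk);

	if (!sk_page_frag_refill(sk, pfrag))
		return -ENOMEM;

	copy = min_t(int, copy, pfrag->size - pfrag->offset);
	if (copy_page_from_iter(pfrag->page, pfrag->offset, copy,
				&msg->msg_iter) != copy)
		return -EFAULT;

	pfrag->offset += copy;
	return copy;
}
#endif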
2240 
2241 static void __lock_sock(struct sock *sk)
2242 	__releases(&sk->sk_lock.slock)
2243 	__acquires(&sk->sk_lock.slock)
2244 {
2245 	DEFINE_WAIT(wait);
2246 
2247 	for (;;) {
2248 		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2249 					TASK_UNINTERRUPTIBLE);
2250 		spin_unlock_bh(&sk->sk_lock.slock);
2251 		schedule();
2252 		spin_lock_bh(&sk->sk_lock.slock);
2253 		if (!sock_owned_by_user(sk))
2254 			break;
2255 	}
2256 	finish_wait(&sk->sk_lock.wq, &wait);
2257 }
2258 
2259 static void __release_sock(struct sock *sk)
2260 	__releases(&sk->sk_lock.slock)
2261 	__acquires(&sk->sk_lock.slock)
2262 {
2263 	struct sk_buff *skb, *next;
2264 
2265 	while ((skb = sk->sk_backlog.head) != NULL) {
2266 		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2267 
2268 		spin_unlock_bh(&sk->sk_lock.slock);
2269 
2270 		do {
2271 			next = skb->next;
2272 			prefetch(next);
2273 			WARN_ON_ONCE(skb_dst_is_noref(skb));
2274 			skb->next = NULL;
2275 			sk_backlog_rcv(sk, skb);
2276 
2277 			cond_resched();
2278 
2279 			skb = next;
2280 		} while (skb != NULL);
2281 
2282 		spin_lock_bh(&sk->sk_lock.slock);
2283 	}
2284 
2285 	/*
2286 	 * Doing the zeroing here guarantees we cannot loop forever
2287 	 * while a wild producer attempts to flood us.
2288 	 */
2289 	sk->sk_backlog.len = 0;
2290 }
2291 
2292 void __sk_flush_backlog(struct sock *sk)
2293 {
2294 	spin_lock_bh(&sk->sk_lock.slock);
2295 	__release_sock(sk);
2296 	spin_unlock_bh(&sk->sk_lock.slock);
2297 }
2298 
2299 /**
2300  * sk_wait_data - wait for data to arrive at sk_receive_queue
2301  * @sk:    sock to wait on
2302  * @timeo: for how long
2303  * @skb:   last skb seen on sk_receive_queue
2304  *
2305  * Now socket state including sk->sk_err is changed only under lock,
2306  * hence we may omit checks after joining wait queue.
2307  * We check the receive queue before schedule() only as an optimization;
2308  * it is very likely that release_sock() added new data.
2309  */
2310 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2311 {
2312 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
2313 	int rc;
2314 
2315 	add_wait_queue(sk_sleep(sk), &wait);
2316 	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2317 	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2318 	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2319 	remove_wait_queue(sk_sleep(sk), &wait);
2320 	return rc;
2321 }
2322 EXPORT_SYMBOL(sk_wait_data);
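
/*
 * Example (illustrative sketch only, guarded out): a hypothetical receive
 * path waiting for data.  The caller holds the socket lock; sk_wait_data()
 * drops and re-acquires it around the sleep via sk_wait_event().
 */
#if 0
static struct sk_buff *example_wait_for_skb(struct sock *sk, int noblock,
					    int *err)
{
	long timeo = sock_rcvtimeo(sk, noblock);
	struct sk_buff *skb;

	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
		if (!timeo) {
			*err = -EAGAIN;
			return NULL;
		}
		if (signal_pending(current)) {
			*err = sock_intr_errno(timeo);
			return NULL;
		}
		sk_wait_data(sk, &timeo, NULL);
	}
	return skb;
}
#endif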
2323 
2324 /**
2325  *	__sk_mem_raise_allocated - increase memory_allocated
2326  *	@sk: socket
2327  *	@size: memory size to allocate
2328  *	@amt: pages to allocate
2329  *	@kind: allocation type
2330  *
2331  *	Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2332  */
2333 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2334 {
2335 	struct proto *prot = sk->sk_prot;
2336 	long allocated = sk_memory_allocated_add(sk, amt);
2337 
2338 	if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2339 	    !mem_cgroup_charge_skmem(sk->sk_memcg, amt))
2340 		goto suppress_allocation;
2341 
2342 	/* Under limit. */
2343 	if (allocated <= sk_prot_mem_limits(sk, 0)) {
2344 		sk_leave_memory_pressure(sk);
2345 		return 1;
2346 	}
2347 
2348 	/* Under pressure. */
2349 	if (allocated > sk_prot_mem_limits(sk, 1))
2350 		sk_enter_memory_pressure(sk);
2351 
2352 	/* Over hard limit. */
2353 	if (allocated > sk_prot_mem_limits(sk, 2))
2354 		goto suppress_allocation;
2355 
2356 	/* guarantee minimum buffer size under pressure */
2357 	if (kind == SK_MEM_RECV) {
2358 		if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
2359 			return 1;
2360 
2361 	} else { /* SK_MEM_SEND */
2362 		int wmem0 = sk_get_wmem0(sk, prot);
2363 
2364 		if (sk->sk_type == SOCK_STREAM) {
2365 			if (sk->sk_wmem_queued < wmem0)
2366 				return 1;
2367 		} else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
2368 				return 1;
2369 		}
2370 	}
2371 
2372 	if (sk_has_memory_pressure(sk)) {
2373 		int alloc;
2374 
2375 		if (!sk_under_memory_pressure(sk))
2376 			return 1;
2377 		alloc = sk_sockets_allocated_read_positive(sk);
2378 		if (sk_prot_mem_limits(sk, 2) > alloc *
2379 		    sk_mem_pages(sk->sk_wmem_queued +
2380 				 atomic_read(&sk->sk_rmem_alloc) +
2381 				 sk->sk_forward_alloc))
2382 			return 1;
2383 	}
2384 
2385 suppress_allocation:
2386 
2387 	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2388 		sk_stream_moderate_sndbuf(sk);
2389 
2390 		/* Fail only if socket is _under_ its sndbuf.
2391 		 * In this case we cannot block, so we have to fail.
2392 		 */
2393 		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2394 			return 1;
2395 	}
2396 
2397 	trace_sock_exceed_buf_limit(sk, prot, allocated);
2398 
2399 	sk_memory_allocated_sub(sk, amt);
2400 
2401 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2402 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2403 
2404 	return 0;
2405 }
2406 EXPORT_SYMBOL(__sk_mem_raise_allocated);
2407 
2408 /**
2409  *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2410  *	@sk: socket
2411  *	@size: memory size to allocate
2412  *	@kind: allocation type
2413  *
2414  *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2415  *	rmem allocation. This function assumes that protocols which have
2416  *	memory_pressure use sk_wmem_queued as write buffer accounting.
2417  */
2418 int __sk_mem_schedule(struct sock *sk, int size, int kind)
2419 {
2420 	int ret, amt = sk_mem_pages(size);
2421 
2422 	sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2423 	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2424 	if (!ret)
2425 		sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2426 	return ret;
2427 }
2428 EXPORT_SYMBOL(__sk_mem_schedule);
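
/*
 * Example (illustrative sketch only, guarded out): charging receive memory
 * before queueing an skb.  sk_rmem_schedule() consumes sk_forward_alloc
 * first and only falls back to __sk_mem_schedule(sk, size, SK_MEM_RECV)
 * when the per-socket reserve is exhausted.  example_queue_rcv() is a
 * hypothetical name.
 */
#if 0
static int example_queue_rcv(struct sock *sk, struct sk_buff *skb)
{
	if (!sk_rmem_schedule(sk, skb, skb->truesize))
		return -ENOBUFS;

	skb_set_owner_r(skb, sk);
	__skb_queue_tail(&sk->sk_receive_queue, skb);
	return 0;
}
#endif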
2429 
2430 /**
2431  *	__sk_mem_reduce_allocated - reclaim memory_allocated
2432  *	@sk: socket
2433  *	@amount: number of quanta
2434  *
2435  *	Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2436  */
2437 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2438 {
2439 	sk_memory_allocated_sub(sk, amount);
2440 
2441 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2442 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2443 
2444 	if (sk_under_memory_pressure(sk) &&
2445 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2446 		sk_leave_memory_pressure(sk);
2447 }
2448 EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2449 
2450 /**
2451  *	__sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2452  *	@sk: socket
2453  *	@amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2454  */
2455 void __sk_mem_reclaim(struct sock *sk, int amount)
2456 {
2457 	amount >>= SK_MEM_QUANTUM_SHIFT;
2458 	sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2459 	__sk_mem_reduce_allocated(sk, amount);
2460 }
2461 EXPORT_SYMBOL(__sk_mem_reclaim);
2462 
2463 int sk_set_peek_off(struct sock *sk, int val)
2464 {
2465 	sk->sk_peek_off = val;
2466 	return 0;
2467 }
2468 EXPORT_SYMBOL_GPL(sk_set_peek_off);
2469 
2470 /*
2471  * Set of default routines for initialising struct proto_ops when
2472  * the protocol does not support a particular function. In certain
2473  * cases where it makes no sense for a protocol to have a "do nothing"
2474  * function, some default processing is provided.
2475  */
2476 
2477 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2478 {
2479 	return -EOPNOTSUPP;
2480 }
2481 EXPORT_SYMBOL(sock_no_bind);
2482 
2483 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2484 		    int len, int flags)
2485 {
2486 	return -EOPNOTSUPP;
2487 }
2488 EXPORT_SYMBOL(sock_no_connect);
2489 
2490 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2491 {
2492 	return -EOPNOTSUPP;
2493 }
2494 EXPORT_SYMBOL(sock_no_socketpair);
2495 
2496 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
2497 		   bool kern)
2498 {
2499 	return -EOPNOTSUPP;
2500 }
2501 EXPORT_SYMBOL(sock_no_accept);
2502 
2503 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2504 		    int peer)
2505 {
2506 	return -EOPNOTSUPP;
2507 }
2508 EXPORT_SYMBOL(sock_no_getname);
2509 
2510 __poll_t sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
2511 {
2512 	return 0;
2513 }
2514 EXPORT_SYMBOL(sock_no_poll);
2515 
2516 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2517 {
2518 	return -EOPNOTSUPP;
2519 }
2520 EXPORT_SYMBOL(sock_no_ioctl);
2521 
2522 int sock_no_listen(struct socket *sock, int backlog)
2523 {
2524 	return -EOPNOTSUPP;
2525 }
2526 EXPORT_SYMBOL(sock_no_listen);
2527 
2528 int sock_no_shutdown(struct socket *sock, int how)
2529 {
2530 	return -EOPNOTSUPP;
2531 }
2532 EXPORT_SYMBOL(sock_no_shutdown);
2533 
2534 int sock_no_setsockopt(struct socket *sock, int level, int optname,
2535 		    char __user *optval, unsigned int optlen)
2536 {
2537 	return -EOPNOTSUPP;
2538 }
2539 EXPORT_SYMBOL(sock_no_setsockopt);
2540 
2541 int sock_no_getsockopt(struct socket *sock, int level, int optname,
2542 		    char __user *optval, int __user *optlen)
2543 {
2544 	return -EOPNOTSUPP;
2545 }
2546 EXPORT_SYMBOL(sock_no_getsockopt);
2547 
2548 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
2549 {
2550 	return -EOPNOTSUPP;
2551 }
2552 EXPORT_SYMBOL(sock_no_sendmsg);
2553 
2554 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
2555 {
2556 	return -EOPNOTSUPP;
2557 }
2558 EXPORT_SYMBOL(sock_no_sendmsg_locked);
2559 
2560 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
2561 		    int flags)
2562 {
2563 	return -EOPNOTSUPP;
2564 }
2565 EXPORT_SYMBOL(sock_no_recvmsg);
2566 
2567 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2568 {
2569 	/* Mirror missing mmap method error code */
2570 	return -ENODEV;
2571 }
2572 EXPORT_SYMBOL(sock_no_mmap);
2573 
2574 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2575 {
2576 	ssize_t res;
2577 	struct msghdr msg = {.msg_flags = flags};
2578 	struct kvec iov;
2579 	char *kaddr = kmap(page);
2580 	iov.iov_base = kaddr + offset;
2581 	iov.iov_len = size;
2582 	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2583 	kunmap(page);
2584 	return res;
2585 }
2586 EXPORT_SYMBOL(sock_no_sendpage);
2587 
2588 ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
2589 				int offset, size_t size, int flags)
2590 {
2591 	ssize_t res;
2592 	struct msghdr msg = {.msg_flags = flags};
2593 	struct kvec iov;
2594 	char *kaddr = kmap(page);
2595 
2596 	iov.iov_base = kaddr + offset;
2597 	iov.iov_len = size;
2598 	res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
2599 	kunmap(page);
2600 	return res;
2601 }
2602 EXPORT_SYMBOL(sock_no_sendpage_locked);
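
/*
 * Example (illustrative sketch only, guarded out): a hypothetical datagram
 * protocol filling its proto_ops with the sock_no_*() stubs for every
 * operation it does not implement.  PF_EXAMPLE and the example_dgram_*()
 * handlers are made-up names.
 */
#if 0
static const struct proto_ops example_dgram_ops = {
	.family		= PF_EXAMPLE,
	.owner		= THIS_MODULE,
	.release	= example_dgram_release,
	.bind		= example_dgram_bind,
	.connect	= sock_no_connect,
	.socketpair	= sock_no_socketpair,
	.accept		= sock_no_accept,
	.getname	= example_dgram_getname,
	.poll		= datagram_poll,
	.ioctl		= sock_no_ioctl,
	.listen		= sock_no_listen,
	.shutdown	= sock_no_shutdown,
	.setsockopt	= sock_no_setsockopt,
	.getsockopt	= sock_no_getsockopt,
	.sendmsg	= example_dgram_sendmsg,
	.recvmsg	= example_dgram_recvmsg,
	.mmap		= sock_no_mmap,
	.sendpage	= sock_no_sendpage,
};
#endif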
2603 
2604 /*
2605  *	Default Socket Callbacks
2606  */
2607 
2608 static void sock_def_wakeup(struct sock *sk)
2609 {
2610 	struct socket_wq *wq;
2611 
2612 	rcu_read_lock();
2613 	wq = rcu_dereference(sk->sk_wq);
2614 	if (skwq_has_sleeper(wq))
2615 		wake_up_interruptible_all(&wq->wait);
2616 	rcu_read_unlock();
2617 }
2618 
2619 static void sock_def_error_report(struct sock *sk)
2620 {
2621 	struct socket_wq *wq;
2622 
2623 	rcu_read_lock();
2624 	wq = rcu_dereference(sk->sk_wq);
2625 	if (skwq_has_sleeper(wq))
2626 		wake_up_interruptible_poll(&wq->wait, EPOLLERR);
2627 	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2628 	rcu_read_unlock();
2629 }
2630 
2631 static void sock_def_readable(struct sock *sk)
2632 {
2633 	struct socket_wq *wq;
2634 
2635 	rcu_read_lock();
2636 	wq = rcu_dereference(sk->sk_wq);
2637 	if (skwq_has_sleeper(wq))
2638 		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
2639 						EPOLLRDNORM | EPOLLRDBAND);
2640 	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2641 	rcu_read_unlock();
2642 }
2643 
2644 static void sock_def_write_space(struct sock *sk)
2645 {
2646 	struct socket_wq *wq;
2647 
2648 	rcu_read_lock();
2649 
2650 	/* Do not wake up a writer until he can make "significant"
2651 	 * progress.  --DaveM
2652 	 */
2653 	if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2654 		wq = rcu_dereference(sk->sk_wq);
2655 		if (skwq_has_sleeper(wq))
2656 			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
2657 						EPOLLWRNORM | EPOLLWRBAND);
2658 
2659 		/* Should agree with poll, otherwise some programs break */
2660 		if (sock_writeable(sk))
2661 			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2662 	}
2663 
2664 	rcu_read_unlock();
2665 }
2666 
2667 static void sock_def_destruct(struct sock *sk)
2668 {
2669 }
2670 
2671 void sk_send_sigurg(struct sock *sk)
2672 {
2673 	if (sk->sk_socket && sk->sk_socket->file)
2674 		if (send_sigurg(&sk->sk_socket->file->f_owner))
2675 			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2676 }
2677 EXPORT_SYMBOL(sk_send_sigurg);
2678 
2679 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2680 		    unsigned long expires)
2681 {
2682 	if (!mod_timer(timer, expires))
2683 		sock_hold(sk);
2684 }
2685 EXPORT_SYMBOL(sk_reset_timer);
2686 
2687 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2688 {
2689 	if (del_timer(timer))
2690 		__sock_put(sk);
2691 }
2692 EXPORT_SYMBOL(sk_stop_timer);
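
/*
 * Example (illustrative sketch only, guarded out): arming and disarming a
 * per-socket timer.  sk_reset_timer() takes a reference on the socket only
 * when the timer was not already pending, and sk_stop_timer() drops it on
 * deletion, so the socket cannot be freed while the timer is armed.
 */
#if 0
static void example_arm_timer(struct sock *sk, unsigned long delay)
{
	sk_reset_timer(sk, &sk->sk_timer, jiffies + delay);
}

static void example_disarm_timer(struct sock *sk)
{
	sk_stop_timer(sk, &sk->sk_timer);
}
#endif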
2693 
2694 void sock_init_data(struct socket *sock, struct sock *sk)
2695 {
2696 	sk_init_common(sk);
2697 	sk->sk_send_head	=	NULL;
2698 
2699 	timer_setup(&sk->sk_timer, NULL, 0);
2700 
2701 	sk->sk_allocation	=	GFP_KERNEL;
2702 	sk->sk_rcvbuf		=	sysctl_rmem_default;
2703 	sk->sk_sndbuf		=	sysctl_wmem_default;
2704 	sk->sk_state		=	TCP_CLOSE;
2705 	sk_set_socket(sk, sock);
2706 
2707 	sock_set_flag(sk, SOCK_ZAPPED);
2708 
2709 	if (sock) {
2710 		sk->sk_type	=	sock->type;
2711 		sk->sk_wq	=	sock->wq;
2712 		sock->sk	=	sk;
2713 		sk->sk_uid	=	SOCK_INODE(sock)->i_uid;
2714 	} else {
2715 		sk->sk_wq	=	NULL;
2716 		sk->sk_uid	=	make_kuid(sock_net(sk)->user_ns, 0);
2717 	}
2718 
2719 	rwlock_init(&sk->sk_callback_lock);
2720 	if (sk->sk_kern_sock)
2721 		lockdep_set_class_and_name(
2722 			&sk->sk_callback_lock,
2723 			af_kern_callback_keys + sk->sk_family,
2724 			af_family_kern_clock_key_strings[sk->sk_family]);
2725 	else
2726 		lockdep_set_class_and_name(
2727 			&sk->sk_callback_lock,
2728 			af_callback_keys + sk->sk_family,
2729 			af_family_clock_key_strings[sk->sk_family]);
2730 
2731 	sk->sk_state_change	=	sock_def_wakeup;
2732 	sk->sk_data_ready	=	sock_def_readable;
2733 	sk->sk_write_space	=	sock_def_write_space;
2734 	sk->sk_error_report	=	sock_def_error_report;
2735 	sk->sk_destruct		=	sock_def_destruct;
2736 
2737 	sk->sk_frag.page	=	NULL;
2738 	sk->sk_frag.offset	=	0;
2739 	sk->sk_peek_off		=	-1;
2740 
2741 	sk->sk_peer_pid 	=	NULL;
2742 	sk->sk_peer_cred	=	NULL;
2743 	sk->sk_write_pending	=	0;
2744 	sk->sk_rcvlowat		=	1;
2745 	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
2746 	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
2747 
2748 	sk->sk_stamp = SK_DEFAULT_STAMP;
2749 	atomic_set(&sk->sk_zckey, 0);
2750 
2751 #ifdef CONFIG_NET_RX_BUSY_POLL
2752 	sk->sk_napi_id		=	0;
2753 	sk->sk_ll_usec		=	sysctl_net_busy_read;
2754 #endif
2755 
2756 	sk->sk_max_pacing_rate = ~0U;
2757 	sk->sk_pacing_rate = ~0U;
2758 	sk->sk_pacing_shift = 10;
2759 	sk->sk_incoming_cpu = -1;
2760 	/*
2761 	 * Before updating sk_refcnt, we must commit prior changes to memory
2762 	 * (Documentation/RCU/rculist_nulls.txt for details)
2763 	 */
2764 	smp_wmb();
2765 	refcount_set(&sk->sk_refcnt, 1);
2766 	atomic_set(&sk->sk_drops, 0);
2767 }
2768 EXPORT_SYMBOL(sock_init_data);
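
/*
 * Example (illustrative sketch only, guarded out): a hypothetical create()
 * handler calling sock_init_data() and then overriding a couple of the
 * default callbacks it installed.  PF_EXAMPLE, example_proto and the
 * example_*() callbacks are made-up names.
 */
#if 0
static int example_create(struct net *net, struct socket *sock,
			  int protocol, int kern)
{
	struct sock *sk;

	sk = sk_alloc(net, PF_EXAMPLE, GFP_KERNEL, &example_proto, kern);
	if (!sk)
		return -ENOMEM;

	sock_init_data(sock, sk);

	sk->sk_data_ready = example_data_ready;
	sk->sk_destruct   = example_destruct;
	return 0;
}
#endif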
2769 
2770 void lock_sock_nested(struct sock *sk, int subclass)
2771 {
2772 	might_sleep();
2773 	spin_lock_bh(&sk->sk_lock.slock);
2774 	if (sk->sk_lock.owned)
2775 		__lock_sock(sk);
2776 	sk->sk_lock.owned = 1;
2777 	spin_unlock(&sk->sk_lock.slock);
2778 	/*
2779 	 * The sk_lock has mutex_lock() semantics here:
2780 	 */
2781 	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2782 	local_bh_enable();
2783 }
2784 EXPORT_SYMBOL(lock_sock_nested);
2785 
2786 void release_sock(struct sock *sk)
2787 {
2788 	spin_lock_bh(&sk->sk_lock.slock);
2789 	if (sk->sk_backlog.tail)
2790 		__release_sock(sk);
2791 
2792 	/* Warning : release_cb() might need to release sk ownership,
2793 	 * ie call sock_release_ownership(sk) before us.
2794 	 */
2795 	if (sk->sk_prot->release_cb)
2796 		sk->sk_prot->release_cb(sk);
2797 
2798 	sock_release_ownership(sk);
2799 	if (waitqueue_active(&sk->sk_lock.wq))
2800 		wake_up(&sk->sk_lock.wq);
2801 	spin_unlock_bh(&sk->sk_lock.slock);
2802 }
2803 EXPORT_SYMBOL(release_sock);
2804 
2805 /**
2806  * lock_sock_fast - fast version of lock_sock
2807  * @sk: socket
2808  *
2809  * This version should be used for very small sections where the process
2810  * won't block. Returns false if the fast path is taken:
2811  *
2812  *   sk_lock.slock locked, owned = 0, BH disabled
2813  *
2814  * Returns true if the slow path is taken:
2815  *
2816  *   sk_lock.slock unlocked, owned = 1, BH enabled
2817  */
2818 bool lock_sock_fast(struct sock *sk)
2819 {
2820 	might_sleep();
2821 	spin_lock_bh(&sk->sk_lock.slock);
2822 
2823 	if (!sk->sk_lock.owned)
2824 		/*
2825 		 * Note: we keep BH disabled here; unlock_sock_fast() re-enables it.
2826 		 */
2827 		return false;
2828 
2829 	__lock_sock(sk);
2830 	sk->sk_lock.owned = 1;
2831 	spin_unlock(&sk->sk_lock.slock);
2832 	/*
2833 	 * The sk_lock has mutex_lock() semantics here:
2834 	 */
2835 	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2836 	local_bh_enable();
2837 	return true;
2838 }
2839 EXPORT_SYMBOL(lock_sock_fast);
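
/*
 * Example (illustrative sketch only, guarded out): the intended
 * lock_sock_fast()/unlock_sock_fast() pattern for a short critical
 * section.  unlock_sock_fast() does spin_unlock_bh() on the fast path and
 * release_sock() on the slow one.
 */
#if 0
static void example_peek_state(struct sock *sk, int *state)
{
	bool slow = lock_sock_fast(sk);

	*state = sk->sk_state;

	unlock_sock_fast(sk, slow);
}
#endif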
2840 
2841 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2842 {
2843 	struct timeval tv;
2844 	if (!sock_flag(sk, SOCK_TIMESTAMP))
2845 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2846 	tv = ktime_to_timeval(sk->sk_stamp);
2847 	if (tv.tv_sec == -1)
2848 		return -ENOENT;
2849 	if (tv.tv_sec == 0) {
2850 		sk->sk_stamp = ktime_get_real();
2851 		tv = ktime_to_timeval(sk->sk_stamp);
2852 	}
2853 	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2854 }
2855 EXPORT_SYMBOL(sock_get_timestamp);
2856 
2857 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2858 {
2859 	struct timespec ts;
2860 	if (!sock_flag(sk, SOCK_TIMESTAMP))
2861 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2862 	ts = ktime_to_timespec(sk->sk_stamp);
2863 	if (ts.tv_sec == -1)
2864 		return -ENOENT;
2865 	if (ts.tv_sec == 0) {
2866 		sk->sk_stamp = ktime_get_real();
2867 		ts = ktime_to_timespec(sk->sk_stamp);
2868 	}
2869 	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2870 }
2871 EXPORT_SYMBOL(sock_get_timestampns);
2872 
2873 void sock_enable_timestamp(struct sock *sk, int flag)
2874 {
2875 	if (!sock_flag(sk, flag)) {
2876 		unsigned long previous_flags = sk->sk_flags;
2877 
2878 		sock_set_flag(sk, flag);
2879 		/*
2880 		 * we just set one of the two flags which require net
2881 		 * time stamping, but time stamping might have been on
2882 		 * already because of the other one
2883 		 */
2884 		if (sock_needs_netstamp(sk) &&
2885 		    !(previous_flags & SK_FLAGS_TIMESTAMP))
2886 			net_enable_timestamp();
2887 	}
2888 }
2889 
2890 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2891 		       int level, int type)
2892 {
2893 	struct sock_exterr_skb *serr;
2894 	struct sk_buff *skb;
2895 	int copied, err;
2896 
2897 	err = -EAGAIN;
2898 	skb = sock_dequeue_err_skb(sk);
2899 	if (skb == NULL)
2900 		goto out;
2901 
2902 	copied = skb->len;
2903 	if (copied > len) {
2904 		msg->msg_flags |= MSG_TRUNC;
2905 		copied = len;
2906 	}
2907 	err = skb_copy_datagram_msg(skb, 0, msg, copied);
2908 	if (err)
2909 		goto out_free_skb;
2910 
2911 	sock_recv_timestamp(msg, sk, skb);
2912 
2913 	serr = SKB_EXT_ERR(skb);
2914 	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
2915 
2916 	msg->msg_flags |= MSG_ERRQUEUE;
2917 	err = copied;
2918 
2919 out_free_skb:
2920 	kfree_skb(skb);
2921 out:
2922 	return err;
2923 }
2924 EXPORT_SYMBOL(sock_recv_errqueue);
2925 
2926 /*
2927  *	Get a socket option on a socket.
2928  *
2929  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
2930  *	asynchronous errors should be reported by getsockopt. We assume
2931  *	this means if you specify SO_ERROR (otherwise what's the point of it).
2932  */
2933 int sock_common_getsockopt(struct socket *sock, int level, int optname,
2934 			   char __user *optval, int __user *optlen)
2935 {
2936 	struct sock *sk = sock->sk;
2937 
2938 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2939 }
2940 EXPORT_SYMBOL(sock_common_getsockopt);
2941 
2942 #ifdef CONFIG_COMPAT
2943 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2944 				  char __user *optval, int __user *optlen)
2945 {
2946 	struct sock *sk = sock->sk;
2947 
2948 	if (sk->sk_prot->compat_getsockopt != NULL)
2949 		return sk->sk_prot->compat_getsockopt(sk, level, optname,
2950 						      optval, optlen);
2951 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2952 }
2953 EXPORT_SYMBOL(compat_sock_common_getsockopt);
2954 #endif
2955 
2956 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2957 			int flags)
2958 {
2959 	struct sock *sk = sock->sk;
2960 	int addr_len = 0;
2961 	int err;
2962 
2963 	err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
2964 				   flags & ~MSG_DONTWAIT, &addr_len);
2965 	if (err >= 0)
2966 		msg->msg_namelen = addr_len;
2967 	return err;
2968 }
2969 EXPORT_SYMBOL(sock_common_recvmsg);
2970 
2971 /*
2972  *	Set socket options on an inet socket.
2973  */
2974 int sock_common_setsockopt(struct socket *sock, int level, int optname,
2975 			   char __user *optval, unsigned int optlen)
2976 {
2977 	struct sock *sk = sock->sk;
2978 
2979 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2980 }
2981 EXPORT_SYMBOL(sock_common_setsockopt);
2982 
2983 #ifdef CONFIG_COMPAT
2984 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2985 				  char __user *optval, unsigned int optlen)
2986 {
2987 	struct sock *sk = sock->sk;
2988 
2989 	if (sk->sk_prot->compat_setsockopt != NULL)
2990 		return sk->sk_prot->compat_setsockopt(sk, level, optname,
2991 						      optval, optlen);
2992 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2993 }
2994 EXPORT_SYMBOL(compat_sock_common_setsockopt);
2995 #endif
2996 
2997 void sk_common_release(struct sock *sk)
2998 {
2999 	if (sk->sk_prot->destroy)
3000 		sk->sk_prot->destroy(sk);
3001 
3002 	/*
3003 	 * Observation: when sk_common_release() is called, processes have
3004 	 * no access to the socket, but the network stack still does.
3005 	 * Step one, detach it from networking:
3006 	 *
3007 	 * A. Remove from hash tables.
3008 	 */
3009 
3010 	sk->sk_prot->unhash(sk);
3011 
3012 	/*
3013 	 * At this point the socket cannot receive new packets, but some may
3014 	 * still be in flight because another CPU runs the receiver and did a
3015 	 * hash table lookup before we unhashed the socket. They will reach
3016 	 * the receive queue and be purged by the socket destructor.
3017 	 *
3018 	 * We may also still have packets pending on the receive queue and,
3019 	 * probably, our own packets waiting in device queues. The destructor
3020 	 * will drain the receive queue, but transmitted packets will delay
3021 	 * socket destruction until the last reference is released.
3022 	 */
3023 
3024 	sock_orphan(sk);
3025 
3026 	xfrm_sk_free_policy(sk);
3027 
3028 	sk_refcnt_debug_release(sk);
3029 
3030 	sock_put(sk);
3031 }
3032 EXPORT_SYMBOL(sk_common_release);
3033 
3034 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3035 {
3036 	memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3037 
3038 	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3039 	mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf;
3040 	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3041 	mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf;
3042 	mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3043 	mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
3044 	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3045 	mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len;
3046 	mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3047 }
3048 
3049 #ifdef CONFIG_PROC_FS
3050 #define PROTO_INUSE_NR	64	/* should be enough for the first time */
3051 struct prot_inuse {
3052 	int val[PROTO_INUSE_NR];
3053 };
3054 
3055 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3056 
3057 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
3058 {
3059 	__this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
3060 }
3061 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3062 
3063 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3064 {
3065 	int cpu, idx = prot->inuse_idx;
3066 	int res = 0;
3067 
3068 	for_each_possible_cpu(cpu)
3069 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3070 
3071 	return res >= 0 ? res : 0;
3072 }
3073 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
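
/*
 * Example (illustrative sketch only, guarded out): how a protocol's
 * hash()/unhash() handlers typically maintain the per-cpu inuse counter
 * that sock_prot_inuse_get() later sums, e.g. for /proc/net/protocols.
 */
#if 0
static void example_hash(struct sock *sk)
{
	/* ... insert sk into the protocol's lookup table ... */
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
}

static void example_unhash(struct sock *sk)
{
	/* ... remove sk from the protocol's lookup table ... */
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
}
#endif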
3074 
3075 static void sock_inuse_add(struct net *net, int val)
3076 {
3077 	this_cpu_add(*net->core.sock_inuse, val);
3078 }
3079 
3080 int sock_inuse_get(struct net *net)
3081 {
3082 	int cpu, res = 0;
3083 
3084 	for_each_possible_cpu(cpu)
3085 		res += *per_cpu_ptr(net->core.sock_inuse, cpu);
3086 
3087 	return res;
3088 }
3089 
3090 EXPORT_SYMBOL_GPL(sock_inuse_get);
3091 
3092 static int __net_init sock_inuse_init_net(struct net *net)
3093 {
3094 	net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3095 	if (net->core.prot_inuse == NULL)
3096 		return -ENOMEM;
3097 
3098 	net->core.sock_inuse = alloc_percpu(int);
3099 	if (net->core.sock_inuse == NULL)
3100 		goto out;
3101 
3102 	return 0;
3103 
3104 out:
3105 	free_percpu(net->core.prot_inuse);
3106 	return -ENOMEM;
3107 }
3108 
3109 static void __net_exit sock_inuse_exit_net(struct net *net)
3110 {
3111 	free_percpu(net->core.prot_inuse);
3112 	free_percpu(net->core.sock_inuse);
3113 }
3114 
3115 static struct pernet_operations net_inuse_ops = {
3116 	.init = sock_inuse_init_net,
3117 	.exit = sock_inuse_exit_net,
3118 	.async = true,
3119 };
3120 
3121 static __init int net_inuse_init(void)
3122 {
3123 	if (register_pernet_subsys(&net_inuse_ops))
3124 		panic("Cannot initialize net inuse counters");
3125 
3126 	return 0;
3127 }
3128 
3129 core_initcall(net_inuse_init);
3130 
3131 static void assign_proto_idx(struct proto *prot)
3132 {
3133 	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3134 
3135 	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3136 		pr_err("PROTO_INUSE_NR exhausted\n");
3137 		return;
3138 	}
3139 
3140 	set_bit(prot->inuse_idx, proto_inuse_idx);
3141 }
3142 
3143 static void release_proto_idx(struct proto *prot)
3144 {
3145 	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3146 		clear_bit(prot->inuse_idx, proto_inuse_idx);
3147 }
3148 #else
3149 static inline void assign_proto_idx(struct proto *prot)
3150 {
3151 }
3152 
3153 static inline void release_proto_idx(struct proto *prot)
3154 {
3155 }
3156 
3157 static void sock_inuse_add(struct net *net, int val)
3158 {
3159 }
3160 #endif
3161 
3162 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3163 {
3164 	if (!rsk_prot)
3165 		return;
3166 	kfree(rsk_prot->slab_name);
3167 	rsk_prot->slab_name = NULL;
3168 	kmem_cache_destroy(rsk_prot->slab);
3169 	rsk_prot->slab = NULL;
3170 }
3171 
3172 static int req_prot_init(const struct proto *prot)
3173 {
3174 	struct request_sock_ops *rsk_prot = prot->rsk_prot;
3175 
3176 	if (!rsk_prot)
3177 		return 0;
3178 
3179 	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3180 					prot->name);
3181 	if (!rsk_prot->slab_name)
3182 		return -ENOMEM;
3183 
3184 	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3185 					   rsk_prot->obj_size, 0,
3186 					   prot->slab_flags, NULL);
3187 
3188 	if (!rsk_prot->slab) {
3189 		pr_crit("%s: Can't create request sock SLAB cache!\n",
3190 			prot->name);
3191 		return -ENOMEM;
3192 	}
3193 	return 0;
3194 }
3195 
3196 int proto_register(struct proto *prot, int alloc_slab)
3197 {
3198 	if (alloc_slab) {
3199 		prot->slab = kmem_cache_create_usercopy(prot->name,
3200 					prot->obj_size, 0,
3201 					SLAB_HWCACHE_ALIGN | prot->slab_flags,
3202 					prot->useroffset, prot->usersize,
3203 					NULL);
3204 
3205 		if (prot->slab == NULL) {
3206 			pr_crit("%s: Can't create sock SLAB cache!\n",
3207 				prot->name);
3208 			goto out;
3209 		}
3210 
3211 		if (req_prot_init(prot))
3212 			goto out_free_request_sock_slab;
3213 
3214 		if (prot->twsk_prot != NULL) {
3215 			prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
3216 
3217 			if (prot->twsk_prot->twsk_slab_name == NULL)
3218 				goto out_free_request_sock_slab;
3219 
3220 			prot->twsk_prot->twsk_slab =
3221 				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
3222 						  prot->twsk_prot->twsk_obj_size,
3223 						  0,
3224 						  prot->slab_flags,
3225 						  NULL);
3226 			if (prot->twsk_prot->twsk_slab == NULL)
3227 				goto out_free_timewait_sock_slab_name;
3228 		}
3229 	}
3230 
3231 	mutex_lock(&proto_list_mutex);
3232 	list_add(&prot->node, &proto_list);
3233 	assign_proto_idx(prot);
3234 	mutex_unlock(&proto_list_mutex);
3235 	return 0;
3236 
3237 out_free_timewait_sock_slab_name:
3238 	kfree(prot->twsk_prot->twsk_slab_name);
3239 out_free_request_sock_slab:
3240 	req_prot_cleanup(prot->rsk_prot);
3241 
3242 	kmem_cache_destroy(prot->slab);
3243 	prot->slab = NULL;
3244 out:
3245 	return -ENOBUFS;
3246 }
3247 EXPORT_SYMBOL(proto_register);
3248 
3249 void proto_unregister(struct proto *prot)
3250 {
3251 	mutex_lock(&proto_list_mutex);
3252 	release_proto_idx(prot);
3253 	list_del(&prot->node);
3254 	mutex_unlock(&proto_list_mutex);
3255 
3256 	kmem_cache_destroy(prot->slab);
3257 	prot->slab = NULL;
3258 
3259 	req_prot_cleanup(prot->rsk_prot);
3260 
3261 	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
3262 		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
3263 		kfree(prot->twsk_prot->twsk_slab_name);
3264 		prot->twsk_prot->twsk_slab = NULL;
3265 	}
3266 }
3267 EXPORT_SYMBOL(proto_unregister);
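
/*
 * Example (illustrative sketch only, guarded out): module init/exit for a
 * hypothetical protocol.  With alloc_slab != 0, proto_register() creates a
 * kmem cache sized by .obj_size and named after .name; proto_unregister()
 * tears it down again.
 */
#if 0
static struct proto example_proto = {
	.name		= "EXAMPLE",
	.owner		= THIS_MODULE,
	.obj_size	= sizeof(struct sock),	/* usually a larger subclass */
};

static int __init example_init(void)
{
	return proto_register(&example_proto, 1);
}

static void __exit example_exit(void)
{
	proto_unregister(&example_proto);
}
#endif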
3268 
3269 #ifdef CONFIG_PROC_FS
3270 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3271 	__acquires(proto_list_mutex)
3272 {
3273 	mutex_lock(&proto_list_mutex);
3274 	return seq_list_start_head(&proto_list, *pos);
3275 }
3276 
3277 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3278 {
3279 	return seq_list_next(v, &proto_list, pos);
3280 }
3281 
3282 static void proto_seq_stop(struct seq_file *seq, void *v)
3283 	__releases(proto_list_mutex)
3284 {
3285 	mutex_unlock(&proto_list_mutex);
3286 }
3287 
3288 static char proto_method_implemented(const void *method)
3289 {
3290 	return method == NULL ? 'n' : 'y';
3291 }
3292 static long sock_prot_memory_allocated(struct proto *proto)
3293 {
3294 	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3295 }
3296 
3297 static char *sock_prot_memory_pressure(struct proto *proto)
3298 {
3299 	return proto->memory_pressure != NULL ?
3300 	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3301 }
3302 
3303 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3304 {
3305 
3306 	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
3307 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3308 		   proto->name,
3309 		   proto->obj_size,
3310 		   sock_prot_inuse_get(seq_file_net(seq), proto),
3311 		   sock_prot_memory_allocated(proto),
3312 		   sock_prot_memory_pressure(proto),
3313 		   proto->max_header,
3314 		   proto->slab == NULL ? "no" : "yes",
3315 		   module_name(proto->owner),
3316 		   proto_method_implemented(proto->close),
3317 		   proto_method_implemented(proto->connect),
3318 		   proto_method_implemented(proto->disconnect),
3319 		   proto_method_implemented(proto->accept),
3320 		   proto_method_implemented(proto->ioctl),
3321 		   proto_method_implemented(proto->init),
3322 		   proto_method_implemented(proto->destroy),
3323 		   proto_method_implemented(proto->shutdown),
3324 		   proto_method_implemented(proto->setsockopt),
3325 		   proto_method_implemented(proto->getsockopt),
3326 		   proto_method_implemented(proto->sendmsg),
3327 		   proto_method_implemented(proto->recvmsg),
3328 		   proto_method_implemented(proto->sendpage),
3329 		   proto_method_implemented(proto->bind),
3330 		   proto_method_implemented(proto->backlog_rcv),
3331 		   proto_method_implemented(proto->hash),
3332 		   proto_method_implemented(proto->unhash),
3333 		   proto_method_implemented(proto->get_port),
3334 		   proto_method_implemented(proto->enter_memory_pressure));
3335 }
3336 
3337 static int proto_seq_show(struct seq_file *seq, void *v)
3338 {
3339 	if (v == &proto_list)
3340 		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3341 			   "protocol",
3342 			   "size",
3343 			   "sockets",
3344 			   "memory",
3345 			   "press",
3346 			   "maxhdr",
3347 			   "slab",
3348 			   "module",
3349 			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3350 	else
3351 		proto_seq_printf(seq, list_entry(v, struct proto, node));
3352 	return 0;
3353 }
3354 
3355 static const struct seq_operations proto_seq_ops = {
3356 	.start  = proto_seq_start,
3357 	.next   = proto_seq_next,
3358 	.stop   = proto_seq_stop,
3359 	.show   = proto_seq_show,
3360 };
3361 
3362 static int proto_seq_open(struct inode *inode, struct file *file)
3363 {
3364 	return seq_open_net(inode, file, &proto_seq_ops,
3365 			    sizeof(struct seq_net_private));
3366 }
3367 
3368 static const struct file_operations proto_seq_fops = {
3369 	.open		= proto_seq_open,
3370 	.read		= seq_read,
3371 	.llseek		= seq_lseek,
3372 	.release	= seq_release_net,
3373 };
3374 
3375 static __net_init int proto_init_net(struct net *net)
3376 {
3377 	if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
3378 		return -ENOMEM;
3379 
3380 	return 0;
3381 }
3382 
3383 static __net_exit void proto_exit_net(struct net *net)
3384 {
3385 	remove_proc_entry("protocols", net->proc_net);
3386 }
3387 
3388 
3389 static __net_initdata struct pernet_operations proto_net_ops = {
3390 	.init = proto_init_net,
3391 	.exit = proto_exit_net,
3392 	.async = true,
3393 };
3394 
3395 static int __init proto_init(void)
3396 {
3397 	return register_pernet_subsys(&proto_net_ops);
3398 }
3399 
3400 subsys_initcall(proto_init);
3401 
3402 #endif /* PROC_FS */
3403 
3404 #ifdef CONFIG_NET_RX_BUSY_POLL
3405 bool sk_busy_loop_end(void *p, unsigned long start_time)
3406 {
3407 	struct sock *sk = p;
3408 
3409 	return !skb_queue_empty(&sk->sk_receive_queue) ||
3410 	       sk_busy_loop_timeout(sk, start_time);
3411 }
3412 EXPORT_SYMBOL(sk_busy_loop_end);
3413 #endif /* CONFIG_NET_RX_BUSY_POLL */
3414