xref: /linux/net/core/sock.c (revision 9708fb630d19ee51ae3aeb3a533e3010da0e8570)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Generic socket support routines. Memory allocators, socket lock/release
7  *		handler for protocols to use and generic option handler.
8  *
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Florian La Roche, <flla@stud.uni-sb.de>
13  *		Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *		Alan Cox	: 	Numerous verify_area() problems
17  *		Alan Cox	:	Connecting on a connecting socket
18  *					now returns an error for tcp.
19  *		Alan Cox	:	sock->protocol is set correctly.
20  *					and is not sometimes left as 0.
21  *		Alan Cox	:	connect handles icmp errors on a
22  *					connect properly. Unfortunately there
23  *					is a restart syscall nasty there. I
24  *					can't match BSD without hacking the C
25  *					library. Ideas urgently sought!
26  *		Alan Cox	:	Disallow bind() to addresses that are
27  *					not ours - especially broadcast ones!!
28  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30  *					instead they leave that for the DESTROY timer.
31  *		Alan Cox	:	Clean up error flag in accept
32  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33  *					was buggy. Put a remove_sock() in the handler
34  *					for memory when we hit 0. Also altered the timer
35  *					code. The ACK stuff can wait and needs major
36  *					TCP layer surgery.
37  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38  *					and fixed timer/inet_bh race.
39  *		Alan Cox	:	Added zapped flag for TCP
40  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47  *	Pauline Middelink	:	identd support
48  *		Alan Cox	:	Fixed connect() taking signals I think.
49  *		Alan Cox	:	SO_LINGER supported
50  *		Alan Cox	:	Error reporting fixes
51  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52  *		Alan Cox	:	inet sockets don't set sk->type!
53  *		Alan Cox	:	Split socket option code
54  *		Alan Cox	:	Callbacks
55  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56  *		Alex		:	Removed restriction on inet fioctl
57  *		Alan Cox	:	Splitting INET from NET core
58  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60  *		Alan Cox	:	Split IP from generic code
61  *		Alan Cox	:	New kfree_skbmem()
62  *		Alan Cox	:	Make SO_DEBUG superuser only.
63  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64  *					(compatibility fix)
65  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66  *		Alan Cox	:	Allocator for a socket is settable.
67  *		Alan Cox	:	SO_ERROR includes soft errors.
68  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69  *		Alan Cox	: 	Generic socket allocation to make hooks
70  *					easier (suggested by Craig Metz).
71  *		Michael Pall	:	SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79  *		Andi Kleen	:	Fix write_space callback
80  *		Chris Evans	:	Security fixes - signedness again
81  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  *
85  *
86  *		This program is free software; you can redistribute it and/or
87  *		modify it under the terms of the GNU General Public License
88  *		as published by the Free Software Foundation; either version
89  *		2 of the License, or (at your option) any later version.
90  */
91 
92 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
93 
94 #include <linux/capability.h>
95 #include <linux/errno.h>
96 #include <linux/errqueue.h>
97 #include <linux/types.h>
98 #include <linux/socket.h>
99 #include <linux/in.h>
100 #include <linux/kernel.h>
101 #include <linux/module.h>
102 #include <linux/proc_fs.h>
103 #include <linux/seq_file.h>
104 #include <linux/sched.h>
105 #include <linux/sched/mm.h>
106 #include <linux/timer.h>
107 #include <linux/string.h>
108 #include <linux/sockios.h>
109 #include <linux/net.h>
110 #include <linux/mm.h>
111 #include <linux/slab.h>
112 #include <linux/interrupt.h>
113 #include <linux/poll.h>
114 #include <linux/tcp.h>
115 #include <linux/init.h>
116 #include <linux/highmem.h>
117 #include <linux/user_namespace.h>
118 #include <linux/static_key.h>
119 #include <linux/memcontrol.h>
120 #include <linux/prefetch.h>
121 
122 #include <linux/uaccess.h>
123 
124 #include <linux/netdevice.h>
125 #include <net/protocol.h>
126 #include <linux/skbuff.h>
127 #include <net/net_namespace.h>
128 #include <net/request_sock.h>
129 #include <net/sock.h>
130 #include <linux/net_tstamp.h>
131 #include <net/xfrm.h>
132 #include <linux/ipsec.h>
133 #include <net/cls_cgroup.h>
134 #include <net/netprio_cgroup.h>
135 #include <linux/sock_diag.h>
136 
137 #include <linux/filter.h>
138 #include <net/sock_reuseport.h>
139 
140 #include <trace/events/sock.h>
141 
142 #include <net/tcp.h>
143 #include <net/busy_poll.h>
144 
145 static DEFINE_MUTEX(proto_list_mutex);
146 static LIST_HEAD(proto_list);
147 
148 static void sock_inuse_add(struct net *net, int val);
149 
150 /**
151  * sk_ns_capable - General socket capability test
152  * @sk: Socket to use a capability on or through
153  * @user_ns: The user namespace of the capability to use
154  * @cap: The capability to use
155  *
156  * Test to see if the opener of the socket had the capability @cap
157  * when the socket was created and that the current process has it
158  * in the user namespace @user_ns.
159  */
160 bool sk_ns_capable(const struct sock *sk,
161 		   struct user_namespace *user_ns, int cap)
162 {
163 	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
164 		ns_capable(user_ns, cap);
165 }
166 EXPORT_SYMBOL(sk_ns_capable);
167 
168 /**
169  * sk_capable - Socket global capability test
170  * @sk: Socket to use a capability on or through
171  * @cap: The global capability to use
172  *
173  * Test to see if the opener of the socket had the capability @cap
174  * when the socket was created and that the current process has it
175  * in all user namespaces.
176  */
177 bool sk_capable(const struct sock *sk, int cap)
178 {
179 	return sk_ns_capable(sk, &init_user_ns, cap);
180 }
181 EXPORT_SYMBOL(sk_capable);
182 
183 /**
184  * sk_net_capable - Network namespace socket capability test
185  * @sk: Socket to use a capability on or through
186  * @cap: The capability to use
187  *
188  * Test to see if the opener of the socket had the capability @cap when the
189  * socket was created and that the current process has it over the network
190  * namespace the socket is a member of.
191  */
192 bool sk_net_capable(const struct sock *sk, int cap)
193 {
194 	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
195 }
196 EXPORT_SYMBOL(sk_net_capable);
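
/*
 * Illustrative sketch (not part of the original file): a caller that
 * guards a privileged, per-namespace operation would typically do
 *
 *	if (!sk_net_capable(sk, CAP_NET_ADMIN))
 *		return -EPERM;
 *
 * which, unlike a bare ns_capable() check, also requires that the
 * process which opened the socket had CAP_NET_ADMIN when the socket
 * was created.
 */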
197 
198 /*
199  * Each address family might have different locking rules, so we have
200  * one slock key per address family and separate keys for internal and
201  * userspace sockets.
202  */
203 static struct lock_class_key af_family_keys[AF_MAX];
204 static struct lock_class_key af_family_kern_keys[AF_MAX];
205 static struct lock_class_key af_family_slock_keys[AF_MAX];
206 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
207 
208 /*
209  * Make lock validator output more readable. (We pre-construct these
210  * strings at build time, so that runtime initialization of socket
211  * locks is fast):
212  */
213 
214 #define _sock_locks(x)						  \
215   x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
216   x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
217   x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
218   x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
219   x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
220   x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
221   x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
222   x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
223   x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
224   x "27"       ,	x "28"          ,	x "AF_CAN"      , \
225   x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "AF_IUCV"     , \
226   x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
227   x "AF_IEEE802154",	x "AF_CAIF"	,	x "AF_ALG"      , \
228   x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
229   x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_MAX"
230 
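/*
 * For example, _sock_locks("sk_lock-") expands to the initializer list
 * "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX", "sk_lock-AF_INET", ...,
 * "sk_lock-AF_MAX", i.e. one human-readable lockdep class name per
 * address family for each of the string tables below.
 */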
231 static const char *const af_family_key_strings[AF_MAX+1] = {
232 	_sock_locks("sk_lock-")
233 };
234 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
235 	_sock_locks("slock-")
236 };
237 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
238 	_sock_locks("clock-")
239 };
240 
241 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
242 	_sock_locks("k-sk_lock-")
243 };
244 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
245 	_sock_locks("k-slock-")
246 };
247 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
248 	_sock_locks("k-clock-")
249 };
250 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
251   "rlock-AF_UNSPEC", "rlock-AF_UNIX"     , "rlock-AF_INET"     ,
252   "rlock-AF_AX25"  , "rlock-AF_IPX"      , "rlock-AF_APPLETALK",
253   "rlock-AF_NETROM", "rlock-AF_BRIDGE"   , "rlock-AF_ATMPVC"   ,
254   "rlock-AF_X25"   , "rlock-AF_INET6"    , "rlock-AF_ROSE"     ,
255   "rlock-AF_DECnet", "rlock-AF_NETBEUI"  , "rlock-AF_SECURITY" ,
256   "rlock-AF_KEY"   , "rlock-AF_NETLINK"  , "rlock-AF_PACKET"   ,
257   "rlock-AF_ASH"   , "rlock-AF_ECONET"   , "rlock-AF_ATMSVC"   ,
258   "rlock-AF_RDS"   , "rlock-AF_SNA"      , "rlock-AF_IRDA"     ,
259   "rlock-AF_PPPOX" , "rlock-AF_WANPIPE"  , "rlock-AF_LLC"      ,
260   "rlock-27"       , "rlock-28"          , "rlock-AF_CAN"      ,
261   "rlock-AF_TIPC"  , "rlock-AF_BLUETOOTH", "rlock-AF_IUCV"     ,
262   "rlock-AF_RXRPC" , "rlock-AF_ISDN"     , "rlock-AF_PHONET"   ,
263   "rlock-AF_IEEE802154", "rlock-AF_CAIF" , "rlock-AF_ALG"      ,
264   "rlock-AF_NFC"   , "rlock-AF_VSOCK"    , "rlock-AF_KCM"      ,
265   "rlock-AF_QIPCRTR", "rlock-AF_SMC"     , "rlock-AF_MAX"
266 };
267 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
268   "wlock-AF_UNSPEC", "wlock-AF_UNIX"     , "wlock-AF_INET"     ,
269   "wlock-AF_AX25"  , "wlock-AF_IPX"      , "wlock-AF_APPLETALK",
270   "wlock-AF_NETROM", "wlock-AF_BRIDGE"   , "wlock-AF_ATMPVC"   ,
271   "wlock-AF_X25"   , "wlock-AF_INET6"    , "wlock-AF_ROSE"     ,
272   "wlock-AF_DECnet", "wlock-AF_NETBEUI"  , "wlock-AF_SECURITY" ,
273   "wlock-AF_KEY"   , "wlock-AF_NETLINK"  , "wlock-AF_PACKET"   ,
274   "wlock-AF_ASH"   , "wlock-AF_ECONET"   , "wlock-AF_ATMSVC"   ,
275   "wlock-AF_RDS"   , "wlock-AF_SNA"      , "wlock-AF_IRDA"     ,
276   "wlock-AF_PPPOX" , "wlock-AF_WANPIPE"  , "wlock-AF_LLC"      ,
277   "wlock-27"       , "wlock-28"          , "wlock-AF_CAN"      ,
278   "wlock-AF_TIPC"  , "wlock-AF_BLUETOOTH", "wlock-AF_IUCV"     ,
279   "wlock-AF_RXRPC" , "wlock-AF_ISDN"     , "wlock-AF_PHONET"   ,
280   "wlock-AF_IEEE802154", "wlock-AF_CAIF" , "wlock-AF_ALG"      ,
281   "wlock-AF_NFC"   , "wlock-AF_VSOCK"    , "wlock-AF_KCM"      ,
282   "wlock-AF_QIPCRTR", "wlock-AF_SMC"     , "wlock-AF_MAX"
283 };
284 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
285   "elock-AF_UNSPEC", "elock-AF_UNIX"     , "elock-AF_INET"     ,
286   "elock-AF_AX25"  , "elock-AF_IPX"      , "elock-AF_APPLETALK",
287   "elock-AF_NETROM", "elock-AF_BRIDGE"   , "elock-AF_ATMPVC"   ,
288   "elock-AF_X25"   , "elock-AF_INET6"    , "elock-AF_ROSE"     ,
289   "elock-AF_DECnet", "elock-AF_NETBEUI"  , "elock-AF_SECURITY" ,
290   "elock-AF_KEY"   , "elock-AF_NETLINK"  , "elock-AF_PACKET"   ,
291   "elock-AF_ASH"   , "elock-AF_ECONET"   , "elock-AF_ATMSVC"   ,
292   "elock-AF_RDS"   , "elock-AF_SNA"      , "elock-AF_IRDA"     ,
293   "elock-AF_PPPOX" , "elock-AF_WANPIPE"  , "elock-AF_LLC"      ,
294   "elock-27"       , "elock-28"          , "elock-AF_CAN"      ,
295   "elock-AF_TIPC"  , "elock-AF_BLUETOOTH", "elock-AF_IUCV"     ,
296   "elock-AF_RXRPC" , "elock-AF_ISDN"     , "elock-AF_PHONET"   ,
297   "elock-AF_IEEE802154", "elock-AF_CAIF" , "elock-AF_ALG"      ,
298   "elock-AF_NFC"   , "elock-AF_VSOCK"    , "elock-AF_KCM"      ,
299   "elock-AF_QIPCRTR", "elock-AF_SMC"     , "elock-AF_MAX"
300 };
301 
302 /*
303  * sk_callback_lock and sk queues locking rules are per-address-family,
304  * so split the lock classes by using a per-AF key:
305  */
306 static struct lock_class_key af_callback_keys[AF_MAX];
307 static struct lock_class_key af_rlock_keys[AF_MAX];
308 static struct lock_class_key af_wlock_keys[AF_MAX];
309 static struct lock_class_key af_elock_keys[AF_MAX];
310 static struct lock_class_key af_kern_callback_keys[AF_MAX];
311 
312 /* Run time adjustable parameters. */
313 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
314 EXPORT_SYMBOL(sysctl_wmem_max);
315 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
316 EXPORT_SYMBOL(sysctl_rmem_max);
317 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
318 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
319 
320 /* Maximal space eaten by iovec or ancillary data plus some space */
321 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
322 EXPORT_SYMBOL(sysctl_optmem_max);
323 
324 int sysctl_tstamp_allow_data __read_mostly = 1;
325 
326 struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
327 EXPORT_SYMBOL_GPL(memalloc_socks);
328 
329 /**
330  * sk_set_memalloc - sets %SOCK_MEMALLOC
331  * @sk: socket to set it on
332  *
333  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
334  * It's the responsibility of the admin to adjust min_free_kbytes
335  * to meet the requirements
336  */
337 void sk_set_memalloc(struct sock *sk)
338 {
339 	sock_set_flag(sk, SOCK_MEMALLOC);
340 	sk->sk_allocation |= __GFP_MEMALLOC;
341 	static_key_slow_inc(&memalloc_socks);
342 }
343 EXPORT_SYMBOL_GPL(sk_set_memalloc);
344 
345 void sk_clear_memalloc(struct sock *sk)
346 {
347 	sock_reset_flag(sk, SOCK_MEMALLOC);
348 	sk->sk_allocation &= ~__GFP_MEMALLOC;
349 	static_key_slow_dec(&memalloc_socks);
350 
351 	/*
352 	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
353 	 * progress of swapping. SOCK_MEMALLOC may be cleared while
354 	 * it has rmem allocations due to the last swapfile being deactivated
355 	 * but there is a risk that the socket is unusable due to exceeding
356 	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
357 	 */
358 	sk_mem_reclaim(sk);
359 }
360 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
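
/*
 * Expected pairing, sketched here for illustration (the callers live
 * outside this file, e.g. the sunrpc transport used for swap over NFS):
 *
 *	sk_set_memalloc(sock->sk);
 *
 * while the socket backs an active swapfile, and
 *
 *	sk_clear_memalloc(sock->sk);
 *
 * when the swapfile is deactivated, so that the emergency memory
 * reserves are only reachable through the socket while swapping
 * actually needs them.
 */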
361 
362 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
363 {
364 	int ret;
365 	unsigned int noreclaim_flag;
366 
367 	/* these should have been dropped before queueing */
368 	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
369 
370 	noreclaim_flag = memalloc_noreclaim_save();
371 	ret = sk->sk_backlog_rcv(sk, skb);
372 	memalloc_noreclaim_restore(noreclaim_flag);
373 
374 	return ret;
375 }
376 EXPORT_SYMBOL(__sk_backlog_rcv);
377 
378 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
379 {
380 	struct timeval tv;
381 
382 	if (optlen < sizeof(tv))
383 		return -EINVAL;
384 	if (copy_from_user(&tv, optval, sizeof(tv)))
385 		return -EFAULT;
386 	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
387 		return -EDOM;
388 
389 	if (tv.tv_sec < 0) {
390 		static int warned __read_mostly;
391 
392 		*timeo_p = 0;
393 		if (warned < 10 && net_ratelimit()) {
394 			warned++;
395 			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
396 				__func__, current->comm, task_pid_nr(current));
397 		}
398 		return 0;
399 	}
400 	*timeo_p = MAX_SCHEDULE_TIMEOUT;
401 	if (tv.tv_sec == 0 && tv.tv_usec == 0)
402 		return 0;
403 	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
404 		*timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC / HZ);
405 	return 0;
406 }
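
/*
 * Worked example for the conversion above (assuming HZ == 1000):
 * a timeout of { .tv_sec = 1, .tv_usec = 500000 } becomes
 * 1 * HZ + DIV_ROUND_UP(500000, USEC_PER_SEC / HZ) = 1500 jiffies,
 * while { 0, 0 } leaves *timeo_p at MAX_SCHEDULE_TIMEOUT (wait forever).
 */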
407 
408 static void sock_warn_obsolete_bsdism(const char *name)
409 {
410 	static int warned;
411 	static char warncomm[TASK_COMM_LEN];
412 	if (strcmp(warncomm, current->comm) && warned < 5) {
413 		strcpy(warncomm,  current->comm);
414 		pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
415 			warncomm, name);
416 		warned++;
417 	}
418 }
419 
420 static bool sock_needs_netstamp(const struct sock *sk)
421 {
422 	switch (sk->sk_family) {
423 	case AF_UNSPEC:
424 	case AF_UNIX:
425 		return false;
426 	default:
427 		return true;
428 	}
429 }
430 
431 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
432 {
433 	if (sk->sk_flags & flags) {
434 		sk->sk_flags &= ~flags;
435 		if (sock_needs_netstamp(sk) &&
436 		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
437 			net_disable_timestamp();
438 	}
439 }
440 
441 
442 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
443 {
444 	unsigned long flags;
445 	struct sk_buff_head *list = &sk->sk_receive_queue;
446 
447 	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
448 		atomic_inc(&sk->sk_drops);
449 		trace_sock_rcvqueue_full(sk, skb);
450 		return -ENOMEM;
451 	}
452 
453 	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
454 		atomic_inc(&sk->sk_drops);
455 		return -ENOBUFS;
456 	}
457 
458 	skb->dev = NULL;
459 	skb_set_owner_r(skb, sk);
460 
461 	/* we escape from the RCU-protected region, make sure we don't leak
462 	 * a non-refcounted dst
463 	 */
464 	skb_dst_force(skb);
465 
466 	spin_lock_irqsave(&list->lock, flags);
467 	sock_skb_set_dropcount(sk, skb);
468 	__skb_queue_tail(list, skb);
469 	spin_unlock_irqrestore(&list->lock, flags);
470 
471 	if (!sock_flag(sk, SOCK_DEAD))
472 		sk->sk_data_ready(sk);
473 	return 0;
474 }
475 EXPORT_SYMBOL(__sock_queue_rcv_skb);
476 
477 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
478 {
479 	int err;
480 
481 	err = sk_filter(sk, skb);
482 	if (err)
483 		return err;
484 
485 	return __sock_queue_rcv_skb(sk, skb);
486 }
487 EXPORT_SYMBOL(sock_queue_rcv_skb);
488 
489 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
490 		     const int nested, unsigned int trim_cap, bool refcounted)
491 {
492 	int rc = NET_RX_SUCCESS;
493 
494 	if (sk_filter_trim_cap(sk, skb, trim_cap))
495 		goto discard_and_relse;
496 
497 	skb->dev = NULL;
498 
499 	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
500 		atomic_inc(&sk->sk_drops);
501 		goto discard_and_relse;
502 	}
503 	if (nested)
504 		bh_lock_sock_nested(sk);
505 	else
506 		bh_lock_sock(sk);
507 	if (!sock_owned_by_user(sk)) {
508 		/*
509 		 * trylock + unlock semantics:
510 		 */
511 		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
512 
513 		rc = sk_backlog_rcv(sk, skb);
514 
515 		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
516 	} else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
517 		bh_unlock_sock(sk);
518 		atomic_inc(&sk->sk_drops);
519 		goto discard_and_relse;
520 	}
521 
522 	bh_unlock_sock(sk);
523 out:
524 	if (refcounted)
525 		sock_put(sk);
526 	return rc;
527 discard_and_relse:
528 	kfree_skb(skb);
529 	goto out;
530 }
531 EXPORT_SYMBOL(__sk_receive_skb);
532 
533 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
534 {
535 	struct dst_entry *dst = __sk_dst_get(sk);
536 
537 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
538 		sk_tx_queue_clear(sk);
539 		sk->sk_dst_pending_confirm = 0;
540 		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
541 		dst_release(dst);
542 		return NULL;
543 	}
544 
545 	return dst;
546 }
547 EXPORT_SYMBOL(__sk_dst_check);
548 
549 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
550 {
551 	struct dst_entry *dst = sk_dst_get(sk);
552 
553 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
554 		sk_dst_reset(sk);
555 		dst_release(dst);
556 		return NULL;
557 	}
558 
559 	return dst;
560 }
561 EXPORT_SYMBOL(sk_dst_check);
562 
563 static int sock_setbindtodevice(struct sock *sk, char __user *optval,
564 				int optlen)
565 {
566 	int ret = -ENOPROTOOPT;
567 #ifdef CONFIG_NETDEVICES
568 	struct net *net = sock_net(sk);
569 	char devname[IFNAMSIZ];
570 	int index;
571 
572 	/* Sorry... */
573 	ret = -EPERM;
574 	if (!ns_capable(net->user_ns, CAP_NET_RAW))
575 		goto out;
576 
577 	ret = -EINVAL;
578 	if (optlen < 0)
579 		goto out;
580 
581 	/* Bind this socket to a particular device like "eth0",
582 	 * as specified in the passed interface name. If the
583 	 * name is "" or the option length is zero the socket
584 	 * is not bound.
585 	 */
586 	if (optlen > IFNAMSIZ - 1)
587 		optlen = IFNAMSIZ - 1;
588 	memset(devname, 0, sizeof(devname));
589 
590 	ret = -EFAULT;
591 	if (copy_from_user(devname, optval, optlen))
592 		goto out;
593 
594 	index = 0;
595 	if (devname[0] != '\0') {
596 		struct net_device *dev;
597 
598 		rcu_read_lock();
599 		dev = dev_get_by_name_rcu(net, devname);
600 		if (dev)
601 			index = dev->ifindex;
602 		rcu_read_unlock();
603 		ret = -ENODEV;
604 		if (!dev)
605 			goto out;
606 	}
607 
608 	lock_sock(sk);
609 	sk->sk_bound_dev_if = index;
610 	sk_dst_reset(sk);
611 	release_sock(sk);
612 
613 	ret = 0;
614 
615 out:
616 #endif
617 
618 	return ret;
619 }
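
/*
 * Userspace view, for illustration only: binding a socket to "eth0" is
 *
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "eth0", strlen("eth0") + 1);
 *
 * and passing an empty string (or a zero option length) removes the
 * binding again, as noted in the comment above.
 */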
620 
621 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
622 				int __user *optlen, int len)
623 {
624 	int ret = -ENOPROTOOPT;
625 #ifdef CONFIG_NETDEVICES
626 	struct net *net = sock_net(sk);
627 	char devname[IFNAMSIZ];
628 
629 	if (sk->sk_bound_dev_if == 0) {
630 		len = 0;
631 		goto zero;
632 	}
633 
634 	ret = -EINVAL;
635 	if (len < IFNAMSIZ)
636 		goto out;
637 
638 	ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
639 	if (ret)
640 		goto out;
641 
642 	len = strlen(devname) + 1;
643 
644 	ret = -EFAULT;
645 	if (copy_to_user(optval, devname, len))
646 		goto out;
647 
648 zero:
649 	ret = -EFAULT;
650 	if (put_user(len, optlen))
651 		goto out;
652 
653 	ret = 0;
654 
655 out:
656 #endif
657 
658 	return ret;
659 }
660 
661 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
662 {
663 	if (valbool)
664 		sock_set_flag(sk, bit);
665 	else
666 		sock_reset_flag(sk, bit);
667 }
668 
669 bool sk_mc_loop(struct sock *sk)
670 {
671 	if (dev_recursion_level())
672 		return false;
673 	if (!sk)
674 		return true;
675 	switch (sk->sk_family) {
676 	case AF_INET:
677 		return inet_sk(sk)->mc_loop;
678 #if IS_ENABLED(CONFIG_IPV6)
679 	case AF_INET6:
680 		return inet6_sk(sk)->mc_loop;
681 #endif
682 	}
683 	WARN_ON(1);
684 	return true;
685 }
686 EXPORT_SYMBOL(sk_mc_loop);
687 
688 /*
689  *	This is meant for all protocols to use and covers goings on
690  *	at the socket level. Everything here is generic.
691  */
692 
693 int sock_setsockopt(struct socket *sock, int level, int optname,
694 		    char __user *optval, unsigned int optlen)
695 {
696 	struct sock *sk = sock->sk;
697 	int val;
698 	int valbool;
699 	struct linger ling;
700 	int ret = 0;
701 
702 	/*
703 	 *	Options without arguments
704 	 */
705 
706 	if (optname == SO_BINDTODEVICE)
707 		return sock_setbindtodevice(sk, optval, optlen);
708 
709 	if (optlen < sizeof(int))
710 		return -EINVAL;
711 
712 	if (get_user(val, (int __user *)optval))
713 		return -EFAULT;
714 
715 	valbool = val ? 1 : 0;
716 
717 	lock_sock(sk);
718 
719 	switch (optname) {
720 	case SO_DEBUG:
721 		if (val && !capable(CAP_NET_ADMIN))
722 			ret = -EACCES;
723 		else
724 			sock_valbool_flag(sk, SOCK_DBG, valbool);
725 		break;
726 	case SO_REUSEADDR:
727 		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
728 		break;
729 	case SO_REUSEPORT:
730 		sk->sk_reuseport = valbool;
731 		break;
732 	case SO_TYPE:
733 	case SO_PROTOCOL:
734 	case SO_DOMAIN:
735 	case SO_ERROR:
736 		ret = -ENOPROTOOPT;
737 		break;
738 	case SO_DONTROUTE:
739 		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
740 		break;
741 	case SO_BROADCAST:
742 		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
743 		break;
744 	case SO_SNDBUF:
745 		/* Don't error on this - BSD doesn't, and if you think
746 		 * about it, this is right. Otherwise apps have to
747 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
748 		 * are treated in BSD as hints.
749 		 */
750 		val = min_t(u32, val, sysctl_wmem_max);
751 set_sndbuf:
752 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
753 		sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
754 		/* Wake up sending tasks if we upped the value. */
755 		sk->sk_write_space(sk);
756 		break;
757 
758 	case SO_SNDBUFFORCE:
759 		if (!capable(CAP_NET_ADMIN)) {
760 			ret = -EPERM;
761 			break;
762 		}
763 		goto set_sndbuf;
764 
765 	case SO_RCVBUF:
766 		/* Don't error on this - BSD doesn't, and if you think
767 		 * about it, this is right. Otherwise apps have to
768 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
769 		 * are treated in BSD as hints.
770 		 */
771 		val = min_t(u32, val, sysctl_rmem_max);
772 set_rcvbuf:
773 		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
774 		/*
775 		 * We double it on the way in to account for
776 		 * "struct sk_buff" etc. overhead.   Applications
777 		 * assume that the SO_RCVBUF setting they make will
778 		 * allow that much actual data to be received on that
779 		 * socket.
780 		 *
781 		 * Applications are unaware that "struct sk_buff" and
782 		 * other overheads allocate from the receive buffer
783 		 * during socket buffer allocation.
784 		 *
785 		 * And after considering the possible alternatives,
786 		 * returning the value we actually used in getsockopt
787 		 * is the most desirable behavior.
788 		 */
789 		sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
790 		break;
791 
792 	case SO_RCVBUFFORCE:
793 		if (!capable(CAP_NET_ADMIN)) {
794 			ret = -EPERM;
795 			break;
796 		}
797 		goto set_rcvbuf;
798 
799 	case SO_KEEPALIVE:
800 		if (sk->sk_prot->keepalive)
801 			sk->sk_prot->keepalive(sk, valbool);
802 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
803 		break;
804 
805 	case SO_OOBINLINE:
806 		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
807 		break;
808 
809 	case SO_NO_CHECK:
810 		sk->sk_no_check_tx = valbool;
811 		break;
812 
813 	case SO_PRIORITY:
814 		if ((val >= 0 && val <= 6) ||
815 		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
816 			sk->sk_priority = val;
817 		else
818 			ret = -EPERM;
819 		break;
820 
821 	case SO_LINGER:
822 		if (optlen < sizeof(ling)) {
823 			ret = -EINVAL;	/* 1003.1g */
824 			break;
825 		}
826 		if (copy_from_user(&ling, optval, sizeof(ling))) {
827 			ret = -EFAULT;
828 			break;
829 		}
830 		if (!ling.l_onoff)
831 			sock_reset_flag(sk, SOCK_LINGER);
832 		else {
833 #if (BITS_PER_LONG == 32)
834 			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
835 				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
836 			else
837 #endif
838 				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
839 			sock_set_flag(sk, SOCK_LINGER);
840 		}
841 		break;
842 
843 	case SO_BSDCOMPAT:
844 		sock_warn_obsolete_bsdism("setsockopt");
845 		break;
846 
847 	case SO_PASSCRED:
848 		if (valbool)
849 			set_bit(SOCK_PASSCRED, &sock->flags);
850 		else
851 			clear_bit(SOCK_PASSCRED, &sock->flags);
852 		break;
853 
854 	case SO_TIMESTAMP:
855 	case SO_TIMESTAMPNS:
856 		if (valbool)  {
857 			if (optname == SO_TIMESTAMP)
858 				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
859 			else
860 				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
861 			sock_set_flag(sk, SOCK_RCVTSTAMP);
862 			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
863 		} else {
864 			sock_reset_flag(sk, SOCK_RCVTSTAMP);
865 			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
866 		}
867 		break;
868 
869 	case SO_TIMESTAMPING:
870 		if (val & ~SOF_TIMESTAMPING_MASK) {
871 			ret = -EINVAL;
872 			break;
873 		}
874 
875 		if (val & SOF_TIMESTAMPING_OPT_ID &&
876 		    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
877 			if (sk->sk_protocol == IPPROTO_TCP &&
878 			    sk->sk_type == SOCK_STREAM) {
879 				if ((1 << sk->sk_state) &
880 				    (TCPF_CLOSE | TCPF_LISTEN)) {
881 					ret = -EINVAL;
882 					break;
883 				}
884 				sk->sk_tskey = tcp_sk(sk)->snd_una;
885 			} else {
886 				sk->sk_tskey = 0;
887 			}
888 		}
889 
890 		if (val & SOF_TIMESTAMPING_OPT_STATS &&
891 		    !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
892 			ret = -EINVAL;
893 			break;
894 		}
895 
896 		sk->sk_tsflags = val;
897 		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
898 			sock_enable_timestamp(sk,
899 					      SOCK_TIMESTAMPING_RX_SOFTWARE);
900 		else
901 			sock_disable_timestamp(sk,
902 					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
903 		break;
904 
905 	case SO_RCVLOWAT:
906 		if (val < 0)
907 			val = INT_MAX;
908 		sk->sk_rcvlowat = val ? : 1;
909 		break;
910 
911 	case SO_RCVTIMEO:
912 		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
913 		break;
914 
915 	case SO_SNDTIMEO:
916 		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
917 		break;
918 
919 	case SO_ATTACH_FILTER:
920 		ret = -EINVAL;
921 		if (optlen == sizeof(struct sock_fprog)) {
922 			struct sock_fprog fprog;
923 
924 			ret = -EFAULT;
925 			if (copy_from_user(&fprog, optval, sizeof(fprog)))
926 				break;
927 
928 			ret = sk_attach_filter(&fprog, sk);
929 		}
930 		break;
931 
932 	case SO_ATTACH_BPF:
933 		ret = -EINVAL;
934 		if (optlen == sizeof(u32)) {
935 			u32 ufd;
936 
937 			ret = -EFAULT;
938 			if (copy_from_user(&ufd, optval, sizeof(ufd)))
939 				break;
940 
941 			ret = sk_attach_bpf(ufd, sk);
942 		}
943 		break;
944 
945 	case SO_ATTACH_REUSEPORT_CBPF:
946 		ret = -EINVAL;
947 		if (optlen == sizeof(struct sock_fprog)) {
948 			struct sock_fprog fprog;
949 
950 			ret = -EFAULT;
951 			if (copy_from_user(&fprog, optval, sizeof(fprog)))
952 				break;
953 
954 			ret = sk_reuseport_attach_filter(&fprog, sk);
955 		}
956 		break;
957 
958 	case SO_ATTACH_REUSEPORT_EBPF:
959 		ret = -EINVAL;
960 		if (optlen == sizeof(u32)) {
961 			u32 ufd;
962 
963 			ret = -EFAULT;
964 			if (copy_from_user(&ufd, optval, sizeof(ufd)))
965 				break;
966 
967 			ret = sk_reuseport_attach_bpf(ufd, sk);
968 		}
969 		break;
970 
971 	case SO_DETACH_FILTER:
972 		ret = sk_detach_filter(sk);
973 		break;
974 
975 	case SO_LOCK_FILTER:
976 		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
977 			ret = -EPERM;
978 		else
979 			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
980 		break;
981 
982 	case SO_PASSSEC:
983 		if (valbool)
984 			set_bit(SOCK_PASSSEC, &sock->flags);
985 		else
986 			clear_bit(SOCK_PASSSEC, &sock->flags);
987 		break;
988 	case SO_MARK:
989 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
990 			ret = -EPERM;
991 		else
992 			sk->sk_mark = val;
993 		break;
994 
995 	case SO_RXQ_OVFL:
996 		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
997 		break;
998 
999 	case SO_WIFI_STATUS:
1000 		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1001 		break;
1002 
1003 	case SO_PEEK_OFF:
1004 		if (sock->ops->set_peek_off)
1005 			ret = sock->ops->set_peek_off(sk, val);
1006 		else
1007 			ret = -EOPNOTSUPP;
1008 		break;
1009 
1010 	case SO_NOFCS:
1011 		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1012 		break;
1013 
1014 	case SO_SELECT_ERR_QUEUE:
1015 		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1016 		break;
1017 
1018 #ifdef CONFIG_NET_RX_BUSY_POLL
1019 	case SO_BUSY_POLL:
1020 		/* allow unprivileged users to decrease the value */
1021 		if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
1022 			ret = -EPERM;
1023 		else {
1024 			if (val < 0)
1025 				ret = -EINVAL;
1026 			else
1027 				sk->sk_ll_usec = val;
1028 		}
1029 		break;
1030 #endif
1031 
1032 	case SO_MAX_PACING_RATE:
1033 		if (val != ~0U)
1034 			cmpxchg(&sk->sk_pacing_status,
1035 				SK_PACING_NONE,
1036 				SK_PACING_NEEDED);
1037 		sk->sk_max_pacing_rate = val;
1038 		sk->sk_pacing_rate = min(sk->sk_pacing_rate,
1039 					 sk->sk_max_pacing_rate);
1040 		break;
1041 
1042 	case SO_INCOMING_CPU:
1043 		sk->sk_incoming_cpu = val;
1044 		break;
1045 
1046 	case SO_CNX_ADVICE:
1047 		if (val == 1)
1048 			dst_negative_advice(sk);
1049 		break;
1050 
1051 	case SO_ZEROCOPY:
1052 		if (sk->sk_family != PF_INET && sk->sk_family != PF_INET6)
1053 			ret = -ENOTSUPP;
1054 		else if (sk->sk_protocol != IPPROTO_TCP)
1055 			ret = -ENOTSUPP;
1056 		else if (sk->sk_state != TCP_CLOSE)
1057 			ret = -EBUSY;
1058 		else if (val < 0 || val > 1)
1059 			ret = -EINVAL;
1060 		else
1061 			sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1062 		break;
1063 
1064 	default:
1065 		ret = -ENOPROTOOPT;
1066 		break;
1067 	}
1068 	release_sock(sk);
1069 	return ret;
1070 }
1071 EXPORT_SYMBOL(sock_setsockopt);
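
/*
 * Hedged userspace example of the SNDBUF/RCVBUF doubling done above
 * (assuming the requested value does not exceed net.core.rmem_max):
 *
 *	int val = 65536;
 *	socklen_t len = sizeof(val);
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, &len);
 *
 * val now reads back as 131072: the kernel stored 2 * 65536 to cover
 * struct sk_buff and other bookkeeping overhead.
 */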
1072 
1073 
1074 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1075 			  struct ucred *ucred)
1076 {
1077 	ucred->pid = pid_vnr(pid);
1078 	ucred->uid = ucred->gid = -1;
1079 	if (cred) {
1080 		struct user_namespace *current_ns = current_user_ns();
1081 
1082 		ucred->uid = from_kuid_munged(current_ns, cred->euid);
1083 		ucred->gid = from_kgid_munged(current_ns, cred->egid);
1084 	}
1085 }
1086 
1087 static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1088 {
1089 	struct user_namespace *user_ns = current_user_ns();
1090 	int i;
1091 
1092 	for (i = 0; i < src->ngroups; i++)
1093 		if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1094 			return -EFAULT;
1095 
1096 	return 0;
1097 }
1098 
1099 int sock_getsockopt(struct socket *sock, int level, int optname,
1100 		    char __user *optval, int __user *optlen)
1101 {
1102 	struct sock *sk = sock->sk;
1103 
1104 	union {
1105 		int val;
1106 		u64 val64;
1107 		struct linger ling;
1108 		struct timeval tm;
1109 	} v;
1110 
1111 	int lv = sizeof(int);
1112 	int len;
1113 
1114 	if (get_user(len, optlen))
1115 		return -EFAULT;
1116 	if (len < 0)
1117 		return -EINVAL;
1118 
1119 	memset(&v, 0, sizeof(v));
1120 
1121 	switch (optname) {
1122 	case SO_DEBUG:
1123 		v.val = sock_flag(sk, SOCK_DBG);
1124 		break;
1125 
1126 	case SO_DONTROUTE:
1127 		v.val = sock_flag(sk, SOCK_LOCALROUTE);
1128 		break;
1129 
1130 	case SO_BROADCAST:
1131 		v.val = sock_flag(sk, SOCK_BROADCAST);
1132 		break;
1133 
1134 	case SO_SNDBUF:
1135 		v.val = sk->sk_sndbuf;
1136 		break;
1137 
1138 	case SO_RCVBUF:
1139 		v.val = sk->sk_rcvbuf;
1140 		break;
1141 
1142 	case SO_REUSEADDR:
1143 		v.val = sk->sk_reuse;
1144 		break;
1145 
1146 	case SO_REUSEPORT:
1147 		v.val = sk->sk_reuseport;
1148 		break;
1149 
1150 	case SO_KEEPALIVE:
1151 		v.val = sock_flag(sk, SOCK_KEEPOPEN);
1152 		break;
1153 
1154 	case SO_TYPE:
1155 		v.val = sk->sk_type;
1156 		break;
1157 
1158 	case SO_PROTOCOL:
1159 		v.val = sk->sk_protocol;
1160 		break;
1161 
1162 	case SO_DOMAIN:
1163 		v.val = sk->sk_family;
1164 		break;
1165 
1166 	case SO_ERROR:
1167 		v.val = -sock_error(sk);
1168 		if (v.val == 0)
1169 			v.val = xchg(&sk->sk_err_soft, 0);
1170 		break;
1171 
1172 	case SO_OOBINLINE:
1173 		v.val = sock_flag(sk, SOCK_URGINLINE);
1174 		break;
1175 
1176 	case SO_NO_CHECK:
1177 		v.val = sk->sk_no_check_tx;
1178 		break;
1179 
1180 	case SO_PRIORITY:
1181 		v.val = sk->sk_priority;
1182 		break;
1183 
1184 	case SO_LINGER:
1185 		lv		= sizeof(v.ling);
1186 		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
1187 		v.ling.l_linger	= sk->sk_lingertime / HZ;
1188 		break;
1189 
1190 	case SO_BSDCOMPAT:
1191 		sock_warn_obsolete_bsdism("getsockopt");
1192 		break;
1193 
1194 	case SO_TIMESTAMP:
1195 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1196 				!sock_flag(sk, SOCK_RCVTSTAMPNS);
1197 		break;
1198 
1199 	case SO_TIMESTAMPNS:
1200 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1201 		break;
1202 
1203 	case SO_TIMESTAMPING:
1204 		v.val = sk->sk_tsflags;
1205 		break;
1206 
1207 	case SO_RCVTIMEO:
1208 		lv = sizeof(struct timeval);
1209 		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1210 			v.tm.tv_sec = 0;
1211 			v.tm.tv_usec = 0;
1212 		} else {
1213 			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1214 			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * USEC_PER_SEC) / HZ;
1215 		}
1216 		break;
1217 
1218 	case SO_SNDTIMEO:
1219 		lv = sizeof(struct timeval);
1220 		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1221 			v.tm.tv_sec = 0;
1222 			v.tm.tv_usec = 0;
1223 		} else {
1224 			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1225 			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * USEC_PER_SEC) / HZ;
1226 		}
1227 		break;
1228 
1229 	case SO_RCVLOWAT:
1230 		v.val = sk->sk_rcvlowat;
1231 		break;
1232 
1233 	case SO_SNDLOWAT:
1234 		v.val = 1;
1235 		break;
1236 
1237 	case SO_PASSCRED:
1238 		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1239 		break;
1240 
1241 	case SO_PEERCRED:
1242 	{
1243 		struct ucred peercred;
1244 		if (len > sizeof(peercred))
1245 			len = sizeof(peercred);
1246 		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1247 		if (copy_to_user(optval, &peercred, len))
1248 			return -EFAULT;
1249 		goto lenout;
1250 	}
1251 
1252 	case SO_PEERGROUPS:
1253 	{
1254 		int ret, n;
1255 
1256 		if (!sk->sk_peer_cred)
1257 			return -ENODATA;
1258 
1259 		n = sk->sk_peer_cred->group_info->ngroups;
1260 		if (len < n * sizeof(gid_t)) {
1261 			len = n * sizeof(gid_t);
1262 			return put_user(len, optlen) ? -EFAULT : -ERANGE;
1263 		}
1264 		len = n * sizeof(gid_t);
1265 
1266 		ret = groups_to_user((gid_t __user *)optval,
1267 				     sk->sk_peer_cred->group_info);
1268 		if (ret)
1269 			return ret;
1270 		goto lenout;
1271 	}
1272 
1273 	case SO_PEERNAME:
1274 	{
1275 		char address[128];
1276 
1277 		lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
1278 		if (lv < 0)
1279 			return -ENOTCONN;
1280 		if (lv < len)
1281 			return -EINVAL;
1282 		if (copy_to_user(optval, address, len))
1283 			return -EFAULT;
1284 		goto lenout;
1285 	}
1286 
1287 	/* Dubious BSD thing... Probably nobody even uses it, but
1288 	 * the UNIX standard wants it for whatever reason... -DaveM
1289 	 */
1290 	case SO_ACCEPTCONN:
1291 		v.val = sk->sk_state == TCP_LISTEN;
1292 		break;
1293 
1294 	case SO_PASSSEC:
1295 		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1296 		break;
1297 
1298 	case SO_PEERSEC:
1299 		return security_socket_getpeersec_stream(sock, optval, optlen, len);
1300 
1301 	case SO_MARK:
1302 		v.val = sk->sk_mark;
1303 		break;
1304 
1305 	case SO_RXQ_OVFL:
1306 		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1307 		break;
1308 
1309 	case SO_WIFI_STATUS:
1310 		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1311 		break;
1312 
1313 	case SO_PEEK_OFF:
1314 		if (!sock->ops->set_peek_off)
1315 			return -EOPNOTSUPP;
1316 
1317 		v.val = sk->sk_peek_off;
1318 		break;
1319 	case SO_NOFCS:
1320 		v.val = sock_flag(sk, SOCK_NOFCS);
1321 		break;
1322 
1323 	case SO_BINDTODEVICE:
1324 		return sock_getbindtodevice(sk, optval, optlen, len);
1325 
1326 	case SO_GET_FILTER:
1327 		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1328 		if (len < 0)
1329 			return len;
1330 
1331 		goto lenout;
1332 
1333 	case SO_LOCK_FILTER:
1334 		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1335 		break;
1336 
1337 	case SO_BPF_EXTENSIONS:
1338 		v.val = bpf_tell_extensions();
1339 		break;
1340 
1341 	case SO_SELECT_ERR_QUEUE:
1342 		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1343 		break;
1344 
1345 #ifdef CONFIG_NET_RX_BUSY_POLL
1346 	case SO_BUSY_POLL:
1347 		v.val = sk->sk_ll_usec;
1348 		break;
1349 #endif
1350 
1351 	case SO_MAX_PACING_RATE:
1352 		v.val = sk->sk_max_pacing_rate;
1353 		break;
1354 
1355 	case SO_INCOMING_CPU:
1356 		v.val = sk->sk_incoming_cpu;
1357 		break;
1358 
1359 	case SO_MEMINFO:
1360 	{
1361 		u32 meminfo[SK_MEMINFO_VARS];
1362 
1363 		if (get_user(len, optlen))
1364 			return -EFAULT;
1365 
1366 		sk_get_meminfo(sk, meminfo);
1367 
1368 		len = min_t(unsigned int, len, sizeof(meminfo));
1369 		if (copy_to_user(optval, &meminfo, len))
1370 			return -EFAULT;
1371 
1372 		goto lenout;
1373 	}
1374 
1375 #ifdef CONFIG_NET_RX_BUSY_POLL
1376 	case SO_INCOMING_NAPI_ID:
1377 		v.val = READ_ONCE(sk->sk_napi_id);
1378 
1379 		/* aggregate non-NAPI IDs down to 0 */
1380 		if (v.val < MIN_NAPI_ID)
1381 			v.val = 0;
1382 
1383 		break;
1384 #endif
1385 
1386 	case SO_COOKIE:
1387 		lv = sizeof(u64);
1388 		if (len < lv)
1389 			return -EINVAL;
1390 		v.val64 = sock_gen_cookie(sk);
1391 		break;
1392 
1393 	case SO_ZEROCOPY:
1394 		v.val = sock_flag(sk, SOCK_ZEROCOPY);
1395 		break;
1396 
1397 	default:
1398 		/* We implement the SO_SNDLOWAT etc to not be settable
1399 		 * (1003.1g 7).
1400 		 */
1401 		return -ENOPROTOOPT;
1402 	}
1403 
1404 	if (len > lv)
1405 		len = lv;
1406 	if (copy_to_user(optval, &v, len))
1407 		return -EFAULT;
1408 lenout:
1409 	if (put_user(len, optlen))
1410 		return -EFAULT;
1411 	return 0;
1412 }
1413 
1414 /*
1415  * Initialize an sk_lock.
1416  *
1417  * (We also register the sk_lock with the lock validator.)
1418  */
1419 static inline void sock_lock_init(struct sock *sk)
1420 {
1421 	if (sk->sk_kern_sock)
1422 		sock_lock_init_class_and_name(
1423 			sk,
1424 			af_family_kern_slock_key_strings[sk->sk_family],
1425 			af_family_kern_slock_keys + sk->sk_family,
1426 			af_family_kern_key_strings[sk->sk_family],
1427 			af_family_kern_keys + sk->sk_family);
1428 	else
1429 		sock_lock_init_class_and_name(
1430 			sk,
1431 			af_family_slock_key_strings[sk->sk_family],
1432 			af_family_slock_keys + sk->sk_family,
1433 			af_family_key_strings[sk->sk_family],
1434 			af_family_keys + sk->sk_family);
1435 }
1436 
1437 /*
1438  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1439  * even temporarily, because of RCU lookups. sk_node should also be left as is.
1440  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end.
1441  */
1442 static void sock_copy(struct sock *nsk, const struct sock *osk)
1443 {
1444 #ifdef CONFIG_SECURITY_NETWORK
1445 	void *sptr = nsk->sk_security;
1446 #endif
1447 	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1448 
1449 	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1450 	       osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1451 
1452 #ifdef CONFIG_SECURITY_NETWORK
1453 	nsk->sk_security = sptr;
1454 	security_sk_clone(osk, nsk);
1455 #endif
1456 }
1457 
1458 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1459 		int family)
1460 {
1461 	struct sock *sk;
1462 	struct kmem_cache *slab;
1463 
1464 	slab = prot->slab;
1465 	if (slab != NULL) {
1466 		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1467 		if (!sk)
1468 			return sk;
1469 		if (priority & __GFP_ZERO)
1470 			sk_prot_clear_nulls(sk, prot->obj_size);
1471 	} else
1472 		sk = kmalloc(prot->obj_size, priority);
1473 
1474 	if (sk != NULL) {
1475 		if (security_sk_alloc(sk, family, priority))
1476 			goto out_free;
1477 
1478 		if (!try_module_get(prot->owner))
1479 			goto out_free_sec;
1480 		sk_tx_queue_clear(sk);
1481 	}
1482 
1483 	return sk;
1484 
1485 out_free_sec:
1486 	security_sk_free(sk);
1487 out_free:
1488 	if (slab != NULL)
1489 		kmem_cache_free(slab, sk);
1490 	else
1491 		kfree(sk);
1492 	return NULL;
1493 }
1494 
1495 static void sk_prot_free(struct proto *prot, struct sock *sk)
1496 {
1497 	struct kmem_cache *slab;
1498 	struct module *owner;
1499 
1500 	owner = prot->owner;
1501 	slab = prot->slab;
1502 
1503 	cgroup_sk_free(&sk->sk_cgrp_data);
1504 	mem_cgroup_sk_free(sk);
1505 	security_sk_free(sk);
1506 	if (slab != NULL)
1507 		kmem_cache_free(slab, sk);
1508 	else
1509 		kfree(sk);
1510 	module_put(owner);
1511 }
1512 
1513 /**
1514  *	sk_alloc - All socket objects are allocated here
1515  *	@net: the applicable net namespace
1516  *	@family: protocol family
1517  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1518  *	@prot: struct proto associated with this new sock instance
1519  *	@kern: is this to be a kernel socket?
1520  */
1521 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1522 		      struct proto *prot, int kern)
1523 {
1524 	struct sock *sk;
1525 
1526 	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1527 	if (sk) {
1528 		sk->sk_family = family;
1529 		/*
1530 		 * See comment in struct sock definition to understand
1531 		 * why we need sk_prot_creator -acme
1532 		 */
1533 		sk->sk_prot = sk->sk_prot_creator = prot;
1534 		sk->sk_kern_sock = kern;
1535 		sock_lock_init(sk);
1536 		sk->sk_net_refcnt = kern ? 0 : 1;
1537 		if (likely(sk->sk_net_refcnt)) {
1538 			get_net(net);
1539 			sock_inuse_add(net, 1);
1540 		}
1541 
1542 		sock_net_set(sk, net);
1543 		refcount_set(&sk->sk_wmem_alloc, 1);
1544 
1545 		mem_cgroup_sk_alloc(sk);
1546 		cgroup_sk_alloc(&sk->sk_cgrp_data);
1547 		sock_update_classid(&sk->sk_cgrp_data);
1548 		sock_update_netprioidx(&sk->sk_cgrp_data);
1549 	}
1550 
1551 	return sk;
1552 }
1553 EXPORT_SYMBOL(sk_alloc);
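
/*
 * Illustrative caller pattern (a sketch, not taken from this file): a
 * protocol family's ->create() handler typically does
 *
 *	sk = sk_alloc(net, PF_INET, GFP_KERNEL, &tcp_prot, kern);
 *	if (!sk)
 *		return -ENOBUFS;
 *	sock_init_data(sock, sk);
 *
 * passing its own struct proto so that sk_prot_alloc() can use the
 * protocol's slab cache and object size.
 */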
1554 
1555 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
1556  * grace period. This is the case for UDP sockets and TCP listeners.
1557  */
1558 static void __sk_destruct(struct rcu_head *head)
1559 {
1560 	struct sock *sk = container_of(head, struct sock, sk_rcu);
1561 	struct sk_filter *filter;
1562 
1563 	if (sk->sk_destruct)
1564 		sk->sk_destruct(sk);
1565 
1566 	filter = rcu_dereference_check(sk->sk_filter,
1567 				       refcount_read(&sk->sk_wmem_alloc) == 0);
1568 	if (filter) {
1569 		sk_filter_uncharge(sk, filter);
1570 		RCU_INIT_POINTER(sk->sk_filter, NULL);
1571 	}
1572 	if (rcu_access_pointer(sk->sk_reuseport_cb))
1573 		reuseport_detach_sock(sk);
1574 
1575 	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1576 
1577 	if (atomic_read(&sk->sk_omem_alloc))
1578 		pr_debug("%s: optmem leakage (%d bytes) detected\n",
1579 			 __func__, atomic_read(&sk->sk_omem_alloc));
1580 
1581 	if (sk->sk_frag.page) {
1582 		put_page(sk->sk_frag.page);
1583 		sk->sk_frag.page = NULL;
1584 	}
1585 
1586 	if (sk->sk_peer_cred)
1587 		put_cred(sk->sk_peer_cred);
1588 	put_pid(sk->sk_peer_pid);
1589 	if (likely(sk->sk_net_refcnt))
1590 		put_net(sock_net(sk));
1591 	sk_prot_free(sk->sk_prot_creator, sk);
1592 }
1593 
1594 void sk_destruct(struct sock *sk)
1595 {
1596 	if (sock_flag(sk, SOCK_RCU_FREE))
1597 		call_rcu(&sk->sk_rcu, __sk_destruct);
1598 	else
1599 		__sk_destruct(&sk->sk_rcu);
1600 }
1601 
1602 static void __sk_free(struct sock *sk)
1603 {
1604 	if (likely(sk->sk_net_refcnt))
1605 		sock_inuse_add(sock_net(sk), -1);
1606 
1607 	if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt))
1608 		sock_diag_broadcast_destroy(sk);
1609 	else
1610 		sk_destruct(sk);
1611 }
1612 
1613 void sk_free(struct sock *sk)
1614 {
1615 	/*
1616 	 * We subtract one from sk_wmem_alloc so we can tell whether
1617 	 * some packets are still in some tx queue.
1618 	 * If the count is not zero, sock_wfree() will call __sk_free(sk) later.
1619 	 */
1620 	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
1621 		__sk_free(sk);
1622 }
1623 EXPORT_SYMBOL(sk_free);
1624 
1625 static void sk_init_common(struct sock *sk)
1626 {
1627 	skb_queue_head_init(&sk->sk_receive_queue);
1628 	skb_queue_head_init(&sk->sk_write_queue);
1629 	skb_queue_head_init(&sk->sk_error_queue);
1630 
1631 	rwlock_init(&sk->sk_callback_lock);
1632 	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
1633 			af_rlock_keys + sk->sk_family,
1634 			af_family_rlock_key_strings[sk->sk_family]);
1635 	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
1636 			af_wlock_keys + sk->sk_family,
1637 			af_family_wlock_key_strings[sk->sk_family]);
1638 	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
1639 			af_elock_keys + sk->sk_family,
1640 			af_family_elock_key_strings[sk->sk_family]);
1641 	lockdep_set_class_and_name(&sk->sk_callback_lock,
1642 			af_callback_keys + sk->sk_family,
1643 			af_family_clock_key_strings[sk->sk_family]);
1644 }
1645 
1646 /**
1647  *	sk_clone_lock - clone a socket, and lock its clone
1648  *	@sk: the socket to clone
1649  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1650  *
1651  *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1652  */
1653 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1654 {
1655 	struct sock *newsk;
1656 	bool is_charged = true;
1657 
1658 	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1659 	if (newsk != NULL) {
1660 		struct sk_filter *filter;
1661 
1662 		sock_copy(newsk, sk);
1663 
1664 		newsk->sk_prot_creator = sk->sk_prot;
1665 
1666 		/* SANITY */
1667 		if (likely(newsk->sk_net_refcnt))
1668 			get_net(sock_net(newsk));
1669 		sk_node_init(&newsk->sk_node);
1670 		sock_lock_init(newsk);
1671 		bh_lock_sock(newsk);
1672 		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
1673 		newsk->sk_backlog.len = 0;
1674 
1675 		atomic_set(&newsk->sk_rmem_alloc, 0);
1676 		/*
1677 		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1678 		 */
1679 		refcount_set(&newsk->sk_wmem_alloc, 1);
1680 		atomic_set(&newsk->sk_omem_alloc, 0);
1681 		sk_init_common(newsk);
1682 
1683 		newsk->sk_dst_cache	= NULL;
1684 		newsk->sk_dst_pending_confirm = 0;
1685 		newsk->sk_wmem_queued	= 0;
1686 		newsk->sk_forward_alloc = 0;
1687 		atomic_set(&newsk->sk_drops, 0);
1688 		newsk->sk_send_head	= NULL;
1689 		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1690 		atomic_set(&newsk->sk_zckey, 0);
1691 
1692 		sock_reset_flag(newsk, SOCK_DONE);
1693 		mem_cgroup_sk_alloc(newsk);
1694 		cgroup_sk_alloc(&newsk->sk_cgrp_data);
1695 
1696 		rcu_read_lock();
1697 		filter = rcu_dereference(sk->sk_filter);
1698 		if (filter != NULL)
1699 			/* though it's an empty new sock, the charging may fail
1700 			 * if sysctl_optmem_max was changed between creation of the
1701 			 * original socket and cloning
1702 			 */
1703 			is_charged = sk_filter_charge(newsk, filter);
1704 		RCU_INIT_POINTER(newsk->sk_filter, filter);
1705 		rcu_read_unlock();
1706 
1707 		if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1708 			/* We need to make sure that we don't uncharge the new
1709 			 * socket if we couldn't charge it in the first place
1710 			 * as otherwise we uncharge the parent's filter.
1711 			 */
1712 			if (!is_charged)
1713 				RCU_INIT_POINTER(newsk->sk_filter, NULL);
1714 			sk_free_unlock_clone(newsk);
1715 			newsk = NULL;
1716 			goto out;
1717 		}
1718 		RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
1719 
1720 		newsk->sk_err	   = 0;
1721 		newsk->sk_err_soft = 0;
1722 		newsk->sk_priority = 0;
1723 		newsk->sk_incoming_cpu = raw_smp_processor_id();
1724 		atomic64_set(&newsk->sk_cookie, 0);
1725 		if (likely(newsk->sk_net_refcnt))
1726 			sock_inuse_add(sock_net(newsk), 1);
1727 
1728 		/*
1729 		 * Before updating sk_refcnt, we must commit prior changes to memory
1730 		 * (Documentation/RCU/rculist_nulls.txt for details)
1731 		 */
1732 		smp_wmb();
1733 		refcount_set(&newsk->sk_refcnt, 2);
1734 
1735 		/*
1736 		 * Increment the counter in the same struct proto as the master
1737 		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1738 		 * is the same as sk->sk_prot->socks, as this field was copied
1739 		 * with memcpy).
1740 		 *
1741 		 * This _changes_ the previous behaviour, where
1742 		 * tcp_create_openreq_child always was incrementing the
1743 		 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
1744 		 * to be taken into account in all callers. -acme
1745 		 */
1746 		sk_refcnt_debug_inc(newsk);
1747 		sk_set_socket(newsk, NULL);
1748 		newsk->sk_wq = NULL;
1749 
1750 		if (newsk->sk_prot->sockets_allocated)
1751 			sk_sockets_allocated_inc(newsk);
1752 
1753 		if (sock_needs_netstamp(sk) &&
1754 		    newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1755 			net_enable_timestamp();
1756 	}
1757 out:
1758 	return newsk;
1759 }
1760 EXPORT_SYMBOL_GPL(sk_clone_lock);
1761 
1762 void sk_free_unlock_clone(struct sock *sk)
1763 {
1764 	/* It is still a raw copy of the parent, so invalidate
1765 	 * the destructor and do a plain sk_free() */
1766 	sk->sk_destruct = NULL;
1767 	bh_unlock_sock(sk);
1768 	sk_free(sk);
1769 }
1770 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
1771 
1772 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1773 {
1774 	u32 max_segs = 1;
1775 
1776 	sk_dst_set(sk, dst);
1777 	sk->sk_route_caps = dst->dev->features;
1778 	if (sk->sk_route_caps & NETIF_F_GSO)
1779 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1780 	sk->sk_route_caps &= ~sk->sk_route_nocaps;
1781 	if (sk_can_gso(sk)) {
1782 		if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
1783 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1784 		} else {
1785 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1786 			sk->sk_gso_max_size = dst->dev->gso_max_size;
1787 			max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
1788 		}
1789 	}
1790 	sk->sk_gso_max_segs = max_segs;
1791 }
1792 EXPORT_SYMBOL_GPL(sk_setup_caps);
1793 
1794 /*
1795  *	Simple resource managers for sockets.
1796  */
1797 
1798 
1799 /*
1800  * Write buffer destructor automatically called from kfree_skb.
1801  */
1802 void sock_wfree(struct sk_buff *skb)
1803 {
1804 	struct sock *sk = skb->sk;
1805 	unsigned int len = skb->truesize;
1806 
1807 	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1808 		/*
1809 		 * Keep a reference on sk_wmem_alloc; it will be released
1810 		 * after the sk_write_space() call
1811 		 */
1812 		WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
1813 		sk->sk_write_space(sk);
1814 		len = 1;
1815 	}
1816 	/*
1817 	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1818 	 * could not do because of in-flight packets
1819 	 */
1820 	if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
1821 		__sk_free(sk);
1822 }
1823 EXPORT_SYMBOL(sock_wfree);
1824 
1825 /* This variant of sock_wfree() is used by TCP,
1826  * since it sets SOCK_USE_WRITE_QUEUE.
1827  */
1828 void __sock_wfree(struct sk_buff *skb)
1829 {
1830 	struct sock *sk = skb->sk;
1831 
1832 	if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
1833 		__sk_free(sk);
1834 }
1835 
1836 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
1837 {
1838 	skb_orphan(skb);
1839 	skb->sk = sk;
1840 #ifdef CONFIG_INET
1841 	if (unlikely(!sk_fullsock(sk))) {
1842 		skb->destructor = sock_edemux;
1843 		sock_hold(sk);
1844 		return;
1845 	}
1846 #endif
1847 	skb->destructor = sock_wfree;
1848 	skb_set_hash_from_sk(skb, sk);
1849 	/*
1850 	 * We used to take a refcount on sk, but the following operation
1851 	 * is enough to guarantee sk_free() won't free this sock until
1852 	 * all in-flight packets are completed
1853 	 */
1854 	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
1855 }
1856 EXPORT_SYMBOL(skb_set_owner_w);
1857 
1858 /* This helper is used by netem, as it can hold packets in its
1859  * delay queue. We want to allow the owner socket to send more
1860  * packets, as if they were already TX completed by a typical driver.
1861  * But we also want to keep skb->sk set because some packet schedulers
1862  * rely on it (sch_fq for example).
1863  */
1864 void skb_orphan_partial(struct sk_buff *skb)
1865 {
1866 	if (skb_is_tcp_pure_ack(skb))
1867 		return;
1868 
1869 	if (skb->destructor == sock_wfree
1870 #ifdef CONFIG_INET
1871 	    || skb->destructor == tcp_wfree
1872 #endif
1873 		) {
1874 		struct sock *sk = skb->sk;
1875 
1876 		if (refcount_inc_not_zero(&sk->sk_refcnt)) {
1877 			WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc));
1878 			skb->destructor = sock_efree;
1879 		}
1880 	} else {
1881 		skb_orphan(skb);
1882 	}
1883 }
1884 EXPORT_SYMBOL(skb_orphan_partial);
1885 
1886 /*
1887  * Read buffer destructor automatically called from kfree_skb.
1888  */
1889 void sock_rfree(struct sk_buff *skb)
1890 {
1891 	struct sock *sk = skb->sk;
1892 	unsigned int len = skb->truesize;
1893 
1894 	atomic_sub(len, &sk->sk_rmem_alloc);
1895 	sk_mem_uncharge(sk, len);
1896 }
1897 EXPORT_SYMBOL(sock_rfree);
1898 
1899 /*
1900  * Buffer destructor for skbs that are not used directly in read or write
1901  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
1902  */
1903 void sock_efree(struct sk_buff *skb)
1904 {
1905 	sock_put(skb->sk);
1906 }
1907 EXPORT_SYMBOL(sock_efree);
1908 
1909 kuid_t sock_i_uid(struct sock *sk)
1910 {
1911 	kuid_t uid;
1912 
1913 	read_lock_bh(&sk->sk_callback_lock);
1914 	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1915 	read_unlock_bh(&sk->sk_callback_lock);
1916 	return uid;
1917 }
1918 EXPORT_SYMBOL(sock_i_uid);
1919 
1920 unsigned long sock_i_ino(struct sock *sk)
1921 {
1922 	unsigned long ino;
1923 
1924 	read_lock_bh(&sk->sk_callback_lock);
1925 	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1926 	read_unlock_bh(&sk->sk_callback_lock);
1927 	return ino;
1928 }
1929 EXPORT_SYMBOL(sock_i_ino);
1930 
1931 /*
1932  * Allocate an skb from the socket's send buffer.
1933  */
1934 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1935 			     gfp_t priority)
1936 {
1937 	if (force || refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1938 		struct sk_buff *skb = alloc_skb(size, priority);
1939 		if (skb) {
1940 			skb_set_owner_w(skb, sk);
1941 			return skb;
1942 		}
1943 	}
1944 	return NULL;
1945 }
1946 EXPORT_SYMBOL(sock_wmalloc);
1947 
1948 static void sock_ofree(struct sk_buff *skb)
1949 {
1950 	struct sock *sk = skb->sk;
1951 
1952 	atomic_sub(skb->truesize, &sk->sk_omem_alloc);
1953 }
1954 
1955 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
1956 			     gfp_t priority)
1957 {
1958 	struct sk_buff *skb;
1959 
1960 	/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
1961 	if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
1962 	    sysctl_optmem_max)
1963 		return NULL;
1964 
1965 	skb = alloc_skb(size, priority);
1966 	if (!skb)
1967 		return NULL;
1968 
1969 	atomic_add(skb->truesize, &sk->sk_omem_alloc);
1970 	skb->sk = sk;
1971 	skb->destructor = sock_ofree;
1972 	return skb;
1973 }
1974 
1975 /*
1976  * Allocate a memory block from the socket's option memory buffer.
1977  */
1978 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1979 {
1980 	if ((unsigned int)size <= sysctl_optmem_max &&
1981 	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1982 		void *mem;
1983 		/* Do the add first, to avoid the race while kmalloc
1984 		 * might sleep.
1985 		 */
1986 		atomic_add(size, &sk->sk_omem_alloc);
1987 		mem = kmalloc(size, priority);
1988 		if (mem)
1989 			return mem;
1990 		atomic_sub(size, &sk->sk_omem_alloc);
1991 	}
1992 	return NULL;
1993 }
1994 EXPORT_SYMBOL(sock_kmalloc);
1995 
1996 /* Free an option memory block. Note that we actually want the inline
1997  * here, as it allows gcc to detect the nullify and fold away the
1998  * condition entirely.
1999  */
2000 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2001 				  const bool nullify)
2002 {
2003 	if (WARN_ON_ONCE(!mem))
2004 		return;
2005 	if (nullify)
2006 		kzfree(mem);
2007 	else
2008 		kfree(mem);
2009 	atomic_sub(size, &sk->sk_omem_alloc);
2010 }
2011 
2012 void sock_kfree_s(struct sock *sk, void *mem, int size)
2013 {
2014 	__sock_kfree_s(sk, mem, size, false);
2015 }
2016 EXPORT_SYMBOL(sock_kfree_s);
2017 
2018 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2019 {
2020 	__sock_kfree_s(sk, mem, size, true);
2021 }
2022 EXPORT_SYMBOL(sock_kzfree_s);
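
/*
 * Example: the usual pairing of sock_kmalloc() and sock_kfree_s() for
 * optmem-accounted, per-socket state. This is an illustrative sketch only;
 * the helper name and the 64-byte cap are placeholders, not kernel symbols.
 *
 *	static int example_store_opt(struct sock *sk, char __user *optval,
 *				     unsigned int optlen)
 *	{
 *		void *buf;
 *
 *		if (optlen > 64)
 *			return -EINVAL;
 *		buf = sock_kmalloc(sk, optlen, GFP_KERNEL);
 *		if (!buf)
 *			return -ENOBUFS;
 *		if (copy_from_user(buf, optval, optlen)) {
 *			sock_kfree_s(sk, buf, optlen);
 *			return -EFAULT;
 *		}
 *		// ... consume buf, then release it with the same size ...
 *		sock_kfree_s(sk, buf, optlen);
 *		return 0;
 *	}
 */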
2023 
2024 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2025    I think these locks should be removed for datagram sockets.
2026  */
2027 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2028 {
2029 	DEFINE_WAIT(wait);
2030 
2031 	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2032 	for (;;) {
2033 		if (!timeo)
2034 			break;
2035 		if (signal_pending(current))
2036 			break;
2037 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2038 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2039 		if (refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
2040 			break;
2041 		if (sk->sk_shutdown & SEND_SHUTDOWN)
2042 			break;
2043 		if (sk->sk_err)
2044 			break;
2045 		timeo = schedule_timeout(timeo);
2046 	}
2047 	finish_wait(sk_sleep(sk), &wait);
2048 	return timeo;
2049 }
2050 
2051 
2052 /*
2053  *	Generic send/receive buffer handlers
2054  */
2055 
2056 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2057 				     unsigned long data_len, int noblock,
2058 				     int *errcode, int max_page_order)
2059 {
2060 	struct sk_buff *skb;
2061 	long timeo;
2062 	int err;
2063 
2064 	timeo = sock_sndtimeo(sk, noblock);
2065 	for (;;) {
2066 		err = sock_error(sk);
2067 		if (err != 0)
2068 			goto failure;
2069 
2070 		err = -EPIPE;
2071 		if (sk->sk_shutdown & SEND_SHUTDOWN)
2072 			goto failure;
2073 
2074 		if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
2075 			break;
2076 
2077 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2078 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2079 		err = -EAGAIN;
2080 		if (!timeo)
2081 			goto failure;
2082 		if (signal_pending(current))
2083 			goto interrupted;
2084 		timeo = sock_wait_for_wmem(sk, timeo);
2085 	}
2086 	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2087 				   errcode, sk->sk_allocation);
2088 	if (skb)
2089 		skb_set_owner_w(skb, sk);
2090 	return skb;
2091 
2092 interrupted:
2093 	err = sock_intr_errno(timeo);
2094 failure:
2095 	*errcode = err;
2096 	return NULL;
2097 }
2098 EXPORT_SYMBOL(sock_alloc_send_pskb);
2099 
2100 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
2101 				    int noblock, int *errcode)
2102 {
2103 	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
2104 }
2105 EXPORT_SYMBOL(sock_alloc_send_skb);
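
/*
 * Example: a typical datagram sendmsg() path built on sock_alloc_send_skb().
 * Illustrative sketch only; "hlen", "len", "msg" and "err" are caller state,
 * and the final transmit step stands in for protocol-specific details.
 *
 *	skb = sock_alloc_send_skb(sk, hlen + len,
 *				  msg->msg_flags & MSG_DONTWAIT, &err);
 *	if (!skb)
 *		return err;
 *	skb_reserve(skb, hlen);
 *	err = memcpy_from_msg(skb_put(skb, len), msg, len);
 *	if (err) {
 *		kfree_skb(skb);
 *		return err;
 *	}
 *	// ... hand the skb to the transmit path; sock_wfree() uncharges
 *	//     sk_wmem_alloc when the skb is eventually freed ...
 */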
2106 
2107 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2108 		     struct sockcm_cookie *sockc)
2109 {
2110 	u32 tsflags;
2111 
2112 	switch (cmsg->cmsg_type) {
2113 	case SO_MARK:
2114 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2115 			return -EPERM;
2116 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2117 			return -EINVAL;
2118 		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2119 		break;
2120 	case SO_TIMESTAMPING:
2121 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2122 			return -EINVAL;
2123 
2124 		tsflags = *(u32 *)CMSG_DATA(cmsg);
2125 		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2126 			return -EINVAL;
2127 
2128 		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2129 		sockc->tsflags |= tsflags;
2130 		break;
2131 	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2132 	case SCM_RIGHTS:
2133 	case SCM_CREDENTIALS:
2134 		break;
2135 	default:
2136 		return -EINVAL;
2137 	}
2138 	return 0;
2139 }
2140 EXPORT_SYMBOL(__sock_cmsg_send);
2141 
2142 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2143 		   struct sockcm_cookie *sockc)
2144 {
2145 	struct cmsghdr *cmsg;
2146 	int ret;
2147 
2148 	for_each_cmsghdr(cmsg, msg) {
2149 		if (!CMSG_OK(msg, cmsg))
2150 			return -EINVAL;
2151 		if (cmsg->cmsg_level != SOL_SOCKET)
2152 			continue;
2153 		ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2154 		if (ret)
2155 			return ret;
2156 	}
2157 	return 0;
2158 }
2159 EXPORT_SYMBOL(sock_cmsg_send);
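
/*
 * Example: how a sendmsg() implementation typically consumes these helpers.
 * Illustrative sketch only; depending on the protocol, sockc.mark may also
 * need to be seeded from sk->sk_mark before parsing.
 *
 *	struct sockcm_cookie sockc = { .tsflags = sk->sk_tsflags };
 *
 *	if (msg->msg_controllen) {
 *		err = sock_cmsg_send(sk, msg, &sockc);
 *		if (unlikely(err))
 *			return err;
 *	}
 *	// ... propagate sockc.mark / sockc.tsflags into the skb being built ...
 */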
2160 
2161 static void sk_enter_memory_pressure(struct sock *sk)
2162 {
2163 	if (!sk->sk_prot->enter_memory_pressure)
2164 		return;
2165 
2166 	sk->sk_prot->enter_memory_pressure(sk);
2167 }
2168 
2169 static void sk_leave_memory_pressure(struct sock *sk)
2170 {
2171 	if (sk->sk_prot->leave_memory_pressure) {
2172 		sk->sk_prot->leave_memory_pressure(sk);
2173 	} else {
2174 		unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2175 
2176 		if (memory_pressure && *memory_pressure)
2177 			*memory_pressure = 0;
2178 	}
2179 }
2180 
2181 /* On 32-bit arches, an skb frag is limited to 2^15 bytes */
2182 #define SKB_FRAG_PAGE_ORDER	get_order(32768)
2183 
2184 /**
2185  * skb_page_frag_refill - check that a page_frag contains enough room
2186  * @sz: minimum size of the fragment we want to get
2187  * @pfrag: pointer to page_frag
2188  * @gfp: priority for memory allocation
2189  *
2190  * Note: While this allocator tries to use high order pages, there is
2191  * no guarantee that allocations succeed. Therefore, @sz MUST be
2192  * less than or equal to PAGE_SIZE.
2193  */
2194 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2195 {
2196 	if (pfrag->page) {
2197 		if (page_ref_count(pfrag->page) == 1) {
2198 			pfrag->offset = 0;
2199 			return true;
2200 		}
2201 		if (pfrag->offset + sz <= pfrag->size)
2202 			return true;
2203 		put_page(pfrag->page);
2204 	}
2205 
2206 	pfrag->offset = 0;
2207 	if (SKB_FRAG_PAGE_ORDER) {
2208 		/* Avoid direct reclaim but allow kswapd to wake */
2209 		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2210 					  __GFP_COMP | __GFP_NOWARN |
2211 					  __GFP_NORETRY,
2212 					  SKB_FRAG_PAGE_ORDER);
2213 		if (likely(pfrag->page)) {
2214 			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2215 			return true;
2216 		}
2217 	}
2218 	pfrag->page = alloc_page(gfp);
2219 	if (likely(pfrag->page)) {
2220 		pfrag->size = PAGE_SIZE;
2221 		return true;
2222 	}
2223 	return false;
2224 }
2225 EXPORT_SYMBOL(skb_page_frag_refill);
2226 
2227 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2228 {
2229 	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2230 		return true;
2231 
2232 	sk_enter_memory_pressure(sk);
2233 	sk_stream_moderate_sndbuf(sk);
2234 	return false;
2235 }
2236 EXPORT_SYMBOL(sk_page_frag_refill);
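
/*
 * Example: filling the per-socket page frag. Illustrative sketch only;
 * "data" and "len" are caller-provided, and attaching the page to an skb
 * (get_page() + skb_fill_page_desc()) is left out.
 *
 *	struct page_frag *pfrag = sk_page_frag(sk);
 *	int copy;
 *
 *	if (!sk_page_frag_refill(sk, pfrag))
 *		return -EAGAIN;	// memory pressure was already signalled
 *	copy = min_t(int, len, pfrag->size - pfrag->offset);
 *	memcpy(page_address(pfrag->page) + pfrag->offset, data, copy);
 *	pfrag->offset += copy;
 */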
2237 
2238 static void __lock_sock(struct sock *sk)
2239 	__releases(&sk->sk_lock.slock)
2240 	__acquires(&sk->sk_lock.slock)
2241 {
2242 	DEFINE_WAIT(wait);
2243 
2244 	for (;;) {
2245 		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2246 					TASK_UNINTERRUPTIBLE);
2247 		spin_unlock_bh(&sk->sk_lock.slock);
2248 		schedule();
2249 		spin_lock_bh(&sk->sk_lock.slock);
2250 		if (!sock_owned_by_user(sk))
2251 			break;
2252 	}
2253 	finish_wait(&sk->sk_lock.wq, &wait);
2254 }
2255 
2256 static void __release_sock(struct sock *sk)
2257 	__releases(&sk->sk_lock.slock)
2258 	__acquires(&sk->sk_lock.slock)
2259 {
2260 	struct sk_buff *skb, *next;
2261 
2262 	while ((skb = sk->sk_backlog.head) != NULL) {
2263 		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2264 
2265 		spin_unlock_bh(&sk->sk_lock.slock);
2266 
2267 		do {
2268 			next = skb->next;
2269 			prefetch(next);
2270 			WARN_ON_ONCE(skb_dst_is_noref(skb));
2271 			skb->next = NULL;
2272 			sk_backlog_rcv(sk, skb);
2273 
2274 			cond_resched();
2275 
2276 			skb = next;
2277 		} while (skb != NULL);
2278 
2279 		spin_lock_bh(&sk->sk_lock.slock);
2280 	}
2281 
2282 	/*
2283 	 * Doing the zeroing here guarantees we cannot loop forever
2284 	 * while a wild producer attempts to flood us.
2285 	 */
2286 	sk->sk_backlog.len = 0;
2287 }
2288 
2289 void __sk_flush_backlog(struct sock *sk)
2290 {
2291 	spin_lock_bh(&sk->sk_lock.slock);
2292 	__release_sock(sk);
2293 	spin_unlock_bh(&sk->sk_lock.slock);
2294 }
2295 
2296 /**
2297  * sk_wait_data - wait for data to arrive at sk_receive_queue
2298  * @sk:    sock to wait on
2299  * @timeo: for how long
2300  * @skb:   last skb seen on sk_receive_queue
2301  *
2302  * Now socket state, including sk->sk_err, is changed only under the
2303  * socket lock, hence we may omit checks after joining the wait queue.
2304  * We check the receive queue before schedule() only as an optimization;
2305  * it is very likely that release_sock() added new data.
2306  */
2307 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2308 {
2309 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
2310 	int rc;
2311 
2312 	add_wait_queue(sk_sleep(sk), &wait);
2313 	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2314 	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2315 	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2316 	remove_wait_queue(sk_sleep(sk), &wait);
2317 	return rc;
2318 }
2319 EXPORT_SYMBOL(sk_wait_data);
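
/*
 * Example: a blocking receive loop built on sk_wait_data(). Illustrative
 * sketch only; "flags" and "err" are caller state, and real callers also
 * handle sk->sk_err, shutdown and MSG_PEEK.
 *
 *	long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *	struct sk_buff *skb;
 *
 *	lock_sock(sk);
 *	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
 *		if (!timeo) {
 *			err = -EAGAIN;
 *			break;
 *		}
 *		if (signal_pending(current)) {
 *			err = sock_intr_errno(timeo);
 *			break;
 *		}
 *		sk_wait_data(sk, &timeo, NULL);
 *	}
 *	// ... consume skb under the socket lock, then release_sock(sk) ...
 */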
2320 
2321 /**
2322  *	__sk_mem_raise_allocated - increase memory_allocated
2323  *	@sk: socket
2324  *	@size: memory size to allocate
2325  *	@amt: pages to allocate
2326  *	@kind: allocation type
2327  *
2328  *	Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2329  */
2330 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2331 {
2332 	struct proto *prot = sk->sk_prot;
2333 	long allocated = sk_memory_allocated_add(sk, amt);
2334 
2335 	if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2336 	    !mem_cgroup_charge_skmem(sk->sk_memcg, amt))
2337 		goto suppress_allocation;
2338 
2339 	/* Under limit. */
2340 	if (allocated <= sk_prot_mem_limits(sk, 0)) {
2341 		sk_leave_memory_pressure(sk);
2342 		return 1;
2343 	}
2344 
2345 	/* Under pressure. */
2346 	if (allocated > sk_prot_mem_limits(sk, 1))
2347 		sk_enter_memory_pressure(sk);
2348 
2349 	/* Over hard limit. */
2350 	if (allocated > sk_prot_mem_limits(sk, 2))
2351 		goto suppress_allocation;
2352 
2353 	/* guarantee minimum buffer size under pressure */
2354 	if (kind == SK_MEM_RECV) {
2355 		if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
2356 			return 1;
2357 
2358 	} else { /* SK_MEM_SEND */
2359 		int wmem0 = sk_get_wmem0(sk, prot);
2360 
2361 		if (sk->sk_type == SOCK_STREAM) {
2362 			if (sk->sk_wmem_queued < wmem0)
2363 				return 1;
2364 		} else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
2365 			return 1;
2366 		}
2367 	}
2368 
2369 	if (sk_has_memory_pressure(sk)) {
2370 		int alloc;
2371 
2372 		if (!sk_under_memory_pressure(sk))
2373 			return 1;
2374 		alloc = sk_sockets_allocated_read_positive(sk);
2375 		if (sk_prot_mem_limits(sk, 2) > alloc *
2376 		    sk_mem_pages(sk->sk_wmem_queued +
2377 				 atomic_read(&sk->sk_rmem_alloc) +
2378 				 sk->sk_forward_alloc))
2379 			return 1;
2380 	}
2381 
2382 suppress_allocation:
2383 
2384 	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2385 		sk_stream_moderate_sndbuf(sk);
2386 
2387 		/* Fail only if socket is _under_ its sndbuf.
2388 		 * In this case we cannot block, so we have to fail.
2389 		 */
2390 		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2391 			return 1;
2392 	}
2393 
2394 	trace_sock_exceed_buf_limit(sk, prot, allocated);
2395 
2396 	sk_memory_allocated_sub(sk, amt);
2397 
2398 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2399 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2400 
2401 	return 0;
2402 }
2403 EXPORT_SYMBOL(__sk_mem_raise_allocated);
2404 
2405 /**
2406  *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2407  *	@sk: socket
2408  *	@size: memory size to allocate
2409  *	@kind: allocation type
2410  *
2411  *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2412  *	rmem allocation. This function assumes that protocols which have
2413  *	memory_pressure use sk_wmem_queued for write buffer accounting.
2414  */
2415 int __sk_mem_schedule(struct sock *sk, int size, int kind)
2416 {
2417 	int ret, amt = sk_mem_pages(size);
2418 
2419 	sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2420 	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2421 	if (!ret)
2422 		sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2423 	return ret;
2424 }
2425 EXPORT_SYMBOL(__sk_mem_schedule);
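
/*
 * Example: how a receive path typically pairs memory scheduling (which ends
 * up in __sk_mem_schedule() via sk_rmem_schedule()) with skb_set_owner_r().
 * Illustrative sketch only, loosely modelled on protocol rx paths; real code
 * applies additional per-protocol policies.
 *
 *	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize > sk->sk_rcvbuf ||
 *	    !sk_rmem_schedule(sk, skb, skb->truesize))
 *		return -ENOBUFS;
 *	skb_set_owner_r(skb, sk);	// charges sk_rmem_alloc, uses fwd_alloc
 *	skb_queue_tail(&sk->sk_receive_queue, skb);
 *	sk->sk_data_ready(sk);
 *	// sock_rfree() undoes the charges when the skb is freed
 */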
2426 
2427 /**
2428  *	__sk_mem_reduce_allocated - reclaim memory_allocated
2429  *	@sk: socket
2430  *	@amount: number of quanta
2431  *
2432  *	Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2433  */
2434 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2435 {
2436 	sk_memory_allocated_sub(sk, amount);
2437 
2438 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2439 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2440 
2441 	if (sk_under_memory_pressure(sk) &&
2442 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2443 		sk_leave_memory_pressure(sk);
2444 }
2445 EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2446 
2447 /**
2448  *	__sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2449  *	@sk: socket
2450  *	@amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2451  */
2452 void __sk_mem_reclaim(struct sock *sk, int amount)
2453 {
2454 	amount >>= SK_MEM_QUANTUM_SHIFT;
2455 	sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2456 	__sk_mem_reduce_allocated(sk, amount);
2457 }
2458 EXPORT_SYMBOL(__sk_mem_reclaim);
2459 
2460 int sk_set_peek_off(struct sock *sk, int val)
2461 {
2462 	sk->sk_peek_off = val;
2463 	return 0;
2464 }
2465 EXPORT_SYMBOL_GPL(sk_set_peek_off);
2466 
2467 /*
2468  * Set of default routines for initialising struct proto_ops when
2469  * the protocol does not support a particular function. In certain
2470  * cases where it makes no sense for a protocol to have a "do nothing"
2471  * function, some default processing is provided.
2472  */
2473 
2474 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2475 {
2476 	return -EOPNOTSUPP;
2477 }
2478 EXPORT_SYMBOL(sock_no_bind);
2479 
2480 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2481 		    int len, int flags)
2482 {
2483 	return -EOPNOTSUPP;
2484 }
2485 EXPORT_SYMBOL(sock_no_connect);
2486 
2487 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2488 {
2489 	return -EOPNOTSUPP;
2490 }
2491 EXPORT_SYMBOL(sock_no_socketpair);
2492 
2493 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
2494 		   bool kern)
2495 {
2496 	return -EOPNOTSUPP;
2497 }
2498 EXPORT_SYMBOL(sock_no_accept);
2499 
2500 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2501 		    int peer)
2502 {
2503 	return -EOPNOTSUPP;
2504 }
2505 EXPORT_SYMBOL(sock_no_getname);
2506 
2507 __poll_t sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
2508 {
2509 	return 0;
2510 }
2511 EXPORT_SYMBOL(sock_no_poll);
2512 
2513 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2514 {
2515 	return -EOPNOTSUPP;
2516 }
2517 EXPORT_SYMBOL(sock_no_ioctl);
2518 
2519 int sock_no_listen(struct socket *sock, int backlog)
2520 {
2521 	return -EOPNOTSUPP;
2522 }
2523 EXPORT_SYMBOL(sock_no_listen);
2524 
2525 int sock_no_shutdown(struct socket *sock, int how)
2526 {
2527 	return -EOPNOTSUPP;
2528 }
2529 EXPORT_SYMBOL(sock_no_shutdown);
2530 
2531 int sock_no_setsockopt(struct socket *sock, int level, int optname,
2532 		    char __user *optval, unsigned int optlen)
2533 {
2534 	return -EOPNOTSUPP;
2535 }
2536 EXPORT_SYMBOL(sock_no_setsockopt);
2537 
2538 int sock_no_getsockopt(struct socket *sock, int level, int optname,
2539 		    char __user *optval, int __user *optlen)
2540 {
2541 	return -EOPNOTSUPP;
2542 }
2543 EXPORT_SYMBOL(sock_no_getsockopt);
2544 
2545 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
2546 {
2547 	return -EOPNOTSUPP;
2548 }
2549 EXPORT_SYMBOL(sock_no_sendmsg);
2550 
2551 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
2552 {
2553 	return -EOPNOTSUPP;
2554 }
2555 EXPORT_SYMBOL(sock_no_sendmsg_locked);
2556 
2557 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
2558 		    int flags)
2559 {
2560 	return -EOPNOTSUPP;
2561 }
2562 EXPORT_SYMBOL(sock_no_recvmsg);
2563 
2564 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2565 {
2566 	/* Mirror missing mmap method error code */
2567 	return -ENODEV;
2568 }
2569 EXPORT_SYMBOL(sock_no_mmap);
2570 
2571 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2572 {
2573 	ssize_t res;
2574 	struct msghdr msg = {.msg_flags = flags};
2575 	struct kvec iov;
2576 	char *kaddr = kmap(page);
2577 	iov.iov_base = kaddr + offset;
2578 	iov.iov_len = size;
2579 	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2580 	kunmap(page);
2581 	return res;
2582 }
2583 EXPORT_SYMBOL(sock_no_sendpage);
2584 
2585 ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
2586 				int offset, size_t size, int flags)
2587 {
2588 	ssize_t res;
2589 	struct msghdr msg = {.msg_flags = flags};
2590 	struct kvec iov;
2591 	char *kaddr = kmap(page);
2592 
2593 	iov.iov_base = kaddr + offset;
2594 	iov.iov_len = size;
2595 	res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
2596 	kunmap(page);
2597 	return res;
2598 }
2599 EXPORT_SYMBOL(sock_no_sendpage_locked);
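
/*
 * Example: a minimal proto_ops table that falls back on the sock_no_*
 * defaults above. Illustrative sketch only; PF_EXAMPLE and the example_*
 * symbols are placeholders, not real kernel identifiers.
 *
 *	static const struct proto_ops example_proto_ops = {
 *		.family		= PF_EXAMPLE,
 *		.owner		= THIS_MODULE,
 *		.release	= example_release,
 *		.bind		= sock_no_bind,
 *		.connect	= sock_no_connect,
 *		.socketpair	= sock_no_socketpair,
 *		.accept		= sock_no_accept,
 *		.getname	= sock_no_getname,
 *		.poll		= datagram_poll,
 *		.ioctl		= sock_no_ioctl,
 *		.listen		= sock_no_listen,
 *		.shutdown	= sock_no_shutdown,
 *		.setsockopt	= sock_no_setsockopt,
 *		.getsockopt	= sock_no_getsockopt,
 *		.sendmsg	= example_sendmsg,
 *		.recvmsg	= example_recvmsg,
 *		.mmap		= sock_no_mmap,
 *		.sendpage	= sock_no_sendpage,
 *	};
 */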
2600 
2601 /*
2602  *	Default Socket Callbacks
2603  */
2604 
2605 static void sock_def_wakeup(struct sock *sk)
2606 {
2607 	struct socket_wq *wq;
2608 
2609 	rcu_read_lock();
2610 	wq = rcu_dereference(sk->sk_wq);
2611 	if (skwq_has_sleeper(wq))
2612 		wake_up_interruptible_all(&wq->wait);
2613 	rcu_read_unlock();
2614 }
2615 
2616 static void sock_def_error_report(struct sock *sk)
2617 {
2618 	struct socket_wq *wq;
2619 
2620 	rcu_read_lock();
2621 	wq = rcu_dereference(sk->sk_wq);
2622 	if (skwq_has_sleeper(wq))
2623 		wake_up_interruptible_poll(&wq->wait, EPOLLERR);
2624 	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2625 	rcu_read_unlock();
2626 }
2627 
2628 static void sock_def_readable(struct sock *sk)
2629 {
2630 	struct socket_wq *wq;
2631 
2632 	rcu_read_lock();
2633 	wq = rcu_dereference(sk->sk_wq);
2634 	if (skwq_has_sleeper(wq))
2635 		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
2636 						EPOLLRDNORM | EPOLLRDBAND);
2637 	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2638 	rcu_read_unlock();
2639 }
2640 
2641 static void sock_def_write_space(struct sock *sk)
2642 {
2643 	struct socket_wq *wq;
2644 
2645 	rcu_read_lock();
2646 
2647 	/* Do not wake up a writer until he can make "significant"
2648 	 * progress.  --DaveM
2649 	 */
2650 	if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2651 		wq = rcu_dereference(sk->sk_wq);
2652 		if (skwq_has_sleeper(wq))
2653 			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
2654 						EPOLLWRNORM | EPOLLWRBAND);
2655 
2656 		/* Should agree with poll, otherwise some programs break */
2657 		if (sock_writeable(sk))
2658 			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2659 	}
2660 
2661 	rcu_read_unlock();
2662 }
2663 
2664 static void sock_def_destruct(struct sock *sk)
2665 {
2666 }
2667 
2668 void sk_send_sigurg(struct sock *sk)
2669 {
2670 	if (sk->sk_socket && sk->sk_socket->file)
2671 		if (send_sigurg(&sk->sk_socket->file->f_owner))
2672 			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2673 }
2674 EXPORT_SYMBOL(sk_send_sigurg);
2675 
2676 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2677 		    unsigned long expires)
2678 {
2679 	if (!mod_timer(timer, expires))
2680 		sock_hold(sk);
2681 }
2682 EXPORT_SYMBOL(sk_reset_timer);
2683 
2684 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2685 {
2686 	if (del_timer(timer))
2687 		__sock_put(sk);
2688 }
2689 EXPORT_SYMBOL(sk_stop_timer);
2690 
2691 void sock_init_data(struct socket *sock, struct sock *sk)
2692 {
2693 	sk_init_common(sk);
2694 	sk->sk_send_head	=	NULL;
2695 
2696 	timer_setup(&sk->sk_timer, NULL, 0);
2697 
2698 	sk->sk_allocation	=	GFP_KERNEL;
2699 	sk->sk_rcvbuf		=	sysctl_rmem_default;
2700 	sk->sk_sndbuf		=	sysctl_wmem_default;
2701 	sk->sk_state		=	TCP_CLOSE;
2702 	sk_set_socket(sk, sock);
2703 
2704 	sock_set_flag(sk, SOCK_ZAPPED);
2705 
2706 	if (sock) {
2707 		sk->sk_type	=	sock->type;
2708 		sk->sk_wq	=	sock->wq;
2709 		sock->sk	=	sk;
2710 		sk->sk_uid	=	SOCK_INODE(sock)->i_uid;
2711 	} else {
2712 		sk->sk_wq	=	NULL;
2713 		sk->sk_uid	=	make_kuid(sock_net(sk)->user_ns, 0);
2714 	}
2715 
2716 	rwlock_init(&sk->sk_callback_lock);
2717 	if (sk->sk_kern_sock)
2718 		lockdep_set_class_and_name(
2719 			&sk->sk_callback_lock,
2720 			af_kern_callback_keys + sk->sk_family,
2721 			af_family_kern_clock_key_strings[sk->sk_family]);
2722 	else
2723 		lockdep_set_class_and_name(
2724 			&sk->sk_callback_lock,
2725 			af_callback_keys + sk->sk_family,
2726 			af_family_clock_key_strings[sk->sk_family]);
2727 
2728 	sk->sk_state_change	=	sock_def_wakeup;
2729 	sk->sk_data_ready	=	sock_def_readable;
2730 	sk->sk_write_space	=	sock_def_write_space;
2731 	sk->sk_error_report	=	sock_def_error_report;
2732 	sk->sk_destruct		=	sock_def_destruct;
2733 
2734 	sk->sk_frag.page	=	NULL;
2735 	sk->sk_frag.offset	=	0;
2736 	sk->sk_peek_off		=	-1;
2737 
2738 	sk->sk_peer_pid 	=	NULL;
2739 	sk->sk_peer_cred	=	NULL;
2740 	sk->sk_write_pending	=	0;
2741 	sk->sk_rcvlowat		=	1;
2742 	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
2743 	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
2744 
2745 	sk->sk_stamp = SK_DEFAULT_STAMP;
2746 	atomic_set(&sk->sk_zckey, 0);
2747 
2748 #ifdef CONFIG_NET_RX_BUSY_POLL
2749 	sk->sk_napi_id		=	0;
2750 	sk->sk_ll_usec		=	sysctl_net_busy_read;
2751 #endif
2752 
2753 	sk->sk_max_pacing_rate = ~0U;
2754 	sk->sk_pacing_rate = ~0U;
2755 	sk->sk_pacing_shift = 10;
2756 	sk->sk_incoming_cpu = -1;
2757 	/*
2758 	 * Before updating sk_refcnt, we must commit prior changes to memory
2759 	 * (Documentation/RCU/rculist_nulls.txt for details)
2760 	 */
2761 	smp_wmb();
2762 	refcount_set(&sk->sk_refcnt, 1);
2763 	atomic_set(&sk->sk_drops, 0);
2764 }
2765 EXPORT_SYMBOL(sock_init_data);
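
/*
 * Example: how an address family's create() hook typically uses
 * sock_init_data(). Illustrative sketch only; PF_EXAMPLE, example_proto and
 * example_proto_ops are placeholders, not real kernel identifiers.
 *
 *	static int example_create(struct net *net, struct socket *sock,
 *				  int protocol, int kern)
 *	{
 *		struct sock *sk;
 *
 *		sk = sk_alloc(net, PF_EXAMPLE, GFP_KERNEL, &example_proto, kern);
 *		if (!sk)
 *			return -ENOMEM;
 *
 *		sock->ops = &example_proto_ops;
 *		sock_init_data(sock, sk);
 *		sk->sk_protocol = protocol;
 *		return 0;
 *	}
 */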
2766 
2767 void lock_sock_nested(struct sock *sk, int subclass)
2768 {
2769 	might_sleep();
2770 	spin_lock_bh(&sk->sk_lock.slock);
2771 	if (sk->sk_lock.owned)
2772 		__lock_sock(sk);
2773 	sk->sk_lock.owned = 1;
2774 	spin_unlock(&sk->sk_lock.slock);
2775 	/*
2776 	 * The sk_lock has mutex_lock() semantics here:
2777 	 */
2778 	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2779 	local_bh_enable();
2780 }
2781 EXPORT_SYMBOL(lock_sock_nested);
2782 
2783 void release_sock(struct sock *sk)
2784 {
2785 	spin_lock_bh(&sk->sk_lock.slock);
2786 	if (sk->sk_backlog.tail)
2787 		__release_sock(sk);
2788 
2789 	/* Warning: release_cb() might need to release sk ownership,
2790 	 * i.e. call sock_release_ownership(sk) before us.
2791 	 */
2792 	if (sk->sk_prot->release_cb)
2793 		sk->sk_prot->release_cb(sk);
2794 
2795 	sock_release_ownership(sk);
2796 	if (waitqueue_active(&sk->sk_lock.wq))
2797 		wake_up(&sk->sk_lock.wq);
2798 	spin_unlock_bh(&sk->sk_lock.slock);
2799 }
2800 EXPORT_SYMBOL(release_sock);
2801 
2802 /**
2803  * lock_sock_fast - fast version of lock_sock
2804  * @sk: socket
2805  *
2806  * This version should be used for very small sections, where the process
2807  * won't block. Returns false if the fast path is taken:
2808  *
2809  *   sk_lock.slock locked, owned = 0, BH disabled
2810  *
2811  * Returns true if the slow path is taken:
2812  *
2813  *   sk_lock.slock unlocked, owned = 1, BH enabled
2814  */
2815 bool lock_sock_fast(struct sock *sk)
2816 {
2817 	might_sleep();
2818 	spin_lock_bh(&sk->sk_lock.slock);
2819 
2820 	if (!sk->sk_lock.owned)
2821 		/*
2822 		 * Note: we return with BH disabled on this fast path
2823 		 */
2824 		return false;
2825 
2826 	__lock_sock(sk);
2827 	sk->sk_lock.owned = 1;
2828 	spin_unlock(&sk->sk_lock.slock);
2829 	/*
2830 	 * The sk_lock has mutex_lock() semantics here:
2831 	 */
2832 	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2833 	local_bh_enable();
2834 	return true;
2835 }
2836 EXPORT_SYMBOL(lock_sock_fast);
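
/*
 * Example: pairing lock_sock_fast() with unlock_sock_fast(), which takes the
 * return value so it knows whether to release_sock() or just unlock the
 * spinlock. Illustrative sketch only.
 *
 *	bool slow = lock_sock_fast(sk);
 *
 *	// ... short, non-blocking critical section ...
 *	unlock_sock_fast(sk, slow);
 */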
2837 
2838 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2839 {
2840 	struct timeval tv;
2841 	if (!sock_flag(sk, SOCK_TIMESTAMP))
2842 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2843 	tv = ktime_to_timeval(sk->sk_stamp);
2844 	if (tv.tv_sec == -1)
2845 		return -ENOENT;
2846 	if (tv.tv_sec == 0) {
2847 		sk->sk_stamp = ktime_get_real();
2848 		tv = ktime_to_timeval(sk->sk_stamp);
2849 	}
2850 	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2851 }
2852 EXPORT_SYMBOL(sock_get_timestamp);
2853 
2854 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2855 {
2856 	struct timespec ts;
2857 	if (!sock_flag(sk, SOCK_TIMESTAMP))
2858 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2859 	ts = ktime_to_timespec(sk->sk_stamp);
2860 	if (ts.tv_sec == -1)
2861 		return -ENOENT;
2862 	if (ts.tv_sec == 0) {
2863 		sk->sk_stamp = ktime_get_real();
2864 		ts = ktime_to_timespec(sk->sk_stamp);
2865 	}
2866 	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2867 }
2868 EXPORT_SYMBOL(sock_get_timestampns);
2869 
2870 void sock_enable_timestamp(struct sock *sk, int flag)
2871 {
2872 	if (!sock_flag(sk, flag)) {
2873 		unsigned long previous_flags = sk->sk_flags;
2874 
2875 		sock_set_flag(sk, flag);
2876 		/*
2877 		 * we just set one of the two flags which require net
2878 		 * time stamping, but time stamping might have been on
2879 		 * already because of the other one
2880 		 */
2881 		if (sock_needs_netstamp(sk) &&
2882 		    !(previous_flags & SK_FLAGS_TIMESTAMP))
2883 			net_enable_timestamp();
2884 	}
2885 }
2886 
2887 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2888 		       int level, int type)
2889 {
2890 	struct sock_exterr_skb *serr;
2891 	struct sk_buff *skb;
2892 	int copied, err;
2893 
2894 	err = -EAGAIN;
2895 	skb = sock_dequeue_err_skb(sk);
2896 	if (skb == NULL)
2897 		goto out;
2898 
2899 	copied = skb->len;
2900 	if (copied > len) {
2901 		msg->msg_flags |= MSG_TRUNC;
2902 		copied = len;
2903 	}
2904 	err = skb_copy_datagram_msg(skb, 0, msg, copied);
2905 	if (err)
2906 		goto out_free_skb;
2907 
2908 	sock_recv_timestamp(msg, sk, skb);
2909 
2910 	serr = SKB_EXT_ERR(skb);
2911 	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
2912 
2913 	msg->msg_flags |= MSG_ERRQUEUE;
2914 	err = copied;
2915 
2916 out_free_skb:
2917 	kfree_skb(skb);
2918 out:
2919 	return err;
2920 }
2921 EXPORT_SYMBOL(sock_recv_errqueue);
2922 
2923 /*
2924  *	Get a socket option on a socket.
2925  *
2926  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
2927  *	asynchronous errors should be reported by getsockopt. We assume
2928  *	this means if you specify SO_ERROR (otherwise what's the point of it).
2929  */
2930 int sock_common_getsockopt(struct socket *sock, int level, int optname,
2931 			   char __user *optval, int __user *optlen)
2932 {
2933 	struct sock *sk = sock->sk;
2934 
2935 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2936 }
2937 EXPORT_SYMBOL(sock_common_getsockopt);
2938 
2939 #ifdef CONFIG_COMPAT
2940 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2941 				  char __user *optval, int __user *optlen)
2942 {
2943 	struct sock *sk = sock->sk;
2944 
2945 	if (sk->sk_prot->compat_getsockopt != NULL)
2946 		return sk->sk_prot->compat_getsockopt(sk, level, optname,
2947 						      optval, optlen);
2948 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2949 }
2950 EXPORT_SYMBOL(compat_sock_common_getsockopt);
2951 #endif
2952 
2953 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2954 			int flags)
2955 {
2956 	struct sock *sk = sock->sk;
2957 	int addr_len = 0;
2958 	int err;
2959 
2960 	err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
2961 				   flags & ~MSG_DONTWAIT, &addr_len);
2962 	if (err >= 0)
2963 		msg->msg_namelen = addr_len;
2964 	return err;
2965 }
2966 EXPORT_SYMBOL(sock_common_recvmsg);
2967 
2968 /*
2969  *	Set socket options on an inet socket.
2970  */
2971 int sock_common_setsockopt(struct socket *sock, int level, int optname,
2972 			   char __user *optval, unsigned int optlen)
2973 {
2974 	struct sock *sk = sock->sk;
2975 
2976 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2977 }
2978 EXPORT_SYMBOL(sock_common_setsockopt);
2979 
2980 #ifdef CONFIG_COMPAT
2981 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2982 				  char __user *optval, unsigned int optlen)
2983 {
2984 	struct sock *sk = sock->sk;
2985 
2986 	if (sk->sk_prot->compat_setsockopt != NULL)
2987 		return sk->sk_prot->compat_setsockopt(sk, level, optname,
2988 						      optval, optlen);
2989 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2990 }
2991 EXPORT_SYMBOL(compat_sock_common_setsockopt);
2992 #endif
2993 
2994 void sk_common_release(struct sock *sk)
2995 {
2996 	if (sk->sk_prot->destroy)
2997 		sk->sk_prot->destroy(sk);
2998 
2999 	/*
3000 	 * Observation: when sk_common_release() is called, processes have
3001 	 * no access to the socket, but the network stack still does.
3002 	 * Step one, detach it from networking:
3003 	 *
3004 	 * A. Remove from hash tables.
3005 	 */
3006 
3007 	sk->sk_prot->unhash(sk);
3008 
3009 	/*
3010 	 * At this point the socket cannot receive new packets, but some may
3011 	 * still be in flight because another CPU ran the receive path and did
3012 	 * the hash table lookup before we unhashed the socket. They will reach
3013 	 * the receive queue and be purged by the socket destructor.
3014 	 *
3015 	 * Also, we may still have packets pending on the receive queue and,
3016 	 * probably, our own packets waiting in device queues. sock_destroy
3017 	 * will drain the receive queue, but transmitted packets will delay
3018 	 * socket destruction until the last reference is released.
3019 	 */
3020 
3021 	sock_orphan(sk);
3022 
3023 	xfrm_sk_free_policy(sk);
3024 
3025 	sk_refcnt_debug_release(sk);
3026 
3027 	sock_put(sk);
3028 }
3029 EXPORT_SYMBOL(sk_common_release);
3030 
3031 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3032 {
3033 	memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3034 
3035 	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3036 	mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf;
3037 	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3038 	mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf;
3039 	mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3040 	mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
3041 	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3042 	mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len;
3043 	mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3044 }
3045 
3046 #ifdef CONFIG_PROC_FS
3047 #define PROTO_INUSE_NR	64	/* should be enough for the first time */
3048 struct prot_inuse {
3049 	int val[PROTO_INUSE_NR];
3050 };
3051 
3052 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3053 
3054 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
3055 {
3056 	__this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
3057 }
3058 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3059 
3060 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3061 {
3062 	int cpu, idx = prot->inuse_idx;
3063 	int res = 0;
3064 
3065 	for_each_possible_cpu(cpu)
3066 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3067 
3068 	return res >= 0 ? res : 0;
3069 }
3070 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3071 
3072 static void sock_inuse_add(struct net *net, int val)
3073 {
3074 	this_cpu_add(*net->core.sock_inuse, val);
3075 }
3076 
3077 int sock_inuse_get(struct net *net)
3078 {
3079 	int cpu, res = 0;
3080 
3081 	for_each_possible_cpu(cpu)
3082 		res += *per_cpu_ptr(net->core.sock_inuse, cpu);
3083 
3084 	return res;
3085 }
3086 
3087 EXPORT_SYMBOL_GPL(sock_inuse_get);
3088 
3089 static int __net_init sock_inuse_init_net(struct net *net)
3090 {
3091 	net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3092 	if (net->core.prot_inuse == NULL)
3093 		return -ENOMEM;
3094 
3095 	net->core.sock_inuse = alloc_percpu(int);
3096 	if (net->core.sock_inuse == NULL)
3097 		goto out;
3098 
3099 	return 0;
3100 
3101 out:
3102 	free_percpu(net->core.prot_inuse);
3103 	return -ENOMEM;
3104 }
3105 
3106 static void __net_exit sock_inuse_exit_net(struct net *net)
3107 {
3108 	free_percpu(net->core.prot_inuse);
3109 	free_percpu(net->core.sock_inuse);
3110 }
3111 
3112 static struct pernet_operations net_inuse_ops = {
3113 	.init = sock_inuse_init_net,
3114 	.exit = sock_inuse_exit_net,
3115 	.async = true,
3116 };
3117 
3118 static __init int net_inuse_init(void)
3119 {
3120 	if (register_pernet_subsys(&net_inuse_ops))
3121 		panic("Cannot initialize net inuse counters");
3122 
3123 	return 0;
3124 }
3125 
3126 core_initcall(net_inuse_init);
3127 
3128 static void assign_proto_idx(struct proto *prot)
3129 {
3130 	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3131 
3132 	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3133 		pr_err("PROTO_INUSE_NR exhausted\n");
3134 		return;
3135 	}
3136 
3137 	set_bit(prot->inuse_idx, proto_inuse_idx);
3138 }
3139 
3140 static void release_proto_idx(struct proto *prot)
3141 {
3142 	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3143 		clear_bit(prot->inuse_idx, proto_inuse_idx);
3144 }
3145 #else
3146 static inline void assign_proto_idx(struct proto *prot)
3147 {
3148 }
3149 
3150 static inline void release_proto_idx(struct proto *prot)
3151 {
3152 }
3153 
3154 static void sock_inuse_add(struct net *net, int val)
3155 {
3156 }
3157 #endif
3158 
3159 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3160 {
3161 	if (!rsk_prot)
3162 		return;
3163 	kfree(rsk_prot->slab_name);
3164 	rsk_prot->slab_name = NULL;
3165 	kmem_cache_destroy(rsk_prot->slab);
3166 	rsk_prot->slab = NULL;
3167 }
3168 
3169 static int req_prot_init(const struct proto *prot)
3170 {
3171 	struct request_sock_ops *rsk_prot = prot->rsk_prot;
3172 
3173 	if (!rsk_prot)
3174 		return 0;
3175 
3176 	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3177 					prot->name);
3178 	if (!rsk_prot->slab_name)
3179 		return -ENOMEM;
3180 
3181 	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3182 					   rsk_prot->obj_size, 0,
3183 					   prot->slab_flags, NULL);
3184 
3185 	if (!rsk_prot->slab) {
3186 		pr_crit("%s: Can't create request sock SLAB cache!\n",
3187 			prot->name);
3188 		return -ENOMEM;
3189 	}
3190 	return 0;
3191 }
3192 
3193 int proto_register(struct proto *prot, int alloc_slab)
3194 {
3195 	if (alloc_slab) {
3196 		prot->slab = kmem_cache_create_usercopy(prot->name,
3197 					prot->obj_size, 0,
3198 					SLAB_HWCACHE_ALIGN | prot->slab_flags,
3199 					prot->useroffset, prot->usersize,
3200 					NULL);
3201 
3202 		if (prot->slab == NULL) {
3203 			pr_crit("%s: Can't create sock SLAB cache!\n",
3204 				prot->name);
3205 			goto out;
3206 		}
3207 
3208 		if (req_prot_init(prot))
3209 			goto out_free_request_sock_slab;
3210 
3211 		if (prot->twsk_prot != NULL) {
3212 			prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
3213 
3214 			if (prot->twsk_prot->twsk_slab_name == NULL)
3215 				goto out_free_request_sock_slab;
3216 
3217 			prot->twsk_prot->twsk_slab =
3218 				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
3219 						  prot->twsk_prot->twsk_obj_size,
3220 						  0,
3221 						  prot->slab_flags,
3222 						  NULL);
3223 			if (prot->twsk_prot->twsk_slab == NULL)
3224 				goto out_free_timewait_sock_slab_name;
3225 		}
3226 	}
3227 
3228 	mutex_lock(&proto_list_mutex);
3229 	list_add(&prot->node, &proto_list);
3230 	assign_proto_idx(prot);
3231 	mutex_unlock(&proto_list_mutex);
3232 	return 0;
3233 
3234 out_free_timewait_sock_slab_name:
3235 	kfree(prot->twsk_prot->twsk_slab_name);
3236 out_free_request_sock_slab:
3237 	req_prot_cleanup(prot->rsk_prot);
3238 
3239 	kmem_cache_destroy(prot->slab);
3240 	prot->slab = NULL;
3241 out:
3242 	return -ENOBUFS;
3243 }
3244 EXPORT_SYMBOL(proto_register);
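
/*
 * Example: defining and registering a minimal struct proto. Illustrative
 * sketch only; "EXAMPLE" and struct example_sock (which must embed struct
 * sock as its first member) are placeholders, not real kernel identifiers.
 *
 *	static struct proto example_proto = {
 *		.name		= "EXAMPLE",
 *		.owner		= THIS_MODULE,
 *		.obj_size	= sizeof(struct example_sock),
 *	};
 *
 *	err = proto_register(&example_proto, 1);	// 1: allocate a slab
 *	if (err)
 *		goto out;
 *	...
 *	proto_unregister(&example_proto);
 */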
3245 
3246 void proto_unregister(struct proto *prot)
3247 {
3248 	mutex_lock(&proto_list_mutex);
3249 	release_proto_idx(prot);
3250 	list_del(&prot->node);
3251 	mutex_unlock(&proto_list_mutex);
3252 
3253 	kmem_cache_destroy(prot->slab);
3254 	prot->slab = NULL;
3255 
3256 	req_prot_cleanup(prot->rsk_prot);
3257 
3258 	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
3259 		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
3260 		kfree(prot->twsk_prot->twsk_slab_name);
3261 		prot->twsk_prot->twsk_slab = NULL;
3262 	}
3263 }
3264 EXPORT_SYMBOL(proto_unregister);
3265 
3266 #ifdef CONFIG_PROC_FS
3267 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3268 	__acquires(proto_list_mutex)
3269 {
3270 	mutex_lock(&proto_list_mutex);
3271 	return seq_list_start_head(&proto_list, *pos);
3272 }
3273 
3274 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3275 {
3276 	return seq_list_next(v, &proto_list, pos);
3277 }
3278 
3279 static void proto_seq_stop(struct seq_file *seq, void *v)
3280 	__releases(proto_list_mutex)
3281 {
3282 	mutex_unlock(&proto_list_mutex);
3283 }
3284 
3285 static char proto_method_implemented(const void *method)
3286 {
3287 	return method == NULL ? 'n' : 'y';
3288 }
3289 static long sock_prot_memory_allocated(struct proto *proto)
3290 {
3291 	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3292 }
3293 
3294 static char *sock_prot_memory_pressure(struct proto *proto)
3295 {
3296 	return proto->memory_pressure != NULL ?
3297 	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3298 }
3299 
3300 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3301 {
3302 
3303 	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
3304 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3305 		   proto->name,
3306 		   proto->obj_size,
3307 		   sock_prot_inuse_get(seq_file_net(seq), proto),
3308 		   sock_prot_memory_allocated(proto),
3309 		   sock_prot_memory_pressure(proto),
3310 		   proto->max_header,
3311 		   proto->slab == NULL ? "no" : "yes",
3312 		   module_name(proto->owner),
3313 		   proto_method_implemented(proto->close),
3314 		   proto_method_implemented(proto->connect),
3315 		   proto_method_implemented(proto->disconnect),
3316 		   proto_method_implemented(proto->accept),
3317 		   proto_method_implemented(proto->ioctl),
3318 		   proto_method_implemented(proto->init),
3319 		   proto_method_implemented(proto->destroy),
3320 		   proto_method_implemented(proto->shutdown),
3321 		   proto_method_implemented(proto->setsockopt),
3322 		   proto_method_implemented(proto->getsockopt),
3323 		   proto_method_implemented(proto->sendmsg),
3324 		   proto_method_implemented(proto->recvmsg),
3325 		   proto_method_implemented(proto->sendpage),
3326 		   proto_method_implemented(proto->bind),
3327 		   proto_method_implemented(proto->backlog_rcv),
3328 		   proto_method_implemented(proto->hash),
3329 		   proto_method_implemented(proto->unhash),
3330 		   proto_method_implemented(proto->get_port),
3331 		   proto_method_implemented(proto->enter_memory_pressure));
3332 }
3333 
3334 static int proto_seq_show(struct seq_file *seq, void *v)
3335 {
3336 	if (v == &proto_list)
3337 		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3338 			   "protocol",
3339 			   "size",
3340 			   "sockets",
3341 			   "memory",
3342 			   "press",
3343 			   "maxhdr",
3344 			   "slab",
3345 			   "module",
3346 			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3347 	else
3348 		proto_seq_printf(seq, list_entry(v, struct proto, node));
3349 	return 0;
3350 }
3351 
3352 static const struct seq_operations proto_seq_ops = {
3353 	.start  = proto_seq_start,
3354 	.next   = proto_seq_next,
3355 	.stop   = proto_seq_stop,
3356 	.show   = proto_seq_show,
3357 };
3358 
3359 static int proto_seq_open(struct inode *inode, struct file *file)
3360 {
3361 	return seq_open_net(inode, file, &proto_seq_ops,
3362 			    sizeof(struct seq_net_private));
3363 }
3364 
3365 static const struct file_operations proto_seq_fops = {
3366 	.open		= proto_seq_open,
3367 	.read		= seq_read,
3368 	.llseek		= seq_lseek,
3369 	.release	= seq_release_net,
3370 };
3371 
3372 static __net_init int proto_init_net(struct net *net)
3373 {
3374 	if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
3375 		return -ENOMEM;
3376 
3377 	return 0;
3378 }
3379 
3380 static __net_exit void proto_exit_net(struct net *net)
3381 {
3382 	remove_proc_entry("protocols", net->proc_net);
3383 }
3384 
3385 
3386 static __net_initdata struct pernet_operations proto_net_ops = {
3387 	.init = proto_init_net,
3388 	.exit = proto_exit_net,
3389 	.async = true,
3390 };
3391 
3392 static int __init proto_init(void)
3393 {
3394 	return register_pernet_subsys(&proto_net_ops);
3395 }
3396 
3397 subsys_initcall(proto_init);
3398 
3399 #endif /* PROC_FS */
3400 
3401 #ifdef CONFIG_NET_RX_BUSY_POLL
3402 bool sk_busy_loop_end(void *p, unsigned long start_time)
3403 {
3404 	struct sock *sk = p;
3405 
3406 	return !skb_queue_empty(&sk->sk_receive_queue) ||
3407 	       sk_busy_loop_timeout(sk, start_time);
3408 }
3409 EXPORT_SYMBOL(sk_busy_loop_end);
3410 #endif /* CONFIG_NET_RX_BUSY_POLL */
3411