/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic socket support routines. Memory allocators, socket lock/release
 *		handler for protocols to use and generic option handler.
 *
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Fixes:
 *		Alan Cox	:	Numerous verify_area() problems
 *		Alan Cox	:	Connecting on a connecting socket
 *					now returns an error for tcp.
 *		Alan Cox	:	sock->protocol is set correctly.
 *					and is not sometimes left as 0.
 *		Alan Cox	:	connect handles icmp errors on a
 *					connect properly. Unfortunately there
 *					is a restart syscall nasty there. I
 *					can't match BSD without hacking the C
 *					library. Ideas urgently sought!
 *		Alan Cox	:	Disallow bind() to addresses that are
 *					not ours - especially broadcast ones!!
 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
 *					instead they leave that for the DESTROY timer.
 *		Alan Cox	:	Clean up error flag in accept
 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
 *					was buggy. Put a remove_sock() in the handler
 *					for memory when we hit 0. Also altered the timer
 *					code. The ACK stuff can wait and needs major
 *					TCP layer surgery.
 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
 *					and fixed timer/inet_bh race.
 *		Alan Cox	:	Added zapped flag for TCP
 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
 *		Pauline Middelink:	identd support
 *		Alan Cox	:	Fixed connect() taking signals I think.
 *		Alan Cox	:	SO_LINGER supported
 *		Alan Cox	:	Error reporting fixes
 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
 *		Alan Cox	:	inet sockets don't set sk->type!
 *		Alan Cox	:	Split socket option code
 *		Alan Cox	:	Callbacks
 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
 *		Alex		:	Removed restriction on inet fioctl
 *		Alan Cox	:	Splitting INET from NET core
 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
 *		Alan Cox	:	Split IP from generic code
 *		Alan Cox	:	New kfree_skbmem()
 *		Alan Cox	:	Make SO_DEBUG superuser only.
 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
 *					(compatibility fix)
 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
 *		Alan Cox	:	Allocator for a socket is settable.
 *		Alan Cox	:	SO_ERROR includes soft errors.
 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
 *		Alan Cox	:	Generic socket allocation to make hooks
 *					easier (suggested by Craig Metz).
 *		Michael Pall	:	SO_ERROR returns positive errno again
 *		Steve Whitehouse:	Added default destructor to free
 *					protocol private data.
74 * Steve Whitehouse: Added various other default routines 75 * common to several socket families. 76 * Chris Evans : Call suser() check last on F_SETOWN 77 * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER. 78 * Andi Kleen : Add sock_kmalloc()/sock_kfree_s() 79 * Andi Kleen : Fix write_space callback 80 * Chris Evans : Security fixes - signedness again 81 * Arnaldo C. Melo : cleanups, use skb_queue_purge 82 * 83 * To Fix: 84 * 85 * 86 * This program is free software; you can redistribute it and/or 87 * modify it under the terms of the GNU General Public License 88 * as published by the Free Software Foundation; either version 89 * 2 of the License, or (at your option) any later version. 90 */ 91 92 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 93 94 #include <linux/capability.h> 95 #include <linux/errno.h> 96 #include <linux/errqueue.h> 97 #include <linux/types.h> 98 #include <linux/socket.h> 99 #include <linux/in.h> 100 #include <linux/kernel.h> 101 #include <linux/module.h> 102 #include <linux/proc_fs.h> 103 #include <linux/seq_file.h> 104 #include <linux/sched.h> 105 #include <linux/sched/mm.h> 106 #include <linux/timer.h> 107 #include <linux/string.h> 108 #include <linux/sockios.h> 109 #include <linux/net.h> 110 #include <linux/mm.h> 111 #include <linux/slab.h> 112 #include <linux/interrupt.h> 113 #include <linux/poll.h> 114 #include <linux/tcp.h> 115 #include <linux/init.h> 116 #include <linux/highmem.h> 117 #include <linux/user_namespace.h> 118 #include <linux/static_key.h> 119 #include <linux/memcontrol.h> 120 #include <linux/prefetch.h> 121 122 #include <linux/uaccess.h> 123 124 #include <linux/netdevice.h> 125 #include <net/protocol.h> 126 #include <linux/skbuff.h> 127 #include <net/net_namespace.h> 128 #include <net/request_sock.h> 129 #include <net/sock.h> 130 #include <linux/net_tstamp.h> 131 #include <net/xfrm.h> 132 #include <linux/ipsec.h> 133 #include <net/cls_cgroup.h> 134 #include <net/netprio_cgroup.h> 135 #include <linux/sock_diag.h> 136 137 #include <linux/filter.h> 138 #include <net/sock_reuseport.h> 139 140 #include <trace/events/sock.h> 141 142 #include <net/tcp.h> 143 #include <net/busy_poll.h> 144 145 static DEFINE_MUTEX(proto_list_mutex); 146 static LIST_HEAD(proto_list); 147 148 static void sock_inuse_add(struct net *net, int val); 149 150 /** 151 * sk_ns_capable - General socket capability test 152 * @sk: Socket to use a capability on or through 153 * @user_ns: The user namespace of the capability to use 154 * @cap: The capability to use 155 * 156 * Test to see if the opener of the socket had when the socket was 157 * created and the current process has the capability @cap in the user 158 * namespace @user_ns. 159 */ 160 bool sk_ns_capable(const struct sock *sk, 161 struct user_namespace *user_ns, int cap) 162 { 163 return file_ns_capable(sk->sk_socket->file, user_ns, cap) && 164 ns_capable(user_ns, cap); 165 } 166 EXPORT_SYMBOL(sk_ns_capable); 167 168 /** 169 * sk_capable - Socket global capability test 170 * @sk: Socket to use a capability on or through 171 * @cap: The global capability to use 172 * 173 * Test to see if the opener of the socket had when the socket was 174 * created and the current process has the capability @cap in all user 175 * namespaces. 
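 *
 * In practice this is sk_ns_capable() against &init_user_ns: both the
 * socket's opener and the current task must hold @cap in the initial
 * user namespace.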
176 */ 177 bool sk_capable(const struct sock *sk, int cap) 178 { 179 return sk_ns_capable(sk, &init_user_ns, cap); 180 } 181 EXPORT_SYMBOL(sk_capable); 182 183 /** 184 * sk_net_capable - Network namespace socket capability test 185 * @sk: Socket to use a capability on or through 186 * @cap: The capability to use 187 * 188 * Test to see if the opener of the socket had when the socket was created 189 * and the current process has the capability @cap over the network namespace 190 * the socket is a member of. 191 */ 192 bool sk_net_capable(const struct sock *sk, int cap) 193 { 194 return sk_ns_capable(sk, sock_net(sk)->user_ns, cap); 195 } 196 EXPORT_SYMBOL(sk_net_capable); 197 198 /* 199 * Each address family might have different locking rules, so we have 200 * one slock key per address family and separate keys for internal and 201 * userspace sockets. 202 */ 203 static struct lock_class_key af_family_keys[AF_MAX]; 204 static struct lock_class_key af_family_kern_keys[AF_MAX]; 205 static struct lock_class_key af_family_slock_keys[AF_MAX]; 206 static struct lock_class_key af_family_kern_slock_keys[AF_MAX]; 207 208 /* 209 * Make lock validator output more readable. (we pre-construct these 210 * strings build-time, so that runtime initialization of socket 211 * locks is fast): 212 */ 213 214 #define _sock_locks(x) \ 215 x "AF_UNSPEC", x "AF_UNIX" , x "AF_INET" , \ 216 x "AF_AX25" , x "AF_IPX" , x "AF_APPLETALK", \ 217 x "AF_NETROM", x "AF_BRIDGE" , x "AF_ATMPVC" , \ 218 x "AF_X25" , x "AF_INET6" , x "AF_ROSE" , \ 219 x "AF_DECnet", x "AF_NETBEUI" , x "AF_SECURITY" , \ 220 x "AF_KEY" , x "AF_NETLINK" , x "AF_PACKET" , \ 221 x "AF_ASH" , x "AF_ECONET" , x "AF_ATMSVC" , \ 222 x "AF_RDS" , x "AF_SNA" , x "AF_IRDA" , \ 223 x "AF_PPPOX" , x "AF_WANPIPE" , x "AF_LLC" , \ 224 x "27" , x "28" , x "AF_CAN" , \ 225 x "AF_TIPC" , x "AF_BLUETOOTH", x "IUCV" , \ 226 x "AF_RXRPC" , x "AF_ISDN" , x "AF_PHONET" , \ 227 x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \ 228 x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \ 229 x "AF_QIPCRTR", x "AF_SMC" , x "AF_MAX" 230 231 static const char *const af_family_key_strings[AF_MAX+1] = { 232 _sock_locks("sk_lock-") 233 }; 234 static const char *const af_family_slock_key_strings[AF_MAX+1] = { 235 _sock_locks("slock-") 236 }; 237 static const char *const af_family_clock_key_strings[AF_MAX+1] = { 238 _sock_locks("clock-") 239 }; 240 241 static const char *const af_family_kern_key_strings[AF_MAX+1] = { 242 _sock_locks("k-sk_lock-") 243 }; 244 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = { 245 _sock_locks("k-slock-") 246 }; 247 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = { 248 _sock_locks("k-clock-") 249 }; 250 static const char *const af_family_rlock_key_strings[AF_MAX+1] = { 251 "rlock-AF_UNSPEC", "rlock-AF_UNIX" , "rlock-AF_INET" , 252 "rlock-AF_AX25" , "rlock-AF_IPX" , "rlock-AF_APPLETALK", 253 "rlock-AF_NETROM", "rlock-AF_BRIDGE" , "rlock-AF_ATMPVC" , 254 "rlock-AF_X25" , "rlock-AF_INET6" , "rlock-AF_ROSE" , 255 "rlock-AF_DECnet", "rlock-AF_NETBEUI" , "rlock-AF_SECURITY" , 256 "rlock-AF_KEY" , "rlock-AF_NETLINK" , "rlock-AF_PACKET" , 257 "rlock-AF_ASH" , "rlock-AF_ECONET" , "rlock-AF_ATMSVC" , 258 "rlock-AF_RDS" , "rlock-AF_SNA" , "rlock-AF_IRDA" , 259 "rlock-AF_PPPOX" , "rlock-AF_WANPIPE" , "rlock-AF_LLC" , 260 "rlock-27" , "rlock-28" , "rlock-AF_CAN" , 261 "rlock-AF_TIPC" , "rlock-AF_BLUETOOTH", "rlock-AF_IUCV" , 262 "rlock-AF_RXRPC" , "rlock-AF_ISDN" , "rlock-AF_PHONET" , 263 "rlock-AF_IEEE802154", "rlock-AF_CAIF" , 
"rlock-AF_ALG" , 264 "rlock-AF_NFC" , "rlock-AF_VSOCK" , "rlock-AF_KCM" , 265 "rlock-AF_QIPCRTR", "rlock-AF_SMC" , "rlock-AF_MAX" 266 }; 267 static const char *const af_family_wlock_key_strings[AF_MAX+1] = { 268 "wlock-AF_UNSPEC", "wlock-AF_UNIX" , "wlock-AF_INET" , 269 "wlock-AF_AX25" , "wlock-AF_IPX" , "wlock-AF_APPLETALK", 270 "wlock-AF_NETROM", "wlock-AF_BRIDGE" , "wlock-AF_ATMPVC" , 271 "wlock-AF_X25" , "wlock-AF_INET6" , "wlock-AF_ROSE" , 272 "wlock-AF_DECnet", "wlock-AF_NETBEUI" , "wlock-AF_SECURITY" , 273 "wlock-AF_KEY" , "wlock-AF_NETLINK" , "wlock-AF_PACKET" , 274 "wlock-AF_ASH" , "wlock-AF_ECONET" , "wlock-AF_ATMSVC" , 275 "wlock-AF_RDS" , "wlock-AF_SNA" , "wlock-AF_IRDA" , 276 "wlock-AF_PPPOX" , "wlock-AF_WANPIPE" , "wlock-AF_LLC" , 277 "wlock-27" , "wlock-28" , "wlock-AF_CAN" , 278 "wlock-AF_TIPC" , "wlock-AF_BLUETOOTH", "wlock-AF_IUCV" , 279 "wlock-AF_RXRPC" , "wlock-AF_ISDN" , "wlock-AF_PHONET" , 280 "wlock-AF_IEEE802154", "wlock-AF_CAIF" , "wlock-AF_ALG" , 281 "wlock-AF_NFC" , "wlock-AF_VSOCK" , "wlock-AF_KCM" , 282 "wlock-AF_QIPCRTR", "wlock-AF_SMC" , "wlock-AF_MAX" 283 }; 284 static const char *const af_family_elock_key_strings[AF_MAX+1] = { 285 "elock-AF_UNSPEC", "elock-AF_UNIX" , "elock-AF_INET" , 286 "elock-AF_AX25" , "elock-AF_IPX" , "elock-AF_APPLETALK", 287 "elock-AF_NETROM", "elock-AF_BRIDGE" , "elock-AF_ATMPVC" , 288 "elock-AF_X25" , "elock-AF_INET6" , "elock-AF_ROSE" , 289 "elock-AF_DECnet", "elock-AF_NETBEUI" , "elock-AF_SECURITY" , 290 "elock-AF_KEY" , "elock-AF_NETLINK" , "elock-AF_PACKET" , 291 "elock-AF_ASH" , "elock-AF_ECONET" , "elock-AF_ATMSVC" , 292 "elock-AF_RDS" , "elock-AF_SNA" , "elock-AF_IRDA" , 293 "elock-AF_PPPOX" , "elock-AF_WANPIPE" , "elock-AF_LLC" , 294 "elock-27" , "elock-28" , "elock-AF_CAN" , 295 "elock-AF_TIPC" , "elock-AF_BLUETOOTH", "elock-AF_IUCV" , 296 "elock-AF_RXRPC" , "elock-AF_ISDN" , "elock-AF_PHONET" , 297 "elock-AF_IEEE802154", "elock-AF_CAIF" , "elock-AF_ALG" , 298 "elock-AF_NFC" , "elock-AF_VSOCK" , "elock-AF_KCM" , 299 "elock-AF_QIPCRTR", "elock-AF_SMC" , "elock-AF_MAX" 300 }; 301 302 /* 303 * sk_callback_lock and sk queues locking rules are per-address-family, 304 * so split the lock classes by using a per-AF key: 305 */ 306 static struct lock_class_key af_callback_keys[AF_MAX]; 307 static struct lock_class_key af_rlock_keys[AF_MAX]; 308 static struct lock_class_key af_wlock_keys[AF_MAX]; 309 static struct lock_class_key af_elock_keys[AF_MAX]; 310 static struct lock_class_key af_kern_callback_keys[AF_MAX]; 311 312 /* Run time adjustable parameters. */ 313 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX; 314 EXPORT_SYMBOL(sysctl_wmem_max); 315 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX; 316 EXPORT_SYMBOL(sysctl_rmem_max); 317 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX; 318 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX; 319 320 /* Maximal space eaten by iovec or ancillary data plus some space */ 321 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512); 322 EXPORT_SYMBOL(sysctl_optmem_max); 323 324 int sysctl_tstamp_allow_data __read_mostly = 1; 325 326 struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE; 327 EXPORT_SYMBOL_GPL(memalloc_socks); 328 329 /** 330 * sk_set_memalloc - sets %SOCK_MEMALLOC 331 * @sk: socket to set it on 332 * 333 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves. 
334 * It's the responsibility of the admin to adjust min_free_kbytes 335 * to meet the requirements 336 */ 337 void sk_set_memalloc(struct sock *sk) 338 { 339 sock_set_flag(sk, SOCK_MEMALLOC); 340 sk->sk_allocation |= __GFP_MEMALLOC; 341 static_key_slow_inc(&memalloc_socks); 342 } 343 EXPORT_SYMBOL_GPL(sk_set_memalloc); 344 345 void sk_clear_memalloc(struct sock *sk) 346 { 347 sock_reset_flag(sk, SOCK_MEMALLOC); 348 sk->sk_allocation &= ~__GFP_MEMALLOC; 349 static_key_slow_dec(&memalloc_socks); 350 351 /* 352 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward 353 * progress of swapping. SOCK_MEMALLOC may be cleared while 354 * it has rmem allocations due to the last swapfile being deactivated 355 * but there is a risk that the socket is unusable due to exceeding 356 * the rmem limits. Reclaim the reserves and obey rmem limits again. 357 */ 358 sk_mem_reclaim(sk); 359 } 360 EXPORT_SYMBOL_GPL(sk_clear_memalloc); 361 362 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) 363 { 364 int ret; 365 unsigned int noreclaim_flag; 366 367 /* these should have been dropped before queueing */ 368 BUG_ON(!sock_flag(sk, SOCK_MEMALLOC)); 369 370 noreclaim_flag = memalloc_noreclaim_save(); 371 ret = sk->sk_backlog_rcv(sk, skb); 372 memalloc_noreclaim_restore(noreclaim_flag); 373 374 return ret; 375 } 376 EXPORT_SYMBOL(__sk_backlog_rcv); 377 378 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen) 379 { 380 struct timeval tv; 381 382 if (optlen < sizeof(tv)) 383 return -EINVAL; 384 if (copy_from_user(&tv, optval, sizeof(tv))) 385 return -EFAULT; 386 if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC) 387 return -EDOM; 388 389 if (tv.tv_sec < 0) { 390 static int warned __read_mostly; 391 392 *timeo_p = 0; 393 if (warned < 10 && net_ratelimit()) { 394 warned++; 395 pr_info("%s: `%s' (pid %d) tries to set negative timeout\n", 396 __func__, current->comm, task_pid_nr(current)); 397 } 398 return 0; 399 } 400 *timeo_p = MAX_SCHEDULE_TIMEOUT; 401 if (tv.tv_sec == 0 && tv.tv_usec == 0) 402 return 0; 403 if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1)) 404 *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC / HZ); 405 return 0; 406 } 407 408 static void sock_warn_obsolete_bsdism(const char *name) 409 { 410 static int warned; 411 static char warncomm[TASK_COMM_LEN]; 412 if (strcmp(warncomm, current->comm) && warned < 5) { 413 strcpy(warncomm, current->comm); 414 pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n", 415 warncomm, name); 416 warned++; 417 } 418 } 419 420 static bool sock_needs_netstamp(const struct sock *sk) 421 { 422 switch (sk->sk_family) { 423 case AF_UNSPEC: 424 case AF_UNIX: 425 return false; 426 default: 427 return true; 428 } 429 } 430 431 static void sock_disable_timestamp(struct sock *sk, unsigned long flags) 432 { 433 if (sk->sk_flags & flags) { 434 sk->sk_flags &= ~flags; 435 if (sock_needs_netstamp(sk) && 436 !(sk->sk_flags & SK_FLAGS_TIMESTAMP)) 437 net_disable_timestamp(); 438 } 439 } 440 441 442 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) 443 { 444 unsigned long flags; 445 struct sk_buff_head *list = &sk->sk_receive_queue; 446 447 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) { 448 atomic_inc(&sk->sk_drops); 449 trace_sock_rcvqueue_full(sk, skb); 450 return -ENOMEM; 451 } 452 453 if (!sk_rmem_schedule(sk, skb, skb->truesize)) { 454 atomic_inc(&sk->sk_drops); 455 return -ENOBUFS; 456 } 457 458 skb->dev = NULL; 459 skb_set_owner_r(skb, sk); 460 461 /* we escape from rcu protected region, make sure we 
dont leak 462 * a norefcounted dst 463 */ 464 skb_dst_force(skb); 465 466 spin_lock_irqsave(&list->lock, flags); 467 sock_skb_set_dropcount(sk, skb); 468 __skb_queue_tail(list, skb); 469 spin_unlock_irqrestore(&list->lock, flags); 470 471 if (!sock_flag(sk, SOCK_DEAD)) 472 sk->sk_data_ready(sk); 473 return 0; 474 } 475 EXPORT_SYMBOL(__sock_queue_rcv_skb); 476 477 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) 478 { 479 int err; 480 481 err = sk_filter(sk, skb); 482 if (err) 483 return err; 484 485 return __sock_queue_rcv_skb(sk, skb); 486 } 487 EXPORT_SYMBOL(sock_queue_rcv_skb); 488 489 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, 490 const int nested, unsigned int trim_cap, bool refcounted) 491 { 492 int rc = NET_RX_SUCCESS; 493 494 if (sk_filter_trim_cap(sk, skb, trim_cap)) 495 goto discard_and_relse; 496 497 skb->dev = NULL; 498 499 if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) { 500 atomic_inc(&sk->sk_drops); 501 goto discard_and_relse; 502 } 503 if (nested) 504 bh_lock_sock_nested(sk); 505 else 506 bh_lock_sock(sk); 507 if (!sock_owned_by_user(sk)) { 508 /* 509 * trylock + unlock semantics: 510 */ 511 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_); 512 513 rc = sk_backlog_rcv(sk, skb); 514 515 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_); 516 } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) { 517 bh_unlock_sock(sk); 518 atomic_inc(&sk->sk_drops); 519 goto discard_and_relse; 520 } 521 522 bh_unlock_sock(sk); 523 out: 524 if (refcounted) 525 sock_put(sk); 526 return rc; 527 discard_and_relse: 528 kfree_skb(skb); 529 goto out; 530 } 531 EXPORT_SYMBOL(__sk_receive_skb); 532 533 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie) 534 { 535 struct dst_entry *dst = __sk_dst_get(sk); 536 537 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) { 538 sk_tx_queue_clear(sk); 539 sk->sk_dst_pending_confirm = 0; 540 RCU_INIT_POINTER(sk->sk_dst_cache, NULL); 541 dst_release(dst); 542 return NULL; 543 } 544 545 return dst; 546 } 547 EXPORT_SYMBOL(__sk_dst_check); 548 549 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie) 550 { 551 struct dst_entry *dst = sk_dst_get(sk); 552 553 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) { 554 sk_dst_reset(sk); 555 dst_release(dst); 556 return NULL; 557 } 558 559 return dst; 560 } 561 EXPORT_SYMBOL(sk_dst_check); 562 563 static int sock_setbindtodevice(struct sock *sk, char __user *optval, 564 int optlen) 565 { 566 int ret = -ENOPROTOOPT; 567 #ifdef CONFIG_NETDEVICES 568 struct net *net = sock_net(sk); 569 char devname[IFNAMSIZ]; 570 int index; 571 572 /* Sorry... */ 573 ret = -EPERM; 574 if (!ns_capable(net->user_ns, CAP_NET_RAW)) 575 goto out; 576 577 ret = -EINVAL; 578 if (optlen < 0) 579 goto out; 580 581 /* Bind this socket to a particular device like "eth0", 582 * as specified in the passed interface name. If the 583 * name is "" or the option length is zero the socket 584 * is not bound. 
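	 *
	 * Illustrative user-space usage (a sketch, not part of this file):
	 *
	 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "eth0", 4);
	 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "", 0);   (unbinds)
	 *
	 * The caller needs CAP_NET_RAW in the socket's network namespace.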
585 */ 586 if (optlen > IFNAMSIZ - 1) 587 optlen = IFNAMSIZ - 1; 588 memset(devname, 0, sizeof(devname)); 589 590 ret = -EFAULT; 591 if (copy_from_user(devname, optval, optlen)) 592 goto out; 593 594 index = 0; 595 if (devname[0] != '\0') { 596 struct net_device *dev; 597 598 rcu_read_lock(); 599 dev = dev_get_by_name_rcu(net, devname); 600 if (dev) 601 index = dev->ifindex; 602 rcu_read_unlock(); 603 ret = -ENODEV; 604 if (!dev) 605 goto out; 606 } 607 608 lock_sock(sk); 609 sk->sk_bound_dev_if = index; 610 sk_dst_reset(sk); 611 release_sock(sk); 612 613 ret = 0; 614 615 out: 616 #endif 617 618 return ret; 619 } 620 621 static int sock_getbindtodevice(struct sock *sk, char __user *optval, 622 int __user *optlen, int len) 623 { 624 int ret = -ENOPROTOOPT; 625 #ifdef CONFIG_NETDEVICES 626 struct net *net = sock_net(sk); 627 char devname[IFNAMSIZ]; 628 629 if (sk->sk_bound_dev_if == 0) { 630 len = 0; 631 goto zero; 632 } 633 634 ret = -EINVAL; 635 if (len < IFNAMSIZ) 636 goto out; 637 638 ret = netdev_get_name(net, devname, sk->sk_bound_dev_if); 639 if (ret) 640 goto out; 641 642 len = strlen(devname) + 1; 643 644 ret = -EFAULT; 645 if (copy_to_user(optval, devname, len)) 646 goto out; 647 648 zero: 649 ret = -EFAULT; 650 if (put_user(len, optlen)) 651 goto out; 652 653 ret = 0; 654 655 out: 656 #endif 657 658 return ret; 659 } 660 661 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool) 662 { 663 if (valbool) 664 sock_set_flag(sk, bit); 665 else 666 sock_reset_flag(sk, bit); 667 } 668 669 bool sk_mc_loop(struct sock *sk) 670 { 671 if (dev_recursion_level()) 672 return false; 673 if (!sk) 674 return true; 675 switch (sk->sk_family) { 676 case AF_INET: 677 return inet_sk(sk)->mc_loop; 678 #if IS_ENABLED(CONFIG_IPV6) 679 case AF_INET6: 680 return inet6_sk(sk)->mc_loop; 681 #endif 682 } 683 WARN_ON(1); 684 return true; 685 } 686 EXPORT_SYMBOL(sk_mc_loop); 687 688 /* 689 * This is meant for all protocols to use and covers goings on 690 * at the socket level. Everything here is generic. 691 */ 692 693 int sock_setsockopt(struct socket *sock, int level, int optname, 694 char __user *optval, unsigned int optlen) 695 { 696 struct sock *sk = sock->sk; 697 int val; 698 int valbool; 699 struct linger ling; 700 int ret = 0; 701 702 /* 703 * Options without arguments 704 */ 705 706 if (optname == SO_BINDTODEVICE) 707 return sock_setbindtodevice(sk, optval, optlen); 708 709 if (optlen < sizeof(int)) 710 return -EINVAL; 711 712 if (get_user(val, (int __user *)optval)) 713 return -EFAULT; 714 715 valbool = val ? 1 : 0; 716 717 lock_sock(sk); 718 719 switch (optname) { 720 case SO_DEBUG: 721 if (val && !capable(CAP_NET_ADMIN)) 722 ret = -EACCES; 723 else 724 sock_valbool_flag(sk, SOCK_DBG, valbool); 725 break; 726 case SO_REUSEADDR: 727 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE); 728 break; 729 case SO_REUSEPORT: 730 sk->sk_reuseport = valbool; 731 break; 732 case SO_TYPE: 733 case SO_PROTOCOL: 734 case SO_DOMAIN: 735 case SO_ERROR: 736 ret = -ENOPROTOOPT; 737 break; 738 case SO_DONTROUTE: 739 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool); 740 break; 741 case SO_BROADCAST: 742 sock_valbool_flag(sk, SOCK_BROADCAST, valbool); 743 break; 744 case SO_SNDBUF: 745 /* Don't error on this BSD doesn't and if you think 746 * about it this is right. Otherwise apps have to 747 * play 'guess the biggest size' games. 
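		 * (Illustration, a sketch rather than part of the original
		 * comment: the request is clamped to sysctl_wmem_max and
		 * stored doubled to cover struct sk_buff overhead, so
		 *
		 *	int val = 65536; socklen_t len = sizeof(val);
		 *	setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &val, sizeof(val));
		 *	getsockopt(fd, SOL_SOCKET, SO_SNDBUF, &val, &len);
		 *
		 * reports roughly 131072 afterwards.)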
RCVBUF/SNDBUF 748 * are treated in BSD as hints 749 */ 750 val = min_t(u32, val, sysctl_wmem_max); 751 set_sndbuf: 752 sk->sk_userlocks |= SOCK_SNDBUF_LOCK; 753 sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF); 754 /* Wake up sending tasks if we upped the value. */ 755 sk->sk_write_space(sk); 756 break; 757 758 case SO_SNDBUFFORCE: 759 if (!capable(CAP_NET_ADMIN)) { 760 ret = -EPERM; 761 break; 762 } 763 goto set_sndbuf; 764 765 case SO_RCVBUF: 766 /* Don't error on this BSD doesn't and if you think 767 * about it this is right. Otherwise apps have to 768 * play 'guess the biggest size' games. RCVBUF/SNDBUF 769 * are treated in BSD as hints 770 */ 771 val = min_t(u32, val, sysctl_rmem_max); 772 set_rcvbuf: 773 sk->sk_userlocks |= SOCK_RCVBUF_LOCK; 774 /* 775 * We double it on the way in to account for 776 * "struct sk_buff" etc. overhead. Applications 777 * assume that the SO_RCVBUF setting they make will 778 * allow that much actual data to be received on that 779 * socket. 780 * 781 * Applications are unaware that "struct sk_buff" and 782 * other overheads allocate from the receive buffer 783 * during socket buffer allocation. 784 * 785 * And after considering the possible alternatives, 786 * returning the value we actually used in getsockopt 787 * is the most desirable behavior. 788 */ 789 sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF); 790 break; 791 792 case SO_RCVBUFFORCE: 793 if (!capable(CAP_NET_ADMIN)) { 794 ret = -EPERM; 795 break; 796 } 797 goto set_rcvbuf; 798 799 case SO_KEEPALIVE: 800 if (sk->sk_prot->keepalive) 801 sk->sk_prot->keepalive(sk, valbool); 802 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool); 803 break; 804 805 case SO_OOBINLINE: 806 sock_valbool_flag(sk, SOCK_URGINLINE, valbool); 807 break; 808 809 case SO_NO_CHECK: 810 sk->sk_no_check_tx = valbool; 811 break; 812 813 case SO_PRIORITY: 814 if ((val >= 0 && val <= 6) || 815 ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 816 sk->sk_priority = val; 817 else 818 ret = -EPERM; 819 break; 820 821 case SO_LINGER: 822 if (optlen < sizeof(ling)) { 823 ret = -EINVAL; /* 1003.1g */ 824 break; 825 } 826 if (copy_from_user(&ling, optval, sizeof(ling))) { 827 ret = -EFAULT; 828 break; 829 } 830 if (!ling.l_onoff) 831 sock_reset_flag(sk, SOCK_LINGER); 832 else { 833 #if (BITS_PER_LONG == 32) 834 if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ) 835 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT; 836 else 837 #endif 838 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ; 839 sock_set_flag(sk, SOCK_LINGER); 840 } 841 break; 842 843 case SO_BSDCOMPAT: 844 sock_warn_obsolete_bsdism("setsockopt"); 845 break; 846 847 case SO_PASSCRED: 848 if (valbool) 849 set_bit(SOCK_PASSCRED, &sock->flags); 850 else 851 clear_bit(SOCK_PASSCRED, &sock->flags); 852 break; 853 854 case SO_TIMESTAMP: 855 case SO_TIMESTAMPNS: 856 if (valbool) { 857 if (optname == SO_TIMESTAMP) 858 sock_reset_flag(sk, SOCK_RCVTSTAMPNS); 859 else 860 sock_set_flag(sk, SOCK_RCVTSTAMPNS); 861 sock_set_flag(sk, SOCK_RCVTSTAMP); 862 sock_enable_timestamp(sk, SOCK_TIMESTAMP); 863 } else { 864 sock_reset_flag(sk, SOCK_RCVTSTAMP); 865 sock_reset_flag(sk, SOCK_RCVTSTAMPNS); 866 } 867 break; 868 869 case SO_TIMESTAMPING: 870 if (val & ~SOF_TIMESTAMPING_MASK) { 871 ret = -EINVAL; 872 break; 873 } 874 875 if (val & SOF_TIMESTAMPING_OPT_ID && 876 !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) { 877 if (sk->sk_protocol == IPPROTO_TCP && 878 sk->sk_type == SOCK_STREAM) { 879 if ((1 << sk->sk_state) & 880 (TCPF_CLOSE | TCPF_LISTEN)) { 881 ret = -EINVAL; 882 break; 883 } 
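				/* SOF_TIMESTAMPING_OPT_ID tags each tx
				 * timestamp with an identifier (reported in
				 * ee_data of the queued error message); for
				 * TCP the counter is seeded from the write
				 * sequence below, other sockets start at 0.
				 */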
884 sk->sk_tskey = tcp_sk(sk)->snd_una; 885 } else { 886 sk->sk_tskey = 0; 887 } 888 } 889 890 if (val & SOF_TIMESTAMPING_OPT_STATS && 891 !(val & SOF_TIMESTAMPING_OPT_TSONLY)) { 892 ret = -EINVAL; 893 break; 894 } 895 896 sk->sk_tsflags = val; 897 if (val & SOF_TIMESTAMPING_RX_SOFTWARE) 898 sock_enable_timestamp(sk, 899 SOCK_TIMESTAMPING_RX_SOFTWARE); 900 else 901 sock_disable_timestamp(sk, 902 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)); 903 break; 904 905 case SO_RCVLOWAT: 906 if (val < 0) 907 val = INT_MAX; 908 sk->sk_rcvlowat = val ? : 1; 909 break; 910 911 case SO_RCVTIMEO: 912 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen); 913 break; 914 915 case SO_SNDTIMEO: 916 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen); 917 break; 918 919 case SO_ATTACH_FILTER: 920 ret = -EINVAL; 921 if (optlen == sizeof(struct sock_fprog)) { 922 struct sock_fprog fprog; 923 924 ret = -EFAULT; 925 if (copy_from_user(&fprog, optval, sizeof(fprog))) 926 break; 927 928 ret = sk_attach_filter(&fprog, sk); 929 } 930 break; 931 932 case SO_ATTACH_BPF: 933 ret = -EINVAL; 934 if (optlen == sizeof(u32)) { 935 u32 ufd; 936 937 ret = -EFAULT; 938 if (copy_from_user(&ufd, optval, sizeof(ufd))) 939 break; 940 941 ret = sk_attach_bpf(ufd, sk); 942 } 943 break; 944 945 case SO_ATTACH_REUSEPORT_CBPF: 946 ret = -EINVAL; 947 if (optlen == sizeof(struct sock_fprog)) { 948 struct sock_fprog fprog; 949 950 ret = -EFAULT; 951 if (copy_from_user(&fprog, optval, sizeof(fprog))) 952 break; 953 954 ret = sk_reuseport_attach_filter(&fprog, sk); 955 } 956 break; 957 958 case SO_ATTACH_REUSEPORT_EBPF: 959 ret = -EINVAL; 960 if (optlen == sizeof(u32)) { 961 u32 ufd; 962 963 ret = -EFAULT; 964 if (copy_from_user(&ufd, optval, sizeof(ufd))) 965 break; 966 967 ret = sk_reuseport_attach_bpf(ufd, sk); 968 } 969 break; 970 971 case SO_DETACH_FILTER: 972 ret = sk_detach_filter(sk); 973 break; 974 975 case SO_LOCK_FILTER: 976 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool) 977 ret = -EPERM; 978 else 979 sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool); 980 break; 981 982 case SO_PASSSEC: 983 if (valbool) 984 set_bit(SOCK_PASSSEC, &sock->flags); 985 else 986 clear_bit(SOCK_PASSSEC, &sock->flags); 987 break; 988 case SO_MARK: 989 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 990 ret = -EPERM; 991 else 992 sk->sk_mark = val; 993 break; 994 995 case SO_RXQ_OVFL: 996 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool); 997 break; 998 999 case SO_WIFI_STATUS: 1000 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool); 1001 break; 1002 1003 case SO_PEEK_OFF: 1004 if (sock->ops->set_peek_off) 1005 ret = sock->ops->set_peek_off(sk, val); 1006 else 1007 ret = -EOPNOTSUPP; 1008 break; 1009 1010 case SO_NOFCS: 1011 sock_valbool_flag(sk, SOCK_NOFCS, valbool); 1012 break; 1013 1014 case SO_SELECT_ERR_QUEUE: 1015 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool); 1016 break; 1017 1018 #ifdef CONFIG_NET_RX_BUSY_POLL 1019 case SO_BUSY_POLL: 1020 /* allow unprivileged users to decrease the value */ 1021 if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN)) 1022 ret = -EPERM; 1023 else { 1024 if (val < 0) 1025 ret = -EINVAL; 1026 else 1027 sk->sk_ll_usec = val; 1028 } 1029 break; 1030 #endif 1031 1032 case SO_MAX_PACING_RATE: 1033 if (val != ~0U) 1034 cmpxchg(&sk->sk_pacing_status, 1035 SK_PACING_NONE, 1036 SK_PACING_NEEDED); 1037 sk->sk_max_pacing_rate = val; 1038 sk->sk_pacing_rate = min(sk->sk_pacing_rate, 1039 sk->sk_max_pacing_rate); 1040 break; 1041 1042 case SO_INCOMING_CPU: 1043 sk->sk_incoming_cpu = val; 1044 break; 1045 1046 case 
SO_CNX_ADVICE: 1047 if (val == 1) 1048 dst_negative_advice(sk); 1049 break; 1050 1051 case SO_ZEROCOPY: 1052 if (sk->sk_family != PF_INET && sk->sk_family != PF_INET6) 1053 ret = -ENOTSUPP; 1054 else if (sk->sk_protocol != IPPROTO_TCP) 1055 ret = -ENOTSUPP; 1056 else if (sk->sk_state != TCP_CLOSE) 1057 ret = -EBUSY; 1058 else if (val < 0 || val > 1) 1059 ret = -EINVAL; 1060 else 1061 sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool); 1062 break; 1063 1064 default: 1065 ret = -ENOPROTOOPT; 1066 break; 1067 } 1068 release_sock(sk); 1069 return ret; 1070 } 1071 EXPORT_SYMBOL(sock_setsockopt); 1072 1073 1074 static void cred_to_ucred(struct pid *pid, const struct cred *cred, 1075 struct ucred *ucred) 1076 { 1077 ucred->pid = pid_vnr(pid); 1078 ucred->uid = ucred->gid = -1; 1079 if (cred) { 1080 struct user_namespace *current_ns = current_user_ns(); 1081 1082 ucred->uid = from_kuid_munged(current_ns, cred->euid); 1083 ucred->gid = from_kgid_munged(current_ns, cred->egid); 1084 } 1085 } 1086 1087 static int groups_to_user(gid_t __user *dst, const struct group_info *src) 1088 { 1089 struct user_namespace *user_ns = current_user_ns(); 1090 int i; 1091 1092 for (i = 0; i < src->ngroups; i++) 1093 if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i)) 1094 return -EFAULT; 1095 1096 return 0; 1097 } 1098 1099 int sock_getsockopt(struct socket *sock, int level, int optname, 1100 char __user *optval, int __user *optlen) 1101 { 1102 struct sock *sk = sock->sk; 1103 1104 union { 1105 int val; 1106 u64 val64; 1107 struct linger ling; 1108 struct timeval tm; 1109 } v; 1110 1111 int lv = sizeof(int); 1112 int len; 1113 1114 if (get_user(len, optlen)) 1115 return -EFAULT; 1116 if (len < 0) 1117 return -EINVAL; 1118 1119 memset(&v, 0, sizeof(v)); 1120 1121 switch (optname) { 1122 case SO_DEBUG: 1123 v.val = sock_flag(sk, SOCK_DBG); 1124 break; 1125 1126 case SO_DONTROUTE: 1127 v.val = sock_flag(sk, SOCK_LOCALROUTE); 1128 break; 1129 1130 case SO_BROADCAST: 1131 v.val = sock_flag(sk, SOCK_BROADCAST); 1132 break; 1133 1134 case SO_SNDBUF: 1135 v.val = sk->sk_sndbuf; 1136 break; 1137 1138 case SO_RCVBUF: 1139 v.val = sk->sk_rcvbuf; 1140 break; 1141 1142 case SO_REUSEADDR: 1143 v.val = sk->sk_reuse; 1144 break; 1145 1146 case SO_REUSEPORT: 1147 v.val = sk->sk_reuseport; 1148 break; 1149 1150 case SO_KEEPALIVE: 1151 v.val = sock_flag(sk, SOCK_KEEPOPEN); 1152 break; 1153 1154 case SO_TYPE: 1155 v.val = sk->sk_type; 1156 break; 1157 1158 case SO_PROTOCOL: 1159 v.val = sk->sk_protocol; 1160 break; 1161 1162 case SO_DOMAIN: 1163 v.val = sk->sk_family; 1164 break; 1165 1166 case SO_ERROR: 1167 v.val = -sock_error(sk); 1168 if (v.val == 0) 1169 v.val = xchg(&sk->sk_err_soft, 0); 1170 break; 1171 1172 case SO_OOBINLINE: 1173 v.val = sock_flag(sk, SOCK_URGINLINE); 1174 break; 1175 1176 case SO_NO_CHECK: 1177 v.val = sk->sk_no_check_tx; 1178 break; 1179 1180 case SO_PRIORITY: 1181 v.val = sk->sk_priority; 1182 break; 1183 1184 case SO_LINGER: 1185 lv = sizeof(v.ling); 1186 v.ling.l_onoff = sock_flag(sk, SOCK_LINGER); 1187 v.ling.l_linger = sk->sk_lingertime / HZ; 1188 break; 1189 1190 case SO_BSDCOMPAT: 1191 sock_warn_obsolete_bsdism("getsockopt"); 1192 break; 1193 1194 case SO_TIMESTAMP: 1195 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && 1196 !sock_flag(sk, SOCK_RCVTSTAMPNS); 1197 break; 1198 1199 case SO_TIMESTAMPNS: 1200 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS); 1201 break; 1202 1203 case SO_TIMESTAMPING: 1204 v.val = sk->sk_tsflags; 1205 break; 1206 1207 case SO_RCVTIMEO: 1208 lv = sizeof(struct timeval); 1209 if 
(sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) { 1210 v.tm.tv_sec = 0; 1211 v.tm.tv_usec = 0; 1212 } else { 1213 v.tm.tv_sec = sk->sk_rcvtimeo / HZ; 1214 v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * USEC_PER_SEC) / HZ; 1215 } 1216 break; 1217 1218 case SO_SNDTIMEO: 1219 lv = sizeof(struct timeval); 1220 if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) { 1221 v.tm.tv_sec = 0; 1222 v.tm.tv_usec = 0; 1223 } else { 1224 v.tm.tv_sec = sk->sk_sndtimeo / HZ; 1225 v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * USEC_PER_SEC) / HZ; 1226 } 1227 break; 1228 1229 case SO_RCVLOWAT: 1230 v.val = sk->sk_rcvlowat; 1231 break; 1232 1233 case SO_SNDLOWAT: 1234 v.val = 1; 1235 break; 1236 1237 case SO_PASSCRED: 1238 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags); 1239 break; 1240 1241 case SO_PEERCRED: 1242 { 1243 struct ucred peercred; 1244 if (len > sizeof(peercred)) 1245 len = sizeof(peercred); 1246 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred); 1247 if (copy_to_user(optval, &peercred, len)) 1248 return -EFAULT; 1249 goto lenout; 1250 } 1251 1252 case SO_PEERGROUPS: 1253 { 1254 int ret, n; 1255 1256 if (!sk->sk_peer_cred) 1257 return -ENODATA; 1258 1259 n = sk->sk_peer_cred->group_info->ngroups; 1260 if (len < n * sizeof(gid_t)) { 1261 len = n * sizeof(gid_t); 1262 return put_user(len, optlen) ? -EFAULT : -ERANGE; 1263 } 1264 len = n * sizeof(gid_t); 1265 1266 ret = groups_to_user((gid_t __user *)optval, 1267 sk->sk_peer_cred->group_info); 1268 if (ret) 1269 return ret; 1270 goto lenout; 1271 } 1272 1273 case SO_PEERNAME: 1274 { 1275 char address[128]; 1276 1277 lv = sock->ops->getname(sock, (struct sockaddr *)address, 2); 1278 if (lv < 0) 1279 return -ENOTCONN; 1280 if (lv < len) 1281 return -EINVAL; 1282 if (copy_to_user(optval, address, len)) 1283 return -EFAULT; 1284 goto lenout; 1285 } 1286 1287 /* Dubious BSD thing... Probably nobody even uses it, but 1288 * the UNIX standard wants it for whatever reason... 
-DaveM 1289 */ 1290 case SO_ACCEPTCONN: 1291 v.val = sk->sk_state == TCP_LISTEN; 1292 break; 1293 1294 case SO_PASSSEC: 1295 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags); 1296 break; 1297 1298 case SO_PEERSEC: 1299 return security_socket_getpeersec_stream(sock, optval, optlen, len); 1300 1301 case SO_MARK: 1302 v.val = sk->sk_mark; 1303 break; 1304 1305 case SO_RXQ_OVFL: 1306 v.val = sock_flag(sk, SOCK_RXQ_OVFL); 1307 break; 1308 1309 case SO_WIFI_STATUS: 1310 v.val = sock_flag(sk, SOCK_WIFI_STATUS); 1311 break; 1312 1313 case SO_PEEK_OFF: 1314 if (!sock->ops->set_peek_off) 1315 return -EOPNOTSUPP; 1316 1317 v.val = sk->sk_peek_off; 1318 break; 1319 case SO_NOFCS: 1320 v.val = sock_flag(sk, SOCK_NOFCS); 1321 break; 1322 1323 case SO_BINDTODEVICE: 1324 return sock_getbindtodevice(sk, optval, optlen, len); 1325 1326 case SO_GET_FILTER: 1327 len = sk_get_filter(sk, (struct sock_filter __user *)optval, len); 1328 if (len < 0) 1329 return len; 1330 1331 goto lenout; 1332 1333 case SO_LOCK_FILTER: 1334 v.val = sock_flag(sk, SOCK_FILTER_LOCKED); 1335 break; 1336 1337 case SO_BPF_EXTENSIONS: 1338 v.val = bpf_tell_extensions(); 1339 break; 1340 1341 case SO_SELECT_ERR_QUEUE: 1342 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE); 1343 break; 1344 1345 #ifdef CONFIG_NET_RX_BUSY_POLL 1346 case SO_BUSY_POLL: 1347 v.val = sk->sk_ll_usec; 1348 break; 1349 #endif 1350 1351 case SO_MAX_PACING_RATE: 1352 v.val = sk->sk_max_pacing_rate; 1353 break; 1354 1355 case SO_INCOMING_CPU: 1356 v.val = sk->sk_incoming_cpu; 1357 break; 1358 1359 case SO_MEMINFO: 1360 { 1361 u32 meminfo[SK_MEMINFO_VARS]; 1362 1363 if (get_user(len, optlen)) 1364 return -EFAULT; 1365 1366 sk_get_meminfo(sk, meminfo); 1367 1368 len = min_t(unsigned int, len, sizeof(meminfo)); 1369 if (copy_to_user(optval, &meminfo, len)) 1370 return -EFAULT; 1371 1372 goto lenout; 1373 } 1374 1375 #ifdef CONFIG_NET_RX_BUSY_POLL 1376 case SO_INCOMING_NAPI_ID: 1377 v.val = READ_ONCE(sk->sk_napi_id); 1378 1379 /* aggregate non-NAPI IDs down to 0 */ 1380 if (v.val < MIN_NAPI_ID) 1381 v.val = 0; 1382 1383 break; 1384 #endif 1385 1386 case SO_COOKIE: 1387 lv = sizeof(u64); 1388 if (len < lv) 1389 return -EINVAL; 1390 v.val64 = sock_gen_cookie(sk); 1391 break; 1392 1393 case SO_ZEROCOPY: 1394 v.val = sock_flag(sk, SOCK_ZEROCOPY); 1395 break; 1396 1397 default: 1398 /* We implement the SO_SNDLOWAT etc to not be settable 1399 * (1003.1g 7). 1400 */ 1401 return -ENOPROTOOPT; 1402 } 1403 1404 if (len > lv) 1405 len = lv; 1406 if (copy_to_user(optval, &v, len)) 1407 return -EFAULT; 1408 lenout: 1409 if (put_user(len, optlen)) 1410 return -EFAULT; 1411 return 0; 1412 } 1413 1414 /* 1415 * Initialize an sk_lock. 1416 * 1417 * (We also register the sk_lock with the lock validator.) 1418 */ 1419 static inline void sock_lock_init(struct sock *sk) 1420 { 1421 if (sk->sk_kern_sock) 1422 sock_lock_init_class_and_name( 1423 sk, 1424 af_family_kern_slock_key_strings[sk->sk_family], 1425 af_family_kern_slock_keys + sk->sk_family, 1426 af_family_kern_key_strings[sk->sk_family], 1427 af_family_kern_keys + sk->sk_family); 1428 else 1429 sock_lock_init_class_and_name( 1430 sk, 1431 af_family_slock_key_strings[sk->sk_family], 1432 af_family_slock_keys + sk->sk_family, 1433 af_family_key_strings[sk->sk_family], 1434 af_family_keys + sk->sk_family); 1435 } 1436 1437 /* 1438 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet, 1439 * even temporarly, because of RCU lookups. sk_node should also be left as is. 
1440 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end 1441 */ 1442 static void sock_copy(struct sock *nsk, const struct sock *osk) 1443 { 1444 #ifdef CONFIG_SECURITY_NETWORK 1445 void *sptr = nsk->sk_security; 1446 #endif 1447 memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin)); 1448 1449 memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end, 1450 osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end)); 1451 1452 #ifdef CONFIG_SECURITY_NETWORK 1453 nsk->sk_security = sptr; 1454 security_sk_clone(osk, nsk); 1455 #endif 1456 } 1457 1458 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, 1459 int family) 1460 { 1461 struct sock *sk; 1462 struct kmem_cache *slab; 1463 1464 slab = prot->slab; 1465 if (slab != NULL) { 1466 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO); 1467 if (!sk) 1468 return sk; 1469 if (priority & __GFP_ZERO) 1470 sk_prot_clear_nulls(sk, prot->obj_size); 1471 } else 1472 sk = kmalloc(prot->obj_size, priority); 1473 1474 if (sk != NULL) { 1475 if (security_sk_alloc(sk, family, priority)) 1476 goto out_free; 1477 1478 if (!try_module_get(prot->owner)) 1479 goto out_free_sec; 1480 sk_tx_queue_clear(sk); 1481 } 1482 1483 return sk; 1484 1485 out_free_sec: 1486 security_sk_free(sk); 1487 out_free: 1488 if (slab != NULL) 1489 kmem_cache_free(slab, sk); 1490 else 1491 kfree(sk); 1492 return NULL; 1493 } 1494 1495 static void sk_prot_free(struct proto *prot, struct sock *sk) 1496 { 1497 struct kmem_cache *slab; 1498 struct module *owner; 1499 1500 owner = prot->owner; 1501 slab = prot->slab; 1502 1503 cgroup_sk_free(&sk->sk_cgrp_data); 1504 mem_cgroup_sk_free(sk); 1505 security_sk_free(sk); 1506 if (slab != NULL) 1507 kmem_cache_free(slab, sk); 1508 else 1509 kfree(sk); 1510 module_put(owner); 1511 } 1512 1513 /** 1514 * sk_alloc - All socket objects are allocated here 1515 * @net: the applicable net namespace 1516 * @family: protocol family 1517 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) 1518 * @prot: struct proto associated with this new sock instance 1519 * @kern: is this to be a kernel socket? 1520 */ 1521 struct sock *sk_alloc(struct net *net, int family, gfp_t priority, 1522 struct proto *prot, int kern) 1523 { 1524 struct sock *sk; 1525 1526 sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family); 1527 if (sk) { 1528 sk->sk_family = family; 1529 /* 1530 * See comment in struct sock definition to understand 1531 * why we need sk_prot_creator -acme 1532 */ 1533 sk->sk_prot = sk->sk_prot_creator = prot; 1534 sk->sk_kern_sock = kern; 1535 sock_lock_init(sk); 1536 sk->sk_net_refcnt = kern ? 0 : 1; 1537 if (likely(sk->sk_net_refcnt)) { 1538 get_net(net); 1539 sock_inuse_add(net, 1); 1540 } 1541 1542 sock_net_set(sk, net); 1543 refcount_set(&sk->sk_wmem_alloc, 1); 1544 1545 mem_cgroup_sk_alloc(sk); 1546 cgroup_sk_alloc(&sk->sk_cgrp_data); 1547 sock_update_classid(&sk->sk_cgrp_data); 1548 sock_update_netprioidx(&sk->sk_cgrp_data); 1549 } 1550 1551 return sk; 1552 } 1553 EXPORT_SYMBOL(sk_alloc); 1554 1555 /* Sockets having SOCK_RCU_FREE will call this function after one RCU 1556 * grace period. This is the case for UDP sockets and TCP listeners. 
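 * The grace period matters because lockless (RCU) lookups may still be
 * walking the hash chains and dereferencing this sock; its memory must
 * not be reused until those readers are done.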
1557 */ 1558 static void __sk_destruct(struct rcu_head *head) 1559 { 1560 struct sock *sk = container_of(head, struct sock, sk_rcu); 1561 struct sk_filter *filter; 1562 1563 if (sk->sk_destruct) 1564 sk->sk_destruct(sk); 1565 1566 filter = rcu_dereference_check(sk->sk_filter, 1567 refcount_read(&sk->sk_wmem_alloc) == 0); 1568 if (filter) { 1569 sk_filter_uncharge(sk, filter); 1570 RCU_INIT_POINTER(sk->sk_filter, NULL); 1571 } 1572 if (rcu_access_pointer(sk->sk_reuseport_cb)) 1573 reuseport_detach_sock(sk); 1574 1575 sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP); 1576 1577 if (atomic_read(&sk->sk_omem_alloc)) 1578 pr_debug("%s: optmem leakage (%d bytes) detected\n", 1579 __func__, atomic_read(&sk->sk_omem_alloc)); 1580 1581 if (sk->sk_frag.page) { 1582 put_page(sk->sk_frag.page); 1583 sk->sk_frag.page = NULL; 1584 } 1585 1586 if (sk->sk_peer_cred) 1587 put_cred(sk->sk_peer_cred); 1588 put_pid(sk->sk_peer_pid); 1589 if (likely(sk->sk_net_refcnt)) 1590 put_net(sock_net(sk)); 1591 sk_prot_free(sk->sk_prot_creator, sk); 1592 } 1593 1594 void sk_destruct(struct sock *sk) 1595 { 1596 if (sock_flag(sk, SOCK_RCU_FREE)) 1597 call_rcu(&sk->sk_rcu, __sk_destruct); 1598 else 1599 __sk_destruct(&sk->sk_rcu); 1600 } 1601 1602 static void __sk_free(struct sock *sk) 1603 { 1604 if (likely(sk->sk_net_refcnt)) 1605 sock_inuse_add(sock_net(sk), -1); 1606 1607 if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt)) 1608 sock_diag_broadcast_destroy(sk); 1609 else 1610 sk_destruct(sk); 1611 } 1612 1613 void sk_free(struct sock *sk) 1614 { 1615 /* 1616 * We subtract one from sk_wmem_alloc and can know if 1617 * some packets are still in some tx queue. 1618 * If not null, sock_wfree() will call __sk_free(sk) later 1619 */ 1620 if (refcount_dec_and_test(&sk->sk_wmem_alloc)) 1621 __sk_free(sk); 1622 } 1623 EXPORT_SYMBOL(sk_free); 1624 1625 static void sk_init_common(struct sock *sk) 1626 { 1627 skb_queue_head_init(&sk->sk_receive_queue); 1628 skb_queue_head_init(&sk->sk_write_queue); 1629 skb_queue_head_init(&sk->sk_error_queue); 1630 1631 rwlock_init(&sk->sk_callback_lock); 1632 lockdep_set_class_and_name(&sk->sk_receive_queue.lock, 1633 af_rlock_keys + sk->sk_family, 1634 af_family_rlock_key_strings[sk->sk_family]); 1635 lockdep_set_class_and_name(&sk->sk_write_queue.lock, 1636 af_wlock_keys + sk->sk_family, 1637 af_family_wlock_key_strings[sk->sk_family]); 1638 lockdep_set_class_and_name(&sk->sk_error_queue.lock, 1639 af_elock_keys + sk->sk_family, 1640 af_family_elock_key_strings[sk->sk_family]); 1641 lockdep_set_class_and_name(&sk->sk_callback_lock, 1642 af_callback_keys + sk->sk_family, 1643 af_family_clock_key_strings[sk->sk_family]); 1644 } 1645 1646 /** 1647 * sk_clone_lock - clone a socket, and lock its clone 1648 * @sk: the socket to clone 1649 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) 1650 * 1651 * Caller must unlock socket even in error path (bh_unlock_sock(newsk)) 1652 */ 1653 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) 1654 { 1655 struct sock *newsk; 1656 bool is_charged = true; 1657 1658 newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family); 1659 if (newsk != NULL) { 1660 struct sk_filter *filter; 1661 1662 sock_copy(newsk, sk); 1663 1664 newsk->sk_prot_creator = sk->sk_prot; 1665 1666 /* SANITY */ 1667 if (likely(newsk->sk_net_refcnt)) 1668 get_net(sock_net(newsk)); 1669 sk_node_init(&newsk->sk_node); 1670 sock_lock_init(newsk); 1671 bh_lock_sock(newsk); 1672 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL; 1673 
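		/* The clone must start with an empty backlog of its own:
		 * the pointers just copied from the parent refer to packets
		 * that remain queued on (and owned by) the parent socket.
		 */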
newsk->sk_backlog.len = 0; 1674 1675 atomic_set(&newsk->sk_rmem_alloc, 0); 1676 /* 1677 * sk_wmem_alloc set to one (see sk_free() and sock_wfree()) 1678 */ 1679 refcount_set(&newsk->sk_wmem_alloc, 1); 1680 atomic_set(&newsk->sk_omem_alloc, 0); 1681 sk_init_common(newsk); 1682 1683 newsk->sk_dst_cache = NULL; 1684 newsk->sk_dst_pending_confirm = 0; 1685 newsk->sk_wmem_queued = 0; 1686 newsk->sk_forward_alloc = 0; 1687 atomic_set(&newsk->sk_drops, 0); 1688 newsk->sk_send_head = NULL; 1689 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; 1690 atomic_set(&newsk->sk_zckey, 0); 1691 1692 sock_reset_flag(newsk, SOCK_DONE); 1693 mem_cgroup_sk_alloc(newsk); 1694 cgroup_sk_alloc(&newsk->sk_cgrp_data); 1695 1696 rcu_read_lock(); 1697 filter = rcu_dereference(sk->sk_filter); 1698 if (filter != NULL) 1699 /* though it's an empty new sock, the charging may fail 1700 * if sysctl_optmem_max was changed between creation of 1701 * original socket and cloning 1702 */ 1703 is_charged = sk_filter_charge(newsk, filter); 1704 RCU_INIT_POINTER(newsk->sk_filter, filter); 1705 rcu_read_unlock(); 1706 1707 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) { 1708 /* We need to make sure that we don't uncharge the new 1709 * socket if we couldn't charge it in the first place 1710 * as otherwise we uncharge the parent's filter. 1711 */ 1712 if (!is_charged) 1713 RCU_INIT_POINTER(newsk->sk_filter, NULL); 1714 sk_free_unlock_clone(newsk); 1715 newsk = NULL; 1716 goto out; 1717 } 1718 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL); 1719 1720 newsk->sk_err = 0; 1721 newsk->sk_err_soft = 0; 1722 newsk->sk_priority = 0; 1723 newsk->sk_incoming_cpu = raw_smp_processor_id(); 1724 atomic64_set(&newsk->sk_cookie, 0); 1725 if (likely(newsk->sk_net_refcnt)) 1726 sock_inuse_add(sock_net(newsk), 1); 1727 1728 /* 1729 * Before updating sk_refcnt, we must commit prior changes to memory 1730 * (Documentation/RCU/rculist_nulls.txt for details) 1731 */ 1732 smp_wmb(); 1733 refcount_set(&newsk->sk_refcnt, 2); 1734 1735 /* 1736 * Increment the counter in the same struct proto as the master 1737 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that 1738 * is the same as sk->sk_prot->socks, as this field was copied 1739 * with memcpy). 1740 * 1741 * This _changes_ the previous behaviour, where 1742 * tcp_create_openreq_child always was incrementing the 1743 * equivalent to tcp_prot->socks (inet_sock_nr), so this have 1744 * to be taken into account in all callers. 
-acme 1745 */ 1746 sk_refcnt_debug_inc(newsk); 1747 sk_set_socket(newsk, NULL); 1748 newsk->sk_wq = NULL; 1749 1750 if (newsk->sk_prot->sockets_allocated) 1751 sk_sockets_allocated_inc(newsk); 1752 1753 if (sock_needs_netstamp(sk) && 1754 newsk->sk_flags & SK_FLAGS_TIMESTAMP) 1755 net_enable_timestamp(); 1756 } 1757 out: 1758 return newsk; 1759 } 1760 EXPORT_SYMBOL_GPL(sk_clone_lock); 1761 1762 void sk_free_unlock_clone(struct sock *sk) 1763 { 1764 /* It is still raw copy of parent, so invalidate 1765 * destructor and make plain sk_free() */ 1766 sk->sk_destruct = NULL; 1767 bh_unlock_sock(sk); 1768 sk_free(sk); 1769 } 1770 EXPORT_SYMBOL_GPL(sk_free_unlock_clone); 1771 1772 void sk_setup_caps(struct sock *sk, struct dst_entry *dst) 1773 { 1774 u32 max_segs = 1; 1775 1776 sk_dst_set(sk, dst); 1777 sk->sk_route_caps = dst->dev->features; 1778 if (sk->sk_route_caps & NETIF_F_GSO) 1779 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE; 1780 sk->sk_route_caps &= ~sk->sk_route_nocaps; 1781 if (sk_can_gso(sk)) { 1782 if (dst->header_len && !xfrm_dst_offload_ok(dst)) { 1783 sk->sk_route_caps &= ~NETIF_F_GSO_MASK; 1784 } else { 1785 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; 1786 sk->sk_gso_max_size = dst->dev->gso_max_size; 1787 max_segs = max_t(u32, dst->dev->gso_max_segs, 1); 1788 } 1789 } 1790 sk->sk_gso_max_segs = max_segs; 1791 } 1792 EXPORT_SYMBOL_GPL(sk_setup_caps); 1793 1794 /* 1795 * Simple resource managers for sockets. 1796 */ 1797 1798 1799 /* 1800 * Write buffer destructor automatically called from kfree_skb. 1801 */ 1802 void sock_wfree(struct sk_buff *skb) 1803 { 1804 struct sock *sk = skb->sk; 1805 unsigned int len = skb->truesize; 1806 1807 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) { 1808 /* 1809 * Keep a reference on sk_wmem_alloc, this will be released 1810 * after sk_write_space() call 1811 */ 1812 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc)); 1813 sk->sk_write_space(sk); 1814 len = 1; 1815 } 1816 /* 1817 * if sk_wmem_alloc reaches 0, we must finish what sk_free() 1818 * could not do because of in-flight packets 1819 */ 1820 if (refcount_sub_and_test(len, &sk->sk_wmem_alloc)) 1821 __sk_free(sk); 1822 } 1823 EXPORT_SYMBOL(sock_wfree); 1824 1825 /* This variant of sock_wfree() is used by TCP, 1826 * since it sets SOCK_USE_WRITE_QUEUE. 1827 */ 1828 void __sock_wfree(struct sk_buff *skb) 1829 { 1830 struct sock *sk = skb->sk; 1831 1832 if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc)) 1833 __sk_free(sk); 1834 } 1835 1836 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) 1837 { 1838 skb_orphan(skb); 1839 skb->sk = sk; 1840 #ifdef CONFIG_INET 1841 if (unlikely(!sk_fullsock(sk))) { 1842 skb->destructor = sock_edemux; 1843 sock_hold(sk); 1844 return; 1845 } 1846 #endif 1847 skb->destructor = sock_wfree; 1848 skb_set_hash_from_sk(skb, sk); 1849 /* 1850 * We used to take a refcount on sk, but following operation 1851 * is enough to guarantee sk_free() wont free this sock until 1852 * all in-flight packets are completed 1853 */ 1854 refcount_add(skb->truesize, &sk->sk_wmem_alloc); 1855 } 1856 EXPORT_SYMBOL(skb_set_owner_w); 1857 1858 /* This helper is used by netem, as it can hold packets in its 1859 * delay queue. We want to allow the owner socket to send more 1860 * packets, as if they were already TX completed by a typical driver. 1861 * But we also want to keep skb->sk set because some packet schedulers 1862 * rely on it (sch_fq for example). 
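 * So, rather than doing a full skb_orphan(), we release the skb's truesize
 * from sk_wmem_alloc right away and switch the destructor to sock_efree(),
 * which keeps one reference pinning the socket (pure TCP ACKs are left
 * untouched).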
1863 */ 1864 void skb_orphan_partial(struct sk_buff *skb) 1865 { 1866 if (skb_is_tcp_pure_ack(skb)) 1867 return; 1868 1869 if (skb->destructor == sock_wfree 1870 #ifdef CONFIG_INET 1871 || skb->destructor == tcp_wfree 1872 #endif 1873 ) { 1874 struct sock *sk = skb->sk; 1875 1876 if (refcount_inc_not_zero(&sk->sk_refcnt)) { 1877 WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc)); 1878 skb->destructor = sock_efree; 1879 } 1880 } else { 1881 skb_orphan(skb); 1882 } 1883 } 1884 EXPORT_SYMBOL(skb_orphan_partial); 1885 1886 /* 1887 * Read buffer destructor automatically called from kfree_skb. 1888 */ 1889 void sock_rfree(struct sk_buff *skb) 1890 { 1891 struct sock *sk = skb->sk; 1892 unsigned int len = skb->truesize; 1893 1894 atomic_sub(len, &sk->sk_rmem_alloc); 1895 sk_mem_uncharge(sk, len); 1896 } 1897 EXPORT_SYMBOL(sock_rfree); 1898 1899 /* 1900 * Buffer destructor for skbs that are not used directly in read or write 1901 * path, e.g. for error handler skbs. Automatically called from kfree_skb. 1902 */ 1903 void sock_efree(struct sk_buff *skb) 1904 { 1905 sock_put(skb->sk); 1906 } 1907 EXPORT_SYMBOL(sock_efree); 1908 1909 kuid_t sock_i_uid(struct sock *sk) 1910 { 1911 kuid_t uid; 1912 1913 read_lock_bh(&sk->sk_callback_lock); 1914 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID; 1915 read_unlock_bh(&sk->sk_callback_lock); 1916 return uid; 1917 } 1918 EXPORT_SYMBOL(sock_i_uid); 1919 1920 unsigned long sock_i_ino(struct sock *sk) 1921 { 1922 unsigned long ino; 1923 1924 read_lock_bh(&sk->sk_callback_lock); 1925 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0; 1926 read_unlock_bh(&sk->sk_callback_lock); 1927 return ino; 1928 } 1929 EXPORT_SYMBOL(sock_i_ino); 1930 1931 /* 1932 * Allocate a skb from the socket's send buffer. 1933 */ 1934 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, 1935 gfp_t priority) 1936 { 1937 if (force || refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) { 1938 struct sk_buff *skb = alloc_skb(size, priority); 1939 if (skb) { 1940 skb_set_owner_w(skb, sk); 1941 return skb; 1942 } 1943 } 1944 return NULL; 1945 } 1946 EXPORT_SYMBOL(sock_wmalloc); 1947 1948 static void sock_ofree(struct sk_buff *skb) 1949 { 1950 struct sock *sk = skb->sk; 1951 1952 atomic_sub(skb->truesize, &sk->sk_omem_alloc); 1953 } 1954 1955 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size, 1956 gfp_t priority) 1957 { 1958 struct sk_buff *skb; 1959 1960 /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */ 1961 if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) > 1962 sysctl_optmem_max) 1963 return NULL; 1964 1965 skb = alloc_skb(size, priority); 1966 if (!skb) 1967 return NULL; 1968 1969 atomic_add(skb->truesize, &sk->sk_omem_alloc); 1970 skb->sk = sk; 1971 skb->destructor = sock_ofree; 1972 return skb; 1973 } 1974 1975 /* 1976 * Allocate a memory block from the socket's option memory buffer. 1977 */ 1978 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority) 1979 { 1980 if ((unsigned int)size <= sysctl_optmem_max && 1981 atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) { 1982 void *mem; 1983 /* First do the add, to avoid the race if kmalloc 1984 * might sleep. 1985 */ 1986 atomic_add(size, &sk->sk_omem_alloc); 1987 mem = kmalloc(size, priority); 1988 if (mem) 1989 return mem; 1990 atomic_sub(size, &sk->sk_omem_alloc); 1991 } 1992 return NULL; 1993 } 1994 EXPORT_SYMBOL(sock_kmalloc); 1995 1996 /* Free an option memory block. 
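 * The size passed in must match what was given to sock_kmalloc() so that
 * sk_omem_alloc is decremented by exactly the amount that was charged.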
Note, we actually want the inline 1997 * here as this allows gcc to detect the nullify and fold away the 1998 * condition entirely. 1999 */ 2000 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size, 2001 const bool nullify) 2002 { 2003 if (WARN_ON_ONCE(!mem)) 2004 return; 2005 if (nullify) 2006 kzfree(mem); 2007 else 2008 kfree(mem); 2009 atomic_sub(size, &sk->sk_omem_alloc); 2010 } 2011 2012 void sock_kfree_s(struct sock *sk, void *mem, int size) 2013 { 2014 __sock_kfree_s(sk, mem, size, false); 2015 } 2016 EXPORT_SYMBOL(sock_kfree_s); 2017 2018 void sock_kzfree_s(struct sock *sk, void *mem, int size) 2019 { 2020 __sock_kfree_s(sk, mem, size, true); 2021 } 2022 EXPORT_SYMBOL(sock_kzfree_s); 2023 2024 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock. 2025 I think, these locks should be removed for datagram sockets. 2026 */ 2027 static long sock_wait_for_wmem(struct sock *sk, long timeo) 2028 { 2029 DEFINE_WAIT(wait); 2030 2031 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); 2032 for (;;) { 2033 if (!timeo) 2034 break; 2035 if (signal_pending(current)) 2036 break; 2037 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 2038 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); 2039 if (refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) 2040 break; 2041 if (sk->sk_shutdown & SEND_SHUTDOWN) 2042 break; 2043 if (sk->sk_err) 2044 break; 2045 timeo = schedule_timeout(timeo); 2046 } 2047 finish_wait(sk_sleep(sk), &wait); 2048 return timeo; 2049 } 2050 2051 2052 /* 2053 * Generic send/receive buffer handlers 2054 */ 2055 2056 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, 2057 unsigned long data_len, int noblock, 2058 int *errcode, int max_page_order) 2059 { 2060 struct sk_buff *skb; 2061 long timeo; 2062 int err; 2063 2064 timeo = sock_sndtimeo(sk, noblock); 2065 for (;;) { 2066 err = sock_error(sk); 2067 if (err != 0) 2068 goto failure; 2069 2070 err = -EPIPE; 2071 if (sk->sk_shutdown & SEND_SHUTDOWN) 2072 goto failure; 2073 2074 if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf) 2075 break; 2076 2077 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); 2078 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 2079 err = -EAGAIN; 2080 if (!timeo) 2081 goto failure; 2082 if (signal_pending(current)) 2083 goto interrupted; 2084 timeo = sock_wait_for_wmem(sk, timeo); 2085 } 2086 skb = alloc_skb_with_frags(header_len, data_len, max_page_order, 2087 errcode, sk->sk_allocation); 2088 if (skb) 2089 skb_set_owner_w(skb, sk); 2090 return skb; 2091 2092 interrupted: 2093 err = sock_intr_errno(timeo); 2094 failure: 2095 *errcode = err; 2096 return NULL; 2097 } 2098 EXPORT_SYMBOL(sock_alloc_send_pskb); 2099 2100 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, 2101 int noblock, int *errcode) 2102 { 2103 return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0); 2104 } 2105 EXPORT_SYMBOL(sock_alloc_send_skb); 2106 2107 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg, 2108 struct sockcm_cookie *sockc) 2109 { 2110 u32 tsflags; 2111 2112 switch (cmsg->cmsg_type) { 2113 case SO_MARK: 2114 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 2115 return -EPERM; 2116 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) 2117 return -EINVAL; 2118 sockc->mark = *(u32 *)CMSG_DATA(cmsg); 2119 break; 2120 case SO_TIMESTAMPING: 2121 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) 2122 return -EINVAL; 2123 2124 tsflags = *(u32 *)CMSG_DATA(cmsg); 2125 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK) 2126 return -EINVAL; 2127 2128 sockc->tsflags &= 
~SOF_TIMESTAMPING_TX_RECORD_MASK; 2129 sockc->tsflags |= tsflags; 2130 break; 2131 /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */ 2132 case SCM_RIGHTS: 2133 case SCM_CREDENTIALS: 2134 break; 2135 default: 2136 return -EINVAL; 2137 } 2138 return 0; 2139 } 2140 EXPORT_SYMBOL(__sock_cmsg_send); 2141 2142 int sock_cmsg_send(struct sock *sk, struct msghdr *msg, 2143 struct sockcm_cookie *sockc) 2144 { 2145 struct cmsghdr *cmsg; 2146 int ret; 2147 2148 for_each_cmsghdr(cmsg, msg) { 2149 if (!CMSG_OK(msg, cmsg)) 2150 return -EINVAL; 2151 if (cmsg->cmsg_level != SOL_SOCKET) 2152 continue; 2153 ret = __sock_cmsg_send(sk, msg, cmsg, sockc); 2154 if (ret) 2155 return ret; 2156 } 2157 return 0; 2158 } 2159 EXPORT_SYMBOL(sock_cmsg_send); 2160 2161 static void sk_enter_memory_pressure(struct sock *sk) 2162 { 2163 if (!sk->sk_prot->enter_memory_pressure) 2164 return; 2165 2166 sk->sk_prot->enter_memory_pressure(sk); 2167 } 2168 2169 static void sk_leave_memory_pressure(struct sock *sk) 2170 { 2171 if (sk->sk_prot->leave_memory_pressure) { 2172 sk->sk_prot->leave_memory_pressure(sk); 2173 } else { 2174 unsigned long *memory_pressure = sk->sk_prot->memory_pressure; 2175 2176 if (memory_pressure && *memory_pressure) 2177 *memory_pressure = 0; 2178 } 2179 } 2180 2181 /* On 32bit arches, an skb frag is limited to 2^15 */ 2182 #define SKB_FRAG_PAGE_ORDER get_order(32768) 2183 2184 /** 2185 * skb_page_frag_refill - check that a page_frag contains enough room 2186 * @sz: minimum size of the fragment we want to get 2187 * @pfrag: pointer to page_frag 2188 * @gfp: priority for memory allocation 2189 * 2190 * Note: While this allocator tries to use high order pages, there is 2191 * no guarantee that allocations succeed. Therefore, @sz MUST be 2192 * less than or equal to PAGE_SIZE.
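 *
 * Illustrative example (editor's sketch, not part of the original
 * kernel-doc): a caller that wants to append up to sz bytes of data
 * to the current fragment would typically do
 *
 *	if (!skb_page_frag_refill(sz, pfrag, sk->sk_allocation))
 *		goto wait_for_memory;
 *	memcpy(page_address(pfrag->page) + pfrag->offset, data, sz);
 *	pfrag->offset += sz;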
2193 */ 2194 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp) 2195 { 2196 if (pfrag->page) { 2197 if (page_ref_count(pfrag->page) == 1) { 2198 pfrag->offset = 0; 2199 return true; 2200 } 2201 if (pfrag->offset + sz <= pfrag->size) 2202 return true; 2203 put_page(pfrag->page); 2204 } 2205 2206 pfrag->offset = 0; 2207 if (SKB_FRAG_PAGE_ORDER) { 2208 /* Avoid direct reclaim but allow kswapd to wake */ 2209 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) | 2210 __GFP_COMP | __GFP_NOWARN | 2211 __GFP_NORETRY, 2212 SKB_FRAG_PAGE_ORDER); 2213 if (likely(pfrag->page)) { 2214 pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER; 2215 return true; 2216 } 2217 } 2218 pfrag->page = alloc_page(gfp); 2219 if (likely(pfrag->page)) { 2220 pfrag->size = PAGE_SIZE; 2221 return true; 2222 } 2223 return false; 2224 } 2225 EXPORT_SYMBOL(skb_page_frag_refill); 2226 2227 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) 2228 { 2229 if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation))) 2230 return true; 2231 2232 sk_enter_memory_pressure(sk); 2233 sk_stream_moderate_sndbuf(sk); 2234 return false; 2235 } 2236 EXPORT_SYMBOL(sk_page_frag_refill); 2237 2238 static void __lock_sock(struct sock *sk) 2239 __releases(&sk->sk_lock.slock) 2240 __acquires(&sk->sk_lock.slock) 2241 { 2242 DEFINE_WAIT(wait); 2243 2244 for (;;) { 2245 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait, 2246 TASK_UNINTERRUPTIBLE); 2247 spin_unlock_bh(&sk->sk_lock.slock); 2248 schedule(); 2249 spin_lock_bh(&sk->sk_lock.slock); 2250 if (!sock_owned_by_user(sk)) 2251 break; 2252 } 2253 finish_wait(&sk->sk_lock.wq, &wait); 2254 } 2255 2256 static void __release_sock(struct sock *sk) 2257 __releases(&sk->sk_lock.slock) 2258 __acquires(&sk->sk_lock.slock) 2259 { 2260 struct sk_buff *skb, *next; 2261 2262 while ((skb = sk->sk_backlog.head) != NULL) { 2263 sk->sk_backlog.head = sk->sk_backlog.tail = NULL; 2264 2265 spin_unlock_bh(&sk->sk_lock.slock); 2266 2267 do { 2268 next = skb->next; 2269 prefetch(next); 2270 WARN_ON_ONCE(skb_dst_is_noref(skb)); 2271 skb->next = NULL; 2272 sk_backlog_rcv(sk, skb); 2273 2274 cond_resched(); 2275 2276 skb = next; 2277 } while (skb != NULL); 2278 2279 spin_lock_bh(&sk->sk_lock.slock); 2280 } 2281 2282 /* 2283 * Doing the zeroing here guarantee we can not loop forever 2284 * while a wild producer attempts to flood us. 2285 */ 2286 sk->sk_backlog.len = 0; 2287 } 2288 2289 void __sk_flush_backlog(struct sock *sk) 2290 { 2291 spin_lock_bh(&sk->sk_lock.slock); 2292 __release_sock(sk); 2293 spin_unlock_bh(&sk->sk_lock.slock); 2294 } 2295 2296 /** 2297 * sk_wait_data - wait for data to arrive at sk_receive_queue 2298 * @sk: sock to wait on 2299 * @timeo: for how long 2300 * @skb: last skb seen on sk_receive_queue 2301 * 2302 * Now socket state including sk->sk_err is changed only under lock, 2303 * hence we may omit checks after joining wait queue. 2304 * We check receive queue before schedule() only as optimization; 2305 * it is very likely that release_sock() added new data. 
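 *
 * Illustrative example (editor's sketch, not part of the original
 * kernel-doc): a blocking recvmsg() implementation would typically
 * loop on this helper, with the socket lock held, while its receive
 * queue is empty:
 *
 *	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
 *		if (!timeo)
 *			return -EAGAIN;
 *		if (signal_pending(current))
 *			return sock_intr_errno(timeo);
 *		sk_wait_data(sk, &timeo, NULL);
 *	}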
2306 */ 2307 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb) 2308 { 2309 DEFINE_WAIT_FUNC(wait, woken_wake_function); 2310 int rc; 2311 2312 add_wait_queue(sk_sleep(sk), &wait); 2313 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); 2314 rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait); 2315 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); 2316 remove_wait_queue(sk_sleep(sk), &wait); 2317 return rc; 2318 } 2319 EXPORT_SYMBOL(sk_wait_data); 2320 2321 /** 2322 * __sk_mem_raise_allocated - increase memory_allocated 2323 * @sk: socket 2324 * @size: memory size to allocate 2325 * @amt: pages to allocate 2326 * @kind: allocation type 2327 * 2328 * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc 2329 */ 2330 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind) 2331 { 2332 struct proto *prot = sk->sk_prot; 2333 long allocated = sk_memory_allocated_add(sk, amt); 2334 2335 if (mem_cgroup_sockets_enabled && sk->sk_memcg && 2336 !mem_cgroup_charge_skmem(sk->sk_memcg, amt)) 2337 goto suppress_allocation; 2338 2339 /* Under limit. */ 2340 if (allocated <= sk_prot_mem_limits(sk, 0)) { 2341 sk_leave_memory_pressure(sk); 2342 return 1; 2343 } 2344 2345 /* Under pressure. */ 2346 if (allocated > sk_prot_mem_limits(sk, 1)) 2347 sk_enter_memory_pressure(sk); 2348 2349 /* Over hard limit. */ 2350 if (allocated > sk_prot_mem_limits(sk, 2)) 2351 goto suppress_allocation; 2352 2353 /* guarantee minimum buffer size under pressure */ 2354 if (kind == SK_MEM_RECV) { 2355 if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot)) 2356 return 1; 2357 2358 } else { /* SK_MEM_SEND */ 2359 int wmem0 = sk_get_wmem0(sk, prot); 2360 2361 if (sk->sk_type == SOCK_STREAM) { 2362 if (sk->sk_wmem_queued < wmem0) 2363 return 1; 2364 } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) { 2365 return 1; 2366 } 2367 } 2368 2369 if (sk_has_memory_pressure(sk)) { 2370 int alloc; 2371 2372 if (!sk_under_memory_pressure(sk)) 2373 return 1; 2374 alloc = sk_sockets_allocated_read_positive(sk); 2375 if (sk_prot_mem_limits(sk, 2) > alloc * 2376 sk_mem_pages(sk->sk_wmem_queued + 2377 atomic_read(&sk->sk_rmem_alloc) + 2378 sk->sk_forward_alloc)) 2379 return 1; 2380 } 2381 2382 suppress_allocation: 2383 2384 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) { 2385 sk_stream_moderate_sndbuf(sk); 2386 2387 /* Fail only if socket is _under_ its sndbuf. 2388 * In this case we cannot block, so that we have to fail. 2389 */ 2390 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) 2391 return 1; 2392 } 2393 2394 trace_sock_exceed_buf_limit(sk, prot, allocated); 2395 2396 sk_memory_allocated_sub(sk, amt); 2397 2398 if (mem_cgroup_sockets_enabled && sk->sk_memcg) 2399 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt); 2400 2401 return 0; 2402 } 2403 EXPORT_SYMBOL(__sk_mem_raise_allocated); 2404 2405 /** 2406 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated 2407 * @sk: socket 2408 * @size: memory size to allocate 2409 * @kind: allocation type 2410 * 2411 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means 2412 * rmem allocation. This function assumes that protocols which have 2413 * memory_pressure use sk_wmem_queued as write buffer accounting. 
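 *
 * Illustrative example (editor's sketch, not part of the original
 * kernel-doc): protocols normally reach this function through the
 * sk_wmem_schedule()/sk_rmem_schedule() wrappers, e.g. on the send
 * side
 *
 *	if (!sk_wmem_schedule(sk, copy))
 *		goto wait_for_memory;
 *
 * and on the receive side
 *
 *	if (!sk_rmem_schedule(sk, skb, skb->truesize))
 *		goto drop;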
2414 */ 2415 int __sk_mem_schedule(struct sock *sk, int size, int kind) 2416 { 2417 int ret, amt = sk_mem_pages(size); 2418 2419 sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT; 2420 ret = __sk_mem_raise_allocated(sk, size, amt, kind); 2421 if (!ret) 2422 sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT; 2423 return ret; 2424 } 2425 EXPORT_SYMBOL(__sk_mem_schedule); 2426 2427 /** 2428 * __sk_mem_reduce_allocated - reclaim memory_allocated 2429 * @sk: socket 2430 * @amount: number of quanta 2431 * 2432 * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc 2433 */ 2434 void __sk_mem_reduce_allocated(struct sock *sk, int amount) 2435 { 2436 sk_memory_allocated_sub(sk, amount); 2437 2438 if (mem_cgroup_sockets_enabled && sk->sk_memcg) 2439 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount); 2440 2441 if (sk_under_memory_pressure(sk) && 2442 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))) 2443 sk_leave_memory_pressure(sk); 2444 } 2445 EXPORT_SYMBOL(__sk_mem_reduce_allocated); 2446 2447 /** 2448 * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated 2449 * @sk: socket 2450 * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple) 2451 */ 2452 void __sk_mem_reclaim(struct sock *sk, int amount) 2453 { 2454 amount >>= SK_MEM_QUANTUM_SHIFT; 2455 sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT; 2456 __sk_mem_reduce_allocated(sk, amount); 2457 } 2458 EXPORT_SYMBOL(__sk_mem_reclaim); 2459 2460 int sk_set_peek_off(struct sock *sk, int val) 2461 { 2462 sk->sk_peek_off = val; 2463 return 0; 2464 } 2465 EXPORT_SYMBOL_GPL(sk_set_peek_off); 2466 2467 /* 2468 * Set of default routines for initialising struct proto_ops when 2469 * the protocol does not support a particular function. In certain 2470 * cases where it makes no sense for a protocol to have a "do nothing" 2471 * function, some default processing is provided. 
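 *
 * Illustrative example (editor's sketch; the example_* names and
 * PF_EXAMPLE are hypothetical): a datagram family without listen,
 * accept or mmap support would plug the stubs below into its
 * proto_ops, e.g.
 *
 *	static const struct proto_ops example_dgram_ops = {
 *		.family		= PF_EXAMPLE,
 *		.owner		= THIS_MODULE,
 *		.release	= example_release,
 *		.bind		= example_bind,
 *		.connect	= sock_no_connect,
 *		.socketpair	= sock_no_socketpair,
 *		.accept		= sock_no_accept,
 *		.listen		= sock_no_listen,
 *		.mmap		= sock_no_mmap,
 *		.sendmsg	= example_sendmsg,
 *		.recvmsg	= example_recvmsg,
 *	};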
2472 */ 2473 2474 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len) 2475 { 2476 return -EOPNOTSUPP; 2477 } 2478 EXPORT_SYMBOL(sock_no_bind); 2479 2480 int sock_no_connect(struct socket *sock, struct sockaddr *saddr, 2481 int len, int flags) 2482 { 2483 return -EOPNOTSUPP; 2484 } 2485 EXPORT_SYMBOL(sock_no_connect); 2486 2487 int sock_no_socketpair(struct socket *sock1, struct socket *sock2) 2488 { 2489 return -EOPNOTSUPP; 2490 } 2491 EXPORT_SYMBOL(sock_no_socketpair); 2492 2493 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags, 2494 bool kern) 2495 { 2496 return -EOPNOTSUPP; 2497 } 2498 EXPORT_SYMBOL(sock_no_accept); 2499 2500 int sock_no_getname(struct socket *sock, struct sockaddr *saddr, 2501 int peer) 2502 { 2503 return -EOPNOTSUPP; 2504 } 2505 EXPORT_SYMBOL(sock_no_getname); 2506 2507 __poll_t sock_no_poll(struct file *file, struct socket *sock, poll_table *pt) 2508 { 2509 return 0; 2510 } 2511 EXPORT_SYMBOL(sock_no_poll); 2512 2513 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 2514 { 2515 return -EOPNOTSUPP; 2516 } 2517 EXPORT_SYMBOL(sock_no_ioctl); 2518 2519 int sock_no_listen(struct socket *sock, int backlog) 2520 { 2521 return -EOPNOTSUPP; 2522 } 2523 EXPORT_SYMBOL(sock_no_listen); 2524 2525 int sock_no_shutdown(struct socket *sock, int how) 2526 { 2527 return -EOPNOTSUPP; 2528 } 2529 EXPORT_SYMBOL(sock_no_shutdown); 2530 2531 int sock_no_setsockopt(struct socket *sock, int level, int optname, 2532 char __user *optval, unsigned int optlen) 2533 { 2534 return -EOPNOTSUPP; 2535 } 2536 EXPORT_SYMBOL(sock_no_setsockopt); 2537 2538 int sock_no_getsockopt(struct socket *sock, int level, int optname, 2539 char __user *optval, int __user *optlen) 2540 { 2541 return -EOPNOTSUPP; 2542 } 2543 EXPORT_SYMBOL(sock_no_getsockopt); 2544 2545 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len) 2546 { 2547 return -EOPNOTSUPP; 2548 } 2549 EXPORT_SYMBOL(sock_no_sendmsg); 2550 2551 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len) 2552 { 2553 return -EOPNOTSUPP; 2554 } 2555 EXPORT_SYMBOL(sock_no_sendmsg_locked); 2556 2557 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len, 2558 int flags) 2559 { 2560 return -EOPNOTSUPP; 2561 } 2562 EXPORT_SYMBOL(sock_no_recvmsg); 2563 2564 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma) 2565 { 2566 /* Mirror missing mmap method error code */ 2567 return -ENODEV; 2568 } 2569 EXPORT_SYMBOL(sock_no_mmap); 2570 2571 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags) 2572 { 2573 ssize_t res; 2574 struct msghdr msg = {.msg_flags = flags}; 2575 struct kvec iov; 2576 char *kaddr = kmap(page); 2577 iov.iov_base = kaddr + offset; 2578 iov.iov_len = size; 2579 res = kernel_sendmsg(sock, &msg, &iov, 1, size); 2580 kunmap(page); 2581 return res; 2582 } 2583 EXPORT_SYMBOL(sock_no_sendpage); 2584 2585 ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page, 2586 int offset, size_t size, int flags) 2587 { 2588 ssize_t res; 2589 struct msghdr msg = {.msg_flags = flags}; 2590 struct kvec iov; 2591 char *kaddr = kmap(page); 2592 2593 iov.iov_base = kaddr + offset; 2594 iov.iov_len = size; 2595 res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size); 2596 kunmap(page); 2597 return res; 2598 } 2599 EXPORT_SYMBOL(sock_no_sendpage_locked); 2600 2601 /* 2602 * Default Socket Callbacks 2603 */ 2604 2605 static void sock_def_wakeup(struct sock *sk) 2606 { 2607 
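	/*
	 * Default sk->sk_state_change callback, installed by
	 * sock_init_data() below: wake up every thread sleeping on the
	 * socket's wait queue.  Protocols may override the sock_def_*
	 * callbacks after sock_init_data() has run.
	 */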
struct socket_wq *wq; 2608 2609 rcu_read_lock(); 2610 wq = rcu_dereference(sk->sk_wq); 2611 if (skwq_has_sleeper(wq)) 2612 wake_up_interruptible_all(&wq->wait); 2613 rcu_read_unlock(); 2614 } 2615 2616 static void sock_def_error_report(struct sock *sk) 2617 { 2618 struct socket_wq *wq; 2619 2620 rcu_read_lock(); 2621 wq = rcu_dereference(sk->sk_wq); 2622 if (skwq_has_sleeper(wq)) 2623 wake_up_interruptible_poll(&wq->wait, EPOLLERR); 2624 sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR); 2625 rcu_read_unlock(); 2626 } 2627 2628 static void sock_def_readable(struct sock *sk) 2629 { 2630 struct socket_wq *wq; 2631 2632 rcu_read_lock(); 2633 wq = rcu_dereference(sk->sk_wq); 2634 if (skwq_has_sleeper(wq)) 2635 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI | 2636 EPOLLRDNORM | EPOLLRDBAND); 2637 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); 2638 rcu_read_unlock(); 2639 } 2640 2641 static void sock_def_write_space(struct sock *sk) 2642 { 2643 struct socket_wq *wq; 2644 2645 rcu_read_lock(); 2646 2647 /* Do not wake up a writer until he can make "significant" 2648 * progress. --DaveM 2649 */ 2650 if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) { 2651 wq = rcu_dereference(sk->sk_wq); 2652 if (skwq_has_sleeper(wq)) 2653 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | 2654 EPOLLWRNORM | EPOLLWRBAND); 2655 2656 /* Should agree with poll, otherwise some programs break */ 2657 if (sock_writeable(sk)) 2658 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); 2659 } 2660 2661 rcu_read_unlock(); 2662 } 2663 2664 static void sock_def_destruct(struct sock *sk) 2665 { 2666 } 2667 2668 void sk_send_sigurg(struct sock *sk) 2669 { 2670 if (sk->sk_socket && sk->sk_socket->file) 2671 if (send_sigurg(&sk->sk_socket->file->f_owner)) 2672 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI); 2673 } 2674 EXPORT_SYMBOL(sk_send_sigurg); 2675 2676 void sk_reset_timer(struct sock *sk, struct timer_list* timer, 2677 unsigned long expires) 2678 { 2679 if (!mod_timer(timer, expires)) 2680 sock_hold(sk); 2681 } 2682 EXPORT_SYMBOL(sk_reset_timer); 2683 2684 void sk_stop_timer(struct sock *sk, struct timer_list* timer) 2685 { 2686 if (del_timer(timer)) 2687 __sock_put(sk); 2688 } 2689 EXPORT_SYMBOL(sk_stop_timer); 2690 2691 void sock_init_data(struct socket *sock, struct sock *sk) 2692 { 2693 sk_init_common(sk); 2694 sk->sk_send_head = NULL; 2695 2696 timer_setup(&sk->sk_timer, NULL, 0); 2697 2698 sk->sk_allocation = GFP_KERNEL; 2699 sk->sk_rcvbuf = sysctl_rmem_default; 2700 sk->sk_sndbuf = sysctl_wmem_default; 2701 sk->sk_state = TCP_CLOSE; 2702 sk_set_socket(sk, sock); 2703 2704 sock_set_flag(sk, SOCK_ZAPPED); 2705 2706 if (sock) { 2707 sk->sk_type = sock->type; 2708 sk->sk_wq = sock->wq; 2709 sock->sk = sk; 2710 sk->sk_uid = SOCK_INODE(sock)->i_uid; 2711 } else { 2712 sk->sk_wq = NULL; 2713 sk->sk_uid = make_kuid(sock_net(sk)->user_ns, 0); 2714 } 2715 2716 rwlock_init(&sk->sk_callback_lock); 2717 if (sk->sk_kern_sock) 2718 lockdep_set_class_and_name( 2719 &sk->sk_callback_lock, 2720 af_kern_callback_keys + sk->sk_family, 2721 af_family_kern_clock_key_strings[sk->sk_family]); 2722 else 2723 lockdep_set_class_and_name( 2724 &sk->sk_callback_lock, 2725 af_callback_keys + sk->sk_family, 2726 af_family_clock_key_strings[sk->sk_family]); 2727 2728 sk->sk_state_change = sock_def_wakeup; 2729 sk->sk_data_ready = sock_def_readable; 2730 sk->sk_write_space = sock_def_write_space; 2731 sk->sk_error_report = sock_def_error_report; 2732 sk->sk_destruct = sock_def_destruct; 2733 2734 sk->sk_frag.page = NULL; 2735 
sk->sk_frag.offset = 0; 2736 sk->sk_peek_off = -1; 2737 2738 sk->sk_peer_pid = NULL; 2739 sk->sk_peer_cred = NULL; 2740 sk->sk_write_pending = 0; 2741 sk->sk_rcvlowat = 1; 2742 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; 2743 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; 2744 2745 sk->sk_stamp = SK_DEFAULT_STAMP; 2746 atomic_set(&sk->sk_zckey, 0); 2747 2748 #ifdef CONFIG_NET_RX_BUSY_POLL 2749 sk->sk_napi_id = 0; 2750 sk->sk_ll_usec = sysctl_net_busy_read; 2751 #endif 2752 2753 sk->sk_max_pacing_rate = ~0U; 2754 sk->sk_pacing_rate = ~0U; 2755 sk->sk_pacing_shift = 10; 2756 sk->sk_incoming_cpu = -1; 2757 /* 2758 * Before updating sk_refcnt, we must commit prior changes to memory 2759 * (Documentation/RCU/rculist_nulls.txt for details) 2760 */ 2761 smp_wmb(); 2762 refcount_set(&sk->sk_refcnt, 1); 2763 atomic_set(&sk->sk_drops, 0); 2764 } 2765 EXPORT_SYMBOL(sock_init_data); 2766 2767 void lock_sock_nested(struct sock *sk, int subclass) 2768 { 2769 might_sleep(); 2770 spin_lock_bh(&sk->sk_lock.slock); 2771 if (sk->sk_lock.owned) 2772 __lock_sock(sk); 2773 sk->sk_lock.owned = 1; 2774 spin_unlock(&sk->sk_lock.slock); 2775 /* 2776 * The sk_lock has mutex_lock() semantics here: 2777 */ 2778 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_); 2779 local_bh_enable(); 2780 } 2781 EXPORT_SYMBOL(lock_sock_nested); 2782 2783 void release_sock(struct sock *sk) 2784 { 2785 spin_lock_bh(&sk->sk_lock.slock); 2786 if (sk->sk_backlog.tail) 2787 __release_sock(sk); 2788 2789 /* Warning : release_cb() might need to release sk ownership, 2790 * ie call sock_release_ownership(sk) before us. 2791 */ 2792 if (sk->sk_prot->release_cb) 2793 sk->sk_prot->release_cb(sk); 2794 2795 sock_release_ownership(sk); 2796 if (waitqueue_active(&sk->sk_lock.wq)) 2797 wake_up(&sk->sk_lock.wq); 2798 spin_unlock_bh(&sk->sk_lock.slock); 2799 } 2800 EXPORT_SYMBOL(release_sock); 2801 2802 /** 2803 * lock_sock_fast - fast version of lock_sock 2804 * @sk: socket 2805 * 2806 * This version should be used for very small section, where process wont block 2807 * return false if fast path is taken: 2808 * 2809 * sk_lock.slock locked, owned = 0, BH disabled 2810 * 2811 * return true if slow path is taken: 2812 * 2813 * sk_lock.slock unlocked, owned = 1, BH enabled 2814 */ 2815 bool lock_sock_fast(struct sock *sk) 2816 { 2817 might_sleep(); 2818 spin_lock_bh(&sk->sk_lock.slock); 2819 2820 if (!sk->sk_lock.owned) 2821 /* 2822 * Note : We must disable BH 2823 */ 2824 return false; 2825 2826 __lock_sock(sk); 2827 sk->sk_lock.owned = 1; 2828 spin_unlock(&sk->sk_lock.slock); 2829 /* 2830 * The sk_lock has mutex_lock() semantics here: 2831 */ 2832 mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_); 2833 local_bh_enable(); 2834 return true; 2835 } 2836 EXPORT_SYMBOL(lock_sock_fast); 2837 2838 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp) 2839 { 2840 struct timeval tv; 2841 if (!sock_flag(sk, SOCK_TIMESTAMP)) 2842 sock_enable_timestamp(sk, SOCK_TIMESTAMP); 2843 tv = ktime_to_timeval(sk->sk_stamp); 2844 if (tv.tv_sec == -1) 2845 return -ENOENT; 2846 if (tv.tv_sec == 0) { 2847 sk->sk_stamp = ktime_get_real(); 2848 tv = ktime_to_timeval(sk->sk_stamp); 2849 } 2850 return copy_to_user(userstamp, &tv, sizeof(tv)) ? 
-EFAULT : 0; 2851 } 2852 EXPORT_SYMBOL(sock_get_timestamp); 2853 2854 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp) 2855 { 2856 struct timespec ts; 2857 if (!sock_flag(sk, SOCK_TIMESTAMP)) 2858 sock_enable_timestamp(sk, SOCK_TIMESTAMP); 2859 ts = ktime_to_timespec(sk->sk_stamp); 2860 if (ts.tv_sec == -1) 2861 return -ENOENT; 2862 if (ts.tv_sec == 0) { 2863 sk->sk_stamp = ktime_get_real(); 2864 ts = ktime_to_timespec(sk->sk_stamp); 2865 } 2866 return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0; 2867 } 2868 EXPORT_SYMBOL(sock_get_timestampns); 2869 2870 void sock_enable_timestamp(struct sock *sk, int flag) 2871 { 2872 if (!sock_flag(sk, flag)) { 2873 unsigned long previous_flags = sk->sk_flags; 2874 2875 sock_set_flag(sk, flag); 2876 /* 2877 * we just set one of the two flags which require net 2878 * time stamping, but time stamping might have been on 2879 * already because of the other one 2880 */ 2881 if (sock_needs_netstamp(sk) && 2882 !(previous_flags & SK_FLAGS_TIMESTAMP)) 2883 net_enable_timestamp(); 2884 } 2885 } 2886 2887 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, 2888 int level, int type) 2889 { 2890 struct sock_exterr_skb *serr; 2891 struct sk_buff *skb; 2892 int copied, err; 2893 2894 err = -EAGAIN; 2895 skb = sock_dequeue_err_skb(sk); 2896 if (skb == NULL) 2897 goto out; 2898 2899 copied = skb->len; 2900 if (copied > len) { 2901 msg->msg_flags |= MSG_TRUNC; 2902 copied = len; 2903 } 2904 err = skb_copy_datagram_msg(skb, 0, msg, copied); 2905 if (err) 2906 goto out_free_skb; 2907 2908 sock_recv_timestamp(msg, sk, skb); 2909 2910 serr = SKB_EXT_ERR(skb); 2911 put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee); 2912 2913 msg->msg_flags |= MSG_ERRQUEUE; 2914 err = copied; 2915 2916 out_free_skb: 2917 kfree_skb(skb); 2918 out: 2919 return err; 2920 } 2921 EXPORT_SYMBOL(sock_recv_errqueue); 2922 2923 /* 2924 * Get a socket option on an socket. 2925 * 2926 * FIX: POSIX 1003.1g is very ambiguous here. It states that 2927 * asynchronous errors should be reported by getsockopt. We assume 2928 * this means if you specify SO_ERROR (otherwise whats the point of it). 2929 */ 2930 int sock_common_getsockopt(struct socket *sock, int level, int optname, 2931 char __user *optval, int __user *optlen) 2932 { 2933 struct sock *sk = sock->sk; 2934 2935 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen); 2936 } 2937 EXPORT_SYMBOL(sock_common_getsockopt); 2938 2939 #ifdef CONFIG_COMPAT 2940 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname, 2941 char __user *optval, int __user *optlen) 2942 { 2943 struct sock *sk = sock->sk; 2944 2945 if (sk->sk_prot->compat_getsockopt != NULL) 2946 return sk->sk_prot->compat_getsockopt(sk, level, optname, 2947 optval, optlen); 2948 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen); 2949 } 2950 EXPORT_SYMBOL(compat_sock_common_getsockopt); 2951 #endif 2952 2953 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, 2954 int flags) 2955 { 2956 struct sock *sk = sock->sk; 2957 int addr_len = 0; 2958 int err; 2959 2960 err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT, 2961 flags & ~MSG_DONTWAIT, &addr_len); 2962 if (err >= 0) 2963 msg->msg_namelen = addr_len; 2964 return err; 2965 } 2966 EXPORT_SYMBOL(sock_common_recvmsg); 2967 2968 /* 2969 * Set socket options on an inet socket. 
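 *
 * Illustrative note (editor's addition; the TCP names are used only as
 * an example): address families typically point their proto_ops at
 * these common helpers, which then dispatch through sk->sk_prot, e.g.
 *
 *	.setsockopt = sock_common_setsockopt,	which calls tcp_setsockopt() for TCP
 *	.getsockopt = sock_common_getsockopt,	which calls tcp_getsockopt() for TCP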
2970 */ 2971 int sock_common_setsockopt(struct socket *sock, int level, int optname, 2972 char __user *optval, unsigned int optlen) 2973 { 2974 struct sock *sk = sock->sk; 2975 2976 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen); 2977 } 2978 EXPORT_SYMBOL(sock_common_setsockopt); 2979 2980 #ifdef CONFIG_COMPAT 2981 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname, 2982 char __user *optval, unsigned int optlen) 2983 { 2984 struct sock *sk = sock->sk; 2985 2986 if (sk->sk_prot->compat_setsockopt != NULL) 2987 return sk->sk_prot->compat_setsockopt(sk, level, optname, 2988 optval, optlen); 2989 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen); 2990 } 2991 EXPORT_SYMBOL(compat_sock_common_setsockopt); 2992 #endif 2993 2994 void sk_common_release(struct sock *sk) 2995 { 2996 if (sk->sk_prot->destroy) 2997 sk->sk_prot->destroy(sk); 2998 2999 /* 3000 * Observation: when sock_common_release is called, processes have 3001 * no access to socket. But net still has. 3002 * Step one, detach it from networking: 3003 * 3004 * A. Remove from hash tables. 3005 */ 3006 3007 sk->sk_prot->unhash(sk); 3008 3009 /* 3010 * In this point socket cannot receive new packets, but it is possible 3011 * that some packets are in flight because some CPU runs receiver and 3012 * did hash table lookup before we unhashed socket. They will achieve 3013 * receive queue and will be purged by socket destructor. 3014 * 3015 * Also we still have packets pending on receive queue and probably, 3016 * our own packets waiting in device queues. sock_destroy will drain 3017 * receive queue, but transmitted packets will delay socket destruction 3018 * until the last reference will be released. 3019 */ 3020 3021 sock_orphan(sk); 3022 3023 xfrm_sk_free_policy(sk); 3024 3025 sk_refcnt_debug_release(sk); 3026 3027 sock_put(sk); 3028 } 3029 EXPORT_SYMBOL(sk_common_release); 3030 3031 void sk_get_meminfo(const struct sock *sk, u32 *mem) 3032 { 3033 memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS); 3034 3035 mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk); 3036 mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf; 3037 mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk); 3038 mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf; 3039 mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc; 3040 mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued; 3041 mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); 3042 mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len; 3043 mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops); 3044 } 3045 3046 #ifdef CONFIG_PROC_FS 3047 #define PROTO_INUSE_NR 64 /* should be enough for the first time */ 3048 struct prot_inuse { 3049 int val[PROTO_INUSE_NR]; 3050 }; 3051 3052 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR); 3053 3054 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val) 3055 { 3056 __this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val); 3057 } 3058 EXPORT_SYMBOL_GPL(sock_prot_inuse_add); 3059 3060 int sock_prot_inuse_get(struct net *net, struct proto *prot) 3061 { 3062 int cpu, idx = prot->inuse_idx; 3063 int res = 0; 3064 3065 for_each_possible_cpu(cpu) 3066 res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx]; 3067 3068 return res >= 0 ? 
res : 0; 3069 } 3070 EXPORT_SYMBOL_GPL(sock_prot_inuse_get); 3071 3072 static void sock_inuse_add(struct net *net, int val) 3073 { 3074 this_cpu_add(*net->core.sock_inuse, val); 3075 } 3076 3077 int sock_inuse_get(struct net *net) 3078 { 3079 int cpu, res = 0; 3080 3081 for_each_possible_cpu(cpu) 3082 res += *per_cpu_ptr(net->core.sock_inuse, cpu); 3083 3084 return res; 3085 } 3086 3087 EXPORT_SYMBOL_GPL(sock_inuse_get); 3088 3089 static int __net_init sock_inuse_init_net(struct net *net) 3090 { 3091 net->core.prot_inuse = alloc_percpu(struct prot_inuse); 3092 if (net->core.prot_inuse == NULL) 3093 return -ENOMEM; 3094 3095 net->core.sock_inuse = alloc_percpu(int); 3096 if (net->core.sock_inuse == NULL) 3097 goto out; 3098 3099 return 0; 3100 3101 out: 3102 free_percpu(net->core.prot_inuse); 3103 return -ENOMEM; 3104 } 3105 3106 static void __net_exit sock_inuse_exit_net(struct net *net) 3107 { 3108 free_percpu(net->core.prot_inuse); 3109 free_percpu(net->core.sock_inuse); 3110 } 3111 3112 static struct pernet_operations net_inuse_ops = { 3113 .init = sock_inuse_init_net, 3114 .exit = sock_inuse_exit_net, 3115 .async = true, 3116 }; 3117 3118 static __init int net_inuse_init(void) 3119 { 3120 if (register_pernet_subsys(&net_inuse_ops)) 3121 panic("Cannot initialize net inuse counters"); 3122 3123 return 0; 3124 } 3125 3126 core_initcall(net_inuse_init); 3127 3128 static void assign_proto_idx(struct proto *prot) 3129 { 3130 prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR); 3131 3132 if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) { 3133 pr_err("PROTO_INUSE_NR exhausted\n"); 3134 return; 3135 } 3136 3137 set_bit(prot->inuse_idx, proto_inuse_idx); 3138 } 3139 3140 static void release_proto_idx(struct proto *prot) 3141 { 3142 if (prot->inuse_idx != PROTO_INUSE_NR - 1) 3143 clear_bit(prot->inuse_idx, proto_inuse_idx); 3144 } 3145 #else 3146 static inline void assign_proto_idx(struct proto *prot) 3147 { 3148 } 3149 3150 static inline void release_proto_idx(struct proto *prot) 3151 { 3152 } 3153 3154 static void sock_inuse_add(struct net *net, int val) 3155 { 3156 } 3157 #endif 3158 3159 static void req_prot_cleanup(struct request_sock_ops *rsk_prot) 3160 { 3161 if (!rsk_prot) 3162 return; 3163 kfree(rsk_prot->slab_name); 3164 rsk_prot->slab_name = NULL; 3165 kmem_cache_destroy(rsk_prot->slab); 3166 rsk_prot->slab = NULL; 3167 } 3168 3169 static int req_prot_init(const struct proto *prot) 3170 { 3171 struct request_sock_ops *rsk_prot = prot->rsk_prot; 3172 3173 if (!rsk_prot) 3174 return 0; 3175 3176 rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", 3177 prot->name); 3178 if (!rsk_prot->slab_name) 3179 return -ENOMEM; 3180 3181 rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name, 3182 rsk_prot->obj_size, 0, 3183 prot->slab_flags, NULL); 3184 3185 if (!rsk_prot->slab) { 3186 pr_crit("%s: Can't create request sock SLAB cache!\n", 3187 prot->name); 3188 return -ENOMEM; 3189 } 3190 return 0; 3191 } 3192 3193 int proto_register(struct proto *prot, int alloc_slab) 3194 { 3195 if (alloc_slab) { 3196 prot->slab = kmem_cache_create_usercopy(prot->name, 3197 prot->obj_size, 0, 3198 SLAB_HWCACHE_ALIGN | prot->slab_flags, 3199 prot->useroffset, prot->usersize, 3200 NULL); 3201 3202 if (prot->slab == NULL) { 3203 pr_crit("%s: Can't create sock SLAB cache!\n", 3204 prot->name); 3205 goto out; 3206 } 3207 3208 if (req_prot_init(prot)) 3209 goto out_free_request_sock_slab; 3210 3211 if (prot->twsk_prot != NULL) { 3212 prot->twsk_prot->twsk_slab_name = 
kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name); 3213 3214 if (prot->twsk_prot->twsk_slab_name == NULL) 3215 goto out_free_request_sock_slab; 3216 3217 prot->twsk_prot->twsk_slab = 3218 kmem_cache_create(prot->twsk_prot->twsk_slab_name, 3219 prot->twsk_prot->twsk_obj_size, 3220 0, 3221 prot->slab_flags, 3222 NULL); 3223 if (prot->twsk_prot->twsk_slab == NULL) 3224 goto out_free_timewait_sock_slab_name; 3225 } 3226 } 3227 3228 mutex_lock(&proto_list_mutex); 3229 list_add(&prot->node, &proto_list); 3230 assign_proto_idx(prot); 3231 mutex_unlock(&proto_list_mutex); 3232 return 0; 3233 3234 out_free_timewait_sock_slab_name: 3235 kfree(prot->twsk_prot->twsk_slab_name); 3236 out_free_request_sock_slab: 3237 req_prot_cleanup(prot->rsk_prot); 3238 3239 kmem_cache_destroy(prot->slab); 3240 prot->slab = NULL; 3241 out: 3242 return -ENOBUFS; 3243 } 3244 EXPORT_SYMBOL(proto_register); 3245 3246 void proto_unregister(struct proto *prot) 3247 { 3248 mutex_lock(&proto_list_mutex); 3249 release_proto_idx(prot); 3250 list_del(&prot->node); 3251 mutex_unlock(&proto_list_mutex); 3252 3253 kmem_cache_destroy(prot->slab); 3254 prot->slab = NULL; 3255 3256 req_prot_cleanup(prot->rsk_prot); 3257 3258 if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) { 3259 kmem_cache_destroy(prot->twsk_prot->twsk_slab); 3260 kfree(prot->twsk_prot->twsk_slab_name); 3261 prot->twsk_prot->twsk_slab = NULL; 3262 } 3263 } 3264 EXPORT_SYMBOL(proto_unregister); 3265 3266 #ifdef CONFIG_PROC_FS 3267 static void *proto_seq_start(struct seq_file *seq, loff_t *pos) 3268 __acquires(proto_list_mutex) 3269 { 3270 mutex_lock(&proto_list_mutex); 3271 return seq_list_start_head(&proto_list, *pos); 3272 } 3273 3274 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3275 { 3276 return seq_list_next(v, &proto_list, pos); 3277 } 3278 3279 static void proto_seq_stop(struct seq_file *seq, void *v) 3280 __releases(proto_list_mutex) 3281 { 3282 mutex_unlock(&proto_list_mutex); 3283 } 3284 3285 static char proto_method_implemented(const void *method) 3286 { 3287 return method == NULL ? 'n' : 'y'; 3288 } 3289 static long sock_prot_memory_allocated(struct proto *proto) 3290 { 3291 return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L; 3292 } 3293 3294 static char *sock_prot_memory_pressure(struct proto *proto) 3295 { 3296 return proto->memory_pressure != NULL ? 3297 proto_memory_pressure(proto) ? "yes" : "no" : "NI"; 3298 } 3299 3300 static void proto_seq_printf(struct seq_file *seq, struct proto *proto) 3301 { 3302 3303 seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s " 3304 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n", 3305 proto->name, 3306 proto->obj_size, 3307 sock_prot_inuse_get(seq_file_net(seq), proto), 3308 sock_prot_memory_allocated(proto), 3309 sock_prot_memory_pressure(proto), 3310 proto->max_header, 3311 proto->slab == NULL ? 
"no" : "yes", 3312 module_name(proto->owner), 3313 proto_method_implemented(proto->close), 3314 proto_method_implemented(proto->connect), 3315 proto_method_implemented(proto->disconnect), 3316 proto_method_implemented(proto->accept), 3317 proto_method_implemented(proto->ioctl), 3318 proto_method_implemented(proto->init), 3319 proto_method_implemented(proto->destroy), 3320 proto_method_implemented(proto->shutdown), 3321 proto_method_implemented(proto->setsockopt), 3322 proto_method_implemented(proto->getsockopt), 3323 proto_method_implemented(proto->sendmsg), 3324 proto_method_implemented(proto->recvmsg), 3325 proto_method_implemented(proto->sendpage), 3326 proto_method_implemented(proto->bind), 3327 proto_method_implemented(proto->backlog_rcv), 3328 proto_method_implemented(proto->hash), 3329 proto_method_implemented(proto->unhash), 3330 proto_method_implemented(proto->get_port), 3331 proto_method_implemented(proto->enter_memory_pressure)); 3332 } 3333 3334 static int proto_seq_show(struct seq_file *seq, void *v) 3335 { 3336 if (v == &proto_list) 3337 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s", 3338 "protocol", 3339 "size", 3340 "sockets", 3341 "memory", 3342 "press", 3343 "maxhdr", 3344 "slab", 3345 "module", 3346 "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n"); 3347 else 3348 proto_seq_printf(seq, list_entry(v, struct proto, node)); 3349 return 0; 3350 } 3351 3352 static const struct seq_operations proto_seq_ops = { 3353 .start = proto_seq_start, 3354 .next = proto_seq_next, 3355 .stop = proto_seq_stop, 3356 .show = proto_seq_show, 3357 }; 3358 3359 static int proto_seq_open(struct inode *inode, struct file *file) 3360 { 3361 return seq_open_net(inode, file, &proto_seq_ops, 3362 sizeof(struct seq_net_private)); 3363 } 3364 3365 static const struct file_operations proto_seq_fops = { 3366 .open = proto_seq_open, 3367 .read = seq_read, 3368 .llseek = seq_lseek, 3369 .release = seq_release_net, 3370 }; 3371 3372 static __net_init int proto_init_net(struct net *net) 3373 { 3374 if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops)) 3375 return -ENOMEM; 3376 3377 return 0; 3378 } 3379 3380 static __net_exit void proto_exit_net(struct net *net) 3381 { 3382 remove_proc_entry("protocols", net->proc_net); 3383 } 3384 3385 3386 static __net_initdata struct pernet_operations proto_net_ops = { 3387 .init = proto_init_net, 3388 .exit = proto_exit_net, 3389 .async = true, 3390 }; 3391 3392 static int __init proto_init(void) 3393 { 3394 return register_pernet_subsys(&proto_net_ops); 3395 } 3396 3397 subsys_initcall(proto_init); 3398 3399 #endif /* PROC_FS */ 3400 3401 #ifdef CONFIG_NET_RX_BUSY_POLL 3402 bool sk_busy_loop_end(void *p, unsigned long start_time) 3403 { 3404 struct sock *sk = p; 3405 3406 return !skb_queue_empty(&sk->sk_receive_queue) || 3407 sk_busy_loop_timeout(sk, start_time); 3408 } 3409 EXPORT_SYMBOL(sk_busy_loop_end); 3410 #endif /* CONFIG_NET_RX_BUSY_POLL */ 3411