// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic socket support routines. Memory allocators, socket lock/release
 *		handler for protocols to use and generic option handler.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Fixes:
 *		Alan Cox	:	Numerous verify_area() problems
 *		Alan Cox	:	Connecting on a connecting socket
 *					now returns an error for tcp.
 *		Alan Cox	:	sock->protocol is set correctly.
 *					and is not sometimes left as 0.
 *		Alan Cox	:	connect handles icmp errors on a
 *					connect properly. Unfortunately there
 *					is a restart syscall nasty there. I
 *					can't match BSD without hacking the C
 *					library. Ideas urgently sought!
 *		Alan Cox	:	Disallow bind() to addresses that are
 *					not ours - especially broadcast ones!!
 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
 *					instead they leave that for the DESTROY timer.
 *		Alan Cox	:	Clean up error flag in accept
 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
 *					was buggy. Put a remove_sock() in the handler
 *					for memory when we hit 0. Also altered the timer
 *					code. The ACK stuff can wait and needs major
 *					TCP layer surgery.
 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
 *					and fixed timer/inet_bh race.
 *		Alan Cox	:	Added zapped flag for TCP
 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
 *		Pauline Middelink	:	identd support
 *		Alan Cox	:	Fixed connect() taking signals I think.
 *		Alan Cox	:	SO_LINGER supported
 *		Alan Cox	:	Error reporting fixes
 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
 *		Alan Cox	:	inet sockets don't set sk->type!
 *		Alan Cox	:	Split socket option code
 *		Alan Cox	:	Callbacks
 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
 *		Alex		:	Removed restriction on inet fioctl
 *		Alan Cox	:	Splitting INET from NET core
 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
 *		Alan Cox	:	Split IP from generic code
 *		Alan Cox	:	New kfree_skbmem()
 *		Alan Cox	:	Make SO_DEBUG superuser only.
 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
 *					(compatibility fix)
 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
 *		Alan Cox	:	Allocator for a socket is settable.
 *		Alan Cox	:	SO_ERROR includes soft errors.
 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
 *		Alan Cox	:	Generic socket allocation to make hooks
 *					easier (suggested by Craig Metz).
 *		Michael Pall	:	SO_ERROR returns positive errno again
 *		Steve Whitehouse:	Added default destructor to free
 *					protocol private data.
 *		Steve Whitehouse:	Added various other default routines
 *					common to several socket families.
 *		Chris Evans	:	Call suser() check last on F_SETOWN
 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
 *		Andi Kleen	:	Fix write_space callback
 *		Chris Evans	:	Security fixes - signedness again
 *		Arnaldo C. Melo	:	cleanups, use skb_queue_purge
 *
 * To Fix:
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <asm/unaligned.h>
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/errqueue.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/user_namespace.h>
#include <linux/static_key.h>
#include <linux/memcontrol.h>
#include <linux/prefetch.h>

#include <linux/uaccess.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/request_sock.h>
#include <net/sock.h>
#include <linux/net_tstamp.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>
#include <net/cls_cgroup.h>
#include <net/netprio_cgroup.h>
#include <linux/sock_diag.h>

#include <linux/filter.h>
#include <net/sock_reuseport.h>
#include <net/bpf_sk_storage.h>

#include <trace/events/sock.h>

#include <net/tcp.h>
#include <net/busy_poll.h>

static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);

static void sock_inuse_add(struct net *net, int val);

/**
 * sk_ns_capable - General socket capability test
 * @sk: Socket to use a capability on or through
 * @user_ns: The user namespace of the capability to use
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when
 * the socket was created and whether the current process has it in the
 * user namespace @user_ns.
 */
bool sk_ns_capable(const struct sock *sk,
		   struct user_namespace *user_ns, int cap)
{
	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
		ns_capable(user_ns, cap);
}
EXPORT_SYMBOL(sk_ns_capable);

/**
 * sk_capable - Socket global capability test
 * @sk: Socket to use a capability on or through
 * @cap: The global capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when
 * the socket was created and whether the current process has it in all
 * user namespaces.
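 *
 * A typical caller gates a privileged operation on the result; a minimal
 * illustrative sketch (not tied to any particular option) would be:
 *
 *	if (val && !sk_capable(sk, CAP_NET_ADMIN))
 *		return -EPERM;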
172 */ 173 bool sk_capable(const struct sock *sk, int cap) 174 { 175 return sk_ns_capable(sk, &init_user_ns, cap); 176 } 177 EXPORT_SYMBOL(sk_capable); 178 179 /** 180 * sk_net_capable - Network namespace socket capability test 181 * @sk: Socket to use a capability on or through 182 * @cap: The capability to use 183 * 184 * Test to see if the opener of the socket had when the socket was created 185 * and the current process has the capability @cap over the network namespace 186 * the socket is a member of. 187 */ 188 bool sk_net_capable(const struct sock *sk, int cap) 189 { 190 return sk_ns_capable(sk, sock_net(sk)->user_ns, cap); 191 } 192 EXPORT_SYMBOL(sk_net_capable); 193 194 /* 195 * Each address family might have different locking rules, so we have 196 * one slock key per address family and separate keys for internal and 197 * userspace sockets. 198 */ 199 static struct lock_class_key af_family_keys[AF_MAX]; 200 static struct lock_class_key af_family_kern_keys[AF_MAX]; 201 static struct lock_class_key af_family_slock_keys[AF_MAX]; 202 static struct lock_class_key af_family_kern_slock_keys[AF_MAX]; 203 204 /* 205 * Make lock validator output more readable. (we pre-construct these 206 * strings build-time, so that runtime initialization of socket 207 * locks is fast): 208 */ 209 210 #define _sock_locks(x) \ 211 x "AF_UNSPEC", x "AF_UNIX" , x "AF_INET" , \ 212 x "AF_AX25" , x "AF_IPX" , x "AF_APPLETALK", \ 213 x "AF_NETROM", x "AF_BRIDGE" , x "AF_ATMPVC" , \ 214 x "AF_X25" , x "AF_INET6" , x "AF_ROSE" , \ 215 x "AF_DECnet", x "AF_NETBEUI" , x "AF_SECURITY" , \ 216 x "AF_KEY" , x "AF_NETLINK" , x "AF_PACKET" , \ 217 x "AF_ASH" , x "AF_ECONET" , x "AF_ATMSVC" , \ 218 x "AF_RDS" , x "AF_SNA" , x "AF_IRDA" , \ 219 x "AF_PPPOX" , x "AF_WANPIPE" , x "AF_LLC" , \ 220 x "27" , x "28" , x "AF_CAN" , \ 221 x "AF_TIPC" , x "AF_BLUETOOTH", x "IUCV" , \ 222 x "AF_RXRPC" , x "AF_ISDN" , x "AF_PHONET" , \ 223 x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \ 224 x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \ 225 x "AF_QIPCRTR", x "AF_SMC" , x "AF_XDP" , \ 226 x "AF_MAX" 227 228 static const char *const af_family_key_strings[AF_MAX+1] = { 229 _sock_locks("sk_lock-") 230 }; 231 static const char *const af_family_slock_key_strings[AF_MAX+1] = { 232 _sock_locks("slock-") 233 }; 234 static const char *const af_family_clock_key_strings[AF_MAX+1] = { 235 _sock_locks("clock-") 236 }; 237 238 static const char *const af_family_kern_key_strings[AF_MAX+1] = { 239 _sock_locks("k-sk_lock-") 240 }; 241 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = { 242 _sock_locks("k-slock-") 243 }; 244 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = { 245 _sock_locks("k-clock-") 246 }; 247 static const char *const af_family_rlock_key_strings[AF_MAX+1] = { 248 _sock_locks("rlock-") 249 }; 250 static const char *const af_family_wlock_key_strings[AF_MAX+1] = { 251 _sock_locks("wlock-") 252 }; 253 static const char *const af_family_elock_key_strings[AF_MAX+1] = { 254 _sock_locks("elock-") 255 }; 256 257 /* 258 * sk_callback_lock and sk queues locking rules are per-address-family, 259 * so split the lock classes by using a per-AF key: 260 */ 261 static struct lock_class_key af_callback_keys[AF_MAX]; 262 static struct lock_class_key af_rlock_keys[AF_MAX]; 263 static struct lock_class_key af_wlock_keys[AF_MAX]; 264 static struct lock_class_key af_elock_keys[AF_MAX]; 265 static struct lock_class_key af_kern_callback_keys[AF_MAX]; 266 267 /* Run time adjustable parameters. 
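 * These are exposed through sysctl as net.core.wmem_max, net.core.rmem_max,
 * net.core.wmem_default and net.core.rmem_default.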
*/ 268 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX; 269 EXPORT_SYMBOL(sysctl_wmem_max); 270 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX; 271 EXPORT_SYMBOL(sysctl_rmem_max); 272 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX; 273 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX; 274 275 /* Maximal space eaten by iovec or ancillary data plus some space */ 276 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512); 277 EXPORT_SYMBOL(sysctl_optmem_max); 278 279 int sysctl_tstamp_allow_data __read_mostly = 1; 280 281 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key); 282 EXPORT_SYMBOL_GPL(memalloc_socks_key); 283 284 /** 285 * sk_set_memalloc - sets %SOCK_MEMALLOC 286 * @sk: socket to set it on 287 * 288 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves. 289 * It's the responsibility of the admin to adjust min_free_kbytes 290 * to meet the requirements 291 */ 292 void sk_set_memalloc(struct sock *sk) 293 { 294 sock_set_flag(sk, SOCK_MEMALLOC); 295 sk->sk_allocation |= __GFP_MEMALLOC; 296 static_branch_inc(&memalloc_socks_key); 297 } 298 EXPORT_SYMBOL_GPL(sk_set_memalloc); 299 300 void sk_clear_memalloc(struct sock *sk) 301 { 302 sock_reset_flag(sk, SOCK_MEMALLOC); 303 sk->sk_allocation &= ~__GFP_MEMALLOC; 304 static_branch_dec(&memalloc_socks_key); 305 306 /* 307 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward 308 * progress of swapping. SOCK_MEMALLOC may be cleared while 309 * it has rmem allocations due to the last swapfile being deactivated 310 * but there is a risk that the socket is unusable due to exceeding 311 * the rmem limits. Reclaim the reserves and obey rmem limits again. 312 */ 313 sk_mem_reclaim(sk); 314 } 315 EXPORT_SYMBOL_GPL(sk_clear_memalloc); 316 317 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) 318 { 319 int ret; 320 unsigned int noreclaim_flag; 321 322 /* these should have been dropped before queueing */ 323 BUG_ON(!sock_flag(sk, SOCK_MEMALLOC)); 324 325 noreclaim_flag = memalloc_noreclaim_save(); 326 ret = sk->sk_backlog_rcv(sk, skb); 327 memalloc_noreclaim_restore(noreclaim_flag); 328 329 return ret; 330 } 331 EXPORT_SYMBOL(__sk_backlog_rcv); 332 333 static int sock_get_timeout(long timeo, void *optval, bool old_timeval) 334 { 335 struct __kernel_sock_timeval tv; 336 337 if (timeo == MAX_SCHEDULE_TIMEOUT) { 338 tv.tv_sec = 0; 339 tv.tv_usec = 0; 340 } else { 341 tv.tv_sec = timeo / HZ; 342 tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ; 343 } 344 345 if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) { 346 struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec }; 347 *(struct old_timeval32 *)optval = tv32; 348 return sizeof(tv32); 349 } 350 351 if (old_timeval) { 352 struct __kernel_old_timeval old_tv; 353 old_tv.tv_sec = tv.tv_sec; 354 old_tv.tv_usec = tv.tv_usec; 355 *(struct __kernel_old_timeval *)optval = old_tv; 356 return sizeof(old_tv); 357 } 358 359 *(struct __kernel_sock_timeval *)optval = tv; 360 return sizeof(tv); 361 } 362 363 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen, bool old_timeval) 364 { 365 struct __kernel_sock_timeval tv; 366 367 if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) { 368 struct old_timeval32 tv32; 369 370 if (optlen < sizeof(tv32)) 371 return -EINVAL; 372 373 if (copy_from_user(&tv32, optval, sizeof(tv32))) 374 return -EFAULT; 375 tv.tv_sec = tv32.tv_sec; 376 tv.tv_usec = tv32.tv_usec; 377 } else if (old_timeval) { 378 struct __kernel_old_timeval old_tv; 379 380 if (optlen < sizeof(old_tv)) 
381 return -EINVAL; 382 if (copy_from_user(&old_tv, optval, sizeof(old_tv))) 383 return -EFAULT; 384 tv.tv_sec = old_tv.tv_sec; 385 tv.tv_usec = old_tv.tv_usec; 386 } else { 387 if (optlen < sizeof(tv)) 388 return -EINVAL; 389 if (copy_from_user(&tv, optval, sizeof(tv))) 390 return -EFAULT; 391 } 392 if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC) 393 return -EDOM; 394 395 if (tv.tv_sec < 0) { 396 static int warned __read_mostly; 397 398 *timeo_p = 0; 399 if (warned < 10 && net_ratelimit()) { 400 warned++; 401 pr_info("%s: `%s' (pid %d) tries to set negative timeout\n", 402 __func__, current->comm, task_pid_nr(current)); 403 } 404 return 0; 405 } 406 *timeo_p = MAX_SCHEDULE_TIMEOUT; 407 if (tv.tv_sec == 0 && tv.tv_usec == 0) 408 return 0; 409 if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)) 410 *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ); 411 return 0; 412 } 413 414 static void sock_warn_obsolete_bsdism(const char *name) 415 { 416 static int warned; 417 static char warncomm[TASK_COMM_LEN]; 418 if (strcmp(warncomm, current->comm) && warned < 5) { 419 strcpy(warncomm, current->comm); 420 pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n", 421 warncomm, name); 422 warned++; 423 } 424 } 425 426 static bool sock_needs_netstamp(const struct sock *sk) 427 { 428 switch (sk->sk_family) { 429 case AF_UNSPEC: 430 case AF_UNIX: 431 return false; 432 default: 433 return true; 434 } 435 } 436 437 static void sock_disable_timestamp(struct sock *sk, unsigned long flags) 438 { 439 if (sk->sk_flags & flags) { 440 sk->sk_flags &= ~flags; 441 if (sock_needs_netstamp(sk) && 442 !(sk->sk_flags & SK_FLAGS_TIMESTAMP)) 443 net_disable_timestamp(); 444 } 445 } 446 447 448 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) 449 { 450 unsigned long flags; 451 struct sk_buff_head *list = &sk->sk_receive_queue; 452 453 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) { 454 atomic_inc(&sk->sk_drops); 455 trace_sock_rcvqueue_full(sk, skb); 456 return -ENOMEM; 457 } 458 459 if (!sk_rmem_schedule(sk, skb, skb->truesize)) { 460 atomic_inc(&sk->sk_drops); 461 return -ENOBUFS; 462 } 463 464 skb->dev = NULL; 465 skb_set_owner_r(skb, sk); 466 467 /* we escape from rcu protected region, make sure we dont leak 468 * a norefcounted dst 469 */ 470 skb_dst_force(skb); 471 472 spin_lock_irqsave(&list->lock, flags); 473 sock_skb_set_dropcount(sk, skb); 474 __skb_queue_tail(list, skb); 475 spin_unlock_irqrestore(&list->lock, flags); 476 477 if (!sock_flag(sk, SOCK_DEAD)) 478 sk->sk_data_ready(sk); 479 return 0; 480 } 481 EXPORT_SYMBOL(__sock_queue_rcv_skb); 482 483 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) 484 { 485 int err; 486 487 err = sk_filter(sk, skb); 488 if (err) 489 return err; 490 491 return __sock_queue_rcv_skb(sk, skb); 492 } 493 EXPORT_SYMBOL(sock_queue_rcv_skb); 494 495 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, 496 const int nested, unsigned int trim_cap, bool refcounted) 497 { 498 int rc = NET_RX_SUCCESS; 499 500 if (sk_filter_trim_cap(sk, skb, trim_cap)) 501 goto discard_and_relse; 502 503 skb->dev = NULL; 504 505 if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) { 506 atomic_inc(&sk->sk_drops); 507 goto discard_and_relse; 508 } 509 if (nested) 510 bh_lock_sock_nested(sk); 511 else 512 bh_lock_sock(sk); 513 if (!sock_owned_by_user(sk)) { 514 /* 515 * trylock + unlock semantics: 516 */ 517 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_); 518 519 rc = sk_backlog_rcv(sk, skb); 520 521 mutex_release(&sk->sk_lock.dep_map, 
_RET_IP_); 522 } else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) { 523 bh_unlock_sock(sk); 524 atomic_inc(&sk->sk_drops); 525 goto discard_and_relse; 526 } 527 528 bh_unlock_sock(sk); 529 out: 530 if (refcounted) 531 sock_put(sk); 532 return rc; 533 discard_and_relse: 534 kfree_skb(skb); 535 goto out; 536 } 537 EXPORT_SYMBOL(__sk_receive_skb); 538 539 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie) 540 { 541 struct dst_entry *dst = __sk_dst_get(sk); 542 543 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) { 544 sk_tx_queue_clear(sk); 545 sk->sk_dst_pending_confirm = 0; 546 RCU_INIT_POINTER(sk->sk_dst_cache, NULL); 547 dst_release(dst); 548 return NULL; 549 } 550 551 return dst; 552 } 553 EXPORT_SYMBOL(__sk_dst_check); 554 555 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie) 556 { 557 struct dst_entry *dst = sk_dst_get(sk); 558 559 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) { 560 sk_dst_reset(sk); 561 dst_release(dst); 562 return NULL; 563 } 564 565 return dst; 566 } 567 EXPORT_SYMBOL(sk_dst_check); 568 569 static int sock_bindtoindex_locked(struct sock *sk, int ifindex) 570 { 571 int ret = -ENOPROTOOPT; 572 #ifdef CONFIG_NETDEVICES 573 struct net *net = sock_net(sk); 574 575 /* Sorry... */ 576 ret = -EPERM; 577 if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW)) 578 goto out; 579 580 ret = -EINVAL; 581 if (ifindex < 0) 582 goto out; 583 584 sk->sk_bound_dev_if = ifindex; 585 if (sk->sk_prot->rehash) 586 sk->sk_prot->rehash(sk); 587 sk_dst_reset(sk); 588 589 ret = 0; 590 591 out: 592 #endif 593 594 return ret; 595 } 596 597 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk) 598 { 599 int ret; 600 601 if (lock_sk) 602 lock_sock(sk); 603 ret = sock_bindtoindex_locked(sk, ifindex); 604 if (lock_sk) 605 release_sock(sk); 606 607 return ret; 608 } 609 EXPORT_SYMBOL(sock_bindtoindex); 610 611 static int sock_setbindtodevice(struct sock *sk, char __user *optval, 612 int optlen) 613 { 614 int ret = -ENOPROTOOPT; 615 #ifdef CONFIG_NETDEVICES 616 struct net *net = sock_net(sk); 617 char devname[IFNAMSIZ]; 618 int index; 619 620 ret = -EINVAL; 621 if (optlen < 0) 622 goto out; 623 624 /* Bind this socket to a particular device like "eth0", 625 * as specified in the passed interface name. If the 626 * name is "" or the option length is zero the socket 627 * is not bound. 
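	 *
	 * From userspace this corresponds to something like the following
	 * (illustrative only):
	 *
	 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE,
	 *		   "eth0", strlen("eth0") + 1);
	 *
	 * while passing an empty string removes the binding again.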
628 */ 629 if (optlen > IFNAMSIZ - 1) 630 optlen = IFNAMSIZ - 1; 631 memset(devname, 0, sizeof(devname)); 632 633 ret = -EFAULT; 634 if (copy_from_user(devname, optval, optlen)) 635 goto out; 636 637 index = 0; 638 if (devname[0] != '\0') { 639 struct net_device *dev; 640 641 rcu_read_lock(); 642 dev = dev_get_by_name_rcu(net, devname); 643 if (dev) 644 index = dev->ifindex; 645 rcu_read_unlock(); 646 ret = -ENODEV; 647 if (!dev) 648 goto out; 649 } 650 651 return sock_bindtoindex(sk, index, true); 652 out: 653 #endif 654 655 return ret; 656 } 657 658 static int sock_getbindtodevice(struct sock *sk, char __user *optval, 659 int __user *optlen, int len) 660 { 661 int ret = -ENOPROTOOPT; 662 #ifdef CONFIG_NETDEVICES 663 struct net *net = sock_net(sk); 664 char devname[IFNAMSIZ]; 665 666 if (sk->sk_bound_dev_if == 0) { 667 len = 0; 668 goto zero; 669 } 670 671 ret = -EINVAL; 672 if (len < IFNAMSIZ) 673 goto out; 674 675 ret = netdev_get_name(net, devname, sk->sk_bound_dev_if); 676 if (ret) 677 goto out; 678 679 len = strlen(devname) + 1; 680 681 ret = -EFAULT; 682 if (copy_to_user(optval, devname, len)) 683 goto out; 684 685 zero: 686 ret = -EFAULT; 687 if (put_user(len, optlen)) 688 goto out; 689 690 ret = 0; 691 692 out: 693 #endif 694 695 return ret; 696 } 697 698 bool sk_mc_loop(struct sock *sk) 699 { 700 if (dev_recursion_level()) 701 return false; 702 if (!sk) 703 return true; 704 switch (sk->sk_family) { 705 case AF_INET: 706 return inet_sk(sk)->mc_loop; 707 #if IS_ENABLED(CONFIG_IPV6) 708 case AF_INET6: 709 return inet6_sk(sk)->mc_loop; 710 #endif 711 } 712 WARN_ON_ONCE(1); 713 return true; 714 } 715 EXPORT_SYMBOL(sk_mc_loop); 716 717 void sock_set_reuseaddr(struct sock *sk) 718 { 719 lock_sock(sk); 720 sk->sk_reuse = SK_CAN_REUSE; 721 release_sock(sk); 722 } 723 EXPORT_SYMBOL(sock_set_reuseaddr); 724 725 void sock_set_reuseport(struct sock *sk) 726 { 727 lock_sock(sk); 728 sk->sk_reuseport = true; 729 release_sock(sk); 730 } 731 EXPORT_SYMBOL(sock_set_reuseport); 732 733 void sock_no_linger(struct sock *sk) 734 { 735 lock_sock(sk); 736 sk->sk_lingertime = 0; 737 sock_set_flag(sk, SOCK_LINGER); 738 release_sock(sk); 739 } 740 EXPORT_SYMBOL(sock_no_linger); 741 742 void sock_set_priority(struct sock *sk, u32 priority) 743 { 744 lock_sock(sk); 745 sk->sk_priority = priority; 746 release_sock(sk); 747 } 748 EXPORT_SYMBOL(sock_set_priority); 749 750 void sock_set_sndtimeo(struct sock *sk, s64 secs) 751 { 752 lock_sock(sk); 753 if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1) 754 sk->sk_sndtimeo = secs * HZ; 755 else 756 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; 757 release_sock(sk); 758 } 759 EXPORT_SYMBOL(sock_set_sndtimeo); 760 761 static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns) 762 { 763 if (val) { 764 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new); 765 sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns); 766 sock_set_flag(sk, SOCK_RCVTSTAMP); 767 sock_enable_timestamp(sk, SOCK_TIMESTAMP); 768 } else { 769 sock_reset_flag(sk, SOCK_RCVTSTAMP); 770 sock_reset_flag(sk, SOCK_RCVTSTAMPNS); 771 sock_reset_flag(sk, SOCK_TSTAMP_NEW); 772 } 773 } 774 775 void sock_enable_timestamps(struct sock *sk) 776 { 777 lock_sock(sk); 778 __sock_set_timestamps(sk, true, false, true); 779 release_sock(sk); 780 } 781 EXPORT_SYMBOL(sock_enable_timestamps); 782 783 void sock_set_keepalive(struct sock *sk) 784 { 785 lock_sock(sk); 786 if (sk->sk_prot->keepalive) 787 sk->sk_prot->keepalive(sk, true); 788 sock_valbool_flag(sk, SOCK_KEEPOPEN, true); 789 release_sock(sk); 790 } 791 
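/* The sock_set_*() helpers above exist for in-kernel socket users, which
 * hold no file descriptor and therefore cannot go through setsockopt().
 * A hypothetical in-kernel user might do, roughly:
 *
 *	struct socket *sock;
 *
 *	err = sock_create_kern(net, AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
 *	if (!err) {
 *		sock_set_reuseaddr(sock->sk);
 *		sock_set_keepalive(sock->sk);
 *		sock_set_sndtimeo(sock->sk, 5);
 *	}
 */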
EXPORT_SYMBOL(sock_set_keepalive); 792 793 static void __sock_set_rcvbuf(struct sock *sk, int val) 794 { 795 /* Ensure val * 2 fits into an int, to prevent max_t() from treating it 796 * as a negative value. 797 */ 798 val = min_t(int, val, INT_MAX / 2); 799 sk->sk_userlocks |= SOCK_RCVBUF_LOCK; 800 801 /* We double it on the way in to account for "struct sk_buff" etc. 802 * overhead. Applications assume that the SO_RCVBUF setting they make 803 * will allow that much actual data to be received on that socket. 804 * 805 * Applications are unaware that "struct sk_buff" and other overheads 806 * allocate from the receive buffer during socket buffer allocation. 807 * 808 * And after considering the possible alternatives, returning the value 809 * we actually used in getsockopt is the most desirable behavior. 810 */ 811 WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF)); 812 } 813 814 void sock_set_rcvbuf(struct sock *sk, int val) 815 { 816 lock_sock(sk); 817 __sock_set_rcvbuf(sk, val); 818 release_sock(sk); 819 } 820 EXPORT_SYMBOL(sock_set_rcvbuf); 821 822 /* 823 * This is meant for all protocols to use and covers goings on 824 * at the socket level. Everything here is generic. 825 */ 826 827 int sock_setsockopt(struct socket *sock, int level, int optname, 828 char __user *optval, unsigned int optlen) 829 { 830 struct sock_txtime sk_txtime; 831 struct sock *sk = sock->sk; 832 int val; 833 int valbool; 834 struct linger ling; 835 int ret = 0; 836 837 /* 838 * Options without arguments 839 */ 840 841 if (optname == SO_BINDTODEVICE) 842 return sock_setbindtodevice(sk, optval, optlen); 843 844 if (optlen < sizeof(int)) 845 return -EINVAL; 846 847 if (get_user(val, (int __user *)optval)) 848 return -EFAULT; 849 850 valbool = val ? 1 : 0; 851 852 lock_sock(sk); 853 854 switch (optname) { 855 case SO_DEBUG: 856 if (val && !capable(CAP_NET_ADMIN)) 857 ret = -EACCES; 858 else 859 sock_valbool_flag(sk, SOCK_DBG, valbool); 860 break; 861 case SO_REUSEADDR: 862 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE); 863 break; 864 case SO_REUSEPORT: 865 sk->sk_reuseport = valbool; 866 break; 867 case SO_TYPE: 868 case SO_PROTOCOL: 869 case SO_DOMAIN: 870 case SO_ERROR: 871 ret = -ENOPROTOOPT; 872 break; 873 case SO_DONTROUTE: 874 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool); 875 sk_dst_reset(sk); 876 break; 877 case SO_BROADCAST: 878 sock_valbool_flag(sk, SOCK_BROADCAST, valbool); 879 break; 880 case SO_SNDBUF: 881 /* Don't error on this BSD doesn't and if you think 882 * about it this is right. Otherwise apps have to 883 * play 'guess the biggest size' games. RCVBUF/SNDBUF 884 * are treated in BSD as hints 885 */ 886 val = min_t(u32, val, sysctl_wmem_max); 887 set_sndbuf: 888 /* Ensure val * 2 fits into an int, to prevent max_t() 889 * from treating it as a negative value. 890 */ 891 val = min_t(int, val, INT_MAX / 2); 892 sk->sk_userlocks |= SOCK_SNDBUF_LOCK; 893 WRITE_ONCE(sk->sk_sndbuf, 894 max_t(int, val * 2, SOCK_MIN_SNDBUF)); 895 /* Wake up sending tasks if we upped the value. */ 896 sk->sk_write_space(sk); 897 break; 898 899 case SO_SNDBUFFORCE: 900 if (!capable(CAP_NET_ADMIN)) { 901 ret = -EPERM; 902 break; 903 } 904 905 /* No negative values (to prevent underflow, as val will be 906 * multiplied by 2). 907 */ 908 if (val < 0) 909 val = 0; 910 goto set_sndbuf; 911 912 case SO_RCVBUF: 913 /* Don't error on this BSD doesn't and if you think 914 * about it this is right. Otherwise apps have to 915 * play 'guess the biggest size' games. 
RCVBUF/SNDBUF 916 * are treated in BSD as hints 917 */ 918 __sock_set_rcvbuf(sk, min_t(u32, val, sysctl_rmem_max)); 919 break; 920 921 case SO_RCVBUFFORCE: 922 if (!capable(CAP_NET_ADMIN)) { 923 ret = -EPERM; 924 break; 925 } 926 927 /* No negative values (to prevent underflow, as val will be 928 * multiplied by 2). 929 */ 930 __sock_set_rcvbuf(sk, max(val, 0)); 931 break; 932 933 case SO_KEEPALIVE: 934 if (sk->sk_prot->keepalive) 935 sk->sk_prot->keepalive(sk, valbool); 936 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool); 937 break; 938 939 case SO_OOBINLINE: 940 sock_valbool_flag(sk, SOCK_URGINLINE, valbool); 941 break; 942 943 case SO_NO_CHECK: 944 sk->sk_no_check_tx = valbool; 945 break; 946 947 case SO_PRIORITY: 948 if ((val >= 0 && val <= 6) || 949 ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 950 sk->sk_priority = val; 951 else 952 ret = -EPERM; 953 break; 954 955 case SO_LINGER: 956 if (optlen < sizeof(ling)) { 957 ret = -EINVAL; /* 1003.1g */ 958 break; 959 } 960 if (copy_from_user(&ling, optval, sizeof(ling))) { 961 ret = -EFAULT; 962 break; 963 } 964 if (!ling.l_onoff) 965 sock_reset_flag(sk, SOCK_LINGER); 966 else { 967 #if (BITS_PER_LONG == 32) 968 if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ) 969 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT; 970 else 971 #endif 972 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ; 973 sock_set_flag(sk, SOCK_LINGER); 974 } 975 break; 976 977 case SO_BSDCOMPAT: 978 sock_warn_obsolete_bsdism("setsockopt"); 979 break; 980 981 case SO_PASSCRED: 982 if (valbool) 983 set_bit(SOCK_PASSCRED, &sock->flags); 984 else 985 clear_bit(SOCK_PASSCRED, &sock->flags); 986 break; 987 988 case SO_TIMESTAMP_OLD: 989 __sock_set_timestamps(sk, valbool, false, false); 990 break; 991 case SO_TIMESTAMP_NEW: 992 __sock_set_timestamps(sk, valbool, true, false); 993 break; 994 case SO_TIMESTAMPNS_OLD: 995 __sock_set_timestamps(sk, valbool, false, true); 996 break; 997 case SO_TIMESTAMPNS_NEW: 998 __sock_set_timestamps(sk, valbool, true, true); 999 break; 1000 case SO_TIMESTAMPING_NEW: 1001 sock_set_flag(sk, SOCK_TSTAMP_NEW); 1002 /* fall through */ 1003 case SO_TIMESTAMPING_OLD: 1004 if (val & ~SOF_TIMESTAMPING_MASK) { 1005 ret = -EINVAL; 1006 break; 1007 } 1008 1009 if (val & SOF_TIMESTAMPING_OPT_ID && 1010 !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) { 1011 if (sk->sk_protocol == IPPROTO_TCP && 1012 sk->sk_type == SOCK_STREAM) { 1013 if ((1 << sk->sk_state) & 1014 (TCPF_CLOSE | TCPF_LISTEN)) { 1015 ret = -EINVAL; 1016 break; 1017 } 1018 sk->sk_tskey = tcp_sk(sk)->snd_una; 1019 } else { 1020 sk->sk_tskey = 0; 1021 } 1022 } 1023 1024 if (val & SOF_TIMESTAMPING_OPT_STATS && 1025 !(val & SOF_TIMESTAMPING_OPT_TSONLY)) { 1026 ret = -EINVAL; 1027 break; 1028 } 1029 1030 sk->sk_tsflags = val; 1031 if (val & SOF_TIMESTAMPING_RX_SOFTWARE) 1032 sock_enable_timestamp(sk, 1033 SOCK_TIMESTAMPING_RX_SOFTWARE); 1034 else { 1035 if (optname == SO_TIMESTAMPING_NEW) 1036 sock_reset_flag(sk, SOCK_TSTAMP_NEW); 1037 1038 sock_disable_timestamp(sk, 1039 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)); 1040 } 1041 break; 1042 1043 case SO_RCVLOWAT: 1044 if (val < 0) 1045 val = INT_MAX; 1046 if (sock->ops->set_rcvlowat) 1047 ret = sock->ops->set_rcvlowat(sk, val); 1048 else 1049 WRITE_ONCE(sk->sk_rcvlowat, val ? 
: 1); 1050 break; 1051 1052 case SO_RCVTIMEO_OLD: 1053 case SO_RCVTIMEO_NEW: 1054 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen, optname == SO_RCVTIMEO_OLD); 1055 break; 1056 1057 case SO_SNDTIMEO_OLD: 1058 case SO_SNDTIMEO_NEW: 1059 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen, optname == SO_SNDTIMEO_OLD); 1060 break; 1061 1062 case SO_ATTACH_FILTER: 1063 ret = -EINVAL; 1064 if (optlen == sizeof(struct sock_fprog)) { 1065 struct sock_fprog fprog; 1066 1067 ret = -EFAULT; 1068 if (copy_from_user(&fprog, optval, sizeof(fprog))) 1069 break; 1070 1071 ret = sk_attach_filter(&fprog, sk); 1072 } 1073 break; 1074 1075 case SO_ATTACH_BPF: 1076 ret = -EINVAL; 1077 if (optlen == sizeof(u32)) { 1078 u32 ufd; 1079 1080 ret = -EFAULT; 1081 if (copy_from_user(&ufd, optval, sizeof(ufd))) 1082 break; 1083 1084 ret = sk_attach_bpf(ufd, sk); 1085 } 1086 break; 1087 1088 case SO_ATTACH_REUSEPORT_CBPF: 1089 ret = -EINVAL; 1090 if (optlen == sizeof(struct sock_fprog)) { 1091 struct sock_fprog fprog; 1092 1093 ret = -EFAULT; 1094 if (copy_from_user(&fprog, optval, sizeof(fprog))) 1095 break; 1096 1097 ret = sk_reuseport_attach_filter(&fprog, sk); 1098 } 1099 break; 1100 1101 case SO_ATTACH_REUSEPORT_EBPF: 1102 ret = -EINVAL; 1103 if (optlen == sizeof(u32)) { 1104 u32 ufd; 1105 1106 ret = -EFAULT; 1107 if (copy_from_user(&ufd, optval, sizeof(ufd))) 1108 break; 1109 1110 ret = sk_reuseport_attach_bpf(ufd, sk); 1111 } 1112 break; 1113 1114 case SO_DETACH_REUSEPORT_BPF: 1115 ret = reuseport_detach_prog(sk); 1116 break; 1117 1118 case SO_DETACH_FILTER: 1119 ret = sk_detach_filter(sk); 1120 break; 1121 1122 case SO_LOCK_FILTER: 1123 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool) 1124 ret = -EPERM; 1125 else 1126 sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool); 1127 break; 1128 1129 case SO_PASSSEC: 1130 if (valbool) 1131 set_bit(SOCK_PASSSEC, &sock->flags); 1132 else 1133 clear_bit(SOCK_PASSSEC, &sock->flags); 1134 break; 1135 case SO_MARK: 1136 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { 1137 ret = -EPERM; 1138 } else if (val != sk->sk_mark) { 1139 sk->sk_mark = val; 1140 sk_dst_reset(sk); 1141 } 1142 break; 1143 1144 case SO_RXQ_OVFL: 1145 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool); 1146 break; 1147 1148 case SO_WIFI_STATUS: 1149 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool); 1150 break; 1151 1152 case SO_PEEK_OFF: 1153 if (sock->ops->set_peek_off) 1154 ret = sock->ops->set_peek_off(sk, val); 1155 else 1156 ret = -EOPNOTSUPP; 1157 break; 1158 1159 case SO_NOFCS: 1160 sock_valbool_flag(sk, SOCK_NOFCS, valbool); 1161 break; 1162 1163 case SO_SELECT_ERR_QUEUE: 1164 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool); 1165 break; 1166 1167 #ifdef CONFIG_NET_RX_BUSY_POLL 1168 case SO_BUSY_POLL: 1169 /* allow unprivileged users to decrease the value */ 1170 if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN)) 1171 ret = -EPERM; 1172 else { 1173 if (val < 0) 1174 ret = -EINVAL; 1175 else 1176 sk->sk_ll_usec = val; 1177 } 1178 break; 1179 #endif 1180 1181 case SO_MAX_PACING_RATE: 1182 { 1183 unsigned long ulval = (val == ~0U) ? 
~0UL : val; 1184 1185 if (sizeof(ulval) != sizeof(val) && 1186 optlen >= sizeof(ulval) && 1187 get_user(ulval, (unsigned long __user *)optval)) { 1188 ret = -EFAULT; 1189 break; 1190 } 1191 if (ulval != ~0UL) 1192 cmpxchg(&sk->sk_pacing_status, 1193 SK_PACING_NONE, 1194 SK_PACING_NEEDED); 1195 sk->sk_max_pacing_rate = ulval; 1196 sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval); 1197 break; 1198 } 1199 case SO_INCOMING_CPU: 1200 WRITE_ONCE(sk->sk_incoming_cpu, val); 1201 break; 1202 1203 case SO_CNX_ADVICE: 1204 if (val == 1) 1205 dst_negative_advice(sk); 1206 break; 1207 1208 case SO_ZEROCOPY: 1209 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) { 1210 if (!((sk->sk_type == SOCK_STREAM && 1211 sk->sk_protocol == IPPROTO_TCP) || 1212 (sk->sk_type == SOCK_DGRAM && 1213 sk->sk_protocol == IPPROTO_UDP))) 1214 ret = -ENOTSUPP; 1215 } else if (sk->sk_family != PF_RDS) { 1216 ret = -ENOTSUPP; 1217 } 1218 if (!ret) { 1219 if (val < 0 || val > 1) 1220 ret = -EINVAL; 1221 else 1222 sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool); 1223 } 1224 break; 1225 1226 case SO_TXTIME: 1227 if (optlen != sizeof(struct sock_txtime)) { 1228 ret = -EINVAL; 1229 break; 1230 } else if (copy_from_user(&sk_txtime, optval, 1231 sizeof(struct sock_txtime))) { 1232 ret = -EFAULT; 1233 break; 1234 } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) { 1235 ret = -EINVAL; 1236 break; 1237 } 1238 /* CLOCK_MONOTONIC is only used by sch_fq, and this packet 1239 * scheduler has enough safe guards. 1240 */ 1241 if (sk_txtime.clockid != CLOCK_MONOTONIC && 1242 !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { 1243 ret = -EPERM; 1244 break; 1245 } 1246 sock_valbool_flag(sk, SOCK_TXTIME, true); 1247 sk->sk_clockid = sk_txtime.clockid; 1248 sk->sk_txtime_deadline_mode = 1249 !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE); 1250 sk->sk_txtime_report_errors = 1251 !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS); 1252 break; 1253 1254 case SO_BINDTOIFINDEX: 1255 ret = sock_bindtoindex_locked(sk, val); 1256 break; 1257 1258 default: 1259 ret = -ENOPROTOOPT; 1260 break; 1261 } 1262 release_sock(sk); 1263 return ret; 1264 } 1265 EXPORT_SYMBOL(sock_setsockopt); 1266 1267 1268 static void cred_to_ucred(struct pid *pid, const struct cred *cred, 1269 struct ucred *ucred) 1270 { 1271 ucred->pid = pid_vnr(pid); 1272 ucred->uid = ucred->gid = -1; 1273 if (cred) { 1274 struct user_namespace *current_ns = current_user_ns(); 1275 1276 ucred->uid = from_kuid_munged(current_ns, cred->euid); 1277 ucred->gid = from_kgid_munged(current_ns, cred->egid); 1278 } 1279 } 1280 1281 static int groups_to_user(gid_t __user *dst, const struct group_info *src) 1282 { 1283 struct user_namespace *user_ns = current_user_ns(); 1284 int i; 1285 1286 for (i = 0; i < src->ngroups; i++) 1287 if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i)) 1288 return -EFAULT; 1289 1290 return 0; 1291 } 1292 1293 int sock_getsockopt(struct socket *sock, int level, int optname, 1294 char __user *optval, int __user *optlen) 1295 { 1296 struct sock *sk = sock->sk; 1297 1298 union { 1299 int val; 1300 u64 val64; 1301 unsigned long ulval; 1302 struct linger ling; 1303 struct old_timeval32 tm32; 1304 struct __kernel_old_timeval tm; 1305 struct __kernel_sock_timeval stm; 1306 struct sock_txtime txtime; 1307 } v; 1308 1309 int lv = sizeof(int); 1310 int len; 1311 1312 if (get_user(len, optlen)) 1313 return -EFAULT; 1314 if (len < 0) 1315 return -EINVAL; 1316 1317 memset(&v, 0, sizeof(v)); 1318 1319 switch (optname) { 1320 case SO_DEBUG: 1321 v.val = sock_flag(sk, 
SOCK_DBG); 1322 break; 1323 1324 case SO_DONTROUTE: 1325 v.val = sock_flag(sk, SOCK_LOCALROUTE); 1326 break; 1327 1328 case SO_BROADCAST: 1329 v.val = sock_flag(sk, SOCK_BROADCAST); 1330 break; 1331 1332 case SO_SNDBUF: 1333 v.val = sk->sk_sndbuf; 1334 break; 1335 1336 case SO_RCVBUF: 1337 v.val = sk->sk_rcvbuf; 1338 break; 1339 1340 case SO_REUSEADDR: 1341 v.val = sk->sk_reuse; 1342 break; 1343 1344 case SO_REUSEPORT: 1345 v.val = sk->sk_reuseport; 1346 break; 1347 1348 case SO_KEEPALIVE: 1349 v.val = sock_flag(sk, SOCK_KEEPOPEN); 1350 break; 1351 1352 case SO_TYPE: 1353 v.val = sk->sk_type; 1354 break; 1355 1356 case SO_PROTOCOL: 1357 v.val = sk->sk_protocol; 1358 break; 1359 1360 case SO_DOMAIN: 1361 v.val = sk->sk_family; 1362 break; 1363 1364 case SO_ERROR: 1365 v.val = -sock_error(sk); 1366 if (v.val == 0) 1367 v.val = xchg(&sk->sk_err_soft, 0); 1368 break; 1369 1370 case SO_OOBINLINE: 1371 v.val = sock_flag(sk, SOCK_URGINLINE); 1372 break; 1373 1374 case SO_NO_CHECK: 1375 v.val = sk->sk_no_check_tx; 1376 break; 1377 1378 case SO_PRIORITY: 1379 v.val = sk->sk_priority; 1380 break; 1381 1382 case SO_LINGER: 1383 lv = sizeof(v.ling); 1384 v.ling.l_onoff = sock_flag(sk, SOCK_LINGER); 1385 v.ling.l_linger = sk->sk_lingertime / HZ; 1386 break; 1387 1388 case SO_BSDCOMPAT: 1389 sock_warn_obsolete_bsdism("getsockopt"); 1390 break; 1391 1392 case SO_TIMESTAMP_OLD: 1393 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && 1394 !sock_flag(sk, SOCK_TSTAMP_NEW) && 1395 !sock_flag(sk, SOCK_RCVTSTAMPNS); 1396 break; 1397 1398 case SO_TIMESTAMPNS_OLD: 1399 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW); 1400 break; 1401 1402 case SO_TIMESTAMP_NEW: 1403 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW); 1404 break; 1405 1406 case SO_TIMESTAMPNS_NEW: 1407 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW); 1408 break; 1409 1410 case SO_TIMESTAMPING_OLD: 1411 v.val = sk->sk_tsflags; 1412 break; 1413 1414 case SO_RCVTIMEO_OLD: 1415 case SO_RCVTIMEO_NEW: 1416 lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname); 1417 break; 1418 1419 case SO_SNDTIMEO_OLD: 1420 case SO_SNDTIMEO_NEW: 1421 lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname); 1422 break; 1423 1424 case SO_RCVLOWAT: 1425 v.val = sk->sk_rcvlowat; 1426 break; 1427 1428 case SO_SNDLOWAT: 1429 v.val = 1; 1430 break; 1431 1432 case SO_PASSCRED: 1433 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags); 1434 break; 1435 1436 case SO_PEERCRED: 1437 { 1438 struct ucred peercred; 1439 if (len > sizeof(peercred)) 1440 len = sizeof(peercred); 1441 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred); 1442 if (copy_to_user(optval, &peercred, len)) 1443 return -EFAULT; 1444 goto lenout; 1445 } 1446 1447 case SO_PEERGROUPS: 1448 { 1449 int ret, n; 1450 1451 if (!sk->sk_peer_cred) 1452 return -ENODATA; 1453 1454 n = sk->sk_peer_cred->group_info->ngroups; 1455 if (len < n * sizeof(gid_t)) { 1456 len = n * sizeof(gid_t); 1457 return put_user(len, optlen) ? 
-EFAULT : -ERANGE; 1458 } 1459 len = n * sizeof(gid_t); 1460 1461 ret = groups_to_user((gid_t __user *)optval, 1462 sk->sk_peer_cred->group_info); 1463 if (ret) 1464 return ret; 1465 goto lenout; 1466 } 1467 1468 case SO_PEERNAME: 1469 { 1470 char address[128]; 1471 1472 lv = sock->ops->getname(sock, (struct sockaddr *)address, 2); 1473 if (lv < 0) 1474 return -ENOTCONN; 1475 if (lv < len) 1476 return -EINVAL; 1477 if (copy_to_user(optval, address, len)) 1478 return -EFAULT; 1479 goto lenout; 1480 } 1481 1482 /* Dubious BSD thing... Probably nobody even uses it, but 1483 * the UNIX standard wants it for whatever reason... -DaveM 1484 */ 1485 case SO_ACCEPTCONN: 1486 v.val = sk->sk_state == TCP_LISTEN; 1487 break; 1488 1489 case SO_PASSSEC: 1490 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags); 1491 break; 1492 1493 case SO_PEERSEC: 1494 return security_socket_getpeersec_stream(sock, optval, optlen, len); 1495 1496 case SO_MARK: 1497 v.val = sk->sk_mark; 1498 break; 1499 1500 case SO_RXQ_OVFL: 1501 v.val = sock_flag(sk, SOCK_RXQ_OVFL); 1502 break; 1503 1504 case SO_WIFI_STATUS: 1505 v.val = sock_flag(sk, SOCK_WIFI_STATUS); 1506 break; 1507 1508 case SO_PEEK_OFF: 1509 if (!sock->ops->set_peek_off) 1510 return -EOPNOTSUPP; 1511 1512 v.val = sk->sk_peek_off; 1513 break; 1514 case SO_NOFCS: 1515 v.val = sock_flag(sk, SOCK_NOFCS); 1516 break; 1517 1518 case SO_BINDTODEVICE: 1519 return sock_getbindtodevice(sk, optval, optlen, len); 1520 1521 case SO_GET_FILTER: 1522 len = sk_get_filter(sk, (struct sock_filter __user *)optval, len); 1523 if (len < 0) 1524 return len; 1525 1526 goto lenout; 1527 1528 case SO_LOCK_FILTER: 1529 v.val = sock_flag(sk, SOCK_FILTER_LOCKED); 1530 break; 1531 1532 case SO_BPF_EXTENSIONS: 1533 v.val = bpf_tell_extensions(); 1534 break; 1535 1536 case SO_SELECT_ERR_QUEUE: 1537 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE); 1538 break; 1539 1540 #ifdef CONFIG_NET_RX_BUSY_POLL 1541 case SO_BUSY_POLL: 1542 v.val = sk->sk_ll_usec; 1543 break; 1544 #endif 1545 1546 case SO_MAX_PACING_RATE: 1547 if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) { 1548 lv = sizeof(v.ulval); 1549 v.ulval = sk->sk_max_pacing_rate; 1550 } else { 1551 /* 32bit version */ 1552 v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U); 1553 } 1554 break; 1555 1556 case SO_INCOMING_CPU: 1557 v.val = READ_ONCE(sk->sk_incoming_cpu); 1558 break; 1559 1560 case SO_MEMINFO: 1561 { 1562 u32 meminfo[SK_MEMINFO_VARS]; 1563 1564 sk_get_meminfo(sk, meminfo); 1565 1566 len = min_t(unsigned int, len, sizeof(meminfo)); 1567 if (copy_to_user(optval, &meminfo, len)) 1568 return -EFAULT; 1569 1570 goto lenout; 1571 } 1572 1573 #ifdef CONFIG_NET_RX_BUSY_POLL 1574 case SO_INCOMING_NAPI_ID: 1575 v.val = READ_ONCE(sk->sk_napi_id); 1576 1577 /* aggregate non-NAPI IDs down to 0 */ 1578 if (v.val < MIN_NAPI_ID) 1579 v.val = 0; 1580 1581 break; 1582 #endif 1583 1584 case SO_COOKIE: 1585 lv = sizeof(u64); 1586 if (len < lv) 1587 return -EINVAL; 1588 v.val64 = sock_gen_cookie(sk); 1589 break; 1590 1591 case SO_ZEROCOPY: 1592 v.val = sock_flag(sk, SOCK_ZEROCOPY); 1593 break; 1594 1595 case SO_TXTIME: 1596 lv = sizeof(v.txtime); 1597 v.txtime.clockid = sk->sk_clockid; 1598 v.txtime.flags |= sk->sk_txtime_deadline_mode ? 1599 SOF_TXTIME_DEADLINE_MODE : 0; 1600 v.txtime.flags |= sk->sk_txtime_report_errors ? 
				SOF_TXTIME_REPORT_ERRORS : 0;
		break;

	case SO_BINDTOIFINDEX:
		v.val = sk->sk_bound_dev_if;
		break;

	default:
		/* We implement the SO_SNDLOWAT etc to not be settable
		 * (1003.1g 7).
		 */
		return -ENOPROTOOPT;
	}

	if (len > lv)
		len = lv;
	if (copy_to_user(optval, &v, len))
		return -EFAULT;
lenout:
	if (put_user(len, optlen))
		return -EFAULT;
	return 0;
}

/*
 * Initialize an sk_lock.
 *
 * (We also register the sk_lock with the lock validator.)
 */
static inline void sock_lock_init(struct sock *sk)
{
	if (sk->sk_kern_sock)
		sock_lock_init_class_and_name(
			sk,
			af_family_kern_slock_key_strings[sk->sk_family],
			af_family_kern_slock_keys + sk->sk_family,
			af_family_kern_key_strings[sk->sk_family],
			af_family_kern_keys + sk->sk_family);
	else
		sock_lock_init_class_and_name(
			sk,
			af_family_slock_key_strings[sk->sk_family],
			af_family_slock_keys + sk->sk_family,
			af_family_key_strings[sk->sk_family],
			af_family_keys + sk->sk_family);
}

/*
 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
 * even temporarily, because of RCU lookups. sk_node should also be left as-is.
 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end.
 */
static void sock_copy(struct sock *nsk, const struct sock *osk)
{
	const struct proto *prot = READ_ONCE(osk->sk_prot);
#ifdef CONFIG_SECURITY_NETWORK
	void *sptr = nsk->sk_security;
#endif
	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));

	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
	       prot->obj_size - offsetof(struct sock, sk_dontcopy_end));

#ifdef CONFIG_SECURITY_NETWORK
	nsk->sk_security = sptr;
	security_sk_clone(osk, nsk);
#endif
}

static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
				  int family)
{
	struct sock *sk;
	struct kmem_cache *slab;

	slab = prot->slab;
	if (slab != NULL) {
		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
		if (!sk)
			return sk;
		if (want_init_on_alloc(priority))
			sk_prot_clear_nulls(sk, prot->obj_size);
	} else
		sk = kmalloc(prot->obj_size, priority);

	if (sk != NULL) {
		if (security_sk_alloc(sk, family, priority))
			goto out_free;

		if (!try_module_get(prot->owner))
			goto out_free_sec;
		sk_tx_queue_clear(sk);
	}

	return sk;

out_free_sec:
	security_sk_free(sk);
out_free:
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	return NULL;
}

static void sk_prot_free(struct proto *prot, struct sock *sk)
{
	struct kmem_cache *slab;
	struct module *owner;

	owner = prot->owner;
	slab = prot->slab;

	cgroup_sk_free(&sk->sk_cgrp_data);
	mem_cgroup_sk_free(sk);
	security_sk_free(sk);
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	module_put(owner);
}

/**
 * sk_alloc - All socket objects are allocated here
 * @net: the applicable net namespace
 * @family: protocol family
 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 * @prot: struct proto associated with this new sock instance
 * @kern: is this to be a kernel socket?
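 *
 * A protocol's ->create() handler would typically use this roughly as
 * follows (illustrative sketch; PF_FOO and &foo_proto stand in for that
 * protocol's family and struct proto):
 *
 *	sk = sk_alloc(net, PF_FOO, GFP_KERNEL, &foo_proto, kern);
 *	if (!sk)
 *		return -ENOBUFS;
 *	sock_init_data(sock, sk);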
1732 */ 1733 struct sock *sk_alloc(struct net *net, int family, gfp_t priority, 1734 struct proto *prot, int kern) 1735 { 1736 struct sock *sk; 1737 1738 sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family); 1739 if (sk) { 1740 sk->sk_family = family; 1741 /* 1742 * See comment in struct sock definition to understand 1743 * why we need sk_prot_creator -acme 1744 */ 1745 sk->sk_prot = sk->sk_prot_creator = prot; 1746 sk->sk_kern_sock = kern; 1747 sock_lock_init(sk); 1748 sk->sk_net_refcnt = kern ? 0 : 1; 1749 if (likely(sk->sk_net_refcnt)) { 1750 get_net(net); 1751 sock_inuse_add(net, 1); 1752 } 1753 1754 sock_net_set(sk, net); 1755 refcount_set(&sk->sk_wmem_alloc, 1); 1756 1757 mem_cgroup_sk_alloc(sk); 1758 cgroup_sk_alloc(&sk->sk_cgrp_data); 1759 sock_update_classid(&sk->sk_cgrp_data); 1760 sock_update_netprioidx(&sk->sk_cgrp_data); 1761 sk_tx_queue_clear(sk); 1762 } 1763 1764 return sk; 1765 } 1766 EXPORT_SYMBOL(sk_alloc); 1767 1768 /* Sockets having SOCK_RCU_FREE will call this function after one RCU 1769 * grace period. This is the case for UDP sockets and TCP listeners. 1770 */ 1771 static void __sk_destruct(struct rcu_head *head) 1772 { 1773 struct sock *sk = container_of(head, struct sock, sk_rcu); 1774 struct sk_filter *filter; 1775 1776 if (sk->sk_destruct) 1777 sk->sk_destruct(sk); 1778 1779 filter = rcu_dereference_check(sk->sk_filter, 1780 refcount_read(&sk->sk_wmem_alloc) == 0); 1781 if (filter) { 1782 sk_filter_uncharge(sk, filter); 1783 RCU_INIT_POINTER(sk->sk_filter, NULL); 1784 } 1785 1786 sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP); 1787 1788 #ifdef CONFIG_BPF_SYSCALL 1789 bpf_sk_storage_free(sk); 1790 #endif 1791 1792 if (atomic_read(&sk->sk_omem_alloc)) 1793 pr_debug("%s: optmem leakage (%d bytes) detected\n", 1794 __func__, atomic_read(&sk->sk_omem_alloc)); 1795 1796 if (sk->sk_frag.page) { 1797 put_page(sk->sk_frag.page); 1798 sk->sk_frag.page = NULL; 1799 } 1800 1801 if (sk->sk_peer_cred) 1802 put_cred(sk->sk_peer_cred); 1803 put_pid(sk->sk_peer_pid); 1804 if (likely(sk->sk_net_refcnt)) 1805 put_net(sock_net(sk)); 1806 sk_prot_free(sk->sk_prot_creator, sk); 1807 } 1808 1809 void sk_destruct(struct sock *sk) 1810 { 1811 bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE); 1812 1813 if (rcu_access_pointer(sk->sk_reuseport_cb)) { 1814 reuseport_detach_sock(sk); 1815 use_call_rcu = true; 1816 } 1817 1818 if (use_call_rcu) 1819 call_rcu(&sk->sk_rcu, __sk_destruct); 1820 else 1821 __sk_destruct(&sk->sk_rcu); 1822 } 1823 1824 static void __sk_free(struct sock *sk) 1825 { 1826 if (likely(sk->sk_net_refcnt)) 1827 sock_inuse_add(sock_net(sk), -1); 1828 1829 if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk))) 1830 sock_diag_broadcast_destroy(sk); 1831 else 1832 sk_destruct(sk); 1833 } 1834 1835 void sk_free(struct sock *sk) 1836 { 1837 /* 1838 * We subtract one from sk_wmem_alloc and can know if 1839 * some packets are still in some tx queue. 
1840 * If not null, sock_wfree() will call __sk_free(sk) later 1841 */ 1842 if (refcount_dec_and_test(&sk->sk_wmem_alloc)) 1843 __sk_free(sk); 1844 } 1845 EXPORT_SYMBOL(sk_free); 1846 1847 static void sk_init_common(struct sock *sk) 1848 { 1849 skb_queue_head_init(&sk->sk_receive_queue); 1850 skb_queue_head_init(&sk->sk_write_queue); 1851 skb_queue_head_init(&sk->sk_error_queue); 1852 1853 rwlock_init(&sk->sk_callback_lock); 1854 lockdep_set_class_and_name(&sk->sk_receive_queue.lock, 1855 af_rlock_keys + sk->sk_family, 1856 af_family_rlock_key_strings[sk->sk_family]); 1857 lockdep_set_class_and_name(&sk->sk_write_queue.lock, 1858 af_wlock_keys + sk->sk_family, 1859 af_family_wlock_key_strings[sk->sk_family]); 1860 lockdep_set_class_and_name(&sk->sk_error_queue.lock, 1861 af_elock_keys + sk->sk_family, 1862 af_family_elock_key_strings[sk->sk_family]); 1863 lockdep_set_class_and_name(&sk->sk_callback_lock, 1864 af_callback_keys + sk->sk_family, 1865 af_family_clock_key_strings[sk->sk_family]); 1866 } 1867 1868 /** 1869 * sk_clone_lock - clone a socket, and lock its clone 1870 * @sk: the socket to clone 1871 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) 1872 * 1873 * Caller must unlock socket even in error path (bh_unlock_sock(newsk)) 1874 */ 1875 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) 1876 { 1877 struct proto *prot = READ_ONCE(sk->sk_prot); 1878 struct sock *newsk; 1879 bool is_charged = true; 1880 1881 newsk = sk_prot_alloc(prot, priority, sk->sk_family); 1882 if (newsk != NULL) { 1883 struct sk_filter *filter; 1884 1885 sock_copy(newsk, sk); 1886 1887 newsk->sk_prot_creator = prot; 1888 1889 /* SANITY */ 1890 if (likely(newsk->sk_net_refcnt)) 1891 get_net(sock_net(newsk)); 1892 sk_node_init(&newsk->sk_node); 1893 sock_lock_init(newsk); 1894 bh_lock_sock(newsk); 1895 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL; 1896 newsk->sk_backlog.len = 0; 1897 1898 atomic_set(&newsk->sk_rmem_alloc, 0); 1899 /* 1900 * sk_wmem_alloc set to one (see sk_free() and sock_wfree()) 1901 */ 1902 refcount_set(&newsk->sk_wmem_alloc, 1); 1903 atomic_set(&newsk->sk_omem_alloc, 0); 1904 sk_init_common(newsk); 1905 1906 newsk->sk_dst_cache = NULL; 1907 newsk->sk_dst_pending_confirm = 0; 1908 newsk->sk_wmem_queued = 0; 1909 newsk->sk_forward_alloc = 0; 1910 atomic_set(&newsk->sk_drops, 0); 1911 newsk->sk_send_head = NULL; 1912 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; 1913 atomic_set(&newsk->sk_zckey, 0); 1914 1915 sock_reset_flag(newsk, SOCK_DONE); 1916 1917 /* sk->sk_memcg will be populated at accept() time */ 1918 newsk->sk_memcg = NULL; 1919 1920 cgroup_sk_clone(&newsk->sk_cgrp_data); 1921 1922 rcu_read_lock(); 1923 filter = rcu_dereference(sk->sk_filter); 1924 if (filter != NULL) 1925 /* though it's an empty new sock, the charging may fail 1926 * if sysctl_optmem_max was changed between creation of 1927 * original socket and cloning 1928 */ 1929 is_charged = sk_filter_charge(newsk, filter); 1930 RCU_INIT_POINTER(newsk->sk_filter, filter); 1931 rcu_read_unlock(); 1932 1933 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) { 1934 /* We need to make sure that we don't uncharge the new 1935 * socket if we couldn't charge it in the first place 1936 * as otherwise we uncharge the parent's filter. 
		 */
			if (!is_charged)
				RCU_INIT_POINTER(newsk->sk_filter, NULL);
			sk_free_unlock_clone(newsk);
			newsk = NULL;
			goto out;
		}
		RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);

		if (bpf_sk_storage_clone(sk, newsk)) {
			sk_free_unlock_clone(newsk);
			newsk = NULL;
			goto out;
		}

		/* Clear sk_user_data if parent had the pointer tagged
		 * as not suitable for copying when cloning.
		 */
		if (sk_user_data_is_nocopy(newsk))
			newsk->sk_user_data = NULL;

		newsk->sk_err	   = 0;
		newsk->sk_err_soft = 0;
		newsk->sk_priority = 0;
		newsk->sk_incoming_cpu = raw_smp_processor_id();
		if (likely(newsk->sk_net_refcnt))
			sock_inuse_add(sock_net(newsk), 1);

		/*
		 * Before updating sk_refcnt, we must commit prior changes to memory
		 * (Documentation/RCU/rculist_nulls.txt for details)
		 */
		smp_wmb();
		refcount_set(&newsk->sk_refcnt, 2);

		/*
		 * Increment the counter in the same struct proto as the master
		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
		 * is the same as sk->sk_prot->socks, as this field was copied
		 * with memcpy).
		 *
		 * This _changes_ the previous behaviour, where
		 * tcp_create_openreq_child always was incrementing the
		 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
		 * to be taken into account in all callers. -acme
		 */
		sk_refcnt_debug_inc(newsk);
		sk_set_socket(newsk, NULL);
		sk_tx_queue_clear(newsk);
		RCU_INIT_POINTER(newsk->sk_wq, NULL);

		if (newsk->sk_prot->sockets_allocated)
			sk_sockets_allocated_inc(newsk);

		if (sock_needs_netstamp(sk) &&
		    newsk->sk_flags & SK_FLAGS_TIMESTAMP)
			net_enable_timestamp();
	}
out:
	return newsk;
}
EXPORT_SYMBOL_GPL(sk_clone_lock);

void sk_free_unlock_clone(struct sock *sk)
{
	/* It is still a raw copy of the parent, so invalidate
	 * its destructor and do a plain sk_free() */
	sk->sk_destruct = NULL;
	bh_unlock_sock(sk);
	sk_free(sk);
}
EXPORT_SYMBOL_GPL(sk_free_unlock_clone);

void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
{
	u32 max_segs = 1;

	sk_dst_set(sk, dst);
	sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
	if (sk->sk_route_caps & NETIF_F_GSO)
		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
	sk->sk_route_caps &= ~sk->sk_route_nocaps;
	if (sk_can_gso(sk)) {
		if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
		} else {
			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
			sk->sk_gso_max_size = dst->dev->gso_max_size;
			max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
		}
	}
	sk->sk_gso_max_segs = max_segs;
}
EXPORT_SYMBOL_GPL(sk_setup_caps);

/*
 *	Simple resource managers for sockets.
 */


/*
 * Write buffer destructor automatically called from kfree_skb.
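 * This pairs with skb_set_owner_w(): the skb's truesize that was added to
 * sk_wmem_alloc when the skb was charged to the socket is released here,
 * and, for sockets not using their own write queue, sk_write_space() is
 * invoked so blocked senders can make progress.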
 */
void sock_wfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	unsigned int len = skb->truesize;

	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
		/*
		 * Keep a reference on sk_wmem_alloc, this will be released
		 * after the sk_write_space() call
		 */
		WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
		sk->sk_write_space(sk);
		len = 1;
	}
	/*
	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
	 * could not do because of in-flight packets
	 */
	if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
		__sk_free(sk);
}
EXPORT_SYMBOL(sock_wfree);

/* This variant of sock_wfree() is used by TCP,
 * since it sets SOCK_USE_WRITE_QUEUE.
 */
void __sock_wfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

	if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
		__sk_free(sk);
}

void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
{
	skb_orphan(skb);
	skb->sk = sk;
#ifdef CONFIG_INET
	if (unlikely(!sk_fullsock(sk))) {
		skb->destructor = sock_edemux;
		sock_hold(sk);
		return;
	}
#endif
	skb->destructor = sock_wfree;
	skb_set_hash_from_sk(skb, sk);
	/*
	 * We used to take a refcount on sk, but the following operation
	 * is enough to guarantee sk_free() won't free this sock until
	 * all in-flight packets are completed
	 */
	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
}
EXPORT_SYMBOL(skb_set_owner_w);

static bool can_skb_orphan_partial(const struct sk_buff *skb)
{
#ifdef CONFIG_TLS_DEVICE
	/* Drivers depend on in-order delivery for crypto offload,
	 * partial orphan breaks out-of-order-OK logic.
	 */
	if (skb->decrypted)
		return false;
#endif
	return (skb->destructor == sock_wfree ||
		(IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
}

/* This helper is used by netem, as it can hold packets in its
 * delay queue. We want to allow the owner socket to send more
 * packets, as if they were already TX completed by a typical driver.
 * But we also want to keep skb->sk set because some packet schedulers
 * rely on it (sch_fq for example).
 */
void skb_orphan_partial(struct sk_buff *skb)
{
	if (skb_is_tcp_pure_ack(skb))
		return;

	if (can_skb_orphan_partial(skb)) {
		struct sock *sk = skb->sk;

		if (refcount_inc_not_zero(&sk->sk_refcnt)) {
			WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc));
			skb->destructor = sock_efree;
		}
	} else {
		skb_orphan(skb);
	}
}
EXPORT_SYMBOL(skb_orphan_partial);

/*
 * Read buffer destructor automatically called from kfree_skb.
 */
void sock_rfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	unsigned int len = skb->truesize;

	atomic_sub(len, &sk->sk_rmem_alloc);
	sk_mem_uncharge(sk, len);
}
EXPORT_SYMBOL(sock_rfree);

/*
 * Buffer destructor for skbs that are not used directly in read or write
 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
 */
void sock_efree(struct sk_buff *skb)
{
	sock_put(skb->sk);
}
EXPORT_SYMBOL(sock_efree);

/* Buffer destructor for the prefetch/receive path where the reference count
 * may not be held, e.g. for listen sockets.
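 * The sk_is_refcounted() check below ensures that sockets looked up without
 * taking a reference (e.g. SOCK_RCU_FREE listeners) are not over-put.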
2158 */ 2159 #ifdef CONFIG_INET 2160 void sock_pfree(struct sk_buff *skb) 2161 { 2162 if (sk_is_refcounted(skb->sk)) 2163 sock_gen_put(skb->sk); 2164 } 2165 EXPORT_SYMBOL(sock_pfree); 2166 #endif /* CONFIG_INET */ 2167 2168 kuid_t sock_i_uid(struct sock *sk) 2169 { 2170 kuid_t uid; 2171 2172 read_lock_bh(&sk->sk_callback_lock); 2173 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID; 2174 read_unlock_bh(&sk->sk_callback_lock); 2175 return uid; 2176 } 2177 EXPORT_SYMBOL(sock_i_uid); 2178 2179 unsigned long sock_i_ino(struct sock *sk) 2180 { 2181 unsigned long ino; 2182 2183 read_lock_bh(&sk->sk_callback_lock); 2184 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0; 2185 read_unlock_bh(&sk->sk_callback_lock); 2186 return ino; 2187 } 2188 EXPORT_SYMBOL(sock_i_ino); 2189 2190 /* 2191 * Allocate a skb from the socket's send buffer. 2192 */ 2193 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, 2194 gfp_t priority) 2195 { 2196 if (force || 2197 refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) { 2198 struct sk_buff *skb = alloc_skb(size, priority); 2199 2200 if (skb) { 2201 skb_set_owner_w(skb, sk); 2202 return skb; 2203 } 2204 } 2205 return NULL; 2206 } 2207 EXPORT_SYMBOL(sock_wmalloc); 2208 2209 static void sock_ofree(struct sk_buff *skb) 2210 { 2211 struct sock *sk = skb->sk; 2212 2213 atomic_sub(skb->truesize, &sk->sk_omem_alloc); 2214 } 2215 2216 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size, 2217 gfp_t priority) 2218 { 2219 struct sk_buff *skb; 2220 2221 /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */ 2222 if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) > 2223 sysctl_optmem_max) 2224 return NULL; 2225 2226 skb = alloc_skb(size, priority); 2227 if (!skb) 2228 return NULL; 2229 2230 atomic_add(skb->truesize, &sk->sk_omem_alloc); 2231 skb->sk = sk; 2232 skb->destructor = sock_ofree; 2233 return skb; 2234 } 2235 2236 /* 2237 * Allocate a memory block from the socket's option memory buffer. 2238 */ 2239 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority) 2240 { 2241 if ((unsigned int)size <= sysctl_optmem_max && 2242 atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) { 2243 void *mem; 2244 /* First do the add, to avoid the race if kmalloc 2245 * might sleep. 2246 */ 2247 atomic_add(size, &sk->sk_omem_alloc); 2248 mem = kmalloc(size, priority); 2249 if (mem) 2250 return mem; 2251 atomic_sub(size, &sk->sk_omem_alloc); 2252 } 2253 return NULL; 2254 } 2255 EXPORT_SYMBOL(sock_kmalloc); 2256 2257 /* Free an option memory block. Note, we actually want the inline 2258 * here as this allows gcc to detect the nullify and fold away the 2259 * condition entirely. 2260 */ 2261 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size, 2262 const bool nullify) 2263 { 2264 if (WARN_ON_ONCE(!mem)) 2265 return; 2266 if (nullify) 2267 kzfree(mem); 2268 else 2269 kfree(mem); 2270 atomic_sub(size, &sk->sk_omem_alloc); 2271 } 2272 2273 void sock_kfree_s(struct sock *sk, void *mem, int size) 2274 { 2275 __sock_kfree_s(sk, mem, size, false); 2276 } 2277 EXPORT_SYMBOL(sock_kfree_s); 2278 2279 void sock_kzfree_s(struct sock *sk, void *mem, int size) 2280 { 2281 __sock_kfree_s(sk, mem, size, true); 2282 } 2283 EXPORT_SYMBOL(sock_kzfree_s); 2284 2285 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock. 2286 I think, these locks should be removed for datagram sockets. 
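
   Roughly, the generic sender below (sock_alloc_send_pskb()) drives it as
   follows (a simplified sketch, not the literal code):

	timeo = sock_sndtimeo(sk, noblock);
	while (sk_wmem_alloc_get(sk) >= READ_ONCE(sk->sk_sndbuf))
		timeo = sock_wait_for_wmem(sk, timeo);

   returning early on a pending signal, a socket error, shutdown or an
   expired timeout.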
2287 */ 2288 static long sock_wait_for_wmem(struct sock *sk, long timeo) 2289 { 2290 DEFINE_WAIT(wait); 2291 2292 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); 2293 for (;;) { 2294 if (!timeo) 2295 break; 2296 if (signal_pending(current)) 2297 break; 2298 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 2299 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); 2300 if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) 2301 break; 2302 if (sk->sk_shutdown & SEND_SHUTDOWN) 2303 break; 2304 if (sk->sk_err) 2305 break; 2306 timeo = schedule_timeout(timeo); 2307 } 2308 finish_wait(sk_sleep(sk), &wait); 2309 return timeo; 2310 } 2311 2312 2313 /* 2314 * Generic send/receive buffer handlers 2315 */ 2316 2317 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, 2318 unsigned long data_len, int noblock, 2319 int *errcode, int max_page_order) 2320 { 2321 struct sk_buff *skb; 2322 long timeo; 2323 int err; 2324 2325 timeo = sock_sndtimeo(sk, noblock); 2326 for (;;) { 2327 err = sock_error(sk); 2328 if (err != 0) 2329 goto failure; 2330 2331 err = -EPIPE; 2332 if (sk->sk_shutdown & SEND_SHUTDOWN) 2333 goto failure; 2334 2335 if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf)) 2336 break; 2337 2338 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); 2339 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 2340 err = -EAGAIN; 2341 if (!timeo) 2342 goto failure; 2343 if (signal_pending(current)) 2344 goto interrupted; 2345 timeo = sock_wait_for_wmem(sk, timeo); 2346 } 2347 skb = alloc_skb_with_frags(header_len, data_len, max_page_order, 2348 errcode, sk->sk_allocation); 2349 if (skb) 2350 skb_set_owner_w(skb, sk); 2351 return skb; 2352 2353 interrupted: 2354 err = sock_intr_errno(timeo); 2355 failure: 2356 *errcode = err; 2357 return NULL; 2358 } 2359 EXPORT_SYMBOL(sock_alloc_send_pskb); 2360 2361 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, 2362 int noblock, int *errcode) 2363 { 2364 return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0); 2365 } 2366 EXPORT_SYMBOL(sock_alloc_send_skb); 2367 2368 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg, 2369 struct sockcm_cookie *sockc) 2370 { 2371 u32 tsflags; 2372 2373 switch (cmsg->cmsg_type) { 2374 case SO_MARK: 2375 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 2376 return -EPERM; 2377 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) 2378 return -EINVAL; 2379 sockc->mark = *(u32 *)CMSG_DATA(cmsg); 2380 break; 2381 case SO_TIMESTAMPING_OLD: 2382 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) 2383 return -EINVAL; 2384 2385 tsflags = *(u32 *)CMSG_DATA(cmsg); 2386 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK) 2387 return -EINVAL; 2388 2389 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK; 2390 sockc->tsflags |= tsflags; 2391 break; 2392 case SCM_TXTIME: 2393 if (!sock_flag(sk, SOCK_TXTIME)) 2394 return -EINVAL; 2395 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64))) 2396 return -EINVAL; 2397 sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg)); 2398 break; 2399 /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. 
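	 * They are not processed here; __sock_cmsg_send() simply skips them
	 * instead of failing with -EINVAL, so sendmsg() callers may mix them
	 * with other SOL_SOCKET control messages.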
*/ 2400 case SCM_RIGHTS: 2401 case SCM_CREDENTIALS: 2402 break; 2403 default: 2404 return -EINVAL; 2405 } 2406 return 0; 2407 } 2408 EXPORT_SYMBOL(__sock_cmsg_send); 2409 2410 int sock_cmsg_send(struct sock *sk, struct msghdr *msg, 2411 struct sockcm_cookie *sockc) 2412 { 2413 struct cmsghdr *cmsg; 2414 int ret; 2415 2416 for_each_cmsghdr(cmsg, msg) { 2417 if (!CMSG_OK(msg, cmsg)) 2418 return -EINVAL; 2419 if (cmsg->cmsg_level != SOL_SOCKET) 2420 continue; 2421 ret = __sock_cmsg_send(sk, msg, cmsg, sockc); 2422 if (ret) 2423 return ret; 2424 } 2425 return 0; 2426 } 2427 EXPORT_SYMBOL(sock_cmsg_send); 2428 2429 static void sk_enter_memory_pressure(struct sock *sk) 2430 { 2431 if (!sk->sk_prot->enter_memory_pressure) 2432 return; 2433 2434 sk->sk_prot->enter_memory_pressure(sk); 2435 } 2436 2437 static void sk_leave_memory_pressure(struct sock *sk) 2438 { 2439 if (sk->sk_prot->leave_memory_pressure) { 2440 sk->sk_prot->leave_memory_pressure(sk); 2441 } else { 2442 unsigned long *memory_pressure = sk->sk_prot->memory_pressure; 2443 2444 if (memory_pressure && READ_ONCE(*memory_pressure)) 2445 WRITE_ONCE(*memory_pressure, 0); 2446 } 2447 } 2448 2449 #define SKB_FRAG_PAGE_ORDER get_order(32768) 2450 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key); 2451 2452 /** 2453 * skb_page_frag_refill - check that a page_frag contains enough room 2454 * @sz: minimum size of the fragment we want to get 2455 * @pfrag: pointer to page_frag 2456 * @gfp: priority for memory allocation 2457 * 2458 * Note: While this allocator tries to use high order pages, there is 2459 * no guarantee that allocations succeed. Therefore, @sz MUST be 2460 * less or equal than PAGE_SIZE. 2461 */ 2462 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp) 2463 { 2464 if (pfrag->page) { 2465 if (page_ref_count(pfrag->page) == 1) { 2466 pfrag->offset = 0; 2467 return true; 2468 } 2469 if (pfrag->offset + sz <= pfrag->size) 2470 return true; 2471 put_page(pfrag->page); 2472 } 2473 2474 pfrag->offset = 0; 2475 if (SKB_FRAG_PAGE_ORDER && 2476 !static_branch_unlikely(&net_high_order_alloc_disable_key)) { 2477 /* Avoid direct reclaim but allow kswapd to wake */ 2478 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) | 2479 __GFP_COMP | __GFP_NOWARN | 2480 __GFP_NORETRY, 2481 SKB_FRAG_PAGE_ORDER); 2482 if (likely(pfrag->page)) { 2483 pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER; 2484 return true; 2485 } 2486 } 2487 pfrag->page = alloc_page(gfp); 2488 if (likely(pfrag->page)) { 2489 pfrag->size = PAGE_SIZE; 2490 return true; 2491 } 2492 return false; 2493 } 2494 EXPORT_SYMBOL(skb_page_frag_refill); 2495 2496 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) 2497 { 2498 if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation))) 2499 return true; 2500 2501 sk_enter_memory_pressure(sk); 2502 sk_stream_moderate_sndbuf(sk); 2503 return false; 2504 } 2505 EXPORT_SYMBOL(sk_page_frag_refill); 2506 2507 static void __lock_sock(struct sock *sk) 2508 __releases(&sk->sk_lock.slock) 2509 __acquires(&sk->sk_lock.slock) 2510 { 2511 DEFINE_WAIT(wait); 2512 2513 for (;;) { 2514 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait, 2515 TASK_UNINTERRUPTIBLE); 2516 spin_unlock_bh(&sk->sk_lock.slock); 2517 schedule(); 2518 spin_lock_bh(&sk->sk_lock.slock); 2519 if (!sock_owned_by_user(sk)) 2520 break; 2521 } 2522 finish_wait(&sk->sk_lock.wq, &wait); 2523 } 2524 2525 void __release_sock(struct sock *sk) 2526 __releases(&sk->sk_lock.slock) 2527 __acquires(&sk->sk_lock.slock) 2528 { 2529 struct sk_buff 
*skb, *next; 2530 2531 while ((skb = sk->sk_backlog.head) != NULL) { 2532 sk->sk_backlog.head = sk->sk_backlog.tail = NULL; 2533 2534 spin_unlock_bh(&sk->sk_lock.slock); 2535 2536 do { 2537 next = skb->next; 2538 prefetch(next); 2539 WARN_ON_ONCE(skb_dst_is_noref(skb)); 2540 skb_mark_not_on_list(skb); 2541 sk_backlog_rcv(sk, skb); 2542 2543 cond_resched(); 2544 2545 skb = next; 2546 } while (skb != NULL); 2547 2548 spin_lock_bh(&sk->sk_lock.slock); 2549 } 2550 2551 /* 2552 * Doing the zeroing here guarantee we can not loop forever 2553 * while a wild producer attempts to flood us. 2554 */ 2555 sk->sk_backlog.len = 0; 2556 } 2557 2558 void __sk_flush_backlog(struct sock *sk) 2559 { 2560 spin_lock_bh(&sk->sk_lock.slock); 2561 __release_sock(sk); 2562 spin_unlock_bh(&sk->sk_lock.slock); 2563 } 2564 2565 /** 2566 * sk_wait_data - wait for data to arrive at sk_receive_queue 2567 * @sk: sock to wait on 2568 * @timeo: for how long 2569 * @skb: last skb seen on sk_receive_queue 2570 * 2571 * Now socket state including sk->sk_err is changed only under lock, 2572 * hence we may omit checks after joining wait queue. 2573 * We check receive queue before schedule() only as optimization; 2574 * it is very likely that release_sock() added new data. 2575 */ 2576 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb) 2577 { 2578 DEFINE_WAIT_FUNC(wait, woken_wake_function); 2579 int rc; 2580 2581 add_wait_queue(sk_sleep(sk), &wait); 2582 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); 2583 rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait); 2584 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); 2585 remove_wait_queue(sk_sleep(sk), &wait); 2586 return rc; 2587 } 2588 EXPORT_SYMBOL(sk_wait_data); 2589 2590 /** 2591 * __sk_mem_raise_allocated - increase memory_allocated 2592 * @sk: socket 2593 * @size: memory size to allocate 2594 * @amt: pages to allocate 2595 * @kind: allocation type 2596 * 2597 * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc 2598 */ 2599 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind) 2600 { 2601 struct proto *prot = sk->sk_prot; 2602 long allocated = sk_memory_allocated_add(sk, amt); 2603 bool charged = true; 2604 2605 if (mem_cgroup_sockets_enabled && sk->sk_memcg && 2606 !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt))) 2607 goto suppress_allocation; 2608 2609 /* Under limit. */ 2610 if (allocated <= sk_prot_mem_limits(sk, 0)) { 2611 sk_leave_memory_pressure(sk); 2612 return 1; 2613 } 2614 2615 /* Under pressure. */ 2616 if (allocated > sk_prot_mem_limits(sk, 1)) 2617 sk_enter_memory_pressure(sk); 2618 2619 /* Over hard limit. 
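	 * sk_prot_mem_limits(sk, 0..2) index the protocol's three-element
	 * sysctl_mem[] vector; for TCP these are tcp_mem[] min, pressure and
	 * max, expressed in pages.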
*/ 2620 if (allocated > sk_prot_mem_limits(sk, 2)) 2621 goto suppress_allocation; 2622 2623 /* guarantee minimum buffer size under pressure */ 2624 if (kind == SK_MEM_RECV) { 2625 if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot)) 2626 return 1; 2627 2628 } else { /* SK_MEM_SEND */ 2629 int wmem0 = sk_get_wmem0(sk, prot); 2630 2631 if (sk->sk_type == SOCK_STREAM) { 2632 if (sk->sk_wmem_queued < wmem0) 2633 return 1; 2634 } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) { 2635 return 1; 2636 } 2637 } 2638 2639 if (sk_has_memory_pressure(sk)) { 2640 u64 alloc; 2641 2642 if (!sk_under_memory_pressure(sk)) 2643 return 1; 2644 alloc = sk_sockets_allocated_read_positive(sk); 2645 if (sk_prot_mem_limits(sk, 2) > alloc * 2646 sk_mem_pages(sk->sk_wmem_queued + 2647 atomic_read(&sk->sk_rmem_alloc) + 2648 sk->sk_forward_alloc)) 2649 return 1; 2650 } 2651 2652 suppress_allocation: 2653 2654 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) { 2655 sk_stream_moderate_sndbuf(sk); 2656 2657 /* Fail only if socket is _under_ its sndbuf. 2658 * In this case we cannot block, so that we have to fail. 2659 */ 2660 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) 2661 return 1; 2662 } 2663 2664 if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged)) 2665 trace_sock_exceed_buf_limit(sk, prot, allocated, kind); 2666 2667 sk_memory_allocated_sub(sk, amt); 2668 2669 if (mem_cgroup_sockets_enabled && sk->sk_memcg) 2670 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt); 2671 2672 return 0; 2673 } 2674 EXPORT_SYMBOL(__sk_mem_raise_allocated); 2675 2676 /** 2677 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated 2678 * @sk: socket 2679 * @size: memory size to allocate 2680 * @kind: allocation type 2681 * 2682 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means 2683 * rmem allocation. This function assumes that protocols which have 2684 * memory_pressure use sk_wmem_queued as write buffer accounting. 
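 *
 *	Illustrative call pattern (a sketch, not code from this file): a
 *	protocol that wants to queue @size more bytes typically does
 *	something like
 *
 *		if (!sk_wmem_schedule(sk, size))    (ends up here, SK_MEM_SEND)
 *			return -ENOBUFS;
 *		sk_mem_charge(sk, size);            (consumes sk_forward_alloc)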
2685 */ 2686 int __sk_mem_schedule(struct sock *sk, int size, int kind) 2687 { 2688 int ret, amt = sk_mem_pages(size); 2689 2690 sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT; 2691 ret = __sk_mem_raise_allocated(sk, size, amt, kind); 2692 if (!ret) 2693 sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT; 2694 return ret; 2695 } 2696 EXPORT_SYMBOL(__sk_mem_schedule); 2697 2698 /** 2699 * __sk_mem_reduce_allocated - reclaim memory_allocated 2700 * @sk: socket 2701 * @amount: number of quanta 2702 * 2703 * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc 2704 */ 2705 void __sk_mem_reduce_allocated(struct sock *sk, int amount) 2706 { 2707 sk_memory_allocated_sub(sk, amount); 2708 2709 if (mem_cgroup_sockets_enabled && sk->sk_memcg) 2710 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount); 2711 2712 if (sk_under_memory_pressure(sk) && 2713 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))) 2714 sk_leave_memory_pressure(sk); 2715 } 2716 EXPORT_SYMBOL(__sk_mem_reduce_allocated); 2717 2718 /** 2719 * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated 2720 * @sk: socket 2721 * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple) 2722 */ 2723 void __sk_mem_reclaim(struct sock *sk, int amount) 2724 { 2725 amount >>= SK_MEM_QUANTUM_SHIFT; 2726 sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT; 2727 __sk_mem_reduce_allocated(sk, amount); 2728 } 2729 EXPORT_SYMBOL(__sk_mem_reclaim); 2730 2731 int sk_set_peek_off(struct sock *sk, int val) 2732 { 2733 sk->sk_peek_off = val; 2734 return 0; 2735 } 2736 EXPORT_SYMBOL_GPL(sk_set_peek_off); 2737 2738 /* 2739 * Set of default routines for initialising struct proto_ops when 2740 * the protocol does not support a particular function. In certain 2741 * cases where it makes no sense for a protocol to have a "do nothing" 2742 * function, some default processing is provided. 
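 *
 *	A hedged example of how a minimal address family might wire these
 *	stubs into its ops table (illustrative only; "example_ops" and
 *	PF_EXAMPLE are hypothetical, not part of this file):
 *
 *		static const struct proto_ops example_ops = {
 *			.family		= PF_EXAMPLE,
 *			.owner		= THIS_MODULE,
 *			.bind		= sock_no_bind,
 *			.connect	= sock_no_connect,
 *			.socketpair	= sock_no_socketpair,
 *			.accept		= sock_no_accept,
 *			.listen		= sock_no_listen,
 *			.shutdown	= sock_no_shutdown,
 *			.mmap		= sock_no_mmap,
 *			.sendpage	= sock_no_sendpage,
 *		};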
2743 */ 2744 2745 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len) 2746 { 2747 return -EOPNOTSUPP; 2748 } 2749 EXPORT_SYMBOL(sock_no_bind); 2750 2751 int sock_no_connect(struct socket *sock, struct sockaddr *saddr, 2752 int len, int flags) 2753 { 2754 return -EOPNOTSUPP; 2755 } 2756 EXPORT_SYMBOL(sock_no_connect); 2757 2758 int sock_no_socketpair(struct socket *sock1, struct socket *sock2) 2759 { 2760 return -EOPNOTSUPP; 2761 } 2762 EXPORT_SYMBOL(sock_no_socketpair); 2763 2764 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags, 2765 bool kern) 2766 { 2767 return -EOPNOTSUPP; 2768 } 2769 EXPORT_SYMBOL(sock_no_accept); 2770 2771 int sock_no_getname(struct socket *sock, struct sockaddr *saddr, 2772 int peer) 2773 { 2774 return -EOPNOTSUPP; 2775 } 2776 EXPORT_SYMBOL(sock_no_getname); 2777 2778 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 2779 { 2780 return -EOPNOTSUPP; 2781 } 2782 EXPORT_SYMBOL(sock_no_ioctl); 2783 2784 int sock_no_listen(struct socket *sock, int backlog) 2785 { 2786 return -EOPNOTSUPP; 2787 } 2788 EXPORT_SYMBOL(sock_no_listen); 2789 2790 int sock_no_shutdown(struct socket *sock, int how) 2791 { 2792 return -EOPNOTSUPP; 2793 } 2794 EXPORT_SYMBOL(sock_no_shutdown); 2795 2796 int sock_no_setsockopt(struct socket *sock, int level, int optname, 2797 char __user *optval, unsigned int optlen) 2798 { 2799 return -EOPNOTSUPP; 2800 } 2801 EXPORT_SYMBOL(sock_no_setsockopt); 2802 2803 int sock_no_getsockopt(struct socket *sock, int level, int optname, 2804 char __user *optval, int __user *optlen) 2805 { 2806 return -EOPNOTSUPP; 2807 } 2808 EXPORT_SYMBOL(sock_no_getsockopt); 2809 2810 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len) 2811 { 2812 return -EOPNOTSUPP; 2813 } 2814 EXPORT_SYMBOL(sock_no_sendmsg); 2815 2816 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len) 2817 { 2818 return -EOPNOTSUPP; 2819 } 2820 EXPORT_SYMBOL(sock_no_sendmsg_locked); 2821 2822 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len, 2823 int flags) 2824 { 2825 return -EOPNOTSUPP; 2826 } 2827 EXPORT_SYMBOL(sock_no_recvmsg); 2828 2829 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma) 2830 { 2831 /* Mirror missing mmap method error code */ 2832 return -ENODEV; 2833 } 2834 EXPORT_SYMBOL(sock_no_mmap); 2835 2836 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags) 2837 { 2838 ssize_t res; 2839 struct msghdr msg = {.msg_flags = flags}; 2840 struct kvec iov; 2841 char *kaddr = kmap(page); 2842 iov.iov_base = kaddr + offset; 2843 iov.iov_len = size; 2844 res = kernel_sendmsg(sock, &msg, &iov, 1, size); 2845 kunmap(page); 2846 return res; 2847 } 2848 EXPORT_SYMBOL(sock_no_sendpage); 2849 2850 ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page, 2851 int offset, size_t size, int flags) 2852 { 2853 ssize_t res; 2854 struct msghdr msg = {.msg_flags = flags}; 2855 struct kvec iov; 2856 char *kaddr = kmap(page); 2857 2858 iov.iov_base = kaddr + offset; 2859 iov.iov_len = size; 2860 res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size); 2861 kunmap(page); 2862 return res; 2863 } 2864 EXPORT_SYMBOL(sock_no_sendpage_locked); 2865 2866 /* 2867 * Default Socket Callbacks 2868 */ 2869 2870 static void sock_def_wakeup(struct sock *sk) 2871 { 2872 struct socket_wq *wq; 2873 2874 rcu_read_lock(); 2875 wq = rcu_dereference(sk->sk_wq); 2876 if (skwq_has_sleeper(wq)) 2877 
wake_up_interruptible_all(&wq->wait); 2878 rcu_read_unlock(); 2879 } 2880 2881 static void sock_def_error_report(struct sock *sk) 2882 { 2883 struct socket_wq *wq; 2884 2885 rcu_read_lock(); 2886 wq = rcu_dereference(sk->sk_wq); 2887 if (skwq_has_sleeper(wq)) 2888 wake_up_interruptible_poll(&wq->wait, EPOLLERR); 2889 sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR); 2890 rcu_read_unlock(); 2891 } 2892 2893 void sock_def_readable(struct sock *sk) 2894 { 2895 struct socket_wq *wq; 2896 2897 rcu_read_lock(); 2898 wq = rcu_dereference(sk->sk_wq); 2899 if (skwq_has_sleeper(wq)) 2900 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI | 2901 EPOLLRDNORM | EPOLLRDBAND); 2902 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); 2903 rcu_read_unlock(); 2904 } 2905 2906 static void sock_def_write_space(struct sock *sk) 2907 { 2908 struct socket_wq *wq; 2909 2910 rcu_read_lock(); 2911 2912 /* Do not wake up a writer until he can make "significant" 2913 * progress. --DaveM 2914 */ 2915 if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= READ_ONCE(sk->sk_sndbuf)) { 2916 wq = rcu_dereference(sk->sk_wq); 2917 if (skwq_has_sleeper(wq)) 2918 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | 2919 EPOLLWRNORM | EPOLLWRBAND); 2920 2921 /* Should agree with poll, otherwise some programs break */ 2922 if (sock_writeable(sk)) 2923 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); 2924 } 2925 2926 rcu_read_unlock(); 2927 } 2928 2929 static void sock_def_destruct(struct sock *sk) 2930 { 2931 } 2932 2933 void sk_send_sigurg(struct sock *sk) 2934 { 2935 if (sk->sk_socket && sk->sk_socket->file) 2936 if (send_sigurg(&sk->sk_socket->file->f_owner)) 2937 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI); 2938 } 2939 EXPORT_SYMBOL(sk_send_sigurg); 2940 2941 void sk_reset_timer(struct sock *sk, struct timer_list* timer, 2942 unsigned long expires) 2943 { 2944 if (!mod_timer(timer, expires)) 2945 sock_hold(sk); 2946 } 2947 EXPORT_SYMBOL(sk_reset_timer); 2948 2949 void sk_stop_timer(struct sock *sk, struct timer_list* timer) 2950 { 2951 if (del_timer(timer)) 2952 __sock_put(sk); 2953 } 2954 EXPORT_SYMBOL(sk_stop_timer); 2955 2956 void sock_init_data(struct socket *sock, struct sock *sk) 2957 { 2958 sk_init_common(sk); 2959 sk->sk_send_head = NULL; 2960 2961 timer_setup(&sk->sk_timer, NULL, 0); 2962 2963 sk->sk_allocation = GFP_KERNEL; 2964 sk->sk_rcvbuf = sysctl_rmem_default; 2965 sk->sk_sndbuf = sysctl_wmem_default; 2966 sk->sk_state = TCP_CLOSE; 2967 sk_set_socket(sk, sock); 2968 2969 sock_set_flag(sk, SOCK_ZAPPED); 2970 2971 if (sock) { 2972 sk->sk_type = sock->type; 2973 RCU_INIT_POINTER(sk->sk_wq, &sock->wq); 2974 sock->sk = sk; 2975 sk->sk_uid = SOCK_INODE(sock)->i_uid; 2976 } else { 2977 RCU_INIT_POINTER(sk->sk_wq, NULL); 2978 sk->sk_uid = make_kuid(sock_net(sk)->user_ns, 0); 2979 } 2980 2981 rwlock_init(&sk->sk_callback_lock); 2982 if (sk->sk_kern_sock) 2983 lockdep_set_class_and_name( 2984 &sk->sk_callback_lock, 2985 af_kern_callback_keys + sk->sk_family, 2986 af_family_kern_clock_key_strings[sk->sk_family]); 2987 else 2988 lockdep_set_class_and_name( 2989 &sk->sk_callback_lock, 2990 af_callback_keys + sk->sk_family, 2991 af_family_clock_key_strings[sk->sk_family]); 2992 2993 sk->sk_state_change = sock_def_wakeup; 2994 sk->sk_data_ready = sock_def_readable; 2995 sk->sk_write_space = sock_def_write_space; 2996 sk->sk_error_report = sock_def_error_report; 2997 sk->sk_destruct = sock_def_destruct; 2998 2999 sk->sk_frag.page = NULL; 3000 sk->sk_frag.offset = 0; 3001 sk->sk_peek_off = -1; 3002 3003 sk->sk_peer_pid = NULL; 3004 
sk->sk_peer_cred = NULL; 3005 sk->sk_write_pending = 0; 3006 sk->sk_rcvlowat = 1; 3007 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; 3008 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; 3009 3010 sk->sk_stamp = SK_DEFAULT_STAMP; 3011 #if BITS_PER_LONG==32 3012 seqlock_init(&sk->sk_stamp_seq); 3013 #endif 3014 atomic_set(&sk->sk_zckey, 0); 3015 3016 #ifdef CONFIG_NET_RX_BUSY_POLL 3017 sk->sk_napi_id = 0; 3018 sk->sk_ll_usec = sysctl_net_busy_read; 3019 #endif 3020 3021 sk->sk_max_pacing_rate = ~0UL; 3022 sk->sk_pacing_rate = ~0UL; 3023 WRITE_ONCE(sk->sk_pacing_shift, 10); 3024 sk->sk_incoming_cpu = -1; 3025 3026 sk_rx_queue_clear(sk); 3027 /* 3028 * Before updating sk_refcnt, we must commit prior changes to memory 3029 * (Documentation/RCU/rculist_nulls.txt for details) 3030 */ 3031 smp_wmb(); 3032 refcount_set(&sk->sk_refcnt, 1); 3033 atomic_set(&sk->sk_drops, 0); 3034 } 3035 EXPORT_SYMBOL(sock_init_data); 3036 3037 void lock_sock_nested(struct sock *sk, int subclass) 3038 { 3039 might_sleep(); 3040 spin_lock_bh(&sk->sk_lock.slock); 3041 if (sk->sk_lock.owned) 3042 __lock_sock(sk); 3043 sk->sk_lock.owned = 1; 3044 spin_unlock(&sk->sk_lock.slock); 3045 /* 3046 * The sk_lock has mutex_lock() semantics here: 3047 */ 3048 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_); 3049 local_bh_enable(); 3050 } 3051 EXPORT_SYMBOL(lock_sock_nested); 3052 3053 void release_sock(struct sock *sk) 3054 { 3055 spin_lock_bh(&sk->sk_lock.slock); 3056 if (sk->sk_backlog.tail) 3057 __release_sock(sk); 3058 3059 /* Warning : release_cb() might need to release sk ownership, 3060 * ie call sock_release_ownership(sk) before us. 3061 */ 3062 if (sk->sk_prot->release_cb) 3063 sk->sk_prot->release_cb(sk); 3064 3065 sock_release_ownership(sk); 3066 if (waitqueue_active(&sk->sk_lock.wq)) 3067 wake_up(&sk->sk_lock.wq); 3068 spin_unlock_bh(&sk->sk_lock.slock); 3069 } 3070 EXPORT_SYMBOL(release_sock); 3071 3072 /** 3073 * lock_sock_fast - fast version of lock_sock 3074 * @sk: socket 3075 * 3076 * This version should be used for very small section, where process wont block 3077 * return false if fast path is taken: 3078 * 3079 * sk_lock.slock locked, owned = 0, BH disabled 3080 * 3081 * return true if slow path is taken: 3082 * 3083 * sk_lock.slock unlocked, owned = 1, BH enabled 3084 */ 3085 bool lock_sock_fast(struct sock *sk) 3086 { 3087 might_sleep(); 3088 spin_lock_bh(&sk->sk_lock.slock); 3089 3090 if (!sk->sk_lock.owned) 3091 /* 3092 * Note : We must disable BH 3093 */ 3094 return false; 3095 3096 __lock_sock(sk); 3097 sk->sk_lock.owned = 1; 3098 spin_unlock(&sk->sk_lock.slock); 3099 /* 3100 * The sk_lock has mutex_lock() semantics here: 3101 */ 3102 mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_); 3103 local_bh_enable(); 3104 return true; 3105 } 3106 EXPORT_SYMBOL(lock_sock_fast); 3107 3108 int sock_gettstamp(struct socket *sock, void __user *userstamp, 3109 bool timeval, bool time32) 3110 { 3111 struct sock *sk = sock->sk; 3112 struct timespec64 ts; 3113 3114 sock_enable_timestamp(sk, SOCK_TIMESTAMP); 3115 ts = ktime_to_timespec64(sock_read_timestamp(sk)); 3116 if (ts.tv_sec == -1) 3117 return -ENOENT; 3118 if (ts.tv_sec == 0) { 3119 ktime_t kt = ktime_get_real(); 3120 sock_write_timestamp(sk, kt); 3121 ts = ktime_to_timespec64(kt); 3122 } 3123 3124 if (timeval) 3125 ts.tv_nsec /= 1000; 3126 3127 #ifdef CONFIG_COMPAT_32BIT_TIME 3128 if (time32) 3129 return put_old_timespec32(&ts, userstamp); 3130 #endif 3131 #ifdef CONFIG_SPARC64 3132 /* beware of padding in sparc64 timeval */ 3133 if (timeval && !in_compat_syscall()) 
{ 3134 struct __kernel_old_timeval __user tv = { 3135 .tv_sec = ts.tv_sec, 3136 .tv_usec = ts.tv_nsec, 3137 }; 3138 if (copy_to_user(userstamp, &tv, sizeof(tv))) 3139 return -EFAULT; 3140 return 0; 3141 } 3142 #endif 3143 return put_timespec64(&ts, userstamp); 3144 } 3145 EXPORT_SYMBOL(sock_gettstamp); 3146 3147 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag) 3148 { 3149 if (!sock_flag(sk, flag)) { 3150 unsigned long previous_flags = sk->sk_flags; 3151 3152 sock_set_flag(sk, flag); 3153 /* 3154 * we just set one of the two flags which require net 3155 * time stamping, but time stamping might have been on 3156 * already because of the other one 3157 */ 3158 if (sock_needs_netstamp(sk) && 3159 !(previous_flags & SK_FLAGS_TIMESTAMP)) 3160 net_enable_timestamp(); 3161 } 3162 } 3163 3164 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, 3165 int level, int type) 3166 { 3167 struct sock_exterr_skb *serr; 3168 struct sk_buff *skb; 3169 int copied, err; 3170 3171 err = -EAGAIN; 3172 skb = sock_dequeue_err_skb(sk); 3173 if (skb == NULL) 3174 goto out; 3175 3176 copied = skb->len; 3177 if (copied > len) { 3178 msg->msg_flags |= MSG_TRUNC; 3179 copied = len; 3180 } 3181 err = skb_copy_datagram_msg(skb, 0, msg, copied); 3182 if (err) 3183 goto out_free_skb; 3184 3185 sock_recv_timestamp(msg, sk, skb); 3186 3187 serr = SKB_EXT_ERR(skb); 3188 put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee); 3189 3190 msg->msg_flags |= MSG_ERRQUEUE; 3191 err = copied; 3192 3193 out_free_skb: 3194 kfree_skb(skb); 3195 out: 3196 return err; 3197 } 3198 EXPORT_SYMBOL(sock_recv_errqueue); 3199 3200 /* 3201 * Get a socket option on an socket. 3202 * 3203 * FIX: POSIX 1003.1g is very ambiguous here. It states that 3204 * asynchronous errors should be reported by getsockopt. We assume 3205 * this means if you specify SO_ERROR (otherwise whats the point of it). 3206 */ 3207 int sock_common_getsockopt(struct socket *sock, int level, int optname, 3208 char __user *optval, int __user *optlen) 3209 { 3210 struct sock *sk = sock->sk; 3211 3212 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen); 3213 } 3214 EXPORT_SYMBOL(sock_common_getsockopt); 3215 3216 #ifdef CONFIG_COMPAT 3217 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname, 3218 char __user *optval, int __user *optlen) 3219 { 3220 struct sock *sk = sock->sk; 3221 3222 if (sk->sk_prot->compat_getsockopt != NULL) 3223 return sk->sk_prot->compat_getsockopt(sk, level, optname, 3224 optval, optlen); 3225 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen); 3226 } 3227 EXPORT_SYMBOL(compat_sock_common_getsockopt); 3228 #endif 3229 3230 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, 3231 int flags) 3232 { 3233 struct sock *sk = sock->sk; 3234 int addr_len = 0; 3235 int err; 3236 3237 err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT, 3238 flags & ~MSG_DONTWAIT, &addr_len); 3239 if (err >= 0) 3240 msg->msg_namelen = addr_len; 3241 return err; 3242 } 3243 EXPORT_SYMBOL(sock_common_recvmsg); 3244 3245 /* 3246 * Set socket options on an inet socket. 
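 *
 *	Protocols that need no special handling usually just point their
 *	proto_ops at these common helpers, which forward to sk->sk_prot.
 *	A hedged sketch of such wiring (illustrative only):
 *
 *		.setsockopt	= sock_common_setsockopt,
 *		.getsockopt	= sock_common_getsockopt,
 *		.recvmsg	= sock_common_recvmsg,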
 */
int sock_common_setsockopt(struct socket *sock, int level, int optname,
			   char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;

	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_setsockopt);

#ifdef CONFIG_COMPAT
int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
				  char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;

	if (sk->sk_prot->compat_setsockopt != NULL)
		return sk->sk_prot->compat_setsockopt(sk, level, optname,
						      optval, optlen);
	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_setsockopt);
#endif

void sk_common_release(struct sock *sk)
{
	if (sk->sk_prot->destroy)
		sk->sk_prot->destroy(sk);

	/*
	 * Observation: when sk_common_release() is called, processes have
	 * no access to the socket, but the network still does.
	 * Step one, detach it from networking:
	 *
	 * A. Remove from hash tables.
	 */

	sk->sk_prot->unhash(sk);

	/*
	 * At this point the socket cannot receive new packets, but it is
	 * possible that some packets are still in flight, because some CPU
	 * running the receiver did its hash table lookup before we unhashed
	 * the socket. Those packets will reach the receive queue and be
	 * purged by the socket destructor.
	 *
	 * We also still have packets pending on the receive queue and
	 * probably our own packets waiting in device queues. sock_destroy()
	 * will drain the receive queue, but transmitted packets will delay
	 * socket destruction until the last reference is released.
	 */

	sock_orphan(sk);

	xfrm_sk_free_policy(sk);

	sk_refcnt_debug_release(sk);

	sock_put(sk);
}
EXPORT_SYMBOL(sk_common_release);

void sk_get_meminfo(const struct sock *sk, u32 *mem)
{
	memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);

	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
	mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
	mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
	mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
	mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
	mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
	mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
}

#ifdef CONFIG_PROC_FS
#define PROTO_INUSE_NR	64	/* should be enough for the first time */
struct prot_inuse {
	int val[PROTO_INUSE_NR];
};

static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);

void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
{
	__this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_add);

int sock_prot_inuse_get(struct net *net, struct proto *prot)
{
	int cpu, idx = prot->inuse_idx;
	int res = 0;

	for_each_possible_cpu(cpu)
		res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];

	return res >= 0 ?
res : 0; 3346 } 3347 EXPORT_SYMBOL_GPL(sock_prot_inuse_get); 3348 3349 static void sock_inuse_add(struct net *net, int val) 3350 { 3351 this_cpu_add(*net->core.sock_inuse, val); 3352 } 3353 3354 int sock_inuse_get(struct net *net) 3355 { 3356 int cpu, res = 0; 3357 3358 for_each_possible_cpu(cpu) 3359 res += *per_cpu_ptr(net->core.sock_inuse, cpu); 3360 3361 return res; 3362 } 3363 3364 EXPORT_SYMBOL_GPL(sock_inuse_get); 3365 3366 static int __net_init sock_inuse_init_net(struct net *net) 3367 { 3368 net->core.prot_inuse = alloc_percpu(struct prot_inuse); 3369 if (net->core.prot_inuse == NULL) 3370 return -ENOMEM; 3371 3372 net->core.sock_inuse = alloc_percpu(int); 3373 if (net->core.sock_inuse == NULL) 3374 goto out; 3375 3376 return 0; 3377 3378 out: 3379 free_percpu(net->core.prot_inuse); 3380 return -ENOMEM; 3381 } 3382 3383 static void __net_exit sock_inuse_exit_net(struct net *net) 3384 { 3385 free_percpu(net->core.prot_inuse); 3386 free_percpu(net->core.sock_inuse); 3387 } 3388 3389 static struct pernet_operations net_inuse_ops = { 3390 .init = sock_inuse_init_net, 3391 .exit = sock_inuse_exit_net, 3392 }; 3393 3394 static __init int net_inuse_init(void) 3395 { 3396 if (register_pernet_subsys(&net_inuse_ops)) 3397 panic("Cannot initialize net inuse counters"); 3398 3399 return 0; 3400 } 3401 3402 core_initcall(net_inuse_init); 3403 3404 static int assign_proto_idx(struct proto *prot) 3405 { 3406 prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR); 3407 3408 if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) { 3409 pr_err("PROTO_INUSE_NR exhausted\n"); 3410 return -ENOSPC; 3411 } 3412 3413 set_bit(prot->inuse_idx, proto_inuse_idx); 3414 return 0; 3415 } 3416 3417 static void release_proto_idx(struct proto *prot) 3418 { 3419 if (prot->inuse_idx != PROTO_INUSE_NR - 1) 3420 clear_bit(prot->inuse_idx, proto_inuse_idx); 3421 } 3422 #else 3423 static inline int assign_proto_idx(struct proto *prot) 3424 { 3425 return 0; 3426 } 3427 3428 static inline void release_proto_idx(struct proto *prot) 3429 { 3430 } 3431 3432 static void sock_inuse_add(struct net *net, int val) 3433 { 3434 } 3435 #endif 3436 3437 static void req_prot_cleanup(struct request_sock_ops *rsk_prot) 3438 { 3439 if (!rsk_prot) 3440 return; 3441 kfree(rsk_prot->slab_name); 3442 rsk_prot->slab_name = NULL; 3443 kmem_cache_destroy(rsk_prot->slab); 3444 rsk_prot->slab = NULL; 3445 } 3446 3447 static int req_prot_init(const struct proto *prot) 3448 { 3449 struct request_sock_ops *rsk_prot = prot->rsk_prot; 3450 3451 if (!rsk_prot) 3452 return 0; 3453 3454 rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", 3455 prot->name); 3456 if (!rsk_prot->slab_name) 3457 return -ENOMEM; 3458 3459 rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name, 3460 rsk_prot->obj_size, 0, 3461 SLAB_ACCOUNT | prot->slab_flags, 3462 NULL); 3463 3464 if (!rsk_prot->slab) { 3465 pr_crit("%s: Can't create request sock SLAB cache!\n", 3466 prot->name); 3467 return -ENOMEM; 3468 } 3469 return 0; 3470 } 3471 3472 int proto_register(struct proto *prot, int alloc_slab) 3473 { 3474 int ret = -ENOBUFS; 3475 3476 if (alloc_slab) { 3477 prot->slab = kmem_cache_create_usercopy(prot->name, 3478 prot->obj_size, 0, 3479 SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT | 3480 prot->slab_flags, 3481 prot->useroffset, prot->usersize, 3482 NULL); 3483 3484 if (prot->slab == NULL) { 3485 pr_crit("%s: Can't create sock SLAB cache!\n", 3486 prot->name); 3487 goto out; 3488 } 3489 3490 if (req_prot_init(prot)) 3491 goto out_free_request_sock_slab; 3492 3493 if 
(prot->twsk_prot != NULL) { 3494 prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name); 3495 3496 if (prot->twsk_prot->twsk_slab_name == NULL) 3497 goto out_free_request_sock_slab; 3498 3499 prot->twsk_prot->twsk_slab = 3500 kmem_cache_create(prot->twsk_prot->twsk_slab_name, 3501 prot->twsk_prot->twsk_obj_size, 3502 0, 3503 SLAB_ACCOUNT | 3504 prot->slab_flags, 3505 NULL); 3506 if (prot->twsk_prot->twsk_slab == NULL) 3507 goto out_free_timewait_sock_slab_name; 3508 } 3509 } 3510 3511 mutex_lock(&proto_list_mutex); 3512 ret = assign_proto_idx(prot); 3513 if (ret) { 3514 mutex_unlock(&proto_list_mutex); 3515 goto out_free_timewait_sock_slab_name; 3516 } 3517 list_add(&prot->node, &proto_list); 3518 mutex_unlock(&proto_list_mutex); 3519 return ret; 3520 3521 out_free_timewait_sock_slab_name: 3522 if (alloc_slab && prot->twsk_prot) 3523 kfree(prot->twsk_prot->twsk_slab_name); 3524 out_free_request_sock_slab: 3525 if (alloc_slab) { 3526 req_prot_cleanup(prot->rsk_prot); 3527 3528 kmem_cache_destroy(prot->slab); 3529 prot->slab = NULL; 3530 } 3531 out: 3532 return ret; 3533 } 3534 EXPORT_SYMBOL(proto_register); 3535 3536 void proto_unregister(struct proto *prot) 3537 { 3538 mutex_lock(&proto_list_mutex); 3539 release_proto_idx(prot); 3540 list_del(&prot->node); 3541 mutex_unlock(&proto_list_mutex); 3542 3543 kmem_cache_destroy(prot->slab); 3544 prot->slab = NULL; 3545 3546 req_prot_cleanup(prot->rsk_prot); 3547 3548 if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) { 3549 kmem_cache_destroy(prot->twsk_prot->twsk_slab); 3550 kfree(prot->twsk_prot->twsk_slab_name); 3551 prot->twsk_prot->twsk_slab = NULL; 3552 } 3553 } 3554 EXPORT_SYMBOL(proto_unregister); 3555 3556 int sock_load_diag_module(int family, int protocol) 3557 { 3558 if (!protocol) { 3559 if (!sock_is_registered(family)) 3560 return -ENOENT; 3561 3562 return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, 3563 NETLINK_SOCK_DIAG, family); 3564 } 3565 3566 #ifdef CONFIG_INET 3567 if (family == AF_INET && 3568 protocol != IPPROTO_RAW && 3569 protocol < MAX_INET_PROTOS && 3570 !rcu_access_pointer(inet_protos[protocol])) 3571 return -ENOENT; 3572 #endif 3573 3574 return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK, 3575 NETLINK_SOCK_DIAG, family, protocol); 3576 } 3577 EXPORT_SYMBOL(sock_load_diag_module); 3578 3579 #ifdef CONFIG_PROC_FS 3580 static void *proto_seq_start(struct seq_file *seq, loff_t *pos) 3581 __acquires(proto_list_mutex) 3582 { 3583 mutex_lock(&proto_list_mutex); 3584 return seq_list_start_head(&proto_list, *pos); 3585 } 3586 3587 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3588 { 3589 return seq_list_next(v, &proto_list, pos); 3590 } 3591 3592 static void proto_seq_stop(struct seq_file *seq, void *v) 3593 __releases(proto_list_mutex) 3594 { 3595 mutex_unlock(&proto_list_mutex); 3596 } 3597 3598 static char proto_method_implemented(const void *method) 3599 { 3600 return method == NULL ? 'n' : 'y'; 3601 } 3602 static long sock_prot_memory_allocated(struct proto *proto) 3603 { 3604 return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L; 3605 } 3606 3607 static const char *sock_prot_memory_pressure(struct proto *proto) 3608 { 3609 return proto->memory_pressure != NULL ? 3610 proto_memory_pressure(proto) ? 
"yes" : "no" : "NI"; 3611 } 3612 3613 static void proto_seq_printf(struct seq_file *seq, struct proto *proto) 3614 { 3615 3616 seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s " 3617 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n", 3618 proto->name, 3619 proto->obj_size, 3620 sock_prot_inuse_get(seq_file_net(seq), proto), 3621 sock_prot_memory_allocated(proto), 3622 sock_prot_memory_pressure(proto), 3623 proto->max_header, 3624 proto->slab == NULL ? "no" : "yes", 3625 module_name(proto->owner), 3626 proto_method_implemented(proto->close), 3627 proto_method_implemented(proto->connect), 3628 proto_method_implemented(proto->disconnect), 3629 proto_method_implemented(proto->accept), 3630 proto_method_implemented(proto->ioctl), 3631 proto_method_implemented(proto->init), 3632 proto_method_implemented(proto->destroy), 3633 proto_method_implemented(proto->shutdown), 3634 proto_method_implemented(proto->setsockopt), 3635 proto_method_implemented(proto->getsockopt), 3636 proto_method_implemented(proto->sendmsg), 3637 proto_method_implemented(proto->recvmsg), 3638 proto_method_implemented(proto->sendpage), 3639 proto_method_implemented(proto->bind), 3640 proto_method_implemented(proto->backlog_rcv), 3641 proto_method_implemented(proto->hash), 3642 proto_method_implemented(proto->unhash), 3643 proto_method_implemented(proto->get_port), 3644 proto_method_implemented(proto->enter_memory_pressure)); 3645 } 3646 3647 static int proto_seq_show(struct seq_file *seq, void *v) 3648 { 3649 if (v == &proto_list) 3650 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s", 3651 "protocol", 3652 "size", 3653 "sockets", 3654 "memory", 3655 "press", 3656 "maxhdr", 3657 "slab", 3658 "module", 3659 "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n"); 3660 else 3661 proto_seq_printf(seq, list_entry(v, struct proto, node)); 3662 return 0; 3663 } 3664 3665 static const struct seq_operations proto_seq_ops = { 3666 .start = proto_seq_start, 3667 .next = proto_seq_next, 3668 .stop = proto_seq_stop, 3669 .show = proto_seq_show, 3670 }; 3671 3672 static __net_init int proto_init_net(struct net *net) 3673 { 3674 if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops, 3675 sizeof(struct seq_net_private))) 3676 return -ENOMEM; 3677 3678 return 0; 3679 } 3680 3681 static __net_exit void proto_exit_net(struct net *net) 3682 { 3683 remove_proc_entry("protocols", net->proc_net); 3684 } 3685 3686 3687 static __net_initdata struct pernet_operations proto_net_ops = { 3688 .init = proto_init_net, 3689 .exit = proto_exit_net, 3690 }; 3691 3692 static int __init proto_init(void) 3693 { 3694 return register_pernet_subsys(&proto_net_ops); 3695 } 3696 3697 subsys_initcall(proto_init); 3698 3699 #endif /* PROC_FS */ 3700 3701 #ifdef CONFIG_NET_RX_BUSY_POLL 3702 bool sk_busy_loop_end(void *p, unsigned long start_time) 3703 { 3704 struct sock *sk = p; 3705 3706 return !skb_queue_empty_lockless(&sk->sk_receive_queue) || 3707 sk_busy_loop_timeout(sk, start_time); 3708 } 3709 EXPORT_SYMBOL(sk_busy_loop_end); 3710 #endif /* CONFIG_NET_RX_BUSY_POLL */ 3711 3712 int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len) 3713 { 3714 if (!sk->sk_prot->bind_add) 3715 return -EOPNOTSUPP; 3716 return sk->sk_prot->bind_add(sk, addr, addr_len); 3717 } 3718 EXPORT_SYMBOL(sock_bind_add); 3719