1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * INET An implementation of the TCP/IP protocol suite for the LINUX 4 * operating system. INET is implemented using the BSD Socket 5 * interface as the means of communication with the user level. 6 * 7 * Generic socket support routines. Memory allocators, socket lock/release 8 * handler for protocols to use and generic option handler. 9 * 10 * Authors: Ross Biro 11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 12 * Florian La Roche, <flla@stud.uni-sb.de> 13 * Alan Cox, <A.Cox@swansea.ac.uk> 14 * 15 * Fixes: 16 * Alan Cox : Numerous verify_area() problems 17 * Alan Cox : Connecting on a connecting socket 18 * now returns an error for tcp. 19 * Alan Cox : sock->protocol is set correctly. 20 * and is not sometimes left as 0. 21 * Alan Cox : connect handles icmp errors on a 22 * connect properly. Unfortunately there 23 * is a restart syscall nasty there. I 24 * can't match BSD without hacking the C 25 * library. Ideas urgently sought! 26 * Alan Cox : Disallow bind() to addresses that are 27 * not ours - especially broadcast ones!! 28 * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost) 29 * Alan Cox : sock_wfree/sock_rfree don't destroy sockets, 30 * instead they leave that for the DESTROY timer. 31 * Alan Cox : Clean up error flag in accept 32 * Alan Cox : TCP ack handling is buggy, the DESTROY timer 33 * was buggy. Put a remove_sock() in the handler 34 * for memory when we hit 0. Also altered the timer 35 * code. The ACK stuff can wait and needs major 36 * TCP layer surgery. 37 * Alan Cox : Fixed TCP ack bug, removed remove sock 38 * and fixed timer/inet_bh race. 39 * Alan Cox : Added zapped flag for TCP 40 * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code 41 * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb 42 * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources 43 * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing. 44 * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so... 45 * Rick Sladkey : Relaxed UDP rules for matching packets. 46 * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support 47 * Pauline Middelink : identd support 48 * Alan Cox : Fixed connect() taking signals I think. 49 * Alan Cox : SO_LINGER supported 50 * Alan Cox : Error reporting fixes 51 * Anonymous : inet_create tidied up (sk->reuse setting) 52 * Alan Cox : inet sockets don't set sk->type! 53 * Alan Cox : Split socket option code 54 * Alan Cox : Callbacks 55 * Alan Cox : Nagle flag for Charles & Johannes stuff 56 * Alex : Removed restriction on inet fioctl 57 * Alan Cox : Splitting INET from NET core 58 * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt() 59 * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code 60 * Alan Cox : Split IP from generic code 61 * Alan Cox : New kfree_skbmem() 62 * Alan Cox : Make SO_DEBUG superuser only. 63 * Alan Cox : Allow anyone to clear SO_DEBUG 64 * (compatibility fix) 65 * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput. 66 * Alan Cox : Allocator for a socket is settable. 67 * Alan Cox : SO_ERROR includes soft errors. 68 * Alan Cox : Allow NULL arguments on some SO_ opts 69 * Alan Cox : Generic socket allocation to make hooks 70 * easier (suggested by Craig Metz). 71 * Michael Pall : SO_ERROR returns positive errno again 72 * Steve Whitehouse: Added default destructor to free 73 * protocol private data. 
74 * Steve Whitehouse: Added various other default routines 75 * common to several socket families. 76 * Chris Evans : Call suser() check last on F_SETOWN 77 * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER. 78 * Andi Kleen : Add sock_kmalloc()/sock_kfree_s() 79 * Andi Kleen : Fix write_space callback 80 * Chris Evans : Security fixes - signedness again 81 * Arnaldo C. Melo : cleanups, use skb_queue_purge 82 * 83 * To Fix: 84 */ 85 86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 87 88 #include <asm/unaligned.h> 89 #include <linux/capability.h> 90 #include <linux/errno.h> 91 #include <linux/errqueue.h> 92 #include <linux/types.h> 93 #include <linux/socket.h> 94 #include <linux/in.h> 95 #include <linux/kernel.h> 96 #include <linux/module.h> 97 #include <linux/proc_fs.h> 98 #include <linux/seq_file.h> 99 #include <linux/sched.h> 100 #include <linux/sched/mm.h> 101 #include <linux/timer.h> 102 #include <linux/string.h> 103 #include <linux/sockios.h> 104 #include <linux/net.h> 105 #include <linux/mm.h> 106 #include <linux/slab.h> 107 #include <linux/interrupt.h> 108 #include <linux/poll.h> 109 #include <linux/tcp.h> 110 #include <linux/init.h> 111 #include <linux/highmem.h> 112 #include <linux/user_namespace.h> 113 #include <linux/static_key.h> 114 #include <linux/memcontrol.h> 115 #include <linux/prefetch.h> 116 #include <linux/compat.h> 117 118 #include <linux/uaccess.h> 119 120 #include <linux/netdevice.h> 121 #include <net/protocol.h> 122 #include <linux/skbuff.h> 123 #include <net/net_namespace.h> 124 #include <net/request_sock.h> 125 #include <net/sock.h> 126 #include <linux/net_tstamp.h> 127 #include <net/xfrm.h> 128 #include <linux/ipsec.h> 129 #include <net/cls_cgroup.h> 130 #include <net/netprio_cgroup.h> 131 #include <linux/sock_diag.h> 132 133 #include <linux/filter.h> 134 #include <net/sock_reuseport.h> 135 #include <net/bpf_sk_storage.h> 136 137 #include <trace/events/sock.h> 138 139 #include <net/tcp.h> 140 #include <net/busy_poll.h> 141 142 static DEFINE_MUTEX(proto_list_mutex); 143 static LIST_HEAD(proto_list); 144 145 static void sock_inuse_add(struct net *net, int val); 146 147 /** 148 * sk_ns_capable - General socket capability test 149 * @sk: Socket to use a capability on or through 150 * @user_ns: The user namespace of the capability to use 151 * @cap: The capability to use 152 * 153 * Test to see if the opener of the socket had when the socket was 154 * created and the current process has the capability @cap in the user 155 * namespace @user_ns. 156 */ 157 bool sk_ns_capable(const struct sock *sk, 158 struct user_namespace *user_ns, int cap) 159 { 160 return file_ns_capable(sk->sk_socket->file, user_ns, cap) && 161 ns_capable(user_ns, cap); 162 } 163 EXPORT_SYMBOL(sk_ns_capable); 164 165 /** 166 * sk_capable - Socket global capability test 167 * @sk: Socket to use a capability on or through 168 * @cap: The global capability to use 169 * 170 * Test to see if the opener of the socket had when the socket was 171 * created and the current process has the capability @cap in all user 172 * namespaces. 
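 *
 * Return: %true if the socket opener's credentials and the current task
 * both carry @cap in the initial user namespace, %false otherwise.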
173 */ 174 bool sk_capable(const struct sock *sk, int cap) 175 { 176 return sk_ns_capable(sk, &init_user_ns, cap); 177 } 178 EXPORT_SYMBOL(sk_capable); 179 180 /** 181 * sk_net_capable - Network namespace socket capability test 182 * @sk: Socket to use a capability on or through 183 * @cap: The capability to use 184 * 185 * Test to see if the opener of the socket had when the socket was created 186 * and the current process has the capability @cap over the network namespace 187 * the socket is a member of. 188 */ 189 bool sk_net_capable(const struct sock *sk, int cap) 190 { 191 return sk_ns_capable(sk, sock_net(sk)->user_ns, cap); 192 } 193 EXPORT_SYMBOL(sk_net_capable); 194 195 /* 196 * Each address family might have different locking rules, so we have 197 * one slock key per address family and separate keys for internal and 198 * userspace sockets. 199 */ 200 static struct lock_class_key af_family_keys[AF_MAX]; 201 static struct lock_class_key af_family_kern_keys[AF_MAX]; 202 static struct lock_class_key af_family_slock_keys[AF_MAX]; 203 static struct lock_class_key af_family_kern_slock_keys[AF_MAX]; 204 205 /* 206 * Make lock validator output more readable. (we pre-construct these 207 * strings build-time, so that runtime initialization of socket 208 * locks is fast): 209 */ 210 211 #define _sock_locks(x) \ 212 x "AF_UNSPEC", x "AF_UNIX" , x "AF_INET" , \ 213 x "AF_AX25" , x "AF_IPX" , x "AF_APPLETALK", \ 214 x "AF_NETROM", x "AF_BRIDGE" , x "AF_ATMPVC" , \ 215 x "AF_X25" , x "AF_INET6" , x "AF_ROSE" , \ 216 x "AF_DECnet", x "AF_NETBEUI" , x "AF_SECURITY" , \ 217 x "AF_KEY" , x "AF_NETLINK" , x "AF_PACKET" , \ 218 x "AF_ASH" , x "AF_ECONET" , x "AF_ATMSVC" , \ 219 x "AF_RDS" , x "AF_SNA" , x "AF_IRDA" , \ 220 x "AF_PPPOX" , x "AF_WANPIPE" , x "AF_LLC" , \ 221 x "27" , x "28" , x "AF_CAN" , \ 222 x "AF_TIPC" , x "AF_BLUETOOTH", x "IUCV" , \ 223 x "AF_RXRPC" , x "AF_ISDN" , x "AF_PHONET" , \ 224 x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \ 225 x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \ 226 x "AF_QIPCRTR", x "AF_SMC" , x "AF_XDP" , \ 227 x "AF_MAX" 228 229 static const char *const af_family_key_strings[AF_MAX+1] = { 230 _sock_locks("sk_lock-") 231 }; 232 static const char *const af_family_slock_key_strings[AF_MAX+1] = { 233 _sock_locks("slock-") 234 }; 235 static const char *const af_family_clock_key_strings[AF_MAX+1] = { 236 _sock_locks("clock-") 237 }; 238 239 static const char *const af_family_kern_key_strings[AF_MAX+1] = { 240 _sock_locks("k-sk_lock-") 241 }; 242 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = { 243 _sock_locks("k-slock-") 244 }; 245 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = { 246 _sock_locks("k-clock-") 247 }; 248 static const char *const af_family_rlock_key_strings[AF_MAX+1] = { 249 _sock_locks("rlock-") 250 }; 251 static const char *const af_family_wlock_key_strings[AF_MAX+1] = { 252 _sock_locks("wlock-") 253 }; 254 static const char *const af_family_elock_key_strings[AF_MAX+1] = { 255 _sock_locks("elock-") 256 }; 257 258 /* 259 * sk_callback_lock and sk queues locking rules are per-address-family, 260 * so split the lock classes by using a per-AF key: 261 */ 262 static struct lock_class_key af_callback_keys[AF_MAX]; 263 static struct lock_class_key af_rlock_keys[AF_MAX]; 264 static struct lock_class_key af_wlock_keys[AF_MAX]; 265 static struct lock_class_key af_elock_keys[AF_MAX]; 266 static struct lock_class_key af_kern_callback_keys[AF_MAX]; 267 268 /* Run time adjustable parameters. 
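 * sysctl_wmem_max and sysctl_rmem_max cap what SO_SNDBUF/SO_RCVBUF may
 * request below; the _default values seed newly created sockets. All four
 * are exposed as the net.core.{w,r}mem_{max,default} sysctls.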
*/ 269 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX; 270 EXPORT_SYMBOL(sysctl_wmem_max); 271 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX; 272 EXPORT_SYMBOL(sysctl_rmem_max); 273 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX; 274 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX; 275 276 /* Maximal space eaten by iovec or ancillary data plus some space */ 277 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512); 278 EXPORT_SYMBOL(sysctl_optmem_max); 279 280 int sysctl_tstamp_allow_data __read_mostly = 1; 281 282 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key); 283 EXPORT_SYMBOL_GPL(memalloc_socks_key); 284 285 /** 286 * sk_set_memalloc - sets %SOCK_MEMALLOC 287 * @sk: socket to set it on 288 * 289 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves. 290 * It's the responsibility of the admin to adjust min_free_kbytes 291 * to meet the requirements 292 */ 293 void sk_set_memalloc(struct sock *sk) 294 { 295 sock_set_flag(sk, SOCK_MEMALLOC); 296 sk->sk_allocation |= __GFP_MEMALLOC; 297 static_branch_inc(&memalloc_socks_key); 298 } 299 EXPORT_SYMBOL_GPL(sk_set_memalloc); 300 301 void sk_clear_memalloc(struct sock *sk) 302 { 303 sock_reset_flag(sk, SOCK_MEMALLOC); 304 sk->sk_allocation &= ~__GFP_MEMALLOC; 305 static_branch_dec(&memalloc_socks_key); 306 307 /* 308 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward 309 * progress of swapping. SOCK_MEMALLOC may be cleared while 310 * it has rmem allocations due to the last swapfile being deactivated 311 * but there is a risk that the socket is unusable due to exceeding 312 * the rmem limits. Reclaim the reserves and obey rmem limits again. 313 */ 314 sk_mem_reclaim(sk); 315 } 316 EXPORT_SYMBOL_GPL(sk_clear_memalloc); 317 318 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) 319 { 320 int ret; 321 unsigned int noreclaim_flag; 322 323 /* these should have been dropped before queueing */ 324 BUG_ON(!sock_flag(sk, SOCK_MEMALLOC)); 325 326 noreclaim_flag = memalloc_noreclaim_save(); 327 ret = sk->sk_backlog_rcv(sk, skb); 328 memalloc_noreclaim_restore(noreclaim_flag); 329 330 return ret; 331 } 332 EXPORT_SYMBOL(__sk_backlog_rcv); 333 334 void sk_error_report(struct sock *sk) 335 { 336 sk->sk_error_report(sk); 337 338 switch (sk->sk_family) { 339 case AF_INET: 340 fallthrough; 341 case AF_INET6: 342 trace_inet_sk_error_report(sk); 343 break; 344 default: 345 break; 346 } 347 } 348 EXPORT_SYMBOL(sk_error_report); 349 350 static int sock_get_timeout(long timeo, void *optval, bool old_timeval) 351 { 352 struct __kernel_sock_timeval tv; 353 354 if (timeo == MAX_SCHEDULE_TIMEOUT) { 355 tv.tv_sec = 0; 356 tv.tv_usec = 0; 357 } else { 358 tv.tv_sec = timeo / HZ; 359 tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ; 360 } 361 362 if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) { 363 struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec }; 364 *(struct old_timeval32 *)optval = tv32; 365 return sizeof(tv32); 366 } 367 368 if (old_timeval) { 369 struct __kernel_old_timeval old_tv; 370 old_tv.tv_sec = tv.tv_sec; 371 old_tv.tv_usec = tv.tv_usec; 372 *(struct __kernel_old_timeval *)optval = old_tv; 373 return sizeof(old_tv); 374 } 375 376 *(struct __kernel_sock_timeval *)optval = tv; 377 return sizeof(tv); 378 } 379 380 static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen, 381 bool old_timeval) 382 { 383 struct __kernel_sock_timeval tv; 384 385 if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) { 386 struct old_timeval32 tv32; 387 388 if 
(optlen < sizeof(tv32)) 389 return -EINVAL; 390 391 if (copy_from_sockptr(&tv32, optval, sizeof(tv32))) 392 return -EFAULT; 393 tv.tv_sec = tv32.tv_sec; 394 tv.tv_usec = tv32.tv_usec; 395 } else if (old_timeval) { 396 struct __kernel_old_timeval old_tv; 397 398 if (optlen < sizeof(old_tv)) 399 return -EINVAL; 400 if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv))) 401 return -EFAULT; 402 tv.tv_sec = old_tv.tv_sec; 403 tv.tv_usec = old_tv.tv_usec; 404 } else { 405 if (optlen < sizeof(tv)) 406 return -EINVAL; 407 if (copy_from_sockptr(&tv, optval, sizeof(tv))) 408 return -EFAULT; 409 } 410 if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC) 411 return -EDOM; 412 413 if (tv.tv_sec < 0) { 414 static int warned __read_mostly; 415 416 *timeo_p = 0; 417 if (warned < 10 && net_ratelimit()) { 418 warned++; 419 pr_info("%s: `%s' (pid %d) tries to set negative timeout\n", 420 __func__, current->comm, task_pid_nr(current)); 421 } 422 return 0; 423 } 424 *timeo_p = MAX_SCHEDULE_TIMEOUT; 425 if (tv.tv_sec == 0 && tv.tv_usec == 0) 426 return 0; 427 if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)) 428 *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ); 429 return 0; 430 } 431 432 static bool sock_needs_netstamp(const struct sock *sk) 433 { 434 switch (sk->sk_family) { 435 case AF_UNSPEC: 436 case AF_UNIX: 437 return false; 438 default: 439 return true; 440 } 441 } 442 443 static void sock_disable_timestamp(struct sock *sk, unsigned long flags) 444 { 445 if (sk->sk_flags & flags) { 446 sk->sk_flags &= ~flags; 447 if (sock_needs_netstamp(sk) && 448 !(sk->sk_flags & SK_FLAGS_TIMESTAMP)) 449 net_disable_timestamp(); 450 } 451 } 452 453 454 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) 455 { 456 unsigned long flags; 457 struct sk_buff_head *list = &sk->sk_receive_queue; 458 459 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) { 460 atomic_inc(&sk->sk_drops); 461 trace_sock_rcvqueue_full(sk, skb); 462 return -ENOMEM; 463 } 464 465 if (!sk_rmem_schedule(sk, skb, skb->truesize)) { 466 atomic_inc(&sk->sk_drops); 467 return -ENOBUFS; 468 } 469 470 skb->dev = NULL; 471 skb_set_owner_r(skb, sk); 472 473 /* we escape from rcu protected region, make sure we dont leak 474 * a norefcounted dst 475 */ 476 skb_dst_force(skb); 477 478 spin_lock_irqsave(&list->lock, flags); 479 sock_skb_set_dropcount(sk, skb); 480 __skb_queue_tail(list, skb); 481 spin_unlock_irqrestore(&list->lock, flags); 482 483 if (!sock_flag(sk, SOCK_DEAD)) 484 sk->sk_data_ready(sk); 485 return 0; 486 } 487 EXPORT_SYMBOL(__sock_queue_rcv_skb); 488 489 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) 490 { 491 int err; 492 493 err = sk_filter(sk, skb); 494 if (err) 495 return err; 496 497 return __sock_queue_rcv_skb(sk, skb); 498 } 499 EXPORT_SYMBOL(sock_queue_rcv_skb); 500 501 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, 502 const int nested, unsigned int trim_cap, bool refcounted) 503 { 504 int rc = NET_RX_SUCCESS; 505 506 if (sk_filter_trim_cap(sk, skb, trim_cap)) 507 goto discard_and_relse; 508 509 skb->dev = NULL; 510 511 if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) { 512 atomic_inc(&sk->sk_drops); 513 goto discard_and_relse; 514 } 515 if (nested) 516 bh_lock_sock_nested(sk); 517 else 518 bh_lock_sock(sk); 519 if (!sock_owned_by_user(sk)) { 520 /* 521 * trylock + unlock semantics: 522 */ 523 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_); 524 525 rc = sk_backlog_rcv(sk, skb); 526 527 mutex_release(&sk->sk_lock.dep_map, _RET_IP_); 528 } else if (sk_add_backlog(sk, 
skb, READ_ONCE(sk->sk_rcvbuf))) { 529 bh_unlock_sock(sk); 530 atomic_inc(&sk->sk_drops); 531 goto discard_and_relse; 532 } 533 534 bh_unlock_sock(sk); 535 out: 536 if (refcounted) 537 sock_put(sk); 538 return rc; 539 discard_and_relse: 540 kfree_skb(skb); 541 goto out; 542 } 543 EXPORT_SYMBOL(__sk_receive_skb); 544 545 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *, 546 u32)); 547 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, 548 u32)); 549 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie) 550 { 551 struct dst_entry *dst = __sk_dst_get(sk); 552 553 if (dst && dst->obsolete && 554 INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check, 555 dst, cookie) == NULL) { 556 sk_tx_queue_clear(sk); 557 sk->sk_dst_pending_confirm = 0; 558 RCU_INIT_POINTER(sk->sk_dst_cache, NULL); 559 dst_release(dst); 560 return NULL; 561 } 562 563 return dst; 564 } 565 EXPORT_SYMBOL(__sk_dst_check); 566 567 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie) 568 { 569 struct dst_entry *dst = sk_dst_get(sk); 570 571 if (dst && dst->obsolete && 572 INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check, 573 dst, cookie) == NULL) { 574 sk_dst_reset(sk); 575 dst_release(dst); 576 return NULL; 577 } 578 579 return dst; 580 } 581 EXPORT_SYMBOL(sk_dst_check); 582 583 static int sock_bindtoindex_locked(struct sock *sk, int ifindex) 584 { 585 int ret = -ENOPROTOOPT; 586 #ifdef CONFIG_NETDEVICES 587 struct net *net = sock_net(sk); 588 589 /* Sorry... */ 590 ret = -EPERM; 591 if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW)) 592 goto out; 593 594 ret = -EINVAL; 595 if (ifindex < 0) 596 goto out; 597 598 sk->sk_bound_dev_if = ifindex; 599 if (sk->sk_prot->rehash) 600 sk->sk_prot->rehash(sk); 601 sk_dst_reset(sk); 602 603 ret = 0; 604 605 out: 606 #endif 607 608 return ret; 609 } 610 611 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk) 612 { 613 int ret; 614 615 if (lock_sk) 616 lock_sock(sk); 617 ret = sock_bindtoindex_locked(sk, ifindex); 618 if (lock_sk) 619 release_sock(sk); 620 621 return ret; 622 } 623 EXPORT_SYMBOL(sock_bindtoindex); 624 625 static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen) 626 { 627 int ret = -ENOPROTOOPT; 628 #ifdef CONFIG_NETDEVICES 629 struct net *net = sock_net(sk); 630 char devname[IFNAMSIZ]; 631 int index; 632 633 ret = -EINVAL; 634 if (optlen < 0) 635 goto out; 636 637 /* Bind this socket to a particular device like "eth0", 638 * as specified in the passed interface name. If the 639 * name is "" or the option length is zero the socket 640 * is not bound. 
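 *
 * In-kernel callers that already know the interface index can use
 * sock_bindtoindex() above directly; a minimal, illustrative sketch
 * (assuming a valid net_device *dev):
 *
 *	err = sock_bindtoindex(sk, dev->ifindex, true);
 *
 * Callers already holding the socket lock either pass lock_sk == false
 * or call sock_bindtoindex_locked() directly, as the SO_BINDTOIFINDEX
 * handler below does.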
641 */ 642 if (optlen > IFNAMSIZ - 1) 643 optlen = IFNAMSIZ - 1; 644 memset(devname, 0, sizeof(devname)); 645 646 ret = -EFAULT; 647 if (copy_from_sockptr(devname, optval, optlen)) 648 goto out; 649 650 index = 0; 651 if (devname[0] != '\0') { 652 struct net_device *dev; 653 654 rcu_read_lock(); 655 dev = dev_get_by_name_rcu(net, devname); 656 if (dev) 657 index = dev->ifindex; 658 rcu_read_unlock(); 659 ret = -ENODEV; 660 if (!dev) 661 goto out; 662 } 663 664 return sock_bindtoindex(sk, index, true); 665 out: 666 #endif 667 668 return ret; 669 } 670 671 static int sock_getbindtodevice(struct sock *sk, char __user *optval, 672 int __user *optlen, int len) 673 { 674 int ret = -ENOPROTOOPT; 675 #ifdef CONFIG_NETDEVICES 676 struct net *net = sock_net(sk); 677 char devname[IFNAMSIZ]; 678 679 if (sk->sk_bound_dev_if == 0) { 680 len = 0; 681 goto zero; 682 } 683 684 ret = -EINVAL; 685 if (len < IFNAMSIZ) 686 goto out; 687 688 ret = netdev_get_name(net, devname, sk->sk_bound_dev_if); 689 if (ret) 690 goto out; 691 692 len = strlen(devname) + 1; 693 694 ret = -EFAULT; 695 if (copy_to_user(optval, devname, len)) 696 goto out; 697 698 zero: 699 ret = -EFAULT; 700 if (put_user(len, optlen)) 701 goto out; 702 703 ret = 0; 704 705 out: 706 #endif 707 708 return ret; 709 } 710 711 bool sk_mc_loop(struct sock *sk) 712 { 713 if (dev_recursion_level()) 714 return false; 715 if (!sk) 716 return true; 717 switch (sk->sk_family) { 718 case AF_INET: 719 return inet_sk(sk)->mc_loop; 720 #if IS_ENABLED(CONFIG_IPV6) 721 case AF_INET6: 722 return inet6_sk(sk)->mc_loop; 723 #endif 724 } 725 WARN_ON_ONCE(1); 726 return true; 727 } 728 EXPORT_SYMBOL(sk_mc_loop); 729 730 void sock_set_reuseaddr(struct sock *sk) 731 { 732 lock_sock(sk); 733 sk->sk_reuse = SK_CAN_REUSE; 734 release_sock(sk); 735 } 736 EXPORT_SYMBOL(sock_set_reuseaddr); 737 738 void sock_set_reuseport(struct sock *sk) 739 { 740 lock_sock(sk); 741 sk->sk_reuseport = true; 742 release_sock(sk); 743 } 744 EXPORT_SYMBOL(sock_set_reuseport); 745 746 void sock_no_linger(struct sock *sk) 747 { 748 lock_sock(sk); 749 sk->sk_lingertime = 0; 750 sock_set_flag(sk, SOCK_LINGER); 751 release_sock(sk); 752 } 753 EXPORT_SYMBOL(sock_no_linger); 754 755 void sock_set_priority(struct sock *sk, u32 priority) 756 { 757 lock_sock(sk); 758 sk->sk_priority = priority; 759 release_sock(sk); 760 } 761 EXPORT_SYMBOL(sock_set_priority); 762 763 void sock_set_sndtimeo(struct sock *sk, s64 secs) 764 { 765 lock_sock(sk); 766 if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1) 767 sk->sk_sndtimeo = secs * HZ; 768 else 769 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; 770 release_sock(sk); 771 } 772 EXPORT_SYMBOL(sock_set_sndtimeo); 773 774 static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns) 775 { 776 if (val) { 777 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new); 778 sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns); 779 sock_set_flag(sk, SOCK_RCVTSTAMP); 780 sock_enable_timestamp(sk, SOCK_TIMESTAMP); 781 } else { 782 sock_reset_flag(sk, SOCK_RCVTSTAMP); 783 sock_reset_flag(sk, SOCK_RCVTSTAMPNS); 784 } 785 } 786 787 void sock_enable_timestamps(struct sock *sk) 788 { 789 lock_sock(sk); 790 __sock_set_timestamps(sk, true, false, true); 791 release_sock(sk); 792 } 793 EXPORT_SYMBOL(sock_enable_timestamps); 794 795 void sock_set_timestamp(struct sock *sk, int optname, bool valbool) 796 { 797 switch (optname) { 798 case SO_TIMESTAMP_OLD: 799 __sock_set_timestamps(sk, valbool, false, false); 800 break; 801 case SO_TIMESTAMP_NEW: 802 __sock_set_timestamps(sk, valbool, true, false); 
803 break; 804 case SO_TIMESTAMPNS_OLD: 805 __sock_set_timestamps(sk, valbool, false, true); 806 break; 807 case SO_TIMESTAMPNS_NEW: 808 __sock_set_timestamps(sk, valbool, true, true); 809 break; 810 } 811 } 812 813 int sock_set_timestamping(struct sock *sk, int optname, int val) 814 { 815 if (val & ~SOF_TIMESTAMPING_MASK) 816 return -EINVAL; 817 818 if (val & SOF_TIMESTAMPING_OPT_ID && 819 !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) { 820 if (sk->sk_protocol == IPPROTO_TCP && 821 sk->sk_type == SOCK_STREAM) { 822 if ((1 << sk->sk_state) & 823 (TCPF_CLOSE | TCPF_LISTEN)) 824 return -EINVAL; 825 sk->sk_tskey = tcp_sk(sk)->snd_una; 826 } else { 827 sk->sk_tskey = 0; 828 } 829 } 830 831 if (val & SOF_TIMESTAMPING_OPT_STATS && 832 !(val & SOF_TIMESTAMPING_OPT_TSONLY)) 833 return -EINVAL; 834 835 sk->sk_tsflags = val; 836 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW); 837 838 if (val & SOF_TIMESTAMPING_RX_SOFTWARE) 839 sock_enable_timestamp(sk, 840 SOCK_TIMESTAMPING_RX_SOFTWARE); 841 else 842 sock_disable_timestamp(sk, 843 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)); 844 return 0; 845 } 846 847 void sock_set_keepalive(struct sock *sk) 848 { 849 lock_sock(sk); 850 if (sk->sk_prot->keepalive) 851 sk->sk_prot->keepalive(sk, true); 852 sock_valbool_flag(sk, SOCK_KEEPOPEN, true); 853 release_sock(sk); 854 } 855 EXPORT_SYMBOL(sock_set_keepalive); 856 857 static void __sock_set_rcvbuf(struct sock *sk, int val) 858 { 859 /* Ensure val * 2 fits into an int, to prevent max_t() from treating it 860 * as a negative value. 861 */ 862 val = min_t(int, val, INT_MAX / 2); 863 sk->sk_userlocks |= SOCK_RCVBUF_LOCK; 864 865 /* We double it on the way in to account for "struct sk_buff" etc. 866 * overhead. Applications assume that the SO_RCVBUF setting they make 867 * will allow that much actual data to be received on that socket. 868 * 869 * Applications are unaware that "struct sk_buff" and other overheads 870 * allocate from the receive buffer during socket buffer allocation. 871 * 872 * And after considering the possible alternatives, returning the value 873 * we actually used in getsockopt is the most desirable behavior. 874 */ 875 WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF)); 876 } 877 878 void sock_set_rcvbuf(struct sock *sk, int val) 879 { 880 lock_sock(sk); 881 __sock_set_rcvbuf(sk, val); 882 release_sock(sk); 883 } 884 EXPORT_SYMBOL(sock_set_rcvbuf); 885 886 static void __sock_set_mark(struct sock *sk, u32 val) 887 { 888 if (val != sk->sk_mark) { 889 sk->sk_mark = val; 890 sk_dst_reset(sk); 891 } 892 } 893 894 void sock_set_mark(struct sock *sk, u32 val) 895 { 896 lock_sock(sk); 897 __sock_set_mark(sk, val); 898 release_sock(sk); 899 } 900 EXPORT_SYMBOL(sock_set_mark); 901 902 /* 903 * This is meant for all protocols to use and covers goings on 904 * at the socket level. Everything here is generic. 905 */ 906 907 int sock_setsockopt(struct socket *sock, int level, int optname, 908 sockptr_t optval, unsigned int optlen) 909 { 910 struct sock_txtime sk_txtime; 911 struct sock *sk = sock->sk; 912 int val; 913 int valbool; 914 struct linger ling; 915 int ret = 0; 916 917 /* 918 * Options without arguments 919 */ 920 921 if (optname == SO_BINDTODEVICE) 922 return sock_setbindtodevice(sk, optval, optlen); 923 924 if (optlen < sizeof(int)) 925 return -EINVAL; 926 927 if (copy_from_sockptr(&val, optval, sizeof(val))) 928 return -EFAULT; 929 930 valbool = val ? 
1 : 0; 931 932 lock_sock(sk); 933 934 switch (optname) { 935 case SO_DEBUG: 936 if (val && !capable(CAP_NET_ADMIN)) 937 ret = -EACCES; 938 else 939 sock_valbool_flag(sk, SOCK_DBG, valbool); 940 break; 941 case SO_REUSEADDR: 942 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE); 943 break; 944 case SO_REUSEPORT: 945 sk->sk_reuseport = valbool; 946 break; 947 case SO_TYPE: 948 case SO_PROTOCOL: 949 case SO_DOMAIN: 950 case SO_ERROR: 951 ret = -ENOPROTOOPT; 952 break; 953 case SO_DONTROUTE: 954 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool); 955 sk_dst_reset(sk); 956 break; 957 case SO_BROADCAST: 958 sock_valbool_flag(sk, SOCK_BROADCAST, valbool); 959 break; 960 case SO_SNDBUF: 961 /* Don't error on this BSD doesn't and if you think 962 * about it this is right. Otherwise apps have to 963 * play 'guess the biggest size' games. RCVBUF/SNDBUF 964 * are treated in BSD as hints 965 */ 966 val = min_t(u32, val, sysctl_wmem_max); 967 set_sndbuf: 968 /* Ensure val * 2 fits into an int, to prevent max_t() 969 * from treating it as a negative value. 970 */ 971 val = min_t(int, val, INT_MAX / 2); 972 sk->sk_userlocks |= SOCK_SNDBUF_LOCK; 973 WRITE_ONCE(sk->sk_sndbuf, 974 max_t(int, val * 2, SOCK_MIN_SNDBUF)); 975 /* Wake up sending tasks if we upped the value. */ 976 sk->sk_write_space(sk); 977 break; 978 979 case SO_SNDBUFFORCE: 980 if (!capable(CAP_NET_ADMIN)) { 981 ret = -EPERM; 982 break; 983 } 984 985 /* No negative values (to prevent underflow, as val will be 986 * multiplied by 2). 987 */ 988 if (val < 0) 989 val = 0; 990 goto set_sndbuf; 991 992 case SO_RCVBUF: 993 /* Don't error on this BSD doesn't and if you think 994 * about it this is right. Otherwise apps have to 995 * play 'guess the biggest size' games. RCVBUF/SNDBUF 996 * are treated in BSD as hints 997 */ 998 __sock_set_rcvbuf(sk, min_t(u32, val, sysctl_rmem_max)); 999 break; 1000 1001 case SO_RCVBUFFORCE: 1002 if (!capable(CAP_NET_ADMIN)) { 1003 ret = -EPERM; 1004 break; 1005 } 1006 1007 /* No negative values (to prevent underflow, as val will be 1008 * multiplied by 2). 
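 *
 * Unlike SO_RCVBUF above, this privileged path is not clamped by
 * sysctl_rmem_max, which is why it requires CAP_NET_ADMIN.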
	 */
		__sock_set_rcvbuf(sk, max(val, 0));
		break;

	case SO_KEEPALIVE:
		if (sk->sk_prot->keepalive)
			sk->sk_prot->keepalive(sk, valbool);
		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
		break;

	case SO_OOBINLINE:
		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
		break;

	case SO_NO_CHECK:
		sk->sk_no_check_tx = valbool;
		break;

	case SO_PRIORITY:
		if ((val >= 0 && val <= 6) ||
		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
			sk->sk_priority = val;
		else
			ret = -EPERM;
		break;

	case SO_LINGER:
		if (optlen < sizeof(ling)) {
			ret = -EINVAL;	/* 1003.1g */
			break;
		}
		if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
			ret = -EFAULT;
			break;
		}
		if (!ling.l_onoff)
			sock_reset_flag(sk, SOCK_LINGER);
		else {
#if (BITS_PER_LONG == 32)
			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
			else
#endif
				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
			sock_set_flag(sk, SOCK_LINGER);
		}
		break;

	case SO_BSDCOMPAT:
		break;

	case SO_PASSCRED:
		if (valbool)
			set_bit(SOCK_PASSCRED, &sock->flags);
		else
			clear_bit(SOCK_PASSCRED, &sock->flags);
		break;

	case SO_TIMESTAMP_OLD:
	case SO_TIMESTAMP_NEW:
	case SO_TIMESTAMPNS_OLD:
	case SO_TIMESTAMPNS_NEW:
		sock_set_timestamp(sk, optname, valbool);
		break;

	case SO_TIMESTAMPING_NEW:
	case SO_TIMESTAMPING_OLD:
		ret = sock_set_timestamping(sk, optname, val);
		break;

	case SO_RCVLOWAT:
		if (val < 0)
			val = INT_MAX;
		if (sock->ops->set_rcvlowat)
			ret = sock->ops->set_rcvlowat(sk, val);
		else
			WRITE_ONCE(sk->sk_rcvlowat, val ?
: 1); 1086 break; 1087 1088 case SO_RCVTIMEO_OLD: 1089 case SO_RCVTIMEO_NEW: 1090 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, 1091 optlen, optname == SO_RCVTIMEO_OLD); 1092 break; 1093 1094 case SO_SNDTIMEO_OLD: 1095 case SO_SNDTIMEO_NEW: 1096 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, 1097 optlen, optname == SO_SNDTIMEO_OLD); 1098 break; 1099 1100 case SO_ATTACH_FILTER: { 1101 struct sock_fprog fprog; 1102 1103 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen); 1104 if (!ret) 1105 ret = sk_attach_filter(&fprog, sk); 1106 break; 1107 } 1108 case SO_ATTACH_BPF: 1109 ret = -EINVAL; 1110 if (optlen == sizeof(u32)) { 1111 u32 ufd; 1112 1113 ret = -EFAULT; 1114 if (copy_from_sockptr(&ufd, optval, sizeof(ufd))) 1115 break; 1116 1117 ret = sk_attach_bpf(ufd, sk); 1118 } 1119 break; 1120 1121 case SO_ATTACH_REUSEPORT_CBPF: { 1122 struct sock_fprog fprog; 1123 1124 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen); 1125 if (!ret) 1126 ret = sk_reuseport_attach_filter(&fprog, sk); 1127 break; 1128 } 1129 case SO_ATTACH_REUSEPORT_EBPF: 1130 ret = -EINVAL; 1131 if (optlen == sizeof(u32)) { 1132 u32 ufd; 1133 1134 ret = -EFAULT; 1135 if (copy_from_sockptr(&ufd, optval, sizeof(ufd))) 1136 break; 1137 1138 ret = sk_reuseport_attach_bpf(ufd, sk); 1139 } 1140 break; 1141 1142 case SO_DETACH_REUSEPORT_BPF: 1143 ret = reuseport_detach_prog(sk); 1144 break; 1145 1146 case SO_DETACH_FILTER: 1147 ret = sk_detach_filter(sk); 1148 break; 1149 1150 case SO_LOCK_FILTER: 1151 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool) 1152 ret = -EPERM; 1153 else 1154 sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool); 1155 break; 1156 1157 case SO_PASSSEC: 1158 if (valbool) 1159 set_bit(SOCK_PASSSEC, &sock->flags); 1160 else 1161 clear_bit(SOCK_PASSSEC, &sock->flags); 1162 break; 1163 case SO_MARK: 1164 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { 1165 ret = -EPERM; 1166 break; 1167 } 1168 1169 __sock_set_mark(sk, val); 1170 break; 1171 1172 case SO_RXQ_OVFL: 1173 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool); 1174 break; 1175 1176 case SO_WIFI_STATUS: 1177 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool); 1178 break; 1179 1180 case SO_PEEK_OFF: 1181 if (sock->ops->set_peek_off) 1182 ret = sock->ops->set_peek_off(sk, val); 1183 else 1184 ret = -EOPNOTSUPP; 1185 break; 1186 1187 case SO_NOFCS: 1188 sock_valbool_flag(sk, SOCK_NOFCS, valbool); 1189 break; 1190 1191 case SO_SELECT_ERR_QUEUE: 1192 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool); 1193 break; 1194 1195 #ifdef CONFIG_NET_RX_BUSY_POLL 1196 case SO_BUSY_POLL: 1197 /* allow unprivileged users to decrease the value */ 1198 if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN)) 1199 ret = -EPERM; 1200 else { 1201 if (val < 0) 1202 ret = -EINVAL; 1203 else 1204 sk->sk_ll_usec = val; 1205 } 1206 break; 1207 case SO_PREFER_BUSY_POLL: 1208 if (valbool && !capable(CAP_NET_ADMIN)) 1209 ret = -EPERM; 1210 else 1211 WRITE_ONCE(sk->sk_prefer_busy_poll, valbool); 1212 break; 1213 case SO_BUSY_POLL_BUDGET: 1214 if (val > READ_ONCE(sk->sk_busy_poll_budget) && !capable(CAP_NET_ADMIN)) { 1215 ret = -EPERM; 1216 } else { 1217 if (val < 0 || val > U16_MAX) 1218 ret = -EINVAL; 1219 else 1220 WRITE_ONCE(sk->sk_busy_poll_budget, val); 1221 } 1222 break; 1223 #endif 1224 1225 case SO_MAX_PACING_RATE: 1226 { 1227 unsigned long ulval = (val == ~0U) ? 
~0UL : (unsigned int)val; 1228 1229 if (sizeof(ulval) != sizeof(val) && 1230 optlen >= sizeof(ulval) && 1231 copy_from_sockptr(&ulval, optval, sizeof(ulval))) { 1232 ret = -EFAULT; 1233 break; 1234 } 1235 if (ulval != ~0UL) 1236 cmpxchg(&sk->sk_pacing_status, 1237 SK_PACING_NONE, 1238 SK_PACING_NEEDED); 1239 sk->sk_max_pacing_rate = ulval; 1240 sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval); 1241 break; 1242 } 1243 case SO_INCOMING_CPU: 1244 WRITE_ONCE(sk->sk_incoming_cpu, val); 1245 break; 1246 1247 case SO_CNX_ADVICE: 1248 if (val == 1) 1249 dst_negative_advice(sk); 1250 break; 1251 1252 case SO_ZEROCOPY: 1253 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) { 1254 if (!((sk->sk_type == SOCK_STREAM && 1255 sk->sk_protocol == IPPROTO_TCP) || 1256 (sk->sk_type == SOCK_DGRAM && 1257 sk->sk_protocol == IPPROTO_UDP))) 1258 ret = -ENOTSUPP; 1259 } else if (sk->sk_family != PF_RDS) { 1260 ret = -ENOTSUPP; 1261 } 1262 if (!ret) { 1263 if (val < 0 || val > 1) 1264 ret = -EINVAL; 1265 else 1266 sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool); 1267 } 1268 break; 1269 1270 case SO_TXTIME: 1271 if (optlen != sizeof(struct sock_txtime)) { 1272 ret = -EINVAL; 1273 break; 1274 } else if (copy_from_sockptr(&sk_txtime, optval, 1275 sizeof(struct sock_txtime))) { 1276 ret = -EFAULT; 1277 break; 1278 } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) { 1279 ret = -EINVAL; 1280 break; 1281 } 1282 /* CLOCK_MONOTONIC is only used by sch_fq, and this packet 1283 * scheduler has enough safe guards. 1284 */ 1285 if (sk_txtime.clockid != CLOCK_MONOTONIC && 1286 !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { 1287 ret = -EPERM; 1288 break; 1289 } 1290 sock_valbool_flag(sk, SOCK_TXTIME, true); 1291 sk->sk_clockid = sk_txtime.clockid; 1292 sk->sk_txtime_deadline_mode = 1293 !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE); 1294 sk->sk_txtime_report_errors = 1295 !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS); 1296 break; 1297 1298 case SO_BINDTOIFINDEX: 1299 ret = sock_bindtoindex_locked(sk, val); 1300 break; 1301 1302 default: 1303 ret = -ENOPROTOOPT; 1304 break; 1305 } 1306 release_sock(sk); 1307 return ret; 1308 } 1309 EXPORT_SYMBOL(sock_setsockopt); 1310 1311 1312 static void cred_to_ucred(struct pid *pid, const struct cred *cred, 1313 struct ucred *ucred) 1314 { 1315 ucred->pid = pid_vnr(pid); 1316 ucred->uid = ucred->gid = -1; 1317 if (cred) { 1318 struct user_namespace *current_ns = current_user_ns(); 1319 1320 ucred->uid = from_kuid_munged(current_ns, cred->euid); 1321 ucred->gid = from_kgid_munged(current_ns, cred->egid); 1322 } 1323 } 1324 1325 static int groups_to_user(gid_t __user *dst, const struct group_info *src) 1326 { 1327 struct user_namespace *user_ns = current_user_ns(); 1328 int i; 1329 1330 for (i = 0; i < src->ngroups; i++) 1331 if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i)) 1332 return -EFAULT; 1333 1334 return 0; 1335 } 1336 1337 int sock_getsockopt(struct socket *sock, int level, int optname, 1338 char __user *optval, int __user *optlen) 1339 { 1340 struct sock *sk = sock->sk; 1341 1342 union { 1343 int val; 1344 u64 val64; 1345 unsigned long ulval; 1346 struct linger ling; 1347 struct old_timeval32 tm32; 1348 struct __kernel_old_timeval tm; 1349 struct __kernel_sock_timeval stm; 1350 struct sock_txtime txtime; 1351 } v; 1352 1353 int lv = sizeof(int); 1354 int len; 1355 1356 if (get_user(len, optlen)) 1357 return -EFAULT; 1358 if (len < 0) 1359 return -EINVAL; 1360 1361 memset(&v, 0, sizeof(v)); 1362 1363 switch (optname) { 1364 case SO_DEBUG: 1365 v.val = 
sock_flag(sk, SOCK_DBG); 1366 break; 1367 1368 case SO_DONTROUTE: 1369 v.val = sock_flag(sk, SOCK_LOCALROUTE); 1370 break; 1371 1372 case SO_BROADCAST: 1373 v.val = sock_flag(sk, SOCK_BROADCAST); 1374 break; 1375 1376 case SO_SNDBUF: 1377 v.val = sk->sk_sndbuf; 1378 break; 1379 1380 case SO_RCVBUF: 1381 v.val = sk->sk_rcvbuf; 1382 break; 1383 1384 case SO_REUSEADDR: 1385 v.val = sk->sk_reuse; 1386 break; 1387 1388 case SO_REUSEPORT: 1389 v.val = sk->sk_reuseport; 1390 break; 1391 1392 case SO_KEEPALIVE: 1393 v.val = sock_flag(sk, SOCK_KEEPOPEN); 1394 break; 1395 1396 case SO_TYPE: 1397 v.val = sk->sk_type; 1398 break; 1399 1400 case SO_PROTOCOL: 1401 v.val = sk->sk_protocol; 1402 break; 1403 1404 case SO_DOMAIN: 1405 v.val = sk->sk_family; 1406 break; 1407 1408 case SO_ERROR: 1409 v.val = -sock_error(sk); 1410 if (v.val == 0) 1411 v.val = xchg(&sk->sk_err_soft, 0); 1412 break; 1413 1414 case SO_OOBINLINE: 1415 v.val = sock_flag(sk, SOCK_URGINLINE); 1416 break; 1417 1418 case SO_NO_CHECK: 1419 v.val = sk->sk_no_check_tx; 1420 break; 1421 1422 case SO_PRIORITY: 1423 v.val = sk->sk_priority; 1424 break; 1425 1426 case SO_LINGER: 1427 lv = sizeof(v.ling); 1428 v.ling.l_onoff = sock_flag(sk, SOCK_LINGER); 1429 v.ling.l_linger = sk->sk_lingertime / HZ; 1430 break; 1431 1432 case SO_BSDCOMPAT: 1433 break; 1434 1435 case SO_TIMESTAMP_OLD: 1436 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && 1437 !sock_flag(sk, SOCK_TSTAMP_NEW) && 1438 !sock_flag(sk, SOCK_RCVTSTAMPNS); 1439 break; 1440 1441 case SO_TIMESTAMPNS_OLD: 1442 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW); 1443 break; 1444 1445 case SO_TIMESTAMP_NEW: 1446 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW); 1447 break; 1448 1449 case SO_TIMESTAMPNS_NEW: 1450 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW); 1451 break; 1452 1453 case SO_TIMESTAMPING_OLD: 1454 v.val = sk->sk_tsflags; 1455 break; 1456 1457 case SO_RCVTIMEO_OLD: 1458 case SO_RCVTIMEO_NEW: 1459 lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname); 1460 break; 1461 1462 case SO_SNDTIMEO_OLD: 1463 case SO_SNDTIMEO_NEW: 1464 lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname); 1465 break; 1466 1467 case SO_RCVLOWAT: 1468 v.val = sk->sk_rcvlowat; 1469 break; 1470 1471 case SO_SNDLOWAT: 1472 v.val = 1; 1473 break; 1474 1475 case SO_PASSCRED: 1476 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags); 1477 break; 1478 1479 case SO_PEERCRED: 1480 { 1481 struct ucred peercred; 1482 if (len > sizeof(peercred)) 1483 len = sizeof(peercred); 1484 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred); 1485 if (copy_to_user(optval, &peercred, len)) 1486 return -EFAULT; 1487 goto lenout; 1488 } 1489 1490 case SO_PEERGROUPS: 1491 { 1492 int ret, n; 1493 1494 if (!sk->sk_peer_cred) 1495 return -ENODATA; 1496 1497 n = sk->sk_peer_cred->group_info->ngroups; 1498 if (len < n * sizeof(gid_t)) { 1499 len = n * sizeof(gid_t); 1500 return put_user(len, optlen) ? -EFAULT : -ERANGE; 1501 } 1502 len = n * sizeof(gid_t); 1503 1504 ret = groups_to_user((gid_t __user *)optval, 1505 sk->sk_peer_cred->group_info); 1506 if (ret) 1507 return ret; 1508 goto lenout; 1509 } 1510 1511 case SO_PEERNAME: 1512 { 1513 char address[128]; 1514 1515 lv = sock->ops->getname(sock, (struct sockaddr *)address, 2); 1516 if (lv < 0) 1517 return -ENOTCONN; 1518 if (lv < len) 1519 return -EINVAL; 1520 if (copy_to_user(optval, address, len)) 1521 return -EFAULT; 1522 goto lenout; 1523 } 1524 1525 /* Dubious BSD thing... 
Probably nobody even uses it, but 1526 * the UNIX standard wants it for whatever reason... -DaveM 1527 */ 1528 case SO_ACCEPTCONN: 1529 v.val = sk->sk_state == TCP_LISTEN; 1530 break; 1531 1532 case SO_PASSSEC: 1533 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags); 1534 break; 1535 1536 case SO_PEERSEC: 1537 return security_socket_getpeersec_stream(sock, optval, optlen, len); 1538 1539 case SO_MARK: 1540 v.val = sk->sk_mark; 1541 break; 1542 1543 case SO_RXQ_OVFL: 1544 v.val = sock_flag(sk, SOCK_RXQ_OVFL); 1545 break; 1546 1547 case SO_WIFI_STATUS: 1548 v.val = sock_flag(sk, SOCK_WIFI_STATUS); 1549 break; 1550 1551 case SO_PEEK_OFF: 1552 if (!sock->ops->set_peek_off) 1553 return -EOPNOTSUPP; 1554 1555 v.val = sk->sk_peek_off; 1556 break; 1557 case SO_NOFCS: 1558 v.val = sock_flag(sk, SOCK_NOFCS); 1559 break; 1560 1561 case SO_BINDTODEVICE: 1562 return sock_getbindtodevice(sk, optval, optlen, len); 1563 1564 case SO_GET_FILTER: 1565 len = sk_get_filter(sk, (struct sock_filter __user *)optval, len); 1566 if (len < 0) 1567 return len; 1568 1569 goto lenout; 1570 1571 case SO_LOCK_FILTER: 1572 v.val = sock_flag(sk, SOCK_FILTER_LOCKED); 1573 break; 1574 1575 case SO_BPF_EXTENSIONS: 1576 v.val = bpf_tell_extensions(); 1577 break; 1578 1579 case SO_SELECT_ERR_QUEUE: 1580 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE); 1581 break; 1582 1583 #ifdef CONFIG_NET_RX_BUSY_POLL 1584 case SO_BUSY_POLL: 1585 v.val = sk->sk_ll_usec; 1586 break; 1587 case SO_PREFER_BUSY_POLL: 1588 v.val = READ_ONCE(sk->sk_prefer_busy_poll); 1589 break; 1590 #endif 1591 1592 case SO_MAX_PACING_RATE: 1593 if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) { 1594 lv = sizeof(v.ulval); 1595 v.ulval = sk->sk_max_pacing_rate; 1596 } else { 1597 /* 32bit version */ 1598 v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U); 1599 } 1600 break; 1601 1602 case SO_INCOMING_CPU: 1603 v.val = READ_ONCE(sk->sk_incoming_cpu); 1604 break; 1605 1606 case SO_MEMINFO: 1607 { 1608 u32 meminfo[SK_MEMINFO_VARS]; 1609 1610 sk_get_meminfo(sk, meminfo); 1611 1612 len = min_t(unsigned int, len, sizeof(meminfo)); 1613 if (copy_to_user(optval, &meminfo, len)) 1614 return -EFAULT; 1615 1616 goto lenout; 1617 } 1618 1619 #ifdef CONFIG_NET_RX_BUSY_POLL 1620 case SO_INCOMING_NAPI_ID: 1621 v.val = READ_ONCE(sk->sk_napi_id); 1622 1623 /* aggregate non-NAPI IDs down to 0 */ 1624 if (v.val < MIN_NAPI_ID) 1625 v.val = 0; 1626 1627 break; 1628 #endif 1629 1630 case SO_COOKIE: 1631 lv = sizeof(u64); 1632 if (len < lv) 1633 return -EINVAL; 1634 v.val64 = sock_gen_cookie(sk); 1635 break; 1636 1637 case SO_ZEROCOPY: 1638 v.val = sock_flag(sk, SOCK_ZEROCOPY); 1639 break; 1640 1641 case SO_TXTIME: 1642 lv = sizeof(v.txtime); 1643 v.txtime.clockid = sk->sk_clockid; 1644 v.txtime.flags |= sk->sk_txtime_deadline_mode ? 1645 SOF_TXTIME_DEADLINE_MODE : 0; 1646 v.txtime.flags |= sk->sk_txtime_report_errors ? 1647 SOF_TXTIME_REPORT_ERRORS : 0; 1648 break; 1649 1650 case SO_BINDTOIFINDEX: 1651 v.val = sk->sk_bound_dev_if; 1652 break; 1653 1654 case SO_NETNS_COOKIE: 1655 lv = sizeof(u64); 1656 if (len != lv) 1657 return -EINVAL; 1658 v.val64 = sock_net(sk)->net_cookie; 1659 break; 1660 1661 default: 1662 /* We implement the SO_SNDLOWAT etc to not be settable 1663 * (1003.1g 7). 1664 */ 1665 return -ENOPROTOOPT; 1666 } 1667 1668 if (len > lv) 1669 len = lv; 1670 if (copy_to_user(optval, &v, len)) 1671 return -EFAULT; 1672 lenout: 1673 if (put_user(len, optlen)) 1674 return -EFAULT; 1675 return 0; 1676 } 1677 1678 /* 1679 * Initialize an sk_lock. 
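 * The lockdep class and name are chosen per address family from the
 * af_family_*_keys/strings tables above, with separate classes for
 * kernel and userspace sockets.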
1680 * 1681 * (We also register the sk_lock with the lock validator.) 1682 */ 1683 static inline void sock_lock_init(struct sock *sk) 1684 { 1685 if (sk->sk_kern_sock) 1686 sock_lock_init_class_and_name( 1687 sk, 1688 af_family_kern_slock_key_strings[sk->sk_family], 1689 af_family_kern_slock_keys + sk->sk_family, 1690 af_family_kern_key_strings[sk->sk_family], 1691 af_family_kern_keys + sk->sk_family); 1692 else 1693 sock_lock_init_class_and_name( 1694 sk, 1695 af_family_slock_key_strings[sk->sk_family], 1696 af_family_slock_keys + sk->sk_family, 1697 af_family_key_strings[sk->sk_family], 1698 af_family_keys + sk->sk_family); 1699 } 1700 1701 /* 1702 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet, 1703 * even temporarly, because of RCU lookups. sk_node should also be left as is. 1704 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end 1705 */ 1706 static void sock_copy(struct sock *nsk, const struct sock *osk) 1707 { 1708 const struct proto *prot = READ_ONCE(osk->sk_prot); 1709 #ifdef CONFIG_SECURITY_NETWORK 1710 void *sptr = nsk->sk_security; 1711 #endif 1712 1713 /* If we move sk_tx_queue_mapping out of the private section, 1714 * we must check if sk_tx_queue_clear() is called after 1715 * sock_copy() in sk_clone_lock(). 1716 */ 1717 BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) < 1718 offsetof(struct sock, sk_dontcopy_begin) || 1719 offsetof(struct sock, sk_tx_queue_mapping) >= 1720 offsetof(struct sock, sk_dontcopy_end)); 1721 1722 memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin)); 1723 1724 memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end, 1725 prot->obj_size - offsetof(struct sock, sk_dontcopy_end)); 1726 1727 #ifdef CONFIG_SECURITY_NETWORK 1728 nsk->sk_security = sptr; 1729 security_sk_clone(osk, nsk); 1730 #endif 1731 } 1732 1733 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, 1734 int family) 1735 { 1736 struct sock *sk; 1737 struct kmem_cache *slab; 1738 1739 slab = prot->slab; 1740 if (slab != NULL) { 1741 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO); 1742 if (!sk) 1743 return sk; 1744 if (want_init_on_alloc(priority)) 1745 sk_prot_clear_nulls(sk, prot->obj_size); 1746 } else 1747 sk = kmalloc(prot->obj_size, priority); 1748 1749 if (sk != NULL) { 1750 if (security_sk_alloc(sk, family, priority)) 1751 goto out_free; 1752 1753 if (!try_module_get(prot->owner)) 1754 goto out_free_sec; 1755 } 1756 1757 return sk; 1758 1759 out_free_sec: 1760 security_sk_free(sk); 1761 out_free: 1762 if (slab != NULL) 1763 kmem_cache_free(slab, sk); 1764 else 1765 kfree(sk); 1766 return NULL; 1767 } 1768 1769 static void sk_prot_free(struct proto *prot, struct sock *sk) 1770 { 1771 struct kmem_cache *slab; 1772 struct module *owner; 1773 1774 owner = prot->owner; 1775 slab = prot->slab; 1776 1777 cgroup_sk_free(&sk->sk_cgrp_data); 1778 mem_cgroup_sk_free(sk); 1779 security_sk_free(sk); 1780 if (slab != NULL) 1781 kmem_cache_free(slab, sk); 1782 else 1783 kfree(sk); 1784 module_put(owner); 1785 } 1786 1787 /** 1788 * sk_alloc - All socket objects are allocated here 1789 * @net: the applicable net namespace 1790 * @family: protocol family 1791 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) 1792 * @prot: struct proto associated with this new sock instance 1793 * @kern: is this to be a kernel socket? 
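 *
 * Return: the newly allocated &struct sock, or %NULL on failure.
 *
 * A minimal, illustrative sketch of the usual pattern in a protocol
 * family's ->create() handler ("my_proto" is a placeholder, not a real
 * struct proto):
 *
 *	sk = sk_alloc(net, PF_INET, GFP_KERNEL, &my_proto, kern);
 *	if (!sk)
 *		return -ENOBUFS;
 *	sock_init_data(sock, sk);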
1794 */ 1795 struct sock *sk_alloc(struct net *net, int family, gfp_t priority, 1796 struct proto *prot, int kern) 1797 { 1798 struct sock *sk; 1799 1800 sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family); 1801 if (sk) { 1802 sk->sk_family = family; 1803 /* 1804 * See comment in struct sock definition to understand 1805 * why we need sk_prot_creator -acme 1806 */ 1807 sk->sk_prot = sk->sk_prot_creator = prot; 1808 sk->sk_kern_sock = kern; 1809 sock_lock_init(sk); 1810 sk->sk_net_refcnt = kern ? 0 : 1; 1811 if (likely(sk->sk_net_refcnt)) { 1812 get_net(net); 1813 sock_inuse_add(net, 1); 1814 } 1815 1816 sock_net_set(sk, net); 1817 refcount_set(&sk->sk_wmem_alloc, 1); 1818 1819 mem_cgroup_sk_alloc(sk); 1820 cgroup_sk_alloc(&sk->sk_cgrp_data); 1821 sock_update_classid(&sk->sk_cgrp_data); 1822 sock_update_netprioidx(&sk->sk_cgrp_data); 1823 sk_tx_queue_clear(sk); 1824 } 1825 1826 return sk; 1827 } 1828 EXPORT_SYMBOL(sk_alloc); 1829 1830 /* Sockets having SOCK_RCU_FREE will call this function after one RCU 1831 * grace period. This is the case for UDP sockets and TCP listeners. 1832 */ 1833 static void __sk_destruct(struct rcu_head *head) 1834 { 1835 struct sock *sk = container_of(head, struct sock, sk_rcu); 1836 struct sk_filter *filter; 1837 1838 if (sk->sk_destruct) 1839 sk->sk_destruct(sk); 1840 1841 filter = rcu_dereference_check(sk->sk_filter, 1842 refcount_read(&sk->sk_wmem_alloc) == 0); 1843 if (filter) { 1844 sk_filter_uncharge(sk, filter); 1845 RCU_INIT_POINTER(sk->sk_filter, NULL); 1846 } 1847 1848 sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP); 1849 1850 #ifdef CONFIG_BPF_SYSCALL 1851 bpf_sk_storage_free(sk); 1852 #endif 1853 1854 if (atomic_read(&sk->sk_omem_alloc)) 1855 pr_debug("%s: optmem leakage (%d bytes) detected\n", 1856 __func__, atomic_read(&sk->sk_omem_alloc)); 1857 1858 if (sk->sk_frag.page) { 1859 put_page(sk->sk_frag.page); 1860 sk->sk_frag.page = NULL; 1861 } 1862 1863 if (sk->sk_peer_cred) 1864 put_cred(sk->sk_peer_cred); 1865 put_pid(sk->sk_peer_pid); 1866 if (likely(sk->sk_net_refcnt)) 1867 put_net(sock_net(sk)); 1868 sk_prot_free(sk->sk_prot_creator, sk); 1869 } 1870 1871 void sk_destruct(struct sock *sk) 1872 { 1873 bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE); 1874 1875 if (rcu_access_pointer(sk->sk_reuseport_cb)) { 1876 reuseport_detach_sock(sk); 1877 use_call_rcu = true; 1878 } 1879 1880 if (use_call_rcu) 1881 call_rcu(&sk->sk_rcu, __sk_destruct); 1882 else 1883 __sk_destruct(&sk->sk_rcu); 1884 } 1885 1886 static void __sk_free(struct sock *sk) 1887 { 1888 if (likely(sk->sk_net_refcnt)) 1889 sock_inuse_add(sock_net(sk), -1); 1890 1891 if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk))) 1892 sock_diag_broadcast_destroy(sk); 1893 else 1894 sk_destruct(sk); 1895 } 1896 1897 void sk_free(struct sock *sk) 1898 { 1899 /* 1900 * We subtract one from sk_wmem_alloc and can know if 1901 * some packets are still in some tx queue. 
1902 * If not null, sock_wfree() will call __sk_free(sk) later 1903 */ 1904 if (refcount_dec_and_test(&sk->sk_wmem_alloc)) 1905 __sk_free(sk); 1906 } 1907 EXPORT_SYMBOL(sk_free); 1908 1909 static void sk_init_common(struct sock *sk) 1910 { 1911 skb_queue_head_init(&sk->sk_receive_queue); 1912 skb_queue_head_init(&sk->sk_write_queue); 1913 skb_queue_head_init(&sk->sk_error_queue); 1914 1915 rwlock_init(&sk->sk_callback_lock); 1916 lockdep_set_class_and_name(&sk->sk_receive_queue.lock, 1917 af_rlock_keys + sk->sk_family, 1918 af_family_rlock_key_strings[sk->sk_family]); 1919 lockdep_set_class_and_name(&sk->sk_write_queue.lock, 1920 af_wlock_keys + sk->sk_family, 1921 af_family_wlock_key_strings[sk->sk_family]); 1922 lockdep_set_class_and_name(&sk->sk_error_queue.lock, 1923 af_elock_keys + sk->sk_family, 1924 af_family_elock_key_strings[sk->sk_family]); 1925 lockdep_set_class_and_name(&sk->sk_callback_lock, 1926 af_callback_keys + sk->sk_family, 1927 af_family_clock_key_strings[sk->sk_family]); 1928 } 1929 1930 /** 1931 * sk_clone_lock - clone a socket, and lock its clone 1932 * @sk: the socket to clone 1933 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) 1934 * 1935 * Caller must unlock socket even in error path (bh_unlock_sock(newsk)) 1936 */ 1937 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) 1938 { 1939 struct proto *prot = READ_ONCE(sk->sk_prot); 1940 struct sk_filter *filter; 1941 bool is_charged = true; 1942 struct sock *newsk; 1943 1944 newsk = sk_prot_alloc(prot, priority, sk->sk_family); 1945 if (!newsk) 1946 goto out; 1947 1948 sock_copy(newsk, sk); 1949 1950 newsk->sk_prot_creator = prot; 1951 1952 /* SANITY */ 1953 if (likely(newsk->sk_net_refcnt)) 1954 get_net(sock_net(newsk)); 1955 sk_node_init(&newsk->sk_node); 1956 sock_lock_init(newsk); 1957 bh_lock_sock(newsk); 1958 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL; 1959 newsk->sk_backlog.len = 0; 1960 1961 atomic_set(&newsk->sk_rmem_alloc, 0); 1962 1963 /* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */ 1964 refcount_set(&newsk->sk_wmem_alloc, 1); 1965 1966 atomic_set(&newsk->sk_omem_alloc, 0); 1967 sk_init_common(newsk); 1968 1969 newsk->sk_dst_cache = NULL; 1970 newsk->sk_dst_pending_confirm = 0; 1971 newsk->sk_wmem_queued = 0; 1972 newsk->sk_forward_alloc = 0; 1973 atomic_set(&newsk->sk_drops, 0); 1974 newsk->sk_send_head = NULL; 1975 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; 1976 atomic_set(&newsk->sk_zckey, 0); 1977 1978 sock_reset_flag(newsk, SOCK_DONE); 1979 1980 /* sk->sk_memcg will be populated at accept() time */ 1981 newsk->sk_memcg = NULL; 1982 1983 cgroup_sk_clone(&newsk->sk_cgrp_data); 1984 1985 rcu_read_lock(); 1986 filter = rcu_dereference(sk->sk_filter); 1987 if (filter != NULL) 1988 /* though it's an empty new sock, the charging may fail 1989 * if sysctl_optmem_max was changed between creation of 1990 * original socket and cloning 1991 */ 1992 is_charged = sk_filter_charge(newsk, filter); 1993 RCU_INIT_POINTER(newsk->sk_filter, filter); 1994 rcu_read_unlock(); 1995 1996 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) { 1997 /* We need to make sure that we don't uncharge the new 1998 * socket if we couldn't charge it in the first place 1999 * as otherwise we uncharge the parent's filter. 
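 * sk_free_unlock_clone() also clears sk_destruct, so tearing down the
 * raw copy cannot run the parent's destructor on shared state.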
2000 */ 2001 if (!is_charged) 2002 RCU_INIT_POINTER(newsk->sk_filter, NULL); 2003 sk_free_unlock_clone(newsk); 2004 newsk = NULL; 2005 goto out; 2006 } 2007 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL); 2008 2009 if (bpf_sk_storage_clone(sk, newsk)) { 2010 sk_free_unlock_clone(newsk); 2011 newsk = NULL; 2012 goto out; 2013 } 2014 2015 /* Clear sk_user_data if parent had the pointer tagged 2016 * as not suitable for copying when cloning. 2017 */ 2018 if (sk_user_data_is_nocopy(newsk)) 2019 newsk->sk_user_data = NULL; 2020 2021 newsk->sk_err = 0; 2022 newsk->sk_err_soft = 0; 2023 newsk->sk_priority = 0; 2024 newsk->sk_incoming_cpu = raw_smp_processor_id(); 2025 if (likely(newsk->sk_net_refcnt)) 2026 sock_inuse_add(sock_net(newsk), 1); 2027 2028 /* Before updating sk_refcnt, we must commit prior changes to memory 2029 * (Documentation/RCU/rculist_nulls.rst for details) 2030 */ 2031 smp_wmb(); 2032 refcount_set(&newsk->sk_refcnt, 2); 2033 2034 /* Increment the counter in the same struct proto as the master 2035 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that 2036 * is the same as sk->sk_prot->socks, as this field was copied 2037 * with memcpy). 2038 * 2039 * This _changes_ the previous behaviour, where 2040 * tcp_create_openreq_child always was incrementing the 2041 * equivalent to tcp_prot->socks (inet_sock_nr), so this have 2042 * to be taken into account in all callers. -acme 2043 */ 2044 sk_refcnt_debug_inc(newsk); 2045 sk_set_socket(newsk, NULL); 2046 sk_tx_queue_clear(newsk); 2047 RCU_INIT_POINTER(newsk->sk_wq, NULL); 2048 2049 if (newsk->sk_prot->sockets_allocated) 2050 sk_sockets_allocated_inc(newsk); 2051 2052 if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP) 2053 net_enable_timestamp(); 2054 out: 2055 return newsk; 2056 } 2057 EXPORT_SYMBOL_GPL(sk_clone_lock); 2058 2059 void sk_free_unlock_clone(struct sock *sk) 2060 { 2061 /* It is still raw copy of parent, so invalidate 2062 * destructor and make plain sk_free() */ 2063 sk->sk_destruct = NULL; 2064 bh_unlock_sock(sk); 2065 sk_free(sk); 2066 } 2067 EXPORT_SYMBOL_GPL(sk_free_unlock_clone); 2068 2069 void sk_setup_caps(struct sock *sk, struct dst_entry *dst) 2070 { 2071 u32 max_segs = 1; 2072 2073 sk_dst_set(sk, dst); 2074 sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps; 2075 if (sk->sk_route_caps & NETIF_F_GSO) 2076 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE; 2077 sk->sk_route_caps &= ~sk->sk_route_nocaps; 2078 if (sk_can_gso(sk)) { 2079 if (dst->header_len && !xfrm_dst_offload_ok(dst)) { 2080 sk->sk_route_caps &= ~NETIF_F_GSO_MASK; 2081 } else { 2082 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; 2083 sk->sk_gso_max_size = dst->dev->gso_max_size; 2084 max_segs = max_t(u32, dst->dev->gso_max_segs, 1); 2085 } 2086 } 2087 sk->sk_gso_max_segs = max_segs; 2088 } 2089 EXPORT_SYMBOL_GPL(sk_setup_caps); 2090 2091 /* 2092 * Simple resource managers for sockets. 2093 */ 2094 2095 2096 /* 2097 * Write buffer destructor automatically called from kfree_skb. 
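 * Installed by skb_set_owner_w(); it releases the skb's truesize from
 * sk_wmem_alloc and, unless the protocol manages its own write queue
 * (SOCK_USE_WRITE_QUEUE), wakes writers via sk_write_space().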
2098 */ 2099 void sock_wfree(struct sk_buff *skb) 2100 { 2101 struct sock *sk = skb->sk; 2102 unsigned int len = skb->truesize; 2103 2104 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) { 2105 /* 2106 * Keep a reference on sk_wmem_alloc, this will be released 2107 * after sk_write_space() call 2108 */ 2109 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc)); 2110 sk->sk_write_space(sk); 2111 len = 1; 2112 } 2113 /* 2114 * if sk_wmem_alloc reaches 0, we must finish what sk_free() 2115 * could not do because of in-flight packets 2116 */ 2117 if (refcount_sub_and_test(len, &sk->sk_wmem_alloc)) 2118 __sk_free(sk); 2119 } 2120 EXPORT_SYMBOL(sock_wfree); 2121 2122 /* This variant of sock_wfree() is used by TCP, 2123 * since it sets SOCK_USE_WRITE_QUEUE. 2124 */ 2125 void __sock_wfree(struct sk_buff *skb) 2126 { 2127 struct sock *sk = skb->sk; 2128 2129 if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc)) 2130 __sk_free(sk); 2131 } 2132 2133 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) 2134 { 2135 skb_orphan(skb); 2136 skb->sk = sk; 2137 #ifdef CONFIG_INET 2138 if (unlikely(!sk_fullsock(sk))) { 2139 skb->destructor = sock_edemux; 2140 sock_hold(sk); 2141 return; 2142 } 2143 #endif 2144 skb->destructor = sock_wfree; 2145 skb_set_hash_from_sk(skb, sk); 2146 /* 2147 * We used to take a refcount on sk, but following operation 2148 * is enough to guarantee sk_free() wont free this sock until 2149 * all in-flight packets are completed 2150 */ 2151 refcount_add(skb->truesize, &sk->sk_wmem_alloc); 2152 } 2153 EXPORT_SYMBOL(skb_set_owner_w); 2154 2155 static bool can_skb_orphan_partial(const struct sk_buff *skb) 2156 { 2157 #ifdef CONFIG_TLS_DEVICE 2158 /* Drivers depend on in-order delivery for crypto offload, 2159 * partial orphan breaks out-of-order-OK logic. 2160 */ 2161 if (skb->decrypted) 2162 return false; 2163 #endif 2164 return (skb->destructor == sock_wfree || 2165 (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree)); 2166 } 2167 2168 /* This helper is used by netem, as it can hold packets in its 2169 * delay queue. We want to allow the owner socket to send more 2170 * packets, as if they were already TX completed by a typical driver. 2171 * But we also want to keep skb->sk set because some packet schedulers 2172 * rely on it (sch_fq for example). 2173 */ 2174 void skb_orphan_partial(struct sk_buff *skb) 2175 { 2176 if (skb_is_tcp_pure_ack(skb)) 2177 return; 2178 2179 if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk)) 2180 return; 2181 2182 skb_orphan(skb); 2183 } 2184 EXPORT_SYMBOL(skb_orphan_partial); 2185 2186 /* 2187 * Read buffer destructor automatically called from kfree_skb. 2188 */ 2189 void sock_rfree(struct sk_buff *skb) 2190 { 2191 struct sock *sk = skb->sk; 2192 unsigned int len = skb->truesize; 2193 2194 atomic_sub(len, &sk->sk_rmem_alloc); 2195 sk_mem_uncharge(sk, len); 2196 } 2197 EXPORT_SYMBOL(sock_rfree); 2198 2199 /* 2200 * Buffer destructor for skbs that are not used directly in read or write 2201 * path, e.g. for error handler skbs. Automatically called from kfree_skb. 2202 */ 2203 void sock_efree(struct sk_buff *skb) 2204 { 2205 sock_put(skb->sk); 2206 } 2207 EXPORT_SYMBOL(sock_efree); 2208 2209 /* Buffer destructor for prefetch/receive path where reference count may 2210 * not be held, e.g. for listen sockets. 
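 * The reference, when one was actually taken (see sk_is_refcounted()), is
 * dropped with sock_gen_put(), which also copes with request and timewait
 * sockets that are not full sockets.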
2211 */ 2212 #ifdef CONFIG_INET 2213 void sock_pfree(struct sk_buff *skb) 2214 { 2215 if (sk_is_refcounted(skb->sk)) 2216 sock_gen_put(skb->sk); 2217 } 2218 EXPORT_SYMBOL(sock_pfree); 2219 #endif /* CONFIG_INET */ 2220 2221 kuid_t sock_i_uid(struct sock *sk) 2222 { 2223 kuid_t uid; 2224 2225 read_lock_bh(&sk->sk_callback_lock); 2226 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID; 2227 read_unlock_bh(&sk->sk_callback_lock); 2228 return uid; 2229 } 2230 EXPORT_SYMBOL(sock_i_uid); 2231 2232 unsigned long sock_i_ino(struct sock *sk) 2233 { 2234 unsigned long ino; 2235 2236 read_lock_bh(&sk->sk_callback_lock); 2237 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0; 2238 read_unlock_bh(&sk->sk_callback_lock); 2239 return ino; 2240 } 2241 EXPORT_SYMBOL(sock_i_ino); 2242 2243 /* 2244 * Allocate a skb from the socket's send buffer. 2245 */ 2246 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, 2247 gfp_t priority) 2248 { 2249 if (force || 2250 refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) { 2251 struct sk_buff *skb = alloc_skb(size, priority); 2252 2253 if (skb) { 2254 skb_set_owner_w(skb, sk); 2255 return skb; 2256 } 2257 } 2258 return NULL; 2259 } 2260 EXPORT_SYMBOL(sock_wmalloc); 2261 2262 static void sock_ofree(struct sk_buff *skb) 2263 { 2264 struct sock *sk = skb->sk; 2265 2266 atomic_sub(skb->truesize, &sk->sk_omem_alloc); 2267 } 2268 2269 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size, 2270 gfp_t priority) 2271 { 2272 struct sk_buff *skb; 2273 2274 /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */ 2275 if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) > 2276 sysctl_optmem_max) 2277 return NULL; 2278 2279 skb = alloc_skb(size, priority); 2280 if (!skb) 2281 return NULL; 2282 2283 atomic_add(skb->truesize, &sk->sk_omem_alloc); 2284 skb->sk = sk; 2285 skb->destructor = sock_ofree; 2286 return skb; 2287 } 2288 2289 /* 2290 * Allocate a memory block from the socket's option memory buffer. 2291 */ 2292 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority) 2293 { 2294 if ((unsigned int)size <= sysctl_optmem_max && 2295 atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) { 2296 void *mem; 2297 /* First do the add, to avoid the race if kmalloc 2298 * might sleep. 2299 */ 2300 atomic_add(size, &sk->sk_omem_alloc); 2301 mem = kmalloc(size, priority); 2302 if (mem) 2303 return mem; 2304 atomic_sub(size, &sk->sk_omem_alloc); 2305 } 2306 return NULL; 2307 } 2308 EXPORT_SYMBOL(sock_kmalloc); 2309 2310 /* Free an option memory block. Note, we actually want the inline 2311 * here as this allows gcc to detect the nullify and fold away the 2312 * condition entirely. 2313 */ 2314 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size, 2315 const bool nullify) 2316 { 2317 if (WARN_ON_ONCE(!mem)) 2318 return; 2319 if (nullify) 2320 kfree_sensitive(mem); 2321 else 2322 kfree(mem); 2323 atomic_sub(size, &sk->sk_omem_alloc); 2324 } 2325 2326 void sock_kfree_s(struct sock *sk, void *mem, int size) 2327 { 2328 __sock_kfree_s(sk, mem, size, false); 2329 } 2330 EXPORT_SYMBOL(sock_kfree_s); 2331 2332 void sock_kzfree_s(struct sock *sk, void *mem, int size) 2333 { 2334 __sock_kfree_s(sk, mem, size, true); 2335 } 2336 EXPORT_SYMBOL(sock_kzfree_s); 2337 2338 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock. 2339 I think, these locks should be removed for datagram sockets. 
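   Called from sock_alloc_send_pskb() when the send buffer is full and the
   caller is allowed to block; returns the remaining timeout (possibly zero)
   and leaves pending-signal handling to the caller.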
2340 */ 2341 static long sock_wait_for_wmem(struct sock *sk, long timeo) 2342 { 2343 DEFINE_WAIT(wait); 2344 2345 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); 2346 for (;;) { 2347 if (!timeo) 2348 break; 2349 if (signal_pending(current)) 2350 break; 2351 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 2352 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); 2353 if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) 2354 break; 2355 if (sk->sk_shutdown & SEND_SHUTDOWN) 2356 break; 2357 if (sk->sk_err) 2358 break; 2359 timeo = schedule_timeout(timeo); 2360 } 2361 finish_wait(sk_sleep(sk), &wait); 2362 return timeo; 2363 } 2364 2365 2366 /* 2367 * Generic send/receive buffer handlers 2368 */ 2369 2370 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, 2371 unsigned long data_len, int noblock, 2372 int *errcode, int max_page_order) 2373 { 2374 struct sk_buff *skb; 2375 long timeo; 2376 int err; 2377 2378 timeo = sock_sndtimeo(sk, noblock); 2379 for (;;) { 2380 err = sock_error(sk); 2381 if (err != 0) 2382 goto failure; 2383 2384 err = -EPIPE; 2385 if (sk->sk_shutdown & SEND_SHUTDOWN) 2386 goto failure; 2387 2388 if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf)) 2389 break; 2390 2391 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); 2392 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 2393 err = -EAGAIN; 2394 if (!timeo) 2395 goto failure; 2396 if (signal_pending(current)) 2397 goto interrupted; 2398 timeo = sock_wait_for_wmem(sk, timeo); 2399 } 2400 skb = alloc_skb_with_frags(header_len, data_len, max_page_order, 2401 errcode, sk->sk_allocation); 2402 if (skb) 2403 skb_set_owner_w(skb, sk); 2404 return skb; 2405 2406 interrupted: 2407 err = sock_intr_errno(timeo); 2408 failure: 2409 *errcode = err; 2410 return NULL; 2411 } 2412 EXPORT_SYMBOL(sock_alloc_send_pskb); 2413 2414 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, 2415 int noblock, int *errcode) 2416 { 2417 return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0); 2418 } 2419 EXPORT_SYMBOL(sock_alloc_send_skb); 2420 2421 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg, 2422 struct sockcm_cookie *sockc) 2423 { 2424 u32 tsflags; 2425 2426 switch (cmsg->cmsg_type) { 2427 case SO_MARK: 2428 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 2429 return -EPERM; 2430 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) 2431 return -EINVAL; 2432 sockc->mark = *(u32 *)CMSG_DATA(cmsg); 2433 break; 2434 case SO_TIMESTAMPING_OLD: 2435 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) 2436 return -EINVAL; 2437 2438 tsflags = *(u32 *)CMSG_DATA(cmsg); 2439 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK) 2440 return -EINVAL; 2441 2442 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK; 2443 sockc->tsflags |= tsflags; 2444 break; 2445 case SCM_TXTIME: 2446 if (!sock_flag(sk, SOCK_TXTIME)) 2447 return -EINVAL; 2448 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64))) 2449 return -EINVAL; 2450 sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg)); 2451 break; 2452 /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. 
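 * They are accepted and ignored here (rather than failing with -EINVAL) so
 * that callers which include them keep working; any real processing of
 * these types happens in the scm code, not here.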
*/ 2453 case SCM_RIGHTS: 2454 case SCM_CREDENTIALS: 2455 break; 2456 default: 2457 return -EINVAL; 2458 } 2459 return 0; 2460 } 2461 EXPORT_SYMBOL(__sock_cmsg_send); 2462 2463 int sock_cmsg_send(struct sock *sk, struct msghdr *msg, 2464 struct sockcm_cookie *sockc) 2465 { 2466 struct cmsghdr *cmsg; 2467 int ret; 2468 2469 for_each_cmsghdr(cmsg, msg) { 2470 if (!CMSG_OK(msg, cmsg)) 2471 return -EINVAL; 2472 if (cmsg->cmsg_level != SOL_SOCKET) 2473 continue; 2474 ret = __sock_cmsg_send(sk, msg, cmsg, sockc); 2475 if (ret) 2476 return ret; 2477 } 2478 return 0; 2479 } 2480 EXPORT_SYMBOL(sock_cmsg_send); 2481 2482 static void sk_enter_memory_pressure(struct sock *sk) 2483 { 2484 if (!sk->sk_prot->enter_memory_pressure) 2485 return; 2486 2487 sk->sk_prot->enter_memory_pressure(sk); 2488 } 2489 2490 static void sk_leave_memory_pressure(struct sock *sk) 2491 { 2492 if (sk->sk_prot->leave_memory_pressure) { 2493 sk->sk_prot->leave_memory_pressure(sk); 2494 } else { 2495 unsigned long *memory_pressure = sk->sk_prot->memory_pressure; 2496 2497 if (memory_pressure && READ_ONCE(*memory_pressure)) 2498 WRITE_ONCE(*memory_pressure, 0); 2499 } 2500 } 2501 2502 #define SKB_FRAG_PAGE_ORDER get_order(32768) 2503 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key); 2504 2505 /** 2506 * skb_page_frag_refill - check that a page_frag contains enough room 2507 * @sz: minimum size of the fragment we want to get 2508 * @pfrag: pointer to page_frag 2509 * @gfp: priority for memory allocation 2510 * 2511 * Note: While this allocator tries to use high order pages, there is 2512 * no guarantee that allocations succeed. Therefore, @sz MUST be 2513 * less or equal than PAGE_SIZE. 2514 */ 2515 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp) 2516 { 2517 if (pfrag->page) { 2518 if (page_ref_count(pfrag->page) == 1) { 2519 pfrag->offset = 0; 2520 return true; 2521 } 2522 if (pfrag->offset + sz <= pfrag->size) 2523 return true; 2524 put_page(pfrag->page); 2525 } 2526 2527 pfrag->offset = 0; 2528 if (SKB_FRAG_PAGE_ORDER && 2529 !static_branch_unlikely(&net_high_order_alloc_disable_key)) { 2530 /* Avoid direct reclaim but allow kswapd to wake */ 2531 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) | 2532 __GFP_COMP | __GFP_NOWARN | 2533 __GFP_NORETRY, 2534 SKB_FRAG_PAGE_ORDER); 2535 if (likely(pfrag->page)) { 2536 pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER; 2537 return true; 2538 } 2539 } 2540 pfrag->page = alloc_page(gfp); 2541 if (likely(pfrag->page)) { 2542 pfrag->size = PAGE_SIZE; 2543 return true; 2544 } 2545 return false; 2546 } 2547 EXPORT_SYMBOL(skb_page_frag_refill); 2548 2549 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) 2550 { 2551 if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation))) 2552 return true; 2553 2554 sk_enter_memory_pressure(sk); 2555 sk_stream_moderate_sndbuf(sk); 2556 return false; 2557 } 2558 EXPORT_SYMBOL(sk_page_frag_refill); 2559 2560 void __lock_sock(struct sock *sk) 2561 __releases(&sk->sk_lock.slock) 2562 __acquires(&sk->sk_lock.slock) 2563 { 2564 DEFINE_WAIT(wait); 2565 2566 for (;;) { 2567 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait, 2568 TASK_UNINTERRUPTIBLE); 2569 spin_unlock_bh(&sk->sk_lock.slock); 2570 schedule(); 2571 spin_lock_bh(&sk->sk_lock.slock); 2572 if (!sock_owned_by_user(sk)) 2573 break; 2574 } 2575 finish_wait(&sk->sk_lock.wq, &wait); 2576 } 2577 2578 void __release_sock(struct sock *sk) 2579 __releases(&sk->sk_lock.slock) 2580 __acquires(&sk->sk_lock.slock) 2581 { 2582 struct sk_buff *skb, 
*next; 2583 2584 while ((skb = sk->sk_backlog.head) != NULL) { 2585 sk->sk_backlog.head = sk->sk_backlog.tail = NULL; 2586 2587 spin_unlock_bh(&sk->sk_lock.slock); 2588 2589 do { 2590 next = skb->next; 2591 prefetch(next); 2592 WARN_ON_ONCE(skb_dst_is_noref(skb)); 2593 skb_mark_not_on_list(skb); 2594 sk_backlog_rcv(sk, skb); 2595 2596 cond_resched(); 2597 2598 skb = next; 2599 } while (skb != NULL); 2600 2601 spin_lock_bh(&sk->sk_lock.slock); 2602 } 2603 2604 /* 2605 * Doing the zeroing here guarantee we can not loop forever 2606 * while a wild producer attempts to flood us. 2607 */ 2608 sk->sk_backlog.len = 0; 2609 } 2610 2611 void __sk_flush_backlog(struct sock *sk) 2612 { 2613 spin_lock_bh(&sk->sk_lock.slock); 2614 __release_sock(sk); 2615 spin_unlock_bh(&sk->sk_lock.slock); 2616 } 2617 2618 /** 2619 * sk_wait_data - wait for data to arrive at sk_receive_queue 2620 * @sk: sock to wait on 2621 * @timeo: for how long 2622 * @skb: last skb seen on sk_receive_queue 2623 * 2624 * Now socket state including sk->sk_err is changed only under lock, 2625 * hence we may omit checks after joining wait queue. 2626 * We check receive queue before schedule() only as optimization; 2627 * it is very likely that release_sock() added new data. 2628 */ 2629 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb) 2630 { 2631 DEFINE_WAIT_FUNC(wait, woken_wake_function); 2632 int rc; 2633 2634 add_wait_queue(sk_sleep(sk), &wait); 2635 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); 2636 rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait); 2637 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); 2638 remove_wait_queue(sk_sleep(sk), &wait); 2639 return rc; 2640 } 2641 EXPORT_SYMBOL(sk_wait_data); 2642 2643 /** 2644 * __sk_mem_raise_allocated - increase memory_allocated 2645 * @sk: socket 2646 * @size: memory size to allocate 2647 * @amt: pages to allocate 2648 * @kind: allocation type 2649 * 2650 * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc 2651 */ 2652 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind) 2653 { 2654 struct proto *prot = sk->sk_prot; 2655 long allocated = sk_memory_allocated_add(sk, amt); 2656 bool charged = true; 2657 2658 if (mem_cgroup_sockets_enabled && sk->sk_memcg && 2659 !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt))) 2660 goto suppress_allocation; 2661 2662 /* Under limit. */ 2663 if (allocated <= sk_prot_mem_limits(sk, 0)) { 2664 sk_leave_memory_pressure(sk); 2665 return 1; 2666 } 2667 2668 /* Under pressure. */ 2669 if (allocated > sk_prot_mem_limits(sk, 1)) 2670 sk_enter_memory_pressure(sk); 2671 2672 /* Over hard limit. 
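 * (sysctl_mem[2], e.g. tcp_mem[2] for TCP): normally refuse the allocation;
 * the suppress_allocation path below handles the remaining special cases.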
*/ 2673 if (allocated > sk_prot_mem_limits(sk, 2)) 2674 goto suppress_allocation; 2675 2676 /* guarantee minimum buffer size under pressure */ 2677 if (kind == SK_MEM_RECV) { 2678 if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot)) 2679 return 1; 2680 2681 } else { /* SK_MEM_SEND */ 2682 int wmem0 = sk_get_wmem0(sk, prot); 2683 2684 if (sk->sk_type == SOCK_STREAM) { 2685 if (sk->sk_wmem_queued < wmem0) 2686 return 1; 2687 } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) { 2688 return 1; 2689 } 2690 } 2691 2692 if (sk_has_memory_pressure(sk)) { 2693 u64 alloc; 2694 2695 if (!sk_under_memory_pressure(sk)) 2696 return 1; 2697 alloc = sk_sockets_allocated_read_positive(sk); 2698 if (sk_prot_mem_limits(sk, 2) > alloc * 2699 sk_mem_pages(sk->sk_wmem_queued + 2700 atomic_read(&sk->sk_rmem_alloc) + 2701 sk->sk_forward_alloc)) 2702 return 1; 2703 } 2704 2705 suppress_allocation: 2706 2707 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) { 2708 sk_stream_moderate_sndbuf(sk); 2709 2710 /* Fail only if socket is _under_ its sndbuf. 2711 * In this case we cannot block, so that we have to fail. 2712 */ 2713 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) 2714 return 1; 2715 } 2716 2717 if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged)) 2718 trace_sock_exceed_buf_limit(sk, prot, allocated, kind); 2719 2720 sk_memory_allocated_sub(sk, amt); 2721 2722 if (mem_cgroup_sockets_enabled && sk->sk_memcg) 2723 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt); 2724 2725 return 0; 2726 } 2727 EXPORT_SYMBOL(__sk_mem_raise_allocated); 2728 2729 /** 2730 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated 2731 * @sk: socket 2732 * @size: memory size to allocate 2733 * @kind: allocation type 2734 * 2735 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means 2736 * rmem allocation. This function assumes that protocols which have 2737 * memory_pressure use sk_wmem_queued as write buffer accounting. 
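 *
 * Illustrative use (not part of the original comment), via the
 * sk_wmem_schedule() wrapper from include/net/sock.h:
 *
 *	if (!sk_wmem_schedule(sk, skb->truesize))
 *		return -ENOBUFS;
 *	sk_mem_charge(sk, skb->truesize);
 *
 * A datagram-style sender might fail with -ENOBUFS as above; stream
 * protocols typically wait for memory instead of erroring out.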
2738 */ 2739 int __sk_mem_schedule(struct sock *sk, int size, int kind) 2740 { 2741 int ret, amt = sk_mem_pages(size); 2742 2743 sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT; 2744 ret = __sk_mem_raise_allocated(sk, size, amt, kind); 2745 if (!ret) 2746 sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT; 2747 return ret; 2748 } 2749 EXPORT_SYMBOL(__sk_mem_schedule); 2750 2751 /** 2752 * __sk_mem_reduce_allocated - reclaim memory_allocated 2753 * @sk: socket 2754 * @amount: number of quanta 2755 * 2756 * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc 2757 */ 2758 void __sk_mem_reduce_allocated(struct sock *sk, int amount) 2759 { 2760 sk_memory_allocated_sub(sk, amount); 2761 2762 if (mem_cgroup_sockets_enabled && sk->sk_memcg) 2763 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount); 2764 2765 if (sk_under_memory_pressure(sk) && 2766 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))) 2767 sk_leave_memory_pressure(sk); 2768 } 2769 EXPORT_SYMBOL(__sk_mem_reduce_allocated); 2770 2771 /** 2772 * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated 2773 * @sk: socket 2774 * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple) 2775 */ 2776 void __sk_mem_reclaim(struct sock *sk, int amount) 2777 { 2778 amount >>= SK_MEM_QUANTUM_SHIFT; 2779 sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT; 2780 __sk_mem_reduce_allocated(sk, amount); 2781 } 2782 EXPORT_SYMBOL(__sk_mem_reclaim); 2783 2784 int sk_set_peek_off(struct sock *sk, int val) 2785 { 2786 sk->sk_peek_off = val; 2787 return 0; 2788 } 2789 EXPORT_SYMBOL_GPL(sk_set_peek_off); 2790 2791 /* 2792 * Set of default routines for initialising struct proto_ops when 2793 * the protocol does not support a particular function. In certain 2794 * cases where it makes no sense for a protocol to have a "do nothing" 2795 * function, some default processing is provided. 
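 *
 * Illustrative sketch (hypothetical PF_FOO family, not part of this file):
 * a protocol without a notion of connections can plug these stubs straight
 * into its proto_ops:
 *
 *	static const struct proto_ops foo_dgram_ops = {
 *		.family		= PF_FOO,
 *		.owner		= THIS_MODULE,
 *		.connect	= sock_no_connect,
 *		.accept		= sock_no_accept,
 *		.listen		= sock_no_listen,
 *		.mmap		= sock_no_mmap,
 *		.sendpage	= sock_no_sendpage,
 *	};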
2796 */ 2797 2798 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len) 2799 { 2800 return -EOPNOTSUPP; 2801 } 2802 EXPORT_SYMBOL(sock_no_bind); 2803 2804 int sock_no_connect(struct socket *sock, struct sockaddr *saddr, 2805 int len, int flags) 2806 { 2807 return -EOPNOTSUPP; 2808 } 2809 EXPORT_SYMBOL(sock_no_connect); 2810 2811 int sock_no_socketpair(struct socket *sock1, struct socket *sock2) 2812 { 2813 return -EOPNOTSUPP; 2814 } 2815 EXPORT_SYMBOL(sock_no_socketpair); 2816 2817 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags, 2818 bool kern) 2819 { 2820 return -EOPNOTSUPP; 2821 } 2822 EXPORT_SYMBOL(sock_no_accept); 2823 2824 int sock_no_getname(struct socket *sock, struct sockaddr *saddr, 2825 int peer) 2826 { 2827 return -EOPNOTSUPP; 2828 } 2829 EXPORT_SYMBOL(sock_no_getname); 2830 2831 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 2832 { 2833 return -EOPNOTSUPP; 2834 } 2835 EXPORT_SYMBOL(sock_no_ioctl); 2836 2837 int sock_no_listen(struct socket *sock, int backlog) 2838 { 2839 return -EOPNOTSUPP; 2840 } 2841 EXPORT_SYMBOL(sock_no_listen); 2842 2843 int sock_no_shutdown(struct socket *sock, int how) 2844 { 2845 return -EOPNOTSUPP; 2846 } 2847 EXPORT_SYMBOL(sock_no_shutdown); 2848 2849 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len) 2850 { 2851 return -EOPNOTSUPP; 2852 } 2853 EXPORT_SYMBOL(sock_no_sendmsg); 2854 2855 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len) 2856 { 2857 return -EOPNOTSUPP; 2858 } 2859 EXPORT_SYMBOL(sock_no_sendmsg_locked); 2860 2861 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len, 2862 int flags) 2863 { 2864 return -EOPNOTSUPP; 2865 } 2866 EXPORT_SYMBOL(sock_no_recvmsg); 2867 2868 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma) 2869 { 2870 /* Mirror missing mmap method error code */ 2871 return -ENODEV; 2872 } 2873 EXPORT_SYMBOL(sock_no_mmap); 2874 2875 /* 2876 * When a file is received (via SCM_RIGHTS, etc), we must bump the 2877 * various sock-based usage counts. 
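 * This runs in the context of the receiving process, so the passed socket
 * picks up the receiver's cgroup-based classid and netprio configuration.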
2878 */ 2879 void __receive_sock(struct file *file) 2880 { 2881 struct socket *sock; 2882 2883 sock = sock_from_file(file); 2884 if (sock) { 2885 sock_update_netprioidx(&sock->sk->sk_cgrp_data); 2886 sock_update_classid(&sock->sk->sk_cgrp_data); 2887 } 2888 } 2889 2890 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags) 2891 { 2892 ssize_t res; 2893 struct msghdr msg = {.msg_flags = flags}; 2894 struct kvec iov; 2895 char *kaddr = kmap(page); 2896 iov.iov_base = kaddr + offset; 2897 iov.iov_len = size; 2898 res = kernel_sendmsg(sock, &msg, &iov, 1, size); 2899 kunmap(page); 2900 return res; 2901 } 2902 EXPORT_SYMBOL(sock_no_sendpage); 2903 2904 ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page, 2905 int offset, size_t size, int flags) 2906 { 2907 ssize_t res; 2908 struct msghdr msg = {.msg_flags = flags}; 2909 struct kvec iov; 2910 char *kaddr = kmap(page); 2911 2912 iov.iov_base = kaddr + offset; 2913 iov.iov_len = size; 2914 res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size); 2915 kunmap(page); 2916 return res; 2917 } 2918 EXPORT_SYMBOL(sock_no_sendpage_locked); 2919 2920 /* 2921 * Default Socket Callbacks 2922 */ 2923 2924 static void sock_def_wakeup(struct sock *sk) 2925 { 2926 struct socket_wq *wq; 2927 2928 rcu_read_lock(); 2929 wq = rcu_dereference(sk->sk_wq); 2930 if (skwq_has_sleeper(wq)) 2931 wake_up_interruptible_all(&wq->wait); 2932 rcu_read_unlock(); 2933 } 2934 2935 static void sock_def_error_report(struct sock *sk) 2936 { 2937 struct socket_wq *wq; 2938 2939 rcu_read_lock(); 2940 wq = rcu_dereference(sk->sk_wq); 2941 if (skwq_has_sleeper(wq)) 2942 wake_up_interruptible_poll(&wq->wait, EPOLLERR); 2943 sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR); 2944 rcu_read_unlock(); 2945 } 2946 2947 void sock_def_readable(struct sock *sk) 2948 { 2949 struct socket_wq *wq; 2950 2951 rcu_read_lock(); 2952 wq = rcu_dereference(sk->sk_wq); 2953 if (skwq_has_sleeper(wq)) 2954 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI | 2955 EPOLLRDNORM | EPOLLRDBAND); 2956 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); 2957 rcu_read_unlock(); 2958 } 2959 2960 static void sock_def_write_space(struct sock *sk) 2961 { 2962 struct socket_wq *wq; 2963 2964 rcu_read_lock(); 2965 2966 /* Do not wake up a writer until he can make "significant" 2967 * progress. 
--DaveM 2968 */ 2969 if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= READ_ONCE(sk->sk_sndbuf)) { 2970 wq = rcu_dereference(sk->sk_wq); 2971 if (skwq_has_sleeper(wq)) 2972 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | 2973 EPOLLWRNORM | EPOLLWRBAND); 2974 2975 /* Should agree with poll, otherwise some programs break */ 2976 if (sock_writeable(sk)) 2977 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); 2978 } 2979 2980 rcu_read_unlock(); 2981 } 2982 2983 static void sock_def_destruct(struct sock *sk) 2984 { 2985 } 2986 2987 void sk_send_sigurg(struct sock *sk) 2988 { 2989 if (sk->sk_socket && sk->sk_socket->file) 2990 if (send_sigurg(&sk->sk_socket->file->f_owner)) 2991 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI); 2992 } 2993 EXPORT_SYMBOL(sk_send_sigurg); 2994 2995 void sk_reset_timer(struct sock *sk, struct timer_list* timer, 2996 unsigned long expires) 2997 { 2998 if (!mod_timer(timer, expires)) 2999 sock_hold(sk); 3000 } 3001 EXPORT_SYMBOL(sk_reset_timer); 3002 3003 void sk_stop_timer(struct sock *sk, struct timer_list* timer) 3004 { 3005 if (del_timer(timer)) 3006 __sock_put(sk); 3007 } 3008 EXPORT_SYMBOL(sk_stop_timer); 3009 3010 void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer) 3011 { 3012 if (del_timer_sync(timer)) 3013 __sock_put(sk); 3014 } 3015 EXPORT_SYMBOL(sk_stop_timer_sync); 3016 3017 void sock_init_data(struct socket *sock, struct sock *sk) 3018 { 3019 sk_init_common(sk); 3020 sk->sk_send_head = NULL; 3021 3022 timer_setup(&sk->sk_timer, NULL, 0); 3023 3024 sk->sk_allocation = GFP_KERNEL; 3025 sk->sk_rcvbuf = sysctl_rmem_default; 3026 sk->sk_sndbuf = sysctl_wmem_default; 3027 sk->sk_state = TCP_CLOSE; 3028 sk_set_socket(sk, sock); 3029 3030 sock_set_flag(sk, SOCK_ZAPPED); 3031 3032 if (sock) { 3033 sk->sk_type = sock->type; 3034 RCU_INIT_POINTER(sk->sk_wq, &sock->wq); 3035 sock->sk = sk; 3036 sk->sk_uid = SOCK_INODE(sock)->i_uid; 3037 } else { 3038 RCU_INIT_POINTER(sk->sk_wq, NULL); 3039 sk->sk_uid = make_kuid(sock_net(sk)->user_ns, 0); 3040 } 3041 3042 rwlock_init(&sk->sk_callback_lock); 3043 if (sk->sk_kern_sock) 3044 lockdep_set_class_and_name( 3045 &sk->sk_callback_lock, 3046 af_kern_callback_keys + sk->sk_family, 3047 af_family_kern_clock_key_strings[sk->sk_family]); 3048 else 3049 lockdep_set_class_and_name( 3050 &sk->sk_callback_lock, 3051 af_callback_keys + sk->sk_family, 3052 af_family_clock_key_strings[sk->sk_family]); 3053 3054 sk->sk_state_change = sock_def_wakeup; 3055 sk->sk_data_ready = sock_def_readable; 3056 sk->sk_write_space = sock_def_write_space; 3057 sk->sk_error_report = sock_def_error_report; 3058 sk->sk_destruct = sock_def_destruct; 3059 3060 sk->sk_frag.page = NULL; 3061 sk->sk_frag.offset = 0; 3062 sk->sk_peek_off = -1; 3063 3064 sk->sk_peer_pid = NULL; 3065 sk->sk_peer_cred = NULL; 3066 sk->sk_write_pending = 0; 3067 sk->sk_rcvlowat = 1; 3068 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; 3069 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; 3070 3071 sk->sk_stamp = SK_DEFAULT_STAMP; 3072 #if BITS_PER_LONG==32 3073 seqlock_init(&sk->sk_stamp_seq); 3074 #endif 3075 atomic_set(&sk->sk_zckey, 0); 3076 3077 #ifdef CONFIG_NET_RX_BUSY_POLL 3078 sk->sk_napi_id = 0; 3079 sk->sk_ll_usec = sysctl_net_busy_read; 3080 #endif 3081 3082 sk->sk_max_pacing_rate = ~0UL; 3083 sk->sk_pacing_rate = ~0UL; 3084 WRITE_ONCE(sk->sk_pacing_shift, 10); 3085 sk->sk_incoming_cpu = -1; 3086 3087 sk_rx_queue_clear(sk); 3088 /* 3089 * Before updating sk_refcnt, we must commit prior changes to memory 3090 * (Documentation/RCU/rculist_nulls.rst for details) 3091 */ 3092 
smp_wmb(); 3093 refcount_set(&sk->sk_refcnt, 1); 3094 atomic_set(&sk->sk_drops, 0); 3095 } 3096 EXPORT_SYMBOL(sock_init_data); 3097 3098 void lock_sock_nested(struct sock *sk, int subclass) 3099 { 3100 might_sleep(); 3101 spin_lock_bh(&sk->sk_lock.slock); 3102 if (sk->sk_lock.owned) 3103 __lock_sock(sk); 3104 sk->sk_lock.owned = 1; 3105 spin_unlock(&sk->sk_lock.slock); 3106 /* 3107 * The sk_lock has mutex_lock() semantics here: 3108 */ 3109 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_); 3110 local_bh_enable(); 3111 } 3112 EXPORT_SYMBOL(lock_sock_nested); 3113 3114 void release_sock(struct sock *sk) 3115 { 3116 spin_lock_bh(&sk->sk_lock.slock); 3117 if (sk->sk_backlog.tail) 3118 __release_sock(sk); 3119 3120 /* Warning : release_cb() might need to release sk ownership, 3121 * ie call sock_release_ownership(sk) before us. 3122 */ 3123 if (sk->sk_prot->release_cb) 3124 sk->sk_prot->release_cb(sk); 3125 3126 sock_release_ownership(sk); 3127 if (waitqueue_active(&sk->sk_lock.wq)) 3128 wake_up(&sk->sk_lock.wq); 3129 spin_unlock_bh(&sk->sk_lock.slock); 3130 } 3131 EXPORT_SYMBOL(release_sock); 3132 3133 /** 3134 * lock_sock_fast - fast version of lock_sock 3135 * @sk: socket 3136 * 3137 * This version should be used for very small section, where process wont block 3138 * return false if fast path is taken: 3139 * 3140 * sk_lock.slock locked, owned = 0, BH disabled 3141 * 3142 * return true if slow path is taken: 3143 * 3144 * sk_lock.slock unlocked, owned = 1, BH enabled 3145 */ 3146 bool lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock) 3147 { 3148 might_sleep(); 3149 spin_lock_bh(&sk->sk_lock.slock); 3150 3151 if (!sk->sk_lock.owned) 3152 /* 3153 * Note : We must disable BH 3154 */ 3155 return false; 3156 3157 __lock_sock(sk); 3158 sk->sk_lock.owned = 1; 3159 spin_unlock(&sk->sk_lock.slock); 3160 /* 3161 * The sk_lock has mutex_lock() semantics here: 3162 */ 3163 mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_); 3164 __acquire(&sk->sk_lock.slock); 3165 local_bh_enable(); 3166 return true; 3167 } 3168 EXPORT_SYMBOL(lock_sock_fast); 3169 3170 int sock_gettstamp(struct socket *sock, void __user *userstamp, 3171 bool timeval, bool time32) 3172 { 3173 struct sock *sk = sock->sk; 3174 struct timespec64 ts; 3175 3176 sock_enable_timestamp(sk, SOCK_TIMESTAMP); 3177 ts = ktime_to_timespec64(sock_read_timestamp(sk)); 3178 if (ts.tv_sec == -1) 3179 return -ENOENT; 3180 if (ts.tv_sec == 0) { 3181 ktime_t kt = ktime_get_real(); 3182 sock_write_timestamp(sk, kt); 3183 ts = ktime_to_timespec64(kt); 3184 } 3185 3186 if (timeval) 3187 ts.tv_nsec /= 1000; 3188 3189 #ifdef CONFIG_COMPAT_32BIT_TIME 3190 if (time32) 3191 return put_old_timespec32(&ts, userstamp); 3192 #endif 3193 #ifdef CONFIG_SPARC64 3194 /* beware of padding in sparc64 timeval */ 3195 if (timeval && !in_compat_syscall()) { 3196 struct __kernel_old_timeval __user tv = { 3197 .tv_sec = ts.tv_sec, 3198 .tv_usec = ts.tv_nsec, 3199 }; 3200 if (copy_to_user(userstamp, &tv, sizeof(tv))) 3201 return -EFAULT; 3202 return 0; 3203 } 3204 #endif 3205 return put_timespec64(&ts, userstamp); 3206 } 3207 EXPORT_SYMBOL(sock_gettstamp); 3208 3209 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag) 3210 { 3211 if (!sock_flag(sk, flag)) { 3212 unsigned long previous_flags = sk->sk_flags; 3213 3214 sock_set_flag(sk, flag); 3215 /* 3216 * we just set one of the two flags which require net 3217 * time stamping, but time stamping might have been on 3218 * already because of the other one 3219 */ 3220 if (sock_needs_netstamp(sk) 
&& 3221 !(previous_flags & SK_FLAGS_TIMESTAMP)) 3222 net_enable_timestamp(); 3223 } 3224 } 3225 3226 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, 3227 int level, int type) 3228 { 3229 struct sock_exterr_skb *serr; 3230 struct sk_buff *skb; 3231 int copied, err; 3232 3233 err = -EAGAIN; 3234 skb = sock_dequeue_err_skb(sk); 3235 if (skb == NULL) 3236 goto out; 3237 3238 copied = skb->len; 3239 if (copied > len) { 3240 msg->msg_flags |= MSG_TRUNC; 3241 copied = len; 3242 } 3243 err = skb_copy_datagram_msg(skb, 0, msg, copied); 3244 if (err) 3245 goto out_free_skb; 3246 3247 sock_recv_timestamp(msg, sk, skb); 3248 3249 serr = SKB_EXT_ERR(skb); 3250 put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee); 3251 3252 msg->msg_flags |= MSG_ERRQUEUE; 3253 err = copied; 3254 3255 out_free_skb: 3256 kfree_skb(skb); 3257 out: 3258 return err; 3259 } 3260 EXPORT_SYMBOL(sock_recv_errqueue); 3261 3262 /* 3263 * Get a socket option on an socket. 3264 * 3265 * FIX: POSIX 1003.1g is very ambiguous here. It states that 3266 * asynchronous errors should be reported by getsockopt. We assume 3267 * this means if you specify SO_ERROR (otherwise whats the point of it). 3268 */ 3269 int sock_common_getsockopt(struct socket *sock, int level, int optname, 3270 char __user *optval, int __user *optlen) 3271 { 3272 struct sock *sk = sock->sk; 3273 3274 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen); 3275 } 3276 EXPORT_SYMBOL(sock_common_getsockopt); 3277 3278 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, 3279 int flags) 3280 { 3281 struct sock *sk = sock->sk; 3282 int addr_len = 0; 3283 int err; 3284 3285 err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT, 3286 flags & ~MSG_DONTWAIT, &addr_len); 3287 if (err >= 0) 3288 msg->msg_namelen = addr_len; 3289 return err; 3290 } 3291 EXPORT_SYMBOL(sock_common_recvmsg); 3292 3293 /* 3294 * Set socket options on an inet socket. 3295 */ 3296 int sock_common_setsockopt(struct socket *sock, int level, int optname, 3297 sockptr_t optval, unsigned int optlen) 3298 { 3299 struct sock *sk = sock->sk; 3300 3301 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen); 3302 } 3303 EXPORT_SYMBOL(sock_common_setsockopt); 3304 3305 void sk_common_release(struct sock *sk) 3306 { 3307 if (sk->sk_prot->destroy) 3308 sk->sk_prot->destroy(sk); 3309 3310 /* 3311 * Observation: when sk_common_release is called, processes have 3312 * no access to socket. But net still has. 3313 * Step one, detach it from networking: 3314 * 3315 * A. Remove from hash tables. 3316 */ 3317 3318 sk->sk_prot->unhash(sk); 3319 3320 /* 3321 * In this point socket cannot receive new packets, but it is possible 3322 * that some packets are in flight because some CPU runs receiver and 3323 * did hash table lookup before we unhashed socket. They will achieve 3324 * receive queue and will be purged by socket destructor. 3325 * 3326 * Also we still have packets pending on receive queue and probably, 3327 * our own packets waiting in device queues. sock_destroy will drain 3328 * receive queue, but transmitted packets will delay socket destruction 3329 * until the last reference will be released. 
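 *
 * Finally, sock_orphan() below detaches the sock from its struct socket
 * and wait queue so that late wakeups cannot touch freed state, and the
 * closing sock_put() drops our reference.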
3330 */ 3331 3332 sock_orphan(sk); 3333 3334 xfrm_sk_free_policy(sk); 3335 3336 sk_refcnt_debug_release(sk); 3337 3338 sock_put(sk); 3339 } 3340 EXPORT_SYMBOL(sk_common_release); 3341 3342 void sk_get_meminfo(const struct sock *sk, u32 *mem) 3343 { 3344 memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS); 3345 3346 mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk); 3347 mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf); 3348 mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk); 3349 mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf); 3350 mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc; 3351 mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued); 3352 mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); 3353 mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len); 3354 mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops); 3355 } 3356 3357 #ifdef CONFIG_PROC_FS 3358 #define PROTO_INUSE_NR 64 /* should be enough for the first time */ 3359 struct prot_inuse { 3360 int val[PROTO_INUSE_NR]; 3361 }; 3362 3363 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR); 3364 3365 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val) 3366 { 3367 __this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val); 3368 } 3369 EXPORT_SYMBOL_GPL(sock_prot_inuse_add); 3370 3371 int sock_prot_inuse_get(struct net *net, struct proto *prot) 3372 { 3373 int cpu, idx = prot->inuse_idx; 3374 int res = 0; 3375 3376 for_each_possible_cpu(cpu) 3377 res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx]; 3378 3379 return res >= 0 ? res : 0; 3380 } 3381 EXPORT_SYMBOL_GPL(sock_prot_inuse_get); 3382 3383 static void sock_inuse_add(struct net *net, int val) 3384 { 3385 this_cpu_add(*net->core.sock_inuse, val); 3386 } 3387 3388 int sock_inuse_get(struct net *net) 3389 { 3390 int cpu, res = 0; 3391 3392 for_each_possible_cpu(cpu) 3393 res += *per_cpu_ptr(net->core.sock_inuse, cpu); 3394 3395 return res; 3396 } 3397 3398 EXPORT_SYMBOL_GPL(sock_inuse_get); 3399 3400 static int __net_init sock_inuse_init_net(struct net *net) 3401 { 3402 net->core.prot_inuse = alloc_percpu(struct prot_inuse); 3403 if (net->core.prot_inuse == NULL) 3404 return -ENOMEM; 3405 3406 net->core.sock_inuse = alloc_percpu(int); 3407 if (net->core.sock_inuse == NULL) 3408 goto out; 3409 3410 return 0; 3411 3412 out: 3413 free_percpu(net->core.prot_inuse); 3414 return -ENOMEM; 3415 } 3416 3417 static void __net_exit sock_inuse_exit_net(struct net *net) 3418 { 3419 free_percpu(net->core.prot_inuse); 3420 free_percpu(net->core.sock_inuse); 3421 } 3422 3423 static struct pernet_operations net_inuse_ops = { 3424 .init = sock_inuse_init_net, 3425 .exit = sock_inuse_exit_net, 3426 }; 3427 3428 static __init int net_inuse_init(void) 3429 { 3430 if (register_pernet_subsys(&net_inuse_ops)) 3431 panic("Cannot initialize net inuse counters"); 3432 3433 return 0; 3434 } 3435 3436 core_initcall(net_inuse_init); 3437 3438 static int assign_proto_idx(struct proto *prot) 3439 { 3440 prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR); 3441 3442 if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) { 3443 pr_err("PROTO_INUSE_NR exhausted\n"); 3444 return -ENOSPC; 3445 } 3446 3447 set_bit(prot->inuse_idx, proto_inuse_idx); 3448 return 0; 3449 } 3450 3451 static void release_proto_idx(struct proto *prot) 3452 { 3453 if (prot->inuse_idx != PROTO_INUSE_NR - 1) 3454 clear_bit(prot->inuse_idx, proto_inuse_idx); 3455 } 3456 #else 3457 static inline int assign_proto_idx(struct proto *prot) 3458 { 3459 return 0; 3460 } 3461 3462 static inline 
void release_proto_idx(struct proto *prot) 3463 { 3464 } 3465 3466 static void sock_inuse_add(struct net *net, int val) 3467 { 3468 } 3469 #endif 3470 3471 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot) 3472 { 3473 if (!twsk_prot) 3474 return; 3475 kfree(twsk_prot->twsk_slab_name); 3476 twsk_prot->twsk_slab_name = NULL; 3477 kmem_cache_destroy(twsk_prot->twsk_slab); 3478 twsk_prot->twsk_slab = NULL; 3479 } 3480 3481 static int tw_prot_init(const struct proto *prot) 3482 { 3483 struct timewait_sock_ops *twsk_prot = prot->twsk_prot; 3484 3485 if (!twsk_prot) 3486 return 0; 3487 3488 twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", 3489 prot->name); 3490 if (!twsk_prot->twsk_slab_name) 3491 return -ENOMEM; 3492 3493 twsk_prot->twsk_slab = 3494 kmem_cache_create(twsk_prot->twsk_slab_name, 3495 twsk_prot->twsk_obj_size, 0, 3496 SLAB_ACCOUNT | prot->slab_flags, 3497 NULL); 3498 if (!twsk_prot->twsk_slab) { 3499 pr_crit("%s: Can't create timewait sock SLAB cache!\n", 3500 prot->name); 3501 return -ENOMEM; 3502 } 3503 3504 return 0; 3505 } 3506 3507 static void req_prot_cleanup(struct request_sock_ops *rsk_prot) 3508 { 3509 if (!rsk_prot) 3510 return; 3511 kfree(rsk_prot->slab_name); 3512 rsk_prot->slab_name = NULL; 3513 kmem_cache_destroy(rsk_prot->slab); 3514 rsk_prot->slab = NULL; 3515 } 3516 3517 static int req_prot_init(const struct proto *prot) 3518 { 3519 struct request_sock_ops *rsk_prot = prot->rsk_prot; 3520 3521 if (!rsk_prot) 3522 return 0; 3523 3524 rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", 3525 prot->name); 3526 if (!rsk_prot->slab_name) 3527 return -ENOMEM; 3528 3529 rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name, 3530 rsk_prot->obj_size, 0, 3531 SLAB_ACCOUNT | prot->slab_flags, 3532 NULL); 3533 3534 if (!rsk_prot->slab) { 3535 pr_crit("%s: Can't create request sock SLAB cache!\n", 3536 prot->name); 3537 return -ENOMEM; 3538 } 3539 return 0; 3540 } 3541 3542 int proto_register(struct proto *prot, int alloc_slab) 3543 { 3544 int ret = -ENOBUFS; 3545 3546 if (alloc_slab) { 3547 prot->slab = kmem_cache_create_usercopy(prot->name, 3548 prot->obj_size, 0, 3549 SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT | 3550 prot->slab_flags, 3551 prot->useroffset, prot->usersize, 3552 NULL); 3553 3554 if (prot->slab == NULL) { 3555 pr_crit("%s: Can't create sock SLAB cache!\n", 3556 prot->name); 3557 goto out; 3558 } 3559 3560 if (req_prot_init(prot)) 3561 goto out_free_request_sock_slab; 3562 3563 if (tw_prot_init(prot)) 3564 goto out_free_timewait_sock_slab; 3565 } 3566 3567 mutex_lock(&proto_list_mutex); 3568 ret = assign_proto_idx(prot); 3569 if (ret) { 3570 mutex_unlock(&proto_list_mutex); 3571 goto out_free_timewait_sock_slab; 3572 } 3573 list_add(&prot->node, &proto_list); 3574 mutex_unlock(&proto_list_mutex); 3575 return ret; 3576 3577 out_free_timewait_sock_slab: 3578 if (alloc_slab) 3579 tw_prot_cleanup(prot->twsk_prot); 3580 out_free_request_sock_slab: 3581 if (alloc_slab) { 3582 req_prot_cleanup(prot->rsk_prot); 3583 3584 kmem_cache_destroy(prot->slab); 3585 prot->slab = NULL; 3586 } 3587 out: 3588 return ret; 3589 } 3590 EXPORT_SYMBOL(proto_register); 3591 3592 void proto_unregister(struct proto *prot) 3593 { 3594 mutex_lock(&proto_list_mutex); 3595 release_proto_idx(prot); 3596 list_del(&prot->node); 3597 mutex_unlock(&proto_list_mutex); 3598 3599 kmem_cache_destroy(prot->slab); 3600 prot->slab = NULL; 3601 3602 req_prot_cleanup(prot->rsk_prot); 3603 tw_prot_cleanup(prot->twsk_prot); 3604 } 3605 EXPORT_SYMBOL(proto_unregister); 3606 
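/* Illustrative sketch (hypothetical "foo" protocol, not part of this file):
 * a protocol module normally pairs proto_register() and proto_unregister()
 * in its init/exit paths, passing alloc_slab = 1 so its socks come from a
 * dedicated kmem cache:
 *
 *	static struct proto foo_proto = {
 *		.name	  = "FOO",
 *		.owner	  = THIS_MODULE,
 *		.obj_size = sizeof(struct foo_sock),
 *	};
 *
 *	static int __init foo_init(void)
 *	{
 *		return proto_register(&foo_proto, 1);
 *	}
 *
 *	static void __exit foo_exit(void)
 *	{
 *		proto_unregister(&foo_proto);
 *	}
 */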
3607 int sock_load_diag_module(int family, int protocol) 3608 { 3609 if (!protocol) { 3610 if (!sock_is_registered(family)) 3611 return -ENOENT; 3612 3613 return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, 3614 NETLINK_SOCK_DIAG, family); 3615 } 3616 3617 #ifdef CONFIG_INET 3618 if (family == AF_INET && 3619 protocol != IPPROTO_RAW && 3620 protocol < MAX_INET_PROTOS && 3621 !rcu_access_pointer(inet_protos[protocol])) 3622 return -ENOENT; 3623 #endif 3624 3625 return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK, 3626 NETLINK_SOCK_DIAG, family, protocol); 3627 } 3628 EXPORT_SYMBOL(sock_load_diag_module); 3629 3630 #ifdef CONFIG_PROC_FS 3631 static void *proto_seq_start(struct seq_file *seq, loff_t *pos) 3632 __acquires(proto_list_mutex) 3633 { 3634 mutex_lock(&proto_list_mutex); 3635 return seq_list_start_head(&proto_list, *pos); 3636 } 3637 3638 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3639 { 3640 return seq_list_next(v, &proto_list, pos); 3641 } 3642 3643 static void proto_seq_stop(struct seq_file *seq, void *v) 3644 __releases(proto_list_mutex) 3645 { 3646 mutex_unlock(&proto_list_mutex); 3647 } 3648 3649 static char proto_method_implemented(const void *method) 3650 { 3651 return method == NULL ? 'n' : 'y'; 3652 } 3653 static long sock_prot_memory_allocated(struct proto *proto) 3654 { 3655 return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L; 3656 } 3657 3658 static const char *sock_prot_memory_pressure(struct proto *proto) 3659 { 3660 return proto->memory_pressure != NULL ? 3661 proto_memory_pressure(proto) ? "yes" : "no" : "NI"; 3662 } 3663 3664 static void proto_seq_printf(struct seq_file *seq, struct proto *proto) 3665 { 3666 3667 seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s " 3668 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n", 3669 proto->name, 3670 proto->obj_size, 3671 sock_prot_inuse_get(seq_file_net(seq), proto), 3672 sock_prot_memory_allocated(proto), 3673 sock_prot_memory_pressure(proto), 3674 proto->max_header, 3675 proto->slab == NULL ? 
"no" : "yes", 3676 module_name(proto->owner), 3677 proto_method_implemented(proto->close), 3678 proto_method_implemented(proto->connect), 3679 proto_method_implemented(proto->disconnect), 3680 proto_method_implemented(proto->accept), 3681 proto_method_implemented(proto->ioctl), 3682 proto_method_implemented(proto->init), 3683 proto_method_implemented(proto->destroy), 3684 proto_method_implemented(proto->shutdown), 3685 proto_method_implemented(proto->setsockopt), 3686 proto_method_implemented(proto->getsockopt), 3687 proto_method_implemented(proto->sendmsg), 3688 proto_method_implemented(proto->recvmsg), 3689 proto_method_implemented(proto->sendpage), 3690 proto_method_implemented(proto->bind), 3691 proto_method_implemented(proto->backlog_rcv), 3692 proto_method_implemented(proto->hash), 3693 proto_method_implemented(proto->unhash), 3694 proto_method_implemented(proto->get_port), 3695 proto_method_implemented(proto->enter_memory_pressure)); 3696 } 3697 3698 static int proto_seq_show(struct seq_file *seq, void *v) 3699 { 3700 if (v == &proto_list) 3701 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s", 3702 "protocol", 3703 "size", 3704 "sockets", 3705 "memory", 3706 "press", 3707 "maxhdr", 3708 "slab", 3709 "module", 3710 "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n"); 3711 else 3712 proto_seq_printf(seq, list_entry(v, struct proto, node)); 3713 return 0; 3714 } 3715 3716 static const struct seq_operations proto_seq_ops = { 3717 .start = proto_seq_start, 3718 .next = proto_seq_next, 3719 .stop = proto_seq_stop, 3720 .show = proto_seq_show, 3721 }; 3722 3723 static __net_init int proto_init_net(struct net *net) 3724 { 3725 if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops, 3726 sizeof(struct seq_net_private))) 3727 return -ENOMEM; 3728 3729 return 0; 3730 } 3731 3732 static __net_exit void proto_exit_net(struct net *net) 3733 { 3734 remove_proc_entry("protocols", net->proc_net); 3735 } 3736 3737 3738 static __net_initdata struct pernet_operations proto_net_ops = { 3739 .init = proto_init_net, 3740 .exit = proto_exit_net, 3741 }; 3742 3743 static int __init proto_init(void) 3744 { 3745 return register_pernet_subsys(&proto_net_ops); 3746 } 3747 3748 subsys_initcall(proto_init); 3749 3750 #endif /* PROC_FS */ 3751 3752 #ifdef CONFIG_NET_RX_BUSY_POLL 3753 bool sk_busy_loop_end(void *p, unsigned long start_time) 3754 { 3755 struct sock *sk = p; 3756 3757 return !skb_queue_empty_lockless(&sk->sk_receive_queue) || 3758 sk_busy_loop_timeout(sk, start_time); 3759 } 3760 EXPORT_SYMBOL(sk_busy_loop_end); 3761 #endif /* CONFIG_NET_RX_BUSY_POLL */ 3762 3763 int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len) 3764 { 3765 if (!sk->sk_prot->bind_add) 3766 return -EOPNOTSUPP; 3767 return sk->sk_prot->bind_add(sk, addr, addr_len); 3768 } 3769 EXPORT_SYMBOL(sock_bind_add); 3770