1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * INET An implementation of the TCP/IP protocol suite for the LINUX 4 * operating system. INET is implemented using the BSD Socket 5 * interface as the means of communication with the user level. 6 * 7 * Generic socket support routines. Memory allocators, socket lock/release 8 * handler for protocols to use and generic option handler. 9 * 10 * Authors: Ross Biro 11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 12 * Florian La Roche, <flla@stud.uni-sb.de> 13 * Alan Cox, <A.Cox@swansea.ac.uk> 14 * 15 * Fixes: 16 * Alan Cox : Numerous verify_area() problems 17 * Alan Cox : Connecting on a connecting socket 18 * now returns an error for tcp. 19 * Alan Cox : sock->protocol is set correctly. 20 * and is not sometimes left as 0. 21 * Alan Cox : connect handles icmp errors on a 22 * connect properly. Unfortunately there 23 * is a restart syscall nasty there. I 24 * can't match BSD without hacking the C 25 * library. Ideas urgently sought! 26 * Alan Cox : Disallow bind() to addresses that are 27 * not ours - especially broadcast ones!! 28 * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost) 29 * Alan Cox : sock_wfree/sock_rfree don't destroy sockets, 30 * instead they leave that for the DESTROY timer. 31 * Alan Cox : Clean up error flag in accept 32 * Alan Cox : TCP ack handling is buggy, the DESTROY timer 33 * was buggy. Put a remove_sock() in the handler 34 * for memory when we hit 0. Also altered the timer 35 * code. The ACK stuff can wait and needs major 36 * TCP layer surgery. 37 * Alan Cox : Fixed TCP ack bug, removed remove sock 38 * and fixed timer/inet_bh race. 39 * Alan Cox : Added zapped flag for TCP 40 * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code 41 * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb 42 * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources 43 * Alan Cox : Supports socket option broadcast now as does udp. 
Packet and raw need fixing. 44 * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so... 45 * Rick Sladkey : Relaxed UDP rules for matching packets. 46 * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support 47 * Pauline Middelink : identd support 48 * Alan Cox : Fixed connect() taking signals I think. 49 * Alan Cox : SO_LINGER supported 50 * Alan Cox : Error reporting fixes 51 * Anonymous : inet_create tidied up (sk->reuse setting) 52 * Alan Cox : inet sockets don't set sk->type! 53 * Alan Cox : Split socket option code 54 * Alan Cox : Callbacks 55 * Alan Cox : Nagle flag for Charles & Johannes stuff 56 * Alex : Removed restriction on inet fioctl 57 * Alan Cox : Splitting INET from NET core 58 * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt() 59 * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code 60 * Alan Cox : Split IP from generic code 61 * Alan Cox : New kfree_skbmem() 62 * Alan Cox : Make SO_DEBUG superuser only. 63 * Alan Cox : Allow anyone to clear SO_DEBUG 64 * (compatibility fix) 65 * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput. 66 * Alan Cox : Allocator for a socket is settable. 67 * Alan Cox : SO_ERROR includes soft errors. 68 * Alan Cox : Allow NULL arguments on some SO_ opts 69 * Alan Cox : Generic socket allocation to make hooks 70 * easier (suggested by Craig Metz). 71 * Michael Pall : SO_ERROR returns positive errno again 72 * Steve Whitehouse: Added default destructor to free 73 * protocol private data. 74 * Steve Whitehouse: Added various other default routines 75 * common to several socket families. 76 * Chris Evans : Call suser() check last on F_SETOWN 77 * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER. 78 * Andi Kleen : Add sock_kmalloc()/sock_kfree_s() 79 * Andi Kleen : Fix write_space callback 80 * Chris Evans : Security fixes - signedness again 81 * Arnaldo C. 
Melo : cleanups, use skb_queue_purge 82 * 83 * To Fix: 84 */ 85 86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 87 88 #include <asm/unaligned.h> 89 #include <linux/capability.h> 90 #include <linux/errno.h> 91 #include <linux/errqueue.h> 92 #include <linux/types.h> 93 #include <linux/socket.h> 94 #include <linux/in.h> 95 #include <linux/kernel.h> 96 #include <linux/module.h> 97 #include <linux/proc_fs.h> 98 #include <linux/seq_file.h> 99 #include <linux/sched.h> 100 #include <linux/sched/mm.h> 101 #include <linux/timer.h> 102 #include <linux/string.h> 103 #include <linux/sockios.h> 104 #include <linux/net.h> 105 #include <linux/mm.h> 106 #include <linux/slab.h> 107 #include <linux/interrupt.h> 108 #include <linux/poll.h> 109 #include <linux/tcp.h> 110 #include <linux/init.h> 111 #include <linux/highmem.h> 112 #include <linux/user_namespace.h> 113 #include <linux/static_key.h> 114 #include <linux/memcontrol.h> 115 #include <linux/prefetch.h> 116 #include <linux/compat.h> 117 118 #include <linux/uaccess.h> 119 120 #include <linux/netdevice.h> 121 #include <net/protocol.h> 122 #include <linux/skbuff.h> 123 #include <net/net_namespace.h> 124 #include <net/request_sock.h> 125 #include <net/sock.h> 126 #include <linux/net_tstamp.h> 127 #include <net/xfrm.h> 128 #include <linux/ipsec.h> 129 #include <net/cls_cgroup.h> 130 #include <net/netprio_cgroup.h> 131 #include <linux/sock_diag.h> 132 133 #include <linux/filter.h> 134 #include <net/sock_reuseport.h> 135 #include <net/bpf_sk_storage.h> 136 137 #include <trace/events/sock.h> 138 139 #include <net/tcp.h> 140 #include <net/busy_poll.h> 141 142 #include <linux/ethtool.h> 143 144 static DEFINE_MUTEX(proto_list_mutex); 145 static LIST_HEAD(proto_list); 146 147 static void sock_inuse_add(struct net *net, int val); 148 149 /** 150 * sk_ns_capable - General socket capability test 151 * @sk: Socket to use a capability on or through 152 * @user_ns: The user namespace of the capability to use 153 * @cap: The capability to use 154 * 
 * Test to see if the opener of the socket had the capability @cap (in the
 * user namespace @user_ns) when the socket was created, and that the
 * current process has it too.
 */
bool sk_ns_capable(const struct sock *sk,
		   struct user_namespace *user_ns, int cap)
{
	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
		ns_capable(user_ns, cap);
}
EXPORT_SYMBOL(sk_ns_capable);

/**
 * sk_capable - Socket global capability test
 * @sk: Socket to use a capability on or through
 * @cap: The global capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when the
 * socket was created and the current process has it in all user namespaces
 * (i.e. in &init_user_ns).
 */
bool sk_capable(const struct sock *sk, int cap)
{
	return sk_ns_capable(sk, &init_user_ns, cap);
}
EXPORT_SYMBOL(sk_capable);

/**
 * sk_net_capable - Network namespace socket capability test
 * @sk: Socket to use a capability on or through
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when the
 * socket was created and the current process has it over the network
 * namespace the socket is a member of.
 */
bool sk_net_capable(const struct sock *sk, int cap)
{
	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
}
EXPORT_SYMBOL(sk_net_capable);

/*
 * Each address family might have different locking rules, so we have
 * one slock key per address family and separate keys for internal and
 * userspace sockets.
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_kern_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];
static struct lock_class_key af_family_kern_slock_keys[AF_MAX];

/*
 * Make lock validator output more readable. (we pre-construct these
 * strings build-time, so that runtime initialization of socket
 * locks is fast):
 */

#define _sock_locks(x)						  \
	x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
	x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
	x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
	x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
	x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
	x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
	x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
	x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
	x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
	x "27"       ,	x "28"          ,	x "AF_CAN"      , \
	x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
	x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
	x "AF_IEEE802154",	x "AF_CAIF" ,	x "AF_ALG"      , \
	x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
	x "AF_QIPCRTR",	x "AF_SMC"      ,	x "AF_XDP"      , \
	x "AF_MCTP"  , \
	x "AF_MAX"

/* Lockdep class-name tables, one entry per address family, built from the
 * macro above with the per-lock prefix baked in at compile time.
 */
static const char *const af_family_key_strings[AF_MAX+1] = {
	_sock_locks("sk_lock-")
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
	_sock_locks("slock-")
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
	_sock_locks("clock-")
};

/* Same tables for kernel-internal sockets ("k-" prefix). */
static const char *const af_family_kern_key_strings[AF_MAX+1] = {
	_sock_locks("k-sk_lock-")
};
static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
	_sock_locks("k-slock-")
};
static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
	_sock_locks("k-clock-")
};
static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
	_sock_locks("rlock-")
};
static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
	_sock_locks("wlock-")
};
static const char *const af_family_elock_key_strings[AF_MAX+1] = {
	_sock_locks("elock-")
};

/*
 * sk_callback_lock and sk queues locking rules are per-address-family,
 * so split the lock classes by using a per-AF key:
 */
static struct lock_class_key af_callback_keys[AF_MAX];
static struct lock_class_key af_rlock_keys[AF_MAX];
static struct lock_class_key af_wlock_keys[AF_MAX];
static struct lock_class_key af_elock_keys[AF_MAX];
static struct lock_class_key af_kern_callback_keys[AF_MAX];

/* Run time adjustable parameters. */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
EXPORT_SYMBOL(sysctl_wmem_max);
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
EXPORT_SYMBOL(sysctl_rmem_max);
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
EXPORT_SYMBOL(sysctl_optmem_max);

int sysctl_tstamp_allow_data __read_mostly = 1;

/* Enabled while at least one socket has SOCK_MEMALLOC set; counted up/down
 * by sk_set_memalloc()/sk_clear_memalloc() below.
 */
DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
EXPORT_SYMBOL_GPL(memalloc_socks_key);

/**
 * sk_set_memalloc - sets %SOCK_MEMALLOC
 * @sk: socket to set it on
 *
 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 * It's the responsibility of the admin to adjust min_free_kbytes
 * to meet the requirements
 */
void sk_set_memalloc(struct sock *sk)
{
	sock_set_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation |= __GFP_MEMALLOC;
	static_branch_inc(&memalloc_socks_key);
}
EXPORT_SYMBOL_GPL(sk_set_memalloc);

/* Undo sk_set_memalloc(): clear the flag, drop the allocation hint and the
 * static-key reference, then reclaim any reserve memory still charged.
 */
void sk_clear_memalloc(struct sock *sk)
{
	sock_reset_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation &= ~__GFP_MEMALLOC;
	static_branch_dec(&memalloc_socks_key);

	/*
	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
	 * progress of swapping.
 SOCK_MEMALLOC may be cleared while
	 * it has rmem allocations due to the last swapfile being deactivated
	 * but there is a risk that the socket is unusable due to exceeding
	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
	 */
	sk_mem_reclaim(sk);
}
EXPORT_SYMBOL_GPL(sk_clear_memalloc);

int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
{
	int ret;
	unsigned int noreclaim_flag;

	/* these should have been dropped before queueing */
	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));

	/* Run the backlog handler with reclaim disabled so a memalloc
	 * socket cannot recurse into memory reclaim while receiving.
	 */
	noreclaim_flag = memalloc_noreclaim_save();
	ret = sk->sk_backlog_rcv(sk, skb);
	memalloc_noreclaim_restore(noreclaim_flag);

	return ret;
}
EXPORT_SYMBOL(__sk_backlog_rcv);

/* Invoke the socket's error callback; additionally emit a tracepoint for
 * inet sockets.
 */
void sk_error_report(struct sock *sk)
{
	sk->sk_error_report(sk);

	switch (sk->sk_family) {
	case AF_INET:
		fallthrough;
	case AF_INET6:
		trace_inet_sk_error_report(sk);
		break;
	default:
		break;
	}
}
EXPORT_SYMBOL(sk_error_report);

/* Convert a timeout in jiffies to the userspace representation selected by
 * @old_timeval (old_timeval32 for 32-bit compat, __kernel_old_timeval, or
 * __kernel_sock_timeval), writing it to @optval.  MAX_SCHEDULE_TIMEOUT is
 * reported as {0, 0} ("no timeout").  Returns the number of bytes written.
 */
static int sock_get_timeout(long timeo, void *optval, bool old_timeval)
{
	struct __kernel_sock_timeval tv;

	if (timeo == MAX_SCHEDULE_TIMEOUT) {
		tv.tv_sec = 0;
		tv.tv_usec = 0;
	} else {
		tv.tv_sec = timeo / HZ;
		tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
	}

	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
		struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
		*(struct old_timeval32 *)optval = tv32;
		return sizeof(tv32);
	}

	if (old_timeval) {
		struct __kernel_old_timeval old_tv;
		old_tv.tv_sec = tv.tv_sec;
		old_tv.tv_usec = tv.tv_usec;
		*(struct __kernel_old_timeval *)optval = old_tv;
		return sizeof(old_tv);
	}

	*(struct __kernel_sock_timeval *)optval = tv;
	return sizeof(tv);
}

/* Parse a user-supplied timeout (in the flavour selected by @old_timeval)
 * into jiffies at *@timeo_p.  {0, 0} means "wait forever"
 * (MAX_SCHEDULE_TIMEOUT); a negative tv_sec is clamped to an immediate
 * timeout with a rate-limited warning.  Returns 0 or a negative errno.
 */
static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
			    bool old_timeval)
{
	struct __kernel_sock_timeval tv;

	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
		struct old_timeval32 tv32;

		if (optlen < sizeof(tv32))
			return -EINVAL;

		if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
			return -EFAULT;
		tv.tv_sec = tv32.tv_sec;
		tv.tv_usec = tv32.tv_usec;
	} else if (old_timeval) {
		struct __kernel_old_timeval old_tv;

		if (optlen < sizeof(old_tv))
			return -EINVAL;
		if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
			return -EFAULT;
		tv.tv_sec = old_tv.tv_sec;
		tv.tv_usec = old_tv.tv_usec;
	} else {
		if (optlen < sizeof(tv))
			return -EINVAL;
		if (copy_from_sockptr(&tv, optval, sizeof(tv)))
			return -EFAULT;
	}
	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
		return -EDOM;

	if (tv.tv_sec < 0) {
		static int warned __read_mostly;

		*timeo_p = 0;
		if (warned < 10 && net_ratelimit()) {
			warned++;
			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
				__func__, current->comm, task_pid_nr(current));
		}
		return 0;
	}
	*timeo_p = MAX_SCHEDULE_TIMEOUT;
	if (tv.tv_sec == 0 && tv.tv_usec == 0)
		return 0;
	/* Guard against jiffies overflow before converting to ticks. */
	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))
		*timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ);
	return 0;
}

/* AF_UNSPEC/AF_UNIX sockets never carry network timestamps. */
static bool sock_needs_netstamp(const struct sock *sk)
{
	switch (sk->sk_family) {
	case AF_UNSPEC:
	case AF_UNIX:
		return false;
	default:
		return true;
	}
}

/* Clear the given timestamping flag bits; drop the global net_timestamp
 * reference once no timestamp flag remains set on this socket.
 */
static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
{
	if (sk->sk_flags & flags) {
		sk->sk_flags &= ~flags;
		if (sock_needs_netstamp(sk) &&
		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
			net_disable_timestamp();
	}
}


/* Charge @skb to @sk and append it to the receive queue, honouring rcvbuf
 * and rmem limits.  Returns 0, -ENOMEM (queue full) or -ENOBUFS (no rmem).
 */
int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	unsigned long flags;
	struct sk_buff_head *list = &sk->sk_receive_queue;

	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
		atomic_inc(&sk->sk_drops);
		trace_sock_rcvqueue_full(sk, skb);
		return -ENOMEM;
	}

	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
		atomic_inc(&sk->sk_drops);
		return -ENOBUFS;
	}

	skb->dev = NULL;
	skb_set_owner_r(skb, sk);

	/* we escape from rcu protected region, make sure we dont leak
	 * a norefcounted dst
	 */
	skb_dst_force(skb);

	spin_lock_irqsave(&list->lock, flags);
	sock_skb_set_dropcount(sk, skb);
	__skb_queue_tail(list, skb);
	spin_unlock_irqrestore(&list->lock, flags);

	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_data_ready(sk);
	return 0;
}
EXPORT_SYMBOL(__sock_queue_rcv_skb);

/* As __sock_queue_rcv_skb(), but run the socket filter first. */
int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	int err;

	err = sk_filter(sk, skb);
	if (err)
		return err;

	return __sock_queue_rcv_skb(sk, skb);
}
EXPORT_SYMBOL(sock_queue_rcv_skb);

/* Filter and deliver @skb to @sk: process it directly when the socket is
 * not owned by user context, otherwise queue it on the backlog.  Drops the
 * skb (and counts sk_drops) when filtered out or the queues are full.
 */
int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
		     const int nested, unsigned int trim_cap, bool refcounted)
{
	int rc = NET_RX_SUCCESS;

	if (sk_filter_trim_cap(sk, skb, trim_cap))
		goto discard_and_relse;

	skb->dev = NULL;

	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
		atomic_inc(&sk->sk_drops);
		goto discard_and_relse;
	}
	if (nested)
		bh_lock_sock_nested(sk);
	else
		bh_lock_sock(sk);
	if (!sock_owned_by_user(sk)) {
		/*
		 * trylock + unlock semantics:
		 */
		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);

		rc = sk_backlog_rcv(sk, skb);

		mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
	} else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
		bh_unlock_sock(sk);
		atomic_inc(&sk->sk_drops);
		goto discard_and_relse;
	}

	bh_unlock_sock(sk);
out:
	if (refcounted)
		sock_put(sk);
	return rc;
discard_and_relse:
	kfree_skb(skb);
	goto out;
}
EXPORT_SYMBOL(__sk_receive_skb);

INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
							  u32));
INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
							   u32));

/* Return the socket's cached dst if it is still valid per the protocol's
 * ->check() callback, otherwise drop the cache and return NULL.
 * Caller holds the socket lock (uses __sk_dst_get()).
 */
struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = __sk_dst_get(sk);

	if (dst && dst->obsolete &&
	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
			       dst, cookie) == NULL) {
		sk_tx_queue_clear(sk);
		sk->sk_dst_pending_confirm = 0;
		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(__sk_dst_check);

/* Lockless variant of __sk_dst_check(): takes its own dst reference and
 * resets the cache via sk_dst_reset() when the dst is obsolete.
 */
struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = sk_dst_get(sk);

	if (dst && dst->obsolete &&
	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
			       dst, cookie) == NULL) {
		sk_dst_reset(sk);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(sk_dst_check);

/* Bind the socket to interface @ifindex (0 unbinds).  Re-binding an
 * already-bound socket requires CAP_NET_RAW.  Caller holds the socket lock.
 */
static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);

	/* Sorry... */
	ret = -EPERM;
	if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
		goto out;

	ret = -EINVAL;
	if (ifindex < 0)
		goto out;

	sk->sk_bound_dev_if = ifindex;
	/* Rehash so lookups see the new binding, and drop the cached route. */
	if (sk->sk_prot->rehash)
		sk->sk_prot->rehash(sk);
	sk_dst_reset(sk);

	ret = 0;

out:
#endif

	return ret;
}

/* Public wrapper around sock_bindtoindex_locked(); takes the socket lock
 * when @lock_sk is true.
 */
int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
{
	int ret;

	if (lock_sk)
		lock_sock(sk);
	ret = sock_bindtoindex_locked(sk, ifindex);
	if (lock_sk)
		release_sock(sk);

	return ret;
}
EXPORT_SYMBOL(sock_bindtoindex);

/* SO_BINDTODEVICE: resolve the user-supplied interface name and bind the
 * socket to it (empty name or zero length unbinds).
 */
static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];
	int index;

	ret = -EINVAL;
	if (optlen < 0)
		goto out;

	/* Bind this socket to a particular device like "eth0",
	 * as specified in the passed interface name. If the
	 * name is "" or the option length is zero the socket
	 * is not bound.
	 */
	if (optlen > IFNAMSIZ - 1)
		optlen = IFNAMSIZ - 1;
	memset(devname, 0, sizeof(devname));

	ret = -EFAULT;
	if (copy_from_sockptr(devname, optval, optlen))
		goto out;

	index = 0;
	if (devname[0] != '\0') {
		struct net_device *dev;

		/* Only the ifindex is taken out of the RCU section; no
		 * device reference is held.
		 */
		rcu_read_lock();
		dev = dev_get_by_name_rcu(net, devname);
		if (dev)
			index = dev->ifindex;
		rcu_read_unlock();
		ret = -ENODEV;
		if (!dev)
			goto out;
	}

	return sock_bindtoindex(sk, index, true);
out:
#endif

	return ret;
}

/* SO_BINDTODEVICE getsockopt: copy the bound interface's name (and its
 * length, including the NUL) to userspace; reports length 0 when unbound.
 */
static int sock_getbindtodevice(struct sock *sk, char __user *optval,
				int __user *optlen, int len)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];

	if (sk->sk_bound_dev_if == 0) {
		len = 0;
		goto zero;
	}

	ret = -EINVAL;
	if (len < IFNAMSIZ)
		goto out;

	ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
	if (ret)
		goto out;

	len = strlen(devname) + 1;

	ret = -EFAULT;
	if (copy_to_user(optval, devname, len))
		goto out;

zero:
	ret = -EFAULT;
	if (put_user(len, optlen))
		goto out;

	ret = 0;

out:
#endif

	return ret;
}

/* Should multicast sent on @sk be looped back locally?  Defaults to true
 * for unknown families; false while in nested device transmit to avoid
 * unbounded recursion.
 */
bool sk_mc_loop(struct sock *sk)
{
	if (dev_recursion_level())
		return false;
	if (!sk)
		return true;
	switch (sk->sk_family) {
	case AF_INET:
		return inet_sk(sk)->mc_loop;
#if IS_ENABLED(CONFIG_IPV6)
	case AF_INET6:
		return inet6_sk(sk)->mc_loop;
#endif
	}
	WARN_ON_ONCE(1);
	return true;
}
EXPORT_SYMBOL(sk_mc_loop);

/* Kernel-internal equivalent of setsockopt(SO_REUSEADDR, 1). */
void sock_set_reuseaddr(struct sock *sk)
{
	lock_sock(sk);
	sk->sk_reuse = SK_CAN_REUSE;
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_reuseaddr);

/* Kernel-internal equivalent of setsockopt(SO_REUSEPORT, 1). */
void sock_set_reuseport(struct sock *sk)
{
	lock_sock(sk);
	sk->sk_reuseport = true;
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_reuseport); 748 749 void sock_no_linger(struct sock *sk) 750 { 751 lock_sock(sk); 752 sk->sk_lingertime = 0; 753 sock_set_flag(sk, SOCK_LINGER); 754 release_sock(sk); 755 } 756 EXPORT_SYMBOL(sock_no_linger); 757 758 void sock_set_priority(struct sock *sk, u32 priority) 759 { 760 lock_sock(sk); 761 sk->sk_priority = priority; 762 release_sock(sk); 763 } 764 EXPORT_SYMBOL(sock_set_priority); 765 766 void sock_set_sndtimeo(struct sock *sk, s64 secs) 767 { 768 lock_sock(sk); 769 if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1) 770 sk->sk_sndtimeo = secs * HZ; 771 else 772 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; 773 release_sock(sk); 774 } 775 EXPORT_SYMBOL(sock_set_sndtimeo); 776 777 static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns) 778 { 779 if (val) { 780 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new); 781 sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns); 782 sock_set_flag(sk, SOCK_RCVTSTAMP); 783 sock_enable_timestamp(sk, SOCK_TIMESTAMP); 784 } else { 785 sock_reset_flag(sk, SOCK_RCVTSTAMP); 786 sock_reset_flag(sk, SOCK_RCVTSTAMPNS); 787 } 788 } 789 790 void sock_enable_timestamps(struct sock *sk) 791 { 792 lock_sock(sk); 793 __sock_set_timestamps(sk, true, false, true); 794 release_sock(sk); 795 } 796 EXPORT_SYMBOL(sock_enable_timestamps); 797 798 void sock_set_timestamp(struct sock *sk, int optname, bool valbool) 799 { 800 switch (optname) { 801 case SO_TIMESTAMP_OLD: 802 __sock_set_timestamps(sk, valbool, false, false); 803 break; 804 case SO_TIMESTAMP_NEW: 805 __sock_set_timestamps(sk, valbool, true, false); 806 break; 807 case SO_TIMESTAMPNS_OLD: 808 __sock_set_timestamps(sk, valbool, false, true); 809 break; 810 case SO_TIMESTAMPNS_NEW: 811 __sock_set_timestamps(sk, valbool, true, true); 812 break; 813 } 814 } 815 816 static int sock_timestamping_bind_phc(struct sock *sk, int phc_index) 817 { 818 struct net *net = sock_net(sk); 819 struct net_device *dev = NULL; 820 bool match = false; 821 int *vclock_index; 822 
int i, num; 823 824 if (sk->sk_bound_dev_if) 825 dev = dev_get_by_index(net, sk->sk_bound_dev_if); 826 827 if (!dev) { 828 pr_err("%s: sock not bind to device\n", __func__); 829 return -EOPNOTSUPP; 830 } 831 832 num = ethtool_get_phc_vclocks(dev, &vclock_index); 833 for (i = 0; i < num; i++) { 834 if (*(vclock_index + i) == phc_index) { 835 match = true; 836 break; 837 } 838 } 839 840 if (num > 0) 841 kfree(vclock_index); 842 843 if (!match) 844 return -EINVAL; 845 846 sk->sk_bind_phc = phc_index; 847 848 return 0; 849 } 850 851 int sock_set_timestamping(struct sock *sk, int optname, 852 struct so_timestamping timestamping) 853 { 854 int val = timestamping.flags; 855 int ret; 856 857 if (val & ~SOF_TIMESTAMPING_MASK) 858 return -EINVAL; 859 860 if (val & SOF_TIMESTAMPING_OPT_ID && 861 !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) { 862 if (sk->sk_protocol == IPPROTO_TCP && 863 sk->sk_type == SOCK_STREAM) { 864 if ((1 << sk->sk_state) & 865 (TCPF_CLOSE | TCPF_LISTEN)) 866 return -EINVAL; 867 sk->sk_tskey = tcp_sk(sk)->snd_una; 868 } else { 869 sk->sk_tskey = 0; 870 } 871 } 872 873 if (val & SOF_TIMESTAMPING_OPT_STATS && 874 !(val & SOF_TIMESTAMPING_OPT_TSONLY)) 875 return -EINVAL; 876 877 if (val & SOF_TIMESTAMPING_BIND_PHC) { 878 ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc); 879 if (ret) 880 return ret; 881 } 882 883 sk->sk_tsflags = val; 884 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW); 885 886 if (val & SOF_TIMESTAMPING_RX_SOFTWARE) 887 sock_enable_timestamp(sk, 888 SOCK_TIMESTAMPING_RX_SOFTWARE); 889 else 890 sock_disable_timestamp(sk, 891 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)); 892 return 0; 893 } 894 895 void sock_set_keepalive(struct sock *sk) 896 { 897 lock_sock(sk); 898 if (sk->sk_prot->keepalive) 899 sk->sk_prot->keepalive(sk, true); 900 sock_valbool_flag(sk, SOCK_KEEPOPEN, true); 901 release_sock(sk); 902 } 903 EXPORT_SYMBOL(sock_set_keepalive); 904 905 static void __sock_set_rcvbuf(struct sock *sk, int val) 906 { 
907 /* Ensure val * 2 fits into an int, to prevent max_t() from treating it 908 * as a negative value. 909 */ 910 val = min_t(int, val, INT_MAX / 2); 911 sk->sk_userlocks |= SOCK_RCVBUF_LOCK; 912 913 /* We double it on the way in to account for "struct sk_buff" etc. 914 * overhead. Applications assume that the SO_RCVBUF setting they make 915 * will allow that much actual data to be received on that socket. 916 * 917 * Applications are unaware that "struct sk_buff" and other overheads 918 * allocate from the receive buffer during socket buffer allocation. 919 * 920 * And after considering the possible alternatives, returning the value 921 * we actually used in getsockopt is the most desirable behavior. 922 */ 923 WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF)); 924 } 925 926 void sock_set_rcvbuf(struct sock *sk, int val) 927 { 928 lock_sock(sk); 929 __sock_set_rcvbuf(sk, val); 930 release_sock(sk); 931 } 932 EXPORT_SYMBOL(sock_set_rcvbuf); 933 934 static void __sock_set_mark(struct sock *sk, u32 val) 935 { 936 if (val != sk->sk_mark) { 937 sk->sk_mark = val; 938 sk_dst_reset(sk); 939 } 940 } 941 942 void sock_set_mark(struct sock *sk, u32 val) 943 { 944 lock_sock(sk); 945 __sock_set_mark(sk, val); 946 release_sock(sk); 947 } 948 EXPORT_SYMBOL(sock_set_mark); 949 950 /* 951 * This is meant for all protocols to use and covers goings on 952 * at the socket level. Everything here is generic. 
953 */ 954 955 int sock_setsockopt(struct socket *sock, int level, int optname, 956 sockptr_t optval, unsigned int optlen) 957 { 958 struct so_timestamping timestamping; 959 struct sock_txtime sk_txtime; 960 struct sock *sk = sock->sk; 961 int val; 962 int valbool; 963 struct linger ling; 964 int ret = 0; 965 966 /* 967 * Options without arguments 968 */ 969 970 if (optname == SO_BINDTODEVICE) 971 return sock_setbindtodevice(sk, optval, optlen); 972 973 if (optlen < sizeof(int)) 974 return -EINVAL; 975 976 if (copy_from_sockptr(&val, optval, sizeof(val))) 977 return -EFAULT; 978 979 valbool = val ? 1 : 0; 980 981 lock_sock(sk); 982 983 switch (optname) { 984 case SO_DEBUG: 985 if (val && !capable(CAP_NET_ADMIN)) 986 ret = -EACCES; 987 else 988 sock_valbool_flag(sk, SOCK_DBG, valbool); 989 break; 990 case SO_REUSEADDR: 991 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE); 992 break; 993 case SO_REUSEPORT: 994 sk->sk_reuseport = valbool; 995 break; 996 case SO_TYPE: 997 case SO_PROTOCOL: 998 case SO_DOMAIN: 999 case SO_ERROR: 1000 ret = -ENOPROTOOPT; 1001 break; 1002 case SO_DONTROUTE: 1003 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool); 1004 sk_dst_reset(sk); 1005 break; 1006 case SO_BROADCAST: 1007 sock_valbool_flag(sk, SOCK_BROADCAST, valbool); 1008 break; 1009 case SO_SNDBUF: 1010 /* Don't error on this BSD doesn't and if you think 1011 * about it this is right. Otherwise apps have to 1012 * play 'guess the biggest size' games. RCVBUF/SNDBUF 1013 * are treated in BSD as hints 1014 */ 1015 val = min_t(u32, val, sysctl_wmem_max); 1016 set_sndbuf: 1017 /* Ensure val * 2 fits into an int, to prevent max_t() 1018 * from treating it as a negative value. 1019 */ 1020 val = min_t(int, val, INT_MAX / 2); 1021 sk->sk_userlocks |= SOCK_SNDBUF_LOCK; 1022 WRITE_ONCE(sk->sk_sndbuf, 1023 max_t(int, val * 2, SOCK_MIN_SNDBUF)); 1024 /* Wake up sending tasks if we upped the value. 
*/ 1025 sk->sk_write_space(sk); 1026 break; 1027 1028 case SO_SNDBUFFORCE: 1029 if (!capable(CAP_NET_ADMIN)) { 1030 ret = -EPERM; 1031 break; 1032 } 1033 1034 /* No negative values (to prevent underflow, as val will be 1035 * multiplied by 2). 1036 */ 1037 if (val < 0) 1038 val = 0; 1039 goto set_sndbuf; 1040 1041 case SO_RCVBUF: 1042 /* Don't error on this BSD doesn't and if you think 1043 * about it this is right. Otherwise apps have to 1044 * play 'guess the biggest size' games. RCVBUF/SNDBUF 1045 * are treated in BSD as hints 1046 */ 1047 __sock_set_rcvbuf(sk, min_t(u32, val, sysctl_rmem_max)); 1048 break; 1049 1050 case SO_RCVBUFFORCE: 1051 if (!capable(CAP_NET_ADMIN)) { 1052 ret = -EPERM; 1053 break; 1054 } 1055 1056 /* No negative values (to prevent underflow, as val will be 1057 * multiplied by 2). 1058 */ 1059 __sock_set_rcvbuf(sk, max(val, 0)); 1060 break; 1061 1062 case SO_KEEPALIVE: 1063 if (sk->sk_prot->keepalive) 1064 sk->sk_prot->keepalive(sk, valbool); 1065 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool); 1066 break; 1067 1068 case SO_OOBINLINE: 1069 sock_valbool_flag(sk, SOCK_URGINLINE, valbool); 1070 break; 1071 1072 case SO_NO_CHECK: 1073 sk->sk_no_check_tx = valbool; 1074 break; 1075 1076 case SO_PRIORITY: 1077 if ((val >= 0 && val <= 6) || 1078 ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 1079 sk->sk_priority = val; 1080 else 1081 ret = -EPERM; 1082 break; 1083 1084 case SO_LINGER: 1085 if (optlen < sizeof(ling)) { 1086 ret = -EINVAL; /* 1003.1g */ 1087 break; 1088 } 1089 if (copy_from_sockptr(&ling, optval, sizeof(ling))) { 1090 ret = -EFAULT; 1091 break; 1092 } 1093 if (!ling.l_onoff) 1094 sock_reset_flag(sk, SOCK_LINGER); 1095 else { 1096 #if (BITS_PER_LONG == 32) 1097 if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ) 1098 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT; 1099 else 1100 #endif 1101 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ; 1102 sock_set_flag(sk, SOCK_LINGER); 1103 } 1104 break; 1105 1106 case SO_BSDCOMPAT: 
1107 break; 1108 1109 case SO_PASSCRED: 1110 if (valbool) 1111 set_bit(SOCK_PASSCRED, &sock->flags); 1112 else 1113 clear_bit(SOCK_PASSCRED, &sock->flags); 1114 break; 1115 1116 case SO_TIMESTAMP_OLD: 1117 case SO_TIMESTAMP_NEW: 1118 case SO_TIMESTAMPNS_OLD: 1119 case SO_TIMESTAMPNS_NEW: 1120 sock_set_timestamp(sk, optname, valbool); 1121 break; 1122 1123 case SO_TIMESTAMPING_NEW: 1124 case SO_TIMESTAMPING_OLD: 1125 if (optlen == sizeof(timestamping)) { 1126 if (copy_from_sockptr(×tamping, optval, 1127 sizeof(timestamping))) { 1128 ret = -EFAULT; 1129 break; 1130 } 1131 } else { 1132 memset(×tamping, 0, sizeof(timestamping)); 1133 timestamping.flags = val; 1134 } 1135 ret = sock_set_timestamping(sk, optname, timestamping); 1136 break; 1137 1138 case SO_RCVLOWAT: 1139 if (val < 0) 1140 val = INT_MAX; 1141 if (sock->ops->set_rcvlowat) 1142 ret = sock->ops->set_rcvlowat(sk, val); 1143 else 1144 WRITE_ONCE(sk->sk_rcvlowat, val ? : 1); 1145 break; 1146 1147 case SO_RCVTIMEO_OLD: 1148 case SO_RCVTIMEO_NEW: 1149 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, 1150 optlen, optname == SO_RCVTIMEO_OLD); 1151 break; 1152 1153 case SO_SNDTIMEO_OLD: 1154 case SO_SNDTIMEO_NEW: 1155 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, 1156 optlen, optname == SO_SNDTIMEO_OLD); 1157 break; 1158 1159 case SO_ATTACH_FILTER: { 1160 struct sock_fprog fprog; 1161 1162 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen); 1163 if (!ret) 1164 ret = sk_attach_filter(&fprog, sk); 1165 break; 1166 } 1167 case SO_ATTACH_BPF: 1168 ret = -EINVAL; 1169 if (optlen == sizeof(u32)) { 1170 u32 ufd; 1171 1172 ret = -EFAULT; 1173 if (copy_from_sockptr(&ufd, optval, sizeof(ufd))) 1174 break; 1175 1176 ret = sk_attach_bpf(ufd, sk); 1177 } 1178 break; 1179 1180 case SO_ATTACH_REUSEPORT_CBPF: { 1181 struct sock_fprog fprog; 1182 1183 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen); 1184 if (!ret) 1185 ret = sk_reuseport_attach_filter(&fprog, sk); 1186 break; 1187 } 1188 case 
SO_ATTACH_REUSEPORT_EBPF: 1189 ret = -EINVAL; 1190 if (optlen == sizeof(u32)) { 1191 u32 ufd; 1192 1193 ret = -EFAULT; 1194 if (copy_from_sockptr(&ufd, optval, sizeof(ufd))) 1195 break; 1196 1197 ret = sk_reuseport_attach_bpf(ufd, sk); 1198 } 1199 break; 1200 1201 case SO_DETACH_REUSEPORT_BPF: 1202 ret = reuseport_detach_prog(sk); 1203 break; 1204 1205 case SO_DETACH_FILTER: 1206 ret = sk_detach_filter(sk); 1207 break; 1208 1209 case SO_LOCK_FILTER: 1210 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool) 1211 ret = -EPERM; 1212 else 1213 sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool); 1214 break; 1215 1216 case SO_PASSSEC: 1217 if (valbool) 1218 set_bit(SOCK_PASSSEC, &sock->flags); 1219 else 1220 clear_bit(SOCK_PASSSEC, &sock->flags); 1221 break; 1222 case SO_MARK: 1223 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { 1224 ret = -EPERM; 1225 break; 1226 } 1227 1228 __sock_set_mark(sk, val); 1229 break; 1230 1231 case SO_RXQ_OVFL: 1232 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool); 1233 break; 1234 1235 case SO_WIFI_STATUS: 1236 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool); 1237 break; 1238 1239 case SO_PEEK_OFF: 1240 if (sock->ops->set_peek_off) 1241 ret = sock->ops->set_peek_off(sk, val); 1242 else 1243 ret = -EOPNOTSUPP; 1244 break; 1245 1246 case SO_NOFCS: 1247 sock_valbool_flag(sk, SOCK_NOFCS, valbool); 1248 break; 1249 1250 case SO_SELECT_ERR_QUEUE: 1251 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool); 1252 break; 1253 1254 #ifdef CONFIG_NET_RX_BUSY_POLL 1255 case SO_BUSY_POLL: 1256 /* allow unprivileged users to decrease the value */ 1257 if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN)) 1258 ret = -EPERM; 1259 else { 1260 if (val < 0) 1261 ret = -EINVAL; 1262 else 1263 WRITE_ONCE(sk->sk_ll_usec, val); 1264 } 1265 break; 1266 case SO_PREFER_BUSY_POLL: 1267 if (valbool && !capable(CAP_NET_ADMIN)) 1268 ret = -EPERM; 1269 else 1270 WRITE_ONCE(sk->sk_prefer_busy_poll, valbool); 1271 break; 1272 case SO_BUSY_POLL_BUDGET: 1273 if (val > 
READ_ONCE(sk->sk_busy_poll_budget) && !capable(CAP_NET_ADMIN)) { 1274 ret = -EPERM; 1275 } else { 1276 if (val < 0 || val > U16_MAX) 1277 ret = -EINVAL; 1278 else 1279 WRITE_ONCE(sk->sk_busy_poll_budget, val); 1280 } 1281 break; 1282 #endif 1283 1284 case SO_MAX_PACING_RATE: 1285 { 1286 unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val; 1287 1288 if (sizeof(ulval) != sizeof(val) && 1289 optlen >= sizeof(ulval) && 1290 copy_from_sockptr(&ulval, optval, sizeof(ulval))) { 1291 ret = -EFAULT; 1292 break; 1293 } 1294 if (ulval != ~0UL) 1295 cmpxchg(&sk->sk_pacing_status, 1296 SK_PACING_NONE, 1297 SK_PACING_NEEDED); 1298 sk->sk_max_pacing_rate = ulval; 1299 sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval); 1300 break; 1301 } 1302 case SO_INCOMING_CPU: 1303 WRITE_ONCE(sk->sk_incoming_cpu, val); 1304 break; 1305 1306 case SO_CNX_ADVICE: 1307 if (val == 1) 1308 dst_negative_advice(sk); 1309 break; 1310 1311 case SO_ZEROCOPY: 1312 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) { 1313 if (!((sk->sk_type == SOCK_STREAM && 1314 sk->sk_protocol == IPPROTO_TCP) || 1315 (sk->sk_type == SOCK_DGRAM && 1316 sk->sk_protocol == IPPROTO_UDP))) 1317 ret = -ENOTSUPP; 1318 } else if (sk->sk_family != PF_RDS) { 1319 ret = -ENOTSUPP; 1320 } 1321 if (!ret) { 1322 if (val < 0 || val > 1) 1323 ret = -EINVAL; 1324 else 1325 sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool); 1326 } 1327 break; 1328 1329 case SO_TXTIME: 1330 if (optlen != sizeof(struct sock_txtime)) { 1331 ret = -EINVAL; 1332 break; 1333 } else if (copy_from_sockptr(&sk_txtime, optval, 1334 sizeof(struct sock_txtime))) { 1335 ret = -EFAULT; 1336 break; 1337 } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) { 1338 ret = -EINVAL; 1339 break; 1340 } 1341 /* CLOCK_MONOTONIC is only used by sch_fq, and this packet 1342 * scheduler has enough safe guards. 
1343 */ 1344 if (sk_txtime.clockid != CLOCK_MONOTONIC && 1345 !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { 1346 ret = -EPERM; 1347 break; 1348 } 1349 sock_valbool_flag(sk, SOCK_TXTIME, true); 1350 sk->sk_clockid = sk_txtime.clockid; 1351 sk->sk_txtime_deadline_mode = 1352 !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE); 1353 sk->sk_txtime_report_errors = 1354 !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS); 1355 break; 1356 1357 case SO_BINDTOIFINDEX: 1358 ret = sock_bindtoindex_locked(sk, val); 1359 break; 1360 1361 default: 1362 ret = -ENOPROTOOPT; 1363 break; 1364 } 1365 release_sock(sk); 1366 return ret; 1367 } 1368 EXPORT_SYMBOL(sock_setsockopt); 1369 1370 1371 static void cred_to_ucred(struct pid *pid, const struct cred *cred, 1372 struct ucred *ucred) 1373 { 1374 ucred->pid = pid_vnr(pid); 1375 ucred->uid = ucred->gid = -1; 1376 if (cred) { 1377 struct user_namespace *current_ns = current_user_ns(); 1378 1379 ucred->uid = from_kuid_munged(current_ns, cred->euid); 1380 ucred->gid = from_kgid_munged(current_ns, cred->egid); 1381 } 1382 } 1383 1384 static int groups_to_user(gid_t __user *dst, const struct group_info *src) 1385 { 1386 struct user_namespace *user_ns = current_user_ns(); 1387 int i; 1388 1389 for (i = 0; i < src->ngroups; i++) 1390 if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i)) 1391 return -EFAULT; 1392 1393 return 0; 1394 } 1395 1396 int sock_getsockopt(struct socket *sock, int level, int optname, 1397 char __user *optval, int __user *optlen) 1398 { 1399 struct sock *sk = sock->sk; 1400 1401 union { 1402 int val; 1403 u64 val64; 1404 unsigned long ulval; 1405 struct linger ling; 1406 struct old_timeval32 tm32; 1407 struct __kernel_old_timeval tm; 1408 struct __kernel_sock_timeval stm; 1409 struct sock_txtime txtime; 1410 struct so_timestamping timestamping; 1411 } v; 1412 1413 int lv = sizeof(int); 1414 int len; 1415 1416 if (get_user(len, optlen)) 1417 return -EFAULT; 1418 if (len < 0) 1419 return -EINVAL; 1420 1421 
memset(&v, 0, sizeof(v)); 1422 1423 switch (optname) { 1424 case SO_DEBUG: 1425 v.val = sock_flag(sk, SOCK_DBG); 1426 break; 1427 1428 case SO_DONTROUTE: 1429 v.val = sock_flag(sk, SOCK_LOCALROUTE); 1430 break; 1431 1432 case SO_BROADCAST: 1433 v.val = sock_flag(sk, SOCK_BROADCAST); 1434 break; 1435 1436 case SO_SNDBUF: 1437 v.val = sk->sk_sndbuf; 1438 break; 1439 1440 case SO_RCVBUF: 1441 v.val = sk->sk_rcvbuf; 1442 break; 1443 1444 case SO_REUSEADDR: 1445 v.val = sk->sk_reuse; 1446 break; 1447 1448 case SO_REUSEPORT: 1449 v.val = sk->sk_reuseport; 1450 break; 1451 1452 case SO_KEEPALIVE: 1453 v.val = sock_flag(sk, SOCK_KEEPOPEN); 1454 break; 1455 1456 case SO_TYPE: 1457 v.val = sk->sk_type; 1458 break; 1459 1460 case SO_PROTOCOL: 1461 v.val = sk->sk_protocol; 1462 break; 1463 1464 case SO_DOMAIN: 1465 v.val = sk->sk_family; 1466 break; 1467 1468 case SO_ERROR: 1469 v.val = -sock_error(sk); 1470 if (v.val == 0) 1471 v.val = xchg(&sk->sk_err_soft, 0); 1472 break; 1473 1474 case SO_OOBINLINE: 1475 v.val = sock_flag(sk, SOCK_URGINLINE); 1476 break; 1477 1478 case SO_NO_CHECK: 1479 v.val = sk->sk_no_check_tx; 1480 break; 1481 1482 case SO_PRIORITY: 1483 v.val = sk->sk_priority; 1484 break; 1485 1486 case SO_LINGER: 1487 lv = sizeof(v.ling); 1488 v.ling.l_onoff = sock_flag(sk, SOCK_LINGER); 1489 v.ling.l_linger = sk->sk_lingertime / HZ; 1490 break; 1491 1492 case SO_BSDCOMPAT: 1493 break; 1494 1495 case SO_TIMESTAMP_OLD: 1496 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && 1497 !sock_flag(sk, SOCK_TSTAMP_NEW) && 1498 !sock_flag(sk, SOCK_RCVTSTAMPNS); 1499 break; 1500 1501 case SO_TIMESTAMPNS_OLD: 1502 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW); 1503 break; 1504 1505 case SO_TIMESTAMP_NEW: 1506 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW); 1507 break; 1508 1509 case SO_TIMESTAMPNS_NEW: 1510 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW); 1511 break; 1512 1513 case SO_TIMESTAMPING_OLD: 1514 lv 
= sizeof(v.timestamping); 1515 v.timestamping.flags = sk->sk_tsflags; 1516 v.timestamping.bind_phc = sk->sk_bind_phc; 1517 break; 1518 1519 case SO_RCVTIMEO_OLD: 1520 case SO_RCVTIMEO_NEW: 1521 lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname); 1522 break; 1523 1524 case SO_SNDTIMEO_OLD: 1525 case SO_SNDTIMEO_NEW: 1526 lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname); 1527 break; 1528 1529 case SO_RCVLOWAT: 1530 v.val = sk->sk_rcvlowat; 1531 break; 1532 1533 case SO_SNDLOWAT: 1534 v.val = 1; 1535 break; 1536 1537 case SO_PASSCRED: 1538 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags); 1539 break; 1540 1541 case SO_PEERCRED: 1542 { 1543 struct ucred peercred; 1544 if (len > sizeof(peercred)) 1545 len = sizeof(peercred); 1546 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred); 1547 if (copy_to_user(optval, &peercred, len)) 1548 return -EFAULT; 1549 goto lenout; 1550 } 1551 1552 case SO_PEERGROUPS: 1553 { 1554 int ret, n; 1555 1556 if (!sk->sk_peer_cred) 1557 return -ENODATA; 1558 1559 n = sk->sk_peer_cred->group_info->ngroups; 1560 if (len < n * sizeof(gid_t)) { 1561 len = n * sizeof(gid_t); 1562 return put_user(len, optlen) ? -EFAULT : -ERANGE; 1563 } 1564 len = n * sizeof(gid_t); 1565 1566 ret = groups_to_user((gid_t __user *)optval, 1567 sk->sk_peer_cred->group_info); 1568 if (ret) 1569 return ret; 1570 goto lenout; 1571 } 1572 1573 case SO_PEERNAME: 1574 { 1575 char address[128]; 1576 1577 lv = sock->ops->getname(sock, (struct sockaddr *)address, 2); 1578 if (lv < 0) 1579 return -ENOTCONN; 1580 if (lv < len) 1581 return -EINVAL; 1582 if (copy_to_user(optval, address, len)) 1583 return -EFAULT; 1584 goto lenout; 1585 } 1586 1587 /* Dubious BSD thing... Probably nobody even uses it, but 1588 * the UNIX standard wants it for whatever reason... 
-DaveM 1589 */ 1590 case SO_ACCEPTCONN: 1591 v.val = sk->sk_state == TCP_LISTEN; 1592 break; 1593 1594 case SO_PASSSEC: 1595 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags); 1596 break; 1597 1598 case SO_PEERSEC: 1599 return security_socket_getpeersec_stream(sock, optval, optlen, len); 1600 1601 case SO_MARK: 1602 v.val = sk->sk_mark; 1603 break; 1604 1605 case SO_RXQ_OVFL: 1606 v.val = sock_flag(sk, SOCK_RXQ_OVFL); 1607 break; 1608 1609 case SO_WIFI_STATUS: 1610 v.val = sock_flag(sk, SOCK_WIFI_STATUS); 1611 break; 1612 1613 case SO_PEEK_OFF: 1614 if (!sock->ops->set_peek_off) 1615 return -EOPNOTSUPP; 1616 1617 v.val = sk->sk_peek_off; 1618 break; 1619 case SO_NOFCS: 1620 v.val = sock_flag(sk, SOCK_NOFCS); 1621 break; 1622 1623 case SO_BINDTODEVICE: 1624 return sock_getbindtodevice(sk, optval, optlen, len); 1625 1626 case SO_GET_FILTER: 1627 len = sk_get_filter(sk, (struct sock_filter __user *)optval, len); 1628 if (len < 0) 1629 return len; 1630 1631 goto lenout; 1632 1633 case SO_LOCK_FILTER: 1634 v.val = sock_flag(sk, SOCK_FILTER_LOCKED); 1635 break; 1636 1637 case SO_BPF_EXTENSIONS: 1638 v.val = bpf_tell_extensions(); 1639 break; 1640 1641 case SO_SELECT_ERR_QUEUE: 1642 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE); 1643 break; 1644 1645 #ifdef CONFIG_NET_RX_BUSY_POLL 1646 case SO_BUSY_POLL: 1647 v.val = sk->sk_ll_usec; 1648 break; 1649 case SO_PREFER_BUSY_POLL: 1650 v.val = READ_ONCE(sk->sk_prefer_busy_poll); 1651 break; 1652 #endif 1653 1654 case SO_MAX_PACING_RATE: 1655 if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) { 1656 lv = sizeof(v.ulval); 1657 v.ulval = sk->sk_max_pacing_rate; 1658 } else { 1659 /* 32bit version */ 1660 v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U); 1661 } 1662 break; 1663 1664 case SO_INCOMING_CPU: 1665 v.val = READ_ONCE(sk->sk_incoming_cpu); 1666 break; 1667 1668 case SO_MEMINFO: 1669 { 1670 u32 meminfo[SK_MEMINFO_VARS]; 1671 1672 sk_get_meminfo(sk, meminfo); 1673 1674 len = min_t(unsigned int, len, 
sizeof(meminfo)); 1675 if (copy_to_user(optval, &meminfo, len)) 1676 return -EFAULT; 1677 1678 goto lenout; 1679 } 1680 1681 #ifdef CONFIG_NET_RX_BUSY_POLL 1682 case SO_INCOMING_NAPI_ID: 1683 v.val = READ_ONCE(sk->sk_napi_id); 1684 1685 /* aggregate non-NAPI IDs down to 0 */ 1686 if (v.val < MIN_NAPI_ID) 1687 v.val = 0; 1688 1689 break; 1690 #endif 1691 1692 case SO_COOKIE: 1693 lv = sizeof(u64); 1694 if (len < lv) 1695 return -EINVAL; 1696 v.val64 = sock_gen_cookie(sk); 1697 break; 1698 1699 case SO_ZEROCOPY: 1700 v.val = sock_flag(sk, SOCK_ZEROCOPY); 1701 break; 1702 1703 case SO_TXTIME: 1704 lv = sizeof(v.txtime); 1705 v.txtime.clockid = sk->sk_clockid; 1706 v.txtime.flags |= sk->sk_txtime_deadline_mode ? 1707 SOF_TXTIME_DEADLINE_MODE : 0; 1708 v.txtime.flags |= sk->sk_txtime_report_errors ? 1709 SOF_TXTIME_REPORT_ERRORS : 0; 1710 break; 1711 1712 case SO_BINDTOIFINDEX: 1713 v.val = sk->sk_bound_dev_if; 1714 break; 1715 1716 case SO_NETNS_COOKIE: 1717 lv = sizeof(u64); 1718 if (len != lv) 1719 return -EINVAL; 1720 v.val64 = sock_net(sk)->net_cookie; 1721 break; 1722 1723 default: 1724 /* We implement the SO_SNDLOWAT etc to not be settable 1725 * (1003.1g 7). 1726 */ 1727 return -ENOPROTOOPT; 1728 } 1729 1730 if (len > lv) 1731 len = lv; 1732 if (copy_to_user(optval, &v, len)) 1733 return -EFAULT; 1734 lenout: 1735 if (put_user(len, optlen)) 1736 return -EFAULT; 1737 return 0; 1738 } 1739 1740 /* 1741 * Initialize an sk_lock. 1742 * 1743 * (We also register the sk_lock with the lock validator.) 
1744 */ 1745 static inline void sock_lock_init(struct sock *sk) 1746 { 1747 if (sk->sk_kern_sock) 1748 sock_lock_init_class_and_name( 1749 sk, 1750 af_family_kern_slock_key_strings[sk->sk_family], 1751 af_family_kern_slock_keys + sk->sk_family, 1752 af_family_kern_key_strings[sk->sk_family], 1753 af_family_kern_keys + sk->sk_family); 1754 else 1755 sock_lock_init_class_and_name( 1756 sk, 1757 af_family_slock_key_strings[sk->sk_family], 1758 af_family_slock_keys + sk->sk_family, 1759 af_family_key_strings[sk->sk_family], 1760 af_family_keys + sk->sk_family); 1761 } 1762 1763 /* 1764 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet, 1765 * even temporarly, because of RCU lookups. sk_node should also be left as is. 1766 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end 1767 */ 1768 static void sock_copy(struct sock *nsk, const struct sock *osk) 1769 { 1770 const struct proto *prot = READ_ONCE(osk->sk_prot); 1771 #ifdef CONFIG_SECURITY_NETWORK 1772 void *sptr = nsk->sk_security; 1773 #endif 1774 1775 /* If we move sk_tx_queue_mapping out of the private section, 1776 * we must check if sk_tx_queue_clear() is called after 1777 * sock_copy() in sk_clone_lock(). 
1778 */ 1779 BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) < 1780 offsetof(struct sock, sk_dontcopy_begin) || 1781 offsetof(struct sock, sk_tx_queue_mapping) >= 1782 offsetof(struct sock, sk_dontcopy_end)); 1783 1784 memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin)); 1785 1786 memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end, 1787 prot->obj_size - offsetof(struct sock, sk_dontcopy_end)); 1788 1789 #ifdef CONFIG_SECURITY_NETWORK 1790 nsk->sk_security = sptr; 1791 security_sk_clone(osk, nsk); 1792 #endif 1793 } 1794 1795 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, 1796 int family) 1797 { 1798 struct sock *sk; 1799 struct kmem_cache *slab; 1800 1801 slab = prot->slab; 1802 if (slab != NULL) { 1803 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO); 1804 if (!sk) 1805 return sk; 1806 if (want_init_on_alloc(priority)) 1807 sk_prot_clear_nulls(sk, prot->obj_size); 1808 } else 1809 sk = kmalloc(prot->obj_size, priority); 1810 1811 if (sk != NULL) { 1812 if (security_sk_alloc(sk, family, priority)) 1813 goto out_free; 1814 1815 if (!try_module_get(prot->owner)) 1816 goto out_free_sec; 1817 } 1818 1819 return sk; 1820 1821 out_free_sec: 1822 security_sk_free(sk); 1823 out_free: 1824 if (slab != NULL) 1825 kmem_cache_free(slab, sk); 1826 else 1827 kfree(sk); 1828 return NULL; 1829 } 1830 1831 static void sk_prot_free(struct proto *prot, struct sock *sk) 1832 { 1833 struct kmem_cache *slab; 1834 struct module *owner; 1835 1836 owner = prot->owner; 1837 slab = prot->slab; 1838 1839 cgroup_sk_free(&sk->sk_cgrp_data); 1840 mem_cgroup_sk_free(sk); 1841 security_sk_free(sk); 1842 if (slab != NULL) 1843 kmem_cache_free(slab, sk); 1844 else 1845 kfree(sk); 1846 module_put(owner); 1847 } 1848 1849 /** 1850 * sk_alloc - All socket objects are allocated here 1851 * @net: the applicable net namespace 1852 * @family: protocol family 1853 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) 1854 * @prot: struct proto associated with 
this new sock instance 1855 * @kern: is this to be a kernel socket? 1856 */ 1857 struct sock *sk_alloc(struct net *net, int family, gfp_t priority, 1858 struct proto *prot, int kern) 1859 { 1860 struct sock *sk; 1861 1862 sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family); 1863 if (sk) { 1864 sk->sk_family = family; 1865 /* 1866 * See comment in struct sock definition to understand 1867 * why we need sk_prot_creator -acme 1868 */ 1869 sk->sk_prot = sk->sk_prot_creator = prot; 1870 sk->sk_kern_sock = kern; 1871 sock_lock_init(sk); 1872 sk->sk_net_refcnt = kern ? 0 : 1; 1873 if (likely(sk->sk_net_refcnt)) { 1874 get_net(net); 1875 sock_inuse_add(net, 1); 1876 } 1877 1878 sock_net_set(sk, net); 1879 refcount_set(&sk->sk_wmem_alloc, 1); 1880 1881 mem_cgroup_sk_alloc(sk); 1882 cgroup_sk_alloc(&sk->sk_cgrp_data); 1883 sock_update_classid(&sk->sk_cgrp_data); 1884 sock_update_netprioidx(&sk->sk_cgrp_data); 1885 sk_tx_queue_clear(sk); 1886 } 1887 1888 return sk; 1889 } 1890 EXPORT_SYMBOL(sk_alloc); 1891 1892 /* Sockets having SOCK_RCU_FREE will call this function after one RCU 1893 * grace period. This is the case for UDP sockets and TCP listeners. 
1894 */ 1895 static void __sk_destruct(struct rcu_head *head) 1896 { 1897 struct sock *sk = container_of(head, struct sock, sk_rcu); 1898 struct sk_filter *filter; 1899 1900 if (sk->sk_destruct) 1901 sk->sk_destruct(sk); 1902 1903 filter = rcu_dereference_check(sk->sk_filter, 1904 refcount_read(&sk->sk_wmem_alloc) == 0); 1905 if (filter) { 1906 sk_filter_uncharge(sk, filter); 1907 RCU_INIT_POINTER(sk->sk_filter, NULL); 1908 } 1909 1910 sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP); 1911 1912 #ifdef CONFIG_BPF_SYSCALL 1913 bpf_sk_storage_free(sk); 1914 #endif 1915 1916 if (atomic_read(&sk->sk_omem_alloc)) 1917 pr_debug("%s: optmem leakage (%d bytes) detected\n", 1918 __func__, atomic_read(&sk->sk_omem_alloc)); 1919 1920 if (sk->sk_frag.page) { 1921 put_page(sk->sk_frag.page); 1922 sk->sk_frag.page = NULL; 1923 } 1924 1925 if (sk->sk_peer_cred) 1926 put_cred(sk->sk_peer_cred); 1927 put_pid(sk->sk_peer_pid); 1928 if (likely(sk->sk_net_refcnt)) 1929 put_net(sock_net(sk)); 1930 sk_prot_free(sk->sk_prot_creator, sk); 1931 } 1932 1933 void sk_destruct(struct sock *sk) 1934 { 1935 bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE); 1936 1937 if (rcu_access_pointer(sk->sk_reuseport_cb)) { 1938 reuseport_detach_sock(sk); 1939 use_call_rcu = true; 1940 } 1941 1942 if (use_call_rcu) 1943 call_rcu(&sk->sk_rcu, __sk_destruct); 1944 else 1945 __sk_destruct(&sk->sk_rcu); 1946 } 1947 1948 static void __sk_free(struct sock *sk) 1949 { 1950 if (likely(sk->sk_net_refcnt)) 1951 sock_inuse_add(sock_net(sk), -1); 1952 1953 if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk))) 1954 sock_diag_broadcast_destroy(sk); 1955 else 1956 sk_destruct(sk); 1957 } 1958 1959 void sk_free(struct sock *sk) 1960 { 1961 /* 1962 * We subtract one from sk_wmem_alloc and can know if 1963 * some packets are still in some tx queue. 
1964 * If not null, sock_wfree() will call __sk_free(sk) later 1965 */ 1966 if (refcount_dec_and_test(&sk->sk_wmem_alloc)) 1967 __sk_free(sk); 1968 } 1969 EXPORT_SYMBOL(sk_free); 1970 1971 static void sk_init_common(struct sock *sk) 1972 { 1973 skb_queue_head_init(&sk->sk_receive_queue); 1974 skb_queue_head_init(&sk->sk_write_queue); 1975 skb_queue_head_init(&sk->sk_error_queue); 1976 1977 rwlock_init(&sk->sk_callback_lock); 1978 lockdep_set_class_and_name(&sk->sk_receive_queue.lock, 1979 af_rlock_keys + sk->sk_family, 1980 af_family_rlock_key_strings[sk->sk_family]); 1981 lockdep_set_class_and_name(&sk->sk_write_queue.lock, 1982 af_wlock_keys + sk->sk_family, 1983 af_family_wlock_key_strings[sk->sk_family]); 1984 lockdep_set_class_and_name(&sk->sk_error_queue.lock, 1985 af_elock_keys + sk->sk_family, 1986 af_family_elock_key_strings[sk->sk_family]); 1987 lockdep_set_class_and_name(&sk->sk_callback_lock, 1988 af_callback_keys + sk->sk_family, 1989 af_family_clock_key_strings[sk->sk_family]); 1990 } 1991 1992 /** 1993 * sk_clone_lock - clone a socket, and lock its clone 1994 * @sk: the socket to clone 1995 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) 1996 * 1997 * Caller must unlock socket even in error path (bh_unlock_sock(newsk)) 1998 */ 1999 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) 2000 { 2001 struct proto *prot = READ_ONCE(sk->sk_prot); 2002 struct sk_filter *filter; 2003 bool is_charged = true; 2004 struct sock *newsk; 2005 2006 newsk = sk_prot_alloc(prot, priority, sk->sk_family); 2007 if (!newsk) 2008 goto out; 2009 2010 sock_copy(newsk, sk); 2011 2012 newsk->sk_prot_creator = prot; 2013 2014 /* SANITY */ 2015 if (likely(newsk->sk_net_refcnt)) 2016 get_net(sock_net(newsk)); 2017 sk_node_init(&newsk->sk_node); 2018 sock_lock_init(newsk); 2019 bh_lock_sock(newsk); 2020 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL; 2021 newsk->sk_backlog.len = 0; 2022 2023 atomic_set(&newsk->sk_rmem_alloc, 0); 2024 2025 /* 
sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */ 2026 refcount_set(&newsk->sk_wmem_alloc, 1); 2027 2028 atomic_set(&newsk->sk_omem_alloc, 0); 2029 sk_init_common(newsk); 2030 2031 newsk->sk_dst_cache = NULL; 2032 newsk->sk_dst_pending_confirm = 0; 2033 newsk->sk_wmem_queued = 0; 2034 newsk->sk_forward_alloc = 0; 2035 atomic_set(&newsk->sk_drops, 0); 2036 newsk->sk_send_head = NULL; 2037 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; 2038 atomic_set(&newsk->sk_zckey, 0); 2039 2040 sock_reset_flag(newsk, SOCK_DONE); 2041 2042 /* sk->sk_memcg will be populated at accept() time */ 2043 newsk->sk_memcg = NULL; 2044 2045 cgroup_sk_clone(&newsk->sk_cgrp_data); 2046 2047 rcu_read_lock(); 2048 filter = rcu_dereference(sk->sk_filter); 2049 if (filter != NULL) 2050 /* though it's an empty new sock, the charging may fail 2051 * if sysctl_optmem_max was changed between creation of 2052 * original socket and cloning 2053 */ 2054 is_charged = sk_filter_charge(newsk, filter); 2055 RCU_INIT_POINTER(newsk->sk_filter, filter); 2056 rcu_read_unlock(); 2057 2058 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) { 2059 /* We need to make sure that we don't uncharge the new 2060 * socket if we couldn't charge it in the first place 2061 * as otherwise we uncharge the parent's filter. 2062 */ 2063 if (!is_charged) 2064 RCU_INIT_POINTER(newsk->sk_filter, NULL); 2065 sk_free_unlock_clone(newsk); 2066 newsk = NULL; 2067 goto out; 2068 } 2069 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL); 2070 2071 if (bpf_sk_storage_clone(sk, newsk)) { 2072 sk_free_unlock_clone(newsk); 2073 newsk = NULL; 2074 goto out; 2075 } 2076 2077 /* Clear sk_user_data if parent had the pointer tagged 2078 * as not suitable for copying when cloning. 
2079 */ 2080 if (sk_user_data_is_nocopy(newsk)) 2081 newsk->sk_user_data = NULL; 2082 2083 newsk->sk_err = 0; 2084 newsk->sk_err_soft = 0; 2085 newsk->sk_priority = 0; 2086 newsk->sk_incoming_cpu = raw_smp_processor_id(); 2087 if (likely(newsk->sk_net_refcnt)) 2088 sock_inuse_add(sock_net(newsk), 1); 2089 2090 /* Before updating sk_refcnt, we must commit prior changes to memory 2091 * (Documentation/RCU/rculist_nulls.rst for details) 2092 */ 2093 smp_wmb(); 2094 refcount_set(&newsk->sk_refcnt, 2); 2095 2096 /* Increment the counter in the same struct proto as the master 2097 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that 2098 * is the same as sk->sk_prot->socks, as this field was copied 2099 * with memcpy). 2100 * 2101 * This _changes_ the previous behaviour, where 2102 * tcp_create_openreq_child always was incrementing the 2103 * equivalent to tcp_prot->socks (inet_sock_nr), so this have 2104 * to be taken into account in all callers. -acme 2105 */ 2106 sk_refcnt_debug_inc(newsk); 2107 sk_set_socket(newsk, NULL); 2108 sk_tx_queue_clear(newsk); 2109 RCU_INIT_POINTER(newsk->sk_wq, NULL); 2110 2111 if (newsk->sk_prot->sockets_allocated) 2112 sk_sockets_allocated_inc(newsk); 2113 2114 if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP) 2115 net_enable_timestamp(); 2116 out: 2117 return newsk; 2118 } 2119 EXPORT_SYMBOL_GPL(sk_clone_lock); 2120 2121 void sk_free_unlock_clone(struct sock *sk) 2122 { 2123 /* It is still raw copy of parent, so invalidate 2124 * destructor and make plain sk_free() */ 2125 sk->sk_destruct = NULL; 2126 bh_unlock_sock(sk); 2127 sk_free(sk); 2128 } 2129 EXPORT_SYMBOL_GPL(sk_free_unlock_clone); 2130 2131 void sk_setup_caps(struct sock *sk, struct dst_entry *dst) 2132 { 2133 u32 max_segs = 1; 2134 2135 sk_dst_set(sk, dst); 2136 sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps; 2137 if (sk->sk_route_caps & NETIF_F_GSO) 2138 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE; 2139 sk->sk_route_caps &= 
~sk->sk_route_nocaps; 2140 if (sk_can_gso(sk)) { 2141 if (dst->header_len && !xfrm_dst_offload_ok(dst)) { 2142 sk->sk_route_caps &= ~NETIF_F_GSO_MASK; 2143 } else { 2144 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; 2145 sk->sk_gso_max_size = dst->dev->gso_max_size; 2146 max_segs = max_t(u32, dst->dev->gso_max_segs, 1); 2147 } 2148 } 2149 sk->sk_gso_max_segs = max_segs; 2150 } 2151 EXPORT_SYMBOL_GPL(sk_setup_caps); 2152 2153 /* 2154 * Simple resource managers for sockets. 2155 */ 2156 2157 2158 /* 2159 * Write buffer destructor automatically called from kfree_skb. 2160 */ 2161 void sock_wfree(struct sk_buff *skb) 2162 { 2163 struct sock *sk = skb->sk; 2164 unsigned int len = skb->truesize; 2165 2166 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) { 2167 /* 2168 * Keep a reference on sk_wmem_alloc, this will be released 2169 * after sk_write_space() call 2170 */ 2171 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc)); 2172 sk->sk_write_space(sk); 2173 len = 1; 2174 } 2175 /* 2176 * if sk_wmem_alloc reaches 0, we must finish what sk_free() 2177 * could not do because of in-flight packets 2178 */ 2179 if (refcount_sub_and_test(len, &sk->sk_wmem_alloc)) 2180 __sk_free(sk); 2181 } 2182 EXPORT_SYMBOL(sock_wfree); 2183 2184 /* This variant of sock_wfree() is used by TCP, 2185 * since it sets SOCK_USE_WRITE_QUEUE. 
2186 */ 2187 void __sock_wfree(struct sk_buff *skb) 2188 { 2189 struct sock *sk = skb->sk; 2190 2191 if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc)) 2192 __sk_free(sk); 2193 } 2194 2195 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) 2196 { 2197 skb_orphan(skb); 2198 skb->sk = sk; 2199 #ifdef CONFIG_INET 2200 if (unlikely(!sk_fullsock(sk))) { 2201 skb->destructor = sock_edemux; 2202 sock_hold(sk); 2203 return; 2204 } 2205 #endif 2206 skb->destructor = sock_wfree; 2207 skb_set_hash_from_sk(skb, sk); 2208 /* 2209 * We used to take a refcount on sk, but following operation 2210 * is enough to guarantee sk_free() wont free this sock until 2211 * all in-flight packets are completed 2212 */ 2213 refcount_add(skb->truesize, &sk->sk_wmem_alloc); 2214 } 2215 EXPORT_SYMBOL(skb_set_owner_w); 2216 2217 static bool can_skb_orphan_partial(const struct sk_buff *skb) 2218 { 2219 #ifdef CONFIG_TLS_DEVICE 2220 /* Drivers depend on in-order delivery for crypto offload, 2221 * partial orphan breaks out-of-order-OK logic. 2222 */ 2223 if (skb->decrypted) 2224 return false; 2225 #endif 2226 return (skb->destructor == sock_wfree || 2227 (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree)); 2228 } 2229 2230 /* This helper is used by netem, as it can hold packets in its 2231 * delay queue. We want to allow the owner socket to send more 2232 * packets, as if they were already TX completed by a typical driver. 2233 * But we also want to keep skb->sk set because some packet schedulers 2234 * rely on it (sch_fq for example). 2235 */ 2236 void skb_orphan_partial(struct sk_buff *skb) 2237 { 2238 if (skb_is_tcp_pure_ack(skb)) 2239 return; 2240 2241 if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk)) 2242 return; 2243 2244 skb_orphan(skb); 2245 } 2246 EXPORT_SYMBOL(skb_orphan_partial); 2247 2248 /* 2249 * Read buffer destructor automatically called from kfree_skb. 
2250 */ 2251 void sock_rfree(struct sk_buff *skb) 2252 { 2253 struct sock *sk = skb->sk; 2254 unsigned int len = skb->truesize; 2255 2256 atomic_sub(len, &sk->sk_rmem_alloc); 2257 sk_mem_uncharge(sk, len); 2258 } 2259 EXPORT_SYMBOL(sock_rfree); 2260 2261 /* 2262 * Buffer destructor for skbs that are not used directly in read or write 2263 * path, e.g. for error handler skbs. Automatically called from kfree_skb. 2264 */ 2265 void sock_efree(struct sk_buff *skb) 2266 { 2267 sock_put(skb->sk); 2268 } 2269 EXPORT_SYMBOL(sock_efree); 2270 2271 /* Buffer destructor for prefetch/receive path where reference count may 2272 * not be held, e.g. for listen sockets. 2273 */ 2274 #ifdef CONFIG_INET 2275 void sock_pfree(struct sk_buff *skb) 2276 { 2277 if (sk_is_refcounted(skb->sk)) 2278 sock_gen_put(skb->sk); 2279 } 2280 EXPORT_SYMBOL(sock_pfree); 2281 #endif /* CONFIG_INET */ 2282 2283 kuid_t sock_i_uid(struct sock *sk) 2284 { 2285 kuid_t uid; 2286 2287 read_lock_bh(&sk->sk_callback_lock); 2288 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID; 2289 read_unlock_bh(&sk->sk_callback_lock); 2290 return uid; 2291 } 2292 EXPORT_SYMBOL(sock_i_uid); 2293 2294 unsigned long sock_i_ino(struct sock *sk) 2295 { 2296 unsigned long ino; 2297 2298 read_lock_bh(&sk->sk_callback_lock); 2299 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0; 2300 read_unlock_bh(&sk->sk_callback_lock); 2301 return ino; 2302 } 2303 EXPORT_SYMBOL(sock_i_ino); 2304 2305 /* 2306 * Allocate a skb from the socket's send buffer. 
2307 */ 2308 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, 2309 gfp_t priority) 2310 { 2311 if (force || 2312 refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) { 2313 struct sk_buff *skb = alloc_skb(size, priority); 2314 2315 if (skb) { 2316 skb_set_owner_w(skb, sk); 2317 return skb; 2318 } 2319 } 2320 return NULL; 2321 } 2322 EXPORT_SYMBOL(sock_wmalloc); 2323 2324 static void sock_ofree(struct sk_buff *skb) 2325 { 2326 struct sock *sk = skb->sk; 2327 2328 atomic_sub(skb->truesize, &sk->sk_omem_alloc); 2329 } 2330 2331 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size, 2332 gfp_t priority) 2333 { 2334 struct sk_buff *skb; 2335 2336 /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */ 2337 if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) > 2338 sysctl_optmem_max) 2339 return NULL; 2340 2341 skb = alloc_skb(size, priority); 2342 if (!skb) 2343 return NULL; 2344 2345 atomic_add(skb->truesize, &sk->sk_omem_alloc); 2346 skb->sk = sk; 2347 skb->destructor = sock_ofree; 2348 return skb; 2349 } 2350 2351 /* 2352 * Allocate a memory block from the socket's option memory buffer. 2353 */ 2354 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority) 2355 { 2356 if ((unsigned int)size <= sysctl_optmem_max && 2357 atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) { 2358 void *mem; 2359 /* First do the add, to avoid the race if kmalloc 2360 * might sleep. 2361 */ 2362 atomic_add(size, &sk->sk_omem_alloc); 2363 mem = kmalloc(size, priority); 2364 if (mem) 2365 return mem; 2366 atomic_sub(size, &sk->sk_omem_alloc); 2367 } 2368 return NULL; 2369 } 2370 EXPORT_SYMBOL(sock_kmalloc); 2371 2372 /* Free an option memory block. Note, we actually want the inline 2373 * here as this allows gcc to detect the nullify and fold away the 2374 * condition entirely. 
 */
static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
				  const bool nullify)
{
	if (WARN_ON_ONCE(!mem))
		return;
	if (nullify)
		kfree_sensitive(mem);
	else
		kfree(mem);
	atomic_sub(size, &sk->sk_omem_alloc);
}

/* Free an option memory block charged by sock_kmalloc(). */
void sock_kfree_s(struct sock *sk, void *mem, int size)
{
	__sock_kfree_s(sk, mem, size, false);
}
EXPORT_SYMBOL(sock_kfree_s);

/* As sock_kfree_s(), but zeroizes the memory first (for key material etc). */
void sock_kzfree_s(struct sock *sk, void *mem, int size)
{
	__sock_kfree_s(sk, mem, size, true);
}
EXPORT_SYMBOL(sock_kzfree_s);

/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
   I think, these locks should be removed for datagram sockets.
 */
static long sock_wait_for_wmem(struct sock *sk, long timeo)
{
	DEFINE_WAIT(wait);

	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
	for (;;) {
		if (!timeo)
			break;
		if (signal_pending(current))
			break;
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		/* Queue ourselves before re-testing wmem so a concurrent
		 * sock_wfree() wakeup cannot be lost.
		 */
		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
		if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
			break;
		if (sk->sk_shutdown & SEND_SHUTDOWN)
			break;
		if (sk->sk_err)
			break;
		timeo = schedule_timeout(timeo);
	}
	finish_wait(sk_sleep(sk), &wait);
	return timeo;
}


/*
 *	Generic send/receive buffer handlers
 */

/* Allocate a wmem-charged skb, blocking (per @timeo) until sndbuf space is
 * available.  On failure returns NULL and stores -errno in *errcode.
 */
struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
				     unsigned long data_len, int noblock,
				     int *errcode, int max_page_order)
{
	struct sk_buff *skb;
	long timeo;
	int err;

	timeo = sock_sndtimeo(sk, noblock);
	for (;;) {
		err = sock_error(sk);
		if (err != 0)
			goto failure;

		err = -EPIPE;
		if (sk->sk_shutdown & SEND_SHUTDOWN)
			goto failure;

		if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
			break;

		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		err = -EAGAIN;
		if (!timeo)
			goto failure;
		if (signal_pending(current))
			goto interrupted;
		timeo = sock_wait_for_wmem(sk, timeo);
	}
	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
				   errcode, sk->sk_allocation);
	if (skb)
		skb_set_owner_w(skb, sk);
	return skb;

interrupted:
	err = sock_intr_errno(timeo);
failure:
	*errcode = err;
	return NULL;
}
EXPORT_SYMBOL(sock_alloc_send_pskb);

/* Linear-only convenience wrapper around sock_alloc_send_pskb(). */
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
}
EXPORT_SYMBOL(sock_alloc_send_skb);

/* Parse one SOL_SOCKET control message into @sockc.
 * Returns 0 on success or a negative errno (-EPERM, -EINVAL).
 */
int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
		     struct sockcm_cookie *sockc)
{
	u32 tsflags;

	switch (cmsg->cmsg_type) {
	case SO_MARK:
		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
			return -EPERM;
		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
			return -EINVAL;
		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
		break;
	case SO_TIMESTAMPING_OLD:
		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
			return -EINVAL;

		tsflags = *(u32 *)CMSG_DATA(cmsg);
		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
			return -EINVAL;

		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
		sockc->tsflags |= tsflags;
		break;
	case SCM_TXTIME:
		if (!sock_flag(sk, SOCK_TXTIME))
			return -EINVAL;
		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
			return -EINVAL;
		/* get_unaligned: CMSG_DATA need not be 8-byte aligned */
		sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
		break;
	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX.
	 */
	case SCM_RIGHTS:
	case SCM_CREDENTIALS:
		break;
	default:
		return -EINVAL;
	}
	return 0;
}
EXPORT_SYMBOL(__sock_cmsg_send);

/* Walk all control messages of @msg, handling the SOL_SOCKET ones. */
int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
		   struct sockcm_cookie *sockc)
{
	struct cmsghdr *cmsg;
	int ret;

	for_each_cmsghdr(cmsg, msg) {
		if (!CMSG_OK(msg, cmsg))
			return -EINVAL;
		if (cmsg->cmsg_level != SOL_SOCKET)
			continue;
		ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
		if (ret)
			return ret;
	}
	return 0;
}
EXPORT_SYMBOL(sock_cmsg_send);

/* Delegate to the protocol's enter_memory_pressure hook, if any. */
static void sk_enter_memory_pressure(struct sock *sk)
{
	if (!sk->sk_prot->enter_memory_pressure)
		return;

	sk->sk_prot->enter_memory_pressure(sk);
}

/* Leave memory pressure: use the protocol hook if present, otherwise
 * clear the protocol's memory_pressure flag directly.
 */
static void sk_leave_memory_pressure(struct sock *sk)
{
	if (sk->sk_prot->leave_memory_pressure) {
		sk->sk_prot->leave_memory_pressure(sk);
	} else {
		unsigned long *memory_pressure = sk->sk_prot->memory_pressure;

		if (memory_pressure && READ_ONCE(*memory_pressure))
			WRITE_ONCE(*memory_pressure, 0);
	}
}

#define SKB_FRAG_PAGE_ORDER	get_order(32768)
DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);

/**
 * skb_page_frag_refill - check that a page_frag contains enough room
 * @sz: minimum size of the fragment we want to get
 * @pfrag: pointer to page_frag
 * @gfp: priority for memory allocation
 *
 * Note: While this allocator tries to use high order pages, there is
 * no guarantee that allocations succeed. Therefore, @sz MUST be
 * less or equal than PAGE_SIZE.
 */
bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
{
	if (pfrag->page) {
		/* Sole owner: the old page can simply be rewound and reused */
		if (page_ref_count(pfrag->page) == 1) {
			pfrag->offset = 0;
			return true;
		}
		if (pfrag->offset + sz <= pfrag->size)
			return true;
		put_page(pfrag->page);
	}

	pfrag->offset = 0;
	if (SKB_FRAG_PAGE_ORDER &&
	    !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
		/* Avoid direct reclaim but allow kswapd to wake */
		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
					  __GFP_COMP | __GFP_NOWARN |
					  __GFP_NORETRY,
					  SKB_FRAG_PAGE_ORDER);
		if (likely(pfrag->page)) {
			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
			return true;
		}
	}
	/* Fall back to a single page with the caller's full gfp mask */
	pfrag->page = alloc_page(gfp);
	if (likely(pfrag->page)) {
		pfrag->size = PAGE_SIZE;
		return true;
	}
	return false;
}
EXPORT_SYMBOL(skb_page_frag_refill);

/* Per-socket wrapper: on failure, signal memory pressure and shrink sndbuf. */
bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
{
	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
		return true;

	sk_enter_memory_pressure(sk);
	sk_stream_moderate_sndbuf(sk);
	return false;
}
EXPORT_SYMBOL(sk_page_frag_refill);

/* Slow path of lock_sock(): sleep until the owner releases the socket.
 * Called with sk_lock.slock held; drops/retakes it around schedule().
 */
void __lock_sock(struct sock *sk)
	__releases(&sk->sk_lock.slock)
	__acquires(&sk->sk_lock.slock)
{
	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
					TASK_UNINTERRUPTIBLE);
		spin_unlock_bh(&sk->sk_lock.slock);
		schedule();
		spin_lock_bh(&sk->sk_lock.slock);
		if (!sock_owned_by_user(sk))
			break;
	}
	finish_wait(&sk->sk_lock.wq, &wait);
}

/* Process every skb queued on the backlog while the socket was owned.
 * The spinlock is dropped while running sk_backlog_rcv() so softirq
 * producers can keep appending; the outer loop re-checks the head.
 */
void __release_sock(struct sock *sk)
	__releases(&sk->sk_lock.slock)
	__acquires(&sk->sk_lock.slock)
{
	struct sk_buff *skb, *next;

	while ((skb = sk->sk_backlog.head) != NULL) {
		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;

		spin_unlock_bh(&sk->sk_lock.slock);

		do {
			next = skb->next;
			prefetch(next);
			WARN_ON_ONCE(skb_dst_is_noref(skb));
			skb_mark_not_on_list(skb);
			sk_backlog_rcv(sk, skb);

			cond_resched();

			skb = next;
		} while (skb != NULL);

		spin_lock_bh(&sk->sk_lock.slock);
	}

	/*
	 * Doing the zeroing here guarantee we can not loop forever
	 * while a wild producer attempts to flood us.
	 */
	sk->sk_backlog.len = 0;
}

/* Drain the backlog without releasing ownership of the socket. */
void __sk_flush_backlog(struct sock *sk)
{
	spin_lock_bh(&sk->sk_lock.slock);
	__release_sock(sk);
	spin_unlock_bh(&sk->sk_lock.slock);
}

/**
 * sk_wait_data - wait for data to arrive at sk_receive_queue
 * @sk:    sock to wait on
 * @timeo: for how long
 * @skb:   last skb seen on sk_receive_queue
 *
 * Now socket state including sk->sk_err is changed only under lock,
 * hence we may omit checks after joining wait queue.
 * We check receive queue before schedule() only as optimization;
 * it is very likely that release_sock() added new data.
 */
int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
{
	DEFINE_WAIT_FUNC(wait, woken_wake_function);
	int rc;

	add_wait_queue(sk_sleep(sk), &wait);
	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	/* Wake condition: the queue tail changed since the caller sampled it */
	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	remove_wait_queue(sk_sleep(sk), &wait);
	return rc;
}
EXPORT_SYMBOL(sk_wait_data);

/**
 * __sk_mem_raise_allocated - increase memory_allocated
 * @sk: socket
 * @size: memory size to allocate
 * @amt: pages to allocate
 * @kind: allocation type
 *
 * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc.
 *
 * Returns 1 if the charge is accepted, 0 if it must be suppressed (in
 * which case the memory_allocated/memcg charges are rolled back).
 */
int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
{
	struct proto *prot = sk->sk_prot;
	long allocated = sk_memory_allocated_add(sk, amt);
	bool charged = true;

	if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
	    !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt)))
		goto suppress_allocation;

	/* Under limit. */
	if (allocated <= sk_prot_mem_limits(sk, 0)) {
		sk_leave_memory_pressure(sk);
		return 1;
	}

	/* Under pressure. */
	if (allocated > sk_prot_mem_limits(sk, 1))
		sk_enter_memory_pressure(sk);

	/* Over hard limit. */
	if (allocated > sk_prot_mem_limits(sk, 2))
		goto suppress_allocation;

	/* guarantee minimum buffer size under pressure */
	if (kind == SK_MEM_RECV) {
		if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
			return 1;

	} else { /* SK_MEM_SEND */
		int wmem0 = sk_get_wmem0(sk, prot);

		if (sk->sk_type == SOCK_STREAM) {
			if (sk->sk_wmem_queued < wmem0)
				return 1;
		} else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
				return 1;
		}
	}

	if (sk_has_memory_pressure(sk)) {
		u64 alloc;

		if (!sk_under_memory_pressure(sk))
			return 1;
		/* Allow the charge if this socket uses no more than its
		 * fair share of the global limit across all sockets.
		 */
		alloc = sk_sockets_allocated_read_positive(sk);
		if (sk_prot_mem_limits(sk, 2) > alloc *
		    sk_mem_pages(sk->sk_wmem_queued +
				 atomic_read(&sk->sk_rmem_alloc) +
				 sk->sk_forward_alloc))
			return 1;
	}

suppress_allocation:

	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
		sk_stream_moderate_sndbuf(sk);

		/* Fail only if socket is _under_ its sndbuf.
		 * In this case we cannot block, so that we have to fail.
		 */
		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
			return 1;
	}

	if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
		trace_sock_exceed_buf_limit(sk, prot, allocated, kind);

	sk_memory_allocated_sub(sk, amt);

	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
		mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);

	return 0;
}
EXPORT_SYMBOL(__sk_mem_raise_allocated);

/**
 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
 * @sk: socket
 * @size: memory size to allocate
 * @kind: allocation type
 *
 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
 * rmem allocation. This function assumes that protocols which have
 * memory_pressure use sk_wmem_queued as write buffer accounting.
 */
int __sk_mem_schedule(struct sock *sk, int size, int kind)
{
	int ret, amt = sk_mem_pages(size);

	/* Optimistically grant the quantum; roll back if the raise fails */
	sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
	if (!ret)
		sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
	return ret;
}
EXPORT_SYMBOL(__sk_mem_schedule);

/**
 * __sk_mem_reduce_allocated - reclaim memory_allocated
 * @sk: socket
 * @amount: number of quanta
 *
 * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
 */
void __sk_mem_reduce_allocated(struct sock *sk, int amount)
{
	sk_memory_allocated_sub(sk, amount);

	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);

	/* Dropping below the low watermark ends global memory pressure */
	if (sk_under_memory_pressure(sk) &&
	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
		sk_leave_memory_pressure(sk);
}
EXPORT_SYMBOL(__sk_mem_reduce_allocated);

/**
 * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
 * @sk: socket
 * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
 */
void __sk_mem_reclaim(struct sock *sk, int amount)
{
	amount >>= SK_MEM_QUANTUM_SHIFT;
	sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
	__sk_mem_reduce_allocated(sk, amount);
}
EXPORT_SYMBOL(__sk_mem_reclaim);

/* Default SO_PEEK_OFF setter; always succeeds.
 * NOTE(review): sk_peek_off is read locklessly elsewhere — later kernels
 * use WRITE_ONCE() here; confirm against the readers in this tree.
 */
int sk_set_peek_off(struct sock *sk, int val)
{
	sk->sk_peek_off = val;
	return 0;
}
EXPORT_SYMBOL_GPL(sk_set_peek_off);

/*
 * Set of default routines for initialising struct proto_ops when
 * the protocol does not support a particular function. In certain
 * cases where it makes no sense for a protocol to have a "do nothing"
 * function, some default processing is provided.
 */

/* All sock_no_*() stubs below simply reject the operation with
 * -EOPNOTSUPP (except sock_no_mmap, see there).
 */
int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_bind);

int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
		    int len, int flags)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_connect);

int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_socketpair);

int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
		   bool kern)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_accept);

int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
		    int peer)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_getname);

int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_ioctl);

int sock_no_listen(struct socket *sock, int backlog)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_listen);

int sock_no_shutdown(struct socket *sock, int how)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_shutdown);

int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_sendmsg);

int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_sendmsg_locked);

int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
		    int flags)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_recvmsg);

int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
	/* Mirror missing mmap method error code */
	return -ENODEV;
}
EXPORT_SYMBOL(sock_no_mmap);

/*
 * When a file is received (via SCM_RIGHTS, etc), we must bump the
 * various sock-based usage counts.
 */
void __receive_sock(struct file *file)
{
	struct socket *sock;

	sock = sock_from_file(file);
	if (sock) {
		sock_update_netprioidx(&sock->sk->sk_cgrp_data);
		sock_update_classid(&sock->sk->sk_cgrp_data);
	}
}

/* Default sendpage: fall back to a kernel_sendmsg() of the mapped page. */
ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
{
	ssize_t res;
	struct msghdr msg = {.msg_flags = flags};
	struct kvec iov;
	char *kaddr = kmap(page);
	iov.iov_base = kaddr + offset;
	iov.iov_len = size;
	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
	kunmap(page);
	return res;
}
EXPORT_SYMBOL(sock_no_sendpage);

/* As sock_no_sendpage(), but for callers already holding the socket lock. */
ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
				int offset, size_t size, int flags)
{
	ssize_t res;
	struct msghdr msg = {.msg_flags = flags};
	struct kvec iov;
	char *kaddr = kmap(page);

	iov.iov_base = kaddr + offset;
	iov.iov_len = size;
	res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
	kunmap(page);
	return res;
}
EXPORT_SYMBOL(sock_no_sendpage_locked);

/*
 *	Default Socket Callbacks
 */

/* Default sk_state_change: wake everyone sleeping on the socket. */
static void sock_def_wakeup(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_all(&wq->wait);
	rcu_read_unlock();
}

/* Default sk_error_report: EPOLLERR wakeup plus SIGIO/SIGURG delivery. */
static void sock_def_error_report(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_poll(&wq->wait, EPOLLERR);
	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
	rcu_read_unlock();
}

/* Default sk_data_ready: wake readers and poll()ers. */
void sock_def_readable(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
						EPOLLRDNORM | EPOLLRDBAND);
	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
	rcu_read_unlock();
}

/* Default sk_write_space: wake writers once half the sndbuf is free. */
static void sock_def_write_space(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();

	/* Do not wake up a writer until he can make "significant"
	 * progress.  --DaveM
	 */
	if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= READ_ONCE(sk->sk_sndbuf)) {
		wq = rcu_dereference(sk->sk_wq);
		if (skwq_has_sleeper(wq))
			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
						EPOLLWRNORM | EPOLLWRBAND);

		/* Should agree with poll, otherwise some programs break */
		if (sock_writeable(sk))
			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
	}

	rcu_read_unlock();
}

/* Default sk_destruct: nothing to free. */
static void sock_def_destruct(struct sock *sk)
{
}

/* Deliver SIGURG to the socket's owner and raise POLL_PRI. */
void sk_send_sigurg(struct sock *sk)
{
	if (sk->sk_socket && sk->sk_socket->file)
		if (send_sigurg(&sk->sk_socket->file->f_owner))
			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
}
EXPORT_SYMBOL(sk_send_sigurg);

/* (Re)arm a socket timer; take a reference only if it was not pending. */
void sk_reset_timer(struct sock *sk, struct timer_list* timer,
		    unsigned long expires)
{
	if (!mod_timer(timer, expires))
		sock_hold(sk);
}
EXPORT_SYMBOL(sk_reset_timer);

/* Cancel a socket timer; drop the reference only if it was pending. */
void sk_stop_timer(struct sock *sk, struct timer_list* timer)
{
	if (del_timer(timer))
		__sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer);

/* As sk_stop_timer(), but waits for a concurrently-running handler. */
void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
{
	if (del_timer_sync(timer))
		__sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer_sync);

/* Initialise a freshly-allocated struct sock to its generic defaults,
 * optionally binding it to @sock (NULL for kernel-internal sockets).
 */
void sock_init_data(struct socket *sock, struct sock *sk)
{
	sk_init_common(sk);
	sk->sk_send_head	=	NULL;

	timer_setup(&sk->sk_timer, NULL, 0);

	sk->sk_allocation	=	GFP_KERNEL;
	sk->sk_rcvbuf		=	sysctl_rmem_default;
	sk->sk_sndbuf		=	sysctl_wmem_default;
	sk->sk_state		=	TCP_CLOSE;
	sk_set_socket(sk, sock);

	sock_set_flag(sk, SOCK_ZAPPED);

	if (sock) {
		sk->sk_type	=	sock->type;
		RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
		sock->sk	=	sk;
		sk->sk_uid	=	SOCK_INODE(sock)->i_uid;
	} else {
		RCU_INIT_POINTER(sk->sk_wq, NULL);
		sk->sk_uid	=	make_kuid(sock_net(sk)->user_ns, 0);
	}

	rwlock_init(&sk->sk_callback_lock);
	/* Separate lockdep classes for kernel vs user sockets */
	if (sk->sk_kern_sock)
		lockdep_set_class_and_name(
			&sk->sk_callback_lock,
			af_kern_callback_keys + sk->sk_family,
			af_family_kern_clock_key_strings[sk->sk_family]);
	else
		lockdep_set_class_and_name(
			&sk->sk_callback_lock,
			af_callback_keys + sk->sk_family,
			af_family_clock_key_strings[sk->sk_family]);

	sk->sk_state_change	=	sock_def_wakeup;
	sk->sk_data_ready	=	sock_def_readable;
	sk->sk_write_space	=	sock_def_write_space;
	sk->sk_error_report	=	sock_def_error_report;
	sk->sk_destruct		=	sock_def_destruct;

	sk->sk_frag.page	=	NULL;
	sk->sk_frag.offset	=	0;
	sk->sk_peek_off		=	-1;

	sk->sk_peer_pid 	=	NULL;
	sk->sk_peer_cred	=	NULL;
	sk->sk_write_pending	=	0;
	sk->sk_rcvlowat		=	1;
	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;

	sk->sk_stamp = SK_DEFAULT_STAMP;
#if BITS_PER_LONG==32
	seqlock_init(&sk->sk_stamp_seq);
#endif
	atomic_set(&sk->sk_zckey, 0);

#ifdef CONFIG_NET_RX_BUSY_POLL
	sk->sk_napi_id		=	0;
	sk->sk_ll_usec		=	sysctl_net_busy_read;
#endif

	sk->sk_max_pacing_rate = ~0UL;
	sk->sk_pacing_rate = ~0UL;
	WRITE_ONCE(sk->sk_pacing_shift, 10);
	sk->sk_incoming_cpu = -1;

	sk_rx_queue_clear(sk);
	/*
	 * Before updating sk_refcnt, we must commit prior changes to memory
	 * (Documentation/RCU/rculist_nulls.rst for details)
	 */
	smp_wmb();
	refcount_set(&sk->sk_refcnt, 1);
	atomic_set(&sk->sk_drops, 0);
}
EXPORT_SYMBOL(sock_init_data);

/* Acquire the socket (mutex-like) lock; may sleep via __lock_sock(). */
void lock_sock_nested(struct sock *sk, int subclass)
{
	might_sleep();
	spin_lock_bh(&sk->sk_lock.slock);
	if (sk->sk_lock.owned)
		__lock_sock(sk);
	sk->sk_lock.owned = 1;
	spin_unlock(&sk->sk_lock.slock);
	/*
	 * The sk_lock has mutex_lock() semantics here:
	 */
	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
	local_bh_enable();
}
EXPORT_SYMBOL(lock_sock_nested);

/* Release the socket lock, draining the backlog accumulated meanwhile. */
void release_sock(struct sock *sk)
{
	spin_lock_bh(&sk->sk_lock.slock);
	if (sk->sk_backlog.tail)
		__release_sock(sk);

	/* Warning : release_cb() might need to release sk ownership,
	 * ie call sock_release_ownership(sk) before us.
	 */
	if (sk->sk_prot->release_cb)
		sk->sk_prot->release_cb(sk);

	sock_release_ownership(sk);
	if (waitqueue_active(&sk->sk_lock.wq))
		wake_up(&sk->sk_lock.wq);
	spin_unlock_bh(&sk->sk_lock.slock);
}
EXPORT_SYMBOL(release_sock);

/**
 * lock_sock_fast - fast version of lock_sock
 * @sk: socket
 *
 * This version should be used for very small section, where process wont block
 * return false if fast path is taken:
 *
 *   sk_lock.slock locked, owned = 0, BH disabled
 *
 * return true if slow path is taken:
 *
 *   sk_lock.slock unlocked, owned = 1, BH enabled
 */
bool lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
{
	might_sleep();
	spin_lock_bh(&sk->sk_lock.slock);

	if (!sk->sk_lock.owned)
		/*
		 * Note : We must disable BH
		 */
		return false;

	__lock_sock(sk);
	sk->sk_lock.owned = 1;
	spin_unlock(&sk->sk_lock.slock);
	/*
	 * The sk_lock has mutex_lock() semantics here:
	 */
	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
	__acquire(&sk->sk_lock.slock);
	local_bh_enable();
	return true;
}
EXPORT_SYMBOL(lock_sock_fast);

/* Copy the socket's last-packet timestamp to userspace in the requested
 * format (timespec/timeval, 32/64-bit), enabling timestamping on first use.
 */
int sock_gettstamp(struct socket *sock, void __user *userstamp,
		   bool timeval, bool time32)
{
	struct sock *sk = sock->sk;
	struct timespec64 ts;

	sock_enable_timestamp(sk, SOCK_TIMESTAMP);
	ts = ktime_to_timespec64(sock_read_timestamp(sk));
	if (ts.tv_sec == -1)
		return -ENOENT;
	if (ts.tv_sec == 0) {
		/* No timestamp recorded yet: stamp with "now" */
		ktime_t kt = ktime_get_real();
		sock_write_timestamp(sk, kt);
		ts = ktime_to_timespec64(kt);
	}

	if (timeval)
		ts.tv_nsec /= 1000;

#ifdef CONFIG_COMPAT_32BIT_TIME
	if (time32)
		return put_old_timespec32(&ts, userstamp);
#endif
#ifdef CONFIG_SPARC64
	/* beware of padding in sparc64 timeval */
	if (timeval && !in_compat_syscall()) {
		struct __kernel_old_timeval __user tv = {
			.tv_sec = ts.tv_sec,
			.tv_usec = ts.tv_nsec,
		};
		if (copy_to_user(userstamp, &tv, sizeof(tv)))
			return -EFAULT;
		return 0;
	}
#endif
	return put_timespec64(&ts, userstamp);
}
EXPORT_SYMBOL(sock_gettstamp);

/* Set a timestamping flag, enabling netstamp if this is the first one. */
void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
{
	if (!sock_flag(sk, flag)) {
		unsigned long previous_flags = sk->sk_flags;

		sock_set_flag(sk, flag);
		/*
		 * we just set one of the two flags which require net
		 * time stamping, but time stamping might have been on
		 * already because of the other one
		 */
		if (sock_needs_netstamp(sk) &&
		    !(previous_flags & SK_FLAGS_TIMESTAMP))
			net_enable_timestamp();
	}
}

/* Dequeue one skb from the error queue and deliver it to @msg along with
 * its extended-error cmsg.  Returns bytes copied or a negative errno.
 */
int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
		       int level, int type)
{
	struct sock_exterr_skb *serr;
	struct sk_buff *skb;
	int copied, err;

	err = -EAGAIN;
	skb = sock_dequeue_err_skb(sk);
	if (skb == NULL)
		goto out;

	copied = skb->len;
	if (copied > len) {
		msg->msg_flags |= MSG_TRUNC;
		copied = len;
	}
	err = skb_copy_datagram_msg(skb, 0, msg, copied);
	if (err)
		goto out_free_skb;

	sock_recv_timestamp(msg, sk, skb);

	serr = SKB_EXT_ERR(skb);
	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);

	msg->msg_flags |= MSG_ERRQUEUE;
	err = copied;

out_free_skb:
	kfree_skb(skb);
out:
	return err;
}
EXPORT_SYMBOL(sock_recv_errqueue);

/*
 *	Get a socket option on an socket.
 *
 *	FIX: POSIX 1003.1g is very ambiguous here. It states that
 *	asynchronous errors should be reported by getsockopt. We assume
 *	this means if you specify SO_ERROR (otherwise whats the point of it).
 */
int sock_common_getsockopt(struct socket *sock, int level, int optname,
			   char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_getsockopt);

/* Generic recvmsg: delegate to the protocol, then publish the address len. */
int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
			int flags)
{
	struct sock *sk = sock->sk;
	int addr_len = 0;
	int err;

	err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
				   flags & ~MSG_DONTWAIT, &addr_len);
	if (err >= 0)
		msg->msg_namelen = addr_len;
	return err;
}
EXPORT_SYMBOL(sock_common_recvmsg);

/*
 *	Set socket options on an inet socket.
 */
int sock_common_setsockopt(struct socket *sock, int level, int optname,
			   sockptr_t optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;

	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_setsockopt);

/* Common teardown path: destroy protocol state, unhash, orphan, and drop
 * the final (caller-held) reference.
 */
void sk_common_release(struct sock *sk)
{
	if (sk->sk_prot->destroy)
		sk->sk_prot->destroy(sk);

	/*
	 * Observation: when sk_common_release is called, processes have
	 * no access to socket. But net still has.
	 * Step one, detach it from networking:
	 *
	 * A. Remove from hash tables.
	 */

	sk->sk_prot->unhash(sk);

	/*
	 * In this point socket cannot receive new packets, but it is possible
	 * that some packets are in flight because some CPU runs receiver and
	 * did hash table lookup before we unhashed socket. They will achieve
	 * receive queue and will be purged by socket destructor.
	 *
	 * Also we still have packets pending on receive queue and probably,
	 * our own packets waiting in device queues. sock_destroy will drain
	 * receive queue, but transmitted packets will delay socket destruction
	 * until the last reference will be released.
	 */

	sock_orphan(sk);

	xfrm_sk_free_policy(sk);

	sk_refcnt_debug_release(sk);

	sock_put(sk);
}
EXPORT_SYMBOL(sk_common_release);

/* Fill the SK_MEMINFO_VARS-sized array used by sock_diag / /proc. */
void sk_get_meminfo(const struct sock *sk, u32 *mem)
{
	memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);

	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
	mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
	mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
	mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
	mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
	mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
	mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
}

#ifdef CONFIG_PROC_FS
#define PROTO_INUSE_NR	64	/* should be enough for the first time */
struct prot_inuse {
	int val[PROTO_INUSE_NR];
};

static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);

/* Per-cpu, per-netns in-use accounting for one protocol. */
void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
{
	__this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_add);

int sock_prot_inuse_get(struct net *net, struct proto *prot)
{
	int cpu, idx = prot->inuse_idx;
	int res = 0;

	for_each_possible_cpu(cpu)
		res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];

	/* Per-cpu counters can be transiently negative; clamp for readers */
	return res >= 0 ? res : 0;
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_get);

static void sock_inuse_add(struct net *net, int val)
{
	this_cpu_add(*net->core.sock_inuse, val);
}

int sock_inuse_get(struct net *net)
{
	int cpu, res = 0;

	for_each_possible_cpu(cpu)
		res += *per_cpu_ptr(net->core.sock_inuse, cpu);

	return res;
}

EXPORT_SYMBOL_GPL(sock_inuse_get);

static int __net_init sock_inuse_init_net(struct net *net)
{
	net->core.prot_inuse = alloc_percpu(struct prot_inuse);
	if (net->core.prot_inuse == NULL)
		return -ENOMEM;

	net->core.sock_inuse = alloc_percpu(int);
	if (net->core.sock_inuse == NULL)
		goto out;

	return 0;

out:
	free_percpu(net->core.prot_inuse);
	return -ENOMEM;
}

static void __net_exit sock_inuse_exit_net(struct net *net)
{
	free_percpu(net->core.prot_inuse);
	free_percpu(net->core.sock_inuse);
}

static struct pernet_operations net_inuse_ops = {
	.init = sock_inuse_init_net,
	.exit = sock_inuse_exit_net,
};

static __init int net_inuse_init(void)
{
	if (register_pernet_subsys(&net_inuse_ops))
		panic("Cannot initialize net inuse counters");

	return 0;
}

core_initcall(net_inuse_init);

/* Reserve a slot in proto_inuse_idx for a newly-registered protocol.
 * Caller holds proto_list_mutex.
 */
static int assign_proto_idx(struct proto *prot)
{
	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);

	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
		pr_err("PROTO_INUSE_NR exhausted\n");
		return -ENOSPC;
	}

	set_bit(prot->inuse_idx, proto_inuse_idx);
	return 0;
}

static void release_proto_idx(struct proto *prot)
{
	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
		clear_bit(prot->inuse_idx, proto_inuse_idx);
}
#else
static inline int assign_proto_idx(struct proto *prot)
{
	return 0;
}

static inline void release_proto_idx(struct proto *prot)
{
}

static void sock_inuse_add(struct net *net, int val)
{
}
#endif

/* Free the timewait-sock slab and its name; safe on partially-set-up state. */
static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
{
	if (!twsk_prot)
		return;
	kfree(twsk_prot->twsk_slab_name);
	twsk_prot->twsk_slab_name = NULL;
	kmem_cache_destroy(twsk_prot->twsk_slab);
	twsk_prot->twsk_slab = NULL;
}

static int tw_prot_init(const struct proto *prot)
{
	struct timewait_sock_ops *twsk_prot = prot->twsk_prot;

	if (!twsk_prot)
		return 0;

	twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
					      prot->name);
	if (!twsk_prot->twsk_slab_name)
		return -ENOMEM;

	twsk_prot->twsk_slab =
		kmem_cache_create(twsk_prot->twsk_slab_name,
				  twsk_prot->twsk_obj_size, 0,
				  SLAB_ACCOUNT | prot->slab_flags,
				  NULL);
	if (!twsk_prot->twsk_slab) {
		pr_crit("%s: Can't create timewait sock SLAB cache!\n",
			prot->name);
		return -ENOMEM;
	}

	return 0;
}

/* Free the request-sock slab and its name; safe on partially-set-up state. */
static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
{
	if (!rsk_prot)
		return;
	kfree(rsk_prot->slab_name);
	rsk_prot->slab_name = NULL;
	kmem_cache_destroy(rsk_prot->slab);
	rsk_prot->slab = NULL;
}

static int req_prot_init(const struct proto *prot)
{
	struct request_sock_ops *rsk_prot = prot->rsk_prot;

	if (!rsk_prot)
		return 0;

	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
					prot->name);
	if (!rsk_prot->slab_name)
		return -ENOMEM;

	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
					   rsk_prot->obj_size, 0,
					   SLAB_ACCOUNT | prot->slab_flags,
					   NULL);

	if (!rsk_prot->slab) {
		pr_crit("%s: Can't create request sock SLAB cache!\n",
			prot->name);
		return -ENOMEM;
	}
	return 0;
}

/* Register a protocol: optionally create its slab caches, assign an
 * in-use index, and link it onto proto_list.  On any failure every
 * resource acquired so far is released (note the error labels test
 * alloc_slab so the idx-failure path is safe when no slabs were made).
 */
int proto_register(struct proto *prot, int alloc_slab)
{
	int ret = -ENOBUFS;

	if (alloc_slab) {
		prot->slab = kmem_cache_create_usercopy(prot->name,
					prot->obj_size, 0,
					SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
					prot->slab_flags,
					prot->useroffset, prot->usersize,
					NULL);

		if (prot->slab == NULL) {
			pr_crit("%s: Can't create sock SLAB cache!\n",
				prot->name);
			goto out;
		}

		if (req_prot_init(prot))
			goto out_free_request_sock_slab;

		if (tw_prot_init(prot))
			goto out_free_timewait_sock_slab;
	}

	mutex_lock(&proto_list_mutex);
	ret = assign_proto_idx(prot);
	if (ret) {
		mutex_unlock(&proto_list_mutex);
		goto out_free_timewait_sock_slab;
	}
	list_add(&prot->node, &proto_list);
	mutex_unlock(&proto_list_mutex);
	return ret;

out_free_timewait_sock_slab:
	if (alloc_slab)
		tw_prot_cleanup(prot->twsk_prot);
out_free_request_sock_slab:
	if (alloc_slab) {
		req_prot_cleanup(prot->rsk_prot);

		kmem_cache_destroy(prot->slab);
		prot->slab = NULL;
	}
out:
	return ret;
}
EXPORT_SYMBOL(proto_register);

void proto_unregister(struct proto *prot)
{
	mutex_lock(&proto_list_mutex);
	release_proto_idx(prot);
	list_del(&prot->node);
	mutex_unlock(&proto_list_mutex);

	kmem_cache_destroy(prot->slab);
	prot->slab = NULL;

	req_prot_cleanup(prot->rsk_prot);
	tw_prot_cleanup(prot->twsk_prot);
}
EXPORT_SYMBOL(proto_unregister);

/* Request the sock_diag module for @family/@protocol via request_module(). */
int sock_load_diag_module(int family, int protocol)
{
	if (!protocol) {
		if (!sock_is_registered(family))
			return -ENOENT;

		return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
				      NETLINK_SOCK_DIAG, family);
	}

#ifdef CONFIG_INET
	if (family == AF_INET &&
	    protocol != IPPROTO_RAW &&
	    protocol < MAX_INET_PROTOS &&
	    !rcu_access_pointer(inet_protos[protocol]))
		return -ENOENT;
#endif

	return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
			      NETLINK_SOCK_DIAG, family, protocol);
}
EXPORT_SYMBOL(sock_load_diag_module);

#ifdef CONFIG_PROC_FS
/* seq_file iterator over proto_list, held under proto_list_mutex. */
static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(proto_list_mutex)
{
	mutex_lock(&proto_list_mutex);
	return seq_list_start_head(&proto_list, *pos);
}

static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	return seq_list_next(v, &proto_list, pos);
}

static void proto_seq_stop(struct seq_file *seq, void *v)
	__releases(proto_list_mutex)
{
	mutex_unlock(&proto_list_mutex);
}

/* 'y'/'n' marker for the /proc/net/protocols method columns. */
static char proto_method_implemented(const void *method)
{
	return method == NULL ? 'n' : 'y';
}
static long sock_prot_memory_allocated(struct proto *proto)
{
	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
}

static const char *sock_prot_memory_pressure(struct proto *proto)
{
	return proto->memory_pressure != NULL ?
	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
}

static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
{

	seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
		   proto->name,
		   proto->obj_size,
		   sock_prot_inuse_get(seq_file_net(seq), proto),
		   sock_prot_memory_allocated(proto),
		   sock_prot_memory_pressure(proto),
		   proto->max_header,
		   proto->slab == NULL ?
"no" : "yes", 3738 module_name(proto->owner), 3739 proto_method_implemented(proto->close), 3740 proto_method_implemented(proto->connect), 3741 proto_method_implemented(proto->disconnect), 3742 proto_method_implemented(proto->accept), 3743 proto_method_implemented(proto->ioctl), 3744 proto_method_implemented(proto->init), 3745 proto_method_implemented(proto->destroy), 3746 proto_method_implemented(proto->shutdown), 3747 proto_method_implemented(proto->setsockopt), 3748 proto_method_implemented(proto->getsockopt), 3749 proto_method_implemented(proto->sendmsg), 3750 proto_method_implemented(proto->recvmsg), 3751 proto_method_implemented(proto->sendpage), 3752 proto_method_implemented(proto->bind), 3753 proto_method_implemented(proto->backlog_rcv), 3754 proto_method_implemented(proto->hash), 3755 proto_method_implemented(proto->unhash), 3756 proto_method_implemented(proto->get_port), 3757 proto_method_implemented(proto->enter_memory_pressure)); 3758 } 3759 3760 static int proto_seq_show(struct seq_file *seq, void *v) 3761 { 3762 if (v == &proto_list) 3763 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s", 3764 "protocol", 3765 "size", 3766 "sockets", 3767 "memory", 3768 "press", 3769 "maxhdr", 3770 "slab", 3771 "module", 3772 "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n"); 3773 else 3774 proto_seq_printf(seq, list_entry(v, struct proto, node)); 3775 return 0; 3776 } 3777 3778 static const struct seq_operations proto_seq_ops = { 3779 .start = proto_seq_start, 3780 .next = proto_seq_next, 3781 .stop = proto_seq_stop, 3782 .show = proto_seq_show, 3783 }; 3784 3785 static __net_init int proto_init_net(struct net *net) 3786 { 3787 if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops, 3788 sizeof(struct seq_net_private))) 3789 return -ENOMEM; 3790 3791 return 0; 3792 } 3793 3794 static __net_exit void proto_exit_net(struct net *net) 3795 { 3796 remove_proc_entry("protocols", net->proc_net); 3797 } 3798 3799 3800 static 
__net_initdata struct pernet_operations proto_net_ops = { 3801 .init = proto_init_net, 3802 .exit = proto_exit_net, 3803 }; 3804 3805 static int __init proto_init(void) 3806 { 3807 return register_pernet_subsys(&proto_net_ops); 3808 } 3809 3810 subsys_initcall(proto_init); 3811 3812 #endif /* PROC_FS */ 3813 3814 #ifdef CONFIG_NET_RX_BUSY_POLL 3815 bool sk_busy_loop_end(void *p, unsigned long start_time) 3816 { 3817 struct sock *sk = p; 3818 3819 return !skb_queue_empty_lockless(&sk->sk_receive_queue) || 3820 sk_busy_loop_timeout(sk, start_time); 3821 } 3822 EXPORT_SYMBOL(sk_busy_loop_end); 3823 #endif /* CONFIG_NET_RX_BUSY_POLL */ 3824 3825 int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len) 3826 { 3827 if (!sk->sk_prot->bind_add) 3828 return -EOPNOTSUPP; 3829 return sk->sk_prot->bind_add(sk, addr, addr_len); 3830 } 3831 EXPORT_SYMBOL(sock_bind_add); 3832