1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * INET An implementation of the TCP/IP protocol suite for the LINUX 4 * operating system. INET is implemented using the BSD Socket 5 * interface as the means of communication with the user level. 6 * 7 * Generic socket support routines. Memory allocators, socket lock/release 8 * handler for protocols to use and generic option handler. 9 * 10 * Authors: Ross Biro 11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 12 * Florian La Roche, <flla@stud.uni-sb.de> 13 * Alan Cox, <A.Cox@swansea.ac.uk> 14 * 15 * Fixes: 16 * Alan Cox : Numerous verify_area() problems 17 * Alan Cox : Connecting on a connecting socket 18 * now returns an error for tcp. 19 * Alan Cox : sock->protocol is set correctly. 20 * and is not sometimes left as 0. 21 * Alan Cox : connect handles icmp errors on a 22 * connect properly. Unfortunately there 23 * is a restart syscall nasty there. I 24 * can't match BSD without hacking the C 25 * library. Ideas urgently sought! 26 * Alan Cox : Disallow bind() to addresses that are 27 * not ours - especially broadcast ones!! 28 * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost) 29 * Alan Cox : sock_wfree/sock_rfree don't destroy sockets, 30 * instead they leave that for the DESTROY timer. 31 * Alan Cox : Clean up error flag in accept 32 * Alan Cox : TCP ack handling is buggy, the DESTROY timer 33 * was buggy. Put a remove_sock() in the handler 34 * for memory when we hit 0. Also altered the timer 35 * code. The ACK stuff can wait and needs major 36 * TCP layer surgery. 37 * Alan Cox : Fixed TCP ack bug, removed remove sock 38 * and fixed timer/inet_bh race. 39 * Alan Cox : Added zapped flag for TCP 40 * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code 41 * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb 42 * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources 43 * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing. 44 * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so... 45 * Rick Sladkey : Relaxed UDP rules for matching packets. 46 * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support 47 * Pauline Middelink : identd support 48 * Alan Cox : Fixed connect() taking signals I think. 49 * Alan Cox : SO_LINGER supported 50 * Alan Cox : Error reporting fixes 51 * Anonymous : inet_create tidied up (sk->reuse setting) 52 * Alan Cox : inet sockets don't set sk->type! 53 * Alan Cox : Split socket option code 54 * Alan Cox : Callbacks 55 * Alan Cox : Nagle flag for Charles & Johannes stuff 56 * Alex : Removed restriction on inet fioctl 57 * Alan Cox : Splitting INET from NET core 58 * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt() 59 * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code 60 * Alan Cox : Split IP from generic code 61 * Alan Cox : New kfree_skbmem() 62 * Alan Cox : Make SO_DEBUG superuser only. 63 * Alan Cox : Allow anyone to clear SO_DEBUG 64 * (compatibility fix) 65 * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput. 66 * Alan Cox : Allocator for a socket is settable. 67 * Alan Cox : SO_ERROR includes soft errors. 68 * Alan Cox : Allow NULL arguments on some SO_ opts 69 * Alan Cox : Generic socket allocation to make hooks 70 * easier (suggested by Craig Metz). 71 * Michael Pall : SO_ERROR returns positive errno again 72 * Steve Whitehouse: Added default destructor to free 73 * protocol private data. 74 * Steve Whitehouse: Added various other default routines 75 * common to several socket families. 76 * Chris Evans : Call suser() check last on F_SETOWN 77 * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER. 78 * Andi Kleen : Add sock_kmalloc()/sock_kfree_s() 79 * Andi Kleen : Fix write_space callback 80 * Chris Evans : Security fixes - signedness again 81 * Arnaldo C. Melo : cleanups, use skb_queue_purge 82 * 83 * To Fix: 84 */ 85 86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 87 88 #include <asm/unaligned.h> 89 #include <linux/capability.h> 90 #include <linux/errno.h> 91 #include <linux/errqueue.h> 92 #include <linux/types.h> 93 #include <linux/socket.h> 94 #include <linux/in.h> 95 #include <linux/kernel.h> 96 #include <linux/module.h> 97 #include <linux/proc_fs.h> 98 #include <linux/seq_file.h> 99 #include <linux/sched.h> 100 #include <linux/sched/mm.h> 101 #include <linux/timer.h> 102 #include <linux/string.h> 103 #include <linux/sockios.h> 104 #include <linux/net.h> 105 #include <linux/mm.h> 106 #include <linux/slab.h> 107 #include <linux/interrupt.h> 108 #include <linux/poll.h> 109 #include <linux/tcp.h> 110 #include <linux/udp.h> 111 #include <linux/init.h> 112 #include <linux/highmem.h> 113 #include <linux/user_namespace.h> 114 #include <linux/static_key.h> 115 #include <linux/memcontrol.h> 116 #include <linux/prefetch.h> 117 #include <linux/compat.h> 118 #include <linux/mroute.h> 119 #include <linux/mroute6.h> 120 #include <linux/icmpv6.h> 121 122 #include <linux/uaccess.h> 123 124 #include <linux/netdevice.h> 125 #include <net/protocol.h> 126 #include <linux/skbuff.h> 127 #include <net/net_namespace.h> 128 #include <net/request_sock.h> 129 #include <net/sock.h> 130 #include <net/proto_memory.h> 131 #include <linux/net_tstamp.h> 132 #include <net/xfrm.h> 133 #include <linux/ipsec.h> 134 #include <net/cls_cgroup.h> 135 #include <net/netprio_cgroup.h> 136 #include <linux/sock_diag.h> 137 138 #include <linux/filter.h> 139 #include <net/sock_reuseport.h> 140 #include <net/bpf_sk_storage.h> 141 142 #include <trace/events/sock.h> 143 144 #include <net/tcp.h> 145 #include <net/busy_poll.h> 146 #include <net/phonet/phonet.h> 147 148 #include <linux/ethtool.h> 149 150 #include "dev.h" 151 152 static DEFINE_MUTEX(proto_list_mutex); 153 static LIST_HEAD(proto_list); 154 155 static void sock_def_write_space_wfree(struct sock *sk); 156 static void sock_def_write_space(struct sock *sk); 157 158 /** 159 * sk_ns_capable - General socket capability test 160 * @sk: Socket to use a capability on or through 161 * @user_ns: The user namespace of the capability to use 162 * @cap: The capability to use 163 * 164 * Test to see if the opener of the socket had when the socket was 165 * created and the current process has the capability @cap in the user 166 * namespace @user_ns. 167 */ 168 bool sk_ns_capable(const struct sock *sk, 169 struct user_namespace *user_ns, int cap) 170 { 171 return file_ns_capable(sk->sk_socket->file, user_ns, cap) && 172 ns_capable(user_ns, cap); 173 } 174 EXPORT_SYMBOL(sk_ns_capable); 175 176 /** 177 * sk_capable - Socket global capability test 178 * @sk: Socket to use a capability on or through 179 * @cap: The global capability to use 180 * 181 * Test to see if the opener of the socket had when the socket was 182 * created and the current process has the capability @cap in all user 183 * namespaces. 184 */ 185 bool sk_capable(const struct sock *sk, int cap) 186 { 187 return sk_ns_capable(sk, &init_user_ns, cap); 188 } 189 EXPORT_SYMBOL(sk_capable); 190 191 /** 192 * sk_net_capable - Network namespace socket capability test 193 * @sk: Socket to use a capability on or through 194 * @cap: The capability to use 195 * 196 * Test to see if the opener of the socket had when the socket was created 197 * and the current process has the capability @cap over the network namespace 198 * the socket is a member of. 199 */ 200 bool sk_net_capable(const struct sock *sk, int cap) 201 { 202 return sk_ns_capable(sk, sock_net(sk)->user_ns, cap); 203 } 204 EXPORT_SYMBOL(sk_net_capable); 205 206 /* 207 * Each address family might have different locking rules, so we have 208 * one slock key per address family and separate keys for internal and 209 * userspace sockets. 210 */ 211 static struct lock_class_key af_family_keys[AF_MAX]; 212 static struct lock_class_key af_family_kern_keys[AF_MAX]; 213 static struct lock_class_key af_family_slock_keys[AF_MAX]; 214 static struct lock_class_key af_family_kern_slock_keys[AF_MAX]; 215 216 /* 217 * Make lock validator output more readable. (we pre-construct these 218 * strings build-time, so that runtime initialization of socket 219 * locks is fast): 220 */ 221 222 #define _sock_locks(x) \ 223 x "AF_UNSPEC", x "AF_UNIX" , x "AF_INET" , \ 224 x "AF_AX25" , x "AF_IPX" , x "AF_APPLETALK", \ 225 x "AF_NETROM", x "AF_BRIDGE" , x "AF_ATMPVC" , \ 226 x "AF_X25" , x "AF_INET6" , x "AF_ROSE" , \ 227 x "AF_DECnet", x "AF_NETBEUI" , x "AF_SECURITY" , \ 228 x "AF_KEY" , x "AF_NETLINK" , x "AF_PACKET" , \ 229 x "AF_ASH" , x "AF_ECONET" , x "AF_ATMSVC" , \ 230 x "AF_RDS" , x "AF_SNA" , x "AF_IRDA" , \ 231 x "AF_PPPOX" , x "AF_WANPIPE" , x "AF_LLC" , \ 232 x "27" , x "28" , x "AF_CAN" , \ 233 x "AF_TIPC" , x "AF_BLUETOOTH", x "IUCV" , \ 234 x "AF_RXRPC" , x "AF_ISDN" , x "AF_PHONET" , \ 235 x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \ 236 x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \ 237 x "AF_QIPCRTR", x "AF_SMC" , x "AF_XDP" , \ 238 x "AF_MCTP" , \ 239 x "AF_MAX" 240 241 static const char *const af_family_key_strings[AF_MAX+1] = { 242 _sock_locks("sk_lock-") 243 }; 244 static const char *const af_family_slock_key_strings[AF_MAX+1] = { 245 _sock_locks("slock-") 246 }; 247 static const char *const af_family_clock_key_strings[AF_MAX+1] = { 248 _sock_locks("clock-") 249 }; 250 251 static const char *const af_family_kern_key_strings[AF_MAX+1] = { 252 _sock_locks("k-sk_lock-") 253 }; 254 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = { 255 _sock_locks("k-slock-") 256 }; 257 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = { 258 _sock_locks("k-clock-") 259 }; 260 static const char *const af_family_rlock_key_strings[AF_MAX+1] = { 261 _sock_locks("rlock-") 262 }; 263 static const char *const af_family_wlock_key_strings[AF_MAX+1] = { 264 _sock_locks("wlock-") 265 }; 266 static const char *const af_family_elock_key_strings[AF_MAX+1] = { 267 _sock_locks("elock-") 268 }; 269 270 /* 271 * sk_callback_lock and sk queues locking rules are per-address-family, 272 * so split the lock classes by using a per-AF key: 273 */ 274 static struct lock_class_key af_callback_keys[AF_MAX]; 275 static struct lock_class_key af_rlock_keys[AF_MAX]; 276 static struct lock_class_key af_wlock_keys[AF_MAX]; 277 static struct lock_class_key af_elock_keys[AF_MAX]; 278 static struct lock_class_key af_kern_callback_keys[AF_MAX]; 279 280 /* Run time adjustable parameters. */ 281 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX; 282 EXPORT_SYMBOL(sysctl_wmem_max); 283 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX; 284 EXPORT_SYMBOL(sysctl_rmem_max); 285 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX; 286 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX; 287 288 int sysctl_tstamp_allow_data __read_mostly = 1; 289 290 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key); 291 EXPORT_SYMBOL_GPL(memalloc_socks_key); 292 293 /** 294 * sk_set_memalloc - sets %SOCK_MEMALLOC 295 * @sk: socket to set it on 296 * 297 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves. 298 * It's the responsibility of the admin to adjust min_free_kbytes 299 * to meet the requirements 300 */ 301 void sk_set_memalloc(struct sock *sk) 302 { 303 sock_set_flag(sk, SOCK_MEMALLOC); 304 sk->sk_allocation |= __GFP_MEMALLOC; 305 static_branch_inc(&memalloc_socks_key); 306 } 307 EXPORT_SYMBOL_GPL(sk_set_memalloc); 308 309 void sk_clear_memalloc(struct sock *sk) 310 { 311 sock_reset_flag(sk, SOCK_MEMALLOC); 312 sk->sk_allocation &= ~__GFP_MEMALLOC; 313 static_branch_dec(&memalloc_socks_key); 314 315 /* 316 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward 317 * progress of swapping. SOCK_MEMALLOC may be cleared while 318 * it has rmem allocations due to the last swapfile being deactivated 319 * but there is a risk that the socket is unusable due to exceeding 320 * the rmem limits. Reclaim the reserves and obey rmem limits again. 321 */ 322 sk_mem_reclaim(sk); 323 } 324 EXPORT_SYMBOL_GPL(sk_clear_memalloc); 325 326 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) 327 { 328 int ret; 329 unsigned int noreclaim_flag; 330 331 /* these should have been dropped before queueing */ 332 BUG_ON(!sock_flag(sk, SOCK_MEMALLOC)); 333 334 noreclaim_flag = memalloc_noreclaim_save(); 335 ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv, 336 tcp_v6_do_rcv, 337 tcp_v4_do_rcv, 338 sk, skb); 339 memalloc_noreclaim_restore(noreclaim_flag); 340 341 return ret; 342 } 343 EXPORT_SYMBOL(__sk_backlog_rcv); 344 345 void sk_error_report(struct sock *sk) 346 { 347 sk->sk_error_report(sk); 348 349 switch (sk->sk_family) { 350 case AF_INET: 351 fallthrough; 352 case AF_INET6: 353 trace_inet_sk_error_report(sk); 354 break; 355 default: 356 break; 357 } 358 } 359 EXPORT_SYMBOL(sk_error_report); 360 361 int sock_get_timeout(long timeo, void *optval, bool old_timeval) 362 { 363 struct __kernel_sock_timeval tv; 364 365 if (timeo == MAX_SCHEDULE_TIMEOUT) { 366 tv.tv_sec = 0; 367 tv.tv_usec = 0; 368 } else { 369 tv.tv_sec = timeo / HZ; 370 tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ; 371 } 372 373 if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) { 374 struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec }; 375 *(struct old_timeval32 *)optval = tv32; 376 return sizeof(tv32); 377 } 378 379 if (old_timeval) { 380 struct __kernel_old_timeval old_tv; 381 old_tv.tv_sec = tv.tv_sec; 382 old_tv.tv_usec = tv.tv_usec; 383 *(struct __kernel_old_timeval *)optval = old_tv; 384 return sizeof(old_tv); 385 } 386 387 *(struct __kernel_sock_timeval *)optval = tv; 388 return sizeof(tv); 389 } 390 EXPORT_SYMBOL(sock_get_timeout); 391 392 int sock_copy_user_timeval(struct __kernel_sock_timeval *tv, 393 sockptr_t optval, int optlen, bool old_timeval) 394 { 395 if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) { 396 struct old_timeval32 tv32; 397 398 if (optlen < sizeof(tv32)) 399 return -EINVAL; 400 401 if (copy_from_sockptr(&tv32, optval, sizeof(tv32))) 402 return -EFAULT; 403 tv->tv_sec = tv32.tv_sec; 404 tv->tv_usec = tv32.tv_usec; 405 } else if (old_timeval) { 406 struct __kernel_old_timeval old_tv; 407 408 if (optlen < sizeof(old_tv)) 409 return -EINVAL; 410 if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv))) 411 return -EFAULT; 412 tv->tv_sec = old_tv.tv_sec; 413 tv->tv_usec = old_tv.tv_usec; 414 } else { 415 if (optlen < sizeof(*tv)) 416 return -EINVAL; 417 if (copy_from_sockptr(tv, optval, sizeof(*tv))) 418 return -EFAULT; 419 } 420 421 return 0; 422 } 423 EXPORT_SYMBOL(sock_copy_user_timeval); 424 425 static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen, 426 bool old_timeval) 427 { 428 struct __kernel_sock_timeval tv; 429 int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval); 430 long val; 431 432 if (err) 433 return err; 434 435 if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC) 436 return -EDOM; 437 438 if (tv.tv_sec < 0) { 439 static int warned __read_mostly; 440 441 WRITE_ONCE(*timeo_p, 0); 442 if (warned < 10 && net_ratelimit()) { 443 warned++; 444 pr_info("%s: `%s' (pid %d) tries to set negative timeout\n", 445 __func__, current->comm, task_pid_nr(current)); 446 } 447 return 0; 448 } 449 val = MAX_SCHEDULE_TIMEOUT; 450 if ((tv.tv_sec || tv.tv_usec) && 451 (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))) 452 val = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, 453 USEC_PER_SEC / HZ); 454 WRITE_ONCE(*timeo_p, val); 455 return 0; 456 } 457 458 static bool sock_needs_netstamp(const struct sock *sk) 459 { 460 switch (sk->sk_family) { 461 case AF_UNSPEC: 462 case AF_UNIX: 463 return false; 464 default: 465 return true; 466 } 467 } 468 469 static void sock_disable_timestamp(struct sock *sk, unsigned long flags) 470 { 471 if (sk->sk_flags & flags) { 472 sk->sk_flags &= ~flags; 473 if (sock_needs_netstamp(sk) && 474 !(sk->sk_flags & SK_FLAGS_TIMESTAMP)) 475 net_disable_timestamp(); 476 } 477 } 478 479 480 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) 481 { 482 unsigned long flags; 483 struct sk_buff_head *list = &sk->sk_receive_queue; 484 485 if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) { 486 atomic_inc(&sk->sk_drops); 487 trace_sock_rcvqueue_full(sk, skb); 488 return -ENOMEM; 489 } 490 491 if (!sk_rmem_schedule(sk, skb, skb->truesize)) { 492 atomic_inc(&sk->sk_drops); 493 return -ENOBUFS; 494 } 495 496 skb->dev = NULL; 497 skb_set_owner_r(skb, sk); 498 499 /* we escape from rcu protected region, make sure we dont leak 500 * a norefcounted dst 501 */ 502 skb_dst_force(skb); 503 504 spin_lock_irqsave(&list->lock, flags); 505 sock_skb_set_dropcount(sk, skb); 506 __skb_queue_tail(list, skb); 507 spin_unlock_irqrestore(&list->lock, flags); 508 509 if (!sock_flag(sk, SOCK_DEAD)) 510 sk->sk_data_ready(sk); 511 return 0; 512 } 513 EXPORT_SYMBOL(__sock_queue_rcv_skb); 514 515 int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb, 516 enum skb_drop_reason *reason) 517 { 518 enum skb_drop_reason drop_reason; 519 int err; 520 521 err = sk_filter(sk, skb); 522 if (err) { 523 drop_reason = SKB_DROP_REASON_SOCKET_FILTER; 524 goto out; 525 } 526 err = __sock_queue_rcv_skb(sk, skb); 527 switch (err) { 528 case -ENOMEM: 529 drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF; 530 break; 531 case -ENOBUFS: 532 drop_reason = SKB_DROP_REASON_PROTO_MEM; 533 break; 534 default: 535 drop_reason = SKB_NOT_DROPPED_YET; 536 break; 537 } 538 out: 539 if (reason) 540 *reason = drop_reason; 541 return err; 542 } 543 EXPORT_SYMBOL(sock_queue_rcv_skb_reason); 544 545 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, 546 const int nested, unsigned int trim_cap, bool refcounted) 547 { 548 int rc = NET_RX_SUCCESS; 549 550 if (sk_filter_trim_cap(sk, skb, trim_cap)) 551 goto discard_and_relse; 552 553 skb->dev = NULL; 554 555 if (sk_rcvqueues_full(sk, READ_ONCE(sk->sk_rcvbuf))) { 556 atomic_inc(&sk->sk_drops); 557 goto discard_and_relse; 558 } 559 if (nested) 560 bh_lock_sock_nested(sk); 561 else 562 bh_lock_sock(sk); 563 if (!sock_owned_by_user(sk)) { 564 /* 565 * trylock + unlock semantics: 566 */ 567 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_); 568 569 rc = sk_backlog_rcv(sk, skb); 570 571 mutex_release(&sk->sk_lock.dep_map, _RET_IP_); 572 } else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) { 573 bh_unlock_sock(sk); 574 atomic_inc(&sk->sk_drops); 575 goto discard_and_relse; 576 } 577 578 bh_unlock_sock(sk); 579 out: 580 if (refcounted) 581 sock_put(sk); 582 return rc; 583 discard_and_relse: 584 kfree_skb(skb); 585 goto out; 586 } 587 EXPORT_SYMBOL(__sk_receive_skb); 588 589 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *, 590 u32)); 591 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, 592 u32)); 593 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie) 594 { 595 struct dst_entry *dst = __sk_dst_get(sk); 596 597 if (dst && dst->obsolete && 598 INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check, 599 dst, cookie) == NULL) { 600 sk_tx_queue_clear(sk); 601 WRITE_ONCE(sk->sk_dst_pending_confirm, 0); 602 RCU_INIT_POINTER(sk->sk_dst_cache, NULL); 603 dst_release(dst); 604 return NULL; 605 } 606 607 return dst; 608 } 609 EXPORT_SYMBOL(__sk_dst_check); 610 611 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie) 612 { 613 struct dst_entry *dst = sk_dst_get(sk); 614 615 if (dst && dst->obsolete && 616 INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check, 617 dst, cookie) == NULL) { 618 sk_dst_reset(sk); 619 dst_release(dst); 620 return NULL; 621 } 622 623 return dst; 624 } 625 EXPORT_SYMBOL(sk_dst_check); 626 627 static int sock_bindtoindex_locked(struct sock *sk, int ifindex) 628 { 629 int ret = -ENOPROTOOPT; 630 #ifdef CONFIG_NETDEVICES 631 struct net *net = sock_net(sk); 632 633 /* Sorry... */ 634 ret = -EPERM; 635 if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW)) 636 goto out; 637 638 ret = -EINVAL; 639 if (ifindex < 0) 640 goto out; 641 642 /* Paired with all READ_ONCE() done locklessly. */ 643 WRITE_ONCE(sk->sk_bound_dev_if, ifindex); 644 645 if (sk->sk_prot->rehash) 646 sk->sk_prot->rehash(sk); 647 sk_dst_reset(sk); 648 649 ret = 0; 650 651 out: 652 #endif 653 654 return ret; 655 } 656 657 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk) 658 { 659 int ret; 660 661 if (lock_sk) 662 lock_sock(sk); 663 ret = sock_bindtoindex_locked(sk, ifindex); 664 if (lock_sk) 665 release_sock(sk); 666 667 return ret; 668 } 669 EXPORT_SYMBOL(sock_bindtoindex); 670 671 static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen) 672 { 673 int ret = -ENOPROTOOPT; 674 #ifdef CONFIG_NETDEVICES 675 struct net *net = sock_net(sk); 676 char devname[IFNAMSIZ]; 677 int index; 678 679 ret = -EINVAL; 680 if (optlen < 0) 681 goto out; 682 683 /* Bind this socket to a particular device like "eth0", 684 * as specified in the passed interface name. If the 685 * name is "" or the option length is zero the socket 686 * is not bound. 687 */ 688 if (optlen > IFNAMSIZ - 1) 689 optlen = IFNAMSIZ - 1; 690 memset(devname, 0, sizeof(devname)); 691 692 ret = -EFAULT; 693 if (copy_from_sockptr(devname, optval, optlen)) 694 goto out; 695 696 index = 0; 697 if (devname[0] != '\0') { 698 struct net_device *dev; 699 700 rcu_read_lock(); 701 dev = dev_get_by_name_rcu(net, devname); 702 if (dev) 703 index = dev->ifindex; 704 rcu_read_unlock(); 705 ret = -ENODEV; 706 if (!dev) 707 goto out; 708 } 709 710 sockopt_lock_sock(sk); 711 ret = sock_bindtoindex_locked(sk, index); 712 sockopt_release_sock(sk); 713 out: 714 #endif 715 716 return ret; 717 } 718 719 static int sock_getbindtodevice(struct sock *sk, sockptr_t optval, 720 sockptr_t optlen, int len) 721 { 722 int ret = -ENOPROTOOPT; 723 #ifdef CONFIG_NETDEVICES 724 int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); 725 struct net *net = sock_net(sk); 726 char devname[IFNAMSIZ]; 727 728 if (bound_dev_if == 0) { 729 len = 0; 730 goto zero; 731 } 732 733 ret = -EINVAL; 734 if (len < IFNAMSIZ) 735 goto out; 736 737 ret = netdev_get_name(net, devname, bound_dev_if); 738 if (ret) 739 goto out; 740 741 len = strlen(devname) + 1; 742 743 ret = -EFAULT; 744 if (copy_to_sockptr(optval, devname, len)) 745 goto out; 746 747 zero: 748 ret = -EFAULT; 749 if (copy_to_sockptr(optlen, &len, sizeof(int))) 750 goto out; 751 752 ret = 0; 753 754 out: 755 #endif 756 757 return ret; 758 } 759 760 bool sk_mc_loop(const struct sock *sk) 761 { 762 if (dev_recursion_level()) 763 return false; 764 if (!sk) 765 return true; 766 /* IPV6_ADDRFORM can change sk->sk_family under us. */ 767 switch (READ_ONCE(sk->sk_family)) { 768 case AF_INET: 769 return inet_test_bit(MC_LOOP, sk); 770 #if IS_ENABLED(CONFIG_IPV6) 771 case AF_INET6: 772 return inet6_test_bit(MC6_LOOP, sk); 773 #endif 774 } 775 WARN_ON_ONCE(1); 776 return true; 777 } 778 EXPORT_SYMBOL(sk_mc_loop); 779 780 void sock_set_reuseaddr(struct sock *sk) 781 { 782 lock_sock(sk); 783 sk->sk_reuse = SK_CAN_REUSE; 784 release_sock(sk); 785 } 786 EXPORT_SYMBOL(sock_set_reuseaddr); 787 788 void sock_set_reuseport(struct sock *sk) 789 { 790 lock_sock(sk); 791 sk->sk_reuseport = true; 792 release_sock(sk); 793 } 794 EXPORT_SYMBOL(sock_set_reuseport); 795 796 void sock_no_linger(struct sock *sk) 797 { 798 lock_sock(sk); 799 WRITE_ONCE(sk->sk_lingertime, 0); 800 sock_set_flag(sk, SOCK_LINGER); 801 release_sock(sk); 802 } 803 EXPORT_SYMBOL(sock_no_linger); 804 805 void sock_set_priority(struct sock *sk, u32 priority) 806 { 807 WRITE_ONCE(sk->sk_priority, priority); 808 } 809 EXPORT_SYMBOL(sock_set_priority); 810 811 void sock_set_sndtimeo(struct sock *sk, s64 secs) 812 { 813 lock_sock(sk); 814 if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1) 815 WRITE_ONCE(sk->sk_sndtimeo, secs * HZ); 816 else 817 WRITE_ONCE(sk->sk_sndtimeo, MAX_SCHEDULE_TIMEOUT); 818 release_sock(sk); 819 } 820 EXPORT_SYMBOL(sock_set_sndtimeo); 821 822 static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns) 823 { 824 if (val) { 825 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new); 826 sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns); 827 sock_set_flag(sk, SOCK_RCVTSTAMP); 828 sock_enable_timestamp(sk, SOCK_TIMESTAMP); 829 } else { 830 sock_reset_flag(sk, SOCK_RCVTSTAMP); 831 sock_reset_flag(sk, SOCK_RCVTSTAMPNS); 832 } 833 } 834 835 void sock_enable_timestamps(struct sock *sk) 836 { 837 lock_sock(sk); 838 __sock_set_timestamps(sk, true, false, true); 839 release_sock(sk); 840 } 841 EXPORT_SYMBOL(sock_enable_timestamps); 842 843 void sock_set_timestamp(struct sock *sk, int optname, bool valbool) 844 { 845 switch (optname) { 846 case SO_TIMESTAMP_OLD: 847 __sock_set_timestamps(sk, valbool, false, false); 848 break; 849 case SO_TIMESTAMP_NEW: 850 __sock_set_timestamps(sk, valbool, true, false); 851 break; 852 case SO_TIMESTAMPNS_OLD: 853 __sock_set_timestamps(sk, valbool, false, true); 854 break; 855 case SO_TIMESTAMPNS_NEW: 856 __sock_set_timestamps(sk, valbool, true, true); 857 break; 858 } 859 } 860 861 static int sock_timestamping_bind_phc(struct sock *sk, int phc_index) 862 { 863 struct net *net = sock_net(sk); 864 struct net_device *dev = NULL; 865 bool match = false; 866 int *vclock_index; 867 int i, num; 868 869 if (sk->sk_bound_dev_if) 870 dev = dev_get_by_index(net, sk->sk_bound_dev_if); 871 872 if (!dev) { 873 pr_err("%s: sock not bind to device\n", __func__); 874 return -EOPNOTSUPP; 875 } 876 877 num = ethtool_get_phc_vclocks(dev, &vclock_index); 878 dev_put(dev); 879 880 for (i = 0; i < num; i++) { 881 if (*(vclock_index + i) == phc_index) { 882 match = true; 883 break; 884 } 885 } 886 887 if (num > 0) 888 kfree(vclock_index); 889 890 if (!match) 891 return -EINVAL; 892 893 WRITE_ONCE(sk->sk_bind_phc, phc_index); 894 895 return 0; 896 } 897 898 int sock_set_timestamping(struct sock *sk, int optname, 899 struct so_timestamping timestamping) 900 { 901 int val = timestamping.flags; 902 int ret; 903 904 if (val & ~SOF_TIMESTAMPING_MASK) 905 return -EINVAL; 906 907 if (val & SOF_TIMESTAMPING_OPT_ID_TCP && 908 !(val & SOF_TIMESTAMPING_OPT_ID)) 909 return -EINVAL; 910 911 if (val & SOF_TIMESTAMPING_OPT_ID && 912 !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) { 913 if (sk_is_tcp(sk)) { 914 if ((1 << sk->sk_state) & 915 (TCPF_CLOSE | TCPF_LISTEN)) 916 return -EINVAL; 917 if (val & SOF_TIMESTAMPING_OPT_ID_TCP) 918 atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq); 919 else 920 atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una); 921 } else { 922 atomic_set(&sk->sk_tskey, 0); 923 } 924 } 925 926 if (val & SOF_TIMESTAMPING_OPT_STATS && 927 !(val & SOF_TIMESTAMPING_OPT_TSONLY)) 928 return -EINVAL; 929 930 if (val & SOF_TIMESTAMPING_BIND_PHC) { 931 ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc); 932 if (ret) 933 return ret; 934 } 935 936 WRITE_ONCE(sk->sk_tsflags, val); 937 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW); 938 939 if (val & SOF_TIMESTAMPING_RX_SOFTWARE) 940 sock_enable_timestamp(sk, 941 SOCK_TIMESTAMPING_RX_SOFTWARE); 942 else 943 sock_disable_timestamp(sk, 944 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)); 945 return 0; 946 } 947 948 void sock_set_keepalive(struct sock *sk) 949 { 950 lock_sock(sk); 951 if (sk->sk_prot->keepalive) 952 sk->sk_prot->keepalive(sk, true); 953 sock_valbool_flag(sk, SOCK_KEEPOPEN, true); 954 release_sock(sk); 955 } 956 EXPORT_SYMBOL(sock_set_keepalive); 957 958 static void __sock_set_rcvbuf(struct sock *sk, int val) 959 { 960 /* Ensure val * 2 fits into an int, to prevent max_t() from treating it 961 * as a negative value. 962 */ 963 val = min_t(int, val, INT_MAX / 2); 964 sk->sk_userlocks |= SOCK_RCVBUF_LOCK; 965 966 /* We double it on the way in to account for "struct sk_buff" etc. 967 * overhead. Applications assume that the SO_RCVBUF setting they make 968 * will allow that much actual data to be received on that socket. 969 * 970 * Applications are unaware that "struct sk_buff" and other overheads 971 * allocate from the receive buffer during socket buffer allocation. 972 * 973 * And after considering the possible alternatives, returning the value 974 * we actually used in getsockopt is the most desirable behavior. 975 */ 976 WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF)); 977 } 978 979 void sock_set_rcvbuf(struct sock *sk, int val) 980 { 981 lock_sock(sk); 982 __sock_set_rcvbuf(sk, val); 983 release_sock(sk); 984 } 985 EXPORT_SYMBOL(sock_set_rcvbuf); 986 987 static void __sock_set_mark(struct sock *sk, u32 val) 988 { 989 if (val != sk->sk_mark) { 990 WRITE_ONCE(sk->sk_mark, val); 991 sk_dst_reset(sk); 992 } 993 } 994 995 void sock_set_mark(struct sock *sk, u32 val) 996 { 997 lock_sock(sk); 998 __sock_set_mark(sk, val); 999 release_sock(sk); 1000 } 1001 EXPORT_SYMBOL(sock_set_mark); 1002 1003 static void sock_release_reserved_memory(struct sock *sk, int bytes) 1004 { 1005 /* Round down bytes to multiple of pages */ 1006 bytes = round_down(bytes, PAGE_SIZE); 1007 1008 WARN_ON(bytes > sk->sk_reserved_mem); 1009 WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem - bytes); 1010 sk_mem_reclaim(sk); 1011 } 1012 1013 static int sock_reserve_memory(struct sock *sk, int bytes) 1014 { 1015 long allocated; 1016 bool charged; 1017 int pages; 1018 1019 if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk)) 1020 return -EOPNOTSUPP; 1021 1022 if (!bytes) 1023 return 0; 1024 1025 pages = sk_mem_pages(bytes); 1026 1027 /* pre-charge to memcg */ 1028 charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages, 1029 GFP_KERNEL | __GFP_RETRY_MAYFAIL); 1030 if (!charged) 1031 return -ENOMEM; 1032 1033 /* pre-charge to forward_alloc */ 1034 sk_memory_allocated_add(sk, pages); 1035 allocated = sk_memory_allocated(sk); 1036 /* If the system goes into memory pressure with this 1037 * precharge, give up and return error. 1038 */ 1039 if (allocated > sk_prot_mem_limits(sk, 1)) { 1040 sk_memory_allocated_sub(sk, pages); 1041 mem_cgroup_uncharge_skmem(sk->sk_memcg, pages); 1042 return -ENOMEM; 1043 } 1044 sk_forward_alloc_add(sk, pages << PAGE_SHIFT); 1045 1046 WRITE_ONCE(sk->sk_reserved_mem, 1047 sk->sk_reserved_mem + (pages << PAGE_SHIFT)); 1048 1049 return 0; 1050 } 1051 1052 void sockopt_lock_sock(struct sock *sk) 1053 { 1054 /* When current->bpf_ctx is set, the setsockopt is called from 1055 * a bpf prog. bpf has ensured the sk lock has been 1056 * acquired before calling setsockopt(). 1057 */ 1058 if (has_current_bpf_ctx()) 1059 return; 1060 1061 lock_sock(sk); 1062 } 1063 EXPORT_SYMBOL(sockopt_lock_sock); 1064 1065 void sockopt_release_sock(struct sock *sk) 1066 { 1067 if (has_current_bpf_ctx()) 1068 return; 1069 1070 release_sock(sk); 1071 } 1072 EXPORT_SYMBOL(sockopt_release_sock); 1073 1074 bool sockopt_ns_capable(struct user_namespace *ns, int cap) 1075 { 1076 return has_current_bpf_ctx() || ns_capable(ns, cap); 1077 } 1078 EXPORT_SYMBOL(sockopt_ns_capable); 1079 1080 bool sockopt_capable(int cap) 1081 { 1082 return has_current_bpf_ctx() || capable(cap); 1083 } 1084 EXPORT_SYMBOL(sockopt_capable); 1085 1086 static int sockopt_validate_clockid(__kernel_clockid_t value) 1087 { 1088 switch (value) { 1089 case CLOCK_REALTIME: 1090 case CLOCK_MONOTONIC: 1091 case CLOCK_TAI: 1092 return 0; 1093 } 1094 return -EINVAL; 1095 } 1096 1097 /* 1098 * This is meant for all protocols to use and covers goings on 1099 * at the socket level. Everything here is generic. 1100 */ 1101 1102 int sk_setsockopt(struct sock *sk, int level, int optname, 1103 sockptr_t optval, unsigned int optlen) 1104 { 1105 struct so_timestamping timestamping; 1106 struct socket *sock = sk->sk_socket; 1107 struct sock_txtime sk_txtime; 1108 int val; 1109 int valbool; 1110 struct linger ling; 1111 int ret = 0; 1112 1113 /* 1114 * Options without arguments 1115 */ 1116 1117 if (optname == SO_BINDTODEVICE) 1118 return sock_setbindtodevice(sk, optval, optlen); 1119 1120 if (optlen < sizeof(int)) 1121 return -EINVAL; 1122 1123 if (copy_from_sockptr(&val, optval, sizeof(val))) 1124 return -EFAULT; 1125 1126 valbool = val ? 1 : 0; 1127 1128 /* handle options which do not require locking the socket. */ 1129 switch (optname) { 1130 case SO_PRIORITY: 1131 if ((val >= 0 && val <= 6) || 1132 sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) || 1133 sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { 1134 sock_set_priority(sk, val); 1135 return 0; 1136 } 1137 return -EPERM; 1138 case SO_PASSSEC: 1139 assign_bit(SOCK_PASSSEC, &sock->flags, valbool); 1140 return 0; 1141 case SO_PASSCRED: 1142 assign_bit(SOCK_PASSCRED, &sock->flags, valbool); 1143 return 0; 1144 case SO_PASSPIDFD: 1145 assign_bit(SOCK_PASSPIDFD, &sock->flags, valbool); 1146 return 0; 1147 case SO_TYPE: 1148 case SO_PROTOCOL: 1149 case SO_DOMAIN: 1150 case SO_ERROR: 1151 return -ENOPROTOOPT; 1152 #ifdef CONFIG_NET_RX_BUSY_POLL 1153 case SO_BUSY_POLL: 1154 if (val < 0) 1155 return -EINVAL; 1156 WRITE_ONCE(sk->sk_ll_usec, val); 1157 return 0; 1158 case SO_PREFER_BUSY_POLL: 1159 if (valbool && !sockopt_capable(CAP_NET_ADMIN)) 1160 return -EPERM; 1161 WRITE_ONCE(sk->sk_prefer_busy_poll, valbool); 1162 return 0; 1163 case SO_BUSY_POLL_BUDGET: 1164 if (val > READ_ONCE(sk->sk_busy_poll_budget) && 1165 !sockopt_capable(CAP_NET_ADMIN)) 1166 return -EPERM; 1167 if (val < 0 || val > U16_MAX) 1168 return -EINVAL; 1169 WRITE_ONCE(sk->sk_busy_poll_budget, val); 1170 return 0; 1171 #endif 1172 case SO_MAX_PACING_RATE: 1173 { 1174 unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val; 1175 unsigned long pacing_rate; 1176 1177 if (sizeof(ulval) != sizeof(val) && 1178 optlen >= sizeof(ulval) && 1179 copy_from_sockptr(&ulval, optval, sizeof(ulval))) { 1180 return -EFAULT; 1181 } 1182 if (ulval != ~0UL) 1183 cmpxchg(&sk->sk_pacing_status, 1184 SK_PACING_NONE, 1185 SK_PACING_NEEDED); 1186 /* Pairs with READ_ONCE() from sk_getsockopt() */ 1187 WRITE_ONCE(sk->sk_max_pacing_rate, ulval); 1188 pacing_rate = READ_ONCE(sk->sk_pacing_rate); 1189 if (ulval < pacing_rate) 1190 WRITE_ONCE(sk->sk_pacing_rate, ulval); 1191 return 0; 1192 } 1193 case SO_TXREHASH: 1194 if (val < -1 || val > 1) 1195 return -EINVAL; 1196 if ((u8)val == SOCK_TXREHASH_DEFAULT) 1197 val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash); 1198 /* Paired with READ_ONCE() in tcp_rtx_synack() 1199 * and sk_getsockopt(). 1200 */ 1201 WRITE_ONCE(sk->sk_txrehash, (u8)val); 1202 return 0; 1203 case SO_PEEK_OFF: 1204 { 1205 int (*set_peek_off)(struct sock *sk, int val); 1206 1207 set_peek_off = READ_ONCE(sock->ops)->set_peek_off; 1208 if (set_peek_off) 1209 ret = set_peek_off(sk, val); 1210 else 1211 ret = -EOPNOTSUPP; 1212 return ret; 1213 } 1214 } 1215 1216 sockopt_lock_sock(sk); 1217 1218 switch (optname) { 1219 case SO_DEBUG: 1220 if (val && !sockopt_capable(CAP_NET_ADMIN)) 1221 ret = -EACCES; 1222 else 1223 sock_valbool_flag(sk, SOCK_DBG, valbool); 1224 break; 1225 case SO_REUSEADDR: 1226 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE); 1227 break; 1228 case SO_REUSEPORT: 1229 sk->sk_reuseport = valbool; 1230 break; 1231 case SO_DONTROUTE: 1232 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool); 1233 sk_dst_reset(sk); 1234 break; 1235 case SO_BROADCAST: 1236 sock_valbool_flag(sk, SOCK_BROADCAST, valbool); 1237 break; 1238 case SO_SNDBUF: 1239 /* Don't error on this BSD doesn't and if you think 1240 * about it this is right. Otherwise apps have to 1241 * play 'guess the biggest size' games. RCVBUF/SNDBUF 1242 * are treated in BSD as hints 1243 */ 1244 val = min_t(u32, val, READ_ONCE(sysctl_wmem_max)); 1245 set_sndbuf: 1246 /* Ensure val * 2 fits into an int, to prevent max_t() 1247 * from treating it as a negative value. 1248 */ 1249 val = min_t(int, val, INT_MAX / 2); 1250 sk->sk_userlocks |= SOCK_SNDBUF_LOCK; 1251 WRITE_ONCE(sk->sk_sndbuf, 1252 max_t(int, val * 2, SOCK_MIN_SNDBUF)); 1253 /* Wake up sending tasks if we upped the value. */ 1254 sk->sk_write_space(sk); 1255 break; 1256 1257 case SO_SNDBUFFORCE: 1258 if (!sockopt_capable(CAP_NET_ADMIN)) { 1259 ret = -EPERM; 1260 break; 1261 } 1262 1263 /* No negative values (to prevent underflow, as val will be 1264 * multiplied by 2). 1265 */ 1266 if (val < 0) 1267 val = 0; 1268 goto set_sndbuf; 1269 1270 case SO_RCVBUF: 1271 /* Don't error on this BSD doesn't and if you think 1272 * about it this is right. Otherwise apps have to 1273 * play 'guess the biggest size' games. RCVBUF/SNDBUF 1274 * are treated in BSD as hints 1275 */ 1276 __sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max))); 1277 break; 1278 1279 case SO_RCVBUFFORCE: 1280 if (!sockopt_capable(CAP_NET_ADMIN)) { 1281 ret = -EPERM; 1282 break; 1283 } 1284 1285 /* No negative values (to prevent underflow, as val will be 1286 * multiplied by 2). 1287 */ 1288 __sock_set_rcvbuf(sk, max(val, 0)); 1289 break; 1290 1291 case SO_KEEPALIVE: 1292 if (sk->sk_prot->keepalive) 1293 sk->sk_prot->keepalive(sk, valbool); 1294 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool); 1295 break; 1296 1297 case SO_OOBINLINE: 1298 sock_valbool_flag(sk, SOCK_URGINLINE, valbool); 1299 break; 1300 1301 case SO_NO_CHECK: 1302 sk->sk_no_check_tx = valbool; 1303 break; 1304 1305 case SO_LINGER: 1306 if (optlen < sizeof(ling)) { 1307 ret = -EINVAL; /* 1003.1g */ 1308 break; 1309 } 1310 if (copy_from_sockptr(&ling, optval, sizeof(ling))) { 1311 ret = -EFAULT; 1312 break; 1313 } 1314 if (!ling.l_onoff) { 1315 sock_reset_flag(sk, SOCK_LINGER); 1316 } else { 1317 unsigned long t_sec = ling.l_linger; 1318 1319 if (t_sec >= MAX_SCHEDULE_TIMEOUT / HZ) 1320 WRITE_ONCE(sk->sk_lingertime, MAX_SCHEDULE_TIMEOUT); 1321 else 1322 WRITE_ONCE(sk->sk_lingertime, t_sec * HZ); 1323 sock_set_flag(sk, SOCK_LINGER); 1324 } 1325 break; 1326 1327 case SO_BSDCOMPAT: 1328 break; 1329 1330 case SO_TIMESTAMP_OLD: 1331 case SO_TIMESTAMP_NEW: 1332 case SO_TIMESTAMPNS_OLD: 1333 case SO_TIMESTAMPNS_NEW: 1334 sock_set_timestamp(sk, optname, valbool); 1335 break; 1336 1337 case SO_TIMESTAMPING_NEW: 1338 case SO_TIMESTAMPING_OLD: 1339 if (optlen == sizeof(timestamping)) { 1340 if (copy_from_sockptr(×tamping, optval, 1341 sizeof(timestamping))) { 1342 ret = -EFAULT; 1343 break; 1344 } 1345 } else { 1346 memset(×tamping, 0, sizeof(timestamping)); 1347 timestamping.flags = val; 1348 } 1349 ret = sock_set_timestamping(sk, optname, timestamping); 1350 break; 1351 1352 case SO_RCVLOWAT: 1353 { 1354 int (*set_rcvlowat)(struct sock *sk, int val) = NULL; 1355 1356 if (val < 0) 1357 val = INT_MAX; 1358 if (sock) 1359 set_rcvlowat = READ_ONCE(sock->ops)->set_rcvlowat; 1360 if (set_rcvlowat) 1361 ret = set_rcvlowat(sk, val); 1362 else 1363 WRITE_ONCE(sk->sk_rcvlowat, val ? : 1); 1364 break; 1365 } 1366 case SO_RCVTIMEO_OLD: 1367 case SO_RCVTIMEO_NEW: 1368 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, 1369 optlen, optname == SO_RCVTIMEO_OLD); 1370 break; 1371 1372 case SO_SNDTIMEO_OLD: 1373 case SO_SNDTIMEO_NEW: 1374 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, 1375 optlen, optname == SO_SNDTIMEO_OLD); 1376 break; 1377 1378 case SO_ATTACH_FILTER: { 1379 struct sock_fprog fprog; 1380 1381 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen); 1382 if (!ret) 1383 ret = sk_attach_filter(&fprog, sk); 1384 break; 1385 } 1386 case SO_ATTACH_BPF: 1387 ret = -EINVAL; 1388 if (optlen == sizeof(u32)) { 1389 u32 ufd; 1390 1391 ret = -EFAULT; 1392 if (copy_from_sockptr(&ufd, optval, sizeof(ufd))) 1393 break; 1394 1395 ret = sk_attach_bpf(ufd, sk); 1396 } 1397 break; 1398 1399 case SO_ATTACH_REUSEPORT_CBPF: { 1400 struct sock_fprog fprog; 1401 1402 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen); 1403 if (!ret) 1404 ret = sk_reuseport_attach_filter(&fprog, sk); 1405 break; 1406 } 1407 case SO_ATTACH_REUSEPORT_EBPF: 1408 ret = -EINVAL; 1409 if (optlen == sizeof(u32)) { 1410 u32 ufd; 1411 1412 ret = -EFAULT; 1413 if (copy_from_sockptr(&ufd, optval, sizeof(ufd))) 1414 break; 1415 1416 ret = sk_reuseport_attach_bpf(ufd, sk); 1417 } 1418 break; 1419 1420 case SO_DETACH_REUSEPORT_BPF: 1421 ret = reuseport_detach_prog(sk); 1422 break; 1423 1424 case SO_DETACH_FILTER: 1425 ret = sk_detach_filter(sk); 1426 break; 1427 1428 case SO_LOCK_FILTER: 1429 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool) 1430 ret = -EPERM; 1431 else 1432 sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool); 1433 break; 1434 1435 case SO_MARK: 1436 if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && 1437 !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { 1438 ret = -EPERM; 1439 break; 1440 } 1441 1442 __sock_set_mark(sk, val); 1443 break; 1444 case SO_RCVMARK: 1445 sock_valbool_flag(sk, SOCK_RCVMARK, valbool); 1446 break; 1447 1448 case SO_RXQ_OVFL: 1449 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool); 1450 break; 1451 1452 case SO_WIFI_STATUS: 1453 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool); 1454 break; 1455 1456 case SO_NOFCS: 1457 sock_valbool_flag(sk, SOCK_NOFCS, valbool); 1458 break; 1459 1460 case SO_SELECT_ERR_QUEUE: 1461 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool); 1462 break; 1463 1464 1465 case SO_INCOMING_CPU: 1466 reuseport_update_incoming_cpu(sk, val); 1467 break; 1468 1469 case SO_CNX_ADVICE: 1470 if (val == 1) 1471 dst_negative_advice(sk); 1472 break; 1473 1474 case SO_ZEROCOPY: 1475 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) { 1476 if (!(sk_is_tcp(sk) || 1477 (sk->sk_type == SOCK_DGRAM && 1478 sk->sk_protocol == IPPROTO_UDP))) 1479 ret = -EOPNOTSUPP; 1480 } else if (sk->sk_family != PF_RDS) { 1481 ret = -EOPNOTSUPP; 1482 } 1483 if (!ret) { 1484 if (val < 0 || val > 1) 1485 ret = -EINVAL; 1486 else 1487 sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool); 1488 } 1489 break; 1490 1491 case SO_TXTIME: 1492 if (optlen != sizeof(struct sock_txtime)) { 1493 ret = -EINVAL; 1494 break; 1495 } else if (copy_from_sockptr(&sk_txtime, optval, 1496 sizeof(struct sock_txtime))) { 1497 ret = -EFAULT; 1498 break; 1499 } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) { 1500 ret = -EINVAL; 1501 break; 1502 } 1503 /* CLOCK_MONOTONIC is only used by sch_fq, and this packet 1504 * scheduler has enough safe guards. 1505 */ 1506 if (sk_txtime.clockid != CLOCK_MONOTONIC && 1507 !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { 1508 ret = -EPERM; 1509 break; 1510 } 1511 1512 ret = sockopt_validate_clockid(sk_txtime.clockid); 1513 if (ret) 1514 break; 1515 1516 sock_valbool_flag(sk, SOCK_TXTIME, true); 1517 sk->sk_clockid = sk_txtime.clockid; 1518 sk->sk_txtime_deadline_mode = 1519 !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE); 1520 sk->sk_txtime_report_errors = 1521 !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS); 1522 break; 1523 1524 case SO_BINDTOIFINDEX: 1525 ret = sock_bindtoindex_locked(sk, val); 1526 break; 1527 1528 case SO_BUF_LOCK: 1529 if (val & ~SOCK_BUF_LOCK_MASK) { 1530 ret = -EINVAL; 1531 break; 1532 } 1533 sk->sk_userlocks = val | (sk->sk_userlocks & 1534 ~SOCK_BUF_LOCK_MASK); 1535 break; 1536 1537 case SO_RESERVE_MEM: 1538 { 1539 int delta; 1540 1541 if (val < 0) { 1542 ret = -EINVAL; 1543 break; 1544 } 1545 1546 delta = val - sk->sk_reserved_mem; 1547 if (delta < 0) 1548 sock_release_reserved_memory(sk, -delta); 1549 else 1550 ret = sock_reserve_memory(sk, delta); 1551 break; 1552 } 1553 1554 default: 1555 ret = -ENOPROTOOPT; 1556 break; 1557 } 1558 sockopt_release_sock(sk); 1559 return ret; 1560 } 1561 1562 int sock_setsockopt(struct socket *sock, int level, int optname, 1563 sockptr_t optval, unsigned int optlen) 1564 { 1565 return sk_setsockopt(sock->sk, level, optname, 1566 optval, optlen); 1567 } 1568 EXPORT_SYMBOL(sock_setsockopt); 1569 1570 static const struct cred *sk_get_peer_cred(struct sock *sk) 1571 { 1572 const struct cred *cred; 1573 1574 spin_lock(&sk->sk_peer_lock); 1575 cred = get_cred(sk->sk_peer_cred); 1576 spin_unlock(&sk->sk_peer_lock); 1577 1578 return cred; 1579 } 1580 1581 static void cred_to_ucred(struct pid *pid, const struct cred *cred, 1582 struct ucred *ucred) 1583 { 1584 ucred->pid = pid_vnr(pid); 1585 ucred->uid = ucred->gid = -1; 1586 if (cred) { 1587 struct user_namespace *current_ns = current_user_ns(); 1588 1589 ucred->uid = from_kuid_munged(current_ns, cred->euid); 1590 ucred->gid = from_kgid_munged(current_ns, cred->egid); 1591 } 1592 } 1593 1594 static int groups_to_user(sockptr_t dst, const struct group_info *src) 1595 { 1596 struct user_namespace *user_ns = current_user_ns(); 1597 int i; 1598 1599 for (i = 0; i < src->ngroups; i++) { 1600 gid_t gid = from_kgid_munged(user_ns, src->gid[i]); 1601 1602 if (copy_to_sockptr_offset(dst, i * sizeof(gid), &gid, sizeof(gid))) 1603 return -EFAULT; 1604 } 1605 1606 return 0; 1607 } 1608 1609 int sk_getsockopt(struct sock *sk, int level, int optname, 1610 sockptr_t optval, sockptr_t optlen) 1611 { 1612 struct socket *sock = sk->sk_socket; 1613 1614 union { 1615 int val; 1616 u64 val64; 1617 unsigned long ulval; 1618 struct linger ling; 1619 struct old_timeval32 tm32; 1620 struct __kernel_old_timeval tm; 1621 struct __kernel_sock_timeval stm; 1622 struct sock_txtime txtime; 1623 struct so_timestamping timestamping; 1624 } v; 1625 1626 int lv = sizeof(int); 1627 int len; 1628 1629 if (copy_from_sockptr(&len, optlen, sizeof(int))) 1630 return -EFAULT; 1631 if (len < 0) 1632 return -EINVAL; 1633 1634 memset(&v, 0, sizeof(v)); 1635 1636 switch (optname) { 1637 case SO_DEBUG: 1638 v.val = sock_flag(sk, SOCK_DBG); 1639 break; 1640 1641 case SO_DONTROUTE: 1642 v.val = sock_flag(sk, SOCK_LOCALROUTE); 1643 break; 1644 1645 case SO_BROADCAST: 1646 v.val = sock_flag(sk, SOCK_BROADCAST); 1647 break; 1648 1649 case SO_SNDBUF: 1650 v.val = READ_ONCE(sk->sk_sndbuf); 1651 break; 1652 1653 case SO_RCVBUF: 1654 v.val = READ_ONCE(sk->sk_rcvbuf); 1655 break; 1656 1657 case SO_REUSEADDR: 1658 v.val = sk->sk_reuse; 1659 break; 1660 1661 case SO_REUSEPORT: 1662 v.val = sk->sk_reuseport; 1663 break; 1664 1665 case SO_KEEPALIVE: 1666 v.val = sock_flag(sk, SOCK_KEEPOPEN); 1667 break; 1668 1669 case SO_TYPE: 1670 v.val = sk->sk_type; 1671 break; 1672 1673 case SO_PROTOCOL: 1674 v.val = sk->sk_protocol; 1675 break; 1676 1677 case SO_DOMAIN: 1678 v.val = sk->sk_family; 1679 break; 1680 1681 case SO_ERROR: 1682 v.val = -sock_error(sk); 1683 if (v.val == 0) 1684 v.val = xchg(&sk->sk_err_soft, 0); 1685 break; 1686 1687 case SO_OOBINLINE: 1688 v.val = sock_flag(sk, SOCK_URGINLINE); 1689 break; 1690 1691 case SO_NO_CHECK: 1692 v.val = sk->sk_no_check_tx; 1693 break; 1694 1695 case SO_PRIORITY: 1696 v.val = READ_ONCE(sk->sk_priority); 1697 break; 1698 1699 case SO_LINGER: 1700 lv = sizeof(v.ling); 1701 v.ling.l_onoff = sock_flag(sk, SOCK_LINGER); 1702 v.ling.l_linger = READ_ONCE(sk->sk_lingertime) / HZ; 1703 break; 1704 1705 case SO_BSDCOMPAT: 1706 break; 1707 1708 case SO_TIMESTAMP_OLD: 1709 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && 1710 !sock_flag(sk, SOCK_TSTAMP_NEW) && 1711 !sock_flag(sk, SOCK_RCVTSTAMPNS); 1712 break; 1713 1714 case SO_TIMESTAMPNS_OLD: 1715 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW); 1716 break; 1717 1718 case SO_TIMESTAMP_NEW: 1719 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW); 1720 break; 1721 1722 case SO_TIMESTAMPNS_NEW: 1723 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW); 1724 break; 1725 1726 case SO_TIMESTAMPING_OLD: 1727 case SO_TIMESTAMPING_NEW: 1728 lv = sizeof(v.timestamping); 1729 /* For the later-added case SO_TIMESTAMPING_NEW: Be strict about only 1730 * returning the flags when they were set through the same option. 1731 * Don't change the beviour for the old case SO_TIMESTAMPING_OLD. 1732 */ 1733 if (optname == SO_TIMESTAMPING_OLD || sock_flag(sk, SOCK_TSTAMP_NEW)) { 1734 v.timestamping.flags = READ_ONCE(sk->sk_tsflags); 1735 v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc); 1736 } 1737 break; 1738 1739 case SO_RCVTIMEO_OLD: 1740 case SO_RCVTIMEO_NEW: 1741 lv = sock_get_timeout(READ_ONCE(sk->sk_rcvtimeo), &v, 1742 SO_RCVTIMEO_OLD == optname); 1743 break; 1744 1745 case SO_SNDTIMEO_OLD: 1746 case SO_SNDTIMEO_NEW: 1747 lv = sock_get_timeout(READ_ONCE(sk->sk_sndtimeo), &v, 1748 SO_SNDTIMEO_OLD == optname); 1749 break; 1750 1751 case SO_RCVLOWAT: 1752 v.val = READ_ONCE(sk->sk_rcvlowat); 1753 break; 1754 1755 case SO_SNDLOWAT: 1756 v.val = 1; 1757 break; 1758 1759 case SO_PASSCRED: 1760 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags); 1761 break; 1762 1763 case SO_PASSPIDFD: 1764 v.val = !!test_bit(SOCK_PASSPIDFD, &sock->flags); 1765 break; 1766 1767 case SO_PEERCRED: 1768 { 1769 struct ucred peercred; 1770 if (len > sizeof(peercred)) 1771 len = sizeof(peercred); 1772 1773 spin_lock(&sk->sk_peer_lock); 1774 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred); 1775 spin_unlock(&sk->sk_peer_lock); 1776 1777 if (copy_to_sockptr(optval, &peercred, len)) 1778 return -EFAULT; 1779 goto lenout; 1780 } 1781 1782 case SO_PEERPIDFD: 1783 { 1784 struct pid *peer_pid; 1785 struct file *pidfd_file = NULL; 1786 int pidfd; 1787 1788 if (len > sizeof(pidfd)) 1789 len = sizeof(pidfd); 1790 1791 spin_lock(&sk->sk_peer_lock); 1792 peer_pid = get_pid(sk->sk_peer_pid); 1793 spin_unlock(&sk->sk_peer_lock); 1794 1795 if (!peer_pid) 1796 return -ENODATA; 1797 1798 pidfd = pidfd_prepare(peer_pid, 0, &pidfd_file); 1799 put_pid(peer_pid); 1800 if (pidfd < 0) 1801 return pidfd; 1802 1803 if (copy_to_sockptr(optval, &pidfd, len) || 1804 copy_to_sockptr(optlen, &len, sizeof(int))) { 1805 put_unused_fd(pidfd); 1806 fput(pidfd_file); 1807 1808 return -EFAULT; 1809 } 1810 1811 fd_install(pidfd, pidfd_file); 1812 return 0; 1813 } 1814 1815 case SO_PEERGROUPS: 1816 { 1817 const struct cred *cred; 1818 int ret, n; 1819 1820 cred = sk_get_peer_cred(sk); 1821 if (!cred) 1822 return -ENODATA; 1823 1824 n = cred->group_info->ngroups; 1825 if (len < n * sizeof(gid_t)) { 1826 len = n * sizeof(gid_t); 1827 put_cred(cred); 1828 return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE; 1829 } 1830 len = n * sizeof(gid_t); 1831 1832 ret = groups_to_user(optval, cred->group_info); 1833 put_cred(cred); 1834 if (ret) 1835 return ret; 1836 goto lenout; 1837 } 1838 1839 case SO_PEERNAME: 1840 { 1841 struct sockaddr_storage address; 1842 1843 lv = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 2); 1844 if (lv < 0) 1845 return -ENOTCONN; 1846 if (lv < len) 1847 return -EINVAL; 1848 if (copy_to_sockptr(optval, &address, len)) 1849 return -EFAULT; 1850 goto lenout; 1851 } 1852 1853 /* Dubious BSD thing... Probably nobody even uses it, but 1854 * the UNIX standard wants it for whatever reason... -DaveM 1855 */ 1856 case SO_ACCEPTCONN: 1857 v.val = sk->sk_state == TCP_LISTEN; 1858 break; 1859 1860 case SO_PASSSEC: 1861 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags); 1862 break; 1863 1864 case SO_PEERSEC: 1865 return security_socket_getpeersec_stream(sock, 1866 optval, optlen, len); 1867 1868 case SO_MARK: 1869 v.val = READ_ONCE(sk->sk_mark); 1870 break; 1871 1872 case SO_RCVMARK: 1873 v.val = sock_flag(sk, SOCK_RCVMARK); 1874 break; 1875 1876 case SO_RXQ_OVFL: 1877 v.val = sock_flag(sk, SOCK_RXQ_OVFL); 1878 break; 1879 1880 case SO_WIFI_STATUS: 1881 v.val = sock_flag(sk, SOCK_WIFI_STATUS); 1882 break; 1883 1884 case SO_PEEK_OFF: 1885 if (!READ_ONCE(sock->ops)->set_peek_off) 1886 return -EOPNOTSUPP; 1887 1888 v.val = READ_ONCE(sk->sk_peek_off); 1889 break; 1890 case SO_NOFCS: 1891 v.val = sock_flag(sk, SOCK_NOFCS); 1892 break; 1893 1894 case SO_BINDTODEVICE: 1895 return sock_getbindtodevice(sk, optval, optlen, len); 1896 1897 case SO_GET_FILTER: 1898 len = sk_get_filter(sk, optval, len); 1899 if (len < 0) 1900 return len; 1901 1902 goto lenout; 1903 1904 case SO_LOCK_FILTER: 1905 v.val = sock_flag(sk, SOCK_FILTER_LOCKED); 1906 break; 1907 1908 case SO_BPF_EXTENSIONS: 1909 v.val = bpf_tell_extensions(); 1910 break; 1911 1912 case SO_SELECT_ERR_QUEUE: 1913 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE); 1914 break; 1915 1916 #ifdef CONFIG_NET_RX_BUSY_POLL 1917 case SO_BUSY_POLL: 1918 v.val = READ_ONCE(sk->sk_ll_usec); 1919 break; 1920 case SO_PREFER_BUSY_POLL: 1921 v.val = READ_ONCE(sk->sk_prefer_busy_poll); 1922 break; 1923 #endif 1924 1925 case SO_MAX_PACING_RATE: 1926 /* The READ_ONCE() pair with the WRITE_ONCE() in sk_setsockopt() */ 1927 if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) { 1928 lv = sizeof(v.ulval); 1929 v.ulval = READ_ONCE(sk->sk_max_pacing_rate); 1930 } else { 1931 /* 32bit version */ 1932 v.val = min_t(unsigned long, ~0U, 1933 READ_ONCE(sk->sk_max_pacing_rate)); 1934 } 1935 break; 1936 1937 case SO_INCOMING_CPU: 1938 v.val = READ_ONCE(sk->sk_incoming_cpu); 1939 break; 1940 1941 case SO_MEMINFO: 1942 { 1943 u32 meminfo[SK_MEMINFO_VARS]; 1944 1945 sk_get_meminfo(sk, meminfo); 1946 1947 len = min_t(unsigned int, len, sizeof(meminfo)); 1948 if (copy_to_sockptr(optval, &meminfo, len)) 1949 return -EFAULT; 1950 1951 goto lenout; 1952 } 1953 1954 #ifdef CONFIG_NET_RX_BUSY_POLL 1955 case SO_INCOMING_NAPI_ID: 1956 v.val = READ_ONCE(sk->sk_napi_id); 1957 1958 /* aggregate non-NAPI IDs down to 0 */ 1959 if (v.val < MIN_NAPI_ID) 1960 v.val = 0; 1961 1962 break; 1963 #endif 1964 1965 case SO_COOKIE: 1966 lv = sizeof(u64); 1967 if (len < lv) 1968 return -EINVAL; 1969 v.val64 = sock_gen_cookie(sk); 1970 break; 1971 1972 case SO_ZEROCOPY: 1973 v.val = sock_flag(sk, SOCK_ZEROCOPY); 1974 break; 1975 1976 case SO_TXTIME: 1977 lv = sizeof(v.txtime); 1978 v.txtime.clockid = sk->sk_clockid; 1979 v.txtime.flags |= sk->sk_txtime_deadline_mode ? 1980 SOF_TXTIME_DEADLINE_MODE : 0; 1981 v.txtime.flags |= sk->sk_txtime_report_errors ? 1982 SOF_TXTIME_REPORT_ERRORS : 0; 1983 break; 1984 1985 case SO_BINDTOIFINDEX: 1986 v.val = READ_ONCE(sk->sk_bound_dev_if); 1987 break; 1988 1989 case SO_NETNS_COOKIE: 1990 lv = sizeof(u64); 1991 if (len != lv) 1992 return -EINVAL; 1993 v.val64 = sock_net(sk)->net_cookie; 1994 break; 1995 1996 case SO_BUF_LOCK: 1997 v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK; 1998 break; 1999 2000 case SO_RESERVE_MEM: 2001 v.val = READ_ONCE(sk->sk_reserved_mem); 2002 break; 2003 2004 case SO_TXREHASH: 2005 /* Paired with WRITE_ONCE() in sk_setsockopt() */ 2006 v.val = READ_ONCE(sk->sk_txrehash); 2007 break; 2008 2009 default: 2010 /* We implement the SO_SNDLOWAT etc to not be settable 2011 * (1003.1g 7). 2012 */ 2013 return -ENOPROTOOPT; 2014 } 2015 2016 if (len > lv) 2017 len = lv; 2018 if (copy_to_sockptr(optval, &v, len)) 2019 return -EFAULT; 2020 lenout: 2021 if (copy_to_sockptr(optlen, &len, sizeof(int))) 2022 return -EFAULT; 2023 return 0; 2024 } 2025 2026 /* 2027 * Initialize an sk_lock. 2028 * 2029 * (We also register the sk_lock with the lock validator.) 2030 */ 2031 static inline void sock_lock_init(struct sock *sk) 2032 { 2033 if (sk->sk_kern_sock) 2034 sock_lock_init_class_and_name( 2035 sk, 2036 af_family_kern_slock_key_strings[sk->sk_family], 2037 af_family_kern_slock_keys + sk->sk_family, 2038 af_family_kern_key_strings[sk->sk_family], 2039 af_family_kern_keys + sk->sk_family); 2040 else 2041 sock_lock_init_class_and_name( 2042 sk, 2043 af_family_slock_key_strings[sk->sk_family], 2044 af_family_slock_keys + sk->sk_family, 2045 af_family_key_strings[sk->sk_family], 2046 af_family_keys + sk->sk_family); 2047 } 2048 2049 /* 2050 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet, 2051 * even temporarily, because of RCU lookups. sk_node should also be left as is. 2052 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end 2053 */ 2054 static void sock_copy(struct sock *nsk, const struct sock *osk) 2055 { 2056 const struct proto *prot = READ_ONCE(osk->sk_prot); 2057 #ifdef CONFIG_SECURITY_NETWORK 2058 void *sptr = nsk->sk_security; 2059 #endif 2060 2061 /* If we move sk_tx_queue_mapping out of the private section, 2062 * we must check if sk_tx_queue_clear() is called after 2063 * sock_copy() in sk_clone_lock(). 2064 */ 2065 BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) < 2066 offsetof(struct sock, sk_dontcopy_begin) || 2067 offsetof(struct sock, sk_tx_queue_mapping) >= 2068 offsetof(struct sock, sk_dontcopy_end)); 2069 2070 memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin)); 2071 2072 unsafe_memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end, 2073 prot->obj_size - offsetof(struct sock, sk_dontcopy_end), 2074 /* alloc is larger than struct, see sk_prot_alloc() */); 2075 2076 #ifdef CONFIG_SECURITY_NETWORK 2077 nsk->sk_security = sptr; 2078 security_sk_clone(osk, nsk); 2079 #endif 2080 } 2081 2082 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, 2083 int family) 2084 { 2085 struct sock *sk; 2086 struct kmem_cache *slab; 2087 2088 slab = prot->slab; 2089 if (slab != NULL) { 2090 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO); 2091 if (!sk) 2092 return sk; 2093 if (want_init_on_alloc(priority)) 2094 sk_prot_clear_nulls(sk, prot->obj_size); 2095 } else 2096 sk = kmalloc(prot->obj_size, priority); 2097 2098 if (sk != NULL) { 2099 if (security_sk_alloc(sk, family, priority)) 2100 goto out_free; 2101 2102 if (!try_module_get(prot->owner)) 2103 goto out_free_sec; 2104 } 2105 2106 return sk; 2107 2108 out_free_sec: 2109 security_sk_free(sk); 2110 out_free: 2111 if (slab != NULL) 2112 kmem_cache_free(slab, sk); 2113 else 2114 kfree(sk); 2115 return NULL; 2116 } 2117 2118 static void sk_prot_free(struct proto *prot, struct sock *sk) 2119 { 2120 struct kmem_cache *slab; 2121 struct module *owner; 2122 2123 owner = prot->owner; 2124 slab = prot->slab; 2125 2126 cgroup_sk_free(&sk->sk_cgrp_data); 2127 mem_cgroup_sk_free(sk); 2128 security_sk_free(sk); 2129 if (slab != NULL) 2130 kmem_cache_free(slab, sk); 2131 else 2132 kfree(sk); 2133 module_put(owner); 2134 } 2135 2136 /** 2137 * sk_alloc - All socket objects are allocated here 2138 * @net: the applicable net namespace 2139 * @family: protocol family 2140 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) 2141 * @prot: struct proto associated with this new sock instance 2142 * @kern: is this to be a kernel socket? 2143 */ 2144 struct sock *sk_alloc(struct net *net, int family, gfp_t priority, 2145 struct proto *prot, int kern) 2146 { 2147 struct sock *sk; 2148 2149 sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family); 2150 if (sk) { 2151 sk->sk_family = family; 2152 /* 2153 * See comment in struct sock definition to understand 2154 * why we need sk_prot_creator -acme 2155 */ 2156 sk->sk_prot = sk->sk_prot_creator = prot; 2157 sk->sk_kern_sock = kern; 2158 sock_lock_init(sk); 2159 sk->sk_net_refcnt = kern ? 0 : 1; 2160 if (likely(sk->sk_net_refcnt)) { 2161 get_net_track(net, &sk->ns_tracker, priority); 2162 sock_inuse_add(net, 1); 2163 } else { 2164 __netns_tracker_alloc(net, &sk->ns_tracker, 2165 false, priority); 2166 } 2167 2168 sock_net_set(sk, net); 2169 refcount_set(&sk->sk_wmem_alloc, 1); 2170 2171 mem_cgroup_sk_alloc(sk); 2172 cgroup_sk_alloc(&sk->sk_cgrp_data); 2173 sock_update_classid(&sk->sk_cgrp_data); 2174 sock_update_netprioidx(&sk->sk_cgrp_data); 2175 sk_tx_queue_clear(sk); 2176 } 2177 2178 return sk; 2179 } 2180 EXPORT_SYMBOL(sk_alloc); 2181 2182 /* Sockets having SOCK_RCU_FREE will call this function after one RCU 2183 * grace period. This is the case for UDP sockets and TCP listeners. 2184 */ 2185 static void __sk_destruct(struct rcu_head *head) 2186 { 2187 struct sock *sk = container_of(head, struct sock, sk_rcu); 2188 struct sk_filter *filter; 2189 2190 if (sk->sk_destruct) 2191 sk->sk_destruct(sk); 2192 2193 filter = rcu_dereference_check(sk->sk_filter, 2194 refcount_read(&sk->sk_wmem_alloc) == 0); 2195 if (filter) { 2196 sk_filter_uncharge(sk, filter); 2197 RCU_INIT_POINTER(sk->sk_filter, NULL); 2198 } 2199 2200 sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP); 2201 2202 #ifdef CONFIG_BPF_SYSCALL 2203 bpf_sk_storage_free(sk); 2204 #endif 2205 2206 if (atomic_read(&sk->sk_omem_alloc)) 2207 pr_debug("%s: optmem leakage (%d bytes) detected\n", 2208 __func__, atomic_read(&sk->sk_omem_alloc)); 2209 2210 if (sk->sk_frag.page) { 2211 put_page(sk->sk_frag.page); 2212 sk->sk_frag.page = NULL; 2213 } 2214 2215 /* We do not need to acquire sk->sk_peer_lock, we are the last user. */ 2216 put_cred(sk->sk_peer_cred); 2217 put_pid(sk->sk_peer_pid); 2218 2219 if (likely(sk->sk_net_refcnt)) 2220 put_net_track(sock_net(sk), &sk->ns_tracker); 2221 else 2222 __netns_tracker_free(sock_net(sk), &sk->ns_tracker, false); 2223 2224 sk_prot_free(sk->sk_prot_creator, sk); 2225 } 2226 2227 void sk_destruct(struct sock *sk) 2228 { 2229 bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE); 2230 2231 if (rcu_access_pointer(sk->sk_reuseport_cb)) { 2232 reuseport_detach_sock(sk); 2233 use_call_rcu = true; 2234 } 2235 2236 if (use_call_rcu) 2237 call_rcu(&sk->sk_rcu, __sk_destruct); 2238 else 2239 __sk_destruct(&sk->sk_rcu); 2240 } 2241 2242 static void __sk_free(struct sock *sk) 2243 { 2244 if (likely(sk->sk_net_refcnt)) 2245 sock_inuse_add(sock_net(sk), -1); 2246 2247 if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk))) 2248 sock_diag_broadcast_destroy(sk); 2249 else 2250 sk_destruct(sk); 2251 } 2252 2253 void sk_free(struct sock *sk) 2254 { 2255 /* 2256 * We subtract one from sk_wmem_alloc and can know if 2257 * some packets are still in some tx queue. 2258 * If not null, sock_wfree() will call __sk_free(sk) later 2259 */ 2260 if (refcount_dec_and_test(&sk->sk_wmem_alloc)) 2261 __sk_free(sk); 2262 } 2263 EXPORT_SYMBOL(sk_free); 2264 2265 static void sk_init_common(struct sock *sk) 2266 { 2267 skb_queue_head_init(&sk->sk_receive_queue); 2268 skb_queue_head_init(&sk->sk_write_queue); 2269 skb_queue_head_init(&sk->sk_error_queue); 2270 2271 rwlock_init(&sk->sk_callback_lock); 2272 lockdep_set_class_and_name(&sk->sk_receive_queue.lock, 2273 af_rlock_keys + sk->sk_family, 2274 af_family_rlock_key_strings[sk->sk_family]); 2275 lockdep_set_class_and_name(&sk->sk_write_queue.lock, 2276 af_wlock_keys + sk->sk_family, 2277 af_family_wlock_key_strings[sk->sk_family]); 2278 lockdep_set_class_and_name(&sk->sk_error_queue.lock, 2279 af_elock_keys + sk->sk_family, 2280 af_family_elock_key_strings[sk->sk_family]); 2281 if (sk->sk_kern_sock) 2282 lockdep_set_class_and_name(&sk->sk_callback_lock, 2283 af_kern_callback_keys + sk->sk_family, 2284 af_family_kern_clock_key_strings[sk->sk_family]); 2285 else 2286 lockdep_set_class_and_name(&sk->sk_callback_lock, 2287 af_callback_keys + sk->sk_family, 2288 af_family_clock_key_strings[sk->sk_family]); 2289 } 2290 2291 /** 2292 * sk_clone_lock - clone a socket, and lock its clone 2293 * @sk: the socket to clone 2294 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) 2295 * 2296 * Caller must unlock socket even in error path (bh_unlock_sock(newsk)) 2297 */ 2298 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) 2299 { 2300 struct proto *prot = READ_ONCE(sk->sk_prot); 2301 struct sk_filter *filter; 2302 bool is_charged = true; 2303 struct sock *newsk; 2304 2305 newsk = sk_prot_alloc(prot, priority, sk->sk_family); 2306 if (!newsk) 2307 goto out; 2308 2309 sock_copy(newsk, sk); 2310 2311 newsk->sk_prot_creator = prot; 2312 2313 /* SANITY */ 2314 if (likely(newsk->sk_net_refcnt)) { 2315 get_net_track(sock_net(newsk), &newsk->ns_tracker, priority); 2316 sock_inuse_add(sock_net(newsk), 1); 2317 } else { 2318 /* Kernel sockets are not elevating the struct net refcount. 2319 * Instead, use a tracker to more easily detect if a layer 2320 * is not properly dismantling its kernel sockets at netns 2321 * destroy time. 2322 */ 2323 __netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker, 2324 false, priority); 2325 } 2326 sk_node_init(&newsk->sk_node); 2327 sock_lock_init(newsk); 2328 bh_lock_sock(newsk); 2329 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL; 2330 newsk->sk_backlog.len = 0; 2331 2332 atomic_set(&newsk->sk_rmem_alloc, 0); 2333 2334 /* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */ 2335 refcount_set(&newsk->sk_wmem_alloc, 1); 2336 2337 atomic_set(&newsk->sk_omem_alloc, 0); 2338 sk_init_common(newsk); 2339 2340 newsk->sk_dst_cache = NULL; 2341 newsk->sk_dst_pending_confirm = 0; 2342 newsk->sk_wmem_queued = 0; 2343 newsk->sk_forward_alloc = 0; 2344 newsk->sk_reserved_mem = 0; 2345 atomic_set(&newsk->sk_drops, 0); 2346 newsk->sk_send_head = NULL; 2347 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; 2348 atomic_set(&newsk->sk_zckey, 0); 2349 2350 sock_reset_flag(newsk, SOCK_DONE); 2351 2352 /* sk->sk_memcg will be populated at accept() time */ 2353 newsk->sk_memcg = NULL; 2354 2355 cgroup_sk_clone(&newsk->sk_cgrp_data); 2356 2357 rcu_read_lock(); 2358 filter = rcu_dereference(sk->sk_filter); 2359 if (filter != NULL) 2360 /* though it's an empty new sock, the charging may fail 2361 * if sysctl_optmem_max was changed between creation of 2362 * original socket and cloning 2363 */ 2364 is_charged = sk_filter_charge(newsk, filter); 2365 RCU_INIT_POINTER(newsk->sk_filter, filter); 2366 rcu_read_unlock(); 2367 2368 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) { 2369 /* We need to make sure that we don't uncharge the new 2370 * socket if we couldn't charge it in the first place 2371 * as otherwise we uncharge the parent's filter. 2372 */ 2373 if (!is_charged) 2374 RCU_INIT_POINTER(newsk->sk_filter, NULL); 2375 sk_free_unlock_clone(newsk); 2376 newsk = NULL; 2377 goto out; 2378 } 2379 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL); 2380 2381 if (bpf_sk_storage_clone(sk, newsk)) { 2382 sk_free_unlock_clone(newsk); 2383 newsk = NULL; 2384 goto out; 2385 } 2386 2387 /* Clear sk_user_data if parent had the pointer tagged 2388 * as not suitable for copying when cloning. 2389 */ 2390 if (sk_user_data_is_nocopy(newsk)) 2391 newsk->sk_user_data = NULL; 2392 2393 newsk->sk_err = 0; 2394 newsk->sk_err_soft = 0; 2395 newsk->sk_priority = 0; 2396 newsk->sk_incoming_cpu = raw_smp_processor_id(); 2397 2398 /* Before updating sk_refcnt, we must commit prior changes to memory 2399 * (Documentation/RCU/rculist_nulls.rst for details) 2400 */ 2401 smp_wmb(); 2402 refcount_set(&newsk->sk_refcnt, 2); 2403 2404 sk_set_socket(newsk, NULL); 2405 sk_tx_queue_clear(newsk); 2406 RCU_INIT_POINTER(newsk->sk_wq, NULL); 2407 2408 if (newsk->sk_prot->sockets_allocated) 2409 sk_sockets_allocated_inc(newsk); 2410 2411 if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP) 2412 net_enable_timestamp(); 2413 out: 2414 return newsk; 2415 } 2416 EXPORT_SYMBOL_GPL(sk_clone_lock); 2417 2418 void sk_free_unlock_clone(struct sock *sk) 2419 { 2420 /* It is still raw copy of parent, so invalidate 2421 * destructor and make plain sk_free() */ 2422 sk->sk_destruct = NULL; 2423 bh_unlock_sock(sk); 2424 sk_free(sk); 2425 } 2426 EXPORT_SYMBOL_GPL(sk_free_unlock_clone); 2427 2428 static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst) 2429 { 2430 bool is_ipv6 = false; 2431 u32 max_size; 2432 2433 #if IS_ENABLED(CONFIG_IPV6) 2434 is_ipv6 = (sk->sk_family == AF_INET6 && 2435 !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)); 2436 #endif 2437 /* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */ 2438 max_size = is_ipv6 ? READ_ONCE(dst->dev->gso_max_size) : 2439 READ_ONCE(dst->dev->gso_ipv4_max_size); 2440 if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk)) 2441 max_size = GSO_LEGACY_MAX_SIZE; 2442 2443 return max_size - (MAX_TCP_HEADER + 1); 2444 } 2445 2446 void sk_setup_caps(struct sock *sk, struct dst_entry *dst) 2447 { 2448 u32 max_segs = 1; 2449 2450 sk->sk_route_caps = dst->dev->features; 2451 if (sk_is_tcp(sk)) 2452 sk->sk_route_caps |= NETIF_F_GSO; 2453 if (sk->sk_route_caps & NETIF_F_GSO) 2454 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE; 2455 if (unlikely(sk->sk_gso_disabled)) 2456 sk->sk_route_caps &= ~NETIF_F_GSO_MASK; 2457 if (sk_can_gso(sk)) { 2458 if (dst->header_len && !xfrm_dst_offload_ok(dst)) { 2459 sk->sk_route_caps &= ~NETIF_F_GSO_MASK; 2460 } else { 2461 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; 2462 sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dst); 2463 /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */ 2464 max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1); 2465 } 2466 } 2467 sk->sk_gso_max_segs = max_segs; 2468 sk_dst_set(sk, dst); 2469 } 2470 EXPORT_SYMBOL_GPL(sk_setup_caps); 2471 2472 /* 2473 * Simple resource managers for sockets. 2474 */ 2475 2476 2477 /* 2478 * Write buffer destructor automatically called from kfree_skb. 2479 */ 2480 void sock_wfree(struct sk_buff *skb) 2481 { 2482 struct sock *sk = skb->sk; 2483 unsigned int len = skb->truesize; 2484 bool free; 2485 2486 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) { 2487 if (sock_flag(sk, SOCK_RCU_FREE) && 2488 sk->sk_write_space == sock_def_write_space) { 2489 rcu_read_lock(); 2490 free = refcount_sub_and_test(len, &sk->sk_wmem_alloc); 2491 sock_def_write_space_wfree(sk); 2492 rcu_read_unlock(); 2493 if (unlikely(free)) 2494 __sk_free(sk); 2495 return; 2496 } 2497 2498 /* 2499 * Keep a reference on sk_wmem_alloc, this will be released 2500 * after sk_write_space() call 2501 */ 2502 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc)); 2503 sk->sk_write_space(sk); 2504 len = 1; 2505 } 2506 /* 2507 * if sk_wmem_alloc reaches 0, we must finish what sk_free() 2508 * could not do because of in-flight packets 2509 */ 2510 if (refcount_sub_and_test(len, &sk->sk_wmem_alloc)) 2511 __sk_free(sk); 2512 } 2513 EXPORT_SYMBOL(sock_wfree); 2514 2515 /* This variant of sock_wfree() is used by TCP, 2516 * since it sets SOCK_USE_WRITE_QUEUE. 2517 */ 2518 void __sock_wfree(struct sk_buff *skb) 2519 { 2520 struct sock *sk = skb->sk; 2521 2522 if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc)) 2523 __sk_free(sk); 2524 } 2525 2526 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) 2527 { 2528 skb_orphan(skb); 2529 skb->sk = sk; 2530 #ifdef CONFIG_INET 2531 if (unlikely(!sk_fullsock(sk))) { 2532 skb->destructor = sock_edemux; 2533 sock_hold(sk); 2534 return; 2535 } 2536 #endif 2537 skb->destructor = sock_wfree; 2538 skb_set_hash_from_sk(skb, sk); 2539 /* 2540 * We used to take a refcount on sk, but following operation 2541 * is enough to guarantee sk_free() won't free this sock until 2542 * all in-flight packets are completed 2543 */ 2544 refcount_add(skb->truesize, &sk->sk_wmem_alloc); 2545 } 2546 EXPORT_SYMBOL(skb_set_owner_w); 2547 2548 static bool can_skb_orphan_partial(const struct sk_buff *skb) 2549 { 2550 /* Drivers depend on in-order delivery for crypto offload, 2551 * partial orphan breaks out-of-order-OK logic. 2552 */ 2553 if (skb_is_decrypted(skb)) 2554 return false; 2555 2556 return (skb->destructor == sock_wfree || 2557 (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree)); 2558 } 2559 2560 /* This helper is used by netem, as it can hold packets in its 2561 * delay queue. We want to allow the owner socket to send more 2562 * packets, as if they were already TX completed by a typical driver. 2563 * But we also want to keep skb->sk set because some packet schedulers 2564 * rely on it (sch_fq for example). 2565 */ 2566 void skb_orphan_partial(struct sk_buff *skb) 2567 { 2568 if (skb_is_tcp_pure_ack(skb)) 2569 return; 2570 2571 if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk)) 2572 return; 2573 2574 skb_orphan(skb); 2575 } 2576 EXPORT_SYMBOL(skb_orphan_partial); 2577 2578 /* 2579 * Read buffer destructor automatically called from kfree_skb. 2580 */ 2581 void sock_rfree(struct sk_buff *skb) 2582 { 2583 struct sock *sk = skb->sk; 2584 unsigned int len = skb->truesize; 2585 2586 atomic_sub(len, &sk->sk_rmem_alloc); 2587 sk_mem_uncharge(sk, len); 2588 } 2589 EXPORT_SYMBOL(sock_rfree); 2590 2591 /* 2592 * Buffer destructor for skbs that are not used directly in read or write 2593 * path, e.g. for error handler skbs. Automatically called from kfree_skb. 2594 */ 2595 void sock_efree(struct sk_buff *skb) 2596 { 2597 sock_put(skb->sk); 2598 } 2599 EXPORT_SYMBOL(sock_efree); 2600 2601 /* Buffer destructor for prefetch/receive path where reference count may 2602 * not be held, e.g. for listen sockets. 2603 */ 2604 #ifdef CONFIG_INET 2605 void sock_pfree(struct sk_buff *skb) 2606 { 2607 struct sock *sk = skb->sk; 2608 2609 if (!sk_is_refcounted(sk)) 2610 return; 2611 2612 if (sk->sk_state == TCP_NEW_SYN_RECV && inet_reqsk(sk)->syncookie) { 2613 inet_reqsk(sk)->rsk_listener = NULL; 2614 reqsk_free(inet_reqsk(sk)); 2615 return; 2616 } 2617 2618 sock_gen_put(sk); 2619 } 2620 EXPORT_SYMBOL(sock_pfree); 2621 #endif /* CONFIG_INET */ 2622 2623 kuid_t sock_i_uid(struct sock *sk) 2624 { 2625 kuid_t uid; 2626 2627 read_lock_bh(&sk->sk_callback_lock); 2628 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID; 2629 read_unlock_bh(&sk->sk_callback_lock); 2630 return uid; 2631 } 2632 EXPORT_SYMBOL(sock_i_uid); 2633 2634 unsigned long __sock_i_ino(struct sock *sk) 2635 { 2636 unsigned long ino; 2637 2638 read_lock(&sk->sk_callback_lock); 2639 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0; 2640 read_unlock(&sk->sk_callback_lock); 2641 return ino; 2642 } 2643 EXPORT_SYMBOL(__sock_i_ino); 2644 2645 unsigned long sock_i_ino(struct sock *sk) 2646 { 2647 unsigned long ino; 2648 2649 local_bh_disable(); 2650 ino = __sock_i_ino(sk); 2651 local_bh_enable(); 2652 return ino; 2653 } 2654 EXPORT_SYMBOL(sock_i_ino); 2655 2656 /* 2657 * Allocate a skb from the socket's send buffer. 2658 */ 2659 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, 2660 gfp_t priority) 2661 { 2662 if (force || 2663 refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) { 2664 struct sk_buff *skb = alloc_skb(size, priority); 2665 2666 if (skb) { 2667 skb_set_owner_w(skb, sk); 2668 return skb; 2669 } 2670 } 2671 return NULL; 2672 } 2673 EXPORT_SYMBOL(sock_wmalloc); 2674 2675 static void sock_ofree(struct sk_buff *skb) 2676 { 2677 struct sock *sk = skb->sk; 2678 2679 atomic_sub(skb->truesize, &sk->sk_omem_alloc); 2680 } 2681 2682 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size, 2683 gfp_t priority) 2684 { 2685 struct sk_buff *skb; 2686 2687 /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */ 2688 if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) > 2689 READ_ONCE(sock_net(sk)->core.sysctl_optmem_max)) 2690 return NULL; 2691 2692 skb = alloc_skb(size, priority); 2693 if (!skb) 2694 return NULL; 2695 2696 atomic_add(skb->truesize, &sk->sk_omem_alloc); 2697 skb->sk = sk; 2698 skb->destructor = sock_ofree; 2699 return skb; 2700 } 2701 2702 /* 2703 * Allocate a memory block from the socket's option memory buffer. 2704 */ 2705 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority) 2706 { 2707 int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max); 2708 2709 if ((unsigned int)size <= optmem_max && 2710 atomic_read(&sk->sk_omem_alloc) + size < optmem_max) { 2711 void *mem; 2712 /* First do the add, to avoid the race if kmalloc 2713 * might sleep. 2714 */ 2715 atomic_add(size, &sk->sk_omem_alloc); 2716 mem = kmalloc(size, priority); 2717 if (mem) 2718 return mem; 2719 atomic_sub(size, &sk->sk_omem_alloc); 2720 } 2721 return NULL; 2722 } 2723 EXPORT_SYMBOL(sock_kmalloc); 2724 2725 /* Free an option memory block. Note, we actually want the inline 2726 * here as this allows gcc to detect the nullify and fold away the 2727 * condition entirely. 2728 */ 2729 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size, 2730 const bool nullify) 2731 { 2732 if (WARN_ON_ONCE(!mem)) 2733 return; 2734 if (nullify) 2735 kfree_sensitive(mem); 2736 else 2737 kfree(mem); 2738 atomic_sub(size, &sk->sk_omem_alloc); 2739 } 2740 2741 void sock_kfree_s(struct sock *sk, void *mem, int size) 2742 { 2743 __sock_kfree_s(sk, mem, size, false); 2744 } 2745 EXPORT_SYMBOL(sock_kfree_s); 2746 2747 void sock_kzfree_s(struct sock *sk, void *mem, int size) 2748 { 2749 __sock_kfree_s(sk, mem, size, true); 2750 } 2751 EXPORT_SYMBOL(sock_kzfree_s); 2752 2753 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock. 2754 I think, these locks should be removed for datagram sockets. 2755 */ 2756 static long sock_wait_for_wmem(struct sock *sk, long timeo) 2757 { 2758 DEFINE_WAIT(wait); 2759 2760 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); 2761 for (;;) { 2762 if (!timeo) 2763 break; 2764 if (signal_pending(current)) 2765 break; 2766 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 2767 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); 2768 if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) 2769 break; 2770 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) 2771 break; 2772 if (READ_ONCE(sk->sk_err)) 2773 break; 2774 timeo = schedule_timeout(timeo); 2775 } 2776 finish_wait(sk_sleep(sk), &wait); 2777 return timeo; 2778 } 2779 2780 2781 /* 2782 * Generic send/receive buffer handlers 2783 */ 2784 2785 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, 2786 unsigned long data_len, int noblock, 2787 int *errcode, int max_page_order) 2788 { 2789 struct sk_buff *skb; 2790 long timeo; 2791 int err; 2792 2793 timeo = sock_sndtimeo(sk, noblock); 2794 for (;;) { 2795 err = sock_error(sk); 2796 if (err != 0) 2797 goto failure; 2798 2799 err = -EPIPE; 2800 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) 2801 goto failure; 2802 2803 if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf)) 2804 break; 2805 2806 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); 2807 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 2808 err = -EAGAIN; 2809 if (!timeo) 2810 goto failure; 2811 if (signal_pending(current)) 2812 goto interrupted; 2813 timeo = sock_wait_for_wmem(sk, timeo); 2814 } 2815 skb = alloc_skb_with_frags(header_len, data_len, max_page_order, 2816 errcode, sk->sk_allocation); 2817 if (skb) 2818 skb_set_owner_w(skb, sk); 2819 return skb; 2820 2821 interrupted: 2822 err = sock_intr_errno(timeo); 2823 failure: 2824 *errcode = err; 2825 return NULL; 2826 } 2827 EXPORT_SYMBOL(sock_alloc_send_pskb); 2828 2829 int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg, 2830 struct sockcm_cookie *sockc) 2831 { 2832 u32 tsflags; 2833 2834 switch (cmsg->cmsg_type) { 2835 case SO_MARK: 2836 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && 2837 !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 2838 return -EPERM; 2839 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) 2840 return -EINVAL; 2841 sockc->mark = *(u32 *)CMSG_DATA(cmsg); 2842 break; 2843 case SO_TIMESTAMPING_OLD: 2844 case SO_TIMESTAMPING_NEW: 2845 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) 2846 return -EINVAL; 2847 2848 tsflags = *(u32 *)CMSG_DATA(cmsg); 2849 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK) 2850 return -EINVAL; 2851 2852 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK; 2853 sockc->tsflags |= tsflags; 2854 break; 2855 case SCM_TXTIME: 2856 if (!sock_flag(sk, SOCK_TXTIME)) 2857 return -EINVAL; 2858 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64))) 2859 return -EINVAL; 2860 sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg)); 2861 break; 2862 /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */ 2863 case SCM_RIGHTS: 2864 case SCM_CREDENTIALS: 2865 break; 2866 default: 2867 return -EINVAL; 2868 } 2869 return 0; 2870 } 2871 EXPORT_SYMBOL(__sock_cmsg_send); 2872 2873 int sock_cmsg_send(struct sock *sk, struct msghdr *msg, 2874 struct sockcm_cookie *sockc) 2875 { 2876 struct cmsghdr *cmsg; 2877 int ret; 2878 2879 for_each_cmsghdr(cmsg, msg) { 2880 if (!CMSG_OK(msg, cmsg)) 2881 return -EINVAL; 2882 if (cmsg->cmsg_level != SOL_SOCKET) 2883 continue; 2884 ret = __sock_cmsg_send(sk, cmsg, sockc); 2885 if (ret) 2886 return ret; 2887 } 2888 return 0; 2889 } 2890 EXPORT_SYMBOL(sock_cmsg_send); 2891 2892 static void sk_enter_memory_pressure(struct sock *sk) 2893 { 2894 if (!sk->sk_prot->enter_memory_pressure) 2895 return; 2896 2897 sk->sk_prot->enter_memory_pressure(sk); 2898 } 2899 2900 static void sk_leave_memory_pressure(struct sock *sk) 2901 { 2902 if (sk->sk_prot->leave_memory_pressure) { 2903 INDIRECT_CALL_INET_1(sk->sk_prot->leave_memory_pressure, 2904 tcp_leave_memory_pressure, sk); 2905 } else { 2906 unsigned long *memory_pressure = sk->sk_prot->memory_pressure; 2907 2908 if (memory_pressure && READ_ONCE(*memory_pressure)) 2909 WRITE_ONCE(*memory_pressure, 0); 2910 } 2911 } 2912 2913 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key); 2914 2915 /** 2916 * skb_page_frag_refill - check that a page_frag contains enough room 2917 * @sz: minimum size of the fragment we want to get 2918 * @pfrag: pointer to page_frag 2919 * @gfp: priority for memory allocation 2920 * 2921 * Note: While this allocator tries to use high order pages, there is 2922 * no guarantee that allocations succeed. Therefore, @sz MUST be 2923 * less or equal than PAGE_SIZE. 2924 */ 2925 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp) 2926 { 2927 if (pfrag->page) { 2928 if (page_ref_count(pfrag->page) == 1) { 2929 pfrag->offset = 0; 2930 return true; 2931 } 2932 if (pfrag->offset + sz <= pfrag->size) 2933 return true; 2934 put_page(pfrag->page); 2935 } 2936 2937 pfrag->offset = 0; 2938 if (SKB_FRAG_PAGE_ORDER && 2939 !static_branch_unlikely(&net_high_order_alloc_disable_key)) { 2940 /* Avoid direct reclaim but allow kswapd to wake */ 2941 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) | 2942 __GFP_COMP | __GFP_NOWARN | 2943 __GFP_NORETRY, 2944 SKB_FRAG_PAGE_ORDER); 2945 if (likely(pfrag->page)) { 2946 pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER; 2947 return true; 2948 } 2949 } 2950 pfrag->page = alloc_page(gfp); 2951 if (likely(pfrag->page)) { 2952 pfrag->size = PAGE_SIZE; 2953 return true; 2954 } 2955 return false; 2956 } 2957 EXPORT_SYMBOL(skb_page_frag_refill); 2958 2959 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) 2960 { 2961 if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation))) 2962 return true; 2963 2964 sk_enter_memory_pressure(sk); 2965 sk_stream_moderate_sndbuf(sk); 2966 return false; 2967 } 2968 EXPORT_SYMBOL(sk_page_frag_refill); 2969 2970 void __lock_sock(struct sock *sk) 2971 __releases(&sk->sk_lock.slock) 2972 __acquires(&sk->sk_lock.slock) 2973 { 2974 DEFINE_WAIT(wait); 2975 2976 for (;;) { 2977 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait, 2978 TASK_UNINTERRUPTIBLE); 2979 spin_unlock_bh(&sk->sk_lock.slock); 2980 schedule(); 2981 spin_lock_bh(&sk->sk_lock.slock); 2982 if (!sock_owned_by_user(sk)) 2983 break; 2984 } 2985 finish_wait(&sk->sk_lock.wq, &wait); 2986 } 2987 2988 void __release_sock(struct sock *sk) 2989 __releases(&sk->sk_lock.slock) 2990 __acquires(&sk->sk_lock.slock) 2991 { 2992 struct sk_buff *skb, *next; 2993 2994 while ((skb = sk->sk_backlog.head) != NULL) { 2995 sk->sk_backlog.head = sk->sk_backlog.tail = NULL; 2996 2997 spin_unlock_bh(&sk->sk_lock.slock); 2998 2999 do { 3000 next = skb->next; 3001 prefetch(next); 3002 DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb)); 3003 skb_mark_not_on_list(skb); 3004 sk_backlog_rcv(sk, skb); 3005 3006 cond_resched(); 3007 3008 skb = next; 3009 } while (skb != NULL); 3010 3011 spin_lock_bh(&sk->sk_lock.slock); 3012 } 3013 3014 /* 3015 * Doing the zeroing here guarantee we can not loop forever 3016 * while a wild producer attempts to flood us. 3017 */ 3018 sk->sk_backlog.len = 0; 3019 } 3020 3021 void __sk_flush_backlog(struct sock *sk) 3022 { 3023 spin_lock_bh(&sk->sk_lock.slock); 3024 __release_sock(sk); 3025 3026 if (sk->sk_prot->release_cb) 3027 INDIRECT_CALL_INET_1(sk->sk_prot->release_cb, 3028 tcp_release_cb, sk); 3029 3030 spin_unlock_bh(&sk->sk_lock.slock); 3031 } 3032 EXPORT_SYMBOL_GPL(__sk_flush_backlog); 3033 3034 /** 3035 * sk_wait_data - wait for data to arrive at sk_receive_queue 3036 * @sk: sock to wait on 3037 * @timeo: for how long 3038 * @skb: last skb seen on sk_receive_queue 3039 * 3040 * Now socket state including sk->sk_err is changed only under lock, 3041 * hence we may omit checks after joining wait queue. 3042 * We check receive queue before schedule() only as optimization; 3043 * it is very likely that release_sock() added new data. 3044 */ 3045 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb) 3046 { 3047 DEFINE_WAIT_FUNC(wait, woken_wake_function); 3048 int rc; 3049 3050 add_wait_queue(sk_sleep(sk), &wait); 3051 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); 3052 rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait); 3053 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); 3054 remove_wait_queue(sk_sleep(sk), &wait); 3055 return rc; 3056 } 3057 EXPORT_SYMBOL(sk_wait_data); 3058 3059 /** 3060 * __sk_mem_raise_allocated - increase memory_allocated 3061 * @sk: socket 3062 * @size: memory size to allocate 3063 * @amt: pages to allocate 3064 * @kind: allocation type 3065 * 3066 * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc. 3067 * 3068 * Unlike the globally shared limits among the sockets under same protocol, 3069 * consuming the budget of a memcg won't have direct effect on other ones. 3070 * So be optimistic about memcg's tolerance, and leave the callers to decide 3071 * whether or not to raise allocated through sk_under_memory_pressure() or 3072 * its variants. 3073 */ 3074 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind) 3075 { 3076 struct mem_cgroup *memcg = mem_cgroup_sockets_enabled ? sk->sk_memcg : NULL; 3077 struct proto *prot = sk->sk_prot; 3078 bool charged = false; 3079 long allocated; 3080 3081 sk_memory_allocated_add(sk, amt); 3082 allocated = sk_memory_allocated(sk); 3083 3084 if (memcg) { 3085 if (!mem_cgroup_charge_skmem(memcg, amt, gfp_memcg_charge())) 3086 goto suppress_allocation; 3087 charged = true; 3088 } 3089 3090 /* Under limit. */ 3091 if (allocated <= sk_prot_mem_limits(sk, 0)) { 3092 sk_leave_memory_pressure(sk); 3093 return 1; 3094 } 3095 3096 /* Under pressure. */ 3097 if (allocated > sk_prot_mem_limits(sk, 1)) 3098 sk_enter_memory_pressure(sk); 3099 3100 /* Over hard limit. */ 3101 if (allocated > sk_prot_mem_limits(sk, 2)) 3102 goto suppress_allocation; 3103 3104 /* Guarantee minimum buffer size under pressure (either global 3105 * or memcg) to make sure features described in RFC 7323 (TCP 3106 * Extensions for High Performance) work properly. 3107 * 3108 * This rule does NOT stand when exceeds global or memcg's hard 3109 * limit, or else a DoS attack can be taken place by spawning 3110 * lots of sockets whose usage are under minimum buffer size. 3111 */ 3112 if (kind == SK_MEM_RECV) { 3113 if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot)) 3114 return 1; 3115 3116 } else { /* SK_MEM_SEND */ 3117 int wmem0 = sk_get_wmem0(sk, prot); 3118 3119 if (sk->sk_type == SOCK_STREAM) { 3120 if (sk->sk_wmem_queued < wmem0) 3121 return 1; 3122 } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) { 3123 return 1; 3124 } 3125 } 3126 3127 if (sk_has_memory_pressure(sk)) { 3128 u64 alloc; 3129 3130 /* The following 'average' heuristic is within the 3131 * scope of global accounting, so it only makes 3132 * sense for global memory pressure. 3133 */ 3134 if (!sk_under_global_memory_pressure(sk)) 3135 return 1; 3136 3137 /* Try to be fair among all the sockets under global 3138 * pressure by allowing the ones that below average 3139 * usage to raise. 3140 */ 3141 alloc = sk_sockets_allocated_read_positive(sk); 3142 if (sk_prot_mem_limits(sk, 2) > alloc * 3143 sk_mem_pages(sk->sk_wmem_queued + 3144 atomic_read(&sk->sk_rmem_alloc) + 3145 sk->sk_forward_alloc)) 3146 return 1; 3147 } 3148 3149 suppress_allocation: 3150 3151 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) { 3152 sk_stream_moderate_sndbuf(sk); 3153 3154 /* Fail only if socket is _under_ its sndbuf. 3155 * In this case we cannot block, so that we have to fail. 3156 */ 3157 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) { 3158 /* Force charge with __GFP_NOFAIL */ 3159 if (memcg && !charged) { 3160 mem_cgroup_charge_skmem(memcg, amt, 3161 gfp_memcg_charge() | __GFP_NOFAIL); 3162 } 3163 return 1; 3164 } 3165 } 3166 3167 if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged)) 3168 trace_sock_exceed_buf_limit(sk, prot, allocated, kind); 3169 3170 sk_memory_allocated_sub(sk, amt); 3171 3172 if (charged) 3173 mem_cgroup_uncharge_skmem(memcg, amt); 3174 3175 return 0; 3176 } 3177 3178 /** 3179 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated 3180 * @sk: socket 3181 * @size: memory size to allocate 3182 * @kind: allocation type 3183 * 3184 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means 3185 * rmem allocation. This function assumes that protocols which have 3186 * memory_pressure use sk_wmem_queued as write buffer accounting. 3187 */ 3188 int __sk_mem_schedule(struct sock *sk, int size, int kind) 3189 { 3190 int ret, amt = sk_mem_pages(size); 3191 3192 sk_forward_alloc_add(sk, amt << PAGE_SHIFT); 3193 ret = __sk_mem_raise_allocated(sk, size, amt, kind); 3194 if (!ret) 3195 sk_forward_alloc_add(sk, -(amt << PAGE_SHIFT)); 3196 return ret; 3197 } 3198 EXPORT_SYMBOL(__sk_mem_schedule); 3199 3200 /** 3201 * __sk_mem_reduce_allocated - reclaim memory_allocated 3202 * @sk: socket 3203 * @amount: number of quanta 3204 * 3205 * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc 3206 */ 3207 void __sk_mem_reduce_allocated(struct sock *sk, int amount) 3208 { 3209 sk_memory_allocated_sub(sk, amount); 3210 3211 if (mem_cgroup_sockets_enabled && sk->sk_memcg) 3212 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount); 3213 3214 if (sk_under_global_memory_pressure(sk) && 3215 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))) 3216 sk_leave_memory_pressure(sk); 3217 } 3218 3219 /** 3220 * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated 3221 * @sk: socket 3222 * @amount: number of bytes (rounded down to a PAGE_SIZE multiple) 3223 */ 3224 void __sk_mem_reclaim(struct sock *sk, int amount) 3225 { 3226 amount >>= PAGE_SHIFT; 3227 sk_forward_alloc_add(sk, -(amount << PAGE_SHIFT)); 3228 __sk_mem_reduce_allocated(sk, amount); 3229 } 3230 EXPORT_SYMBOL(__sk_mem_reclaim); 3231 3232 int sk_set_peek_off(struct sock *sk, int val) 3233 { 3234 WRITE_ONCE(sk->sk_peek_off, val); 3235 return 0; 3236 } 3237 EXPORT_SYMBOL_GPL(sk_set_peek_off); 3238 3239 /* 3240 * Set of default routines for initialising struct proto_ops when 3241 * the protocol does not support a particular function. In certain 3242 * cases where it makes no sense for a protocol to have a "do nothing" 3243 * function, some default processing is provided. 3244 */ 3245 3246 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len) 3247 { 3248 return -EOPNOTSUPP; 3249 } 3250 EXPORT_SYMBOL(sock_no_bind); 3251 3252 int sock_no_connect(struct socket *sock, struct sockaddr *saddr, 3253 int len, int flags) 3254 { 3255 return -EOPNOTSUPP; 3256 } 3257 EXPORT_SYMBOL(sock_no_connect); 3258 3259 int sock_no_socketpair(struct socket *sock1, struct socket *sock2) 3260 { 3261 return -EOPNOTSUPP; 3262 } 3263 EXPORT_SYMBOL(sock_no_socketpair); 3264 3265 int sock_no_accept(struct socket *sock, struct socket *newsock, 3266 struct proto_accept_arg *arg) 3267 { 3268 return -EOPNOTSUPP; 3269 } 3270 EXPORT_SYMBOL(sock_no_accept); 3271 3272 int sock_no_getname(struct socket *sock, struct sockaddr *saddr, 3273 int peer) 3274 { 3275 return -EOPNOTSUPP; 3276 } 3277 EXPORT_SYMBOL(sock_no_getname); 3278 3279 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 3280 { 3281 return -EOPNOTSUPP; 3282 } 3283 EXPORT_SYMBOL(sock_no_ioctl); 3284 3285 int sock_no_listen(struct socket *sock, int backlog) 3286 { 3287 return -EOPNOTSUPP; 3288 } 3289 EXPORT_SYMBOL(sock_no_listen); 3290 3291 int sock_no_shutdown(struct socket *sock, int how) 3292 { 3293 return -EOPNOTSUPP; 3294 } 3295 EXPORT_SYMBOL(sock_no_shutdown); 3296 3297 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len) 3298 { 3299 return -EOPNOTSUPP; 3300 } 3301 EXPORT_SYMBOL(sock_no_sendmsg); 3302 3303 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len) 3304 { 3305 return -EOPNOTSUPP; 3306 } 3307 EXPORT_SYMBOL(sock_no_sendmsg_locked); 3308 3309 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len, 3310 int flags) 3311 { 3312 return -EOPNOTSUPP; 3313 } 3314 EXPORT_SYMBOL(sock_no_recvmsg); 3315 3316 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma) 3317 { 3318 /* Mirror missing mmap method error code */ 3319 return -ENODEV; 3320 } 3321 EXPORT_SYMBOL(sock_no_mmap); 3322 3323 /* 3324 * When a file is received (via SCM_RIGHTS, etc), we must bump the 3325 * various sock-based usage counts. 3326 */ 3327 void __receive_sock(struct file *file) 3328 { 3329 struct socket *sock; 3330 3331 sock = sock_from_file(file); 3332 if (sock) { 3333 sock_update_netprioidx(&sock->sk->sk_cgrp_data); 3334 sock_update_classid(&sock->sk->sk_cgrp_data); 3335 } 3336 } 3337 3338 /* 3339 * Default Socket Callbacks 3340 */ 3341 3342 static void sock_def_wakeup(struct sock *sk) 3343 { 3344 struct socket_wq *wq; 3345 3346 rcu_read_lock(); 3347 wq = rcu_dereference(sk->sk_wq); 3348 if (skwq_has_sleeper(wq)) 3349 wake_up_interruptible_all(&wq->wait); 3350 rcu_read_unlock(); 3351 } 3352 3353 static void sock_def_error_report(struct sock *sk) 3354 { 3355 struct socket_wq *wq; 3356 3357 rcu_read_lock(); 3358 wq = rcu_dereference(sk->sk_wq); 3359 if (skwq_has_sleeper(wq)) 3360 wake_up_interruptible_poll(&wq->wait, EPOLLERR); 3361 sk_wake_async_rcu(sk, SOCK_WAKE_IO, POLL_ERR); 3362 rcu_read_unlock(); 3363 } 3364 3365 void sock_def_readable(struct sock *sk) 3366 { 3367 struct socket_wq *wq; 3368 3369 trace_sk_data_ready(sk); 3370 3371 rcu_read_lock(); 3372 wq = rcu_dereference(sk->sk_wq); 3373 if (skwq_has_sleeper(wq)) 3374 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI | 3375 EPOLLRDNORM | EPOLLRDBAND); 3376 sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN); 3377 rcu_read_unlock(); 3378 } 3379 3380 static void sock_def_write_space(struct sock *sk) 3381 { 3382 struct socket_wq *wq; 3383 3384 rcu_read_lock(); 3385 3386 /* Do not wake up a writer until he can make "significant" 3387 * progress. --DaveM 3388 */ 3389 if (sock_writeable(sk)) { 3390 wq = rcu_dereference(sk->sk_wq); 3391 if (skwq_has_sleeper(wq)) 3392 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | 3393 EPOLLWRNORM | EPOLLWRBAND); 3394 3395 /* Should agree with poll, otherwise some programs break */ 3396 sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); 3397 } 3398 3399 rcu_read_unlock(); 3400 } 3401 3402 /* An optimised version of sock_def_write_space(), should only be called 3403 * for SOCK_RCU_FREE sockets under RCU read section and after putting 3404 * ->sk_wmem_alloc. 3405 */ 3406 static void sock_def_write_space_wfree(struct sock *sk) 3407 { 3408 /* Do not wake up a writer until he can make "significant" 3409 * progress. --DaveM 3410 */ 3411 if (sock_writeable(sk)) { 3412 struct socket_wq *wq = rcu_dereference(sk->sk_wq); 3413 3414 /* rely on refcount_sub from sock_wfree() */ 3415 smp_mb__after_atomic(); 3416 if (wq && waitqueue_active(&wq->wait)) 3417 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | 3418 EPOLLWRNORM | EPOLLWRBAND); 3419 3420 /* Should agree with poll, otherwise some programs break */ 3421 sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); 3422 } 3423 } 3424 3425 static void sock_def_destruct(struct sock *sk) 3426 { 3427 } 3428 3429 void sk_send_sigurg(struct sock *sk) 3430 { 3431 if (sk->sk_socket && sk->sk_socket->file) 3432 if (send_sigurg(&sk->sk_socket->file->f_owner)) 3433 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI); 3434 } 3435 EXPORT_SYMBOL(sk_send_sigurg); 3436 3437 void sk_reset_timer(struct sock *sk, struct timer_list* timer, 3438 unsigned long expires) 3439 { 3440 if (!mod_timer(timer, expires)) 3441 sock_hold(sk); 3442 } 3443 EXPORT_SYMBOL(sk_reset_timer); 3444 3445 void sk_stop_timer(struct sock *sk, struct timer_list* timer) 3446 { 3447 if (del_timer(timer)) 3448 __sock_put(sk); 3449 } 3450 EXPORT_SYMBOL(sk_stop_timer); 3451 3452 void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer) 3453 { 3454 if (del_timer_sync(timer)) 3455 __sock_put(sk); 3456 } 3457 EXPORT_SYMBOL(sk_stop_timer_sync); 3458 3459 void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid) 3460 { 3461 sk_init_common(sk); 3462 sk->sk_send_head = NULL; 3463 3464 timer_setup(&sk->sk_timer, NULL, 0); 3465 3466 sk->sk_allocation = GFP_KERNEL; 3467 sk->sk_rcvbuf = READ_ONCE(sysctl_rmem_default); 3468 sk->sk_sndbuf = READ_ONCE(sysctl_wmem_default); 3469 sk->sk_state = TCP_CLOSE; 3470 sk->sk_use_task_frag = true; 3471 sk_set_socket(sk, sock); 3472 3473 sock_set_flag(sk, SOCK_ZAPPED); 3474 3475 if (sock) { 3476 sk->sk_type = sock->type; 3477 RCU_INIT_POINTER(sk->sk_wq, &sock->wq); 3478 sock->sk = sk; 3479 } else { 3480 RCU_INIT_POINTER(sk->sk_wq, NULL); 3481 } 3482 sk->sk_uid = uid; 3483 3484 sk->sk_state_change = sock_def_wakeup; 3485 sk->sk_data_ready = sock_def_readable; 3486 sk->sk_write_space = sock_def_write_space; 3487 sk->sk_error_report = sock_def_error_report; 3488 sk->sk_destruct = sock_def_destruct; 3489 3490 sk->sk_frag.page = NULL; 3491 sk->sk_frag.offset = 0; 3492 sk->sk_peek_off = -1; 3493 3494 sk->sk_peer_pid = NULL; 3495 sk->sk_peer_cred = NULL; 3496 spin_lock_init(&sk->sk_peer_lock); 3497 3498 sk->sk_write_pending = 0; 3499 sk->sk_rcvlowat = 1; 3500 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; 3501 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; 3502 3503 sk->sk_stamp = SK_DEFAULT_STAMP; 3504 #if BITS_PER_LONG==32 3505 seqlock_init(&sk->sk_stamp_seq); 3506 #endif 3507 atomic_set(&sk->sk_zckey, 0); 3508 3509 #ifdef CONFIG_NET_RX_BUSY_POLL 3510 sk->sk_napi_id = 0; 3511 sk->sk_ll_usec = READ_ONCE(sysctl_net_busy_read); 3512 #endif 3513 3514 sk->sk_max_pacing_rate = ~0UL; 3515 sk->sk_pacing_rate = ~0UL; 3516 WRITE_ONCE(sk->sk_pacing_shift, 10); 3517 sk->sk_incoming_cpu = -1; 3518 3519 sk_rx_queue_clear(sk); 3520 /* 3521 * Before updating sk_refcnt, we must commit prior changes to memory 3522 * (Documentation/RCU/rculist_nulls.rst for details) 3523 */ 3524 smp_wmb(); 3525 refcount_set(&sk->sk_refcnt, 1); 3526 atomic_set(&sk->sk_drops, 0); 3527 } 3528 EXPORT_SYMBOL(sock_init_data_uid); 3529 3530 void sock_init_data(struct socket *sock, struct sock *sk) 3531 { 3532 kuid_t uid = sock ? 3533 SOCK_INODE(sock)->i_uid : 3534 make_kuid(sock_net(sk)->user_ns, 0); 3535 3536 sock_init_data_uid(sock, sk, uid); 3537 } 3538 EXPORT_SYMBOL(sock_init_data); 3539 3540 void lock_sock_nested(struct sock *sk, int subclass) 3541 { 3542 /* The sk_lock has mutex_lock() semantics here. */ 3543 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_); 3544 3545 might_sleep(); 3546 spin_lock_bh(&sk->sk_lock.slock); 3547 if (sock_owned_by_user_nocheck(sk)) 3548 __lock_sock(sk); 3549 sk->sk_lock.owned = 1; 3550 spin_unlock_bh(&sk->sk_lock.slock); 3551 } 3552 EXPORT_SYMBOL(lock_sock_nested); 3553 3554 void release_sock(struct sock *sk) 3555 { 3556 spin_lock_bh(&sk->sk_lock.slock); 3557 if (sk->sk_backlog.tail) 3558 __release_sock(sk); 3559 3560 if (sk->sk_prot->release_cb) 3561 INDIRECT_CALL_INET_1(sk->sk_prot->release_cb, 3562 tcp_release_cb, sk); 3563 3564 sock_release_ownership(sk); 3565 if (waitqueue_active(&sk->sk_lock.wq)) 3566 wake_up(&sk->sk_lock.wq); 3567 spin_unlock_bh(&sk->sk_lock.slock); 3568 } 3569 EXPORT_SYMBOL(release_sock); 3570 3571 bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock) 3572 { 3573 might_sleep(); 3574 spin_lock_bh(&sk->sk_lock.slock); 3575 3576 if (!sock_owned_by_user_nocheck(sk)) { 3577 /* 3578 * Fast path return with bottom halves disabled and 3579 * sock::sk_lock.slock held. 3580 * 3581 * The 'mutex' is not contended and holding 3582 * sock::sk_lock.slock prevents all other lockers to 3583 * proceed so the corresponding unlock_sock_fast() can 3584 * avoid the slow path of release_sock() completely and 3585 * just release slock. 3586 * 3587 * From a semantical POV this is equivalent to 'acquiring' 3588 * the 'mutex', hence the corresponding lockdep 3589 * mutex_release() has to happen in the fast path of 3590 * unlock_sock_fast(). 3591 */ 3592 return false; 3593 } 3594 3595 __lock_sock(sk); 3596 sk->sk_lock.owned = 1; 3597 __acquire(&sk->sk_lock.slock); 3598 spin_unlock_bh(&sk->sk_lock.slock); 3599 return true; 3600 } 3601 EXPORT_SYMBOL(__lock_sock_fast); 3602 3603 int sock_gettstamp(struct socket *sock, void __user *userstamp, 3604 bool timeval, bool time32) 3605 { 3606 struct sock *sk = sock->sk; 3607 struct timespec64 ts; 3608 3609 sock_enable_timestamp(sk, SOCK_TIMESTAMP); 3610 ts = ktime_to_timespec64(sock_read_timestamp(sk)); 3611 if (ts.tv_sec == -1) 3612 return -ENOENT; 3613 if (ts.tv_sec == 0) { 3614 ktime_t kt = ktime_get_real(); 3615 sock_write_timestamp(sk, kt); 3616 ts = ktime_to_timespec64(kt); 3617 } 3618 3619 if (timeval) 3620 ts.tv_nsec /= 1000; 3621 3622 #ifdef CONFIG_COMPAT_32BIT_TIME 3623 if (time32) 3624 return put_old_timespec32(&ts, userstamp); 3625 #endif 3626 #ifdef CONFIG_SPARC64 3627 /* beware of padding in sparc64 timeval */ 3628 if (timeval && !in_compat_syscall()) { 3629 struct __kernel_old_timeval __user tv = { 3630 .tv_sec = ts.tv_sec, 3631 .tv_usec = ts.tv_nsec, 3632 }; 3633 if (copy_to_user(userstamp, &tv, sizeof(tv))) 3634 return -EFAULT; 3635 return 0; 3636 } 3637 #endif 3638 return put_timespec64(&ts, userstamp); 3639 } 3640 EXPORT_SYMBOL(sock_gettstamp); 3641 3642 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag) 3643 { 3644 if (!sock_flag(sk, flag)) { 3645 unsigned long previous_flags = sk->sk_flags; 3646 3647 sock_set_flag(sk, flag); 3648 /* 3649 * we just set one of the two flags which require net 3650 * time stamping, but time stamping might have been on 3651 * already because of the other one 3652 */ 3653 if (sock_needs_netstamp(sk) && 3654 !(previous_flags & SK_FLAGS_TIMESTAMP)) 3655 net_enable_timestamp(); 3656 } 3657 } 3658 3659 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, 3660 int level, int type) 3661 { 3662 struct sock_exterr_skb *serr; 3663 struct sk_buff *skb; 3664 int copied, err; 3665 3666 err = -EAGAIN; 3667 skb = sock_dequeue_err_skb(sk); 3668 if (skb == NULL) 3669 goto out; 3670 3671 copied = skb->len; 3672 if (copied > len) { 3673 msg->msg_flags |= MSG_TRUNC; 3674 copied = len; 3675 } 3676 err = skb_copy_datagram_msg(skb, 0, msg, copied); 3677 if (err) 3678 goto out_free_skb; 3679 3680 sock_recv_timestamp(msg, sk, skb); 3681 3682 serr = SKB_EXT_ERR(skb); 3683 put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee); 3684 3685 msg->msg_flags |= MSG_ERRQUEUE; 3686 err = copied; 3687 3688 out_free_skb: 3689 kfree_skb(skb); 3690 out: 3691 return err; 3692 } 3693 EXPORT_SYMBOL(sock_recv_errqueue); 3694 3695 /* 3696 * Get a socket option on an socket. 3697 * 3698 * FIX: POSIX 1003.1g is very ambiguous here. It states that 3699 * asynchronous errors should be reported by getsockopt. We assume 3700 * this means if you specify SO_ERROR (otherwise what is the point of it). 3701 */ 3702 int sock_common_getsockopt(struct socket *sock, int level, int optname, 3703 char __user *optval, int __user *optlen) 3704 { 3705 struct sock *sk = sock->sk; 3706 3707 /* IPV6_ADDRFORM can change sk->sk_prot under us. */ 3708 return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen); 3709 } 3710 EXPORT_SYMBOL(sock_common_getsockopt); 3711 3712 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, 3713 int flags) 3714 { 3715 struct sock *sk = sock->sk; 3716 int addr_len = 0; 3717 int err; 3718 3719 err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len); 3720 if (err >= 0) 3721 msg->msg_namelen = addr_len; 3722 return err; 3723 } 3724 EXPORT_SYMBOL(sock_common_recvmsg); 3725 3726 /* 3727 * Set socket options on an inet socket. 3728 */ 3729 int sock_common_setsockopt(struct socket *sock, int level, int optname, 3730 sockptr_t optval, unsigned int optlen) 3731 { 3732 struct sock *sk = sock->sk; 3733 3734 /* IPV6_ADDRFORM can change sk->sk_prot under us. */ 3735 return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen); 3736 } 3737 EXPORT_SYMBOL(sock_common_setsockopt); 3738 3739 void sk_common_release(struct sock *sk) 3740 { 3741 if (sk->sk_prot->destroy) 3742 sk->sk_prot->destroy(sk); 3743 3744 /* 3745 * Observation: when sk_common_release is called, processes have 3746 * no access to socket. But net still has. 3747 * Step one, detach it from networking: 3748 * 3749 * A. Remove from hash tables. 3750 */ 3751 3752 sk->sk_prot->unhash(sk); 3753 3754 if (sk->sk_socket) 3755 sk->sk_socket->sk = NULL; 3756 3757 /* 3758 * In this point socket cannot receive new packets, but it is possible 3759 * that some packets are in flight because some CPU runs receiver and 3760 * did hash table lookup before we unhashed socket. They will achieve 3761 * receive queue and will be purged by socket destructor. 3762 * 3763 * Also we still have packets pending on receive queue and probably, 3764 * our own packets waiting in device queues. sock_destroy will drain 3765 * receive queue, but transmitted packets will delay socket destruction 3766 * until the last reference will be released. 3767 */ 3768 3769 sock_orphan(sk); 3770 3771 xfrm_sk_free_policy(sk); 3772 3773 sock_put(sk); 3774 } 3775 EXPORT_SYMBOL(sk_common_release); 3776 3777 void sk_get_meminfo(const struct sock *sk, u32 *mem) 3778 { 3779 memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS); 3780 3781 mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk); 3782 mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf); 3783 mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk); 3784 mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf); 3785 mem[SK_MEMINFO_FWD_ALLOC] = sk_forward_alloc_get(sk); 3786 mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued); 3787 mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); 3788 mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len); 3789 mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops); 3790 } 3791 3792 #ifdef CONFIG_PROC_FS 3793 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR); 3794 3795 int sock_prot_inuse_get(struct net *net, struct proto *prot) 3796 { 3797 int cpu, idx = prot->inuse_idx; 3798 int res = 0; 3799 3800 for_each_possible_cpu(cpu) 3801 res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx]; 3802 3803 return res >= 0 ? res : 0; 3804 } 3805 EXPORT_SYMBOL_GPL(sock_prot_inuse_get); 3806 3807 int sock_inuse_get(struct net *net) 3808 { 3809 int cpu, res = 0; 3810 3811 for_each_possible_cpu(cpu) 3812 res += per_cpu_ptr(net->core.prot_inuse, cpu)->all; 3813 3814 return res; 3815 } 3816 3817 EXPORT_SYMBOL_GPL(sock_inuse_get); 3818 3819 static int __net_init sock_inuse_init_net(struct net *net) 3820 { 3821 net->core.prot_inuse = alloc_percpu(struct prot_inuse); 3822 if (net->core.prot_inuse == NULL) 3823 return -ENOMEM; 3824 return 0; 3825 } 3826 3827 static void __net_exit sock_inuse_exit_net(struct net *net) 3828 { 3829 free_percpu(net->core.prot_inuse); 3830 } 3831 3832 static struct pernet_operations net_inuse_ops = { 3833 .init = sock_inuse_init_net, 3834 .exit = sock_inuse_exit_net, 3835 }; 3836 3837 static __init int net_inuse_init(void) 3838 { 3839 if (register_pernet_subsys(&net_inuse_ops)) 3840 panic("Cannot initialize net inuse counters"); 3841 3842 return 0; 3843 } 3844 3845 core_initcall(net_inuse_init); 3846 3847 static int assign_proto_idx(struct proto *prot) 3848 { 3849 prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR); 3850 3851 if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) { 3852 pr_err("PROTO_INUSE_NR exhausted\n"); 3853 return -ENOSPC; 3854 } 3855 3856 set_bit(prot->inuse_idx, proto_inuse_idx); 3857 return 0; 3858 } 3859 3860 static void release_proto_idx(struct proto *prot) 3861 { 3862 if (prot->inuse_idx != PROTO_INUSE_NR - 1) 3863 clear_bit(prot->inuse_idx, proto_inuse_idx); 3864 } 3865 #else 3866 static inline int assign_proto_idx(struct proto *prot) 3867 { 3868 return 0; 3869 } 3870 3871 static inline void release_proto_idx(struct proto *prot) 3872 { 3873 } 3874 3875 #endif 3876 3877 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot) 3878 { 3879 if (!twsk_prot) 3880 return; 3881 kfree(twsk_prot->twsk_slab_name); 3882 twsk_prot->twsk_slab_name = NULL; 3883 kmem_cache_destroy(twsk_prot->twsk_slab); 3884 twsk_prot->twsk_slab = NULL; 3885 } 3886 3887 static int tw_prot_init(const struct proto *prot) 3888 { 3889 struct timewait_sock_ops *twsk_prot = prot->twsk_prot; 3890 3891 if (!twsk_prot) 3892 return 0; 3893 3894 twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", 3895 prot->name); 3896 if (!twsk_prot->twsk_slab_name) 3897 return -ENOMEM; 3898 3899 twsk_prot->twsk_slab = 3900 kmem_cache_create(twsk_prot->twsk_slab_name, 3901 twsk_prot->twsk_obj_size, 0, 3902 SLAB_ACCOUNT | prot->slab_flags, 3903 NULL); 3904 if (!twsk_prot->twsk_slab) { 3905 pr_crit("%s: Can't create timewait sock SLAB cache!\n", 3906 prot->name); 3907 return -ENOMEM; 3908 } 3909 3910 return 0; 3911 } 3912 3913 static void req_prot_cleanup(struct request_sock_ops *rsk_prot) 3914 { 3915 if (!rsk_prot) 3916 return; 3917 kfree(rsk_prot->slab_name); 3918 rsk_prot->slab_name = NULL; 3919 kmem_cache_destroy(rsk_prot->slab); 3920 rsk_prot->slab = NULL; 3921 } 3922 3923 static int req_prot_init(const struct proto *prot) 3924 { 3925 struct request_sock_ops *rsk_prot = prot->rsk_prot; 3926 3927 if (!rsk_prot) 3928 return 0; 3929 3930 rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", 3931 prot->name); 3932 if (!rsk_prot->slab_name) 3933 return -ENOMEM; 3934 3935 rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name, 3936 rsk_prot->obj_size, 0, 3937 SLAB_ACCOUNT | prot->slab_flags, 3938 NULL); 3939 3940 if (!rsk_prot->slab) { 3941 pr_crit("%s: Can't create request sock SLAB cache!\n", 3942 prot->name); 3943 return -ENOMEM; 3944 } 3945 return 0; 3946 } 3947 3948 int proto_register(struct proto *prot, int alloc_slab) 3949 { 3950 int ret = -ENOBUFS; 3951 3952 if (prot->memory_allocated && !prot->sysctl_mem) { 3953 pr_err("%s: missing sysctl_mem\n", prot->name); 3954 return -EINVAL; 3955 } 3956 if (prot->memory_allocated && !prot->per_cpu_fw_alloc) { 3957 pr_err("%s: missing per_cpu_fw_alloc\n", prot->name); 3958 return -EINVAL; 3959 } 3960 if (alloc_slab) { 3961 prot->slab = kmem_cache_create_usercopy(prot->name, 3962 prot->obj_size, 0, 3963 SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT | 3964 prot->slab_flags, 3965 prot->useroffset, prot->usersize, 3966 NULL); 3967 3968 if (prot->slab == NULL) { 3969 pr_crit("%s: Can't create sock SLAB cache!\n", 3970 prot->name); 3971 goto out; 3972 } 3973 3974 if (req_prot_init(prot)) 3975 goto out_free_request_sock_slab; 3976 3977 if (tw_prot_init(prot)) 3978 goto out_free_timewait_sock_slab; 3979 } 3980 3981 mutex_lock(&proto_list_mutex); 3982 ret = assign_proto_idx(prot); 3983 if (ret) { 3984 mutex_unlock(&proto_list_mutex); 3985 goto out_free_timewait_sock_slab; 3986 } 3987 list_add(&prot->node, &proto_list); 3988 mutex_unlock(&proto_list_mutex); 3989 return ret; 3990 3991 out_free_timewait_sock_slab: 3992 if (alloc_slab) 3993 tw_prot_cleanup(prot->twsk_prot); 3994 out_free_request_sock_slab: 3995 if (alloc_slab) { 3996 req_prot_cleanup(prot->rsk_prot); 3997 3998 kmem_cache_destroy(prot->slab); 3999 prot->slab = NULL; 4000 } 4001 out: 4002 return ret; 4003 } 4004 EXPORT_SYMBOL(proto_register); 4005 4006 void proto_unregister(struct proto *prot) 4007 { 4008 mutex_lock(&proto_list_mutex); 4009 release_proto_idx(prot); 4010 list_del(&prot->node); 4011 mutex_unlock(&proto_list_mutex); 4012 4013 kmem_cache_destroy(prot->slab); 4014 prot->slab = NULL; 4015 4016 req_prot_cleanup(prot->rsk_prot); 4017 tw_prot_cleanup(prot->twsk_prot); 4018 } 4019 EXPORT_SYMBOL(proto_unregister); 4020 4021 int sock_load_diag_module(int family, int protocol) 4022 { 4023 if (!protocol) { 4024 if (!sock_is_registered(family)) 4025 return -ENOENT; 4026 4027 return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, 4028 NETLINK_SOCK_DIAG, family); 4029 } 4030 4031 #ifdef CONFIG_INET 4032 if (family == AF_INET && 4033 protocol != IPPROTO_RAW && 4034 protocol < MAX_INET_PROTOS && 4035 !rcu_access_pointer(inet_protos[protocol])) 4036 return -ENOENT; 4037 #endif 4038 4039 return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK, 4040 NETLINK_SOCK_DIAG, family, protocol); 4041 } 4042 EXPORT_SYMBOL(sock_load_diag_module); 4043 4044 #ifdef CONFIG_PROC_FS 4045 static void *proto_seq_start(struct seq_file *seq, loff_t *pos) 4046 __acquires(proto_list_mutex) 4047 { 4048 mutex_lock(&proto_list_mutex); 4049 return seq_list_start_head(&proto_list, *pos); 4050 } 4051 4052 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos) 4053 { 4054 return seq_list_next(v, &proto_list, pos); 4055 } 4056 4057 static void proto_seq_stop(struct seq_file *seq, void *v) 4058 __releases(proto_list_mutex) 4059 { 4060 mutex_unlock(&proto_list_mutex); 4061 } 4062 4063 static char proto_method_implemented(const void *method) 4064 { 4065 return method == NULL ? 'n' : 'y'; 4066 } 4067 static long sock_prot_memory_allocated(struct proto *proto) 4068 { 4069 return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L; 4070 } 4071 4072 static const char *sock_prot_memory_pressure(struct proto *proto) 4073 { 4074 return proto->memory_pressure != NULL ? 4075 proto_memory_pressure(proto) ? "yes" : "no" : "NI"; 4076 } 4077 4078 static void proto_seq_printf(struct seq_file *seq, struct proto *proto) 4079 { 4080 4081 seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s " 4082 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n", 4083 proto->name, 4084 proto->obj_size, 4085 sock_prot_inuse_get(seq_file_net(seq), proto), 4086 sock_prot_memory_allocated(proto), 4087 sock_prot_memory_pressure(proto), 4088 proto->max_header, 4089 proto->slab == NULL ? "no" : "yes", 4090 module_name(proto->owner), 4091 proto_method_implemented(proto->close), 4092 proto_method_implemented(proto->connect), 4093 proto_method_implemented(proto->disconnect), 4094 proto_method_implemented(proto->accept), 4095 proto_method_implemented(proto->ioctl), 4096 proto_method_implemented(proto->init), 4097 proto_method_implemented(proto->destroy), 4098 proto_method_implemented(proto->shutdown), 4099 proto_method_implemented(proto->setsockopt), 4100 proto_method_implemented(proto->getsockopt), 4101 proto_method_implemented(proto->sendmsg), 4102 proto_method_implemented(proto->recvmsg), 4103 proto_method_implemented(proto->bind), 4104 proto_method_implemented(proto->backlog_rcv), 4105 proto_method_implemented(proto->hash), 4106 proto_method_implemented(proto->unhash), 4107 proto_method_implemented(proto->get_port), 4108 proto_method_implemented(proto->enter_memory_pressure)); 4109 } 4110 4111 static int proto_seq_show(struct seq_file *seq, void *v) 4112 { 4113 if (v == &proto_list) 4114 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s", 4115 "protocol", 4116 "size", 4117 "sockets", 4118 "memory", 4119 "press", 4120 "maxhdr", 4121 "slab", 4122 "module", 4123 "cl co di ac io in de sh ss gs se re bi br ha uh gp em\n"); 4124 else 4125 proto_seq_printf(seq, list_entry(v, struct proto, node)); 4126 return 0; 4127 } 4128 4129 static const struct seq_operations proto_seq_ops = { 4130 .start = proto_seq_start, 4131 .next = proto_seq_next, 4132 .stop = proto_seq_stop, 4133 .show = proto_seq_show, 4134 }; 4135 4136 static __net_init int proto_init_net(struct net *net) 4137 { 4138 if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops, 4139 sizeof(struct seq_net_private))) 4140 return -ENOMEM; 4141 4142 return 0; 4143 } 4144 4145 static __net_exit void proto_exit_net(struct net *net) 4146 { 4147 remove_proc_entry("protocols", net->proc_net); 4148 } 4149 4150 4151 static __net_initdata struct pernet_operations proto_net_ops = { 4152 .init = proto_init_net, 4153 .exit = proto_exit_net, 4154 }; 4155 4156 static int __init proto_init(void) 4157 { 4158 return register_pernet_subsys(&proto_net_ops); 4159 } 4160 4161 subsys_initcall(proto_init); 4162 4163 #endif /* PROC_FS */ 4164 4165 #ifdef CONFIG_NET_RX_BUSY_POLL 4166 bool sk_busy_loop_end(void *p, unsigned long start_time) 4167 { 4168 struct sock *sk = p; 4169 4170 if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) 4171 return true; 4172 4173 if (sk_is_udp(sk) && 4174 !skb_queue_empty_lockless(&udp_sk(sk)->reader_queue)) 4175 return true; 4176 4177 return sk_busy_loop_timeout(sk, start_time); 4178 } 4179 EXPORT_SYMBOL(sk_busy_loop_end); 4180 #endif /* CONFIG_NET_RX_BUSY_POLL */ 4181 4182 int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len) 4183 { 4184 if (!sk->sk_prot->bind_add) 4185 return -EOPNOTSUPP; 4186 return sk->sk_prot->bind_add(sk, addr, addr_len); 4187 } 4188 EXPORT_SYMBOL(sock_bind_add); 4189 4190 /* Copy 'size' bytes from userspace and return `size` back to userspace */ 4191 int sock_ioctl_inout(struct sock *sk, unsigned int cmd, 4192 void __user *arg, void *karg, size_t size) 4193 { 4194 int ret; 4195 4196 if (copy_from_user(karg, arg, size)) 4197 return -EFAULT; 4198 4199 ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg); 4200 if (ret) 4201 return ret; 4202 4203 if (copy_to_user(arg, karg, size)) 4204 return -EFAULT; 4205 4206 return 0; 4207 } 4208 EXPORT_SYMBOL(sock_ioctl_inout); 4209 4210 /* This is the most common ioctl prep function, where the result (4 bytes) is 4211 * copied back to userspace if the ioctl() returns successfully. No input is 4212 * copied from userspace as input argument. 4213 */ 4214 static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg) 4215 { 4216 int ret, karg = 0; 4217 4218 ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &karg); 4219 if (ret) 4220 return ret; 4221 4222 return put_user(karg, (int __user *)arg); 4223 } 4224 4225 /* A wrapper around sock ioctls, which copies the data from userspace 4226 * (depending on the protocol/ioctl), and copies back the result to userspace. 4227 * The main motivation for this function is to pass kernel memory to the 4228 * protocol ioctl callbacks, instead of userspace memory. 4229 */ 4230 int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) 4231 { 4232 int rc = 1; 4233 4234 if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET) 4235 rc = ipmr_sk_ioctl(sk, cmd, arg); 4236 else if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET6) 4237 rc = ip6mr_sk_ioctl(sk, cmd, arg); 4238 else if (sk_is_phonet(sk)) 4239 rc = phonet_sk_ioctl(sk, cmd, arg); 4240 4241 /* If ioctl was processed, returns its value */ 4242 if (rc <= 0) 4243 return rc; 4244 4245 /* Otherwise call the default handler */ 4246 return sock_ioctl_out(sk, cmd, arg); 4247 } 4248 EXPORT_SYMBOL(sk_ioctl); 4249 4250 static int __init sock_struct_check(void) 4251 { 4252 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_drops); 4253 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_peek_off); 4254 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_error_queue); 4255 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_receive_queue); 4256 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_backlog); 4257 4258 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst); 4259 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_ifindex); 4260 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_cookie); 4261 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvbuf); 4262 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_filter); 4263 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_wq); 4264 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_data_ready); 4265 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvtimeo); 4266 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvlowat); 4267 4268 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_err); 4269 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_socket); 4270 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_memcg); 4271 4272 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_lock); 4273 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_reserved_mem); 4274 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_forward_alloc); 4275 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_tsflags); 4276 4277 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc); 4278 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc); 4279 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_sndbuf); 4280 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_queued); 4281 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_alloc); 4282 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tsq_flags); 4283 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_send_head); 4284 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_queue); 4285 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_pending); 4286 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_dst_pending_confirm); 4287 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_status); 4288 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_frag); 4289 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_timer); 4290 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_rate); 4291 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_zckey); 4292 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tskey); 4293 4294 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_max_pacing_rate); 4295 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndtimeo); 4296 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_priority); 4297 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_mark); 4298 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_cache); 4299 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_route_caps); 4300 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_type); 4301 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_size); 4302 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_allocation); 4303 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_txhash); 4304 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_segs); 4305 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_shift); 4306 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_use_task_frag); 4307 return 0; 4308 } 4309 4310 core_initcall(sock_struct_check); 4311