1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * INET An implementation of the TCP/IP protocol suite for the LINUX 4 * operating system. INET is implemented using the BSD Socket 5 * interface as the means of communication with the user level. 6 * 7 * Generic socket support routines. Memory allocators, socket lock/release 8 * handler for protocols to use and generic option handler. 9 * 10 * Authors: Ross Biro 11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 12 * Florian La Roche, <flla@stud.uni-sb.de> 13 * Alan Cox, <A.Cox@swansea.ac.uk> 14 * 15 * Fixes: 16 * Alan Cox : Numerous verify_area() problems 17 * Alan Cox : Connecting on a connecting socket 18 * now returns an error for tcp. 19 * Alan Cox : sock->protocol is set correctly. 20 * and is not sometimes left as 0. 21 * Alan Cox : connect handles icmp errors on a 22 * connect properly. Unfortunately there 23 * is a restart syscall nasty there. I 24 * can't match BSD without hacking the C 25 * library. Ideas urgently sought! 26 * Alan Cox : Disallow bind() to addresses that are 27 * not ours - especially broadcast ones!! 28 * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost) 29 * Alan Cox : sock_wfree/sock_rfree don't destroy sockets, 30 * instead they leave that for the DESTROY timer. 31 * Alan Cox : Clean up error flag in accept 32 * Alan Cox : TCP ack handling is buggy, the DESTROY timer 33 * was buggy. Put a remove_sock() in the handler 34 * for memory when we hit 0. Also altered the timer 35 * code. The ACK stuff can wait and needs major 36 * TCP layer surgery. 37 * Alan Cox : Fixed TCP ack bug, removed remove sock 38 * and fixed timer/inet_bh race. 39 * Alan Cox : Added zapped flag for TCP 40 * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code 41 * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb 42 * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources 43 * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing. 44 * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so... 45 * Rick Sladkey : Relaxed UDP rules for matching packets. 46 * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support 47 * Pauline Middelink : identd support 48 * Alan Cox : Fixed connect() taking signals I think. 49 * Alan Cox : SO_LINGER supported 50 * Alan Cox : Error reporting fixes 51 * Anonymous : inet_create tidied up (sk->reuse setting) 52 * Alan Cox : inet sockets don't set sk->type! 53 * Alan Cox : Split socket option code 54 * Alan Cox : Callbacks 55 * Alan Cox : Nagle flag for Charles & Johannes stuff 56 * Alex : Removed restriction on inet fioctl 57 * Alan Cox : Splitting INET from NET core 58 * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt() 59 * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code 60 * Alan Cox : Split IP from generic code 61 * Alan Cox : New kfree_skbmem() 62 * Alan Cox : Make SO_DEBUG superuser only. 63 * Alan Cox : Allow anyone to clear SO_DEBUG 64 * (compatibility fix) 65 * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput. 66 * Alan Cox : Allocator for a socket is settable. 67 * Alan Cox : SO_ERROR includes soft errors. 68 * Alan Cox : Allow NULL arguments on some SO_ opts 69 * Alan Cox : Generic socket allocation to make hooks 70 * easier (suggested by Craig Metz). 71 * Michael Pall : SO_ERROR returns positive errno again 72 * Steve Whitehouse: Added default destructor to free 73 * protocol private data. 74 * Steve Whitehouse: Added various other default routines 75 * common to several socket families. 76 * Chris Evans : Call suser() check last on F_SETOWN 77 * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER. 78 * Andi Kleen : Add sock_kmalloc()/sock_kfree_s() 79 * Andi Kleen : Fix write_space callback 80 * Chris Evans : Security fixes - signedness again 81 * Arnaldo C. Melo : cleanups, use skb_queue_purge 82 * 83 * To Fix: 84 */ 85 86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 87 88 #include <asm/unaligned.h> 89 #include <linux/capability.h> 90 #include <linux/errno.h> 91 #include <linux/errqueue.h> 92 #include <linux/types.h> 93 #include <linux/socket.h> 94 #include <linux/in.h> 95 #include <linux/kernel.h> 96 #include <linux/module.h> 97 #include <linux/proc_fs.h> 98 #include <linux/seq_file.h> 99 #include <linux/sched.h> 100 #include <linux/sched/mm.h> 101 #include <linux/timer.h> 102 #include <linux/string.h> 103 #include <linux/sockios.h> 104 #include <linux/net.h> 105 #include <linux/mm.h> 106 #include <linux/slab.h> 107 #include <linux/interrupt.h> 108 #include <linux/poll.h> 109 #include <linux/tcp.h> 110 #include <linux/init.h> 111 #include <linux/highmem.h> 112 #include <linux/user_namespace.h> 113 #include <linux/static_key.h> 114 #include <linux/memcontrol.h> 115 #include <linux/prefetch.h> 116 #include <linux/compat.h> 117 #include <linux/mroute.h> 118 #include <linux/mroute6.h> 119 #include <linux/icmpv6.h> 120 121 #include <linux/uaccess.h> 122 123 #include <linux/netdevice.h> 124 #include <net/protocol.h> 125 #include <linux/skbuff.h> 126 #include <net/net_namespace.h> 127 #include <net/request_sock.h> 128 #include <net/sock.h> 129 #include <linux/net_tstamp.h> 130 #include <net/xfrm.h> 131 #include <linux/ipsec.h> 132 #include <net/cls_cgroup.h> 133 #include <net/netprio_cgroup.h> 134 #include <linux/sock_diag.h> 135 136 #include <linux/filter.h> 137 #include <net/sock_reuseport.h> 138 #include <net/bpf_sk_storage.h> 139 140 #include <trace/events/sock.h> 141 142 #include <net/tcp.h> 143 #include <net/busy_poll.h> 144 #include <net/phonet/phonet.h> 145 146 #include <linux/ethtool.h> 147 148 #include "dev.h" 149 150 static DEFINE_MUTEX(proto_list_mutex); 151 static LIST_HEAD(proto_list); 152 153 static void sock_def_write_space_wfree(struct sock *sk); 154 static void sock_def_write_space(struct sock *sk); 155 156 /** 157 * sk_ns_capable - General socket capability test 158 * @sk: Socket to use a capability on or through 159 * @user_ns: The user namespace of the capability to use 160 * @cap: The capability to use 161 * 162 * Test to see if the opener of the socket had when the socket was 163 * created and the current process has the capability @cap in the user 164 * namespace @user_ns. 165 */ 166 bool sk_ns_capable(const struct sock *sk, 167 struct user_namespace *user_ns, int cap) 168 { 169 return file_ns_capable(sk->sk_socket->file, user_ns, cap) && 170 ns_capable(user_ns, cap); 171 } 172 EXPORT_SYMBOL(sk_ns_capable); 173 174 /** 175 * sk_capable - Socket global capability test 176 * @sk: Socket to use a capability on or through 177 * @cap: The global capability to use 178 * 179 * Test to see if the opener of the socket had when the socket was 180 * created and the current process has the capability @cap in all user 181 * namespaces. 182 */ 183 bool sk_capable(const struct sock *sk, int cap) 184 { 185 return sk_ns_capable(sk, &init_user_ns, cap); 186 } 187 EXPORT_SYMBOL(sk_capable); 188 189 /** 190 * sk_net_capable - Network namespace socket capability test 191 * @sk: Socket to use a capability on or through 192 * @cap: The capability to use 193 * 194 * Test to see if the opener of the socket had when the socket was created 195 * and the current process has the capability @cap over the network namespace 196 * the socket is a member of. 197 */ 198 bool sk_net_capable(const struct sock *sk, int cap) 199 { 200 return sk_ns_capable(sk, sock_net(sk)->user_ns, cap); 201 } 202 EXPORT_SYMBOL(sk_net_capable); 203 204 /* 205 * Each address family might have different locking rules, so we have 206 * one slock key per address family and separate keys for internal and 207 * userspace sockets. 208 */ 209 static struct lock_class_key af_family_keys[AF_MAX]; 210 static struct lock_class_key af_family_kern_keys[AF_MAX]; 211 static struct lock_class_key af_family_slock_keys[AF_MAX]; 212 static struct lock_class_key af_family_kern_slock_keys[AF_MAX]; 213 214 /* 215 * Make lock validator output more readable. (we pre-construct these 216 * strings build-time, so that runtime initialization of socket 217 * locks is fast): 218 */ 219 220 #define _sock_locks(x) \ 221 x "AF_UNSPEC", x "AF_UNIX" , x "AF_INET" , \ 222 x "AF_AX25" , x "AF_IPX" , x "AF_APPLETALK", \ 223 x "AF_NETROM", x "AF_BRIDGE" , x "AF_ATMPVC" , \ 224 x "AF_X25" , x "AF_INET6" , x "AF_ROSE" , \ 225 x "AF_DECnet", x "AF_NETBEUI" , x "AF_SECURITY" , \ 226 x "AF_KEY" , x "AF_NETLINK" , x "AF_PACKET" , \ 227 x "AF_ASH" , x "AF_ECONET" , x "AF_ATMSVC" , \ 228 x "AF_RDS" , x "AF_SNA" , x "AF_IRDA" , \ 229 x "AF_PPPOX" , x "AF_WANPIPE" , x "AF_LLC" , \ 230 x "27" , x "28" , x "AF_CAN" , \ 231 x "AF_TIPC" , x "AF_BLUETOOTH", x "IUCV" , \ 232 x "AF_RXRPC" , x "AF_ISDN" , x "AF_PHONET" , \ 233 x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \ 234 x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \ 235 x "AF_QIPCRTR", x "AF_SMC" , x "AF_XDP" , \ 236 x "AF_MCTP" , \ 237 x "AF_MAX" 238 239 static const char *const af_family_key_strings[AF_MAX+1] = { 240 _sock_locks("sk_lock-") 241 }; 242 static const char *const af_family_slock_key_strings[AF_MAX+1] = { 243 _sock_locks("slock-") 244 }; 245 static const char *const af_family_clock_key_strings[AF_MAX+1] = { 246 _sock_locks("clock-") 247 }; 248 249 static const char *const af_family_kern_key_strings[AF_MAX+1] = { 250 _sock_locks("k-sk_lock-") 251 }; 252 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = { 253 _sock_locks("k-slock-") 254 }; 255 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = { 256 _sock_locks("k-clock-") 257 }; 258 static const char *const af_family_rlock_key_strings[AF_MAX+1] = { 259 _sock_locks("rlock-") 260 }; 261 static const char *const af_family_wlock_key_strings[AF_MAX+1] = { 262 _sock_locks("wlock-") 263 }; 264 static const char *const af_family_elock_key_strings[AF_MAX+1] = { 265 _sock_locks("elock-") 266 }; 267 268 /* 269 * sk_callback_lock and sk queues locking rules are per-address-family, 270 * so split the lock classes by using a per-AF key: 271 */ 272 static struct lock_class_key af_callback_keys[AF_MAX]; 273 static struct lock_class_key af_rlock_keys[AF_MAX]; 274 static struct lock_class_key af_wlock_keys[AF_MAX]; 275 static struct lock_class_key af_elock_keys[AF_MAX]; 276 static struct lock_class_key af_kern_callback_keys[AF_MAX]; 277 278 /* Run time adjustable parameters. */ 279 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX; 280 EXPORT_SYMBOL(sysctl_wmem_max); 281 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX; 282 EXPORT_SYMBOL(sysctl_rmem_max); 283 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX; 284 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX; 285 286 /* Maximal space eaten by iovec or ancillary data plus some space */ 287 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512); 288 EXPORT_SYMBOL(sysctl_optmem_max); 289 290 int sysctl_tstamp_allow_data __read_mostly = 1; 291 292 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key); 293 EXPORT_SYMBOL_GPL(memalloc_socks_key); 294 295 /** 296 * sk_set_memalloc - sets %SOCK_MEMALLOC 297 * @sk: socket to set it on 298 * 299 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves. 300 * It's the responsibility of the admin to adjust min_free_kbytes 301 * to meet the requirements 302 */ 303 void sk_set_memalloc(struct sock *sk) 304 { 305 sock_set_flag(sk, SOCK_MEMALLOC); 306 sk->sk_allocation |= __GFP_MEMALLOC; 307 static_branch_inc(&memalloc_socks_key); 308 } 309 EXPORT_SYMBOL_GPL(sk_set_memalloc); 310 311 void sk_clear_memalloc(struct sock *sk) 312 { 313 sock_reset_flag(sk, SOCK_MEMALLOC); 314 sk->sk_allocation &= ~__GFP_MEMALLOC; 315 static_branch_dec(&memalloc_socks_key); 316 317 /* 318 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward 319 * progress of swapping. SOCK_MEMALLOC may be cleared while 320 * it has rmem allocations due to the last swapfile being deactivated 321 * but there is a risk that the socket is unusable due to exceeding 322 * the rmem limits. Reclaim the reserves and obey rmem limits again. 323 */ 324 sk_mem_reclaim(sk); 325 } 326 EXPORT_SYMBOL_GPL(sk_clear_memalloc); 327 328 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) 329 { 330 int ret; 331 unsigned int noreclaim_flag; 332 333 /* these should have been dropped before queueing */ 334 BUG_ON(!sock_flag(sk, SOCK_MEMALLOC)); 335 336 noreclaim_flag = memalloc_noreclaim_save(); 337 ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv, 338 tcp_v6_do_rcv, 339 tcp_v4_do_rcv, 340 sk, skb); 341 memalloc_noreclaim_restore(noreclaim_flag); 342 343 return ret; 344 } 345 EXPORT_SYMBOL(__sk_backlog_rcv); 346 347 void sk_error_report(struct sock *sk) 348 { 349 sk->sk_error_report(sk); 350 351 switch (sk->sk_family) { 352 case AF_INET: 353 fallthrough; 354 case AF_INET6: 355 trace_inet_sk_error_report(sk); 356 break; 357 default: 358 break; 359 } 360 } 361 EXPORT_SYMBOL(sk_error_report); 362 363 int sock_get_timeout(long timeo, void *optval, bool old_timeval) 364 { 365 struct __kernel_sock_timeval tv; 366 367 if (timeo == MAX_SCHEDULE_TIMEOUT) { 368 tv.tv_sec = 0; 369 tv.tv_usec = 0; 370 } else { 371 tv.tv_sec = timeo / HZ; 372 tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ; 373 } 374 375 if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) { 376 struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec }; 377 *(struct old_timeval32 *)optval = tv32; 378 return sizeof(tv32); 379 } 380 381 if (old_timeval) { 382 struct __kernel_old_timeval old_tv; 383 old_tv.tv_sec = tv.tv_sec; 384 old_tv.tv_usec = tv.tv_usec; 385 *(struct __kernel_old_timeval *)optval = old_tv; 386 return sizeof(old_tv); 387 } 388 389 *(struct __kernel_sock_timeval *)optval = tv; 390 return sizeof(tv); 391 } 392 EXPORT_SYMBOL(sock_get_timeout); 393 394 int sock_copy_user_timeval(struct __kernel_sock_timeval *tv, 395 sockptr_t optval, int optlen, bool old_timeval) 396 { 397 if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) { 398 struct old_timeval32 tv32; 399 400 if (optlen < sizeof(tv32)) 401 return -EINVAL; 402 403 if (copy_from_sockptr(&tv32, optval, sizeof(tv32))) 404 return -EFAULT; 405 tv->tv_sec = tv32.tv_sec; 406 tv->tv_usec = tv32.tv_usec; 407 } else if (old_timeval) { 408 struct __kernel_old_timeval old_tv; 409 410 if (optlen < sizeof(old_tv)) 411 return -EINVAL; 412 if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv))) 413 return -EFAULT; 414 tv->tv_sec = old_tv.tv_sec; 415 tv->tv_usec = old_tv.tv_usec; 416 } else { 417 if (optlen < sizeof(*tv)) 418 return -EINVAL; 419 if (copy_from_sockptr(tv, optval, sizeof(*tv))) 420 return -EFAULT; 421 } 422 423 return 0; 424 } 425 EXPORT_SYMBOL(sock_copy_user_timeval); 426 427 static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen, 428 bool old_timeval) 429 { 430 struct __kernel_sock_timeval tv; 431 int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval); 432 long val; 433 434 if (err) 435 return err; 436 437 if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC) 438 return -EDOM; 439 440 if (tv.tv_sec < 0) { 441 static int warned __read_mostly; 442 443 WRITE_ONCE(*timeo_p, 0); 444 if (warned < 10 && net_ratelimit()) { 445 warned++; 446 pr_info("%s: `%s' (pid %d) tries to set negative timeout\n", 447 __func__, current->comm, task_pid_nr(current)); 448 } 449 return 0; 450 } 451 val = MAX_SCHEDULE_TIMEOUT; 452 if ((tv.tv_sec || tv.tv_usec) && 453 (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))) 454 val = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, 455 USEC_PER_SEC / HZ); 456 WRITE_ONCE(*timeo_p, val); 457 return 0; 458 } 459 460 static bool sock_needs_netstamp(const struct sock *sk) 461 { 462 switch (sk->sk_family) { 463 case AF_UNSPEC: 464 case AF_UNIX: 465 return false; 466 default: 467 return true; 468 } 469 } 470 471 static void sock_disable_timestamp(struct sock *sk, unsigned long flags) 472 { 473 if (sk->sk_flags & flags) { 474 sk->sk_flags &= ~flags; 475 if (sock_needs_netstamp(sk) && 476 !(sk->sk_flags & SK_FLAGS_TIMESTAMP)) 477 net_disable_timestamp(); 478 } 479 } 480 481 482 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) 483 { 484 unsigned long flags; 485 struct sk_buff_head *list = &sk->sk_receive_queue; 486 487 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) { 488 atomic_inc(&sk->sk_drops); 489 trace_sock_rcvqueue_full(sk, skb); 490 return -ENOMEM; 491 } 492 493 if (!sk_rmem_schedule(sk, skb, skb->truesize)) { 494 atomic_inc(&sk->sk_drops); 495 return -ENOBUFS; 496 } 497 498 skb->dev = NULL; 499 skb_set_owner_r(skb, sk); 500 501 /* we escape from rcu protected region, make sure we dont leak 502 * a norefcounted dst 503 */ 504 skb_dst_force(skb); 505 506 spin_lock_irqsave(&list->lock, flags); 507 sock_skb_set_dropcount(sk, skb); 508 __skb_queue_tail(list, skb); 509 spin_unlock_irqrestore(&list->lock, flags); 510 511 if (!sock_flag(sk, SOCK_DEAD)) 512 sk->sk_data_ready(sk); 513 return 0; 514 } 515 EXPORT_SYMBOL(__sock_queue_rcv_skb); 516 517 int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb, 518 enum skb_drop_reason *reason) 519 { 520 enum skb_drop_reason drop_reason; 521 int err; 522 523 err = sk_filter(sk, skb); 524 if (err) { 525 drop_reason = SKB_DROP_REASON_SOCKET_FILTER; 526 goto out; 527 } 528 err = __sock_queue_rcv_skb(sk, skb); 529 switch (err) { 530 case -ENOMEM: 531 drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF; 532 break; 533 case -ENOBUFS: 534 drop_reason = SKB_DROP_REASON_PROTO_MEM; 535 break; 536 default: 537 drop_reason = SKB_NOT_DROPPED_YET; 538 break; 539 } 540 out: 541 if (reason) 542 *reason = drop_reason; 543 return err; 544 } 545 EXPORT_SYMBOL(sock_queue_rcv_skb_reason); 546 547 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, 548 const int nested, unsigned int trim_cap, bool refcounted) 549 { 550 int rc = NET_RX_SUCCESS; 551 552 if (sk_filter_trim_cap(sk, skb, trim_cap)) 553 goto discard_and_relse; 554 555 skb->dev = NULL; 556 557 if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) { 558 atomic_inc(&sk->sk_drops); 559 goto discard_and_relse; 560 } 561 if (nested) 562 bh_lock_sock_nested(sk); 563 else 564 bh_lock_sock(sk); 565 if (!sock_owned_by_user(sk)) { 566 /* 567 * trylock + unlock semantics: 568 */ 569 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_); 570 571 rc = sk_backlog_rcv(sk, skb); 572 573 mutex_release(&sk->sk_lock.dep_map, _RET_IP_); 574 } else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) { 575 bh_unlock_sock(sk); 576 atomic_inc(&sk->sk_drops); 577 goto discard_and_relse; 578 } 579 580 bh_unlock_sock(sk); 581 out: 582 if (refcounted) 583 sock_put(sk); 584 return rc; 585 discard_and_relse: 586 kfree_skb(skb); 587 goto out; 588 } 589 EXPORT_SYMBOL(__sk_receive_skb); 590 591 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *, 592 u32)); 593 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, 594 u32)); 595 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie) 596 { 597 struct dst_entry *dst = __sk_dst_get(sk); 598 599 if (dst && dst->obsolete && 600 INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check, 601 dst, cookie) == NULL) { 602 sk_tx_queue_clear(sk); 603 WRITE_ONCE(sk->sk_dst_pending_confirm, 0); 604 RCU_INIT_POINTER(sk->sk_dst_cache, NULL); 605 dst_release(dst); 606 return NULL; 607 } 608 609 return dst; 610 } 611 EXPORT_SYMBOL(__sk_dst_check); 612 613 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie) 614 { 615 struct dst_entry *dst = sk_dst_get(sk); 616 617 if (dst && dst->obsolete && 618 INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check, 619 dst, cookie) == NULL) { 620 sk_dst_reset(sk); 621 dst_release(dst); 622 return NULL; 623 } 624 625 return dst; 626 } 627 EXPORT_SYMBOL(sk_dst_check); 628 629 static int sock_bindtoindex_locked(struct sock *sk, int ifindex) 630 { 631 int ret = -ENOPROTOOPT; 632 #ifdef CONFIG_NETDEVICES 633 struct net *net = sock_net(sk); 634 635 /* Sorry... */ 636 ret = -EPERM; 637 if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW)) 638 goto out; 639 640 ret = -EINVAL; 641 if (ifindex < 0) 642 goto out; 643 644 /* Paired with all READ_ONCE() done locklessly. */ 645 WRITE_ONCE(sk->sk_bound_dev_if, ifindex); 646 647 if (sk->sk_prot->rehash) 648 sk->sk_prot->rehash(sk); 649 sk_dst_reset(sk); 650 651 ret = 0; 652 653 out: 654 #endif 655 656 return ret; 657 } 658 659 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk) 660 { 661 int ret; 662 663 if (lock_sk) 664 lock_sock(sk); 665 ret = sock_bindtoindex_locked(sk, ifindex); 666 if (lock_sk) 667 release_sock(sk); 668 669 return ret; 670 } 671 EXPORT_SYMBOL(sock_bindtoindex); 672 673 static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen) 674 { 675 int ret = -ENOPROTOOPT; 676 #ifdef CONFIG_NETDEVICES 677 struct net *net = sock_net(sk); 678 char devname[IFNAMSIZ]; 679 int index; 680 681 ret = -EINVAL; 682 if (optlen < 0) 683 goto out; 684 685 /* Bind this socket to a particular device like "eth0", 686 * as specified in the passed interface name. If the 687 * name is "" or the option length is zero the socket 688 * is not bound. 689 */ 690 if (optlen > IFNAMSIZ - 1) 691 optlen = IFNAMSIZ - 1; 692 memset(devname, 0, sizeof(devname)); 693 694 ret = -EFAULT; 695 if (copy_from_sockptr(devname, optval, optlen)) 696 goto out; 697 698 index = 0; 699 if (devname[0] != '\0') { 700 struct net_device *dev; 701 702 rcu_read_lock(); 703 dev = dev_get_by_name_rcu(net, devname); 704 if (dev) 705 index = dev->ifindex; 706 rcu_read_unlock(); 707 ret = -ENODEV; 708 if (!dev) 709 goto out; 710 } 711 712 sockopt_lock_sock(sk); 713 ret = sock_bindtoindex_locked(sk, index); 714 sockopt_release_sock(sk); 715 out: 716 #endif 717 718 return ret; 719 } 720 721 static int sock_getbindtodevice(struct sock *sk, sockptr_t optval, 722 sockptr_t optlen, int len) 723 { 724 int ret = -ENOPROTOOPT; 725 #ifdef CONFIG_NETDEVICES 726 int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); 727 struct net *net = sock_net(sk); 728 char devname[IFNAMSIZ]; 729 730 if (bound_dev_if == 0) { 731 len = 0; 732 goto zero; 733 } 734 735 ret = -EINVAL; 736 if (len < IFNAMSIZ) 737 goto out; 738 739 ret = netdev_get_name(net, devname, bound_dev_if); 740 if (ret) 741 goto out; 742 743 len = strlen(devname) + 1; 744 745 ret = -EFAULT; 746 if (copy_to_sockptr(optval, devname, len)) 747 goto out; 748 749 zero: 750 ret = -EFAULT; 751 if (copy_to_sockptr(optlen, &len, sizeof(int))) 752 goto out; 753 754 ret = 0; 755 756 out: 757 #endif 758 759 return ret; 760 } 761 762 bool sk_mc_loop(const struct sock *sk) 763 { 764 if (dev_recursion_level()) 765 return false; 766 if (!sk) 767 return true; 768 /* IPV6_ADDRFORM can change sk->sk_family under us. */ 769 switch (READ_ONCE(sk->sk_family)) { 770 case AF_INET: 771 return inet_test_bit(MC_LOOP, sk); 772 #if IS_ENABLED(CONFIG_IPV6) 773 case AF_INET6: 774 return inet6_test_bit(MC6_LOOP, sk); 775 #endif 776 } 777 WARN_ON_ONCE(1); 778 return true; 779 } 780 EXPORT_SYMBOL(sk_mc_loop); 781 782 void sock_set_reuseaddr(struct sock *sk) 783 { 784 lock_sock(sk); 785 sk->sk_reuse = SK_CAN_REUSE; 786 release_sock(sk); 787 } 788 EXPORT_SYMBOL(sock_set_reuseaddr); 789 790 void sock_set_reuseport(struct sock *sk) 791 { 792 lock_sock(sk); 793 sk->sk_reuseport = true; 794 release_sock(sk); 795 } 796 EXPORT_SYMBOL(sock_set_reuseport); 797 798 void sock_no_linger(struct sock *sk) 799 { 800 lock_sock(sk); 801 WRITE_ONCE(sk->sk_lingertime, 0); 802 sock_set_flag(sk, SOCK_LINGER); 803 release_sock(sk); 804 } 805 EXPORT_SYMBOL(sock_no_linger); 806 807 void sock_set_priority(struct sock *sk, u32 priority) 808 { 809 WRITE_ONCE(sk->sk_priority, priority); 810 } 811 EXPORT_SYMBOL(sock_set_priority); 812 813 void sock_set_sndtimeo(struct sock *sk, s64 secs) 814 { 815 lock_sock(sk); 816 if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1) 817 WRITE_ONCE(sk->sk_sndtimeo, secs * HZ); 818 else 819 WRITE_ONCE(sk->sk_sndtimeo, MAX_SCHEDULE_TIMEOUT); 820 release_sock(sk); 821 } 822 EXPORT_SYMBOL(sock_set_sndtimeo); 823 824 static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns) 825 { 826 if (val) { 827 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new); 828 sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns); 829 sock_set_flag(sk, SOCK_RCVTSTAMP); 830 sock_enable_timestamp(sk, SOCK_TIMESTAMP); 831 } else { 832 sock_reset_flag(sk, SOCK_RCVTSTAMP); 833 sock_reset_flag(sk, SOCK_RCVTSTAMPNS); 834 } 835 } 836 837 void sock_enable_timestamps(struct sock *sk) 838 { 839 lock_sock(sk); 840 __sock_set_timestamps(sk, true, false, true); 841 release_sock(sk); 842 } 843 EXPORT_SYMBOL(sock_enable_timestamps); 844 845 void sock_set_timestamp(struct sock *sk, int optname, bool valbool) 846 { 847 switch (optname) { 848 case SO_TIMESTAMP_OLD: 849 __sock_set_timestamps(sk, valbool, false, false); 850 break; 851 case SO_TIMESTAMP_NEW: 852 __sock_set_timestamps(sk, valbool, true, false); 853 break; 854 case SO_TIMESTAMPNS_OLD: 855 __sock_set_timestamps(sk, valbool, false, true); 856 break; 857 case SO_TIMESTAMPNS_NEW: 858 __sock_set_timestamps(sk, valbool, true, true); 859 break; 860 } 861 } 862 863 static int sock_timestamping_bind_phc(struct sock *sk, int phc_index) 864 { 865 struct net *net = sock_net(sk); 866 struct net_device *dev = NULL; 867 bool match = false; 868 int *vclock_index; 869 int i, num; 870 871 if (sk->sk_bound_dev_if) 872 dev = dev_get_by_index(net, sk->sk_bound_dev_if); 873 874 if (!dev) { 875 pr_err("%s: sock not bind to device\n", __func__); 876 return -EOPNOTSUPP; 877 } 878 879 num = ethtool_get_phc_vclocks(dev, &vclock_index); 880 dev_put(dev); 881 882 for (i = 0; i < num; i++) { 883 if (*(vclock_index + i) == phc_index) { 884 match = true; 885 break; 886 } 887 } 888 889 if (num > 0) 890 kfree(vclock_index); 891 892 if (!match) 893 return -EINVAL; 894 895 WRITE_ONCE(sk->sk_bind_phc, phc_index); 896 897 return 0; 898 } 899 900 int sock_set_timestamping(struct sock *sk, int optname, 901 struct so_timestamping timestamping) 902 { 903 int val = timestamping.flags; 904 int ret; 905 906 if (val & ~SOF_TIMESTAMPING_MASK) 907 return -EINVAL; 908 909 if (val & SOF_TIMESTAMPING_OPT_ID_TCP && 910 !(val & SOF_TIMESTAMPING_OPT_ID)) 911 return -EINVAL; 912 913 if (val & SOF_TIMESTAMPING_OPT_ID && 914 !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) { 915 if (sk_is_tcp(sk)) { 916 if ((1 << sk->sk_state) & 917 (TCPF_CLOSE | TCPF_LISTEN)) 918 return -EINVAL; 919 if (val & SOF_TIMESTAMPING_OPT_ID_TCP) 920 atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq); 921 else 922 atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una); 923 } else { 924 atomic_set(&sk->sk_tskey, 0); 925 } 926 } 927 928 if (val & SOF_TIMESTAMPING_OPT_STATS && 929 !(val & SOF_TIMESTAMPING_OPT_TSONLY)) 930 return -EINVAL; 931 932 if (val & SOF_TIMESTAMPING_BIND_PHC) { 933 ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc); 934 if (ret) 935 return ret; 936 } 937 938 WRITE_ONCE(sk->sk_tsflags, val); 939 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW); 940 941 if (val & SOF_TIMESTAMPING_RX_SOFTWARE) 942 sock_enable_timestamp(sk, 943 SOCK_TIMESTAMPING_RX_SOFTWARE); 944 else 945 sock_disable_timestamp(sk, 946 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)); 947 return 0; 948 } 949 950 void sock_set_keepalive(struct sock *sk) 951 { 952 lock_sock(sk); 953 if (sk->sk_prot->keepalive) 954 sk->sk_prot->keepalive(sk, true); 955 sock_valbool_flag(sk, SOCK_KEEPOPEN, true); 956 release_sock(sk); 957 } 958 EXPORT_SYMBOL(sock_set_keepalive); 959 960 static void __sock_set_rcvbuf(struct sock *sk, int val) 961 { 962 /* Ensure val * 2 fits into an int, to prevent max_t() from treating it 963 * as a negative value. 964 */ 965 val = min_t(int, val, INT_MAX / 2); 966 sk->sk_userlocks |= SOCK_RCVBUF_LOCK; 967 968 /* We double it on the way in to account for "struct sk_buff" etc. 969 * overhead. Applications assume that the SO_RCVBUF setting they make 970 * will allow that much actual data to be received on that socket. 971 * 972 * Applications are unaware that "struct sk_buff" and other overheads 973 * allocate from the receive buffer during socket buffer allocation. 974 * 975 * And after considering the possible alternatives, returning the value 976 * we actually used in getsockopt is the most desirable behavior. 977 */ 978 WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF)); 979 } 980 981 void sock_set_rcvbuf(struct sock *sk, int val) 982 { 983 lock_sock(sk); 984 __sock_set_rcvbuf(sk, val); 985 release_sock(sk); 986 } 987 EXPORT_SYMBOL(sock_set_rcvbuf); 988 989 static void __sock_set_mark(struct sock *sk, u32 val) 990 { 991 if (val != sk->sk_mark) { 992 WRITE_ONCE(sk->sk_mark, val); 993 sk_dst_reset(sk); 994 } 995 } 996 997 void sock_set_mark(struct sock *sk, u32 val) 998 { 999 lock_sock(sk); 1000 __sock_set_mark(sk, val); 1001 release_sock(sk); 1002 } 1003 EXPORT_SYMBOL(sock_set_mark); 1004 1005 static void sock_release_reserved_memory(struct sock *sk, int bytes) 1006 { 1007 /* Round down bytes to multiple of pages */ 1008 bytes = round_down(bytes, PAGE_SIZE); 1009 1010 WARN_ON(bytes > sk->sk_reserved_mem); 1011 WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem - bytes); 1012 sk_mem_reclaim(sk); 1013 } 1014 1015 static int sock_reserve_memory(struct sock *sk, int bytes) 1016 { 1017 long allocated; 1018 bool charged; 1019 int pages; 1020 1021 if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk)) 1022 return -EOPNOTSUPP; 1023 1024 if (!bytes) 1025 return 0; 1026 1027 pages = sk_mem_pages(bytes); 1028 1029 /* pre-charge to memcg */ 1030 charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages, 1031 GFP_KERNEL | __GFP_RETRY_MAYFAIL); 1032 if (!charged) 1033 return -ENOMEM; 1034 1035 /* pre-charge to forward_alloc */ 1036 sk_memory_allocated_add(sk, pages); 1037 allocated = sk_memory_allocated(sk); 1038 /* If the system goes into memory pressure with this 1039 * precharge, give up and return error. 1040 */ 1041 if (allocated > sk_prot_mem_limits(sk, 1)) { 1042 sk_memory_allocated_sub(sk, pages); 1043 mem_cgroup_uncharge_skmem(sk->sk_memcg, pages); 1044 return -ENOMEM; 1045 } 1046 sk_forward_alloc_add(sk, pages << PAGE_SHIFT); 1047 1048 WRITE_ONCE(sk->sk_reserved_mem, 1049 sk->sk_reserved_mem + (pages << PAGE_SHIFT)); 1050 1051 return 0; 1052 } 1053 1054 void sockopt_lock_sock(struct sock *sk) 1055 { 1056 /* When current->bpf_ctx is set, the setsockopt is called from 1057 * a bpf prog. bpf has ensured the sk lock has been 1058 * acquired before calling setsockopt(). 1059 */ 1060 if (has_current_bpf_ctx()) 1061 return; 1062 1063 lock_sock(sk); 1064 } 1065 EXPORT_SYMBOL(sockopt_lock_sock); 1066 1067 void sockopt_release_sock(struct sock *sk) 1068 { 1069 if (has_current_bpf_ctx()) 1070 return; 1071 1072 release_sock(sk); 1073 } 1074 EXPORT_SYMBOL(sockopt_release_sock); 1075 1076 bool sockopt_ns_capable(struct user_namespace *ns, int cap) 1077 { 1078 return has_current_bpf_ctx() || ns_capable(ns, cap); 1079 } 1080 EXPORT_SYMBOL(sockopt_ns_capable); 1081 1082 bool sockopt_capable(int cap) 1083 { 1084 return has_current_bpf_ctx() || capable(cap); 1085 } 1086 EXPORT_SYMBOL(sockopt_capable); 1087 1088 /* 1089 * This is meant for all protocols to use and covers goings on 1090 * at the socket level. Everything here is generic. 1091 */ 1092 1093 int sk_setsockopt(struct sock *sk, int level, int optname, 1094 sockptr_t optval, unsigned int optlen) 1095 { 1096 struct so_timestamping timestamping; 1097 struct socket *sock = sk->sk_socket; 1098 struct sock_txtime sk_txtime; 1099 int val; 1100 int valbool; 1101 struct linger ling; 1102 int ret = 0; 1103 1104 /* 1105 * Options without arguments 1106 */ 1107 1108 if (optname == SO_BINDTODEVICE) 1109 return sock_setbindtodevice(sk, optval, optlen); 1110 1111 if (optlen < sizeof(int)) 1112 return -EINVAL; 1113 1114 if (copy_from_sockptr(&val, optval, sizeof(val))) 1115 return -EFAULT; 1116 1117 valbool = val ? 1 : 0; 1118 1119 /* handle options which do not require locking the socket. */ 1120 switch (optname) { 1121 case SO_PRIORITY: 1122 if ((val >= 0 && val <= 6) || 1123 sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) || 1124 sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { 1125 sock_set_priority(sk, val); 1126 return 0; 1127 } 1128 return -EPERM; 1129 case SO_PASSSEC: 1130 assign_bit(SOCK_PASSSEC, &sock->flags, valbool); 1131 return 0; 1132 case SO_PASSCRED: 1133 assign_bit(SOCK_PASSCRED, &sock->flags, valbool); 1134 return 0; 1135 case SO_PASSPIDFD: 1136 assign_bit(SOCK_PASSPIDFD, &sock->flags, valbool); 1137 return 0; 1138 case SO_TYPE: 1139 case SO_PROTOCOL: 1140 case SO_DOMAIN: 1141 case SO_ERROR: 1142 return -ENOPROTOOPT; 1143 #ifdef CONFIG_NET_RX_BUSY_POLL 1144 case SO_BUSY_POLL: 1145 if (val < 0) 1146 return -EINVAL; 1147 WRITE_ONCE(sk->sk_ll_usec, val); 1148 return 0; 1149 case SO_PREFER_BUSY_POLL: 1150 if (valbool && !sockopt_capable(CAP_NET_ADMIN)) 1151 return -EPERM; 1152 WRITE_ONCE(sk->sk_prefer_busy_poll, valbool); 1153 return 0; 1154 case SO_BUSY_POLL_BUDGET: 1155 if (val > READ_ONCE(sk->sk_busy_poll_budget) && 1156 !sockopt_capable(CAP_NET_ADMIN)) 1157 return -EPERM; 1158 if (val < 0 || val > U16_MAX) 1159 return -EINVAL; 1160 WRITE_ONCE(sk->sk_busy_poll_budget, val); 1161 return 0; 1162 #endif 1163 case SO_MAX_PACING_RATE: 1164 { 1165 unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val; 1166 unsigned long pacing_rate; 1167 1168 if (sizeof(ulval) != sizeof(val) && 1169 optlen >= sizeof(ulval) && 1170 copy_from_sockptr(&ulval, optval, sizeof(ulval))) { 1171 return -EFAULT; 1172 } 1173 if (ulval != ~0UL) 1174 cmpxchg(&sk->sk_pacing_status, 1175 SK_PACING_NONE, 1176 SK_PACING_NEEDED); 1177 /* Pairs with READ_ONCE() from sk_getsockopt() */ 1178 WRITE_ONCE(sk->sk_max_pacing_rate, ulval); 1179 pacing_rate = READ_ONCE(sk->sk_pacing_rate); 1180 if (ulval < pacing_rate) 1181 WRITE_ONCE(sk->sk_pacing_rate, ulval); 1182 return 0; 1183 } 1184 case SO_TXREHASH: 1185 if (val < -1 || val > 1) 1186 return -EINVAL; 1187 if ((u8)val == SOCK_TXREHASH_DEFAULT) 1188 val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash); 1189 /* Paired with READ_ONCE() in tcp_rtx_synack() 1190 * and sk_getsockopt(). 1191 */ 1192 WRITE_ONCE(sk->sk_txrehash, (u8)val); 1193 return 0; 1194 } 1195 1196 sockopt_lock_sock(sk); 1197 1198 switch (optname) { 1199 case SO_DEBUG: 1200 if (val && !sockopt_capable(CAP_NET_ADMIN)) 1201 ret = -EACCES; 1202 else 1203 sock_valbool_flag(sk, SOCK_DBG, valbool); 1204 break; 1205 case SO_REUSEADDR: 1206 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE); 1207 break; 1208 case SO_REUSEPORT: 1209 sk->sk_reuseport = valbool; 1210 break; 1211 case SO_DONTROUTE: 1212 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool); 1213 sk_dst_reset(sk); 1214 break; 1215 case SO_BROADCAST: 1216 sock_valbool_flag(sk, SOCK_BROADCAST, valbool); 1217 break; 1218 case SO_SNDBUF: 1219 /* Don't error on this BSD doesn't and if you think 1220 * about it this is right. Otherwise apps have to 1221 * play 'guess the biggest size' games. RCVBUF/SNDBUF 1222 * are treated in BSD as hints 1223 */ 1224 val = min_t(u32, val, READ_ONCE(sysctl_wmem_max)); 1225 set_sndbuf: 1226 /* Ensure val * 2 fits into an int, to prevent max_t() 1227 * from treating it as a negative value. 1228 */ 1229 val = min_t(int, val, INT_MAX / 2); 1230 sk->sk_userlocks |= SOCK_SNDBUF_LOCK; 1231 WRITE_ONCE(sk->sk_sndbuf, 1232 max_t(int, val * 2, SOCK_MIN_SNDBUF)); 1233 /* Wake up sending tasks if we upped the value. */ 1234 sk->sk_write_space(sk); 1235 break; 1236 1237 case SO_SNDBUFFORCE: 1238 if (!sockopt_capable(CAP_NET_ADMIN)) { 1239 ret = -EPERM; 1240 break; 1241 } 1242 1243 /* No negative values (to prevent underflow, as val will be 1244 * multiplied by 2). 1245 */ 1246 if (val < 0) 1247 val = 0; 1248 goto set_sndbuf; 1249 1250 case SO_RCVBUF: 1251 /* Don't error on this BSD doesn't and if you think 1252 * about it this is right. Otherwise apps have to 1253 * play 'guess the biggest size' games. RCVBUF/SNDBUF 1254 * are treated in BSD as hints 1255 */ 1256 __sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max))); 1257 break; 1258 1259 case SO_RCVBUFFORCE: 1260 if (!sockopt_capable(CAP_NET_ADMIN)) { 1261 ret = -EPERM; 1262 break; 1263 } 1264 1265 /* No negative values (to prevent underflow, as val will be 1266 * multiplied by 2). 1267 */ 1268 __sock_set_rcvbuf(sk, max(val, 0)); 1269 break; 1270 1271 case SO_KEEPALIVE: 1272 if (sk->sk_prot->keepalive) 1273 sk->sk_prot->keepalive(sk, valbool); 1274 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool); 1275 break; 1276 1277 case SO_OOBINLINE: 1278 sock_valbool_flag(sk, SOCK_URGINLINE, valbool); 1279 break; 1280 1281 case SO_NO_CHECK: 1282 sk->sk_no_check_tx = valbool; 1283 break; 1284 1285 case SO_LINGER: 1286 if (optlen < sizeof(ling)) { 1287 ret = -EINVAL; /* 1003.1g */ 1288 break; 1289 } 1290 if (copy_from_sockptr(&ling, optval, sizeof(ling))) { 1291 ret = -EFAULT; 1292 break; 1293 } 1294 if (!ling.l_onoff) { 1295 sock_reset_flag(sk, SOCK_LINGER); 1296 } else { 1297 unsigned long t_sec = ling.l_linger; 1298 1299 if (t_sec >= MAX_SCHEDULE_TIMEOUT / HZ) 1300 WRITE_ONCE(sk->sk_lingertime, MAX_SCHEDULE_TIMEOUT); 1301 else 1302 WRITE_ONCE(sk->sk_lingertime, t_sec * HZ); 1303 sock_set_flag(sk, SOCK_LINGER); 1304 } 1305 break; 1306 1307 case SO_BSDCOMPAT: 1308 break; 1309 1310 case SO_TIMESTAMP_OLD: 1311 case SO_TIMESTAMP_NEW: 1312 case SO_TIMESTAMPNS_OLD: 1313 case SO_TIMESTAMPNS_NEW: 1314 sock_set_timestamp(sk, optname, valbool); 1315 break; 1316 1317 case SO_TIMESTAMPING_NEW: 1318 case SO_TIMESTAMPING_OLD: 1319 if (optlen == sizeof(timestamping)) { 1320 if (copy_from_sockptr(×tamping, optval, 1321 sizeof(timestamping))) { 1322 ret = -EFAULT; 1323 break; 1324 } 1325 } else { 1326 memset(×tamping, 0, sizeof(timestamping)); 1327 timestamping.flags = val; 1328 } 1329 ret = sock_set_timestamping(sk, optname, timestamping); 1330 break; 1331 1332 case SO_RCVLOWAT: 1333 { 1334 int (*set_rcvlowat)(struct sock *sk, int val) = NULL; 1335 1336 if (val < 0) 1337 val = INT_MAX; 1338 if (sock) 1339 set_rcvlowat = READ_ONCE(sock->ops)->set_rcvlowat; 1340 if (set_rcvlowat) 1341 ret = set_rcvlowat(sk, val); 1342 else 1343 WRITE_ONCE(sk->sk_rcvlowat, val ? : 1); 1344 break; 1345 } 1346 case SO_RCVTIMEO_OLD: 1347 case SO_RCVTIMEO_NEW: 1348 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, 1349 optlen, optname == SO_RCVTIMEO_OLD); 1350 break; 1351 1352 case SO_SNDTIMEO_OLD: 1353 case SO_SNDTIMEO_NEW: 1354 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, 1355 optlen, optname == SO_SNDTIMEO_OLD); 1356 break; 1357 1358 case SO_ATTACH_FILTER: { 1359 struct sock_fprog fprog; 1360 1361 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen); 1362 if (!ret) 1363 ret = sk_attach_filter(&fprog, sk); 1364 break; 1365 } 1366 case SO_ATTACH_BPF: 1367 ret = -EINVAL; 1368 if (optlen == sizeof(u32)) { 1369 u32 ufd; 1370 1371 ret = -EFAULT; 1372 if (copy_from_sockptr(&ufd, optval, sizeof(ufd))) 1373 break; 1374 1375 ret = sk_attach_bpf(ufd, sk); 1376 } 1377 break; 1378 1379 case SO_ATTACH_REUSEPORT_CBPF: { 1380 struct sock_fprog fprog; 1381 1382 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen); 1383 if (!ret) 1384 ret = sk_reuseport_attach_filter(&fprog, sk); 1385 break; 1386 } 1387 case SO_ATTACH_REUSEPORT_EBPF: 1388 ret = -EINVAL; 1389 if (optlen == sizeof(u32)) { 1390 u32 ufd; 1391 1392 ret = -EFAULT; 1393 if (copy_from_sockptr(&ufd, optval, sizeof(ufd))) 1394 break; 1395 1396 ret = sk_reuseport_attach_bpf(ufd, sk); 1397 } 1398 break; 1399 1400 case SO_DETACH_REUSEPORT_BPF: 1401 ret = reuseport_detach_prog(sk); 1402 break; 1403 1404 case SO_DETACH_FILTER: 1405 ret = sk_detach_filter(sk); 1406 break; 1407 1408 case SO_LOCK_FILTER: 1409 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool) 1410 ret = -EPERM; 1411 else 1412 sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool); 1413 break; 1414 1415 case SO_MARK: 1416 if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && 1417 !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { 1418 ret = -EPERM; 1419 break; 1420 } 1421 1422 __sock_set_mark(sk, val); 1423 break; 1424 case SO_RCVMARK: 1425 sock_valbool_flag(sk, SOCK_RCVMARK, valbool); 1426 break; 1427 1428 case SO_RXQ_OVFL: 1429 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool); 1430 break; 1431 1432 case SO_WIFI_STATUS: 1433 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool); 1434 break; 1435 1436 case SO_PEEK_OFF: 1437 { 1438 int (*set_peek_off)(struct sock *sk, int val); 1439 1440 set_peek_off = READ_ONCE(sock->ops)->set_peek_off; 1441 if (set_peek_off) 1442 ret = set_peek_off(sk, val); 1443 else 1444 ret = -EOPNOTSUPP; 1445 break; 1446 } 1447 1448 case SO_NOFCS: 1449 sock_valbool_flag(sk, SOCK_NOFCS, valbool); 1450 break; 1451 1452 case SO_SELECT_ERR_QUEUE: 1453 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool); 1454 break; 1455 1456 1457 case SO_INCOMING_CPU: 1458 reuseport_update_incoming_cpu(sk, val); 1459 break; 1460 1461 case SO_CNX_ADVICE: 1462 if (val == 1) 1463 dst_negative_advice(sk); 1464 break; 1465 1466 case SO_ZEROCOPY: 1467 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) { 1468 if (!(sk_is_tcp(sk) || 1469 (sk->sk_type == SOCK_DGRAM && 1470 sk->sk_protocol == IPPROTO_UDP))) 1471 ret = -EOPNOTSUPP; 1472 } else if (sk->sk_family != PF_RDS) { 1473 ret = -EOPNOTSUPP; 1474 } 1475 if (!ret) { 1476 if (val < 0 || val > 1) 1477 ret = -EINVAL; 1478 else 1479 sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool); 1480 } 1481 break; 1482 1483 case SO_TXTIME: 1484 if (optlen != sizeof(struct sock_txtime)) { 1485 ret = -EINVAL; 1486 break; 1487 } else if (copy_from_sockptr(&sk_txtime, optval, 1488 sizeof(struct sock_txtime))) { 1489 ret = -EFAULT; 1490 break; 1491 } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) { 1492 ret = -EINVAL; 1493 break; 1494 } 1495 /* CLOCK_MONOTONIC is only used by sch_fq, and this packet 1496 * scheduler has enough safe guards. 1497 */ 1498 if (sk_txtime.clockid != CLOCK_MONOTONIC && 1499 !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { 1500 ret = -EPERM; 1501 break; 1502 } 1503 sock_valbool_flag(sk, SOCK_TXTIME, true); 1504 sk->sk_clockid = sk_txtime.clockid; 1505 sk->sk_txtime_deadline_mode = 1506 !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE); 1507 sk->sk_txtime_report_errors = 1508 !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS); 1509 break; 1510 1511 case SO_BINDTOIFINDEX: 1512 ret = sock_bindtoindex_locked(sk, val); 1513 break; 1514 1515 case SO_BUF_LOCK: 1516 if (val & ~SOCK_BUF_LOCK_MASK) { 1517 ret = -EINVAL; 1518 break; 1519 } 1520 sk->sk_userlocks = val | (sk->sk_userlocks & 1521 ~SOCK_BUF_LOCK_MASK); 1522 break; 1523 1524 case SO_RESERVE_MEM: 1525 { 1526 int delta; 1527 1528 if (val < 0) { 1529 ret = -EINVAL; 1530 break; 1531 } 1532 1533 delta = val - sk->sk_reserved_mem; 1534 if (delta < 0) 1535 sock_release_reserved_memory(sk, -delta); 1536 else 1537 ret = sock_reserve_memory(sk, delta); 1538 break; 1539 } 1540 1541 default: 1542 ret = -ENOPROTOOPT; 1543 break; 1544 } 1545 sockopt_release_sock(sk); 1546 return ret; 1547 } 1548 1549 int sock_setsockopt(struct socket *sock, int level, int optname, 1550 sockptr_t optval, unsigned int optlen) 1551 { 1552 return sk_setsockopt(sock->sk, level, optname, 1553 optval, optlen); 1554 } 1555 EXPORT_SYMBOL(sock_setsockopt); 1556 1557 static const struct cred *sk_get_peer_cred(struct sock *sk) 1558 { 1559 const struct cred *cred; 1560 1561 spin_lock(&sk->sk_peer_lock); 1562 cred = get_cred(sk->sk_peer_cred); 1563 spin_unlock(&sk->sk_peer_lock); 1564 1565 return cred; 1566 } 1567 1568 static void cred_to_ucred(struct pid *pid, const struct cred *cred, 1569 struct ucred *ucred) 1570 { 1571 ucred->pid = pid_vnr(pid); 1572 ucred->uid = ucred->gid = -1; 1573 if (cred) { 1574 struct user_namespace *current_ns = current_user_ns(); 1575 1576 ucred->uid = from_kuid_munged(current_ns, cred->euid); 1577 ucred->gid = from_kgid_munged(current_ns, cred->egid); 1578 } 1579 } 1580 1581 static int groups_to_user(sockptr_t dst, const struct group_info *src) 1582 { 1583 struct user_namespace *user_ns = current_user_ns(); 1584 int i; 1585 1586 for (i = 0; i < src->ngroups; i++) { 1587 gid_t gid = from_kgid_munged(user_ns, src->gid[i]); 1588 1589 if (copy_to_sockptr_offset(dst, i * sizeof(gid), &gid, sizeof(gid))) 1590 return -EFAULT; 1591 } 1592 1593 return 0; 1594 } 1595 1596 int sk_getsockopt(struct sock *sk, int level, int optname, 1597 sockptr_t optval, sockptr_t optlen) 1598 { 1599 struct socket *sock = sk->sk_socket; 1600 1601 union { 1602 int val; 1603 u64 val64; 1604 unsigned long ulval; 1605 struct linger ling; 1606 struct old_timeval32 tm32; 1607 struct __kernel_old_timeval tm; 1608 struct __kernel_sock_timeval stm; 1609 struct sock_txtime txtime; 1610 struct so_timestamping timestamping; 1611 } v; 1612 1613 int lv = sizeof(int); 1614 int len; 1615 1616 if (copy_from_sockptr(&len, optlen, sizeof(int))) 1617 return -EFAULT; 1618 if (len < 0) 1619 return -EINVAL; 1620 1621 memset(&v, 0, sizeof(v)); 1622 1623 switch (optname) { 1624 case SO_DEBUG: 1625 v.val = sock_flag(sk, SOCK_DBG); 1626 break; 1627 1628 case SO_DONTROUTE: 1629 v.val = sock_flag(sk, SOCK_LOCALROUTE); 1630 break; 1631 1632 case SO_BROADCAST: 1633 v.val = sock_flag(sk, SOCK_BROADCAST); 1634 break; 1635 1636 case SO_SNDBUF: 1637 v.val = READ_ONCE(sk->sk_sndbuf); 1638 break; 1639 1640 case SO_RCVBUF: 1641 v.val = READ_ONCE(sk->sk_rcvbuf); 1642 break; 1643 1644 case SO_REUSEADDR: 1645 v.val = sk->sk_reuse; 1646 break; 1647 1648 case SO_REUSEPORT: 1649 v.val = sk->sk_reuseport; 1650 break; 1651 1652 case SO_KEEPALIVE: 1653 v.val = sock_flag(sk, SOCK_KEEPOPEN); 1654 break; 1655 1656 case SO_TYPE: 1657 v.val = sk->sk_type; 1658 break; 1659 1660 case SO_PROTOCOL: 1661 v.val = sk->sk_protocol; 1662 break; 1663 1664 case SO_DOMAIN: 1665 v.val = sk->sk_family; 1666 break; 1667 1668 case SO_ERROR: 1669 v.val = -sock_error(sk); 1670 if (v.val == 0) 1671 v.val = xchg(&sk->sk_err_soft, 0); 1672 break; 1673 1674 case SO_OOBINLINE: 1675 v.val = sock_flag(sk, SOCK_URGINLINE); 1676 break; 1677 1678 case SO_NO_CHECK: 1679 v.val = sk->sk_no_check_tx; 1680 break; 1681 1682 case SO_PRIORITY: 1683 v.val = READ_ONCE(sk->sk_priority); 1684 break; 1685 1686 case SO_LINGER: 1687 lv = sizeof(v.ling); 1688 v.ling.l_onoff = sock_flag(sk, SOCK_LINGER); 1689 v.ling.l_linger = READ_ONCE(sk->sk_lingertime) / HZ; 1690 break; 1691 1692 case SO_BSDCOMPAT: 1693 break; 1694 1695 case SO_TIMESTAMP_OLD: 1696 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && 1697 !sock_flag(sk, SOCK_TSTAMP_NEW) && 1698 !sock_flag(sk, SOCK_RCVTSTAMPNS); 1699 break; 1700 1701 case SO_TIMESTAMPNS_OLD: 1702 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW); 1703 break; 1704 1705 case SO_TIMESTAMP_NEW: 1706 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW); 1707 break; 1708 1709 case SO_TIMESTAMPNS_NEW: 1710 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW); 1711 break; 1712 1713 case SO_TIMESTAMPING_OLD: 1714 lv = sizeof(v.timestamping); 1715 v.timestamping.flags = READ_ONCE(sk->sk_tsflags); 1716 v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc); 1717 break; 1718 1719 case SO_RCVTIMEO_OLD: 1720 case SO_RCVTIMEO_NEW: 1721 lv = sock_get_timeout(READ_ONCE(sk->sk_rcvtimeo), &v, 1722 SO_RCVTIMEO_OLD == optname); 1723 break; 1724 1725 case SO_SNDTIMEO_OLD: 1726 case SO_SNDTIMEO_NEW: 1727 lv = sock_get_timeout(READ_ONCE(sk->sk_sndtimeo), &v, 1728 SO_SNDTIMEO_OLD == optname); 1729 break; 1730 1731 case SO_RCVLOWAT: 1732 v.val = READ_ONCE(sk->sk_rcvlowat); 1733 break; 1734 1735 case SO_SNDLOWAT: 1736 v.val = 1; 1737 break; 1738 1739 case SO_PASSCRED: 1740 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags); 1741 break; 1742 1743 case SO_PASSPIDFD: 1744 v.val = !!test_bit(SOCK_PASSPIDFD, &sock->flags); 1745 break; 1746 1747 case SO_PEERCRED: 1748 { 1749 struct ucred peercred; 1750 if (len > sizeof(peercred)) 1751 len = sizeof(peercred); 1752 1753 spin_lock(&sk->sk_peer_lock); 1754 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred); 1755 spin_unlock(&sk->sk_peer_lock); 1756 1757 if (copy_to_sockptr(optval, &peercred, len)) 1758 return -EFAULT; 1759 goto lenout; 1760 } 1761 1762 case SO_PEERPIDFD: 1763 { 1764 struct pid *peer_pid; 1765 struct file *pidfd_file = NULL; 1766 int pidfd; 1767 1768 if (len > sizeof(pidfd)) 1769 len = sizeof(pidfd); 1770 1771 spin_lock(&sk->sk_peer_lock); 1772 peer_pid = get_pid(sk->sk_peer_pid); 1773 spin_unlock(&sk->sk_peer_lock); 1774 1775 if (!peer_pid) 1776 return -ENODATA; 1777 1778 pidfd = pidfd_prepare(peer_pid, 0, &pidfd_file); 1779 put_pid(peer_pid); 1780 if (pidfd < 0) 1781 return pidfd; 1782 1783 if (copy_to_sockptr(optval, &pidfd, len) || 1784 copy_to_sockptr(optlen, &len, sizeof(int))) { 1785 put_unused_fd(pidfd); 1786 fput(pidfd_file); 1787 1788 return -EFAULT; 1789 } 1790 1791 fd_install(pidfd, pidfd_file); 1792 return 0; 1793 } 1794 1795 case SO_PEERGROUPS: 1796 { 1797 const struct cred *cred; 1798 int ret, n; 1799 1800 cred = sk_get_peer_cred(sk); 1801 if (!cred) 1802 return -ENODATA; 1803 1804 n = cred->group_info->ngroups; 1805 if (len < n * sizeof(gid_t)) { 1806 len = n * sizeof(gid_t); 1807 put_cred(cred); 1808 return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE; 1809 } 1810 len = n * sizeof(gid_t); 1811 1812 ret = groups_to_user(optval, cred->group_info); 1813 put_cred(cred); 1814 if (ret) 1815 return ret; 1816 goto lenout; 1817 } 1818 1819 case SO_PEERNAME: 1820 { 1821 struct sockaddr_storage address; 1822 1823 lv = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 2); 1824 if (lv < 0) 1825 return -ENOTCONN; 1826 if (lv < len) 1827 return -EINVAL; 1828 if (copy_to_sockptr(optval, &address, len)) 1829 return -EFAULT; 1830 goto lenout; 1831 } 1832 1833 /* Dubious BSD thing... Probably nobody even uses it, but 1834 * the UNIX standard wants it for whatever reason... -DaveM 1835 */ 1836 case SO_ACCEPTCONN: 1837 v.val = sk->sk_state == TCP_LISTEN; 1838 break; 1839 1840 case SO_PASSSEC: 1841 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags); 1842 break; 1843 1844 case SO_PEERSEC: 1845 return security_socket_getpeersec_stream(sock, 1846 optval, optlen, len); 1847 1848 case SO_MARK: 1849 v.val = READ_ONCE(sk->sk_mark); 1850 break; 1851 1852 case SO_RCVMARK: 1853 v.val = sock_flag(sk, SOCK_RCVMARK); 1854 break; 1855 1856 case SO_RXQ_OVFL: 1857 v.val = sock_flag(sk, SOCK_RXQ_OVFL); 1858 break; 1859 1860 case SO_WIFI_STATUS: 1861 v.val = sock_flag(sk, SOCK_WIFI_STATUS); 1862 break; 1863 1864 case SO_PEEK_OFF: 1865 if (!READ_ONCE(sock->ops)->set_peek_off) 1866 return -EOPNOTSUPP; 1867 1868 v.val = READ_ONCE(sk->sk_peek_off); 1869 break; 1870 case SO_NOFCS: 1871 v.val = sock_flag(sk, SOCK_NOFCS); 1872 break; 1873 1874 case SO_BINDTODEVICE: 1875 return sock_getbindtodevice(sk, optval, optlen, len); 1876 1877 case SO_GET_FILTER: 1878 len = sk_get_filter(sk, optval, len); 1879 if (len < 0) 1880 return len; 1881 1882 goto lenout; 1883 1884 case SO_LOCK_FILTER: 1885 v.val = sock_flag(sk, SOCK_FILTER_LOCKED); 1886 break; 1887 1888 case SO_BPF_EXTENSIONS: 1889 v.val = bpf_tell_extensions(); 1890 break; 1891 1892 case SO_SELECT_ERR_QUEUE: 1893 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE); 1894 break; 1895 1896 #ifdef CONFIG_NET_RX_BUSY_POLL 1897 case SO_BUSY_POLL: 1898 v.val = READ_ONCE(sk->sk_ll_usec); 1899 break; 1900 case SO_PREFER_BUSY_POLL: 1901 v.val = READ_ONCE(sk->sk_prefer_busy_poll); 1902 break; 1903 #endif 1904 1905 case SO_MAX_PACING_RATE: 1906 /* The READ_ONCE() pair with the WRITE_ONCE() in sk_setsockopt() */ 1907 if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) { 1908 lv = sizeof(v.ulval); 1909 v.ulval = READ_ONCE(sk->sk_max_pacing_rate); 1910 } else { 1911 /* 32bit version */ 1912 v.val = min_t(unsigned long, ~0U, 1913 READ_ONCE(sk->sk_max_pacing_rate)); 1914 } 1915 break; 1916 1917 case SO_INCOMING_CPU: 1918 v.val = READ_ONCE(sk->sk_incoming_cpu); 1919 break; 1920 1921 case SO_MEMINFO: 1922 { 1923 u32 meminfo[SK_MEMINFO_VARS]; 1924 1925 sk_get_meminfo(sk, meminfo); 1926 1927 len = min_t(unsigned int, len, sizeof(meminfo)); 1928 if (copy_to_sockptr(optval, &meminfo, len)) 1929 return -EFAULT; 1930 1931 goto lenout; 1932 } 1933 1934 #ifdef CONFIG_NET_RX_BUSY_POLL 1935 case SO_INCOMING_NAPI_ID: 1936 v.val = READ_ONCE(sk->sk_napi_id); 1937 1938 /* aggregate non-NAPI IDs down to 0 */ 1939 if (v.val < MIN_NAPI_ID) 1940 v.val = 0; 1941 1942 break; 1943 #endif 1944 1945 case SO_COOKIE: 1946 lv = sizeof(u64); 1947 if (len < lv) 1948 return -EINVAL; 1949 v.val64 = sock_gen_cookie(sk); 1950 break; 1951 1952 case SO_ZEROCOPY: 1953 v.val = sock_flag(sk, SOCK_ZEROCOPY); 1954 break; 1955 1956 case SO_TXTIME: 1957 lv = sizeof(v.txtime); 1958 v.txtime.clockid = sk->sk_clockid; 1959 v.txtime.flags |= sk->sk_txtime_deadline_mode ? 1960 SOF_TXTIME_DEADLINE_MODE : 0; 1961 v.txtime.flags |= sk->sk_txtime_report_errors ? 1962 SOF_TXTIME_REPORT_ERRORS : 0; 1963 break; 1964 1965 case SO_BINDTOIFINDEX: 1966 v.val = READ_ONCE(sk->sk_bound_dev_if); 1967 break; 1968 1969 case SO_NETNS_COOKIE: 1970 lv = sizeof(u64); 1971 if (len != lv) 1972 return -EINVAL; 1973 v.val64 = sock_net(sk)->net_cookie; 1974 break; 1975 1976 case SO_BUF_LOCK: 1977 v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK; 1978 break; 1979 1980 case SO_RESERVE_MEM: 1981 v.val = READ_ONCE(sk->sk_reserved_mem); 1982 break; 1983 1984 case SO_TXREHASH: 1985 /* Paired with WRITE_ONCE() in sk_setsockopt() */ 1986 v.val = READ_ONCE(sk->sk_txrehash); 1987 break; 1988 1989 default: 1990 /* We implement the SO_SNDLOWAT etc to not be settable 1991 * (1003.1g 7). 1992 */ 1993 return -ENOPROTOOPT; 1994 } 1995 1996 if (len > lv) 1997 len = lv; 1998 if (copy_to_sockptr(optval, &v, len)) 1999 return -EFAULT; 2000 lenout: 2001 if (copy_to_sockptr(optlen, &len, sizeof(int))) 2002 return -EFAULT; 2003 return 0; 2004 } 2005 2006 int sock_getsockopt(struct socket *sock, int level, int optname, 2007 char __user *optval, int __user *optlen) 2008 { 2009 return sk_getsockopt(sock->sk, level, optname, 2010 USER_SOCKPTR(optval), 2011 USER_SOCKPTR(optlen)); 2012 } 2013 2014 /* 2015 * Initialize an sk_lock. 2016 * 2017 * (We also register the sk_lock with the lock validator.) 2018 */ 2019 static inline void sock_lock_init(struct sock *sk) 2020 { 2021 if (sk->sk_kern_sock) 2022 sock_lock_init_class_and_name( 2023 sk, 2024 af_family_kern_slock_key_strings[sk->sk_family], 2025 af_family_kern_slock_keys + sk->sk_family, 2026 af_family_kern_key_strings[sk->sk_family], 2027 af_family_kern_keys + sk->sk_family); 2028 else 2029 sock_lock_init_class_and_name( 2030 sk, 2031 af_family_slock_key_strings[sk->sk_family], 2032 af_family_slock_keys + sk->sk_family, 2033 af_family_key_strings[sk->sk_family], 2034 af_family_keys + sk->sk_family); 2035 } 2036 2037 /* 2038 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet, 2039 * even temporarly, because of RCU lookups. sk_node should also be left as is. 2040 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end 2041 */ 2042 static void sock_copy(struct sock *nsk, const struct sock *osk) 2043 { 2044 const struct proto *prot = READ_ONCE(osk->sk_prot); 2045 #ifdef CONFIG_SECURITY_NETWORK 2046 void *sptr = nsk->sk_security; 2047 #endif 2048 2049 /* If we move sk_tx_queue_mapping out of the private section, 2050 * we must check if sk_tx_queue_clear() is called after 2051 * sock_copy() in sk_clone_lock(). 2052 */ 2053 BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) < 2054 offsetof(struct sock, sk_dontcopy_begin) || 2055 offsetof(struct sock, sk_tx_queue_mapping) >= 2056 offsetof(struct sock, sk_dontcopy_end)); 2057 2058 memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin)); 2059 2060 memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end, 2061 prot->obj_size - offsetof(struct sock, sk_dontcopy_end)); 2062 2063 #ifdef CONFIG_SECURITY_NETWORK 2064 nsk->sk_security = sptr; 2065 security_sk_clone(osk, nsk); 2066 #endif 2067 } 2068 2069 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, 2070 int family) 2071 { 2072 struct sock *sk; 2073 struct kmem_cache *slab; 2074 2075 slab = prot->slab; 2076 if (slab != NULL) { 2077 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO); 2078 if (!sk) 2079 return sk; 2080 if (want_init_on_alloc(priority)) 2081 sk_prot_clear_nulls(sk, prot->obj_size); 2082 } else 2083 sk = kmalloc(prot->obj_size, priority); 2084 2085 if (sk != NULL) { 2086 if (security_sk_alloc(sk, family, priority)) 2087 goto out_free; 2088 2089 if (!try_module_get(prot->owner)) 2090 goto out_free_sec; 2091 } 2092 2093 return sk; 2094 2095 out_free_sec: 2096 security_sk_free(sk); 2097 out_free: 2098 if (slab != NULL) 2099 kmem_cache_free(slab, sk); 2100 else 2101 kfree(sk); 2102 return NULL; 2103 } 2104 2105 static void sk_prot_free(struct proto *prot, struct sock *sk) 2106 { 2107 struct kmem_cache *slab; 2108 struct module *owner; 2109 2110 owner = prot->owner; 2111 slab = prot->slab; 2112 2113 cgroup_sk_free(&sk->sk_cgrp_data); 2114 mem_cgroup_sk_free(sk); 2115 security_sk_free(sk); 2116 if (slab != NULL) 2117 kmem_cache_free(slab, sk); 2118 else 2119 kfree(sk); 2120 module_put(owner); 2121 } 2122 2123 /** 2124 * sk_alloc - All socket objects are allocated here 2125 * @net: the applicable net namespace 2126 * @family: protocol family 2127 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) 2128 * @prot: struct proto associated with this new sock instance 2129 * @kern: is this to be a kernel socket? 2130 */ 2131 struct sock *sk_alloc(struct net *net, int family, gfp_t priority, 2132 struct proto *prot, int kern) 2133 { 2134 struct sock *sk; 2135 2136 sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family); 2137 if (sk) { 2138 sk->sk_family = family; 2139 /* 2140 * See comment in struct sock definition to understand 2141 * why we need sk_prot_creator -acme 2142 */ 2143 sk->sk_prot = sk->sk_prot_creator = prot; 2144 sk->sk_kern_sock = kern; 2145 sock_lock_init(sk); 2146 sk->sk_net_refcnt = kern ? 0 : 1; 2147 if (likely(sk->sk_net_refcnt)) { 2148 get_net_track(net, &sk->ns_tracker, priority); 2149 sock_inuse_add(net, 1); 2150 } else { 2151 __netns_tracker_alloc(net, &sk->ns_tracker, 2152 false, priority); 2153 } 2154 2155 sock_net_set(sk, net); 2156 refcount_set(&sk->sk_wmem_alloc, 1); 2157 2158 mem_cgroup_sk_alloc(sk); 2159 cgroup_sk_alloc(&sk->sk_cgrp_data); 2160 sock_update_classid(&sk->sk_cgrp_data); 2161 sock_update_netprioidx(&sk->sk_cgrp_data); 2162 sk_tx_queue_clear(sk); 2163 } 2164 2165 return sk; 2166 } 2167 EXPORT_SYMBOL(sk_alloc); 2168 2169 /* Sockets having SOCK_RCU_FREE will call this function after one RCU 2170 * grace period. This is the case for UDP sockets and TCP listeners. 2171 */ 2172 static void __sk_destruct(struct rcu_head *head) 2173 { 2174 struct sock *sk = container_of(head, struct sock, sk_rcu); 2175 struct sk_filter *filter; 2176 2177 if (sk->sk_destruct) 2178 sk->sk_destruct(sk); 2179 2180 filter = rcu_dereference_check(sk->sk_filter, 2181 refcount_read(&sk->sk_wmem_alloc) == 0); 2182 if (filter) { 2183 sk_filter_uncharge(sk, filter); 2184 RCU_INIT_POINTER(sk->sk_filter, NULL); 2185 } 2186 2187 sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP); 2188 2189 #ifdef CONFIG_BPF_SYSCALL 2190 bpf_sk_storage_free(sk); 2191 #endif 2192 2193 if (atomic_read(&sk->sk_omem_alloc)) 2194 pr_debug("%s: optmem leakage (%d bytes) detected\n", 2195 __func__, atomic_read(&sk->sk_omem_alloc)); 2196 2197 if (sk->sk_frag.page) { 2198 put_page(sk->sk_frag.page); 2199 sk->sk_frag.page = NULL; 2200 } 2201 2202 /* We do not need to acquire sk->sk_peer_lock, we are the last user. */ 2203 put_cred(sk->sk_peer_cred); 2204 put_pid(sk->sk_peer_pid); 2205 2206 if (likely(sk->sk_net_refcnt)) 2207 put_net_track(sock_net(sk), &sk->ns_tracker); 2208 else 2209 __netns_tracker_free(sock_net(sk), &sk->ns_tracker, false); 2210 2211 sk_prot_free(sk->sk_prot_creator, sk); 2212 } 2213 2214 void sk_destruct(struct sock *sk) 2215 { 2216 bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE); 2217 2218 if (rcu_access_pointer(sk->sk_reuseport_cb)) { 2219 reuseport_detach_sock(sk); 2220 use_call_rcu = true; 2221 } 2222 2223 if (use_call_rcu) 2224 call_rcu(&sk->sk_rcu, __sk_destruct); 2225 else 2226 __sk_destruct(&sk->sk_rcu); 2227 } 2228 2229 static void __sk_free(struct sock *sk) 2230 { 2231 if (likely(sk->sk_net_refcnt)) 2232 sock_inuse_add(sock_net(sk), -1); 2233 2234 if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk))) 2235 sock_diag_broadcast_destroy(sk); 2236 else 2237 sk_destruct(sk); 2238 } 2239 2240 void sk_free(struct sock *sk) 2241 { 2242 /* 2243 * We subtract one from sk_wmem_alloc and can know if 2244 * some packets are still in some tx queue. 2245 * If not null, sock_wfree() will call __sk_free(sk) later 2246 */ 2247 if (refcount_dec_and_test(&sk->sk_wmem_alloc)) 2248 __sk_free(sk); 2249 } 2250 EXPORT_SYMBOL(sk_free); 2251 2252 static void sk_init_common(struct sock *sk) 2253 { 2254 skb_queue_head_init(&sk->sk_receive_queue); 2255 skb_queue_head_init(&sk->sk_write_queue); 2256 skb_queue_head_init(&sk->sk_error_queue); 2257 2258 rwlock_init(&sk->sk_callback_lock); 2259 lockdep_set_class_and_name(&sk->sk_receive_queue.lock, 2260 af_rlock_keys + sk->sk_family, 2261 af_family_rlock_key_strings[sk->sk_family]); 2262 lockdep_set_class_and_name(&sk->sk_write_queue.lock, 2263 af_wlock_keys + sk->sk_family, 2264 af_family_wlock_key_strings[sk->sk_family]); 2265 lockdep_set_class_and_name(&sk->sk_error_queue.lock, 2266 af_elock_keys + sk->sk_family, 2267 af_family_elock_key_strings[sk->sk_family]); 2268 lockdep_set_class_and_name(&sk->sk_callback_lock, 2269 af_callback_keys + sk->sk_family, 2270 af_family_clock_key_strings[sk->sk_family]); 2271 } 2272 2273 /** 2274 * sk_clone_lock - clone a socket, and lock its clone 2275 * @sk: the socket to clone 2276 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) 2277 * 2278 * Caller must unlock socket even in error path (bh_unlock_sock(newsk)) 2279 */ 2280 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) 2281 { 2282 struct proto *prot = READ_ONCE(sk->sk_prot); 2283 struct sk_filter *filter; 2284 bool is_charged = true; 2285 struct sock *newsk; 2286 2287 newsk = sk_prot_alloc(prot, priority, sk->sk_family); 2288 if (!newsk) 2289 goto out; 2290 2291 sock_copy(newsk, sk); 2292 2293 newsk->sk_prot_creator = prot; 2294 2295 /* SANITY */ 2296 if (likely(newsk->sk_net_refcnt)) { 2297 get_net_track(sock_net(newsk), &newsk->ns_tracker, priority); 2298 sock_inuse_add(sock_net(newsk), 1); 2299 } else { 2300 /* Kernel sockets are not elevating the struct net refcount. 2301 * Instead, use a tracker to more easily detect if a layer 2302 * is not properly dismantling its kernel sockets at netns 2303 * destroy time. 2304 */ 2305 __netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker, 2306 false, priority); 2307 } 2308 sk_node_init(&newsk->sk_node); 2309 sock_lock_init(newsk); 2310 bh_lock_sock(newsk); 2311 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL; 2312 newsk->sk_backlog.len = 0; 2313 2314 atomic_set(&newsk->sk_rmem_alloc, 0); 2315 2316 /* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */ 2317 refcount_set(&newsk->sk_wmem_alloc, 1); 2318 2319 atomic_set(&newsk->sk_omem_alloc, 0); 2320 sk_init_common(newsk); 2321 2322 newsk->sk_dst_cache = NULL; 2323 newsk->sk_dst_pending_confirm = 0; 2324 newsk->sk_wmem_queued = 0; 2325 newsk->sk_forward_alloc = 0; 2326 newsk->sk_reserved_mem = 0; 2327 atomic_set(&newsk->sk_drops, 0); 2328 newsk->sk_send_head = NULL; 2329 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; 2330 atomic_set(&newsk->sk_zckey, 0); 2331 2332 sock_reset_flag(newsk, SOCK_DONE); 2333 2334 /* sk->sk_memcg will be populated at accept() time */ 2335 newsk->sk_memcg = NULL; 2336 2337 cgroup_sk_clone(&newsk->sk_cgrp_data); 2338 2339 rcu_read_lock(); 2340 filter = rcu_dereference(sk->sk_filter); 2341 if (filter != NULL) 2342 /* though it's an empty new sock, the charging may fail 2343 * if sysctl_optmem_max was changed between creation of 2344 * original socket and cloning 2345 */ 2346 is_charged = sk_filter_charge(newsk, filter); 2347 RCU_INIT_POINTER(newsk->sk_filter, filter); 2348 rcu_read_unlock(); 2349 2350 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) { 2351 /* We need to make sure that we don't uncharge the new 2352 * socket if we couldn't charge it in the first place 2353 * as otherwise we uncharge the parent's filter. 2354 */ 2355 if (!is_charged) 2356 RCU_INIT_POINTER(newsk->sk_filter, NULL); 2357 sk_free_unlock_clone(newsk); 2358 newsk = NULL; 2359 goto out; 2360 } 2361 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL); 2362 2363 if (bpf_sk_storage_clone(sk, newsk)) { 2364 sk_free_unlock_clone(newsk); 2365 newsk = NULL; 2366 goto out; 2367 } 2368 2369 /* Clear sk_user_data if parent had the pointer tagged 2370 * as not suitable for copying when cloning. 2371 */ 2372 if (sk_user_data_is_nocopy(newsk)) 2373 newsk->sk_user_data = NULL; 2374 2375 newsk->sk_err = 0; 2376 newsk->sk_err_soft = 0; 2377 newsk->sk_priority = 0; 2378 newsk->sk_incoming_cpu = raw_smp_processor_id(); 2379 2380 /* Before updating sk_refcnt, we must commit prior changes to memory 2381 * (Documentation/RCU/rculist_nulls.rst for details) 2382 */ 2383 smp_wmb(); 2384 refcount_set(&newsk->sk_refcnt, 2); 2385 2386 sk_set_socket(newsk, NULL); 2387 sk_tx_queue_clear(newsk); 2388 RCU_INIT_POINTER(newsk->sk_wq, NULL); 2389 2390 if (newsk->sk_prot->sockets_allocated) 2391 sk_sockets_allocated_inc(newsk); 2392 2393 if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP) 2394 net_enable_timestamp(); 2395 out: 2396 return newsk; 2397 } 2398 EXPORT_SYMBOL_GPL(sk_clone_lock); 2399 2400 void sk_free_unlock_clone(struct sock *sk) 2401 { 2402 /* It is still raw copy of parent, so invalidate 2403 * destructor and make plain sk_free() */ 2404 sk->sk_destruct = NULL; 2405 bh_unlock_sock(sk); 2406 sk_free(sk); 2407 } 2408 EXPORT_SYMBOL_GPL(sk_free_unlock_clone); 2409 2410 static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst) 2411 { 2412 bool is_ipv6 = false; 2413 u32 max_size; 2414 2415 #if IS_ENABLED(CONFIG_IPV6) 2416 is_ipv6 = (sk->sk_family == AF_INET6 && 2417 !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)); 2418 #endif 2419 /* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */ 2420 max_size = is_ipv6 ? READ_ONCE(dst->dev->gso_max_size) : 2421 READ_ONCE(dst->dev->gso_ipv4_max_size); 2422 if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk)) 2423 max_size = GSO_LEGACY_MAX_SIZE; 2424 2425 return max_size - (MAX_TCP_HEADER + 1); 2426 } 2427 2428 void sk_setup_caps(struct sock *sk, struct dst_entry *dst) 2429 { 2430 u32 max_segs = 1; 2431 2432 sk->sk_route_caps = dst->dev->features; 2433 if (sk_is_tcp(sk)) 2434 sk->sk_route_caps |= NETIF_F_GSO; 2435 if (sk->sk_route_caps & NETIF_F_GSO) 2436 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE; 2437 if (unlikely(sk->sk_gso_disabled)) 2438 sk->sk_route_caps &= ~NETIF_F_GSO_MASK; 2439 if (sk_can_gso(sk)) { 2440 if (dst->header_len && !xfrm_dst_offload_ok(dst)) { 2441 sk->sk_route_caps &= ~NETIF_F_GSO_MASK; 2442 } else { 2443 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; 2444 sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dst); 2445 /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */ 2446 max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1); 2447 } 2448 } 2449 sk->sk_gso_max_segs = max_segs; 2450 sk_dst_set(sk, dst); 2451 } 2452 EXPORT_SYMBOL_GPL(sk_setup_caps); 2453 2454 /* 2455 * Simple resource managers for sockets. 2456 */ 2457 2458 2459 /* 2460 * Write buffer destructor automatically called from kfree_skb. 2461 */ 2462 void sock_wfree(struct sk_buff *skb) 2463 { 2464 struct sock *sk = skb->sk; 2465 unsigned int len = skb->truesize; 2466 bool free; 2467 2468 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) { 2469 if (sock_flag(sk, SOCK_RCU_FREE) && 2470 sk->sk_write_space == sock_def_write_space) { 2471 rcu_read_lock(); 2472 free = refcount_sub_and_test(len, &sk->sk_wmem_alloc); 2473 sock_def_write_space_wfree(sk); 2474 rcu_read_unlock(); 2475 if (unlikely(free)) 2476 __sk_free(sk); 2477 return; 2478 } 2479 2480 /* 2481 * Keep a reference on sk_wmem_alloc, this will be released 2482 * after sk_write_space() call 2483 */ 2484 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc)); 2485 sk->sk_write_space(sk); 2486 len = 1; 2487 } 2488 /* 2489 * if sk_wmem_alloc reaches 0, we must finish what sk_free() 2490 * could not do because of in-flight packets 2491 */ 2492 if (refcount_sub_and_test(len, &sk->sk_wmem_alloc)) 2493 __sk_free(sk); 2494 } 2495 EXPORT_SYMBOL(sock_wfree); 2496 2497 /* This variant of sock_wfree() is used by TCP, 2498 * since it sets SOCK_USE_WRITE_QUEUE. 2499 */ 2500 void __sock_wfree(struct sk_buff *skb) 2501 { 2502 struct sock *sk = skb->sk; 2503 2504 if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc)) 2505 __sk_free(sk); 2506 } 2507 2508 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) 2509 { 2510 skb_orphan(skb); 2511 skb->sk = sk; 2512 #ifdef CONFIG_INET 2513 if (unlikely(!sk_fullsock(sk))) { 2514 skb->destructor = sock_edemux; 2515 sock_hold(sk); 2516 return; 2517 } 2518 #endif 2519 skb->destructor = sock_wfree; 2520 skb_set_hash_from_sk(skb, sk); 2521 /* 2522 * We used to take a refcount on sk, but following operation 2523 * is enough to guarantee sk_free() wont free this sock until 2524 * all in-flight packets are completed 2525 */ 2526 refcount_add(skb->truesize, &sk->sk_wmem_alloc); 2527 } 2528 EXPORT_SYMBOL(skb_set_owner_w); 2529 2530 static bool can_skb_orphan_partial(const struct sk_buff *skb) 2531 { 2532 #ifdef CONFIG_TLS_DEVICE 2533 /* Drivers depend on in-order delivery for crypto offload, 2534 * partial orphan breaks out-of-order-OK logic. 2535 */ 2536 if (skb->decrypted) 2537 return false; 2538 #endif 2539 return (skb->destructor == sock_wfree || 2540 (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree)); 2541 } 2542 2543 /* This helper is used by netem, as it can hold packets in its 2544 * delay queue. We want to allow the owner socket to send more 2545 * packets, as if they were already TX completed by a typical driver. 2546 * But we also want to keep skb->sk set because some packet schedulers 2547 * rely on it (sch_fq for example). 2548 */ 2549 void skb_orphan_partial(struct sk_buff *skb) 2550 { 2551 if (skb_is_tcp_pure_ack(skb)) 2552 return; 2553 2554 if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk)) 2555 return; 2556 2557 skb_orphan(skb); 2558 } 2559 EXPORT_SYMBOL(skb_orphan_partial); 2560 2561 /* 2562 * Read buffer destructor automatically called from kfree_skb. 2563 */ 2564 void sock_rfree(struct sk_buff *skb) 2565 { 2566 struct sock *sk = skb->sk; 2567 unsigned int len = skb->truesize; 2568 2569 atomic_sub(len, &sk->sk_rmem_alloc); 2570 sk_mem_uncharge(sk, len); 2571 } 2572 EXPORT_SYMBOL(sock_rfree); 2573 2574 /* 2575 * Buffer destructor for skbs that are not used directly in read or write 2576 * path, e.g. for error handler skbs. Automatically called from kfree_skb. 2577 */ 2578 void sock_efree(struct sk_buff *skb) 2579 { 2580 sock_put(skb->sk); 2581 } 2582 EXPORT_SYMBOL(sock_efree); 2583 2584 /* Buffer destructor for prefetch/receive path where reference count may 2585 * not be held, e.g. for listen sockets. 2586 */ 2587 #ifdef CONFIG_INET 2588 void sock_pfree(struct sk_buff *skb) 2589 { 2590 if (sk_is_refcounted(skb->sk)) 2591 sock_gen_put(skb->sk); 2592 } 2593 EXPORT_SYMBOL(sock_pfree); 2594 #endif /* CONFIG_INET */ 2595 2596 kuid_t sock_i_uid(struct sock *sk) 2597 { 2598 kuid_t uid; 2599 2600 read_lock_bh(&sk->sk_callback_lock); 2601 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID; 2602 read_unlock_bh(&sk->sk_callback_lock); 2603 return uid; 2604 } 2605 EXPORT_SYMBOL(sock_i_uid); 2606 2607 unsigned long __sock_i_ino(struct sock *sk) 2608 { 2609 unsigned long ino; 2610 2611 read_lock(&sk->sk_callback_lock); 2612 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0; 2613 read_unlock(&sk->sk_callback_lock); 2614 return ino; 2615 } 2616 EXPORT_SYMBOL(__sock_i_ino); 2617 2618 unsigned long sock_i_ino(struct sock *sk) 2619 { 2620 unsigned long ino; 2621 2622 local_bh_disable(); 2623 ino = __sock_i_ino(sk); 2624 local_bh_enable(); 2625 return ino; 2626 } 2627 EXPORT_SYMBOL(sock_i_ino); 2628 2629 /* 2630 * Allocate a skb from the socket's send buffer. 2631 */ 2632 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, 2633 gfp_t priority) 2634 { 2635 if (force || 2636 refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) { 2637 struct sk_buff *skb = alloc_skb(size, priority); 2638 2639 if (skb) { 2640 skb_set_owner_w(skb, sk); 2641 return skb; 2642 } 2643 } 2644 return NULL; 2645 } 2646 EXPORT_SYMBOL(sock_wmalloc); 2647 2648 static void sock_ofree(struct sk_buff *skb) 2649 { 2650 struct sock *sk = skb->sk; 2651 2652 atomic_sub(skb->truesize, &sk->sk_omem_alloc); 2653 } 2654 2655 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size, 2656 gfp_t priority) 2657 { 2658 struct sk_buff *skb; 2659 2660 /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */ 2661 if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) > 2662 READ_ONCE(sysctl_optmem_max)) 2663 return NULL; 2664 2665 skb = alloc_skb(size, priority); 2666 if (!skb) 2667 return NULL; 2668 2669 atomic_add(skb->truesize, &sk->sk_omem_alloc); 2670 skb->sk = sk; 2671 skb->destructor = sock_ofree; 2672 return skb; 2673 } 2674 2675 /* 2676 * Allocate a memory block from the socket's option memory buffer. 2677 */ 2678 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority) 2679 { 2680 int optmem_max = READ_ONCE(sysctl_optmem_max); 2681 2682 if ((unsigned int)size <= optmem_max && 2683 atomic_read(&sk->sk_omem_alloc) + size < optmem_max) { 2684 void *mem; 2685 /* First do the add, to avoid the race if kmalloc 2686 * might sleep. 2687 */ 2688 atomic_add(size, &sk->sk_omem_alloc); 2689 mem = kmalloc(size, priority); 2690 if (mem) 2691 return mem; 2692 atomic_sub(size, &sk->sk_omem_alloc); 2693 } 2694 return NULL; 2695 } 2696 EXPORT_SYMBOL(sock_kmalloc); 2697 2698 /* Free an option memory block. Note, we actually want the inline 2699 * here as this allows gcc to detect the nullify and fold away the 2700 * condition entirely. 2701 */ 2702 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size, 2703 const bool nullify) 2704 { 2705 if (WARN_ON_ONCE(!mem)) 2706 return; 2707 if (nullify) 2708 kfree_sensitive(mem); 2709 else 2710 kfree(mem); 2711 atomic_sub(size, &sk->sk_omem_alloc); 2712 } 2713 2714 void sock_kfree_s(struct sock *sk, void *mem, int size) 2715 { 2716 __sock_kfree_s(sk, mem, size, false); 2717 } 2718 EXPORT_SYMBOL(sock_kfree_s); 2719 2720 void sock_kzfree_s(struct sock *sk, void *mem, int size) 2721 { 2722 __sock_kfree_s(sk, mem, size, true); 2723 } 2724 EXPORT_SYMBOL(sock_kzfree_s); 2725 2726 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock. 2727 I think, these locks should be removed for datagram sockets. 2728 */ 2729 static long sock_wait_for_wmem(struct sock *sk, long timeo) 2730 { 2731 DEFINE_WAIT(wait); 2732 2733 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); 2734 for (;;) { 2735 if (!timeo) 2736 break; 2737 if (signal_pending(current)) 2738 break; 2739 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 2740 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); 2741 if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) 2742 break; 2743 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) 2744 break; 2745 if (READ_ONCE(sk->sk_err)) 2746 break; 2747 timeo = schedule_timeout(timeo); 2748 } 2749 finish_wait(sk_sleep(sk), &wait); 2750 return timeo; 2751 } 2752 2753 2754 /* 2755 * Generic send/receive buffer handlers 2756 */ 2757 2758 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, 2759 unsigned long data_len, int noblock, 2760 int *errcode, int max_page_order) 2761 { 2762 struct sk_buff *skb; 2763 long timeo; 2764 int err; 2765 2766 timeo = sock_sndtimeo(sk, noblock); 2767 for (;;) { 2768 err = sock_error(sk); 2769 if (err != 0) 2770 goto failure; 2771 2772 err = -EPIPE; 2773 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) 2774 goto failure; 2775 2776 if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf)) 2777 break; 2778 2779 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); 2780 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 2781 err = -EAGAIN; 2782 if (!timeo) 2783 goto failure; 2784 if (signal_pending(current)) 2785 goto interrupted; 2786 timeo = sock_wait_for_wmem(sk, timeo); 2787 } 2788 skb = alloc_skb_with_frags(header_len, data_len, max_page_order, 2789 errcode, sk->sk_allocation); 2790 if (skb) 2791 skb_set_owner_w(skb, sk); 2792 return skb; 2793 2794 interrupted: 2795 err = sock_intr_errno(timeo); 2796 failure: 2797 *errcode = err; 2798 return NULL; 2799 } 2800 EXPORT_SYMBOL(sock_alloc_send_pskb); 2801 2802 int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg, 2803 struct sockcm_cookie *sockc) 2804 { 2805 u32 tsflags; 2806 2807 switch (cmsg->cmsg_type) { 2808 case SO_MARK: 2809 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && 2810 !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 2811 return -EPERM; 2812 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) 2813 return -EINVAL; 2814 sockc->mark = *(u32 *)CMSG_DATA(cmsg); 2815 break; 2816 case SO_TIMESTAMPING_OLD: 2817 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) 2818 return -EINVAL; 2819 2820 tsflags = *(u32 *)CMSG_DATA(cmsg); 2821 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK) 2822 return -EINVAL; 2823 2824 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK; 2825 sockc->tsflags |= tsflags; 2826 break; 2827 case SCM_TXTIME: 2828 if (!sock_flag(sk, SOCK_TXTIME)) 2829 return -EINVAL; 2830 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64))) 2831 return -EINVAL; 2832 sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg)); 2833 break; 2834 /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */ 2835 case SCM_RIGHTS: 2836 case SCM_CREDENTIALS: 2837 break; 2838 default: 2839 return -EINVAL; 2840 } 2841 return 0; 2842 } 2843 EXPORT_SYMBOL(__sock_cmsg_send); 2844 2845 int sock_cmsg_send(struct sock *sk, struct msghdr *msg, 2846 struct sockcm_cookie *sockc) 2847 { 2848 struct cmsghdr *cmsg; 2849 int ret; 2850 2851 for_each_cmsghdr(cmsg, msg) { 2852 if (!CMSG_OK(msg, cmsg)) 2853 return -EINVAL; 2854 if (cmsg->cmsg_level != SOL_SOCKET) 2855 continue; 2856 ret = __sock_cmsg_send(sk, cmsg, sockc); 2857 if (ret) 2858 return ret; 2859 } 2860 return 0; 2861 } 2862 EXPORT_SYMBOL(sock_cmsg_send); 2863 2864 static void sk_enter_memory_pressure(struct sock *sk) 2865 { 2866 if (!sk->sk_prot->enter_memory_pressure) 2867 return; 2868 2869 sk->sk_prot->enter_memory_pressure(sk); 2870 } 2871 2872 static void sk_leave_memory_pressure(struct sock *sk) 2873 { 2874 if (sk->sk_prot->leave_memory_pressure) { 2875 INDIRECT_CALL_INET_1(sk->sk_prot->leave_memory_pressure, 2876 tcp_leave_memory_pressure, sk); 2877 } else { 2878 unsigned long *memory_pressure = sk->sk_prot->memory_pressure; 2879 2880 if (memory_pressure && READ_ONCE(*memory_pressure)) 2881 WRITE_ONCE(*memory_pressure, 0); 2882 } 2883 } 2884 2885 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key); 2886 2887 /** 2888 * skb_page_frag_refill - check that a page_frag contains enough room 2889 * @sz: minimum size of the fragment we want to get 2890 * @pfrag: pointer to page_frag 2891 * @gfp: priority for memory allocation 2892 * 2893 * Note: While this allocator tries to use high order pages, there is 2894 * no guarantee that allocations succeed. Therefore, @sz MUST be 2895 * less or equal than PAGE_SIZE. 2896 */ 2897 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp) 2898 { 2899 if (pfrag->page) { 2900 if (page_ref_count(pfrag->page) == 1) { 2901 pfrag->offset = 0; 2902 return true; 2903 } 2904 if (pfrag->offset + sz <= pfrag->size) 2905 return true; 2906 put_page(pfrag->page); 2907 } 2908 2909 pfrag->offset = 0; 2910 if (SKB_FRAG_PAGE_ORDER && 2911 !static_branch_unlikely(&net_high_order_alloc_disable_key)) { 2912 /* Avoid direct reclaim but allow kswapd to wake */ 2913 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) | 2914 __GFP_COMP | __GFP_NOWARN | 2915 __GFP_NORETRY, 2916 SKB_FRAG_PAGE_ORDER); 2917 if (likely(pfrag->page)) { 2918 pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER; 2919 return true; 2920 } 2921 } 2922 pfrag->page = alloc_page(gfp); 2923 if (likely(pfrag->page)) { 2924 pfrag->size = PAGE_SIZE; 2925 return true; 2926 } 2927 return false; 2928 } 2929 EXPORT_SYMBOL(skb_page_frag_refill); 2930 2931 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) 2932 { 2933 if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation))) 2934 return true; 2935 2936 sk_enter_memory_pressure(sk); 2937 sk_stream_moderate_sndbuf(sk); 2938 return false; 2939 } 2940 EXPORT_SYMBOL(sk_page_frag_refill); 2941 2942 void __lock_sock(struct sock *sk) 2943 __releases(&sk->sk_lock.slock) 2944 __acquires(&sk->sk_lock.slock) 2945 { 2946 DEFINE_WAIT(wait); 2947 2948 for (;;) { 2949 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait, 2950 TASK_UNINTERRUPTIBLE); 2951 spin_unlock_bh(&sk->sk_lock.slock); 2952 schedule(); 2953 spin_lock_bh(&sk->sk_lock.slock); 2954 if (!sock_owned_by_user(sk)) 2955 break; 2956 } 2957 finish_wait(&sk->sk_lock.wq, &wait); 2958 } 2959 2960 void __release_sock(struct sock *sk) 2961 __releases(&sk->sk_lock.slock) 2962 __acquires(&sk->sk_lock.slock) 2963 { 2964 struct sk_buff *skb, *next; 2965 2966 while ((skb = sk->sk_backlog.head) != NULL) { 2967 sk->sk_backlog.head = sk->sk_backlog.tail = NULL; 2968 2969 spin_unlock_bh(&sk->sk_lock.slock); 2970 2971 do { 2972 next = skb->next; 2973 prefetch(next); 2974 DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb)); 2975 skb_mark_not_on_list(skb); 2976 sk_backlog_rcv(sk, skb); 2977 2978 cond_resched(); 2979 2980 skb = next; 2981 } while (skb != NULL); 2982 2983 spin_lock_bh(&sk->sk_lock.slock); 2984 } 2985 2986 /* 2987 * Doing the zeroing here guarantee we can not loop forever 2988 * while a wild producer attempts to flood us. 2989 */ 2990 sk->sk_backlog.len = 0; 2991 } 2992 2993 void __sk_flush_backlog(struct sock *sk) 2994 { 2995 spin_lock_bh(&sk->sk_lock.slock); 2996 __release_sock(sk); 2997 2998 if (sk->sk_prot->release_cb) 2999 INDIRECT_CALL_INET_1(sk->sk_prot->release_cb, 3000 tcp_release_cb, sk); 3001 3002 spin_unlock_bh(&sk->sk_lock.slock); 3003 } 3004 EXPORT_SYMBOL_GPL(__sk_flush_backlog); 3005 3006 /** 3007 * sk_wait_data - wait for data to arrive at sk_receive_queue 3008 * @sk: sock to wait on 3009 * @timeo: for how long 3010 * @skb: last skb seen on sk_receive_queue 3011 * 3012 * Now socket state including sk->sk_err is changed only under lock, 3013 * hence we may omit checks after joining wait queue. 3014 * We check receive queue before schedule() only as optimization; 3015 * it is very likely that release_sock() added new data. 3016 */ 3017 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb) 3018 { 3019 DEFINE_WAIT_FUNC(wait, woken_wake_function); 3020 int rc; 3021 3022 add_wait_queue(sk_sleep(sk), &wait); 3023 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); 3024 rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait); 3025 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); 3026 remove_wait_queue(sk_sleep(sk), &wait); 3027 return rc; 3028 } 3029 EXPORT_SYMBOL(sk_wait_data); 3030 3031 /** 3032 * __sk_mem_raise_allocated - increase memory_allocated 3033 * @sk: socket 3034 * @size: memory size to allocate 3035 * @amt: pages to allocate 3036 * @kind: allocation type 3037 * 3038 * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc 3039 */ 3040 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind) 3041 { 3042 bool memcg_charge = mem_cgroup_sockets_enabled && sk->sk_memcg; 3043 struct proto *prot = sk->sk_prot; 3044 bool charged = true; 3045 long allocated; 3046 3047 sk_memory_allocated_add(sk, amt); 3048 allocated = sk_memory_allocated(sk); 3049 if (memcg_charge && 3050 !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt, 3051 gfp_memcg_charge()))) 3052 goto suppress_allocation; 3053 3054 /* Under limit. */ 3055 if (allocated <= sk_prot_mem_limits(sk, 0)) { 3056 sk_leave_memory_pressure(sk); 3057 return 1; 3058 } 3059 3060 /* Under pressure. */ 3061 if (allocated > sk_prot_mem_limits(sk, 1)) 3062 sk_enter_memory_pressure(sk); 3063 3064 /* Over hard limit. */ 3065 if (allocated > sk_prot_mem_limits(sk, 2)) 3066 goto suppress_allocation; 3067 3068 /* guarantee minimum buffer size under pressure */ 3069 if (kind == SK_MEM_RECV) { 3070 if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot)) 3071 return 1; 3072 3073 } else { /* SK_MEM_SEND */ 3074 int wmem0 = sk_get_wmem0(sk, prot); 3075 3076 if (sk->sk_type == SOCK_STREAM) { 3077 if (sk->sk_wmem_queued < wmem0) 3078 return 1; 3079 } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) { 3080 return 1; 3081 } 3082 } 3083 3084 if (sk_has_memory_pressure(sk)) { 3085 u64 alloc; 3086 3087 if (!sk_under_memory_pressure(sk)) 3088 return 1; 3089 alloc = sk_sockets_allocated_read_positive(sk); 3090 if (sk_prot_mem_limits(sk, 2) > alloc * 3091 sk_mem_pages(sk->sk_wmem_queued + 3092 atomic_read(&sk->sk_rmem_alloc) + 3093 sk->sk_forward_alloc)) 3094 return 1; 3095 } 3096 3097 suppress_allocation: 3098 3099 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) { 3100 sk_stream_moderate_sndbuf(sk); 3101 3102 /* Fail only if socket is _under_ its sndbuf. 3103 * In this case we cannot block, so that we have to fail. 3104 */ 3105 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) { 3106 /* Force charge with __GFP_NOFAIL */ 3107 if (memcg_charge && !charged) { 3108 mem_cgroup_charge_skmem(sk->sk_memcg, amt, 3109 gfp_memcg_charge() | __GFP_NOFAIL); 3110 } 3111 return 1; 3112 } 3113 } 3114 3115 if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged)) 3116 trace_sock_exceed_buf_limit(sk, prot, allocated, kind); 3117 3118 sk_memory_allocated_sub(sk, amt); 3119 3120 if (memcg_charge && charged) 3121 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt); 3122 3123 return 0; 3124 } 3125 3126 /** 3127 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated 3128 * @sk: socket 3129 * @size: memory size to allocate 3130 * @kind: allocation type 3131 * 3132 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means 3133 * rmem allocation. This function assumes that protocols which have 3134 * memory_pressure use sk_wmem_queued as write buffer accounting. 3135 */ 3136 int __sk_mem_schedule(struct sock *sk, int size, int kind) 3137 { 3138 int ret, amt = sk_mem_pages(size); 3139 3140 sk_forward_alloc_add(sk, amt << PAGE_SHIFT); 3141 ret = __sk_mem_raise_allocated(sk, size, amt, kind); 3142 if (!ret) 3143 sk_forward_alloc_add(sk, -(amt << PAGE_SHIFT)); 3144 return ret; 3145 } 3146 EXPORT_SYMBOL(__sk_mem_schedule); 3147 3148 /** 3149 * __sk_mem_reduce_allocated - reclaim memory_allocated 3150 * @sk: socket 3151 * @amount: number of quanta 3152 * 3153 * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc 3154 */ 3155 void __sk_mem_reduce_allocated(struct sock *sk, int amount) 3156 { 3157 sk_memory_allocated_sub(sk, amount); 3158 3159 if (mem_cgroup_sockets_enabled && sk->sk_memcg) 3160 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount); 3161 3162 if (sk_under_global_memory_pressure(sk) && 3163 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))) 3164 sk_leave_memory_pressure(sk); 3165 } 3166 3167 /** 3168 * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated 3169 * @sk: socket 3170 * @amount: number of bytes (rounded down to a PAGE_SIZE multiple) 3171 */ 3172 void __sk_mem_reclaim(struct sock *sk, int amount) 3173 { 3174 amount >>= PAGE_SHIFT; 3175 sk_forward_alloc_add(sk, -(amount << PAGE_SHIFT)); 3176 __sk_mem_reduce_allocated(sk, amount); 3177 } 3178 EXPORT_SYMBOL(__sk_mem_reclaim); 3179 3180 int sk_set_peek_off(struct sock *sk, int val) 3181 { 3182 WRITE_ONCE(sk->sk_peek_off, val); 3183 return 0; 3184 } 3185 EXPORT_SYMBOL_GPL(sk_set_peek_off); 3186 3187 /* 3188 * Set of default routines for initialising struct proto_ops when 3189 * the protocol does not support a particular function. In certain 3190 * cases where it makes no sense for a protocol to have a "do nothing" 3191 * function, some default processing is provided. 3192 */ 3193 3194 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len) 3195 { 3196 return -EOPNOTSUPP; 3197 } 3198 EXPORT_SYMBOL(sock_no_bind); 3199 3200 int sock_no_connect(struct socket *sock, struct sockaddr *saddr, 3201 int len, int flags) 3202 { 3203 return -EOPNOTSUPP; 3204 } 3205 EXPORT_SYMBOL(sock_no_connect); 3206 3207 int sock_no_socketpair(struct socket *sock1, struct socket *sock2) 3208 { 3209 return -EOPNOTSUPP; 3210 } 3211 EXPORT_SYMBOL(sock_no_socketpair); 3212 3213 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags, 3214 bool kern) 3215 { 3216 return -EOPNOTSUPP; 3217 } 3218 EXPORT_SYMBOL(sock_no_accept); 3219 3220 int sock_no_getname(struct socket *sock, struct sockaddr *saddr, 3221 int peer) 3222 { 3223 return -EOPNOTSUPP; 3224 } 3225 EXPORT_SYMBOL(sock_no_getname); 3226 3227 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 3228 { 3229 return -EOPNOTSUPP; 3230 } 3231 EXPORT_SYMBOL(sock_no_ioctl); 3232 3233 int sock_no_listen(struct socket *sock, int backlog) 3234 { 3235 return -EOPNOTSUPP; 3236 } 3237 EXPORT_SYMBOL(sock_no_listen); 3238 3239 int sock_no_shutdown(struct socket *sock, int how) 3240 { 3241 return -EOPNOTSUPP; 3242 } 3243 EXPORT_SYMBOL(sock_no_shutdown); 3244 3245 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len) 3246 { 3247 return -EOPNOTSUPP; 3248 } 3249 EXPORT_SYMBOL(sock_no_sendmsg); 3250 3251 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len) 3252 { 3253 return -EOPNOTSUPP; 3254 } 3255 EXPORT_SYMBOL(sock_no_sendmsg_locked); 3256 3257 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len, 3258 int flags) 3259 { 3260 return -EOPNOTSUPP; 3261 } 3262 EXPORT_SYMBOL(sock_no_recvmsg); 3263 3264 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma) 3265 { 3266 /* Mirror missing mmap method error code */ 3267 return -ENODEV; 3268 } 3269 EXPORT_SYMBOL(sock_no_mmap); 3270 3271 /* 3272 * When a file is received (via SCM_RIGHTS, etc), we must bump the 3273 * various sock-based usage counts. 3274 */ 3275 void __receive_sock(struct file *file) 3276 { 3277 struct socket *sock; 3278 3279 sock = sock_from_file(file); 3280 if (sock) { 3281 sock_update_netprioidx(&sock->sk->sk_cgrp_data); 3282 sock_update_classid(&sock->sk->sk_cgrp_data); 3283 } 3284 } 3285 3286 /* 3287 * Default Socket Callbacks 3288 */ 3289 3290 static void sock_def_wakeup(struct sock *sk) 3291 { 3292 struct socket_wq *wq; 3293 3294 rcu_read_lock(); 3295 wq = rcu_dereference(sk->sk_wq); 3296 if (skwq_has_sleeper(wq)) 3297 wake_up_interruptible_all(&wq->wait); 3298 rcu_read_unlock(); 3299 } 3300 3301 static void sock_def_error_report(struct sock *sk) 3302 { 3303 struct socket_wq *wq; 3304 3305 rcu_read_lock(); 3306 wq = rcu_dereference(sk->sk_wq); 3307 if (skwq_has_sleeper(wq)) 3308 wake_up_interruptible_poll(&wq->wait, EPOLLERR); 3309 sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR); 3310 rcu_read_unlock(); 3311 } 3312 3313 void sock_def_readable(struct sock *sk) 3314 { 3315 struct socket_wq *wq; 3316 3317 trace_sk_data_ready(sk); 3318 3319 rcu_read_lock(); 3320 wq = rcu_dereference(sk->sk_wq); 3321 if (skwq_has_sleeper(wq)) 3322 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI | 3323 EPOLLRDNORM | EPOLLRDBAND); 3324 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); 3325 rcu_read_unlock(); 3326 } 3327 3328 static void sock_def_write_space(struct sock *sk) 3329 { 3330 struct socket_wq *wq; 3331 3332 rcu_read_lock(); 3333 3334 /* Do not wake up a writer until he can make "significant" 3335 * progress. --DaveM 3336 */ 3337 if (sock_writeable(sk)) { 3338 wq = rcu_dereference(sk->sk_wq); 3339 if (skwq_has_sleeper(wq)) 3340 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | 3341 EPOLLWRNORM | EPOLLWRBAND); 3342 3343 /* Should agree with poll, otherwise some programs break */ 3344 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); 3345 } 3346 3347 rcu_read_unlock(); 3348 } 3349 3350 /* An optimised version of sock_def_write_space(), should only be called 3351 * for SOCK_RCU_FREE sockets under RCU read section and after putting 3352 * ->sk_wmem_alloc. 3353 */ 3354 static void sock_def_write_space_wfree(struct sock *sk) 3355 { 3356 /* Do not wake up a writer until he can make "significant" 3357 * progress. --DaveM 3358 */ 3359 if (sock_writeable(sk)) { 3360 struct socket_wq *wq = rcu_dereference(sk->sk_wq); 3361 3362 /* rely on refcount_sub from sock_wfree() */ 3363 smp_mb__after_atomic(); 3364 if (wq && waitqueue_active(&wq->wait)) 3365 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | 3366 EPOLLWRNORM | EPOLLWRBAND); 3367 3368 /* Should agree with poll, otherwise some programs break */ 3369 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); 3370 } 3371 } 3372 3373 static void sock_def_destruct(struct sock *sk) 3374 { 3375 } 3376 3377 void sk_send_sigurg(struct sock *sk) 3378 { 3379 if (sk->sk_socket && sk->sk_socket->file) 3380 if (send_sigurg(&sk->sk_socket->file->f_owner)) 3381 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI); 3382 } 3383 EXPORT_SYMBOL(sk_send_sigurg); 3384 3385 void sk_reset_timer(struct sock *sk, struct timer_list* timer, 3386 unsigned long expires) 3387 { 3388 if (!mod_timer(timer, expires)) 3389 sock_hold(sk); 3390 } 3391 EXPORT_SYMBOL(sk_reset_timer); 3392 3393 void sk_stop_timer(struct sock *sk, struct timer_list* timer) 3394 { 3395 if (del_timer(timer)) 3396 __sock_put(sk); 3397 } 3398 EXPORT_SYMBOL(sk_stop_timer); 3399 3400 void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer) 3401 { 3402 if (del_timer_sync(timer)) 3403 __sock_put(sk); 3404 } 3405 EXPORT_SYMBOL(sk_stop_timer_sync); 3406 3407 void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid) 3408 { 3409 sk_init_common(sk); 3410 sk->sk_send_head = NULL; 3411 3412 timer_setup(&sk->sk_timer, NULL, 0); 3413 3414 sk->sk_allocation = GFP_KERNEL; 3415 sk->sk_rcvbuf = READ_ONCE(sysctl_rmem_default); 3416 sk->sk_sndbuf = READ_ONCE(sysctl_wmem_default); 3417 sk->sk_state = TCP_CLOSE; 3418 sk->sk_use_task_frag = true; 3419 sk_set_socket(sk, sock); 3420 3421 sock_set_flag(sk, SOCK_ZAPPED); 3422 3423 if (sock) { 3424 sk->sk_type = sock->type; 3425 RCU_INIT_POINTER(sk->sk_wq, &sock->wq); 3426 sock->sk = sk; 3427 } else { 3428 RCU_INIT_POINTER(sk->sk_wq, NULL); 3429 } 3430 sk->sk_uid = uid; 3431 3432 rwlock_init(&sk->sk_callback_lock); 3433 if (sk->sk_kern_sock) 3434 lockdep_set_class_and_name( 3435 &sk->sk_callback_lock, 3436 af_kern_callback_keys + sk->sk_family, 3437 af_family_kern_clock_key_strings[sk->sk_family]); 3438 else 3439 lockdep_set_class_and_name( 3440 &sk->sk_callback_lock, 3441 af_callback_keys + sk->sk_family, 3442 af_family_clock_key_strings[sk->sk_family]); 3443 3444 sk->sk_state_change = sock_def_wakeup; 3445 sk->sk_data_ready = sock_def_readable; 3446 sk->sk_write_space = sock_def_write_space; 3447 sk->sk_error_report = sock_def_error_report; 3448 sk->sk_destruct = sock_def_destruct; 3449 3450 sk->sk_frag.page = NULL; 3451 sk->sk_frag.offset = 0; 3452 sk->sk_peek_off = -1; 3453 3454 sk->sk_peer_pid = NULL; 3455 sk->sk_peer_cred = NULL; 3456 spin_lock_init(&sk->sk_peer_lock); 3457 3458 sk->sk_write_pending = 0; 3459 sk->sk_rcvlowat = 1; 3460 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; 3461 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; 3462 3463 sk->sk_stamp = SK_DEFAULT_STAMP; 3464 #if BITS_PER_LONG==32 3465 seqlock_init(&sk->sk_stamp_seq); 3466 #endif 3467 atomic_set(&sk->sk_zckey, 0); 3468 3469 #ifdef CONFIG_NET_RX_BUSY_POLL 3470 sk->sk_napi_id = 0; 3471 sk->sk_ll_usec = READ_ONCE(sysctl_net_busy_read); 3472 #endif 3473 3474 sk->sk_max_pacing_rate = ~0UL; 3475 sk->sk_pacing_rate = ~0UL; 3476 WRITE_ONCE(sk->sk_pacing_shift, 10); 3477 sk->sk_incoming_cpu = -1; 3478 3479 sk_rx_queue_clear(sk); 3480 /* 3481 * Before updating sk_refcnt, we must commit prior changes to memory 3482 * (Documentation/RCU/rculist_nulls.rst for details) 3483 */ 3484 smp_wmb(); 3485 refcount_set(&sk->sk_refcnt, 1); 3486 atomic_set(&sk->sk_drops, 0); 3487 } 3488 EXPORT_SYMBOL(sock_init_data_uid); 3489 3490 void sock_init_data(struct socket *sock, struct sock *sk) 3491 { 3492 kuid_t uid = sock ? 3493 SOCK_INODE(sock)->i_uid : 3494 make_kuid(sock_net(sk)->user_ns, 0); 3495 3496 sock_init_data_uid(sock, sk, uid); 3497 } 3498 EXPORT_SYMBOL(sock_init_data); 3499 3500 void lock_sock_nested(struct sock *sk, int subclass) 3501 { 3502 /* The sk_lock has mutex_lock() semantics here. */ 3503 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_); 3504 3505 might_sleep(); 3506 spin_lock_bh(&sk->sk_lock.slock); 3507 if (sock_owned_by_user_nocheck(sk)) 3508 __lock_sock(sk); 3509 sk->sk_lock.owned = 1; 3510 spin_unlock_bh(&sk->sk_lock.slock); 3511 } 3512 EXPORT_SYMBOL(lock_sock_nested); 3513 3514 void release_sock(struct sock *sk) 3515 { 3516 spin_lock_bh(&sk->sk_lock.slock); 3517 if (sk->sk_backlog.tail) 3518 __release_sock(sk); 3519 3520 if (sk->sk_prot->release_cb) 3521 INDIRECT_CALL_INET_1(sk->sk_prot->release_cb, 3522 tcp_release_cb, sk); 3523 3524 sock_release_ownership(sk); 3525 if (waitqueue_active(&sk->sk_lock.wq)) 3526 wake_up(&sk->sk_lock.wq); 3527 spin_unlock_bh(&sk->sk_lock.slock); 3528 } 3529 EXPORT_SYMBOL(release_sock); 3530 3531 bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock) 3532 { 3533 might_sleep(); 3534 spin_lock_bh(&sk->sk_lock.slock); 3535 3536 if (!sock_owned_by_user_nocheck(sk)) { 3537 /* 3538 * Fast path return with bottom halves disabled and 3539 * sock::sk_lock.slock held. 3540 * 3541 * The 'mutex' is not contended and holding 3542 * sock::sk_lock.slock prevents all other lockers to 3543 * proceed so the corresponding unlock_sock_fast() can 3544 * avoid the slow path of release_sock() completely and 3545 * just release slock. 3546 * 3547 * From a semantical POV this is equivalent to 'acquiring' 3548 * the 'mutex', hence the corresponding lockdep 3549 * mutex_release() has to happen in the fast path of 3550 * unlock_sock_fast(). 3551 */ 3552 return false; 3553 } 3554 3555 __lock_sock(sk); 3556 sk->sk_lock.owned = 1; 3557 __acquire(&sk->sk_lock.slock); 3558 spin_unlock_bh(&sk->sk_lock.slock); 3559 return true; 3560 } 3561 EXPORT_SYMBOL(__lock_sock_fast); 3562 3563 int sock_gettstamp(struct socket *sock, void __user *userstamp, 3564 bool timeval, bool time32) 3565 { 3566 struct sock *sk = sock->sk; 3567 struct timespec64 ts; 3568 3569 sock_enable_timestamp(sk, SOCK_TIMESTAMP); 3570 ts = ktime_to_timespec64(sock_read_timestamp(sk)); 3571 if (ts.tv_sec == -1) 3572 return -ENOENT; 3573 if (ts.tv_sec == 0) { 3574 ktime_t kt = ktime_get_real(); 3575 sock_write_timestamp(sk, kt); 3576 ts = ktime_to_timespec64(kt); 3577 } 3578 3579 if (timeval) 3580 ts.tv_nsec /= 1000; 3581 3582 #ifdef CONFIG_COMPAT_32BIT_TIME 3583 if (time32) 3584 return put_old_timespec32(&ts, userstamp); 3585 #endif 3586 #ifdef CONFIG_SPARC64 3587 /* beware of padding in sparc64 timeval */ 3588 if (timeval && !in_compat_syscall()) { 3589 struct __kernel_old_timeval __user tv = { 3590 .tv_sec = ts.tv_sec, 3591 .tv_usec = ts.tv_nsec, 3592 }; 3593 if (copy_to_user(userstamp, &tv, sizeof(tv))) 3594 return -EFAULT; 3595 return 0; 3596 } 3597 #endif 3598 return put_timespec64(&ts, userstamp); 3599 } 3600 EXPORT_SYMBOL(sock_gettstamp); 3601 3602 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag) 3603 { 3604 if (!sock_flag(sk, flag)) { 3605 unsigned long previous_flags = sk->sk_flags; 3606 3607 sock_set_flag(sk, flag); 3608 /* 3609 * we just set one of the two flags which require net 3610 * time stamping, but time stamping might have been on 3611 * already because of the other one 3612 */ 3613 if (sock_needs_netstamp(sk) && 3614 !(previous_flags & SK_FLAGS_TIMESTAMP)) 3615 net_enable_timestamp(); 3616 } 3617 } 3618 3619 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, 3620 int level, int type) 3621 { 3622 struct sock_exterr_skb *serr; 3623 struct sk_buff *skb; 3624 int copied, err; 3625 3626 err = -EAGAIN; 3627 skb = sock_dequeue_err_skb(sk); 3628 if (skb == NULL) 3629 goto out; 3630 3631 copied = skb->len; 3632 if (copied > len) { 3633 msg->msg_flags |= MSG_TRUNC; 3634 copied = len; 3635 } 3636 err = skb_copy_datagram_msg(skb, 0, msg, copied); 3637 if (err) 3638 goto out_free_skb; 3639 3640 sock_recv_timestamp(msg, sk, skb); 3641 3642 serr = SKB_EXT_ERR(skb); 3643 put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee); 3644 3645 msg->msg_flags |= MSG_ERRQUEUE; 3646 err = copied; 3647 3648 out_free_skb: 3649 kfree_skb(skb); 3650 out: 3651 return err; 3652 } 3653 EXPORT_SYMBOL(sock_recv_errqueue); 3654 3655 /* 3656 * Get a socket option on an socket. 3657 * 3658 * FIX: POSIX 1003.1g is very ambiguous here. It states that 3659 * asynchronous errors should be reported by getsockopt. We assume 3660 * this means if you specify SO_ERROR (otherwise whats the point of it). 3661 */ 3662 int sock_common_getsockopt(struct socket *sock, int level, int optname, 3663 char __user *optval, int __user *optlen) 3664 { 3665 struct sock *sk = sock->sk; 3666 3667 /* IPV6_ADDRFORM can change sk->sk_prot under us. */ 3668 return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen); 3669 } 3670 EXPORT_SYMBOL(sock_common_getsockopt); 3671 3672 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, 3673 int flags) 3674 { 3675 struct sock *sk = sock->sk; 3676 int addr_len = 0; 3677 int err; 3678 3679 err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len); 3680 if (err >= 0) 3681 msg->msg_namelen = addr_len; 3682 return err; 3683 } 3684 EXPORT_SYMBOL(sock_common_recvmsg); 3685 3686 /* 3687 * Set socket options on an inet socket. 3688 */ 3689 int sock_common_setsockopt(struct socket *sock, int level, int optname, 3690 sockptr_t optval, unsigned int optlen) 3691 { 3692 struct sock *sk = sock->sk; 3693 3694 /* IPV6_ADDRFORM can change sk->sk_prot under us. */ 3695 return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen); 3696 } 3697 EXPORT_SYMBOL(sock_common_setsockopt); 3698 3699 void sk_common_release(struct sock *sk) 3700 { 3701 if (sk->sk_prot->destroy) 3702 sk->sk_prot->destroy(sk); 3703 3704 /* 3705 * Observation: when sk_common_release is called, processes have 3706 * no access to socket. But net still has. 3707 * Step one, detach it from networking: 3708 * 3709 * A. Remove from hash tables. 3710 */ 3711 3712 sk->sk_prot->unhash(sk); 3713 3714 /* 3715 * In this point socket cannot receive new packets, but it is possible 3716 * that some packets are in flight because some CPU runs receiver and 3717 * did hash table lookup before we unhashed socket. They will achieve 3718 * receive queue and will be purged by socket destructor. 3719 * 3720 * Also we still have packets pending on receive queue and probably, 3721 * our own packets waiting in device queues. sock_destroy will drain 3722 * receive queue, but transmitted packets will delay socket destruction 3723 * until the last reference will be released. 3724 */ 3725 3726 sock_orphan(sk); 3727 3728 xfrm_sk_free_policy(sk); 3729 3730 sock_put(sk); 3731 } 3732 EXPORT_SYMBOL(sk_common_release); 3733 3734 void sk_get_meminfo(const struct sock *sk, u32 *mem) 3735 { 3736 memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS); 3737 3738 mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk); 3739 mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf); 3740 mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk); 3741 mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf); 3742 mem[SK_MEMINFO_FWD_ALLOC] = sk_forward_alloc_get(sk); 3743 mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued); 3744 mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); 3745 mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len); 3746 mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops); 3747 } 3748 3749 #ifdef CONFIG_PROC_FS 3750 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR); 3751 3752 int sock_prot_inuse_get(struct net *net, struct proto *prot) 3753 { 3754 int cpu, idx = prot->inuse_idx; 3755 int res = 0; 3756 3757 for_each_possible_cpu(cpu) 3758 res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx]; 3759 3760 return res >= 0 ? res : 0; 3761 } 3762 EXPORT_SYMBOL_GPL(sock_prot_inuse_get); 3763 3764 int sock_inuse_get(struct net *net) 3765 { 3766 int cpu, res = 0; 3767 3768 for_each_possible_cpu(cpu) 3769 res += per_cpu_ptr(net->core.prot_inuse, cpu)->all; 3770 3771 return res; 3772 } 3773 3774 EXPORT_SYMBOL_GPL(sock_inuse_get); 3775 3776 static int __net_init sock_inuse_init_net(struct net *net) 3777 { 3778 net->core.prot_inuse = alloc_percpu(struct prot_inuse); 3779 if (net->core.prot_inuse == NULL) 3780 return -ENOMEM; 3781 return 0; 3782 } 3783 3784 static void __net_exit sock_inuse_exit_net(struct net *net) 3785 { 3786 free_percpu(net->core.prot_inuse); 3787 } 3788 3789 static struct pernet_operations net_inuse_ops = { 3790 .init = sock_inuse_init_net, 3791 .exit = sock_inuse_exit_net, 3792 }; 3793 3794 static __init int net_inuse_init(void) 3795 { 3796 if (register_pernet_subsys(&net_inuse_ops)) 3797 panic("Cannot initialize net inuse counters"); 3798 3799 return 0; 3800 } 3801 3802 core_initcall(net_inuse_init); 3803 3804 static int assign_proto_idx(struct proto *prot) 3805 { 3806 prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR); 3807 3808 if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) { 3809 pr_err("PROTO_INUSE_NR exhausted\n"); 3810 return -ENOSPC; 3811 } 3812 3813 set_bit(prot->inuse_idx, proto_inuse_idx); 3814 return 0; 3815 } 3816 3817 static void release_proto_idx(struct proto *prot) 3818 { 3819 if (prot->inuse_idx != PROTO_INUSE_NR - 1) 3820 clear_bit(prot->inuse_idx, proto_inuse_idx); 3821 } 3822 #else 3823 static inline int assign_proto_idx(struct proto *prot) 3824 { 3825 return 0; 3826 } 3827 3828 static inline void release_proto_idx(struct proto *prot) 3829 { 3830 } 3831 3832 #endif 3833 3834 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot) 3835 { 3836 if (!twsk_prot) 3837 return; 3838 kfree(twsk_prot->twsk_slab_name); 3839 twsk_prot->twsk_slab_name = NULL; 3840 kmem_cache_destroy(twsk_prot->twsk_slab); 3841 twsk_prot->twsk_slab = NULL; 3842 } 3843 3844 static int tw_prot_init(const struct proto *prot) 3845 { 3846 struct timewait_sock_ops *twsk_prot = prot->twsk_prot; 3847 3848 if (!twsk_prot) 3849 return 0; 3850 3851 twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", 3852 prot->name); 3853 if (!twsk_prot->twsk_slab_name) 3854 return -ENOMEM; 3855 3856 twsk_prot->twsk_slab = 3857 kmem_cache_create(twsk_prot->twsk_slab_name, 3858 twsk_prot->twsk_obj_size, 0, 3859 SLAB_ACCOUNT | prot->slab_flags, 3860 NULL); 3861 if (!twsk_prot->twsk_slab) { 3862 pr_crit("%s: Can't create timewait sock SLAB cache!\n", 3863 prot->name); 3864 return -ENOMEM; 3865 } 3866 3867 return 0; 3868 } 3869 3870 static void req_prot_cleanup(struct request_sock_ops *rsk_prot) 3871 { 3872 if (!rsk_prot) 3873 return; 3874 kfree(rsk_prot->slab_name); 3875 rsk_prot->slab_name = NULL; 3876 kmem_cache_destroy(rsk_prot->slab); 3877 rsk_prot->slab = NULL; 3878 } 3879 3880 static int req_prot_init(const struct proto *prot) 3881 { 3882 struct request_sock_ops *rsk_prot = prot->rsk_prot; 3883 3884 if (!rsk_prot) 3885 return 0; 3886 3887 rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", 3888 prot->name); 3889 if (!rsk_prot->slab_name) 3890 return -ENOMEM; 3891 3892 rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name, 3893 rsk_prot->obj_size, 0, 3894 SLAB_ACCOUNT | prot->slab_flags, 3895 NULL); 3896 3897 if (!rsk_prot->slab) { 3898 pr_crit("%s: Can't create request sock SLAB cache!\n", 3899 prot->name); 3900 return -ENOMEM; 3901 } 3902 return 0; 3903 } 3904 3905 int proto_register(struct proto *prot, int alloc_slab) 3906 { 3907 int ret = -ENOBUFS; 3908 3909 if (prot->memory_allocated && !prot->sysctl_mem) { 3910 pr_err("%s: missing sysctl_mem\n", prot->name); 3911 return -EINVAL; 3912 } 3913 if (prot->memory_allocated && !prot->per_cpu_fw_alloc) { 3914 pr_err("%s: missing per_cpu_fw_alloc\n", prot->name); 3915 return -EINVAL; 3916 } 3917 if (alloc_slab) { 3918 prot->slab = kmem_cache_create_usercopy(prot->name, 3919 prot->obj_size, 0, 3920 SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT | 3921 prot->slab_flags, 3922 prot->useroffset, prot->usersize, 3923 NULL); 3924 3925 if (prot->slab == NULL) { 3926 pr_crit("%s: Can't create sock SLAB cache!\n", 3927 prot->name); 3928 goto out; 3929 } 3930 3931 if (req_prot_init(prot)) 3932 goto out_free_request_sock_slab; 3933 3934 if (tw_prot_init(prot)) 3935 goto out_free_timewait_sock_slab; 3936 } 3937 3938 mutex_lock(&proto_list_mutex); 3939 ret = assign_proto_idx(prot); 3940 if (ret) { 3941 mutex_unlock(&proto_list_mutex); 3942 goto out_free_timewait_sock_slab; 3943 } 3944 list_add(&prot->node, &proto_list); 3945 mutex_unlock(&proto_list_mutex); 3946 return ret; 3947 3948 out_free_timewait_sock_slab: 3949 if (alloc_slab) 3950 tw_prot_cleanup(prot->twsk_prot); 3951 out_free_request_sock_slab: 3952 if (alloc_slab) { 3953 req_prot_cleanup(prot->rsk_prot); 3954 3955 kmem_cache_destroy(prot->slab); 3956 prot->slab = NULL; 3957 } 3958 out: 3959 return ret; 3960 } 3961 EXPORT_SYMBOL(proto_register); 3962 3963 void proto_unregister(struct proto *prot) 3964 { 3965 mutex_lock(&proto_list_mutex); 3966 release_proto_idx(prot); 3967 list_del(&prot->node); 3968 mutex_unlock(&proto_list_mutex); 3969 3970 kmem_cache_destroy(prot->slab); 3971 prot->slab = NULL; 3972 3973 req_prot_cleanup(prot->rsk_prot); 3974 tw_prot_cleanup(prot->twsk_prot); 3975 } 3976 EXPORT_SYMBOL(proto_unregister); 3977 3978 int sock_load_diag_module(int family, int protocol) 3979 { 3980 if (!protocol) { 3981 if (!sock_is_registered(family)) 3982 return -ENOENT; 3983 3984 return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, 3985 NETLINK_SOCK_DIAG, family); 3986 } 3987 3988 #ifdef CONFIG_INET 3989 if (family == AF_INET && 3990 protocol != IPPROTO_RAW && 3991 protocol < MAX_INET_PROTOS && 3992 !rcu_access_pointer(inet_protos[protocol])) 3993 return -ENOENT; 3994 #endif 3995 3996 return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK, 3997 NETLINK_SOCK_DIAG, family, protocol); 3998 } 3999 EXPORT_SYMBOL(sock_load_diag_module); 4000 4001 #ifdef CONFIG_PROC_FS 4002 static void *proto_seq_start(struct seq_file *seq, loff_t *pos) 4003 __acquires(proto_list_mutex) 4004 { 4005 mutex_lock(&proto_list_mutex); 4006 return seq_list_start_head(&proto_list, *pos); 4007 } 4008 4009 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos) 4010 { 4011 return seq_list_next(v, &proto_list, pos); 4012 } 4013 4014 static void proto_seq_stop(struct seq_file *seq, void *v) 4015 __releases(proto_list_mutex) 4016 { 4017 mutex_unlock(&proto_list_mutex); 4018 } 4019 4020 static char proto_method_implemented(const void *method) 4021 { 4022 return method == NULL ? 'n' : 'y'; 4023 } 4024 static long sock_prot_memory_allocated(struct proto *proto) 4025 { 4026 return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L; 4027 } 4028 4029 static const char *sock_prot_memory_pressure(struct proto *proto) 4030 { 4031 return proto->memory_pressure != NULL ? 4032 proto_memory_pressure(proto) ? "yes" : "no" : "NI"; 4033 } 4034 4035 static void proto_seq_printf(struct seq_file *seq, struct proto *proto) 4036 { 4037 4038 seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s " 4039 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n", 4040 proto->name, 4041 proto->obj_size, 4042 sock_prot_inuse_get(seq_file_net(seq), proto), 4043 sock_prot_memory_allocated(proto), 4044 sock_prot_memory_pressure(proto), 4045 proto->max_header, 4046 proto->slab == NULL ? "no" : "yes", 4047 module_name(proto->owner), 4048 proto_method_implemented(proto->close), 4049 proto_method_implemented(proto->connect), 4050 proto_method_implemented(proto->disconnect), 4051 proto_method_implemented(proto->accept), 4052 proto_method_implemented(proto->ioctl), 4053 proto_method_implemented(proto->init), 4054 proto_method_implemented(proto->destroy), 4055 proto_method_implemented(proto->shutdown), 4056 proto_method_implemented(proto->setsockopt), 4057 proto_method_implemented(proto->getsockopt), 4058 proto_method_implemented(proto->sendmsg), 4059 proto_method_implemented(proto->recvmsg), 4060 proto_method_implemented(proto->bind), 4061 proto_method_implemented(proto->backlog_rcv), 4062 proto_method_implemented(proto->hash), 4063 proto_method_implemented(proto->unhash), 4064 proto_method_implemented(proto->get_port), 4065 proto_method_implemented(proto->enter_memory_pressure)); 4066 } 4067 4068 static int proto_seq_show(struct seq_file *seq, void *v) 4069 { 4070 if (v == &proto_list) 4071 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s", 4072 "protocol", 4073 "size", 4074 "sockets", 4075 "memory", 4076 "press", 4077 "maxhdr", 4078 "slab", 4079 "module", 4080 "cl co di ac io in de sh ss gs se re bi br ha uh gp em\n"); 4081 else 4082 proto_seq_printf(seq, list_entry(v, struct proto, node)); 4083 return 0; 4084 } 4085 4086 static const struct seq_operations proto_seq_ops = { 4087 .start = proto_seq_start, 4088 .next = proto_seq_next, 4089 .stop = proto_seq_stop, 4090 .show = proto_seq_show, 4091 }; 4092 4093 static __net_init int proto_init_net(struct net *net) 4094 { 4095 if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops, 4096 sizeof(struct seq_net_private))) 4097 return -ENOMEM; 4098 4099 return 0; 4100 } 4101 4102 static __net_exit void proto_exit_net(struct net *net) 4103 { 4104 remove_proc_entry("protocols", net->proc_net); 4105 } 4106 4107 4108 static __net_initdata struct pernet_operations proto_net_ops = { 4109 .init = proto_init_net, 4110 .exit = proto_exit_net, 4111 }; 4112 4113 static int __init proto_init(void) 4114 { 4115 return register_pernet_subsys(&proto_net_ops); 4116 } 4117 4118 subsys_initcall(proto_init); 4119 4120 #endif /* PROC_FS */ 4121 4122 #ifdef CONFIG_NET_RX_BUSY_POLL 4123 bool sk_busy_loop_end(void *p, unsigned long start_time) 4124 { 4125 struct sock *sk = p; 4126 4127 return !skb_queue_empty_lockless(&sk->sk_receive_queue) || 4128 sk_busy_loop_timeout(sk, start_time); 4129 } 4130 EXPORT_SYMBOL(sk_busy_loop_end); 4131 #endif /* CONFIG_NET_RX_BUSY_POLL */ 4132 4133 int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len) 4134 { 4135 if (!sk->sk_prot->bind_add) 4136 return -EOPNOTSUPP; 4137 return sk->sk_prot->bind_add(sk, addr, addr_len); 4138 } 4139 EXPORT_SYMBOL(sock_bind_add); 4140 4141 /* Copy 'size' bytes from userspace and return `size` back to userspace */ 4142 int sock_ioctl_inout(struct sock *sk, unsigned int cmd, 4143 void __user *arg, void *karg, size_t size) 4144 { 4145 int ret; 4146 4147 if (copy_from_user(karg, arg, size)) 4148 return -EFAULT; 4149 4150 ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg); 4151 if (ret) 4152 return ret; 4153 4154 if (copy_to_user(arg, karg, size)) 4155 return -EFAULT; 4156 4157 return 0; 4158 } 4159 EXPORT_SYMBOL(sock_ioctl_inout); 4160 4161 /* This is the most common ioctl prep function, where the result (4 bytes) is 4162 * copied back to userspace if the ioctl() returns successfully. No input is 4163 * copied from userspace as input argument. 4164 */ 4165 static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg) 4166 { 4167 int ret, karg = 0; 4168 4169 ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &karg); 4170 if (ret) 4171 return ret; 4172 4173 return put_user(karg, (int __user *)arg); 4174 } 4175 4176 /* A wrapper around sock ioctls, which copies the data from userspace 4177 * (depending on the protocol/ioctl), and copies back the result to userspace. 4178 * The main motivation for this function is to pass kernel memory to the 4179 * protocol ioctl callbacks, instead of userspace memory. 4180 */ 4181 int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) 4182 { 4183 int rc = 1; 4184 4185 if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET) 4186 rc = ipmr_sk_ioctl(sk, cmd, arg); 4187 else if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET6) 4188 rc = ip6mr_sk_ioctl(sk, cmd, arg); 4189 else if (sk_is_phonet(sk)) 4190 rc = phonet_sk_ioctl(sk, cmd, arg); 4191 4192 /* If ioctl was processed, returns its value */ 4193 if (rc <= 0) 4194 return rc; 4195 4196 /* Otherwise call the default handler */ 4197 return sock_ioctl_out(sk, cmd, arg); 4198 } 4199 EXPORT_SYMBOL(sk_ioctl); 4200