/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic socket support routines. Memory allocators, socket lock/release
 *		handler for protocols to use and generic option handler.
 *
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Fixes:
 *		Alan Cox	:	Numerous verify_area() problems
 *		Alan Cox	:	Connecting on a connecting socket
 *					now returns an error for tcp.
 *		Alan Cox	:	sock->protocol is set correctly.
 *					and is not sometimes left as 0.
 *		Alan Cox	:	connect handles icmp errors on a
 *					connect properly. Unfortunately there
 *					is a restart syscall nasty there. I
 *					can't match BSD without hacking the C
 *					library. Ideas urgently sought!
 *		Alan Cox	:	Disallow bind() to addresses that are
 *					not ours - especially broadcast ones!!
 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
 *					instead they leave that for the DESTROY timer.
 *		Alan Cox	:	Clean up error flag in accept
 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
 *					was buggy. Put a remove_sock() in the handler
 *					for memory when we hit 0. Also altered the timer
 *					code. The ACK stuff can wait and needs major
 *					TCP layer surgery.
 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
 *					and fixed timer/inet_bh race.
 *		Alan Cox	:	Added zapped flag for TCP
 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
 *		Pauline Middelink:	identd support
 *		Alan Cox	:	Fixed connect() taking signals I think.
 *		Alan Cox	:	SO_LINGER supported
 *		Alan Cox	:	Error reporting fixes
 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
 *		Alan Cox	:	inet sockets don't set sk->type!
 *		Alan Cox	:	Split socket option code
 *		Alan Cox	:	Callbacks
 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
 *		Alex		:	Removed restriction on inet fioctl
 *		Alan Cox	:	Splitting INET from NET core
 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
 *		Alan Cox	:	Split IP from generic code
 *		Alan Cox	:	New kfree_skbmem()
 *		Alan Cox	:	Make SO_DEBUG superuser only.
 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
 *					(compatibility fix)
 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
 *		Alan Cox	:	Allocator for a socket is settable.
 *		Alan Cox	:	SO_ERROR includes soft errors.
 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
 *		Alan Cox	:	Generic socket allocation to make hooks
 *					easier (suggested by Craig Metz).
 *		Michael Pall	:	SO_ERROR returns positive errno again
 *		Steve Whitehouse:	Added default destructor to free
 *					protocol private data.
 *		Steve Whitehouse:	Added various other default routines
 *					common to several socket families.
 *		Chris Evans	:	Call suser() check last on F_SETOWN
 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
 *		Andi Kleen	:	Fix write_space callback
 *		Chris Evans	:	Security fixes - signedness again
 *		Arnaldo C. Melo	:	cleanups, use skb_queue_purge
 *
 * To Fix:
 *
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <asm/unaligned.h>
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/errqueue.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/user_namespace.h>
#include <linux/static_key.h>
#include <linux/memcontrol.h>
#include <linux/prefetch.h>

#include <linux/uaccess.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/request_sock.h>
#include <net/sock.h>
#include <linux/net_tstamp.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>
#include <net/cls_cgroup.h>
#include <net/netprio_cgroup.h>
#include <linux/sock_diag.h>

#include <linux/filter.h>
#include <net/sock_reuseport.h>

#include <trace/events/sock.h>

#include <net/tcp.h>
#include <net/busy_poll.h>

static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);

static void sock_inuse_add(struct net *net, int val);

/**
 * sk_ns_capable - General socket capability test
 * @sk: Socket to use a capability on or through
 * @user_ns: The user namespace of the capability to use
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when
 * the socket was created and that the current process has it in the
 * user namespace @user_ns.
 */
bool sk_ns_capable(const struct sock *sk,
		   struct user_namespace *user_ns, int cap)
{
	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
		ns_capable(user_ns, cap);
}
EXPORT_SYMBOL(sk_ns_capable);

/**
 * sk_capable - Socket global capability test
 * @sk: Socket to use a capability on or through
 * @cap: The global capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when
 * the socket was created and that the current process has it in all
 * user namespaces.
 */
bool sk_capable(const struct sock *sk, int cap)
{
	return sk_ns_capable(sk, &init_user_ns, cap);
}
EXPORT_SYMBOL(sk_capable);

/**
 * sk_net_capable - Network namespace socket capability test
 * @sk: Socket to use a capability on or through
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when the
 * socket was created and that the current process has it over the network
 * namespace the socket is a member of.
 */
bool sk_net_capable(const struct sock *sk, int cap)
{
	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
}
EXPORT_SYMBOL(sk_net_capable);
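
/*
 * Illustrative sketch only: a protocol-private setsockopt or ioctl handler
 * would typically gate a privileged operation on the helpers above like
 * this (the option being guarded is hypothetical):
 *
 *	if (!sk_net_capable(sk, CAP_NET_ADMIN))
 *		return -EPERM;	// caller lacks CAP_NET_ADMIN in the
 *				// socket's network namespace
 */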

/*
 * Each address family might have different locking rules, so we have
 * one slock key per address family and separate keys for internal and
 * userspace sockets.
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_kern_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];
static struct lock_class_key af_family_kern_slock_keys[AF_MAX];

/*
 * Make lock validator output more readable. (we pre-construct these
 * strings build-time, so that runtime initialization of socket
 * locks is fast):
 */

#define _sock_locks(x)						\
	x "AF_UNSPEC",	x "AF_UNIX",	x "AF_INET",		\
	x "AF_AX25",	x "AF_IPX",	x "AF_APPLETALK",	\
	x "AF_NETROM",	x "AF_BRIDGE",	x "AF_ATMPVC",		\
	x "AF_X25",	x "AF_INET6",	x "AF_ROSE",		\
	x "AF_DECnet",	x "AF_NETBEUI",	x "AF_SECURITY",	\
	x "AF_KEY",	x "AF_NETLINK",	x "AF_PACKET",		\
	x "AF_ASH",	x "AF_ECONET",	x "AF_ATMSVC",		\
	x "AF_RDS",	x "AF_SNA",	x "AF_IRDA",		\
	x "AF_PPPOX",	x "AF_WANPIPE",	x "AF_LLC",		\
	x "27",		x "28",		x "AF_CAN",		\
	x "AF_TIPC",	x "AF_BLUETOOTH", x "IUCV",		\
	x "AF_RXRPC",	x "AF_ISDN",	x "AF_PHONET",		\
	x "AF_IEEE802154", x "AF_CAIF",	x "AF_ALG",		\
	x "AF_NFC",	x "AF_VSOCK",	x "AF_KCM",		\
	x "AF_QIPCRTR",	x "AF_SMC",	x "AF_XDP",		\
	x "AF_MAX"

static const char *const af_family_key_strings[AF_MAX+1] = {
	_sock_locks("sk_lock-")
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
	_sock_locks("slock-")
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
	_sock_locks("clock-")
};

static const char *const af_family_kern_key_strings[AF_MAX+1] = {
	_sock_locks("k-sk_lock-")
};
static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
	_sock_locks("k-slock-")
};
static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
	_sock_locks("k-clock-")
};
static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
	_sock_locks("rlock-")
};
static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
	_sock_locks("wlock-")
};
static const char *const af_family_elock_key_strings[AF_MAX+1] = {
	_sock_locks("elock-")
};

/*
 * sk_callback_lock and sk queues locking rules are per-address-family,
 * so split the lock classes by using a per-AF key:
 */
static struct lock_class_key af_callback_keys[AF_MAX];
static struct lock_class_key af_rlock_keys[AF_MAX];
static struct lock_class_key af_wlock_keys[AF_MAX];
static struct lock_class_key af_elock_keys[AF_MAX];
static struct lock_class_key af_kern_callback_keys[AF_MAX];

/* Run time adjustable parameters. */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
EXPORT_SYMBOL(sysctl_wmem_max);
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
EXPORT_SYMBOL(sysctl_rmem_max);
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
EXPORT_SYMBOL(sysctl_optmem_max);

int sysctl_tstamp_allow_data __read_mostly = 1;

DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
EXPORT_SYMBOL_GPL(memalloc_socks_key);

/**
 * sk_set_memalloc - sets %SOCK_MEMALLOC
 * @sk: socket to set it on
 *
 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 * It's the responsibility of the admin to adjust min_free_kbytes
 * to meet the requirements
 */
void sk_set_memalloc(struct sock *sk)
{
	sock_set_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation |= __GFP_MEMALLOC;
	static_branch_inc(&memalloc_socks_key);
}
EXPORT_SYMBOL_GPL(sk_set_memalloc);

void sk_clear_memalloc(struct sock *sk)
{
	sock_reset_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation &= ~__GFP_MEMALLOC;
	static_branch_dec(&memalloc_socks_key);

	/*
	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
	 * progress of swapping. SOCK_MEMALLOC may be cleared while
	 * it has rmem allocations due to the last swapfile being deactivated
	 * but there is a risk that the socket is unusable due to exceeding
	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
	 */
	sk_mem_reclaim(sk);
}
EXPORT_SYMBOL_GPL(sk_clear_memalloc);

int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
{
	int ret;
	unsigned int noreclaim_flag;

	/* these should have been dropped before queueing */
	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));

	noreclaim_flag = memalloc_noreclaim_save();
	ret = sk->sk_backlog_rcv(sk, skb);
	memalloc_noreclaim_restore(noreclaim_flag);

	return ret;
}
EXPORT_SYMBOL(__sk_backlog_rcv);

static int sock_get_timeout(long timeo, void *optval, bool old_timeval)
{
	struct __kernel_sock_timeval tv;
	int size;

	if (timeo == MAX_SCHEDULE_TIMEOUT) {
		tv.tv_sec = 0;
		tv.tv_usec = 0;
	} else {
		tv.tv_sec = timeo / HZ;
		tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
	}

	if (in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
		struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
		*(struct old_timeval32 *)optval = tv32;
		return sizeof(tv32);
	}

	if (old_timeval) {
		struct __kernel_old_timeval old_tv;
		old_tv.tv_sec = tv.tv_sec;
		old_tv.tv_usec = tv.tv_usec;
		*(struct __kernel_old_timeval *)optval = old_tv;
		size = sizeof(old_tv);
	} else {
		*(struct __kernel_sock_timeval *)optval = tv;
		size = sizeof(tv);
	}

	return size;
}
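
/*
 * Worked example (illustrative only, assuming HZ == 1000): a timeout of
 * 2500 jiffies is reported by sock_get_timeout() above as
 * { .tv_sec = 2, .tv_usec = 500000 }, and feeding that same timeval back
 * through sock_set_timeout() below yields
 * 2 * HZ + DIV_ROUND_UP(500000, USEC_PER_SEC / HZ) = 2500 jiffies again.
 */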

static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen, bool old_timeval)
{
	struct __kernel_sock_timeval tv;

	if (in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
		struct old_timeval32 tv32;

		if (optlen < sizeof(tv32))
			return -EINVAL;

		if (copy_from_user(&tv32, optval, sizeof(tv32)))
			return -EFAULT;
		tv.tv_sec = tv32.tv_sec;
		tv.tv_usec = tv32.tv_usec;
	} else if (old_timeval) {
		struct __kernel_old_timeval old_tv;

		if (optlen < sizeof(old_tv))
			return -EINVAL;
		if (copy_from_user(&old_tv, optval, sizeof(old_tv)))
			return -EFAULT;
		tv.tv_sec = old_tv.tv_sec;
		tv.tv_usec = old_tv.tv_usec;
	} else {
		if (optlen < sizeof(tv))
			return -EINVAL;
		if (copy_from_user(&tv, optval, sizeof(tv)))
			return -EFAULT;
	}
	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
		return -EDOM;

	if (tv.tv_sec < 0) {
		static int warned __read_mostly;

		*timeo_p = 0;
		if (warned < 10 && net_ratelimit()) {
			warned++;
			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
				__func__, current->comm, task_pid_nr(current));
		}
		return 0;
	}
	*timeo_p = MAX_SCHEDULE_TIMEOUT;
	if (tv.tv_sec == 0 && tv.tv_usec == 0)
		return 0;
	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))
		*timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ);
	return 0;
}

static void sock_warn_obsolete_bsdism(const char *name)
{
	static int warned;
	static char warncomm[TASK_COMM_LEN];
	if (strcmp(warncomm, current->comm) && warned < 5) {
		strcpy(warncomm, current->comm);
		pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
			warncomm, name);
		warned++;
	}
}

static bool sock_needs_netstamp(const struct sock *sk)
{
	switch (sk->sk_family) {
	case AF_UNSPEC:
	case AF_UNIX:
		return false;
	default:
		return true;
	}
}

static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
{
	if (sk->sk_flags & flags) {
		sk->sk_flags &= ~flags;
		if (sock_needs_netstamp(sk) &&
		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
			net_disable_timestamp();
	}
}


int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	unsigned long flags;
	struct sk_buff_head *list = &sk->sk_receive_queue;

	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
		atomic_inc(&sk->sk_drops);
		trace_sock_rcvqueue_full(sk, skb);
		return -ENOMEM;
	}

	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
		atomic_inc(&sk->sk_drops);
		return -ENOBUFS;
	}

	skb->dev = NULL;
	skb_set_owner_r(skb, sk);

	/* we escape from rcu protected region, make sure we don't leak
	 * a non-refcounted dst
	 */
	skb_dst_force(skb);

	spin_lock_irqsave(&list->lock, flags);
	sock_skb_set_dropcount(sk, skb);
	__skb_queue_tail(list, skb);
	spin_unlock_irqrestore(&list->lock, flags);

	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_data_ready(sk);
	return 0;
}
EXPORT_SYMBOL(__sock_queue_rcv_skb);

int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	int err;

	err = sk_filter(sk, skb);
	if (err)
		return err;

	return __sock_queue_rcv_skb(sk, skb);
}
EXPORT_SYMBOL(sock_queue_rcv_skb);
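
/*
 * Illustrative sketch only: a minimal protocol receive path built on the
 * helper above. my_proto_rcv() is hypothetical; real callers add their own
 * statistics, but the error contract is the same: a negative return means
 * the skb was not queued and must be freed by the caller.
 *
 *	static int my_proto_rcv(struct sock *sk, struct sk_buff *skb)
 *	{
 *		int err = sock_queue_rcv_skb(sk, skb);
 *
 *		if (err < 0)		// -ENOMEM/-ENOBUFS: rcvbuf or rmem
 *			kfree_skb(skb);	// exhausted, or socket filter dropped it
 *		return err;
 *	}
 */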

int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
		     const int nested, unsigned int trim_cap, bool refcounted)
{
	int rc = NET_RX_SUCCESS;

	if (sk_filter_trim_cap(sk, skb, trim_cap))
		goto discard_and_relse;

	skb->dev = NULL;

	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
		atomic_inc(&sk->sk_drops);
		goto discard_and_relse;
	}
	if (nested)
		bh_lock_sock_nested(sk);
	else
		bh_lock_sock(sk);
	if (!sock_owned_by_user(sk)) {
		/*
		 * trylock + unlock semantics:
		 */
		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);

		rc = sk_backlog_rcv(sk, skb);

		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
	} else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
		bh_unlock_sock(sk);
		atomic_inc(&sk->sk_drops);
		goto discard_and_relse;
	}

	bh_unlock_sock(sk);
out:
	if (refcounted)
		sock_put(sk);
	return rc;
discard_and_relse:
	kfree_skb(skb);
	goto out;
}
EXPORT_SYMBOL(__sk_receive_skb);

struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = __sk_dst_get(sk);

	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
		sk_tx_queue_clear(sk);
		sk->sk_dst_pending_confirm = 0;
		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(__sk_dst_check);

struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = sk_dst_get(sk);

	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
		sk_dst_reset(sk);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(sk_dst_check);

static int sock_setbindtodevice_locked(struct sock *sk, int ifindex)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);

	/* Sorry... */
	ret = -EPERM;
	if (!ns_capable(net->user_ns, CAP_NET_RAW))
		goto out;

	ret = -EINVAL;
	if (ifindex < 0)
		goto out;

	sk->sk_bound_dev_if = ifindex;
	if (sk->sk_prot->rehash)
		sk->sk_prot->rehash(sk);
	sk_dst_reset(sk);

	ret = 0;

out:
#endif

	return ret;
}
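
/*
 * For reference only (user-space view, not kernel code): the two entry
 * points below back the plain SO_BINDTODEVICE socket option, e.g.:
 *
 *	const char ifname[] = "eth0";
 *
 *	if (setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE,
 *		       ifname, strlen(ifname)) < 0)
 *		perror("SO_BINDTODEVICE");	// fails with EPERM without CAP_NET_RAW
 *
 * Passing an empty name (or a zero option length) removes the binding again.
 */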

static int sock_setbindtodevice(struct sock *sk, char __user *optval,
				int optlen)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];
	int index;

	ret = -EINVAL;
	if (optlen < 0)
		goto out;

	/* Bind this socket to a particular device like "eth0",
	 * as specified in the passed interface name. If the
	 * name is "" or the option length is zero the socket
	 * is not bound.
	 */
	if (optlen > IFNAMSIZ - 1)
		optlen = IFNAMSIZ - 1;
	memset(devname, 0, sizeof(devname));

	ret = -EFAULT;
	if (copy_from_user(devname, optval, optlen))
		goto out;

	index = 0;
	if (devname[0] != '\0') {
		struct net_device *dev;

		rcu_read_lock();
		dev = dev_get_by_name_rcu(net, devname);
		if (dev)
			index = dev->ifindex;
		rcu_read_unlock();
		ret = -ENODEV;
		if (!dev)
			goto out;
	}

	lock_sock(sk);
	ret = sock_setbindtodevice_locked(sk, index);
	release_sock(sk);

out:
#endif

	return ret;
}

static int sock_getbindtodevice(struct sock *sk, char __user *optval,
				int __user *optlen, int len)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];

	if (sk->sk_bound_dev_if == 0) {
		len = 0;
		goto zero;
	}

	ret = -EINVAL;
	if (len < IFNAMSIZ)
		goto out;

	ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
	if (ret)
		goto out;

	len = strlen(devname) + 1;

	ret = -EFAULT;
	if (copy_to_user(optval, devname, len))
		goto out;

zero:
	ret = -EFAULT;
	if (put_user(len, optlen))
		goto out;

	ret = 0;

out:
#endif

	return ret;
}

static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
{
	if (valbool)
		sock_set_flag(sk, bit);
	else
		sock_reset_flag(sk, bit);
}

bool sk_mc_loop(struct sock *sk)
{
	if (dev_recursion_level())
		return false;
	if (!sk)
		return true;
	switch (sk->sk_family) {
	case AF_INET:
		return inet_sk(sk)->mc_loop;
#if IS_ENABLED(CONFIG_IPV6)
	case AF_INET6:
		return inet6_sk(sk)->mc_loop;
#endif
	}
	WARN_ON(1);
	return true;
}
EXPORT_SYMBOL(sk_mc_loop);

/*
 * This is meant for all protocols to use and covers goings on
 * at the socket level. Everything here is generic.
 */

int sock_setsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, unsigned int optlen)
{
	struct sock_txtime sk_txtime;
	struct sock *sk = sock->sk;
	int val;
	int valbool;
	struct linger ling;
	int ret = 0;

	/*
	 *	Options without arguments
	 */

	if (optname == SO_BINDTODEVICE)
		return sock_setbindtodevice(sk, optval, optlen);

	if (optlen < sizeof(int))
		return -EINVAL;

	if (get_user(val, (int __user *)optval))
		return -EFAULT;

	valbool = val ? 1 : 0;

	lock_sock(sk);

	switch (optname) {
	case SO_DEBUG:
		if (val && !capable(CAP_NET_ADMIN))
			ret = -EACCES;
		else
			sock_valbool_flag(sk, SOCK_DBG, valbool);
		break;
	case SO_REUSEADDR:
		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
		break;
	case SO_REUSEPORT:
		sk->sk_reuseport = valbool;
		break;
	case SO_TYPE:
	case SO_PROTOCOL:
	case SO_DOMAIN:
	case SO_ERROR:
		ret = -ENOPROTOOPT;
		break;
	case SO_DONTROUTE:
		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
		sk_dst_reset(sk);
		break;
	case SO_BROADCAST:
		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
		break;
	case SO_SNDBUF:
		/* Don't error on this BSD doesn't and if you think
		 * about it this is right. Otherwise apps have to
		 * play 'guess the biggest size' games.
RCVBUF/SNDBUF 784 * are treated in BSD as hints 785 */ 786 val = min_t(u32, val, sysctl_wmem_max); 787 set_sndbuf: 788 /* Ensure val * 2 fits into an int, to prevent max_t() 789 * from treating it as a negative value. 790 */ 791 val = min_t(int, val, INT_MAX / 2); 792 sk->sk_userlocks |= SOCK_SNDBUF_LOCK; 793 sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF); 794 /* Wake up sending tasks if we upped the value. */ 795 sk->sk_write_space(sk); 796 break; 797 798 case SO_SNDBUFFORCE: 799 if (!capable(CAP_NET_ADMIN)) { 800 ret = -EPERM; 801 break; 802 } 803 804 /* No negative values (to prevent underflow, as val will be 805 * multiplied by 2). 806 */ 807 if (val < 0) 808 val = 0; 809 goto set_sndbuf; 810 811 case SO_RCVBUF: 812 /* Don't error on this BSD doesn't and if you think 813 * about it this is right. Otherwise apps have to 814 * play 'guess the biggest size' games. RCVBUF/SNDBUF 815 * are treated in BSD as hints 816 */ 817 val = min_t(u32, val, sysctl_rmem_max); 818 set_rcvbuf: 819 /* Ensure val * 2 fits into an int, to prevent max_t() 820 * from treating it as a negative value. 821 */ 822 val = min_t(int, val, INT_MAX / 2); 823 sk->sk_userlocks |= SOCK_RCVBUF_LOCK; 824 /* 825 * We double it on the way in to account for 826 * "struct sk_buff" etc. overhead. Applications 827 * assume that the SO_RCVBUF setting they make will 828 * allow that much actual data to be received on that 829 * socket. 830 * 831 * Applications are unaware that "struct sk_buff" and 832 * other overheads allocate from the receive buffer 833 * during socket buffer allocation. 834 * 835 * And after considering the possible alternatives, 836 * returning the value we actually used in getsockopt 837 * is the most desirable behavior. 838 */ 839 sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF); 840 break; 841 842 case SO_RCVBUFFORCE: 843 if (!capable(CAP_NET_ADMIN)) { 844 ret = -EPERM; 845 break; 846 } 847 848 /* No negative values (to prevent underflow, as val will be 849 * multiplied by 2). 
850 */ 851 if (val < 0) 852 val = 0; 853 goto set_rcvbuf; 854 855 case SO_KEEPALIVE: 856 if (sk->sk_prot->keepalive) 857 sk->sk_prot->keepalive(sk, valbool); 858 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool); 859 break; 860 861 case SO_OOBINLINE: 862 sock_valbool_flag(sk, SOCK_URGINLINE, valbool); 863 break; 864 865 case SO_NO_CHECK: 866 sk->sk_no_check_tx = valbool; 867 break; 868 869 case SO_PRIORITY: 870 if ((val >= 0 && val <= 6) || 871 ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 872 sk->sk_priority = val; 873 else 874 ret = -EPERM; 875 break; 876 877 case SO_LINGER: 878 if (optlen < sizeof(ling)) { 879 ret = -EINVAL; /* 1003.1g */ 880 break; 881 } 882 if (copy_from_user(&ling, optval, sizeof(ling))) { 883 ret = -EFAULT; 884 break; 885 } 886 if (!ling.l_onoff) 887 sock_reset_flag(sk, SOCK_LINGER); 888 else { 889 #if (BITS_PER_LONG == 32) 890 if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ) 891 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT; 892 else 893 #endif 894 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ; 895 sock_set_flag(sk, SOCK_LINGER); 896 } 897 break; 898 899 case SO_BSDCOMPAT: 900 sock_warn_obsolete_bsdism("setsockopt"); 901 break; 902 903 case SO_PASSCRED: 904 if (valbool) 905 set_bit(SOCK_PASSCRED, &sock->flags); 906 else 907 clear_bit(SOCK_PASSCRED, &sock->flags); 908 break; 909 910 case SO_TIMESTAMP_OLD: 911 case SO_TIMESTAMP_NEW: 912 case SO_TIMESTAMPNS_OLD: 913 case SO_TIMESTAMPNS_NEW: 914 if (valbool) { 915 if (optname == SO_TIMESTAMP_NEW || optname == SO_TIMESTAMPNS_NEW) 916 sock_set_flag(sk, SOCK_TSTAMP_NEW); 917 else 918 sock_reset_flag(sk, SOCK_TSTAMP_NEW); 919 920 if (optname == SO_TIMESTAMP_OLD || optname == SO_TIMESTAMP_NEW) 921 sock_reset_flag(sk, SOCK_RCVTSTAMPNS); 922 else 923 sock_set_flag(sk, SOCK_RCVTSTAMPNS); 924 sock_set_flag(sk, SOCK_RCVTSTAMP); 925 sock_enable_timestamp(sk, SOCK_TIMESTAMP); 926 } else { 927 sock_reset_flag(sk, SOCK_RCVTSTAMP); 928 sock_reset_flag(sk, SOCK_RCVTSTAMPNS); 929 sock_reset_flag(sk, SOCK_TSTAMP_NEW); 930 } 931 break; 932 933 case SO_TIMESTAMPING_NEW: 934 sock_set_flag(sk, SOCK_TSTAMP_NEW); 935 /* fall through */ 936 case SO_TIMESTAMPING_OLD: 937 if (val & ~SOF_TIMESTAMPING_MASK) { 938 ret = -EINVAL; 939 break; 940 } 941 942 if (val & SOF_TIMESTAMPING_OPT_ID && 943 !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) { 944 if (sk->sk_protocol == IPPROTO_TCP && 945 sk->sk_type == SOCK_STREAM) { 946 if ((1 << sk->sk_state) & 947 (TCPF_CLOSE | TCPF_LISTEN)) { 948 ret = -EINVAL; 949 break; 950 } 951 sk->sk_tskey = tcp_sk(sk)->snd_una; 952 } else { 953 sk->sk_tskey = 0; 954 } 955 } 956 957 if (val & SOF_TIMESTAMPING_OPT_STATS && 958 !(val & SOF_TIMESTAMPING_OPT_TSONLY)) { 959 ret = -EINVAL; 960 break; 961 } 962 963 sk->sk_tsflags = val; 964 if (val & SOF_TIMESTAMPING_RX_SOFTWARE) 965 sock_enable_timestamp(sk, 966 SOCK_TIMESTAMPING_RX_SOFTWARE); 967 else { 968 if (optname == SO_TIMESTAMPING_NEW) 969 sock_reset_flag(sk, SOCK_TSTAMP_NEW); 970 971 sock_disable_timestamp(sk, 972 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)); 973 } 974 break; 975 976 case SO_RCVLOWAT: 977 if (val < 0) 978 val = INT_MAX; 979 if (sock->ops->set_rcvlowat) 980 ret = sock->ops->set_rcvlowat(sk, val); 981 else 982 sk->sk_rcvlowat = val ? 
: 1; 983 break; 984 985 case SO_RCVTIMEO_OLD: 986 case SO_RCVTIMEO_NEW: 987 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen, optname == SO_RCVTIMEO_OLD); 988 break; 989 990 case SO_SNDTIMEO_OLD: 991 case SO_SNDTIMEO_NEW: 992 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen, optname == SO_SNDTIMEO_OLD); 993 break; 994 995 case SO_ATTACH_FILTER: 996 ret = -EINVAL; 997 if (optlen == sizeof(struct sock_fprog)) { 998 struct sock_fprog fprog; 999 1000 ret = -EFAULT; 1001 if (copy_from_user(&fprog, optval, sizeof(fprog))) 1002 break; 1003 1004 ret = sk_attach_filter(&fprog, sk); 1005 } 1006 break; 1007 1008 case SO_ATTACH_BPF: 1009 ret = -EINVAL; 1010 if (optlen == sizeof(u32)) { 1011 u32 ufd; 1012 1013 ret = -EFAULT; 1014 if (copy_from_user(&ufd, optval, sizeof(ufd))) 1015 break; 1016 1017 ret = sk_attach_bpf(ufd, sk); 1018 } 1019 break; 1020 1021 case SO_ATTACH_REUSEPORT_CBPF: 1022 ret = -EINVAL; 1023 if (optlen == sizeof(struct sock_fprog)) { 1024 struct sock_fprog fprog; 1025 1026 ret = -EFAULT; 1027 if (copy_from_user(&fprog, optval, sizeof(fprog))) 1028 break; 1029 1030 ret = sk_reuseport_attach_filter(&fprog, sk); 1031 } 1032 break; 1033 1034 case SO_ATTACH_REUSEPORT_EBPF: 1035 ret = -EINVAL; 1036 if (optlen == sizeof(u32)) { 1037 u32 ufd; 1038 1039 ret = -EFAULT; 1040 if (copy_from_user(&ufd, optval, sizeof(ufd))) 1041 break; 1042 1043 ret = sk_reuseport_attach_bpf(ufd, sk); 1044 } 1045 break; 1046 1047 case SO_DETACH_FILTER: 1048 ret = sk_detach_filter(sk); 1049 break; 1050 1051 case SO_LOCK_FILTER: 1052 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool) 1053 ret = -EPERM; 1054 else 1055 sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool); 1056 break; 1057 1058 case SO_PASSSEC: 1059 if (valbool) 1060 set_bit(SOCK_PASSSEC, &sock->flags); 1061 else 1062 clear_bit(SOCK_PASSSEC, &sock->flags); 1063 break; 1064 case SO_MARK: 1065 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { 1066 ret = -EPERM; 1067 } else if (val != sk->sk_mark) { 1068 sk->sk_mark = val; 1069 sk_dst_reset(sk); 1070 } 1071 break; 1072 1073 case SO_RXQ_OVFL: 1074 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool); 1075 break; 1076 1077 case SO_WIFI_STATUS: 1078 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool); 1079 break; 1080 1081 case SO_PEEK_OFF: 1082 if (sock->ops->set_peek_off) 1083 ret = sock->ops->set_peek_off(sk, val); 1084 else 1085 ret = -EOPNOTSUPP; 1086 break; 1087 1088 case SO_NOFCS: 1089 sock_valbool_flag(sk, SOCK_NOFCS, valbool); 1090 break; 1091 1092 case SO_SELECT_ERR_QUEUE: 1093 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool); 1094 break; 1095 1096 #ifdef CONFIG_NET_RX_BUSY_POLL 1097 case SO_BUSY_POLL: 1098 /* allow unprivileged users to decrease the value */ 1099 if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN)) 1100 ret = -EPERM; 1101 else { 1102 if (val < 0) 1103 ret = -EINVAL; 1104 else 1105 sk->sk_ll_usec = val; 1106 } 1107 break; 1108 #endif 1109 1110 case SO_MAX_PACING_RATE: 1111 if (val != ~0U) 1112 cmpxchg(&sk->sk_pacing_status, 1113 SK_PACING_NONE, 1114 SK_PACING_NEEDED); 1115 sk->sk_max_pacing_rate = (val == ~0U) ? 
~0UL : val; 1116 sk->sk_pacing_rate = min(sk->sk_pacing_rate, 1117 sk->sk_max_pacing_rate); 1118 break; 1119 1120 case SO_INCOMING_CPU: 1121 sk->sk_incoming_cpu = val; 1122 break; 1123 1124 case SO_CNX_ADVICE: 1125 if (val == 1) 1126 dst_negative_advice(sk); 1127 break; 1128 1129 case SO_ZEROCOPY: 1130 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) { 1131 if (!((sk->sk_type == SOCK_STREAM && 1132 sk->sk_protocol == IPPROTO_TCP) || 1133 (sk->sk_type == SOCK_DGRAM && 1134 sk->sk_protocol == IPPROTO_UDP))) 1135 ret = -ENOTSUPP; 1136 } else if (sk->sk_family != PF_RDS) { 1137 ret = -ENOTSUPP; 1138 } 1139 if (!ret) { 1140 if (val < 0 || val > 1) 1141 ret = -EINVAL; 1142 else 1143 sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool); 1144 } 1145 break; 1146 1147 case SO_TXTIME: 1148 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { 1149 ret = -EPERM; 1150 } else if (optlen != sizeof(struct sock_txtime)) { 1151 ret = -EINVAL; 1152 } else if (copy_from_user(&sk_txtime, optval, 1153 sizeof(struct sock_txtime))) { 1154 ret = -EFAULT; 1155 } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) { 1156 ret = -EINVAL; 1157 } else { 1158 sock_valbool_flag(sk, SOCK_TXTIME, true); 1159 sk->sk_clockid = sk_txtime.clockid; 1160 sk->sk_txtime_deadline_mode = 1161 !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE); 1162 sk->sk_txtime_report_errors = 1163 !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS); 1164 } 1165 break; 1166 1167 case SO_BINDTOIFINDEX: 1168 ret = sock_setbindtodevice_locked(sk, val); 1169 break; 1170 1171 default: 1172 ret = -ENOPROTOOPT; 1173 break; 1174 } 1175 release_sock(sk); 1176 return ret; 1177 } 1178 EXPORT_SYMBOL(sock_setsockopt); 1179 1180 1181 static void cred_to_ucred(struct pid *pid, const struct cred *cred, 1182 struct ucred *ucred) 1183 { 1184 ucred->pid = pid_vnr(pid); 1185 ucred->uid = ucred->gid = -1; 1186 if (cred) { 1187 struct user_namespace *current_ns = current_user_ns(); 1188 1189 ucred->uid = from_kuid_munged(current_ns, cred->euid); 1190 ucred->gid = from_kgid_munged(current_ns, cred->egid); 1191 } 1192 } 1193 1194 static int groups_to_user(gid_t __user *dst, const struct group_info *src) 1195 { 1196 struct user_namespace *user_ns = current_user_ns(); 1197 int i; 1198 1199 for (i = 0; i < src->ngroups; i++) 1200 if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i)) 1201 return -EFAULT; 1202 1203 return 0; 1204 } 1205 1206 int sock_getsockopt(struct socket *sock, int level, int optname, 1207 char __user *optval, int __user *optlen) 1208 { 1209 struct sock *sk = sock->sk; 1210 1211 union { 1212 int val; 1213 u64 val64; 1214 struct linger ling; 1215 struct old_timeval32 tm32; 1216 struct __kernel_old_timeval tm; 1217 struct __kernel_sock_timeval stm; 1218 struct sock_txtime txtime; 1219 } v; 1220 1221 int lv = sizeof(int); 1222 int len; 1223 1224 if (get_user(len, optlen)) 1225 return -EFAULT; 1226 if (len < 0) 1227 return -EINVAL; 1228 1229 memset(&v, 0, sizeof(v)); 1230 1231 switch (optname) { 1232 case SO_DEBUG: 1233 v.val = sock_flag(sk, SOCK_DBG); 1234 break; 1235 1236 case SO_DONTROUTE: 1237 v.val = sock_flag(sk, SOCK_LOCALROUTE); 1238 break; 1239 1240 case SO_BROADCAST: 1241 v.val = sock_flag(sk, SOCK_BROADCAST); 1242 break; 1243 1244 case SO_SNDBUF: 1245 v.val = sk->sk_sndbuf; 1246 break; 1247 1248 case SO_RCVBUF: 1249 v.val = sk->sk_rcvbuf; 1250 break; 1251 1252 case SO_REUSEADDR: 1253 v.val = sk->sk_reuse; 1254 break; 1255 1256 case SO_REUSEPORT: 1257 v.val = sk->sk_reuseport; 1258 break; 1259 1260 case SO_KEEPALIVE: 1261 v.val = sock_flag(sk, 
SOCK_KEEPOPEN); 1262 break; 1263 1264 case SO_TYPE: 1265 v.val = sk->sk_type; 1266 break; 1267 1268 case SO_PROTOCOL: 1269 v.val = sk->sk_protocol; 1270 break; 1271 1272 case SO_DOMAIN: 1273 v.val = sk->sk_family; 1274 break; 1275 1276 case SO_ERROR: 1277 v.val = -sock_error(sk); 1278 if (v.val == 0) 1279 v.val = xchg(&sk->sk_err_soft, 0); 1280 break; 1281 1282 case SO_OOBINLINE: 1283 v.val = sock_flag(sk, SOCK_URGINLINE); 1284 break; 1285 1286 case SO_NO_CHECK: 1287 v.val = sk->sk_no_check_tx; 1288 break; 1289 1290 case SO_PRIORITY: 1291 v.val = sk->sk_priority; 1292 break; 1293 1294 case SO_LINGER: 1295 lv = sizeof(v.ling); 1296 v.ling.l_onoff = sock_flag(sk, SOCK_LINGER); 1297 v.ling.l_linger = sk->sk_lingertime / HZ; 1298 break; 1299 1300 case SO_BSDCOMPAT: 1301 sock_warn_obsolete_bsdism("getsockopt"); 1302 break; 1303 1304 case SO_TIMESTAMP_OLD: 1305 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && 1306 !sock_flag(sk, SOCK_TSTAMP_NEW) && 1307 !sock_flag(sk, SOCK_RCVTSTAMPNS); 1308 break; 1309 1310 case SO_TIMESTAMPNS_OLD: 1311 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW); 1312 break; 1313 1314 case SO_TIMESTAMP_NEW: 1315 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW); 1316 break; 1317 1318 case SO_TIMESTAMPNS_NEW: 1319 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW); 1320 break; 1321 1322 case SO_TIMESTAMPING_OLD: 1323 v.val = sk->sk_tsflags; 1324 break; 1325 1326 case SO_RCVTIMEO_OLD: 1327 case SO_RCVTIMEO_NEW: 1328 lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname); 1329 break; 1330 1331 case SO_SNDTIMEO_OLD: 1332 case SO_SNDTIMEO_NEW: 1333 lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname); 1334 break; 1335 1336 case SO_RCVLOWAT: 1337 v.val = sk->sk_rcvlowat; 1338 break; 1339 1340 case SO_SNDLOWAT: 1341 v.val = 1; 1342 break; 1343 1344 case SO_PASSCRED: 1345 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags); 1346 break; 1347 1348 case SO_PEERCRED: 1349 { 1350 struct ucred peercred; 1351 if (len > sizeof(peercred)) 1352 len = sizeof(peercred); 1353 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred); 1354 if (copy_to_user(optval, &peercred, len)) 1355 return -EFAULT; 1356 goto lenout; 1357 } 1358 1359 case SO_PEERGROUPS: 1360 { 1361 int ret, n; 1362 1363 if (!sk->sk_peer_cred) 1364 return -ENODATA; 1365 1366 n = sk->sk_peer_cred->group_info->ngroups; 1367 if (len < n * sizeof(gid_t)) { 1368 len = n * sizeof(gid_t); 1369 return put_user(len, optlen) ? -EFAULT : -ERANGE; 1370 } 1371 len = n * sizeof(gid_t); 1372 1373 ret = groups_to_user((gid_t __user *)optval, 1374 sk->sk_peer_cred->group_info); 1375 if (ret) 1376 return ret; 1377 goto lenout; 1378 } 1379 1380 case SO_PEERNAME: 1381 { 1382 char address[128]; 1383 1384 lv = sock->ops->getname(sock, (struct sockaddr *)address, 2); 1385 if (lv < 0) 1386 return -ENOTCONN; 1387 if (lv < len) 1388 return -EINVAL; 1389 if (copy_to_user(optval, address, len)) 1390 return -EFAULT; 1391 goto lenout; 1392 } 1393 1394 /* Dubious BSD thing... Probably nobody even uses it, but 1395 * the UNIX standard wants it for whatever reason... 
-DaveM 1396 */ 1397 case SO_ACCEPTCONN: 1398 v.val = sk->sk_state == TCP_LISTEN; 1399 break; 1400 1401 case SO_PASSSEC: 1402 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags); 1403 break; 1404 1405 case SO_PEERSEC: 1406 return security_socket_getpeersec_stream(sock, optval, optlen, len); 1407 1408 case SO_MARK: 1409 v.val = sk->sk_mark; 1410 break; 1411 1412 case SO_RXQ_OVFL: 1413 v.val = sock_flag(sk, SOCK_RXQ_OVFL); 1414 break; 1415 1416 case SO_WIFI_STATUS: 1417 v.val = sock_flag(sk, SOCK_WIFI_STATUS); 1418 break; 1419 1420 case SO_PEEK_OFF: 1421 if (!sock->ops->set_peek_off) 1422 return -EOPNOTSUPP; 1423 1424 v.val = sk->sk_peek_off; 1425 break; 1426 case SO_NOFCS: 1427 v.val = sock_flag(sk, SOCK_NOFCS); 1428 break; 1429 1430 case SO_BINDTODEVICE: 1431 return sock_getbindtodevice(sk, optval, optlen, len); 1432 1433 case SO_GET_FILTER: 1434 len = sk_get_filter(sk, (struct sock_filter __user *)optval, len); 1435 if (len < 0) 1436 return len; 1437 1438 goto lenout; 1439 1440 case SO_LOCK_FILTER: 1441 v.val = sock_flag(sk, SOCK_FILTER_LOCKED); 1442 break; 1443 1444 case SO_BPF_EXTENSIONS: 1445 v.val = bpf_tell_extensions(); 1446 break; 1447 1448 case SO_SELECT_ERR_QUEUE: 1449 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE); 1450 break; 1451 1452 #ifdef CONFIG_NET_RX_BUSY_POLL 1453 case SO_BUSY_POLL: 1454 v.val = sk->sk_ll_usec; 1455 break; 1456 #endif 1457 1458 case SO_MAX_PACING_RATE: 1459 /* 32bit version */ 1460 v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U); 1461 break; 1462 1463 case SO_INCOMING_CPU: 1464 v.val = sk->sk_incoming_cpu; 1465 break; 1466 1467 case SO_MEMINFO: 1468 { 1469 u32 meminfo[SK_MEMINFO_VARS]; 1470 1471 if (get_user(len, optlen)) 1472 return -EFAULT; 1473 1474 sk_get_meminfo(sk, meminfo); 1475 1476 len = min_t(unsigned int, len, sizeof(meminfo)); 1477 if (copy_to_user(optval, &meminfo, len)) 1478 return -EFAULT; 1479 1480 goto lenout; 1481 } 1482 1483 #ifdef CONFIG_NET_RX_BUSY_POLL 1484 case SO_INCOMING_NAPI_ID: 1485 v.val = READ_ONCE(sk->sk_napi_id); 1486 1487 /* aggregate non-NAPI IDs down to 0 */ 1488 if (v.val < MIN_NAPI_ID) 1489 v.val = 0; 1490 1491 break; 1492 #endif 1493 1494 case SO_COOKIE: 1495 lv = sizeof(u64); 1496 if (len < lv) 1497 return -EINVAL; 1498 v.val64 = sock_gen_cookie(sk); 1499 break; 1500 1501 case SO_ZEROCOPY: 1502 v.val = sock_flag(sk, SOCK_ZEROCOPY); 1503 break; 1504 1505 case SO_TXTIME: 1506 lv = sizeof(v.txtime); 1507 v.txtime.clockid = sk->sk_clockid; 1508 v.txtime.flags |= sk->sk_txtime_deadline_mode ? 1509 SOF_TXTIME_DEADLINE_MODE : 0; 1510 v.txtime.flags |= sk->sk_txtime_report_errors ? 1511 SOF_TXTIME_REPORT_ERRORS : 0; 1512 break; 1513 1514 case SO_BINDTOIFINDEX: 1515 v.val = sk->sk_bound_dev_if; 1516 break; 1517 1518 default: 1519 /* We implement the SO_SNDLOWAT etc to not be settable 1520 * (1003.1g 7). 1521 */ 1522 return -ENOPROTOOPT; 1523 } 1524 1525 if (len > lv) 1526 len = lv; 1527 if (copy_to_user(optval, &v, len)) 1528 return -EFAULT; 1529 lenout: 1530 if (put_user(len, optlen)) 1531 return -EFAULT; 1532 return 0; 1533 } 1534 1535 /* 1536 * Initialize an sk_lock. 1537 * 1538 * (We also register the sk_lock with the lock validator.) 
1539 */ 1540 static inline void sock_lock_init(struct sock *sk) 1541 { 1542 if (sk->sk_kern_sock) 1543 sock_lock_init_class_and_name( 1544 sk, 1545 af_family_kern_slock_key_strings[sk->sk_family], 1546 af_family_kern_slock_keys + sk->sk_family, 1547 af_family_kern_key_strings[sk->sk_family], 1548 af_family_kern_keys + sk->sk_family); 1549 else 1550 sock_lock_init_class_and_name( 1551 sk, 1552 af_family_slock_key_strings[sk->sk_family], 1553 af_family_slock_keys + sk->sk_family, 1554 af_family_key_strings[sk->sk_family], 1555 af_family_keys + sk->sk_family); 1556 } 1557 1558 /* 1559 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet, 1560 * even temporarly, because of RCU lookups. sk_node should also be left as is. 1561 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end 1562 */ 1563 static void sock_copy(struct sock *nsk, const struct sock *osk) 1564 { 1565 #ifdef CONFIG_SECURITY_NETWORK 1566 void *sptr = nsk->sk_security; 1567 #endif 1568 memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin)); 1569 1570 memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end, 1571 osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end)); 1572 1573 #ifdef CONFIG_SECURITY_NETWORK 1574 nsk->sk_security = sptr; 1575 security_sk_clone(osk, nsk); 1576 #endif 1577 } 1578 1579 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, 1580 int family) 1581 { 1582 struct sock *sk; 1583 struct kmem_cache *slab; 1584 1585 slab = prot->slab; 1586 if (slab != NULL) { 1587 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO); 1588 if (!sk) 1589 return sk; 1590 if (priority & __GFP_ZERO) 1591 sk_prot_clear_nulls(sk, prot->obj_size); 1592 } else 1593 sk = kmalloc(prot->obj_size, priority); 1594 1595 if (sk != NULL) { 1596 if (security_sk_alloc(sk, family, priority)) 1597 goto out_free; 1598 1599 if (!try_module_get(prot->owner)) 1600 goto out_free_sec; 1601 sk_tx_queue_clear(sk); 1602 } 1603 1604 return sk; 1605 1606 out_free_sec: 1607 security_sk_free(sk); 1608 out_free: 1609 if (slab != NULL) 1610 kmem_cache_free(slab, sk); 1611 else 1612 kfree(sk); 1613 return NULL; 1614 } 1615 1616 static void sk_prot_free(struct proto *prot, struct sock *sk) 1617 { 1618 struct kmem_cache *slab; 1619 struct module *owner; 1620 1621 owner = prot->owner; 1622 slab = prot->slab; 1623 1624 cgroup_sk_free(&sk->sk_cgrp_data); 1625 mem_cgroup_sk_free(sk); 1626 security_sk_free(sk); 1627 if (slab != NULL) 1628 kmem_cache_free(slab, sk); 1629 else 1630 kfree(sk); 1631 module_put(owner); 1632 } 1633 1634 /** 1635 * sk_alloc - All socket objects are allocated here 1636 * @net: the applicable net namespace 1637 * @family: protocol family 1638 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) 1639 * @prot: struct proto associated with this new sock instance 1640 * @kern: is this to be a kernel socket? 1641 */ 1642 struct sock *sk_alloc(struct net *net, int family, gfp_t priority, 1643 struct proto *prot, int kern) 1644 { 1645 struct sock *sk; 1646 1647 sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family); 1648 if (sk) { 1649 sk->sk_family = family; 1650 /* 1651 * See comment in struct sock definition to understand 1652 * why we need sk_prot_creator -acme 1653 */ 1654 sk->sk_prot = sk->sk_prot_creator = prot; 1655 sk->sk_kern_sock = kern; 1656 sock_lock_init(sk); 1657 sk->sk_net_refcnt = kern ? 
0 : 1; 1658 if (likely(sk->sk_net_refcnt)) { 1659 get_net(net); 1660 sock_inuse_add(net, 1); 1661 } 1662 1663 sock_net_set(sk, net); 1664 refcount_set(&sk->sk_wmem_alloc, 1); 1665 1666 mem_cgroup_sk_alloc(sk); 1667 cgroup_sk_alloc(&sk->sk_cgrp_data); 1668 sock_update_classid(&sk->sk_cgrp_data); 1669 sock_update_netprioidx(&sk->sk_cgrp_data); 1670 } 1671 1672 return sk; 1673 } 1674 EXPORT_SYMBOL(sk_alloc); 1675 1676 /* Sockets having SOCK_RCU_FREE will call this function after one RCU 1677 * grace period. This is the case for UDP sockets and TCP listeners. 1678 */ 1679 static void __sk_destruct(struct rcu_head *head) 1680 { 1681 struct sock *sk = container_of(head, struct sock, sk_rcu); 1682 struct sk_filter *filter; 1683 1684 if (sk->sk_destruct) 1685 sk->sk_destruct(sk); 1686 1687 filter = rcu_dereference_check(sk->sk_filter, 1688 refcount_read(&sk->sk_wmem_alloc) == 0); 1689 if (filter) { 1690 sk_filter_uncharge(sk, filter); 1691 RCU_INIT_POINTER(sk->sk_filter, NULL); 1692 } 1693 if (rcu_access_pointer(sk->sk_reuseport_cb)) 1694 reuseport_detach_sock(sk); 1695 1696 sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP); 1697 1698 if (atomic_read(&sk->sk_omem_alloc)) 1699 pr_debug("%s: optmem leakage (%d bytes) detected\n", 1700 __func__, atomic_read(&sk->sk_omem_alloc)); 1701 1702 if (sk->sk_frag.page) { 1703 put_page(sk->sk_frag.page); 1704 sk->sk_frag.page = NULL; 1705 } 1706 1707 if (sk->sk_peer_cred) 1708 put_cred(sk->sk_peer_cred); 1709 put_pid(sk->sk_peer_pid); 1710 if (likely(sk->sk_net_refcnt)) 1711 put_net(sock_net(sk)); 1712 sk_prot_free(sk->sk_prot_creator, sk); 1713 } 1714 1715 void sk_destruct(struct sock *sk) 1716 { 1717 if (sock_flag(sk, SOCK_RCU_FREE)) 1718 call_rcu(&sk->sk_rcu, __sk_destruct); 1719 else 1720 __sk_destruct(&sk->sk_rcu); 1721 } 1722 1723 static void __sk_free(struct sock *sk) 1724 { 1725 if (likely(sk->sk_net_refcnt)) 1726 sock_inuse_add(sock_net(sk), -1); 1727 1728 if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk))) 1729 sock_diag_broadcast_destroy(sk); 1730 else 1731 sk_destruct(sk); 1732 } 1733 1734 void sk_free(struct sock *sk) 1735 { 1736 /* 1737 * We subtract one from sk_wmem_alloc and can know if 1738 * some packets are still in some tx queue. 
1739 * If not null, sock_wfree() will call __sk_free(sk) later 1740 */ 1741 if (refcount_dec_and_test(&sk->sk_wmem_alloc)) 1742 __sk_free(sk); 1743 } 1744 EXPORT_SYMBOL(sk_free); 1745 1746 static void sk_init_common(struct sock *sk) 1747 { 1748 skb_queue_head_init(&sk->sk_receive_queue); 1749 skb_queue_head_init(&sk->sk_write_queue); 1750 skb_queue_head_init(&sk->sk_error_queue); 1751 1752 rwlock_init(&sk->sk_callback_lock); 1753 lockdep_set_class_and_name(&sk->sk_receive_queue.lock, 1754 af_rlock_keys + sk->sk_family, 1755 af_family_rlock_key_strings[sk->sk_family]); 1756 lockdep_set_class_and_name(&sk->sk_write_queue.lock, 1757 af_wlock_keys + sk->sk_family, 1758 af_family_wlock_key_strings[sk->sk_family]); 1759 lockdep_set_class_and_name(&sk->sk_error_queue.lock, 1760 af_elock_keys + sk->sk_family, 1761 af_family_elock_key_strings[sk->sk_family]); 1762 lockdep_set_class_and_name(&sk->sk_callback_lock, 1763 af_callback_keys + sk->sk_family, 1764 af_family_clock_key_strings[sk->sk_family]); 1765 } 1766 1767 /** 1768 * sk_clone_lock - clone a socket, and lock its clone 1769 * @sk: the socket to clone 1770 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) 1771 * 1772 * Caller must unlock socket even in error path (bh_unlock_sock(newsk)) 1773 */ 1774 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) 1775 { 1776 struct sock *newsk; 1777 bool is_charged = true; 1778 1779 newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family); 1780 if (newsk != NULL) { 1781 struct sk_filter *filter; 1782 1783 sock_copy(newsk, sk); 1784 1785 newsk->sk_prot_creator = sk->sk_prot; 1786 1787 /* SANITY */ 1788 if (likely(newsk->sk_net_refcnt)) 1789 get_net(sock_net(newsk)); 1790 sk_node_init(&newsk->sk_node); 1791 sock_lock_init(newsk); 1792 bh_lock_sock(newsk); 1793 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL; 1794 newsk->sk_backlog.len = 0; 1795 1796 atomic_set(&newsk->sk_rmem_alloc, 0); 1797 /* 1798 * sk_wmem_alloc set to one (see sk_free() and sock_wfree()) 1799 */ 1800 refcount_set(&newsk->sk_wmem_alloc, 1); 1801 atomic_set(&newsk->sk_omem_alloc, 0); 1802 sk_init_common(newsk); 1803 1804 newsk->sk_dst_cache = NULL; 1805 newsk->sk_dst_pending_confirm = 0; 1806 newsk->sk_wmem_queued = 0; 1807 newsk->sk_forward_alloc = 0; 1808 atomic_set(&newsk->sk_drops, 0); 1809 newsk->sk_send_head = NULL; 1810 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; 1811 atomic_set(&newsk->sk_zckey, 0); 1812 1813 sock_reset_flag(newsk, SOCK_DONE); 1814 mem_cgroup_sk_alloc(newsk); 1815 cgroup_sk_alloc(&newsk->sk_cgrp_data); 1816 1817 rcu_read_lock(); 1818 filter = rcu_dereference(sk->sk_filter); 1819 if (filter != NULL) 1820 /* though it's an empty new sock, the charging may fail 1821 * if sysctl_optmem_max was changed between creation of 1822 * original socket and cloning 1823 */ 1824 is_charged = sk_filter_charge(newsk, filter); 1825 RCU_INIT_POINTER(newsk->sk_filter, filter); 1826 rcu_read_unlock(); 1827 1828 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) { 1829 /* We need to make sure that we don't uncharge the new 1830 * socket if we couldn't charge it in the first place 1831 * as otherwise we uncharge the parent's filter. 
1832 */ 1833 if (!is_charged) 1834 RCU_INIT_POINTER(newsk->sk_filter, NULL); 1835 sk_free_unlock_clone(newsk); 1836 newsk = NULL; 1837 goto out; 1838 } 1839 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL); 1840 1841 newsk->sk_err = 0; 1842 newsk->sk_err_soft = 0; 1843 newsk->sk_priority = 0; 1844 newsk->sk_incoming_cpu = raw_smp_processor_id(); 1845 if (likely(newsk->sk_net_refcnt)) 1846 sock_inuse_add(sock_net(newsk), 1); 1847 1848 /* 1849 * Before updating sk_refcnt, we must commit prior changes to memory 1850 * (Documentation/RCU/rculist_nulls.txt for details) 1851 */ 1852 smp_wmb(); 1853 refcount_set(&newsk->sk_refcnt, 2); 1854 1855 /* 1856 * Increment the counter in the same struct proto as the master 1857 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that 1858 * is the same as sk->sk_prot->socks, as this field was copied 1859 * with memcpy). 1860 * 1861 * This _changes_ the previous behaviour, where 1862 * tcp_create_openreq_child always was incrementing the 1863 * equivalent to tcp_prot->socks (inet_sock_nr), so this have 1864 * to be taken into account in all callers. -acme 1865 */ 1866 sk_refcnt_debug_inc(newsk); 1867 sk_set_socket(newsk, NULL); 1868 newsk->sk_wq = NULL; 1869 1870 if (newsk->sk_prot->sockets_allocated) 1871 sk_sockets_allocated_inc(newsk); 1872 1873 if (sock_needs_netstamp(sk) && 1874 newsk->sk_flags & SK_FLAGS_TIMESTAMP) 1875 net_enable_timestamp(); 1876 } 1877 out: 1878 return newsk; 1879 } 1880 EXPORT_SYMBOL_GPL(sk_clone_lock); 1881 1882 void sk_free_unlock_clone(struct sock *sk) 1883 { 1884 /* It is still raw copy of parent, so invalidate 1885 * destructor and make plain sk_free() */ 1886 sk->sk_destruct = NULL; 1887 bh_unlock_sock(sk); 1888 sk_free(sk); 1889 } 1890 EXPORT_SYMBOL_GPL(sk_free_unlock_clone); 1891 1892 void sk_setup_caps(struct sock *sk, struct dst_entry *dst) 1893 { 1894 u32 max_segs = 1; 1895 1896 sk_dst_set(sk, dst); 1897 sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps; 1898 if (sk->sk_route_caps & NETIF_F_GSO) 1899 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE; 1900 sk->sk_route_caps &= ~sk->sk_route_nocaps; 1901 if (sk_can_gso(sk)) { 1902 if (dst->header_len && !xfrm_dst_offload_ok(dst)) { 1903 sk->sk_route_caps &= ~NETIF_F_GSO_MASK; 1904 } else { 1905 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; 1906 sk->sk_gso_max_size = dst->dev->gso_max_size; 1907 max_segs = max_t(u32, dst->dev->gso_max_segs, 1); 1908 } 1909 } 1910 sk->sk_gso_max_segs = max_segs; 1911 } 1912 EXPORT_SYMBOL_GPL(sk_setup_caps); 1913 1914 /* 1915 * Simple resource managers for sockets. 1916 */ 1917 1918 1919 /* 1920 * Write buffer destructor automatically called from kfree_skb. 1921 */ 1922 void sock_wfree(struct sk_buff *skb) 1923 { 1924 struct sock *sk = skb->sk; 1925 unsigned int len = skb->truesize; 1926 1927 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) { 1928 /* 1929 * Keep a reference on sk_wmem_alloc, this will be released 1930 * after sk_write_space() call 1931 */ 1932 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc)); 1933 sk->sk_write_space(sk); 1934 len = 1; 1935 } 1936 /* 1937 * if sk_wmem_alloc reaches 0, we must finish what sk_free() 1938 * could not do because of in-flight packets 1939 */ 1940 if (refcount_sub_and_test(len, &sk->sk_wmem_alloc)) 1941 __sk_free(sk); 1942 } 1943 EXPORT_SYMBOL(sock_wfree); 1944 1945 /* This variant of sock_wfree() is used by TCP, 1946 * since it sets SOCK_USE_WRITE_QUEUE. 
1947 */ 1948 void __sock_wfree(struct sk_buff *skb) 1949 { 1950 struct sock *sk = skb->sk; 1951 1952 if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc)) 1953 __sk_free(sk); 1954 } 1955 1956 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) 1957 { 1958 skb_orphan(skb); 1959 skb->sk = sk; 1960 #ifdef CONFIG_INET 1961 if (unlikely(!sk_fullsock(sk))) { 1962 skb->destructor = sock_edemux; 1963 sock_hold(sk); 1964 return; 1965 } 1966 #endif 1967 skb->destructor = sock_wfree; 1968 skb_set_hash_from_sk(skb, sk); 1969 /* 1970 * We used to take a refcount on sk, but following operation 1971 * is enough to guarantee sk_free() wont free this sock until 1972 * all in-flight packets are completed 1973 */ 1974 refcount_add(skb->truesize, &sk->sk_wmem_alloc); 1975 } 1976 EXPORT_SYMBOL(skb_set_owner_w); 1977 1978 /* This helper is used by netem, as it can hold packets in its 1979 * delay queue. We want to allow the owner socket to send more 1980 * packets, as if they were already TX completed by a typical driver. 1981 * But we also want to keep skb->sk set because some packet schedulers 1982 * rely on it (sch_fq for example). 1983 */ 1984 void skb_orphan_partial(struct sk_buff *skb) 1985 { 1986 if (skb_is_tcp_pure_ack(skb)) 1987 return; 1988 1989 if (skb->destructor == sock_wfree 1990 #ifdef CONFIG_INET 1991 || skb->destructor == tcp_wfree 1992 #endif 1993 ) { 1994 struct sock *sk = skb->sk; 1995 1996 if (refcount_inc_not_zero(&sk->sk_refcnt)) { 1997 WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc)); 1998 skb->destructor = sock_efree; 1999 } 2000 } else { 2001 skb_orphan(skb); 2002 } 2003 } 2004 EXPORT_SYMBOL(skb_orphan_partial); 2005 2006 /* 2007 * Read buffer destructor automatically called from kfree_skb. 2008 */ 2009 void sock_rfree(struct sk_buff *skb) 2010 { 2011 struct sock *sk = skb->sk; 2012 unsigned int len = skb->truesize; 2013 2014 atomic_sub(len, &sk->sk_rmem_alloc); 2015 sk_mem_uncharge(sk, len); 2016 } 2017 EXPORT_SYMBOL(sock_rfree); 2018 2019 /* 2020 * Buffer destructor for skbs that are not used directly in read or write 2021 * path, e.g. for error handler skbs. Automatically called from kfree_skb. 2022 */ 2023 void sock_efree(struct sk_buff *skb) 2024 { 2025 sock_put(skb->sk); 2026 } 2027 EXPORT_SYMBOL(sock_efree); 2028 2029 kuid_t sock_i_uid(struct sock *sk) 2030 { 2031 kuid_t uid; 2032 2033 read_lock_bh(&sk->sk_callback_lock); 2034 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID; 2035 read_unlock_bh(&sk->sk_callback_lock); 2036 return uid; 2037 } 2038 EXPORT_SYMBOL(sock_i_uid); 2039 2040 unsigned long sock_i_ino(struct sock *sk) 2041 { 2042 unsigned long ino; 2043 2044 read_lock_bh(&sk->sk_callback_lock); 2045 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0; 2046 read_unlock_bh(&sk->sk_callback_lock); 2047 return ino; 2048 } 2049 EXPORT_SYMBOL(sock_i_ino); 2050 2051 /* 2052 * Allocate a skb from the socket's send buffer. 
2053 */ 2054 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, 2055 gfp_t priority) 2056 { 2057 if (force || refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) { 2058 struct sk_buff *skb = alloc_skb(size, priority); 2059 if (skb) { 2060 skb_set_owner_w(skb, sk); 2061 return skb; 2062 } 2063 } 2064 return NULL; 2065 } 2066 EXPORT_SYMBOL(sock_wmalloc); 2067 2068 static void sock_ofree(struct sk_buff *skb) 2069 { 2070 struct sock *sk = skb->sk; 2071 2072 atomic_sub(skb->truesize, &sk->sk_omem_alloc); 2073 } 2074 2075 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size, 2076 gfp_t priority) 2077 { 2078 struct sk_buff *skb; 2079 2080 /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */ 2081 if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) > 2082 sysctl_optmem_max) 2083 return NULL; 2084 2085 skb = alloc_skb(size, priority); 2086 if (!skb) 2087 return NULL; 2088 2089 atomic_add(skb->truesize, &sk->sk_omem_alloc); 2090 skb->sk = sk; 2091 skb->destructor = sock_ofree; 2092 return skb; 2093 } 2094 2095 /* 2096 * Allocate a memory block from the socket's option memory buffer. 2097 */ 2098 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority) 2099 { 2100 if ((unsigned int)size <= sysctl_optmem_max && 2101 atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) { 2102 void *mem; 2103 /* First do the add, to avoid the race if kmalloc 2104 * might sleep. 2105 */ 2106 atomic_add(size, &sk->sk_omem_alloc); 2107 mem = kmalloc(size, priority); 2108 if (mem) 2109 return mem; 2110 atomic_sub(size, &sk->sk_omem_alloc); 2111 } 2112 return NULL; 2113 } 2114 EXPORT_SYMBOL(sock_kmalloc); 2115 2116 /* Free an option memory block. Note, we actually want the inline 2117 * here as this allows gcc to detect the nullify and fold away the 2118 * condition entirely. 2119 */ 2120 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size, 2121 const bool nullify) 2122 { 2123 if (WARN_ON_ONCE(!mem)) 2124 return; 2125 if (nullify) 2126 kzfree(mem); 2127 else 2128 kfree(mem); 2129 atomic_sub(size, &sk->sk_omem_alloc); 2130 } 2131 2132 void sock_kfree_s(struct sock *sk, void *mem, int size) 2133 { 2134 __sock_kfree_s(sk, mem, size, false); 2135 } 2136 EXPORT_SYMBOL(sock_kfree_s); 2137 2138 void sock_kzfree_s(struct sock *sk, void *mem, int size) 2139 { 2140 __sock_kfree_s(sk, mem, size, true); 2141 } 2142 EXPORT_SYMBOL(sock_kzfree_s); 2143 2144 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock. 2145 I think, these locks should be removed for datagram sockets. 
2146 */ 2147 static long sock_wait_for_wmem(struct sock *sk, long timeo) 2148 { 2149 DEFINE_WAIT(wait); 2150 2151 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); 2152 for (;;) { 2153 if (!timeo) 2154 break; 2155 if (signal_pending(current)) 2156 break; 2157 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 2158 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); 2159 if (refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) 2160 break; 2161 if (sk->sk_shutdown & SEND_SHUTDOWN) 2162 break; 2163 if (sk->sk_err) 2164 break; 2165 timeo = schedule_timeout(timeo); 2166 } 2167 finish_wait(sk_sleep(sk), &wait); 2168 return timeo; 2169 } 2170 2171 2172 /* 2173 * Generic send/receive buffer handlers 2174 */ 2175 2176 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, 2177 unsigned long data_len, int noblock, 2178 int *errcode, int max_page_order) 2179 { 2180 struct sk_buff *skb; 2181 long timeo; 2182 int err; 2183 2184 timeo = sock_sndtimeo(sk, noblock); 2185 for (;;) { 2186 err = sock_error(sk); 2187 if (err != 0) 2188 goto failure; 2189 2190 err = -EPIPE; 2191 if (sk->sk_shutdown & SEND_SHUTDOWN) 2192 goto failure; 2193 2194 if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf) 2195 break; 2196 2197 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); 2198 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 2199 err = -EAGAIN; 2200 if (!timeo) 2201 goto failure; 2202 if (signal_pending(current)) 2203 goto interrupted; 2204 timeo = sock_wait_for_wmem(sk, timeo); 2205 } 2206 skb = alloc_skb_with_frags(header_len, data_len, max_page_order, 2207 errcode, sk->sk_allocation); 2208 if (skb) 2209 skb_set_owner_w(skb, sk); 2210 return skb; 2211 2212 interrupted: 2213 err = sock_intr_errno(timeo); 2214 failure: 2215 *errcode = err; 2216 return NULL; 2217 } 2218 EXPORT_SYMBOL(sock_alloc_send_pskb); 2219 2220 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, 2221 int noblock, int *errcode) 2222 { 2223 return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0); 2224 } 2225 EXPORT_SYMBOL(sock_alloc_send_skb); 2226 2227 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg, 2228 struct sockcm_cookie *sockc) 2229 { 2230 u32 tsflags; 2231 2232 switch (cmsg->cmsg_type) { 2233 case SO_MARK: 2234 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 2235 return -EPERM; 2236 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) 2237 return -EINVAL; 2238 sockc->mark = *(u32 *)CMSG_DATA(cmsg); 2239 break; 2240 case SO_TIMESTAMPING_OLD: 2241 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) 2242 return -EINVAL; 2243 2244 tsflags = *(u32 *)CMSG_DATA(cmsg); 2245 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK) 2246 return -EINVAL; 2247 2248 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK; 2249 sockc->tsflags |= tsflags; 2250 break; 2251 case SCM_TXTIME: 2252 if (!sock_flag(sk, SOCK_TXTIME)) 2253 return -EINVAL; 2254 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64))) 2255 return -EINVAL; 2256 sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg)); 2257 break; 2258 /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. 
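	 * For the address families that support them they are consumed by
	 * the scm code (net/core/scm.c), so they are simply accepted and
	 * skipped here rather than rejected with -EINVAL.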
*/ 2259 case SCM_RIGHTS: 2260 case SCM_CREDENTIALS: 2261 break; 2262 default: 2263 return -EINVAL; 2264 } 2265 return 0; 2266 } 2267 EXPORT_SYMBOL(__sock_cmsg_send); 2268 2269 int sock_cmsg_send(struct sock *sk, struct msghdr *msg, 2270 struct sockcm_cookie *sockc) 2271 { 2272 struct cmsghdr *cmsg; 2273 int ret; 2274 2275 for_each_cmsghdr(cmsg, msg) { 2276 if (!CMSG_OK(msg, cmsg)) 2277 return -EINVAL; 2278 if (cmsg->cmsg_level != SOL_SOCKET) 2279 continue; 2280 ret = __sock_cmsg_send(sk, msg, cmsg, sockc); 2281 if (ret) 2282 return ret; 2283 } 2284 return 0; 2285 } 2286 EXPORT_SYMBOL(sock_cmsg_send); 2287 2288 static void sk_enter_memory_pressure(struct sock *sk) 2289 { 2290 if (!sk->sk_prot->enter_memory_pressure) 2291 return; 2292 2293 sk->sk_prot->enter_memory_pressure(sk); 2294 } 2295 2296 static void sk_leave_memory_pressure(struct sock *sk) 2297 { 2298 if (sk->sk_prot->leave_memory_pressure) { 2299 sk->sk_prot->leave_memory_pressure(sk); 2300 } else { 2301 unsigned long *memory_pressure = sk->sk_prot->memory_pressure; 2302 2303 if (memory_pressure && *memory_pressure) 2304 *memory_pressure = 0; 2305 } 2306 } 2307 2308 /* On 32bit arches, an skb frag is limited to 2^15 */ 2309 #define SKB_FRAG_PAGE_ORDER get_order(32768) 2310 2311 /** 2312 * skb_page_frag_refill - check that a page_frag contains enough room 2313 * @sz: minimum size of the fragment we want to get 2314 * @pfrag: pointer to page_frag 2315 * @gfp: priority for memory allocation 2316 * 2317 * Note: While this allocator tries to use high order pages, there is 2318 * no guarantee that allocations succeed. Therefore, @sz MUST be 2319 * less or equal than PAGE_SIZE. 2320 */ 2321 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp) 2322 { 2323 if (pfrag->page) { 2324 if (page_ref_count(pfrag->page) == 1) { 2325 pfrag->offset = 0; 2326 return true; 2327 } 2328 if (pfrag->offset + sz <= pfrag->size) 2329 return true; 2330 put_page(pfrag->page); 2331 } 2332 2333 pfrag->offset = 0; 2334 if (SKB_FRAG_PAGE_ORDER) { 2335 /* Avoid direct reclaim but allow kswapd to wake */ 2336 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) | 2337 __GFP_COMP | __GFP_NOWARN | 2338 __GFP_NORETRY, 2339 SKB_FRAG_PAGE_ORDER); 2340 if (likely(pfrag->page)) { 2341 pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER; 2342 return true; 2343 } 2344 } 2345 pfrag->page = alloc_page(gfp); 2346 if (likely(pfrag->page)) { 2347 pfrag->size = PAGE_SIZE; 2348 return true; 2349 } 2350 return false; 2351 } 2352 EXPORT_SYMBOL(skb_page_frag_refill); 2353 2354 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) 2355 { 2356 if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation))) 2357 return true; 2358 2359 sk_enter_memory_pressure(sk); 2360 sk_stream_moderate_sndbuf(sk); 2361 return false; 2362 } 2363 EXPORT_SYMBOL(sk_page_frag_refill); 2364 2365 static void __lock_sock(struct sock *sk) 2366 __releases(&sk->sk_lock.slock) 2367 __acquires(&sk->sk_lock.slock) 2368 { 2369 DEFINE_WAIT(wait); 2370 2371 for (;;) { 2372 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait, 2373 TASK_UNINTERRUPTIBLE); 2374 spin_unlock_bh(&sk->sk_lock.slock); 2375 schedule(); 2376 spin_lock_bh(&sk->sk_lock.slock); 2377 if (!sock_owned_by_user(sk)) 2378 break; 2379 } 2380 finish_wait(&sk->sk_lock.wq, &wait); 2381 } 2382 2383 void __release_sock(struct sock *sk) 2384 __releases(&sk->sk_lock.slock) 2385 __acquires(&sk->sk_lock.slock) 2386 { 2387 struct sk_buff *skb, *next; 2388 2389 while ((skb = sk->sk_backlog.head) != NULL) { 2390 sk->sk_backlog.head = 
				sk->sk_backlog.tail = NULL;

		spin_unlock_bh(&sk->sk_lock.slock);

		do {
			next = skb->next;
			prefetch(next);
			WARN_ON_ONCE(skb_dst_is_noref(skb));
			skb_mark_not_on_list(skb);
			sk_backlog_rcv(sk, skb);

			cond_resched();

			skb = next;
		} while (skb != NULL);

		spin_lock_bh(&sk->sk_lock.slock);
	}

	/*
	 * Doing the zeroing here guarantees we cannot loop forever
	 * while a wild producer attempts to flood us.
	 */
	sk->sk_backlog.len = 0;
}

void __sk_flush_backlog(struct sock *sk)
{
	spin_lock_bh(&sk->sk_lock.slock);
	__release_sock(sk);
	spin_unlock_bh(&sk->sk_lock.slock);
}

/**
 * sk_wait_data - wait for data to arrive at sk_receive_queue
 * @sk: sock to wait on
 * @timeo: for how long
 * @skb: last skb seen on sk_receive_queue
 *
 * Now socket state including sk->sk_err is changed only under lock,
 * hence we may omit checks after joining wait queue.
 * We check receive queue before schedule() only as optimization;
 * it is very likely that release_sock() added new data.
 */
int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
{
	DEFINE_WAIT_FUNC(wait, woken_wake_function);
	int rc;

	add_wait_queue(sk_sleep(sk), &wait);
	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	remove_wait_queue(sk_sleep(sk), &wait);
	return rc;
}
EXPORT_SYMBOL(sk_wait_data);

/**
 * __sk_mem_raise_allocated - increase memory_allocated
 * @sk: socket
 * @size: memory size to allocate
 * @amt: pages to allocate
 * @kind: allocation type
 *
 * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
 */
int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
{
	struct proto *prot = sk->sk_prot;
	long allocated = sk_memory_allocated_add(sk, amt);
	bool charged = true;

	if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
	    !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt)))
		goto suppress_allocation;

	/* Under limit. */
	if (allocated <= sk_prot_mem_limits(sk, 0)) {
		sk_leave_memory_pressure(sk);
		return 1;
	}

	/* Under pressure. */
	if (allocated > sk_prot_mem_limits(sk, 1))
		sk_enter_memory_pressure(sk);

	/* Over hard limit.
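	 * sk_prot_mem_limits(sk, 0/1/2) are the protocol's min/pressure/max
	 * thresholds in pages (for TCP these are the three tcp_mem sysctl
	 * values), so this branch fires only once the protocol as a whole
	 * is past its hard limit.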
*/ 2478 if (allocated > sk_prot_mem_limits(sk, 2)) 2479 goto suppress_allocation; 2480 2481 /* guarantee minimum buffer size under pressure */ 2482 if (kind == SK_MEM_RECV) { 2483 if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot)) 2484 return 1; 2485 2486 } else { /* SK_MEM_SEND */ 2487 int wmem0 = sk_get_wmem0(sk, prot); 2488 2489 if (sk->sk_type == SOCK_STREAM) { 2490 if (sk->sk_wmem_queued < wmem0) 2491 return 1; 2492 } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) { 2493 return 1; 2494 } 2495 } 2496 2497 if (sk_has_memory_pressure(sk)) { 2498 u64 alloc; 2499 2500 if (!sk_under_memory_pressure(sk)) 2501 return 1; 2502 alloc = sk_sockets_allocated_read_positive(sk); 2503 if (sk_prot_mem_limits(sk, 2) > alloc * 2504 sk_mem_pages(sk->sk_wmem_queued + 2505 atomic_read(&sk->sk_rmem_alloc) + 2506 sk->sk_forward_alloc)) 2507 return 1; 2508 } 2509 2510 suppress_allocation: 2511 2512 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) { 2513 sk_stream_moderate_sndbuf(sk); 2514 2515 /* Fail only if socket is _under_ its sndbuf. 2516 * In this case we cannot block, so that we have to fail. 2517 */ 2518 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) 2519 return 1; 2520 } 2521 2522 if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged)) 2523 trace_sock_exceed_buf_limit(sk, prot, allocated, kind); 2524 2525 sk_memory_allocated_sub(sk, amt); 2526 2527 if (mem_cgroup_sockets_enabled && sk->sk_memcg) 2528 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt); 2529 2530 return 0; 2531 } 2532 EXPORT_SYMBOL(__sk_mem_raise_allocated); 2533 2534 /** 2535 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated 2536 * @sk: socket 2537 * @size: memory size to allocate 2538 * @kind: allocation type 2539 * 2540 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means 2541 * rmem allocation. This function assumes that protocols which have 2542 * memory_pressure use sk_wmem_queued as write buffer accounting. 
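 *
 * A worked example, assuming PAGE_SIZE == SK_MEM_QUANTUM == 4096: charging
 * size = 3000 bytes yields amt = sk_mem_pages(3000) = 1, so sk_forward_alloc
 * grows by 4096 and memory_allocated by one quantum; a later
 * __sk_mem_reclaim(sk, 4096) undoes both adjustments.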
2543 */ 2544 int __sk_mem_schedule(struct sock *sk, int size, int kind) 2545 { 2546 int ret, amt = sk_mem_pages(size); 2547 2548 sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT; 2549 ret = __sk_mem_raise_allocated(sk, size, amt, kind); 2550 if (!ret) 2551 sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT; 2552 return ret; 2553 } 2554 EXPORT_SYMBOL(__sk_mem_schedule); 2555 2556 /** 2557 * __sk_mem_reduce_allocated - reclaim memory_allocated 2558 * @sk: socket 2559 * @amount: number of quanta 2560 * 2561 * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc 2562 */ 2563 void __sk_mem_reduce_allocated(struct sock *sk, int amount) 2564 { 2565 sk_memory_allocated_sub(sk, amount); 2566 2567 if (mem_cgroup_sockets_enabled && sk->sk_memcg) 2568 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount); 2569 2570 if (sk_under_memory_pressure(sk) && 2571 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))) 2572 sk_leave_memory_pressure(sk); 2573 } 2574 EXPORT_SYMBOL(__sk_mem_reduce_allocated); 2575 2576 /** 2577 * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated 2578 * @sk: socket 2579 * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple) 2580 */ 2581 void __sk_mem_reclaim(struct sock *sk, int amount) 2582 { 2583 amount >>= SK_MEM_QUANTUM_SHIFT; 2584 sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT; 2585 __sk_mem_reduce_allocated(sk, amount); 2586 } 2587 EXPORT_SYMBOL(__sk_mem_reclaim); 2588 2589 int sk_set_peek_off(struct sock *sk, int val) 2590 { 2591 sk->sk_peek_off = val; 2592 return 0; 2593 } 2594 EXPORT_SYMBOL_GPL(sk_set_peek_off); 2595 2596 /* 2597 * Set of default routines for initialising struct proto_ops when 2598 * the protocol does not support a particular function. In certain 2599 * cases where it makes no sense for a protocol to have a "do nothing" 2600 * function, some default processing is provided. 
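 *
 * A hypothetical datagram family that implements neither listen() nor
 * mmap() might wire the stubs up as follows (illustrative sketch only,
 * PF_FOO and foo_dgram_ops are not real in-tree names):
 *
 *	static const struct proto_ops foo_dgram_ops = {
 *		.family   = PF_FOO,
 *		.owner    = THIS_MODULE,
 *		.listen   = sock_no_listen,
 *		.accept   = sock_no_accept,
 *		.mmap     = sock_no_mmap,
 *		.sendpage = sock_no_sendpage,
 *		 ...
 *	};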
2601 */ 2602 2603 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len) 2604 { 2605 return -EOPNOTSUPP; 2606 } 2607 EXPORT_SYMBOL(sock_no_bind); 2608 2609 int sock_no_connect(struct socket *sock, struct sockaddr *saddr, 2610 int len, int flags) 2611 { 2612 return -EOPNOTSUPP; 2613 } 2614 EXPORT_SYMBOL(sock_no_connect); 2615 2616 int sock_no_socketpair(struct socket *sock1, struct socket *sock2) 2617 { 2618 return -EOPNOTSUPP; 2619 } 2620 EXPORT_SYMBOL(sock_no_socketpair); 2621 2622 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags, 2623 bool kern) 2624 { 2625 return -EOPNOTSUPP; 2626 } 2627 EXPORT_SYMBOL(sock_no_accept); 2628 2629 int sock_no_getname(struct socket *sock, struct sockaddr *saddr, 2630 int peer) 2631 { 2632 return -EOPNOTSUPP; 2633 } 2634 EXPORT_SYMBOL(sock_no_getname); 2635 2636 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 2637 { 2638 return -EOPNOTSUPP; 2639 } 2640 EXPORT_SYMBOL(sock_no_ioctl); 2641 2642 int sock_no_listen(struct socket *sock, int backlog) 2643 { 2644 return -EOPNOTSUPP; 2645 } 2646 EXPORT_SYMBOL(sock_no_listen); 2647 2648 int sock_no_shutdown(struct socket *sock, int how) 2649 { 2650 return -EOPNOTSUPP; 2651 } 2652 EXPORT_SYMBOL(sock_no_shutdown); 2653 2654 int sock_no_setsockopt(struct socket *sock, int level, int optname, 2655 char __user *optval, unsigned int optlen) 2656 { 2657 return -EOPNOTSUPP; 2658 } 2659 EXPORT_SYMBOL(sock_no_setsockopt); 2660 2661 int sock_no_getsockopt(struct socket *sock, int level, int optname, 2662 char __user *optval, int __user *optlen) 2663 { 2664 return -EOPNOTSUPP; 2665 } 2666 EXPORT_SYMBOL(sock_no_getsockopt); 2667 2668 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len) 2669 { 2670 return -EOPNOTSUPP; 2671 } 2672 EXPORT_SYMBOL(sock_no_sendmsg); 2673 2674 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len) 2675 { 2676 return -EOPNOTSUPP; 2677 } 2678 EXPORT_SYMBOL(sock_no_sendmsg_locked); 2679 2680 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len, 2681 int flags) 2682 { 2683 return -EOPNOTSUPP; 2684 } 2685 EXPORT_SYMBOL(sock_no_recvmsg); 2686 2687 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma) 2688 { 2689 /* Mirror missing mmap method error code */ 2690 return -ENODEV; 2691 } 2692 EXPORT_SYMBOL(sock_no_mmap); 2693 2694 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags) 2695 { 2696 ssize_t res; 2697 struct msghdr msg = {.msg_flags = flags}; 2698 struct kvec iov; 2699 char *kaddr = kmap(page); 2700 iov.iov_base = kaddr + offset; 2701 iov.iov_len = size; 2702 res = kernel_sendmsg(sock, &msg, &iov, 1, size); 2703 kunmap(page); 2704 return res; 2705 } 2706 EXPORT_SYMBOL(sock_no_sendpage); 2707 2708 ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page, 2709 int offset, size_t size, int flags) 2710 { 2711 ssize_t res; 2712 struct msghdr msg = {.msg_flags = flags}; 2713 struct kvec iov; 2714 char *kaddr = kmap(page); 2715 2716 iov.iov_base = kaddr + offset; 2717 iov.iov_len = size; 2718 res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size); 2719 kunmap(page); 2720 return res; 2721 } 2722 EXPORT_SYMBOL(sock_no_sendpage_locked); 2723 2724 /* 2725 * Default Socket Callbacks 2726 */ 2727 2728 static void sock_def_wakeup(struct sock *sk) 2729 { 2730 struct socket_wq *wq; 2731 2732 rcu_read_lock(); 2733 wq = rcu_dereference(sk->sk_wq); 2734 if (skwq_has_sleeper(wq)) 2735 
wake_up_interruptible_all(&wq->wait); 2736 rcu_read_unlock(); 2737 } 2738 2739 static void sock_def_error_report(struct sock *sk) 2740 { 2741 struct socket_wq *wq; 2742 2743 rcu_read_lock(); 2744 wq = rcu_dereference(sk->sk_wq); 2745 if (skwq_has_sleeper(wq)) 2746 wake_up_interruptible_poll(&wq->wait, EPOLLERR); 2747 sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR); 2748 rcu_read_unlock(); 2749 } 2750 2751 static void sock_def_readable(struct sock *sk) 2752 { 2753 struct socket_wq *wq; 2754 2755 rcu_read_lock(); 2756 wq = rcu_dereference(sk->sk_wq); 2757 if (skwq_has_sleeper(wq)) 2758 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI | 2759 EPOLLRDNORM | EPOLLRDBAND); 2760 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); 2761 rcu_read_unlock(); 2762 } 2763 2764 static void sock_def_write_space(struct sock *sk) 2765 { 2766 struct socket_wq *wq; 2767 2768 rcu_read_lock(); 2769 2770 /* Do not wake up a writer until he can make "significant" 2771 * progress. --DaveM 2772 */ 2773 if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) { 2774 wq = rcu_dereference(sk->sk_wq); 2775 if (skwq_has_sleeper(wq)) 2776 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | 2777 EPOLLWRNORM | EPOLLWRBAND); 2778 2779 /* Should agree with poll, otherwise some programs break */ 2780 if (sock_writeable(sk)) 2781 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); 2782 } 2783 2784 rcu_read_unlock(); 2785 } 2786 2787 static void sock_def_destruct(struct sock *sk) 2788 { 2789 } 2790 2791 void sk_send_sigurg(struct sock *sk) 2792 { 2793 if (sk->sk_socket && sk->sk_socket->file) 2794 if (send_sigurg(&sk->sk_socket->file->f_owner)) 2795 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI); 2796 } 2797 EXPORT_SYMBOL(sk_send_sigurg); 2798 2799 void sk_reset_timer(struct sock *sk, struct timer_list* timer, 2800 unsigned long expires) 2801 { 2802 if (!mod_timer(timer, expires)) 2803 sock_hold(sk); 2804 } 2805 EXPORT_SYMBOL(sk_reset_timer); 2806 2807 void sk_stop_timer(struct sock *sk, struct timer_list* timer) 2808 { 2809 if (del_timer(timer)) 2810 __sock_put(sk); 2811 } 2812 EXPORT_SYMBOL(sk_stop_timer); 2813 2814 void sock_init_data(struct socket *sock, struct sock *sk) 2815 { 2816 sk_init_common(sk); 2817 sk->sk_send_head = NULL; 2818 2819 timer_setup(&sk->sk_timer, NULL, 0); 2820 2821 sk->sk_allocation = GFP_KERNEL; 2822 sk->sk_rcvbuf = sysctl_rmem_default; 2823 sk->sk_sndbuf = sysctl_wmem_default; 2824 sk->sk_state = TCP_CLOSE; 2825 sk_set_socket(sk, sock); 2826 2827 sock_set_flag(sk, SOCK_ZAPPED); 2828 2829 if (sock) { 2830 sk->sk_type = sock->type; 2831 sk->sk_wq = sock->wq; 2832 sock->sk = sk; 2833 sk->sk_uid = SOCK_INODE(sock)->i_uid; 2834 } else { 2835 sk->sk_wq = NULL; 2836 sk->sk_uid = make_kuid(sock_net(sk)->user_ns, 0); 2837 } 2838 2839 rwlock_init(&sk->sk_callback_lock); 2840 if (sk->sk_kern_sock) 2841 lockdep_set_class_and_name( 2842 &sk->sk_callback_lock, 2843 af_kern_callback_keys + sk->sk_family, 2844 af_family_kern_clock_key_strings[sk->sk_family]); 2845 else 2846 lockdep_set_class_and_name( 2847 &sk->sk_callback_lock, 2848 af_callback_keys + sk->sk_family, 2849 af_family_clock_key_strings[sk->sk_family]); 2850 2851 sk->sk_state_change = sock_def_wakeup; 2852 sk->sk_data_ready = sock_def_readable; 2853 sk->sk_write_space = sock_def_write_space; 2854 sk->sk_error_report = sock_def_error_report; 2855 sk->sk_destruct = sock_def_destruct; 2856 2857 sk->sk_frag.page = NULL; 2858 sk->sk_frag.offset = 0; 2859 sk->sk_peek_off = -1; 2860 2861 sk->sk_peer_pid = NULL; 2862 sk->sk_peer_cred = NULL; 2863 
	sk->sk_write_pending = 0;
	sk->sk_rcvlowat = 1;
	sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
	sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;

	sk->sk_stamp = SK_DEFAULT_STAMP;
#if BITS_PER_LONG==32
	seqlock_init(&sk->sk_stamp_seq);
#endif
	atomic_set(&sk->sk_zckey, 0);

#ifdef CONFIG_NET_RX_BUSY_POLL
	sk->sk_napi_id = 0;
	sk->sk_ll_usec = sysctl_net_busy_read;
#endif

	sk->sk_max_pacing_rate = ~0UL;
	sk->sk_pacing_rate = ~0UL;
	sk->sk_pacing_shift = 10;
	sk->sk_incoming_cpu = -1;

	sk_rx_queue_clear(sk);
	/*
	 * Before updating sk_refcnt, we must commit prior changes to memory
	 * (Documentation/RCU/rculist_nulls.txt for details)
	 */
	smp_wmb();
	refcount_set(&sk->sk_refcnt, 1);
	atomic_set(&sk->sk_drops, 0);
}
EXPORT_SYMBOL(sock_init_data);

void lock_sock_nested(struct sock *sk, int subclass)
{
	might_sleep();
	spin_lock_bh(&sk->sk_lock.slock);
	if (sk->sk_lock.owned)
		__lock_sock(sk);
	sk->sk_lock.owned = 1;
	spin_unlock(&sk->sk_lock.slock);
	/*
	 * The sk_lock has mutex_lock() semantics here:
	 */
	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
	local_bh_enable();
}
EXPORT_SYMBOL(lock_sock_nested);

void release_sock(struct sock *sk)
{
	spin_lock_bh(&sk->sk_lock.slock);
	if (sk->sk_backlog.tail)
		__release_sock(sk);

	/* Warning : release_cb() might need to release sk ownership,
	 * i.e. call sock_release_ownership(sk) before us.
	 */
	if (sk->sk_prot->release_cb)
		sk->sk_prot->release_cb(sk);

	sock_release_ownership(sk);
	if (waitqueue_active(&sk->sk_lock.wq))
		wake_up(&sk->sk_lock.wq);
	spin_unlock_bh(&sk->sk_lock.slock);
}
EXPORT_SYMBOL(release_sock);

/**
 * lock_sock_fast - fast version of lock_sock
 * @sk: socket
 *
 * This version should be used for very small sections, where the process
 * won't block.
 *
 * Returns false if the fast path is taken:
 *
 *   sk_lock.slock locked, owned = 0, BH disabled
 *
 * Returns true if the slow path is taken:
 *
 *   sk_lock.slock unlocked, owned = 1, BH enabled
 */
bool lock_sock_fast(struct sock *sk)
{
	might_sleep();
	spin_lock_bh(&sk->sk_lock.slock);

	if (!sk->sk_lock.owned)
		/*
		 * Note : We must disable BH
		 */
		return false;

	__lock_sock(sk);
	sk->sk_lock.owned = 1;
	spin_unlock(&sk->sk_lock.slock);
	/*
	 * The sk_lock has mutex_lock() semantics here:
	 */
	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
	local_bh_enable();
	return true;
}
EXPORT_SYMBOL(lock_sock_fast);

int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
{
	struct timeval tv;

	sock_enable_timestamp(sk, SOCK_TIMESTAMP);
	tv = ktime_to_timeval(sock_read_timestamp(sk));
	if (tv.tv_sec == -1)
		return -ENOENT;
	if (tv.tv_sec == 0) {
		ktime_t kt = ktime_get_real();
		sock_write_timestamp(sk, kt);
		tv = ktime_to_timeval(kt);
	}
	return copy_to_user(userstamp, &tv, sizeof(tv)) ?
			-EFAULT : 0;
}
EXPORT_SYMBOL(sock_get_timestamp);

int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
{
	struct timespec ts;

	sock_enable_timestamp(sk, SOCK_TIMESTAMP);
	ts = ktime_to_timespec(sock_read_timestamp(sk));
	if (ts.tv_sec == -1)
		return -ENOENT;
	if (ts.tv_sec == 0) {
		ktime_t kt = ktime_get_real();
		sock_write_timestamp(sk, kt);
		ts = ktime_to_timespec(sk->sk_stamp);
	}
	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
}
EXPORT_SYMBOL(sock_get_timestampns);

void sock_enable_timestamp(struct sock *sk, int flag)
{
	if (!sock_flag(sk, flag)) {
		unsigned long previous_flags = sk->sk_flags;

		sock_set_flag(sk, flag);
		/*
		 * We just set one of the two flags which require net
		 * time stamping, but time stamping might have been on
		 * already because of the other one.
		 */
		if (sock_needs_netstamp(sk) &&
		    !(previous_flags & SK_FLAGS_TIMESTAMP))
			net_enable_timestamp();
	}
}

int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
		       int level, int type)
{
	struct sock_exterr_skb *serr;
	struct sk_buff *skb;
	int copied, err;

	err = -EAGAIN;
	skb = sock_dequeue_err_skb(sk);
	if (skb == NULL)
		goto out;

	copied = skb->len;
	if (copied > len) {
		msg->msg_flags |= MSG_TRUNC;
		copied = len;
	}
	err = skb_copy_datagram_msg(skb, 0, msg, copied);
	if (err)
		goto out_free_skb;

	sock_recv_timestamp(msg, sk, skb);

	serr = SKB_EXT_ERR(skb);
	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);

	msg->msg_flags |= MSG_ERRQUEUE;
	err = copied;

out_free_skb:
	kfree_skb(skb);
out:
	return err;
}
EXPORT_SYMBOL(sock_recv_errqueue);

/*
 * Get a socket option on a socket.
 *
 * FIX: POSIX 1003.1g is very ambiguous here. It states that
 * asynchronous errors should be reported by getsockopt. We assume
 * this means if you specify SO_ERROR (otherwise what's the point of it).
 */
int sock_common_getsockopt(struct socket *sock, int level, int optname,
			   char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_getsockopt);

#ifdef CONFIG_COMPAT
int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
				  char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	if (sk->sk_prot->compat_getsockopt != NULL)
		return sk->sk_prot->compat_getsockopt(sk, level, optname,
						      optval, optlen);
	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_getsockopt);
#endif

int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
			int flags)
{
	struct sock *sk = sock->sk;
	int addr_len = 0;
	int err;

	err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
				   flags & ~MSG_DONTWAIT, &addr_len);
	if (err >= 0)
		msg->msg_namelen = addr_len;
	return err;
}
EXPORT_SYMBOL(sock_common_recvmsg);

/*
 * Set socket options on an inet socket.
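 *
 * Like sock_common_getsockopt() above, this simply forwards to
 * sk->sk_prot->setsockopt(); families that keep all option handling in
 * struct proto typically point their proto_ops at these helpers, e.g.
 * (illustrative wiring only):
 *
 *	.setsockopt = sock_common_setsockopt,
 *	.getsockopt = sock_common_getsockopt,
 *	.recvmsg    = sock_common_recvmsg,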
 */
int sock_common_setsockopt(struct socket *sock, int level, int optname,
			   char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;

	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_setsockopt);

#ifdef CONFIG_COMPAT
int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
				  char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;

	if (sk->sk_prot->compat_setsockopt != NULL)
		return sk->sk_prot->compat_setsockopt(sk, level, optname,
						      optval, optlen);
	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_setsockopt);
#endif

void sk_common_release(struct sock *sk)
{
	if (sk->sk_prot->destroy)
		sk->sk_prot->destroy(sk);

	/*
	 * Observation: when sk_common_release() is called, processes have
	 * no access to the socket, but the network still has.
	 * Step one, detach it from networking:
	 *
	 * A. Remove from hash tables.
	 */

	sk->sk_prot->unhash(sk);

	/*
	 * At this point the socket cannot receive new packets, but it is
	 * possible that some packets are in flight because some CPU runs
	 * the receiver and did a hash table lookup before we unhashed the
	 * socket. They will reach the receive queue and be purged by the
	 * socket destructor.
	 *
	 * Also, we still have packets pending on the receive queue and
	 * probably our own packets waiting in device queues. sock_destroy
	 * will drain the receive queue, but transmitted packets will delay
	 * socket destruction until the last reference is released.
	 */

	sock_orphan(sk);

	xfrm_sk_free_policy(sk);

	sk_refcnt_debug_release(sk);

	sock_put(sk);
}
EXPORT_SYMBOL(sk_common_release);

void sk_get_meminfo(const struct sock *sk, u32 *mem)
{
	memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);

	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
	mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf;
	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
	mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf;
	mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
	mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
	mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len;
	mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
}

#ifdef CONFIG_PROC_FS
#define PROTO_INUSE_NR	64	/* should be enough for the first time */
struct prot_inuse {
	int val[PROTO_INUSE_NR];
};

static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);

void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
{
	__this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_add);

int sock_prot_inuse_get(struct net *net, struct proto *prot)
{
	int cpu, idx = prot->inuse_idx;
	int res = 0;

	for_each_possible_cpu(cpu)
		res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];

	return res >= 0 ?
res : 0; 3199 } 3200 EXPORT_SYMBOL_GPL(sock_prot_inuse_get); 3201 3202 static void sock_inuse_add(struct net *net, int val) 3203 { 3204 this_cpu_add(*net->core.sock_inuse, val); 3205 } 3206 3207 int sock_inuse_get(struct net *net) 3208 { 3209 int cpu, res = 0; 3210 3211 for_each_possible_cpu(cpu) 3212 res += *per_cpu_ptr(net->core.sock_inuse, cpu); 3213 3214 return res; 3215 } 3216 3217 EXPORT_SYMBOL_GPL(sock_inuse_get); 3218 3219 static int __net_init sock_inuse_init_net(struct net *net) 3220 { 3221 net->core.prot_inuse = alloc_percpu(struct prot_inuse); 3222 if (net->core.prot_inuse == NULL) 3223 return -ENOMEM; 3224 3225 net->core.sock_inuse = alloc_percpu(int); 3226 if (net->core.sock_inuse == NULL) 3227 goto out; 3228 3229 return 0; 3230 3231 out: 3232 free_percpu(net->core.prot_inuse); 3233 return -ENOMEM; 3234 } 3235 3236 static void __net_exit sock_inuse_exit_net(struct net *net) 3237 { 3238 free_percpu(net->core.prot_inuse); 3239 free_percpu(net->core.sock_inuse); 3240 } 3241 3242 static struct pernet_operations net_inuse_ops = { 3243 .init = sock_inuse_init_net, 3244 .exit = sock_inuse_exit_net, 3245 }; 3246 3247 static __init int net_inuse_init(void) 3248 { 3249 if (register_pernet_subsys(&net_inuse_ops)) 3250 panic("Cannot initialize net inuse counters"); 3251 3252 return 0; 3253 } 3254 3255 core_initcall(net_inuse_init); 3256 3257 static void assign_proto_idx(struct proto *prot) 3258 { 3259 prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR); 3260 3261 if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) { 3262 pr_err("PROTO_INUSE_NR exhausted\n"); 3263 return; 3264 } 3265 3266 set_bit(prot->inuse_idx, proto_inuse_idx); 3267 } 3268 3269 static void release_proto_idx(struct proto *prot) 3270 { 3271 if (prot->inuse_idx != PROTO_INUSE_NR - 1) 3272 clear_bit(prot->inuse_idx, proto_inuse_idx); 3273 } 3274 #else 3275 static inline void assign_proto_idx(struct proto *prot) 3276 { 3277 } 3278 3279 static inline void release_proto_idx(struct proto *prot) 3280 { 3281 } 3282 3283 static void sock_inuse_add(struct net *net, int val) 3284 { 3285 } 3286 #endif 3287 3288 static void req_prot_cleanup(struct request_sock_ops *rsk_prot) 3289 { 3290 if (!rsk_prot) 3291 return; 3292 kfree(rsk_prot->slab_name); 3293 rsk_prot->slab_name = NULL; 3294 kmem_cache_destroy(rsk_prot->slab); 3295 rsk_prot->slab = NULL; 3296 } 3297 3298 static int req_prot_init(const struct proto *prot) 3299 { 3300 struct request_sock_ops *rsk_prot = prot->rsk_prot; 3301 3302 if (!rsk_prot) 3303 return 0; 3304 3305 rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", 3306 prot->name); 3307 if (!rsk_prot->slab_name) 3308 return -ENOMEM; 3309 3310 rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name, 3311 rsk_prot->obj_size, 0, 3312 SLAB_ACCOUNT | prot->slab_flags, 3313 NULL); 3314 3315 if (!rsk_prot->slab) { 3316 pr_crit("%s: Can't create request sock SLAB cache!\n", 3317 prot->name); 3318 return -ENOMEM; 3319 } 3320 return 0; 3321 } 3322 3323 int proto_register(struct proto *prot, int alloc_slab) 3324 { 3325 if (alloc_slab) { 3326 prot->slab = kmem_cache_create_usercopy(prot->name, 3327 prot->obj_size, 0, 3328 SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT | 3329 prot->slab_flags, 3330 prot->useroffset, prot->usersize, 3331 NULL); 3332 3333 if (prot->slab == NULL) { 3334 pr_crit("%s: Can't create sock SLAB cache!\n", 3335 prot->name); 3336 goto out; 3337 } 3338 3339 if (req_prot_init(prot)) 3340 goto out_free_request_sock_slab; 3341 3342 if (prot->twsk_prot != NULL) { 3343 prot->twsk_prot->twsk_slab_name 
= kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name); 3344 3345 if (prot->twsk_prot->twsk_slab_name == NULL) 3346 goto out_free_request_sock_slab; 3347 3348 prot->twsk_prot->twsk_slab = 3349 kmem_cache_create(prot->twsk_prot->twsk_slab_name, 3350 prot->twsk_prot->twsk_obj_size, 3351 0, 3352 SLAB_ACCOUNT | 3353 prot->slab_flags, 3354 NULL); 3355 if (prot->twsk_prot->twsk_slab == NULL) 3356 goto out_free_timewait_sock_slab_name; 3357 } 3358 } 3359 3360 mutex_lock(&proto_list_mutex); 3361 list_add(&prot->node, &proto_list); 3362 assign_proto_idx(prot); 3363 mutex_unlock(&proto_list_mutex); 3364 return 0; 3365 3366 out_free_timewait_sock_slab_name: 3367 kfree(prot->twsk_prot->twsk_slab_name); 3368 out_free_request_sock_slab: 3369 req_prot_cleanup(prot->rsk_prot); 3370 3371 kmem_cache_destroy(prot->slab); 3372 prot->slab = NULL; 3373 out: 3374 return -ENOBUFS; 3375 } 3376 EXPORT_SYMBOL(proto_register); 3377 3378 void proto_unregister(struct proto *prot) 3379 { 3380 mutex_lock(&proto_list_mutex); 3381 release_proto_idx(prot); 3382 list_del(&prot->node); 3383 mutex_unlock(&proto_list_mutex); 3384 3385 kmem_cache_destroy(prot->slab); 3386 prot->slab = NULL; 3387 3388 req_prot_cleanup(prot->rsk_prot); 3389 3390 if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) { 3391 kmem_cache_destroy(prot->twsk_prot->twsk_slab); 3392 kfree(prot->twsk_prot->twsk_slab_name); 3393 prot->twsk_prot->twsk_slab = NULL; 3394 } 3395 } 3396 EXPORT_SYMBOL(proto_unregister); 3397 3398 int sock_load_diag_module(int family, int protocol) 3399 { 3400 if (!protocol) { 3401 if (!sock_is_registered(family)) 3402 return -ENOENT; 3403 3404 return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, 3405 NETLINK_SOCK_DIAG, family); 3406 } 3407 3408 #ifdef CONFIG_INET 3409 if (family == AF_INET && 3410 protocol != IPPROTO_RAW && 3411 !rcu_access_pointer(inet_protos[protocol])) 3412 return -ENOENT; 3413 #endif 3414 3415 return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK, 3416 NETLINK_SOCK_DIAG, family, protocol); 3417 } 3418 EXPORT_SYMBOL(sock_load_diag_module); 3419 3420 #ifdef CONFIG_PROC_FS 3421 static void *proto_seq_start(struct seq_file *seq, loff_t *pos) 3422 __acquires(proto_list_mutex) 3423 { 3424 mutex_lock(&proto_list_mutex); 3425 return seq_list_start_head(&proto_list, *pos); 3426 } 3427 3428 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3429 { 3430 return seq_list_next(v, &proto_list, pos); 3431 } 3432 3433 static void proto_seq_stop(struct seq_file *seq, void *v) 3434 __releases(proto_list_mutex) 3435 { 3436 mutex_unlock(&proto_list_mutex); 3437 } 3438 3439 static char proto_method_implemented(const void *method) 3440 { 3441 return method == NULL ? 'n' : 'y'; 3442 } 3443 static long sock_prot_memory_allocated(struct proto *proto) 3444 { 3445 return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L; 3446 } 3447 3448 static char *sock_prot_memory_pressure(struct proto *proto) 3449 { 3450 return proto->memory_pressure != NULL ? 3451 proto_memory_pressure(proto) ? "yes" : "no" : "NI"; 3452 } 3453 3454 static void proto_seq_printf(struct seq_file *seq, struct proto *proto) 3455 { 3456 3457 seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s " 3458 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n", 3459 proto->name, 3460 proto->obj_size, 3461 sock_prot_inuse_get(seq_file_net(seq), proto), 3462 sock_prot_memory_allocated(proto), 3463 sock_prot_memory_pressure(proto), 3464 proto->max_header, 3465 proto->slab == NULL ? 
"no" : "yes", 3466 module_name(proto->owner), 3467 proto_method_implemented(proto->close), 3468 proto_method_implemented(proto->connect), 3469 proto_method_implemented(proto->disconnect), 3470 proto_method_implemented(proto->accept), 3471 proto_method_implemented(proto->ioctl), 3472 proto_method_implemented(proto->init), 3473 proto_method_implemented(proto->destroy), 3474 proto_method_implemented(proto->shutdown), 3475 proto_method_implemented(proto->setsockopt), 3476 proto_method_implemented(proto->getsockopt), 3477 proto_method_implemented(proto->sendmsg), 3478 proto_method_implemented(proto->recvmsg), 3479 proto_method_implemented(proto->sendpage), 3480 proto_method_implemented(proto->bind), 3481 proto_method_implemented(proto->backlog_rcv), 3482 proto_method_implemented(proto->hash), 3483 proto_method_implemented(proto->unhash), 3484 proto_method_implemented(proto->get_port), 3485 proto_method_implemented(proto->enter_memory_pressure)); 3486 } 3487 3488 static int proto_seq_show(struct seq_file *seq, void *v) 3489 { 3490 if (v == &proto_list) 3491 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s", 3492 "protocol", 3493 "size", 3494 "sockets", 3495 "memory", 3496 "press", 3497 "maxhdr", 3498 "slab", 3499 "module", 3500 "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n"); 3501 else 3502 proto_seq_printf(seq, list_entry(v, struct proto, node)); 3503 return 0; 3504 } 3505 3506 static const struct seq_operations proto_seq_ops = { 3507 .start = proto_seq_start, 3508 .next = proto_seq_next, 3509 .stop = proto_seq_stop, 3510 .show = proto_seq_show, 3511 }; 3512 3513 static __net_init int proto_init_net(struct net *net) 3514 { 3515 if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops, 3516 sizeof(struct seq_net_private))) 3517 return -ENOMEM; 3518 3519 return 0; 3520 } 3521 3522 static __net_exit void proto_exit_net(struct net *net) 3523 { 3524 remove_proc_entry("protocols", net->proc_net); 3525 } 3526 3527 3528 static __net_initdata struct pernet_operations proto_net_ops = { 3529 .init = proto_init_net, 3530 .exit = proto_exit_net, 3531 }; 3532 3533 static int __init proto_init(void) 3534 { 3535 return register_pernet_subsys(&proto_net_ops); 3536 } 3537 3538 subsys_initcall(proto_init); 3539 3540 #endif /* PROC_FS */ 3541 3542 #ifdef CONFIG_NET_RX_BUSY_POLL 3543 bool sk_busy_loop_end(void *p, unsigned long start_time) 3544 { 3545 struct sock *sk = p; 3546 3547 return !skb_queue_empty(&sk->sk_receive_queue) || 3548 sk_busy_loop_timeout(sk, start_time); 3549 } 3550 EXPORT_SYMBOL(sk_busy_loop_end); 3551 #endif /* CONFIG_NET_RX_BUSY_POLL */ 3552