// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic socket support routines. Memory allocators, socket lock/release
 *		handler for protocols to use and generic option handler.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Alan Cox, <A.Cox@swansea.ac.uk>
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <asm/unaligned.h>
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/errqueue.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/user_namespace.h>
#include <linux/static_key.h>
#include <linux/memcontrol.h>
#include <linux/prefetch.h>
#include <linux/compat.h>
#include <linux/mroute.h>
#include <linux/mroute6.h>
#include <linux/icmpv6.h>

#include <linux/uaccess.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/request_sock.h>
#include <net/sock.h>
#include <linux/net_tstamp.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>
#include <net/cls_cgroup.h>
#include <net/netprio_cgroup.h>
#include <linux/sock_diag.h>

#include <linux/filter.h>
#include <net/sock_reuseport.h>
#include <net/bpf_sk_storage.h>

#include <trace/events/sock.h>

#include <net/tcp.h>
#include <net/busy_poll.h>
#include <net/phonet/phonet.h>

#include <linux/ethtool.h>

#include "dev.h"

static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);

static void sock_def_write_space_wfree(struct sock *sk);
static void sock_def_write_space(struct sock *sk);

/**
 * sk_ns_capable - General socket capability test
 * @sk: Socket to use a capability on or through
 * @user_ns: The user namespace of the capability to use
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had when the socket was
 * created and the current process has the capability @cap in the user
 * namespace @user_ns.
 */
bool sk_ns_capable(const struct sock *sk,
		   struct user_namespace *user_ns, int cap)
{
	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
		ns_capable(user_ns, cap);
}
EXPORT_SYMBOL(sk_ns_capable);

/**
 * sk_capable - Socket global capability test
 * @sk: Socket to use a capability on or through
 * @cap: The global capability to use
 *
 * Test to see if the opener of the socket had when the socket was
 * created and the current process has the capability @cap in all user
 * namespaces.
 */
bool sk_capable(const struct sock *sk, int cap)
{
	return sk_ns_capable(sk, &init_user_ns, cap);
}
EXPORT_SYMBOL(sk_capable);

/**
 * sk_net_capable - Network namespace socket capability test
 * @sk: Socket to use a capability on or through
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had when the socket was created
 * and the current process has the capability @cap over the network namespace
 * the socket is a member of.
 */
bool sk_net_capable(const struct sock *sk, int cap)
{
	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
}
EXPORT_SYMBOL(sk_net_capable);

/*
 * Each address family might have different locking rules, so we have
 * one slock key per address family and separate keys for internal and
 * userspace sockets.
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_kern_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];
static struct lock_class_key af_family_kern_slock_keys[AF_MAX];

/*
 * Make lock validator output more readable. (we pre-construct these
 * strings build-time, so that runtime allocation of socket
 * names would be slow and hard to get right).
 */

#define _sock_locks(x)						  \
  x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
  x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
  x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
  x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
  x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
  x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
  x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
  x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
  x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
  x "27"       ,	x "28"          ,	x "AF_CAN"      , \
  x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
  x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
  x "AF_IEEE802154",	x "AF_CAIF"	,	x "AF_ALG"      , \
  x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
  x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_XDP"	, \
  x "AF_MCTP"  , \
  x "AF_MAX"

static const char *const af_family_key_strings[AF_MAX+1] = {
	_sock_locks("sk_lock-")
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
	_sock_locks("slock-")
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
	_sock_locks("clock-")
};

static const char *const af_family_kern_key_strings[AF_MAX+1] = {
	_sock_locks("k-sk_lock-")
};
static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
	_sock_locks("k-slock-")
};
static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
	_sock_locks("k-clock-")
};
static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
	_sock_locks("rlock-")
};
static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
	_sock_locks("wlock-")
};
static const char *const af_family_elock_key_strings[AF_MAX+1] = {
	_sock_locks("elock-")
};

/*
 * sk_callback_lock and sk queues locking rules are per-address-family,
 * so split the lock classes by using a per-AF key:
 */
static struct lock_class_key af_callback_keys[AF_MAX];
static struct lock_class_key af_rlock_keys[AF_MAX];
static struct lock_class_key af_wlock_keys[AF_MAX];
static struct lock_class_key af_elock_keys[AF_MAX];
static struct lock_class_key af_kern_callback_keys[AF_MAX];

/* Run time adjustable parameters. */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
EXPORT_SYMBOL(sysctl_wmem_max);
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
EXPORT_SYMBOL(sysctl_rmem_max);
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
EXPORT_SYMBOL(sysctl_optmem_max);

int sysctl_tstamp_allow_data __read_mostly = 1;

DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
EXPORT_SYMBOL_GPL(memalloc_socks_key);

/**
 * sk_set_memalloc - sets %SOCK_MEMALLOC
 * @sk: socket to set it on
 *
 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 * It's the responsibility of the admin to adjust min_free_kbytes
 * to meet the requirements
 */
void sk_set_memalloc(struct sock *sk)
{
	sock_set_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation |= __GFP_MEMALLOC;
	static_branch_inc(&memalloc_socks_key);
}
EXPORT_SYMBOL_GPL(sk_set_memalloc);

void sk_clear_memalloc(struct sock *sk)
{
	sock_reset_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation &= ~__GFP_MEMALLOC;
	static_branch_dec(&memalloc_socks_key);

	/*
	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
	 * progress of swapping. However, if SOCK_MEMALLOC is cleared while
	 * it has rmem allocations due to the last swapfile being deactivated
	 * but there is a risk that the socket is unusable due to exceeding
	 * the rmem limits. Reclaim the reserves and obtain rmem allocations.
	 */
	sk_mem_reclaim(sk);
}
EXPORT_SYMBOL_GPL(sk_clear_memalloc);

int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
{
	int ret;
	unsigned int noreclaim_flag;

	/* these should have been dropped before queueing */
	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));

	noreclaim_flag = memalloc_noreclaim_save();
	ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
				 tcp_v6_do_rcv,
				 tcp_v4_do_rcv,
				 sk, skb);
	memalloc_noreclaim_restore(noreclaim_flag);

	return ret;
}
EXPORT_SYMBOL(__sk_backlog_rcv);

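/* Deliver a pending socket error to the owner via sk->sk_error_report()
 * and, for inet sockets, fire the inet_sk_error_report tracepoint.
 */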
void sk_error_report(struct sock *sk)
{
	sk->sk_error_report(sk);

	switch (sk->sk_family) {
	case AF_INET:
		fallthrough;
	case AF_INET6:
		trace_inet_sk_error_report(sk);
		break;
	default:
		break;
	}
}
EXPORT_SYMBOL(sk_error_report);

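/* Encode a jiffies-based timeout into the timeval flavor the caller asked
 * for (old 32-bit compat, __kernel_old_timeval, or __kernel_sock_timeval)
 * and return the number of bytes written to @optval.
 */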
int sock_get_timeout(long timeo, void *optval, bool old_timeval)
{
	struct __kernel_sock_timeval tv;

	if (timeo == MAX_SCHEDULE_TIMEOUT) {
		tv.tv_sec = 0;
		tv.tv_usec = 0;
	} else {
		tv.tv_sec = timeo / HZ;
		tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
	}

	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
		struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
		*(struct old_timeval32 *)optval = tv32;
		return sizeof(tv32);
	}

	if (old_timeval) {
		struct __kernel_old_timeval old_tv;
		old_tv.tv_sec = tv.tv_sec;
		old_tv.tv_usec = tv.tv_usec;
		*(struct __kernel_old_timeval *)optval = old_tv;
		return sizeof(old_tv);
	}

	*(struct __kernel_sock_timeval *)optval = tv;
	return sizeof(tv);
}
EXPORT_SYMBOL(sock_get_timeout);

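/* Copy a user-supplied timeval of the requested flavor into a
 * __kernel_sock_timeval, validating @optlen along the way.
 */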
int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
			   sockptr_t optval, int optlen, bool old_timeval)
{
	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
		struct old_timeval32 tv32;

		if (optlen < sizeof(tv32))
			return -EINVAL;

		if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
			return -EFAULT;
		tv->tv_sec = tv32.tv_sec;
		tv->tv_usec = tv32.tv_usec;
	} else if (old_timeval) {
		struct __kernel_old_timeval old_tv;

		if (optlen < sizeof(old_tv))
			return -EINVAL;
		if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
			return -EFAULT;
		tv->tv_sec = old_tv.tv_sec;
		tv->tv_usec = old_tv.tv_usec;
	} else {
		if (optlen < sizeof(*tv))
			return -EINVAL;
		if (copy_from_sockptr(tv, optval, sizeof(*tv)))
			return -EFAULT;
	}

	return 0;
}
EXPORT_SYMBOL(sock_copy_user_timeval);

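/* Parse a user-supplied SO_RCVTIMEO/SO_SNDTIMEO value and store the
 * resulting timeout, in jiffies, through @timeo_p. A zero timeval means
 * "wait forever" (MAX_SCHEDULE_TIMEOUT); negative seconds are clamped to
 * zero with a rate-limited warning.
 */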
static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
			    bool old_timeval)
{
	struct __kernel_sock_timeval tv;
	int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval);
	long val;

	if (err)
		return err;

	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
		return -EDOM;

	if (tv.tv_sec < 0) {
		static int warned __read_mostly;

		WRITE_ONCE(*timeo_p, 0);
		if (warned < 10 && net_ratelimit()) {
			warned++;
			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
				__func__, current->comm, task_pid_nr(current));
		}
		return 0;
	}
	val = MAX_SCHEDULE_TIMEOUT;
	if ((tv.tv_sec || tv.tv_usec) &&
	    (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)))
		val = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec,
						    USEC_PER_SEC / HZ);
	WRITE_ONCE(*timeo_p, val);
	return 0;
}

static bool sock_needs_netstamp(const struct sock *sk)
{
	switch (sk->sk_family) {
	case AF_UNSPEC:
	case AF_UNIX:
		return false;
	default:
		return true;
	}
}

static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
{
	if (sk->sk_flags & flags) {
		sk->sk_flags &= ~flags;
		if (sock_needs_netstamp(sk) &&
		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
			net_disable_timestamp();
	}
}

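/* Charge @skb to @sk's receive queue and queue it, waking any reader.
 * Fails with -ENOMEM when sk_rcvbuf is exceeded and -ENOBUFS when the
 * protocol memory accounting in sk_rmem_schedule() refuses the charge.
 */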
int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	unsigned long flags;
	struct sk_buff_head *list = &sk->sk_receive_queue;

	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
		atomic_inc(&sk->sk_drops);
		trace_sock_rcvqueue_full(sk, skb);
		return -ENOMEM;
	}

	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
		atomic_inc(&sk->sk_drops);
		return -ENOBUFS;
	}

	skb->dev = NULL;
	skb_set_owner_r(skb, sk);

	/* we escape from rcu protected region, make sure we dont leak
	 * a norefcounted dst
	 */
	skb_dst_force(skb);

	spin_lock_irqsave(&list->lock, flags);
	sock_skb_set_dropcount(sk, skb);
	__skb_queue_tail(list, skb);
	spin_unlock_irqrestore(&list->lock, flags);

	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_data_ready(sk);
	return 0;
}
EXPORT_SYMBOL(__sock_queue_rcv_skb);

int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb,
			      enum skb_drop_reason *reason)
{
	enum skb_drop_reason drop_reason;
	int err;

	err = sk_filter(sk, skb);
	if (err) {
		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
		goto out;
	}
	err = __sock_queue_rcv_skb(sk, skb);
	switch (err) {
	case -ENOMEM:
		drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
		break;
	case -ENOBUFS:
		drop_reason = SKB_DROP_REASON_PROTO_MEM;
		break;
	default:
		drop_reason = SKB_NOT_DROPPED_YET;
		break;
	}
out:
	if (reason)
		*reason = drop_reason;
	return err;
}
EXPORT_SYMBOL(sock_queue_rcv_skb_reason);

int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
		     const int nested, unsigned int trim_cap, bool refcounted)
{
	int rc = NET_RX_SUCCESS;

	if (sk_filter_trim_cap(sk, skb, trim_cap))
		goto discard_and_relse;

	skb->dev = NULL;

	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
		atomic_inc(&sk->sk_drops);
		goto discard_and_relse;
	}
	if (nested)
		bh_lock_sock_nested(sk);
	else
		bh_lock_sock(sk);
	if (!sock_owned_by_user(sk)) {
		/*
		 * trylock + unlock semantics:
		 */
		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);

		rc = sk_backlog_rcv(sk, skb);

		mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
	} else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
		bh_unlock_sock(sk);
		atomic_inc(&sk->sk_drops);
		goto discard_and_relse;
	}

	bh_unlock_sock(sk);
out:
	if (refcounted)
		sock_put(sk);
	return rc;
discard_and_relse:
	kfree_skb(skb);
	goto out;
}
EXPORT_SYMBOL(__sk_receive_skb);

INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
							  u32));
INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
							   u32));
struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = __sk_dst_get(sk);

	if (dst && dst->obsolete &&
	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
			       dst, cookie) == NULL) {
		sk_tx_queue_clear(sk);
		WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(__sk_dst_check);

struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = sk_dst_get(sk);

	if (dst && dst->obsolete &&
	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
			       dst, cookie) == NULL) {
		sk_dst_reset(sk);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(sk_dst_check);

static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);

	/* Sorry... */
	ret = -EPERM;
	if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
		goto out;

	ret = -EINVAL;
	if (ifindex < 0)
		goto out;

	/* Paired with all READ_ONCE() done locklessly. */
	WRITE_ONCE(sk->sk_bound_dev_if, ifindex);

	if (sk->sk_prot->rehash)
		sk->sk_prot->rehash(sk);
	sk_dst_reset(sk);

	ret = 0;

out:
#endif

	return ret;
}

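/* Bind @sk to the device with ifindex @ifindex, optionally taking the
 * socket lock. In-kernel callers that already hold the lock pass
 * lock_sk == false.
 */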
int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
{
	int ret;

	if (lock_sk)
		lock_sock(sk);
	ret = sock_bindtoindex_locked(sk, ifindex);
	if (lock_sk)
		release_sock(sk);

	return ret;
}
EXPORT_SYMBOL(sock_bindtoindex);

static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];
	int index;

	ret = -EINVAL;
	if (optlen < 0)
		goto out;

	/* Bind this socket to a particular device like "eth0",
	 * as specified in the passed interface name. If the
	 * name is "" or the option length is zero the socket
	 * is not bound.
	 */
	if (optlen > IFNAMSIZ - 1)
		optlen = IFNAMSIZ - 1;
	memset(devname, 0, sizeof(devname));

	ret = -EFAULT;
	if (copy_from_sockptr(devname, optval, optlen))
		goto out;

	index = 0;
	if (devname[0] != '\0') {
		struct net_device *dev;

		rcu_read_lock();
		dev = dev_get_by_name_rcu(net, devname);
		if (dev)
			index = dev->ifindex;
		rcu_read_unlock();
		ret = -ENODEV;
		if (!dev)
			goto out;
	}

	sockopt_lock_sock(sk);
	ret = sock_bindtoindex_locked(sk, index);
	sockopt_release_sock(sk);
out:
#endif

	return ret;
}

static int sock_getbindtodevice(struct sock *sk, sockptr_t optval,
				sockptr_t optlen, int len)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];

	if (bound_dev_if == 0) {
		len = 0;
		goto zero;
	}

	ret = -EINVAL;
	if (len < IFNAMSIZ)
		goto out;

	ret = netdev_get_name(net, devname, bound_dev_if);
	if (ret)
		goto out;

	len = strlen(devname) + 1;

	ret = -EFAULT;
	if (copy_to_sockptr(optval, devname, len))
		goto out;

zero:
	ret = -EFAULT;
	if (copy_to_sockptr(optlen, &len, sizeof(int)))
		goto out;

	ret = 0;

out:
#endif

	return ret;
}

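/* Decide whether multicast packets sent on @sk should be looped back to
 * local listeners. Defaults to true for unknown families, with a
 * one-time warning.
 */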
bool sk_mc_loop(const struct sock *sk)
{
	if (dev_recursion_level())
		return false;
	if (!sk)
		return true;

	switch (READ_ONCE(sk->sk_family)) {
	case AF_INET:
		return inet_test_bit(MC_LOOP, sk);
#if IS_ENABLED(CONFIG_IPV6)
	case AF_INET6:
		return inet6_test_bit(MC6_LOOP, sk);
#endif
	}
	WARN_ON_ONCE(1);
	return true;
}
EXPORT_SYMBOL(sk_mc_loop);

void sock_set_reuseaddr(struct sock *sk)
{
	lock_sock(sk);
	sk->sk_reuse = SK_CAN_REUSE;
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_reuseaddr);

void sock_set_reuseport(struct sock *sk)
{
	lock_sock(sk);
	sk->sk_reuseport = true;
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_reuseport);

void sock_no_linger(struct sock *sk)
{
	lock_sock(sk);
	WRITE_ONCE(sk->sk_lingertime, 0);
	sock_set_flag(sk, SOCK_LINGER);
	release_sock(sk);
}
EXPORT_SYMBOL(sock_no_linger);

void sock_set_priority(struct sock *sk, u32 priority)
{
	WRITE_ONCE(sk->sk_priority, priority);
}
EXPORT_SYMBOL(sock_set_priority);

void sock_set_sndtimeo(struct sock *sk, s64 secs)
{
	lock_sock(sk);
	if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
		WRITE_ONCE(sk->sk_sndtimeo, secs * HZ);
	else
		WRITE_ONCE(sk->sk_sndtimeo, MAX_SCHEDULE_TIMEOUT);
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_sndtimeo);

static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
{
	if (val) {
		sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
		sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
		sock_set_flag(sk, SOCK_RCVTSTAMP);
		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
	} else {
		sock_reset_flag(sk, SOCK_RCVTSTAMP);
		sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
	}
}

void sock_enable_timestamps(struct sock *sk)
{
	lock_sock(sk);
	__sock_set_timestamps(sk, true, false, true);
	release_sock(sk);
}
EXPORT_SYMBOL(sock_enable_timestamps);

void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
{
	switch (optname) {
	case SO_TIMESTAMP_OLD:
		__sock_set_timestamps(sk, valbool, false, false);
		break;
	case SO_TIMESTAMP_NEW:
		__sock_set_timestamps(sk, valbool, true, false);
		break;
	case SO_TIMESTAMPNS_OLD:
		__sock_set_timestamps(sk, valbool, false, true);
		break;
	case SO_TIMESTAMPNS_NEW:
		__sock_set_timestamps(sk, valbool, true, true);
		break;
	}
}

static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
{
	struct net *net = sock_net(sk);
	struct net_device *dev = NULL;
	bool match = false;
	int *vclock_index;
	int i, num;

	if (sk->sk_bound_dev_if)
		dev = dev_get_by_index(net, sk->sk_bound_dev_if);

	if (!dev) {
		pr_err("%s: sock not bind to device\n", __func__);
		return -EOPNOTSUPP;
	}

	num = ethtool_get_phc_vclocks(dev, &vclock_index);
	dev_put(dev);

	for (i = 0; i < num; i++) {
		if (*(vclock_index + i) == phc_index) {
			match = true;
			break;
		}
	}

	if (num > 0)
		kfree(vclock_index);

	if (!match)
		return -EINVAL;

	WRITE_ONCE(sk->sk_bind_phc, phc_index);

	return 0;
}

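/* Validate and apply an SO_TIMESTAMPING request. OPT_ID seeds sk_tskey
 * from the TCP write sequence (or snd_una) on TCP sockets, BIND_PHC is
 * checked against the bound device's PHC vclocks, and RX software
 * timestamping is switched on or off globally as needed.
 */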
int sock_set_timestamping(struct sock *sk, int optname,
			  struct so_timestamping timestamping)
{
	int val = timestamping.flags;
	int ret;

	if (val & ~SOF_TIMESTAMPING_MASK)
		return -EINVAL;

	if (val & SOF_TIMESTAMPING_OPT_ID_TCP &&
	    !(val & SOF_TIMESTAMPING_OPT_ID))
		return -EINVAL;

	if (val & SOF_TIMESTAMPING_OPT_ID &&
	    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
		if (sk_is_tcp(sk)) {
			if ((1 << sk->sk_state) &
			    (TCPF_CLOSE | TCPF_LISTEN))
				return -EINVAL;
			if (val & SOF_TIMESTAMPING_OPT_ID_TCP)
				atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq);
			else
				atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
		} else {
			atomic_set(&sk->sk_tskey, 0);
		}
	}

	if (val & SOF_TIMESTAMPING_OPT_STATS &&
	    !(val & SOF_TIMESTAMPING_OPT_TSONLY))
		return -EINVAL;

	if (val & SOF_TIMESTAMPING_BIND_PHC) {
		ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
		if (ret)
			return ret;
	}

	WRITE_ONCE(sk->sk_tsflags, val);
	sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);

	if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
		sock_enable_timestamp(sk,
				      SOCK_TIMESTAMPING_RX_SOFTWARE);
	else
		sock_disable_timestamp(sk,
				       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
	return 0;
}

void sock_set_keepalive(struct sock *sk)
{
	lock_sock(sk);
	if (sk->sk_prot->keepalive)
		sk->sk_prot->keepalive(sk, true);
	sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_keepalive);

static void __sock_set_rcvbuf(struct sock *sk, int val)
{
	/* Ensure val * 2 fits into an int, to prevent max_t()
	 * from treating it as a negative value.
	 */
	val = min_t(int, val, INT_MAX / 2);
	sk->sk_userlocks |= SOCK_RCVBUF_LOCK;

	/* We double it on the way in to account for "struct sk_buff" etc.
	 * overhead.   Applications assume that the SO_RCVBUF setting they make
	 * will allow that much actual data to be received on that socket.
	 *
	 * Applications are unaware that "struct sk_buff" and other overheads
	 * allocate from the receive buffer during socket buffer allocation.
	 *
	 * And after considering the possible alternatives, returning the value
	 * we actually used in getsockopt is the most desirable behavior.
	 */
	WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
}

void sock_set_rcvbuf(struct sock *sk, int val)
{
	lock_sock(sk);
	__sock_set_rcvbuf(sk, val);
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_rcvbuf);

static void __sock_set_mark(struct sock *sk, u32 val)
{
	if (val != sk->sk_mark) {
		WRITE_ONCE(sk->sk_mark, val);
		sk_dst_reset(sk);
	}
}

void sock_set_mark(struct sock *sk, u32 val)
{
	lock_sock(sk);
	__sock_set_mark(sk, val);
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_mark);

static void sock_release_reserved_memory(struct sock *sk, int bytes)
{
	/* Round down bytes to multiple of pages */
	bytes = round_down(bytes, PAGE_SIZE);

	WARN_ON(bytes > sk->sk_reserved_mem);
	WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem - bytes);
	sk_mem_reclaim(sk);
}

static int sock_reserve_memory(struct sock *sk, int bytes)
{
	long allocated;
	bool charged;
	int pages;

	if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk))
		return -EOPNOTSUPP;

	if (!bytes)
		return 0;

	pages = sk_mem_pages(bytes);

	/* pre-charge to memcg */
	charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages,
					  GFP_KERNEL | __GFP_RETRY_MAYFAIL);
	if (!charged)
		return -ENOMEM;

	/* pre-charge to forward_alloc */
	sk_memory_allocated_add(sk, pages);
	allocated = sk_memory_allocated(sk);
	/* If the system goes into memory pressure with this
	 * precharge, give up and return error.
	 */
	if (allocated > sk_prot_mem_limits(sk, 1)) {
		sk_memory_allocated_sub(sk, pages);
		mem_cgroup_uncharge_skmem(sk->sk_memcg, pages);
		return -ENOMEM;
	}
	sk_forward_alloc_add(sk, pages << PAGE_SHIFT);

	WRITE_ONCE(sk->sk_reserved_mem,
		   sk->sk_reserved_mem + (pages << PAGE_SHIFT));

	return 0;
}

void sockopt_lock_sock(struct sock *sk)
{
	/* When current->bpf_ctx is set, the setsockopt is called from
	 * a bpf prog.  bpf has ensured the sk lock has been
	 * acquired before calling setsockopt().
	 */
	if (has_current_bpf_ctx())
		return;

	lock_sock(sk);
}
EXPORT_SYMBOL(sockopt_lock_sock);

void sockopt_release_sock(struct sock *sk)
{
	if (has_current_bpf_ctx())
		return;

	release_sock(sk);
}
EXPORT_SYMBOL(sockopt_release_sock);

bool sockopt_ns_capable(struct user_namespace *ns, int cap)
{
	return has_current_bpf_ctx() || ns_capable(ns, cap);
}
EXPORT_SYMBOL(sockopt_ns_capable);

bool sockopt_capable(int cap)
{
	return has_current_bpf_ctx() || capable(cap);
}
EXPORT_SYMBOL(sockopt_capable);

/*
 *	This is meant for all protocols to use and covers goings on
 *	at the socket level. Everything here is generic.
 */
int sk_setsockopt(struct sock *sk, int level, int optname,
		  sockptr_t optval, unsigned int optlen)
{
	struct so_timestamping timestamping;
	struct socket *sock = sk->sk_socket;
	struct sock_txtime sk_txtime;
	int val;
	int valbool;
	struct linger ling;
	int ret = 0;

	/*
	 *	Options without arguments
	 */

	if (optname == SO_BINDTODEVICE)
		return sock_setbindtodevice(sk, optval, optlen);

	if (optlen < sizeof(int))
		return -EINVAL;

	if (copy_from_sockptr(&val, optval, sizeof(val)))
		return -EFAULT;

	valbool = val ? 1 : 0;

	/* handle options which do not require locking the socket. */
	switch (optname) {
	case SO_PRIORITY:
		if ((val >= 0 && val <= 6) ||
		    sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
		    sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
			sock_set_priority(sk, val);
			return 0;
		}
		return -EPERM;
	case SO_PASSSEC:
		assign_bit(SOCK_PASSSEC, &sock->flags, valbool);
		return 0;
	case SO_PASSCRED:
		assign_bit(SOCK_PASSCRED, &sock->flags, valbool);
		return 0;
	case SO_PASSPIDFD:
		assign_bit(SOCK_PASSPIDFD, &sock->flags, valbool);
		return 0;
	case SO_TYPE:
	case SO_PROTOCOL:
	case SO_DOMAIN:
	case SO_ERROR:
		return -ENOPROTOOPT;
#ifdef CONFIG_NET_RX_BUSY_POLL
	case SO_BUSY_POLL:
		if (val < 0)
			return -EINVAL;
		WRITE_ONCE(sk->sk_ll_usec, val);
		return 0;
	case SO_PREFER_BUSY_POLL:
		if (valbool && !sockopt_capable(CAP_NET_ADMIN))
			return -EPERM;
		WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
		return 0;
	case SO_BUSY_POLL_BUDGET:
		if (val > READ_ONCE(sk->sk_busy_poll_budget) &&
		    !sockopt_capable(CAP_NET_ADMIN))
			return -EPERM;
		if (val < 0 || val > U16_MAX)
			return -EINVAL;
		WRITE_ONCE(sk->sk_busy_poll_budget, val);
		return 0;
#endif
	case SO_MAX_PACING_RATE:
		{
		unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
		unsigned long pacing_rate;

		if (sizeof(ulval) != sizeof(val) &&
		    optlen >= sizeof(ulval) &&
		    copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
			return -EFAULT;
		}
		if (ulval != ~0UL)
			cmpxchg(&sk->sk_pacing_status,
				SK_PACING_NONE,
				SK_PACING_NEEDED);
		/* Pairs with READ_ONCE() from sk_getsockopt() */
		WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
		pacing_rate = READ_ONCE(sk->sk_pacing_rate);
		if (ulval < pacing_rate)
			WRITE_ONCE(sk->sk_pacing_rate, ulval);
		return 0;
		}
	case SO_TXREHASH:
		if (val < -1 || val > 1)
			return -EINVAL;
		if ((u8)val == SOCK_TXREHASH_DEFAULT)
			val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash);
		/* Paired with READ_ONCE() in tcp_rtx_synack()
		 * and sk_getsockopt().
		 */
		WRITE_ONCE(sk->sk_txrehash, (u8)val);
		return 0;
	}

	sockopt_lock_sock(sk);

	switch (optname) {
	case SO_DEBUG:
		if (val && !sockopt_capable(CAP_NET_ADMIN))
			ret = -EACCES;
		else
			sock_valbool_flag(sk, SOCK_DBG, valbool);
		break;
	case SO_REUSEADDR:
		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
		break;
	case SO_REUSEPORT:
		sk->sk_reuseport = valbool;
		break;
	case SO_DONTROUTE:
		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
		sk_dst_reset(sk);
		break;
	case SO_BROADCAST:
		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
		break;
	case SO_SNDBUF:
		/* Don't error on this BSD doesn't and if you think
		 * about it this is right. Otherwise apps have to
		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
		 * are treated in BSD as hints
		 */
		val = min_t(u32, val, READ_ONCE(sysctl_wmem_max));
set_sndbuf:
		/* Ensure val * 2 fits into an int, to prevent max_t()
		 * from treating it as a negative value.
		 */
		val = min_t(int, val, INT_MAX / 2);
		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
		WRITE_ONCE(sk->sk_sndbuf,
			   max_t(int, val * 2, SOCK_MIN_SNDBUF));
		/* Wake up sending tasks if we upped the value. */
		sk->sk_write_space(sk);
		break;

	case SO_SNDBUFFORCE:
		if (!sockopt_capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}

		/* No negative values (to prevent underflow, as val will be
		 * multiplied by 2).
		 */
		if (val < 0)
			val = 0;
		goto set_sndbuf;

	case SO_RCVBUF:
		/* Don't error on this BSD doesn't and if you think
		 * about it this is right. Otherwise apps have to
		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
		 * are treated in BSD as hints
		 */
		__sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max)));
		break;

	case SO_RCVBUFFORCE:
		if (!sockopt_capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}

		/* No negative values (to prevent underflow, as val will be
		 * multiplied by 2).
		 */
		__sock_set_rcvbuf(sk, max(val, 0));
		break;

	case SO_KEEPALIVE:
		if (sk->sk_prot->keepalive)
			sk->sk_prot->keepalive(sk, valbool);
		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
		break;

	case SO_OOBINLINE:
		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
		break;

	case SO_NO_CHECK:
		sk->sk_no_check_tx = valbool;
		break;

	case SO_LINGER:
		if (optlen < sizeof(ling)) {
			ret = -EINVAL;
			break;
		}
		if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
			ret = -EFAULT;
			break;
		}
		if (!ling.l_onoff) {
			sock_reset_flag(sk, SOCK_LINGER);
		} else {
			unsigned long t_sec = ling.l_linger;

			if (t_sec >= MAX_SCHEDULE_TIMEOUT / HZ)
				WRITE_ONCE(sk->sk_lingertime, MAX_SCHEDULE_TIMEOUT);
			else
				WRITE_ONCE(sk->sk_lingertime, t_sec * HZ);
			sock_set_flag(sk, SOCK_LINGER);
		}
		break;

	case SO_BSDCOMPAT:
		break;

	case SO_TIMESTAMP_OLD:
	case SO_TIMESTAMP_NEW:
	case SO_TIMESTAMPNS_OLD:
	case SO_TIMESTAMPNS_NEW:
		sock_set_timestamp(sk, optname, valbool);
		break;

	case SO_TIMESTAMPING_NEW:
	case SO_TIMESTAMPING_OLD:
		if (optlen == sizeof(timestamping)) {
			if (copy_from_sockptr(&timestamping, optval,
					      sizeof(timestamping))) {
				ret = -EFAULT;
				break;
			}
		} else {
			memset(&timestamping, 0, sizeof(timestamping));
			timestamping.flags = val;
		}
		ret = sock_set_timestamping(sk, optname, timestamping);
		break;

	case SO_RCVLOWAT:
		{
		int (*set_rcvlowat)(struct sock *sk, int val) = NULL;

		if (val < 0)
			val = INT_MAX;
		if (sock)
			set_rcvlowat = READ_ONCE(sock->ops)->set_rcvlowat;
		if (set_rcvlowat)
			ret = set_rcvlowat(sk, val);
		else
			WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
		break;
		}
	case SO_RCVTIMEO_OLD:
	case SO_RCVTIMEO_NEW:
		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
				       optlen, optname == SO_RCVTIMEO_OLD);
		break;

	case SO_SNDTIMEO_OLD:
	case SO_SNDTIMEO_NEW:
		ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
				       optlen, optname == SO_SNDTIMEO_OLD);
		break;

	case SO_ATTACH_FILTER: {
		struct sock_fprog fprog;

		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
		if (!ret)
			ret = sk_attach_filter(&fprog, sk);
		break;
	}
	case SO_ATTACH_BPF:
		ret = -EINVAL;
		if (optlen == sizeof(u32)) {
			u32 ufd;

			ret = -EFAULT;
			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
				break;

			ret = sk_attach_bpf(ufd, sk);
		}
		break;

	case SO_ATTACH_REUSEPORT_CBPF: {
		struct sock_fprog fprog;

		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
		if (!ret)
			ret = sk_reuseport_attach_filter(&fprog, sk);
		break;
	}
	case SO_ATTACH_REUSEPORT_EBPF:
		ret = -EINVAL;
		if (optlen == sizeof(u32)) {
			u32 ufd;

			ret = -EFAULT;
			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
				break;

			ret = sk_reuseport_attach_bpf(ufd, sk);
		}
		break;

	case SO_DETACH_REUSEPORT_BPF:
		ret = reuseport_detach_prog(sk);
		break;

	case SO_DETACH_FILTER:
		ret = sk_detach_filter(sk);
		break;

	case SO_LOCK_FILTER:
		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
			ret = -EPERM;
		else
			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
		break;

	case SO_MARK:
		if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
		    !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}

		__sock_set_mark(sk, val);
		break;
	case SO_RCVMARK:
		sock_valbool_flag(sk, SOCK_RCVMARK, valbool);
		break;

	case SO_RXQ_OVFL:
		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
		break;

	case SO_WIFI_STATUS:
		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
		break;

	case SO_PEEK_OFF:
		{
		int (*set_peek_off)(struct sock *sk, int val);

		set_peek_off = READ_ONCE(sock->ops)->set_peek_off;
		if (set_peek_off)
			ret = set_peek_off(sk, val);
		else
			ret = -EOPNOTSUPP;
		break;
		}

	case SO_NOFCS:
		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
		break;

	case SO_SELECT_ERR_QUEUE:
		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
		break;

	case SO_INCOMING_CPU:
		reuseport_update_incoming_cpu(sk, val);
		break;

	case SO_CNX_ADVICE:
		if (val == 1)
			dst_negative_advice(sk);
		break;

	case SO_ZEROCOPY:
		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
			if (!(sk_is_tcp(sk) ||
			      (sk->sk_type == SOCK_DGRAM &&
			       sk->sk_protocol == IPPROTO_UDP)))
				ret = -EOPNOTSUPP;
		} else if (sk->sk_family != PF_RDS) {
			ret = -EOPNOTSUPP;
		}
		if (!ret) {
			if (val < 0 || val > 1)
				ret = -EINVAL;
			else
				sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
		}
		break;

	case SO_TXTIME:
		if (optlen != sizeof(struct sock_txtime)) {
			ret = -EINVAL;
			break;
		} else if (copy_from_sockptr(&sk_txtime, optval,
					     sizeof(struct sock_txtime))) {
			ret = -EFAULT;
			break;
		} else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
			ret = -EINVAL;
			break;
		}
		/* CLOCK_MONOTONIC is only used by sch_fq, and this packet
		 * scheduler has enough safe guards.
		 */
		if (sk_txtime.clockid != CLOCK_MONOTONIC &&
		    !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}
		sock_valbool_flag(sk, SOCK_TXTIME, true);
		sk->sk_clockid = sk_txtime.clockid;
		sk->sk_txtime_deadline_mode =
			!!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
		sk->sk_txtime_report_errors =
			!!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
		break;

	case SO_BINDTOIFINDEX:
		ret = sock_bindtoindex_locked(sk, val);
		break;

	case SO_BUF_LOCK:
		if (val & ~SOCK_BUF_LOCK_MASK) {
			ret = -EINVAL;
			break;
		}
		sk->sk_userlocks = val | (sk->sk_userlocks &
					  ~SOCK_BUF_LOCK_MASK);
		break;

	case SO_RESERVE_MEM:
		{
		int delta;

		if (val < 0) {
			ret = -EINVAL;
			break;
		}

		delta = val - sk->sk_reserved_mem;
		if (delta < 0)
			sock_release_reserved_memory(sk, -delta);
		else
			ret = sock_reserve_memory(sk, delta);
		break;
		}

	default:
		ret = -ENOPROTOOPT;
		break;
	}
	sockopt_release_sock(sk);
	return ret;
}

int sock_setsockopt(struct socket *sock, int level, int optname,
		    sockptr_t optval, unsigned int optlen)
{
	return sk_setsockopt(sock->sk, level, optname,
			     optval, optlen);
}
EXPORT_SYMBOL(sock_setsockopt);

static const struct cred *sk_get_peer_cred(struct sock *sk)
{
	const struct cred *cred;

	spin_lock(&sk->sk_peer_lock);
	cred = get_cred(sk->sk_peer_cred);
	spin_unlock(&sk->sk_peer_lock);

	return cred;
}

static void cred_to_ucred(struct pid *pid, const struct cred *cred,
			  struct ucred *ucred)
{
	ucred->pid = pid_vnr(pid);
	ucred->uid = ucred->gid = -1;
	if (cred) {
		struct user_namespace *current_ns = current_user_ns();

		ucred->uid = from_kuid_munged(current_ns, cred->euid);
		ucred->gid = from_kgid_munged(current_ns, cred->egid);
	}
}

static int groups_to_user(sockptr_t dst, const struct group_info *src)
{
	struct user_namespace *user_ns = current_user_ns();
	int i;

	for (i = 0; i < src->ngroups; i++) {
		gid_t gid = from_kgid_munged(user_ns, src->gid[i]);

		if (copy_to_sockptr_offset(dst, i * sizeof(gid), &gid, sizeof(gid)))
			return -EFAULT;
	}

	return 0;
}

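/* getsockopt() twin of sk_setsockopt(): read socket-level options into
 * @optval and report the result length through @optlen. Most fields are
 * read locklessly with READ_ONCE(), pairing with the setters.
 */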
int sk_getsockopt(struct sock *sk, int level, int optname,
		  sockptr_t optval, sockptr_t optlen)
{
	struct socket *sock = sk->sk_socket;

	union {
		int val;
		u64 val64;
		unsigned long ulval;
		struct linger ling;
		struct old_timeval32 tm32;
		struct __kernel_old_timeval tm;
		struct __kernel_sock_timeval stm;
		struct sock_txtime txtime;
		struct so_timestamping timestamping;
	} v;

	int lv = sizeof(int);
	int len;

	if (copy_from_sockptr(&len, optlen, sizeof(int)))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	memset(&v, 0, sizeof(v));

	switch (optname) {
	case SO_DEBUG:
		v.val = sock_flag(sk, SOCK_DBG);
		break;

	case SO_DONTROUTE:
		v.val = sock_flag(sk, SOCK_LOCALROUTE);
		break;

	case SO_BROADCAST:
		v.val = sock_flag(sk, SOCK_BROADCAST);
		break;

	case SO_SNDBUF:
		v.val = READ_ONCE(sk->sk_sndbuf);
		break;

	case SO_RCVBUF:
		v.val = READ_ONCE(sk->sk_rcvbuf);
		break;

	case SO_REUSEADDR:
		v.val = sk->sk_reuse;
		break;

	case SO_REUSEPORT:
		v.val = sk->sk_reuseport;
		break;

	case SO_KEEPALIVE:
		v.val = sock_flag(sk, SOCK_KEEPOPEN);
		break;

	case SO_TYPE:
		v.val = sk->sk_type;
		break;

	case SO_PROTOCOL:
		v.val = sk->sk_protocol;
		break;

	case SO_DOMAIN:
		v.val = sk->sk_family;
		break;

	case SO_ERROR:
		v.val = -sock_error(sk);
		if (v.val == 0)
			v.val = xchg(&sk->sk_err_soft, 0);
		break;

	case SO_OOBINLINE:
		v.val = sock_flag(sk, SOCK_URGINLINE);
		break;

	case SO_NO_CHECK:
		v.val = sk->sk_no_check_tx;
		break;

	case SO_PRIORITY:
		v.val = READ_ONCE(sk->sk_priority);
		break;

	case SO_LINGER:
		lv = sizeof(v.ling);
		v.ling.l_onoff = sock_flag(sk, SOCK_LINGER);
		v.ling.l_linger = READ_ONCE(sk->sk_lingertime) / HZ;
		break;

	case SO_BSDCOMPAT:
		break;

	case SO_TIMESTAMP_OLD:
		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
				!sock_flag(sk, SOCK_TSTAMP_NEW) &&
				!sock_flag(sk, SOCK_RCVTSTAMPNS);
		break;

	case SO_TIMESTAMPNS_OLD:
		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
		break;

	case SO_TIMESTAMP_NEW:
		v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
		break;

	case SO_TIMESTAMPNS_NEW:
		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
		break;

	case SO_TIMESTAMPING_OLD:
	case SO_TIMESTAMPING_NEW:
		lv = sizeof(v.timestamping);
		/* For SO_TIMESTAMPING_NEW, only report values that were set
		 * through the _NEW variant of the option (SOCK_TSTAMP_NEW);
		 * the _OLD variant always reports the current flags.
		 */
		if (optname == SO_TIMESTAMPING_OLD || sock_flag(sk, SOCK_TSTAMP_NEW)) {
			v.timestamping.flags = READ_ONCE(sk->sk_tsflags);
			v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc);
		}
		break;

	case SO_RCVTIMEO_OLD:
	case SO_RCVTIMEO_NEW:
		lv = sock_get_timeout(READ_ONCE(sk->sk_rcvtimeo), &v,
				      SO_RCVTIMEO_OLD == optname);
		break;

	case SO_SNDTIMEO_OLD:
	case SO_SNDTIMEO_NEW:
		lv = sock_get_timeout(READ_ONCE(sk->sk_sndtimeo), &v,
				      SO_SNDTIMEO_OLD == optname);
		break;

	case SO_RCVLOWAT:
		v.val = READ_ONCE(sk->sk_rcvlowat);
		break;

	case SO_SNDLOWAT:
		v.val = 1;
		break;

	case SO_PASSCRED:
		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
		break;

	case SO_PASSPIDFD:
		v.val = !!test_bit(SOCK_PASSPIDFD, &sock->flags);
		break;

	case SO_PEERCRED:
	{
		struct ucred peercred;
		if (len > sizeof(peercred))
			len = sizeof(peercred);

		spin_lock(&sk->sk_peer_lock);
		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
		spin_unlock(&sk->sk_peer_lock);

		if (copy_to_sockptr(optval, &peercred, len))
			return -EFAULT;
		goto lenout;
	}

	case SO_PEERPIDFD:
	{
		struct pid *peer_pid;
		struct file *pidfd_file = NULL;
		int pidfd;

		if (len > sizeof(pidfd))
			len = sizeof(pidfd);

		spin_lock(&sk->sk_peer_lock);
		peer_pid = get_pid(sk->sk_peer_pid);
		spin_unlock(&sk->sk_peer_lock);

		if (!peer_pid)
			return -ENODATA;

		pidfd = pidfd_prepare(peer_pid, 0, &pidfd_file);
		put_pid(peer_pid);
		if (pidfd < 0)
			return pidfd;

		if (copy_to_sockptr(optval, &pidfd, len) ||
		    copy_to_sockptr(optlen, &len, sizeof(int))) {
			put_unused_fd(pidfd);
			fput(pidfd_file);

			return -EFAULT;
		}

		fd_install(pidfd, pidfd_file);
		return 0;
	}

	case SO_PEERGROUPS:
	{
		const struct cred *cred;
		int ret, n;

		cred = sk_get_peer_cred(sk);
		if (!cred)
			return -ENODATA;

		n = cred->group_info->ngroups;
		if (len < n * sizeof(gid_t)) {
			len = n * sizeof(gid_t);
			put_cred(cred);
			return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE;
		}
		len = n * sizeof(gid_t);

		ret = groups_to_user(optval, cred->group_info);
		put_cred(cred);
		if (ret)
			return ret;
		goto lenout;
	}

	case SO_PEERNAME:
	{
		struct sockaddr_storage address;

		lv = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 2);
		if (lv < 0)
			return -ENOTCONN;
		if (lv < len)
			return -EINVAL;
		if (copy_to_sockptr(optval, &address, len))
			return -EFAULT;
		goto lenout;
	}

	/* Dubious BSD thing... Probably nobody even uses it, but
	 * the UNIX standard wants it for whatever reason... -DaveM
	 */
	case SO_ACCEPTCONN:
		v.val = sk->sk_state == TCP_LISTEN;
		break;

	case SO_PASSSEC:
		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
		break;

	case SO_PEERSEC:
		return security_socket_getpeersec_stream(sock,
							 optval, optlen, len);

	case SO_MARK:
		v.val = READ_ONCE(sk->sk_mark);
		break;

	case SO_RCVMARK:
		v.val = sock_flag(sk, SOCK_RCVMARK);
		break;

	case SO_RXQ_OVFL:
		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
		break;

	case SO_WIFI_STATUS:
		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
		break;

	case SO_PEEK_OFF:
		if (!READ_ONCE(sock->ops)->set_peek_off)
			return -EOPNOTSUPP;

		v.val = READ_ONCE(sk->sk_peek_off);
		break;
	case SO_NOFCS:
		v.val = sock_flag(sk, SOCK_NOFCS);
		break;

	case SO_BINDTODEVICE:
		return sock_getbindtodevice(sk, optval, optlen, len);

	case SO_GET_FILTER:
		len = sk_get_filter(sk, optval, len);
		if (len < 0)
			return len;

		goto lenout;

	case SO_LOCK_FILTER:
		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
		break;

	case SO_BPF_EXTENSIONS:
		v.val = bpf_tell_extensions();
		break;

	case SO_SELECT_ERR_QUEUE:
		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
		break;

#ifdef CONFIG_NET_RX_BUSY_POLL
	case SO_BUSY_POLL:
		v.val = READ_ONCE(sk->sk_ll_usec);
		break;
	case SO_PREFER_BUSY_POLL:
		v.val = READ_ONCE(sk->sk_prefer_busy_poll);
		break;
#endif

	case SO_MAX_PACING_RATE:
		/* The READ_ONCE() pair with the WRITE_ONCE() in sk_setsockopt() */
		if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
			lv = sizeof(v.ulval);
			v.ulval = READ_ONCE(sk->sk_max_pacing_rate);
		} else {
			/* 32bit version */
			v.val = min_t(unsigned long, ~0U,
				      READ_ONCE(sk->sk_max_pacing_rate));
		}
		break;

	case SO_INCOMING_CPU:
		v.val = READ_ONCE(sk->sk_incoming_cpu);
		break;

	case SO_MEMINFO:
	{
		u32 meminfo[SK_MEMINFO_VARS];

		sk_get_meminfo(sk, meminfo);

		len = min_t(unsigned int, len, sizeof(meminfo));
		if (copy_to_sockptr(optval, &meminfo, len))
			return -EFAULT;

		goto lenout;
	}

#ifdef CONFIG_NET_RX_BUSY_POLL
	case SO_INCOMING_NAPI_ID:
		v.val = READ_ONCE(sk->sk_napi_id);

		/* aggregate non-NAPI IDs down to 0 */
		if (v.val < MIN_NAPI_ID)
			v.val = 0;

		break;
#endif

	case SO_COOKIE:
		lv = sizeof(u64);
		if (len < lv)
			return -EINVAL;
		v.val64 = sock_gen_cookie(sk);
		break;

	case SO_ZEROCOPY:
		v.val = sock_flag(sk, SOCK_ZEROCOPY);
		break;

	case SO_TXTIME:
		lv = sizeof(v.txtime);
		v.txtime.clockid = sk->sk_clockid;
		v.txtime.flags |= sk->sk_txtime_deadline_mode ?
				  SOF_TXTIME_DEADLINE_MODE : 0;
		v.txtime.flags |= sk->sk_txtime_report_errors ?
				  SOF_TXTIME_REPORT_ERRORS : 0;
		break;

	case SO_BINDTOIFINDEX:
		v.val = READ_ONCE(sk->sk_bound_dev_if);
		break;

	case SO_NETNS_COOKIE:
		lv = sizeof(u64);
		if (len != lv)
			return -EINVAL;
		v.val64 = sock_net(sk)->net_cookie;
		break;

	case SO_BUF_LOCK:
		v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
		break;

	case SO_RESERVE_MEM:
		v.val = READ_ONCE(sk->sk_reserved_mem);
		break;

	case SO_TXREHASH:
		/* Paired with WRITE_ONCE() in sk_setsockopt() */
		v.val = READ_ONCE(sk->sk_txrehash);
		break;

	default:
		/* We implement the SO_SNDLOWAT etc to not be settable
		 * (1003.1g 7).
		 */
		return -ENOPROTOOPT;
	}

	if (len > lv)
		len = lv;
	if (copy_to_sockptr(optval, &v, len))
		return -EFAULT;
lenout:
	if (copy_to_sockptr(optlen, &len, sizeof(int)))
		return -EFAULT;
	return 0;
}

/*
 * Initialize an sk_lock.
 *
 * (We also register the sk_lock with the lock validator.)
 */
static inline void sock_lock_init(struct sock *sk)
{
	if (sk->sk_kern_sock)
		sock_lock_init_class_and_name(
			sk,
			af_family_kern_slock_key_strings[sk->sk_family],
			af_family_kern_slock_keys + sk->sk_family,
			af_family_kern_key_strings[sk->sk_family],
			af_family_kern_keys + sk->sk_family);
	else
		sock_lock_init_class_and_name(
			sk,
			af_family_slock_key_strings[sk->sk_family],
			af_family_slock_keys + sk->sk_family,
			af_family_key_strings[sk->sk_family],
			af_family_keys + sk->sk_family);
}

/*
 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
 * even temporarily, because of RCU lookups. sk_node should also be left as is.
 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
 */
static void sock_copy(struct sock *nsk, const struct sock *osk)
{
	const struct proto *prot = READ_ONCE(osk->sk_prot);
#ifdef CONFIG_SECURITY_NETWORK
	void *sptr = nsk->sk_security;
#endif

	/* If we move sk_tx_queue_mapping out of the private section,
	 * we must check if sk_tx_queue_clear() is called after
	 * sock_copy() in sk_clone_lock().
	 */
	BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
		     offsetof(struct sock, sk_dontcopy_begin) ||
		     offsetof(struct sock, sk_tx_queue_mapping) >=
		     offsetof(struct sock, sk_dontcopy_end));

	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));

	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
	       prot->obj_size - offsetof(struct sock, sk_dontcopy_end));

#ifdef CONFIG_SECURITY_NETWORK
	nsk->sk_security = sptr;
	security_sk_clone(osk, nsk);
#endif
}

static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
				  int family)
{
	struct sock *sk;
	struct kmem_cache *slab;

	slab = prot->slab;
	if (slab != NULL) {
		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
		if (!sk)
			return sk;
		if (want_init_on_alloc(priority))
			sk_prot_clear_nulls(sk, prot->obj_size);
	} else
		sk = kmalloc(prot->obj_size, priority);

	if (sk != NULL) {
		if (security_sk_alloc(sk, family, priority))
			goto out_free;

		if (!try_module_get(prot->owner))
			goto out_free_sec;
	}

	return sk;

out_free_sec:
	security_sk_free(sk);
out_free:
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	return NULL;
}

static void sk_prot_free(struct proto *prot, struct sock *sk)
{
	struct kmem_cache *slab;
	struct module *owner;

	owner = prot->owner;
	slab = prot->slab;

	cgroup_sk_free(&sk->sk_cgrp_data);
	mem_cgroup_sk_free(sk);
	security_sk_free(sk);
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	module_put(owner);
}

/**
 *	sk_alloc - All socket objects are allocated here
 *	@net: the applicable net namespace
 *	@family: protocol family
 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *	@prot: struct proto associated with this new sock instance
 *	@kern: is this to be a kernel socket?
 */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
		      struct proto *prot, int kern)
{
	struct sock *sk;

	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
	if (sk) {
		sk->sk_family = family;
		/*
		 * See comment in struct sock definition to understand
		 * why we need sk_prot_creator -acme
		 */
		sk->sk_prot = sk->sk_prot_creator = prot;
		sk->sk_kern_sock = kern;
		sock_lock_init(sk);
		sk->sk_net_refcnt = kern ? 0 : 1;
		if (likely(sk->sk_net_refcnt)) {
			get_net_track(net, &sk->ns_tracker, priority);
			sock_inuse_add(net, 1);
		} else {
			__netns_tracker_alloc(net, &sk->ns_tracker,
					      false, priority);
		}

		sock_net_set(sk, net);
		refcount_set(&sk->sk_wmem_alloc, 1);

		mem_cgroup_sk_alloc(sk);
		cgroup_sk_alloc(&sk->sk_cgrp_data);
		sock_update_classid(&sk->sk_cgrp_data);
		sock_update_netprioidx(&sk->sk_cgrp_data);
		sk_tx_queue_clear(sk);
	}

	return sk;
}
EXPORT_SYMBOL(sk_alloc);

/* Sockets having SOCK_RCU_FREE will call this function after one RCU
 * grace period. This is the case for UDP sockets and TCP listeners.
 */
static void __sk_destruct(struct rcu_head *head)
{
	struct sock *sk = container_of(head, struct sock, sk_rcu);
	struct sk_filter *filter;

	if (sk->sk_destruct)
		sk->sk_destruct(sk);

	filter = rcu_dereference_check(sk->sk_filter,
				       refcount_read(&sk->sk_wmem_alloc) == 0);
	if (filter) {
		sk_filter_uncharge(sk, filter);
		RCU_INIT_POINTER(sk->sk_filter, NULL);
	}

	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);

#ifdef CONFIG_BPF_SYSCALL
	bpf_sk_storage_free(sk);
#endif

	if (atomic_read(&sk->sk_omem_alloc))
		pr_debug("%s: optmem leakage (%d bytes) detected\n",
			 __func__, atomic_read(&sk->sk_omem_alloc));

	if (sk->sk_frag.page) {
		put_page(sk->sk_frag.page);
		sk->sk_frag.page = NULL;
	}

	/* We do not need to acquire sk->sk_peer_lock, we are the last user. */
	put_cred(sk->sk_peer_cred);
	put_pid(sk->sk_peer_pid);

	if (likely(sk->sk_net_refcnt))
		put_net_track(sock_net(sk), &sk->ns_tracker);
	else
		__netns_tracker_free(sock_net(sk), &sk->ns_tracker, false);

	sk_prot_free(sk->sk_prot_creator, sk);
}

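/* Tear down @sk, deferring through call_rcu() when the socket is
 * RCU-freed (SOCK_RCU_FREE) or attached to a reuseport group, and
 * destroying it immediately otherwise.
 */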
void sk_destruct(struct sock *sk)
{
	bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);

	if (rcu_access_pointer(sk->sk_reuseport_cb)) {
		reuseport_detach_sock(sk);
		use_call_rcu = true;
	}

	if (use_call_rcu)
		call_rcu(&sk->sk_rcu, __sk_destruct);
	else
		__sk_destruct(&sk->sk_rcu);
}

static void __sk_free(struct sock *sk)
{
	if (likely(sk->sk_net_refcnt))
		sock_inuse_add(sock_net(sk), -1);

	if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
		sock_diag_broadcast_destroy(sk);
	else
		sk_destruct(sk);
}

void sk_free(struct sock *sk)
{
	/*
	 * We subtract one from sk_wmem_alloc and can know if
	 * some packets are still in some tx queue.
	 * If not null, sock_wfree() will call __sk_free(sk) later
	 */
	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
		__sk_free(sk);
}
EXPORT_SYMBOL(sk_free);

static void sk_init_common(struct sock *sk)
{
	skb_queue_head_init(&sk->sk_receive_queue);
	skb_queue_head_init(&sk->sk_write_queue);
	skb_queue_head_init(&sk->sk_error_queue);

	rwlock_init(&sk->sk_callback_lock);
	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
				   af_rlock_keys + sk->sk_family,
				   af_family_rlock_key_strings[sk->sk_family]);
	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
				   af_wlock_keys + sk->sk_family,
				   af_family_wlock_key_strings[sk->sk_family]);
	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
				   af_elock_keys + sk->sk_family,
				   af_family_elock_key_strings[sk->sk_family]);
	lockdep_set_class_and_name(&sk->sk_callback_lock,
				   af_callback_keys + sk->sk_family,
				   af_family_clock_key_strings[sk->sk_family]);
}

/**
 *	sk_clone_lock - clone a socket, and lock its clone
 *	@sk: the socket to clone
 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *
 *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
 */
struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
{
	struct proto *prot = READ_ONCE(sk->sk_prot);
	struct sk_filter *filter;
	bool is_charged = true;
	struct sock *newsk;

	newsk = sk_prot_alloc(prot, priority, sk->sk_family);
	if (!newsk)
		goto out;

	sock_copy(newsk, sk);

	newsk->sk_prot_creator = prot;

	/* SANITY */
	if (likely(newsk->sk_net_refcnt)) {
		get_net_track(sock_net(newsk), &newsk->ns_tracker, priority);
		sock_inuse_add(sock_net(newsk), 1);
	} else {
		/* Kernel sockets are not elevating the struct net refcount.
		 * Instead, use a tracker to more easily detect if a layer
		 * is not properly dereferencing its refcounted created
		 * socket.
		 */
		__netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker,
				      false, priority);
	}
	sk_node_init(&newsk->sk_node);
	sock_lock_init(newsk);
	bh_lock_sock(newsk);
	newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
	newsk->sk_backlog.len = 0;

	atomic_set(&newsk->sk_rmem_alloc, 0);

	/* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
	refcount_set(&newsk->sk_wmem_alloc, 1);

	atomic_set(&newsk->sk_omem_alloc, 0);
	sk_init_common(newsk);

	newsk->sk_dst_cache = NULL;
	newsk->sk_dst_pending_confirm = 0;
	newsk->sk_wmem_queued = 0;
	newsk->sk_forward_alloc = 0;
	newsk->sk_reserved_mem = 0;
	atomic_set(&newsk->sk_drops, 0);
	newsk->sk_send_head = NULL;
	newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
	atomic_set(&newsk->sk_zckey, 0);

	sock_reset_flag(newsk, SOCK_DONE);

	/* sk->sk_memcg will be populated at accept() time */
	newsk->sk_memcg = NULL;

	cgroup_sk_clone(&newsk->sk_cgrp_data);

	rcu_read_lock();
	filter = rcu_dereference(sk->sk_filter);
	if (filter != NULL)
		/* though it's an empty new sock, the charging may fail
		 * if sysctl_optmem_max was changed between creation of
		 * original socket and cloning
		 */
		is_charged = sk_filter_charge(newsk, filter);
	RCU_INIT_POINTER(newsk->sk_filter, filter);
	rcu_read_unlock();

	if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
		/* We need to make sure that we don't uncharge the new
		 * socket if we couldn't charge it in the first place, as
		 * otherwise we uncharge the parent's filter.
		 */
		if (!is_charged)
			RCU_INIT_POINTER(newsk->sk_filter, NULL);
		sk_free_unlock_clone(newsk);
		newsk = NULL;
		goto out;
	}
	RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);

	if (bpf_sk_storage_clone(sk, newsk)) {
		sk_free_unlock_clone(newsk);
		newsk = NULL;
		goto out;
	}

	/* Clear sk_user_data if parent had the pointer tagged
	 * with SOCK_USER_DATA_NOCOPY.
	 */
	if (sk_user_data_is_nocopy(newsk))
		newsk->sk_user_data = NULL;

	newsk->sk_err = 0;
	newsk->sk_err_soft = 0;
	newsk->sk_priority = 0;
	newsk->sk_incoming_cpu = raw_smp_processor_id();

	/* Before updating sk_refcnt, we must commit prior changes to memory
	 * (Documentation/RCU/rculist_nulls.rst for details)
	 */
	smp_wmb();
	refcount_set(&newsk->sk_refcnt, 2);

	sk_set_socket(newsk, NULL);
	sk_tx_queue_clear(newsk);
	RCU_INIT_POINTER(newsk->sk_wq, NULL);

	if (newsk->sk_prot->sockets_allocated)
		sk_sockets_allocated_inc(newsk);

	if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
		net_enable_timestamp();
out:
	return newsk;
}
EXPORT_SYMBOL_GPL(sk_clone_lock);

void sk_free_unlock_clone(struct sock *sk)
{
	/* It is still raw copy of parent, so invalidate
	 * destructor and make plain sk_free() */
	sk->sk_destruct = NULL;
	bh_unlock_sock(sk);
	sk_free(sk);
}
EXPORT_SYMBOL_GPL(sk_free_unlock_clone);

static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst)
{
	bool is_ipv6 = false;
	u32 max_size;

#if IS_ENABLED(CONFIG_IPV6)
	is_ipv6 = (sk->sk_family == AF_INET6 &&
		   !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr));
#endif
	/* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */
	max_size = is_ipv6 ? READ_ONCE(dst->dev->gso_max_size) :
			READ_ONCE(dst->dev->gso_ipv4_max_size);
	if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk))
		max_size = GSO_LEGACY_MAX_SIZE;

	return max_size - (MAX_TCP_HEADER + 1);
}

void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
{
	u32 max_segs = 1;

	sk->sk_route_caps = dst->dev->features;
	if (sk_is_tcp(sk))
		sk->sk_route_caps |= NETIF_F_GSO;
	if (sk->sk_route_caps & NETIF_F_GSO)
		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
	if (unlikely(sk->sk_gso_disabled))
		sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
	if (sk_can_gso(sk)) {
		if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
		} else {
			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
			sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dst);
			/* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
			max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1);
		}
	}
	sk->sk_gso_max_segs = max_segs;
	sk_dst_set(sk, dst);
}
EXPORT_SYMBOL_GPL(sk_setup_caps);
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461void sock_wfree(struct sk_buff *skb)
2462{
2463 struct sock *sk = skb->sk;
2464 unsigned int len = skb->truesize;
2465 bool free;
2466
2467 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2468 if (sock_flag(sk, SOCK_RCU_FREE) &&
2469 sk->sk_write_space == sock_def_write_space) {
2470 rcu_read_lock();
2471 free = refcount_sub_and_test(len, &sk->sk_wmem_alloc);
2472 sock_def_write_space_wfree(sk);
2473 rcu_read_unlock();
2474 if (unlikely(free))
2475 __sk_free(sk);
2476 return;
2477 }
2478
2479
2480
2481
2482
2483 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2484 sk->sk_write_space(sk);
2485 len = 1;
2486 }
2487
2488
2489
2490
2491 if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2492 __sk_free(sk);
2493}
2494EXPORT_SYMBOL(sock_wfree);
2495
2496
2497
2498
2499void __sock_wfree(struct sk_buff *skb)
2500{
2501 struct sock *sk = skb->sk;
2502
2503 if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2504 __sk_free(sk);
2505}

void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
{
	skb_orphan(skb);
	skb->sk = sk;
#ifdef CONFIG_INET
	if (unlikely(!sk_fullsock(sk))) {
		skb->destructor = sock_edemux;
		sock_hold(sk);
		return;
	}
#endif
	skb->destructor = sock_wfree;
	skb_set_hash_from_sk(skb, sk);

	/* sk_free() keeps one unit in sk_wmem_alloc, so charging
	 * skb->truesize here pins the socket until sock_wfree() has run
	 * for the last in-flight skb.
	 */
	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
}
EXPORT_SYMBOL(skb_set_owner_w);

static bool can_skb_orphan_partial(const struct sk_buff *skb)
{
#ifdef CONFIG_TLS_DEVICE
	/* Drivers depend on in-order delivery for crypto offload,
	 * partial orphan breaks out-of-order-OK logic.
	 */
	if (skb->decrypted)
		return false;
#endif
	return (skb->destructor == sock_wfree ||
		(IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
}

/* This helper is used by netem, as it can hold packets in its
 * delay queue. We want to allow the owner socket to send more
 * packets, as if they were already TX completed by a typical driver.
 * But we also want to keep skb->sk set because some packet schedulers
 * rely on it (sch_fq for example).
 */
void skb_orphan_partial(struct sk_buff *skb)
{
	if (skb_is_tcp_pure_ack(skb))
		return;

	if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
		return;

	skb_orphan(skb);
}
EXPORT_SYMBOL(skb_orphan_partial);

/*
 * Read buffer destructor automatically called from kfree_skb.
 */
void sock_rfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	unsigned int len = skb->truesize;

	atomic_sub(len, &sk->sk_rmem_alloc);
	sk_mem_uncharge(sk, len);
}
EXPORT_SYMBOL(sock_rfree);

/*
 * Buffer destructor for skbs that are not used directly in read or write
 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
 */
void sock_efree(struct sk_buff *skb)
{
	sock_put(skb->sk);
}
EXPORT_SYMBOL(sock_efree);

/* Buffer destructor for prefetch/receive path where reference count may
 * not be held, e.g. for listen sockets.
 */
#ifdef CONFIG_INET
void sock_pfree(struct sk_buff *skb)
{
	if (sk_is_refcounted(skb->sk))
		sock_gen_put(skb->sk);
}
EXPORT_SYMBOL(sock_pfree);
#endif

kuid_t sock_i_uid(struct sock *sk)
{
	kuid_t uid;

	read_lock_bh(&sk->sk_callback_lock);
	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
	read_unlock_bh(&sk->sk_callback_lock);
	return uid;
}
EXPORT_SYMBOL(sock_i_uid);

unsigned long __sock_i_ino(struct sock *sk)
{
	unsigned long ino;

	read_lock(&sk->sk_callback_lock);
	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
	read_unlock(&sk->sk_callback_lock);
	return ino;
}
EXPORT_SYMBOL(__sock_i_ino);

unsigned long sock_i_ino(struct sock *sk)
{
	unsigned long ino;

	local_bh_disable();
	ino = __sock_i_ino(sk);
	local_bh_enable();
	return ino;
}
EXPORT_SYMBOL(sock_i_ino);

/*
 * Allocate a skb from the socket's send buffer.
 */
struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
			     gfp_t priority)
{
	if (force ||
	    refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
		struct sk_buff *skb = alloc_skb(size, priority);

		if (skb) {
			skb_set_owner_w(skb, sk);
			return skb;
		}
	}
	return NULL;
}
EXPORT_SYMBOL(sock_wmalloc);
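
/* Usage sketch (illustrative only): a non-forced allocation is bounded
 * by the send buffer, so callers must be prepared for NULL. "len" and
 * the GFP flags are hypothetical.
 *
 *	skb = sock_wmalloc(sk, len, 0, GFP_ATOMIC);
 *	if (!skb)
 *		return -ENOBUFS;
 *
 * On success skb->truesize has already been charged to sk_wmem_alloc
 * by skb_set_owner_w().
 */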

static void sock_ofree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

	atomic_sub(skb->truesize, &sk->sk_omem_alloc);
}

struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
			     gfp_t priority)
{
	struct sk_buff *skb;

	/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
	if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
	    READ_ONCE(sysctl_optmem_max))
		return NULL;

	skb = alloc_skb(size, priority);
	if (!skb)
		return NULL;

	atomic_add(skb->truesize, &sk->sk_omem_alloc);
	skb->sk = sk;
	skb->destructor = sock_ofree;
	return skb;
}

/*
 * Allocate a memory block from the socket's option memory buffer.
 */
void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
{
	int optmem_max = READ_ONCE(sysctl_optmem_max);

	if ((unsigned int)size <= optmem_max &&
	    atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
		void *mem;
		/* First do the add, to avoid the race if kmalloc
		 * might sleep.
		 */
		atomic_add(size, &sk->sk_omem_alloc);
		mem = kmalloc(size, priority);
		if (mem)
			return mem;
		atomic_sub(size, &sk->sk_omem_alloc);
	}
	return NULL;
}
EXPORT_SYMBOL(sock_kmalloc);

/* Free an option memory block. Note, we actually want the inline
 * here as this allows gcc to detect the nullify and fold away the
 * condition entirely.
 */
static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
				  const bool nullify)
{
	if (WARN_ON_ONCE(!mem))
		return;
	if (nullify)
		kfree_sensitive(mem);
	else
		kfree(mem);
	atomic_sub(size, &sk->sk_omem_alloc);
}

void sock_kfree_s(struct sock *sk, void *mem, int size)
{
	__sock_kfree_s(sk, mem, size, false);
}
EXPORT_SYMBOL(sock_kfree_s);

void sock_kzfree_s(struct sock *sk, void *mem, int size)
{
	__sock_kfree_s(sk, mem, size, true);
}
EXPORT_SYMBOL(sock_kzfree_s);
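
/* Usage sketch (illustrative only): option memory must be released with
 * the same size it was charged with, so sk_omem_alloc stays balanced.
 * "opt" and "optlen" are hypothetical.
 *
 *	opt = sock_kmalloc(sk, optlen, GFP_KERNEL);
 *	if (!opt)
 *		return -ENOBUFS;
 *	...
 *	sock_kfree_s(sk, opt, optlen);
 *
 * Use sock_kzfree_s() instead for buffers holding key material, so the
 * contents are wiped before the memory is returned.
 */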

/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
 * I think, these locks should be removed for datagram sockets.
 */
static long sock_wait_for_wmem(struct sock *sk, long timeo)
{
	DEFINE_WAIT(wait);

	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
	for (;;) {
		if (!timeo)
			break;
		if (signal_pending(current))
			break;
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
		if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
			break;
		if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
			break;
		if (READ_ONCE(sk->sk_err))
			break;
		timeo = schedule_timeout(timeo);
	}
	finish_wait(sk_sleep(sk), &wait);
	return timeo;
}

/*
 *	Generic send/receive buffer handlers
 */
struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
				     unsigned long data_len, int noblock,
				     int *errcode, int max_page_order)
{
	struct sk_buff *skb;
	long timeo;
	int err;

	timeo = sock_sndtimeo(sk, noblock);
	for (;;) {
		err = sock_error(sk);
		if (err != 0)
			goto failure;

		err = -EPIPE;
		if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
			goto failure;

		if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
			break;

		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		err = -EAGAIN;
		if (!timeo)
			goto failure;
		if (signal_pending(current))
			goto interrupted;
		timeo = sock_wait_for_wmem(sk, timeo);
	}
	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
				   errcode, sk->sk_allocation);
	if (skb)
		skb_set_owner_w(skb, sk);
	return skb;

interrupted:
	err = sock_intr_errno(timeo);
failure:
	*errcode = err;
	return NULL;
}
EXPORT_SYMBOL(sock_alloc_send_pskb);
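
/* Usage sketch (illustrative only): a datagram sendmsg() implementation
 * typically blocks here until the send buffer has room, honouring
 * MSG_DONTWAIT and the socket's send timeout. "hlen" and "dlen" are
 * hypothetical header/data lengths.
 *
 *	skb = sock_alloc_send_pskb(sk, hlen, dlen,
 *				   msg->msg_flags & MSG_DONTWAIT,
 *				   &err, 0);
 *	if (!skb)
 *		return err;
 */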

int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
		     struct sockcm_cookie *sockc)
{
	u32 tsflags;

	switch (cmsg->cmsg_type) {
	case SO_MARK:
		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
		    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
			return -EPERM;
		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
			return -EINVAL;
		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
		break;
	case SO_TIMESTAMPING_OLD:
	case SO_TIMESTAMPING_NEW:
		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
			return -EINVAL;

		tsflags = *(u32 *)CMSG_DATA(cmsg);
		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
			return -EINVAL;

		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
		sockc->tsflags |= tsflags;
		break;
	case SCM_TXTIME:
		if (!sock_flag(sk, SOCK_TXTIME))
			return -EINVAL;
		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
			return -EINVAL;
		sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
		break;
	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
	case SCM_RIGHTS:
	case SCM_CREDENTIALS:
		break;
	default:
		return -EINVAL;
	}
	return 0;
}
EXPORT_SYMBOL(__sock_cmsg_send);

int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
		   struct sockcm_cookie *sockc)
{
	struct cmsghdr *cmsg;
	int ret;

	for_each_cmsghdr(cmsg, msg) {
		if (!CMSG_OK(msg, cmsg))
			return -EINVAL;
		if (cmsg->cmsg_level != SOL_SOCKET)
			continue;
		ret = __sock_cmsg_send(sk, cmsg, sockc);
		if (ret)
			return ret;
	}
	return 0;
}
EXPORT_SYMBOL(sock_cmsg_send);
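
/* Usage sketch (illustrative only): protocols seed a cookie with the
 * socket defaults via sockcm_init() and then let SOL_SOCKET control
 * messages override it before building the packet.
 *
 *	struct sockcm_cookie sockc;
 *
 *	sockcm_init(&sockc, sk);
 *	if (msg->msg_controllen) {
 *		err = sock_cmsg_send(sk, msg, &sockc);
 *		if (unlikely(err))
 *			return err;
 *	}
 */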

static void sk_enter_memory_pressure(struct sock *sk)
{
	if (!sk->sk_prot->enter_memory_pressure)
		return;

	sk->sk_prot->enter_memory_pressure(sk);
}

static void sk_leave_memory_pressure(struct sock *sk)
{
	if (sk->sk_prot->leave_memory_pressure) {
		INDIRECT_CALL_INET_1(sk->sk_prot->leave_memory_pressure,
				     tcp_leave_memory_pressure, sk);
	} else {
		unsigned long *memory_pressure = sk->sk_prot->memory_pressure;

		if (memory_pressure && READ_ONCE(*memory_pressure))
			WRITE_ONCE(*memory_pressure, 0);
	}
}

DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);

/**
 * skb_page_frag_refill - check that a page_frag contains enough room
 * @sz: minimum size of the fragment we want to get
 * @pfrag: pointer to page_frag
 * @gfp: priority for memory allocation
 *
 * Note: While this allocator tries to use high order pages, there is
 * no guarantee that allocations succeed. Therefore, @sz MUST be
 * less or equal than PAGE_SIZE.
 */
bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
{
	if (pfrag->page) {
		if (page_ref_count(pfrag->page) == 1) {
			pfrag->offset = 0;
			return true;
		}
		if (pfrag->offset + sz <= pfrag->size)
			return true;
		put_page(pfrag->page);
	}

	pfrag->offset = 0;
	if (SKB_FRAG_PAGE_ORDER &&
	    !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
		/* Avoid direct reclaim but allow kcompactd to wake */
		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
					  __GFP_COMP | __GFP_NOWARN |
					  __GFP_NORETRY,
					  SKB_FRAG_PAGE_ORDER);
		if (likely(pfrag->page)) {
			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
			return true;
		}
	}
	pfrag->page = alloc_page(gfp);
	if (likely(pfrag->page)) {
		pfrag->size = PAGE_SIZE;
		return true;
	}
	return false;
}
EXPORT_SYMBOL(skb_page_frag_refill);

bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
{
	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
		return true;

	sk_enter_memory_pressure(sk);
	sk_stream_moderate_sndbuf(sk);
	return false;
}
EXPORT_SYMBOL(sk_page_frag_refill);
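
/* Usage sketch (illustrative only): stream protocols use this helper to
 * append user data to the per-task or per-socket page fragment, copying
 * at most the room left in the current page. "copy" is hypothetical.
 *
 *	struct page_frag *pfrag = sk_page_frag(sk);
 *
 *	if (!sk_page_frag_refill(sk, pfrag))
 *		goto wait_for_memory;
 *	copy = min_t(int, copy, pfrag->size - pfrag->offset);
 *
 * The caller then copies "copy" bytes into pfrag->page at pfrag->offset
 * and advances pfrag->offset by the amount used.
 */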

void __lock_sock(struct sock *sk)
	__releases(&sk->sk_lock.slock)
	__acquires(&sk->sk_lock.slock)
{
	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
					  TASK_UNINTERRUPTIBLE);
		spin_unlock_bh(&sk->sk_lock.slock);
		schedule();
		spin_lock_bh(&sk->sk_lock.slock);
		if (!sock_owned_by_user(sk))
			break;
	}
	finish_wait(&sk->sk_lock.wq, &wait);
}

void __release_sock(struct sock *sk)
	__releases(&sk->sk_lock.slock)
	__acquires(&sk->sk_lock.slock)
{
	struct sk_buff *skb, *next;

	while ((skb = sk->sk_backlog.head) != NULL) {
		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;

		spin_unlock_bh(&sk->sk_lock.slock);

		do {
			next = skb->next;
			prefetch(next);
			DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb));
			skb_mark_not_on_list(skb);
			sk_backlog_rcv(sk, skb);

			cond_resched();

			skb = next;
		} while (skb != NULL);

		spin_lock_bh(&sk->sk_lock.slock);
	}

	/*
	 * Doing the zeroing here guarantee we can not loop forever
	 * while a wild producer attempts to flood us.
	 */
	sk->sk_backlog.len = 0;
}

void __sk_flush_backlog(struct sock *sk)
{
	spin_lock_bh(&sk->sk_lock.slock);
	__release_sock(sk);

	if (sk->sk_prot->release_cb)
		INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
				     tcp_release_cb, sk);

	spin_unlock_bh(&sk->sk_lock.slock);
}
EXPORT_SYMBOL_GPL(__sk_flush_backlog);

/**
 * sk_wait_data - wait for data to arrive at sk_receive_queue
 * @sk:    sock to wait on
 * @timeo: for how long
 * @skb:   last skb seen on sk_receive_queue
 *
 * Now socket state including sk->sk_err is changed only under lock,
 * hence we may omit checks after joining wait queue.
 * We check receive queue before schedule() only as optimization;
 * it is very likely that release_sock() added new data.
 */
int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
{
	DEFINE_WAIT_FUNC(wait, woken_wake_function);
	int rc;

	add_wait_queue(sk_sleep(sk), &wait);
	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	remove_wait_queue(sk_sleep(sk), &wait);
	return rc;
}
EXPORT_SYMBOL(sk_wait_data);
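
/* Usage sketch (illustrative only): the canonical blocking receive loop.
 * sk_wait_data() must be called with the socket lock held; it sleeps
 * with the lock released and returns with it reacquired.
 *
 *	long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *
 *	lock_sock(sk);
 *	while (skb_queue_empty(&sk->sk_receive_queue)) {
 *		if (!timeo || sk->sk_err || signal_pending(current))
 *			break;
 *		sk_wait_data(sk, &timeo, NULL);
 *	}
 *	release_sock(sk);
 */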

/**
 *	__sk_mem_raise_allocated - increase memory_allocated
 *	@sk: socket
 *	@size: memory size to allocate
 *	@amt: pages to allocate
 *	@kind: allocation type
 *
 *	Similar to __sk_mem_schedule(), but does not update sk_forward_alloc.
 */
int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
{
	struct mem_cgroup *memcg = mem_cgroup_sockets_enabled ? sk->sk_memcg : NULL;
	struct proto *prot = sk->sk_prot;
	bool charged = false;
	long allocated;

	sk_memory_allocated_add(sk, amt);
	allocated = sk_memory_allocated(sk);

	if (memcg) {
		if (!mem_cgroup_charge_skmem(memcg, amt, gfp_memcg_charge()))
			goto suppress_allocation;
		charged = true;
	}

	/* Under limit. */
	if (allocated <= sk_prot_mem_limits(sk, 0)) {
		sk_leave_memory_pressure(sk);
		return 1;
	}

	/* Under pressure. */
	if (allocated > sk_prot_mem_limits(sk, 1))
		sk_enter_memory_pressure(sk);

	/* Over hard limit. */
	if (allocated > sk_prot_mem_limits(sk, 2))
		goto suppress_allocation;

	/* Guarantee a minimum buffer size under pressure so that the
	 * socket can still make forward progress. This must not apply
	 * once the hard limit is exceeded, or else queueing too many
	 * skbs would open a denial-of-service window.
	 */
	if (kind == SK_MEM_RECV) {
		if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
			return 1;

	} else { /* SK_MEM_SEND */
		int wmem0 = sk_get_wmem0(sk, prot);

		if (sk->sk_type == SOCK_STREAM) {
			if (sk->sk_wmem_queued < wmem0)
				return 1;
		} else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
			return 1;
		}
	}

	if (sk_has_memory_pressure(sk)) {
		u64 alloc;

		/* The following 'average' heuristic is within the
		 * scope of global accounting, so it only makes
		 * sense for global memory pressure.
		 */
		if (!sk_under_global_memory_pressure(sk))
			return 1;

		/* Try to be fair among all the sockets under global
		 * pressure by allowing the ones that below average
		 * usage to raise.
		 */
		alloc = sk_sockets_allocated_read_positive(sk);
		if (sk_prot_mem_limits(sk, 2) > alloc *
		    sk_mem_pages(sk->sk_wmem_queued +
				 atomic_read(&sk->sk_rmem_alloc) +
				 sk->sk_forward_alloc))
			return 1;
	}

suppress_allocation:

	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
		sk_stream_moderate_sndbuf(sk);

		/* Fail only if socket is _under_ its sndbuf.
		 * In this case we cannot block, so that we have to fail.
		 */
		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
			/* Force charge with __GFP_NOFAIL */
			if (memcg && !charged) {
				mem_cgroup_charge_skmem(memcg, amt,
					gfp_memcg_charge() | __GFP_NOFAIL);
			}
			return 1;
		}
	}

	if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
		trace_sock_exceed_buf_limit(sk, prot, allocated, kind);

	sk_memory_allocated_sub(sk, amt);

	if (charged)
		mem_cgroup_uncharge_skmem(memcg, amt);

	return 0;
}

/**
 *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
 *	@sk: socket
 *	@size: memory size to allocate
 *	@kind: allocation type
 *
 *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
 *	rmem allocation. This function assumes that protocols which have
 *	memory_pressure use sk_wmem_queued as write buffer accounting.
 */
int __sk_mem_schedule(struct sock *sk, int size, int kind)
{
	int ret, amt = sk_mem_pages(size);

	sk_forward_alloc_add(sk, amt << PAGE_SHIFT);
	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
	if (!ret)
		sk_forward_alloc_add(sk, -(amt << PAGE_SHIFT));
	return ret;
}
EXPORT_SYMBOL(__sk_mem_schedule);
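
/* Usage sketch (illustrative only): protocols charge an skb's truesize
 * before queueing it; the sk_wmem_schedule()/sk_mem_charge() wrappers in
 * net/sock.h only fall back to __sk_mem_schedule() once sk_forward_alloc
 * runs out.
 *
 *	if (!sk_wmem_schedule(sk, skb->truesize))
 *		goto drop;
 *	sk_mem_charge(sk, skb->truesize);
 */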

/**
 *	__sk_mem_reduce_allocated - reclaim memory_allocated
 *	@sk: socket
 *	@amount: number of pages to uncharge
 *
 *	Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc.
 */
void __sk_mem_reduce_allocated(struct sock *sk, int amount)
{
	sk_memory_allocated_sub(sk, amount);

	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);

	if (sk_under_global_memory_pressure(sk) &&
	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
		sk_leave_memory_pressure(sk);
}

/**
 *	__sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
 *	@sk: socket
 *	@amount: number of bytes (rounded down to a PAGE_SIZE multiple)
 */
void __sk_mem_reclaim(struct sock *sk, int amount)
{
	amount >>= PAGE_SHIFT;
	sk_forward_alloc_add(sk, -(amount << PAGE_SHIFT));
	__sk_mem_reduce_allocated(sk, amount);
}
EXPORT_SYMBOL(__sk_mem_reclaim);

int sk_set_peek_off(struct sock *sk, int val)
{
	WRITE_ONCE(sk->sk_peek_off, val);
	return 0;
}
EXPORT_SYMBOL_GPL(sk_set_peek_off);

/*
 * Set of default routines for initialising struct proto_ops when
 * the protocol does not support a particular function. In certain
 * cases where it makes no sense for a protocol to have a "do nothing"
 * function, some default processing is provided.
 */
int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_bind);

int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
		    int len, int flags)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_connect);

int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_socketpair);

int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
		   bool kern)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_accept);

int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
		    int peer)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_getname);

int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_ioctl);

int sock_no_listen(struct socket *sock, int backlog)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_listen);

int sock_no_shutdown(struct socket *sock, int how)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_shutdown);

int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_sendmsg);

int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_sendmsg_locked);

int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
		    int flags)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_recvmsg);

int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
	/* Mirror missing mmap method error code */
	return -ENODEV;
}
EXPORT_SYMBOL(sock_no_mmap);

/*
 * When a file is received (via SCM_RIGHTS, etc), we must bump the
 * various sock-based usage counts.
 */
void __receive_sock(struct file *file)
{
	struct socket *sock;

	sock = sock_from_file(file);
	if (sock) {
		sock_update_netprioidx(&sock->sk->sk_cgrp_data);
		sock_update_classid(&sock->sk->sk_cgrp_data);
	}
}

/*
 *	Default Socket Callbacks
 */

static void sock_def_wakeup(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_all(&wq->wait);
	rcu_read_unlock();
}

static void sock_def_error_report(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_poll(&wq->wait, EPOLLERR);
	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
	rcu_read_unlock();
}

void sock_def_readable(struct sock *sk)
{
	struct socket_wq *wq;

	trace_sk_data_ready(sk);

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
						EPOLLRDNORM | EPOLLRDBAND);
	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
	rcu_read_unlock();
}

static void sock_def_write_space(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();

	/* Do not wake up a writer until he can make "significant"
	 * progress.  --DaveM
	 */
	if (sock_writeable(sk)) {
		wq = rcu_dereference(sk->sk_wq);
		if (skwq_has_sleeper(wq))
			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
						EPOLLWRNORM | EPOLLWRBAND);

		/* Should agree with poll, otherwise some programs break */
		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
	}

	rcu_read_unlock();
}

/* An optimised version of sock_def_write_space(), should only be called
 * for SOCK_RCU_FREE sockets under RCU read section and after putting
 * ->sk_wmem_alloc.
 */
static void sock_def_write_space_wfree(struct sock *sk)
{
	/* Do not wake up a writer until he can make "significant"
	 * progress.  --DaveM
	 */
	if (sock_writeable(sk)) {
		struct socket_wq *wq = rcu_dereference(sk->sk_wq);

		/* rely on refcount_sub from sock_wfree() */
		smp_mb__after_atomic();
		if (wq && waitqueue_active(&wq->wait))
			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
						EPOLLWRNORM | EPOLLWRBAND);

		/* Should agree with poll, otherwise some programs break */
		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
	}
}

static void sock_def_destruct(struct sock *sk)
{
}

void sk_send_sigurg(struct sock *sk)
{
	if (sk->sk_socket && sk->sk_socket->file)
		if (send_sigurg(&sk->sk_socket->file->f_owner))
			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
}
EXPORT_SYMBOL(sk_send_sigurg);

void sk_reset_timer(struct sock *sk, struct timer_list *timer,
		    unsigned long expires)
{
	if (!mod_timer(timer, expires))
		sock_hold(sk);
}
EXPORT_SYMBOL(sk_reset_timer);

void sk_stop_timer(struct sock *sk, struct timer_list *timer)
{
	if (del_timer(timer))
		__sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer);

void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
{
	if (del_timer_sync(timer))
		__sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer_sync);
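
/* Usage sketch (illustrative only): the timer helpers above keep one
 * socket reference per pending timer, so a callback can safely
 * dereference sk but must drop that reference itself when it does not
 * re-arm. "example_timer" is a hypothetical callback.
 *
 *	static void example_timer(struct timer_list *t)
 *	{
 *		struct sock *sk = from_timer(sk, t, sk_timer);
 *
 *		...
 *		sock_put(sk);
 *	}
 *
 *	sk_reset_timer(sk, &sk->sk_timer, jiffies + HZ);
 */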

void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
{
	sk_init_common(sk);
	sk->sk_send_head = NULL;

	timer_setup(&sk->sk_timer, NULL, 0);

	sk->sk_allocation = GFP_KERNEL;
	sk->sk_rcvbuf = READ_ONCE(sysctl_rmem_default);
	sk->sk_sndbuf = READ_ONCE(sysctl_wmem_default);
	sk->sk_state = TCP_CLOSE;
	sk->sk_use_task_frag = true;
	sk_set_socket(sk, sock);

	sock_set_flag(sk, SOCK_ZAPPED);

	if (sock) {
		sk->sk_type = sock->type;
		RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
		sock->sk = sk;
	} else {
		RCU_INIT_POINTER(sk->sk_wq, NULL);
	}
	sk->sk_uid = uid;

	rwlock_init(&sk->sk_callback_lock);
	if (sk->sk_kern_sock)
		lockdep_set_class_and_name(
			&sk->sk_callback_lock,
			af_kern_callback_keys + sk->sk_family,
			af_family_kern_clock_key_strings[sk->sk_family]);
	else
		lockdep_set_class_and_name(
			&sk->sk_callback_lock,
			af_callback_keys + sk->sk_family,
			af_family_clock_key_strings[sk->sk_family]);

	sk->sk_state_change = sock_def_wakeup;
	sk->sk_data_ready = sock_def_readable;
	sk->sk_write_space = sock_def_write_space;
	sk->sk_error_report = sock_def_error_report;
	sk->sk_destruct = sock_def_destruct;

	sk->sk_frag.page = NULL;
	sk->sk_frag.offset = 0;
	sk->sk_peek_off = -1;

	sk->sk_peer_pid = NULL;
	sk->sk_peer_cred = NULL;
	spin_lock_init(&sk->sk_peer_lock);

	sk->sk_write_pending = 0;
	sk->sk_rcvlowat = 1;
	sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
	sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;

	sk->sk_stamp = SK_DEFAULT_STAMP;
#if BITS_PER_LONG == 32
	seqlock_init(&sk->sk_stamp_seq);
#endif
	atomic_set(&sk->sk_zckey, 0);

#ifdef CONFIG_NET_RX_BUSY_POLL
	sk->sk_napi_id = 0;
	sk->sk_ll_usec = READ_ONCE(sysctl_net_busy_read);
#endif

	sk->sk_max_pacing_rate = ~0UL;
	sk->sk_pacing_rate = ~0UL;
	WRITE_ONCE(sk->sk_pacing_shift, 10);
	sk->sk_incoming_cpu = -1;

	sk_rx_queue_clear(sk);
	/*
	 * Before updating sk_refcnt, we must commit prior changes to memory
	 * (Documentation/RCU/rculist_nulls.rst for details)
	 */
	smp_wmb();
	refcount_set(&sk->sk_refcnt, 1);
	atomic_set(&sk->sk_drops, 0);
}
EXPORT_SYMBOL(sock_init_data_uid);

void sock_init_data(struct socket *sock, struct sock *sk)
{
	kuid_t uid = sock ?
		SOCK_INODE(sock)->i_uid :
		make_kuid(sock_net(sk)->user_ns, 0);

	sock_init_data_uid(sock, sk, uid);
}
EXPORT_SYMBOL(sock_init_data);

void lock_sock_nested(struct sock *sk, int subclass)
{
	/* The sk_lock has mutex_lock() semantics here. */
	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);

	might_sleep();
	spin_lock_bh(&sk->sk_lock.slock);
	if (sock_owned_by_user_nocheck(sk))
		__lock_sock(sk);
	sk->sk_lock.owned = 1;
	spin_unlock_bh(&sk->sk_lock.slock);
}
EXPORT_SYMBOL(lock_sock_nested);

void release_sock(struct sock *sk)
{
	spin_lock_bh(&sk->sk_lock.slock);
	if (sk->sk_backlog.tail)
		__release_sock(sk);

	if (sk->sk_prot->release_cb)
		INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
				     tcp_release_cb, sk);

	sock_release_ownership(sk);
	if (waitqueue_active(&sk->sk_lock.wq))
		wake_up(&sk->sk_lock.wq);
	spin_unlock_bh(&sk->sk_lock.slock);
}
EXPORT_SYMBOL(release_sock);

bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
{
	might_sleep();
	spin_lock_bh(&sk->sk_lock.slock);

	if (!sock_owned_by_user_nocheck(sk)) {
		/*
		 * Fast path return with bottom halves disabled and
		 * sock::sk_lock.slock held.
		 *
		 * The 'mutex' is not contended and holding
		 * sock::sk_lock.slock prevents all other lockers to
		 * proceed so the corresponding unlock_sock_fast() can
		 * avoid the slow path of release_sock() completely and
		 * just release slock.
		 *
		 * From a semantical POV this is equivalent to 'acquiring'
		 * the 'mutex', hence the corresponding lockdep
		 * mutex_release() has to happen in the fast path of
		 * unlock_sock_fast().
		 */
		return false;
	}

	__lock_sock(sk);
	sk->sk_lock.owned = 1;
	__acquire(&sk->sk_lock.slock);
	spin_unlock_bh(&sk->sk_lock.slock);
	return true;
}
EXPORT_SYMBOL(__lock_sock_fast);
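
/* Usage sketch (illustrative only): callers use the lock_sock_fast()
 * wrapper and must pair it with unlock_sock_fast(), which releases just
 * the spinlock on the fast path or full socket ownership on the slow
 * path.
 *
 *	bool slow = lock_sock_fast(sk);
 *
 *	...	short, non-sleeping critical section
 *	unlock_sock_fast(sk, slow);
 */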

int sock_gettstamp(struct socket *sock, void __user *userstamp,
		   bool timeval, bool time32)
{
	struct sock *sk = sock->sk;
	struct timespec64 ts;

	sock_enable_timestamp(sk, SOCK_TIMESTAMP);
	ts = ktime_to_timespec64(sock_read_timestamp(sk));
	if (ts.tv_sec == -1)
		return -ENOENT;
	if (ts.tv_sec == 0) {
		ktime_t kt = ktime_get_real();

		sock_write_timestamp(sk, kt);
		ts = ktime_to_timespec64(kt);
	}

	if (timeval)
		ts.tv_nsec /= 1000;

#ifdef CONFIG_COMPAT_32BIT_TIME
	if (time32)
		return put_old_timespec32(&ts, userstamp);
#endif
#ifdef CONFIG_SPARC64
	/* beware of padding in sparc64 timeval */
	if (timeval && !in_compat_syscall()) {
		struct __kernel_old_timeval __user tv = {
			.tv_sec = ts.tv_sec,
			.tv_usec = ts.tv_nsec,
		};
		if (copy_to_user(userstamp, &tv, sizeof(tv)))
			return -EFAULT;
		return 0;
	}
#endif
	return put_timespec64(&ts, userstamp);
}
EXPORT_SYMBOL(sock_gettstamp);

void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
{
	if (!sock_flag(sk, flag)) {
		unsigned long previous_flags = sk->sk_flags;

		sock_set_flag(sk, flag);
		/*
		 * we just set one of the two flags which require net
		 * time stamping, but time stamping might have been on
		 * already because of the other one
		 */
		if (sock_needs_netstamp(sk) &&
		    !(previous_flags & SK_FLAGS_TIMESTAMP))
			net_enable_timestamp();
	}
}

int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
		       int level, int type)
{
	struct sock_exterr_skb *serr;
	struct sk_buff *skb;
	int copied, err;

	err = -EAGAIN;
	skb = sock_dequeue_err_skb(sk);
	if (skb == NULL)
		goto out;

	copied = skb->len;
	if (copied > len) {
		msg->msg_flags |= MSG_TRUNC;
		copied = len;
	}
	err = skb_copy_datagram_msg(skb, 0, msg, copied);
	if (err)
		goto out_free_skb;

	sock_recv_timestamp(msg, sk, skb);

	serr = SKB_EXT_ERR(skb);
	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);

	msg->msg_flags |= MSG_ERRQUEUE;
	err = copied;

out_free_skb:
	kfree_skb(skb);
out:
	return err;
}
EXPORT_SYMBOL(sock_recv_errqueue);

/*
 *	Get a socket option on an socket.
 *
 *	FIX: POSIX 1003.1g is very ambiguous here. It states that
 *	asynchronous errors should be reported by getsockopt. We assume
 *	this means if you specify SO_ERROR (otherwise whats the point of it).
 */
int sock_common_getsockopt(struct socket *sock, int level, int optname,
			   char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	/* IPV6_ADDRFORM can change sk->sk_prot under us. */
	return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_getsockopt);

int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
			int flags)
{
	struct sock *sk = sock->sk;
	int addr_len = 0;
	int err;

	err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len);
	if (err >= 0)
		msg->msg_namelen = addr_len;
	return err;
}
EXPORT_SYMBOL(sock_common_recvmsg);

/*
 *	Set socket options on an inet socket.
 */
int sock_common_setsockopt(struct socket *sock, int level, int optname,
			   sockptr_t optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;

	/* IPV6_ADDRFORM can change sk->sk_prot under us. */
	return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_setsockopt);

void sk_common_release(struct sock *sk)
{
	if (sk->sk_prot->destroy)
		sk->sk_prot->destroy(sk);

	/*
	 * Observation: when sk_common_release is called, processes have
	 * no access to socket. But net still has.
	 * Step one, detach it from networking:
	 *
	 * A. Remove from hash tables.
	 */

	sk->sk_prot->unhash(sk);

	/*
	 * In this point socket cannot receive new packets, but it is possible
	 * that some packets are in flight because some CPU runs receiver and
	 * did hash table lookup before we unhashed socket. They will achieve
	 * receive queue and will be purged by socket destructor.
	 *
	 * Also we still have packets pending on receive queue and probably,
	 * our own packets waiting in device queues. sock_destroy will drain
	 * receive queue, but transmitted packets will delay socket destruction
	 * until the last reference will be released.
	 */

	sock_orphan(sk);

	xfrm_sk_free_policy(sk);

	sock_put(sk);
}
EXPORT_SYMBOL(sk_common_release);

void sk_get_meminfo(const struct sock *sk, u32 *mem)
{
	memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);

	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
	mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
	mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
	mem[SK_MEMINFO_FWD_ALLOC] = sk_forward_alloc_get(sk);
	mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
	mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
	mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
}

#ifdef CONFIG_PROC_FS
static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);

int sock_prot_inuse_get(struct net *net, struct proto *prot)
{
	int cpu, idx = prot->inuse_idx;
	int res = 0;

	for_each_possible_cpu(cpu)
		res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];

	return res >= 0 ? res : 0;
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_get);

int sock_inuse_get(struct net *net)
{
	int cpu, res = 0;

	for_each_possible_cpu(cpu)
		res += per_cpu_ptr(net->core.prot_inuse, cpu)->all;

	return res;
}
EXPORT_SYMBOL_GPL(sock_inuse_get);

static int __net_init sock_inuse_init_net(struct net *net)
{
	net->core.prot_inuse = alloc_percpu(struct prot_inuse);
	if (net->core.prot_inuse == NULL)
		return -ENOMEM;
	return 0;
}

static void __net_exit sock_inuse_exit_net(struct net *net)
{
	free_percpu(net->core.prot_inuse);
}

static struct pernet_operations net_inuse_ops = {
	.init = sock_inuse_init_net,
	.exit = sock_inuse_exit_net,
};

static __init int net_inuse_init(void)
{
	if (register_pernet_subsys(&net_inuse_ops))
		panic("Cannot initialize net inuse counters");

	return 0;
}

core_initcall(net_inuse_init);

static int assign_proto_idx(struct proto *prot)
{
	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);

	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
		pr_err("PROTO_INUSE_NR exhausted\n");
		return -ENOSPC;
	}

	set_bit(prot->inuse_idx, proto_inuse_idx);
	return 0;
}

static void release_proto_idx(struct proto *prot)
{
	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
		clear_bit(prot->inuse_idx, proto_inuse_idx);
}
#else
static inline int assign_proto_idx(struct proto *prot)
{
	return 0;
}

static inline void release_proto_idx(struct proto *prot)
{
}

#endif

static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
{
	if (!twsk_prot)
		return;
	kfree(twsk_prot->twsk_slab_name);
	twsk_prot->twsk_slab_name = NULL;
	kmem_cache_destroy(twsk_prot->twsk_slab);
	twsk_prot->twsk_slab = NULL;
}

static int tw_prot_init(const struct proto *prot)
{
	struct timewait_sock_ops *twsk_prot = prot->twsk_prot;

	if (!twsk_prot)
		return 0;

	twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
					      prot->name);
	if (!twsk_prot->twsk_slab_name)
		return -ENOMEM;

	twsk_prot->twsk_slab =
		kmem_cache_create(twsk_prot->twsk_slab_name,
				  twsk_prot->twsk_obj_size, 0,
				  SLAB_ACCOUNT | prot->slab_flags,
				  NULL);
	if (!twsk_prot->twsk_slab) {
		pr_crit("%s: Can't create timewait sock SLAB cache!\n",
			prot->name);
		return -ENOMEM;
	}

	return 0;
}

static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
{
	if (!rsk_prot)
		return;
	kfree(rsk_prot->slab_name);
	rsk_prot->slab_name = NULL;
	kmem_cache_destroy(rsk_prot->slab);
	rsk_prot->slab = NULL;
}

static int req_prot_init(const struct proto *prot)
{
	struct request_sock_ops *rsk_prot = prot->rsk_prot;

	if (!rsk_prot)
		return 0;

	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
					prot->name);
	if (!rsk_prot->slab_name)
		return -ENOMEM;

	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
					   rsk_prot->obj_size, 0,
					   SLAB_ACCOUNT | prot->slab_flags,
					   NULL);

	if (!rsk_prot->slab) {
		pr_crit("%s: Can't create request sock SLAB cache!\n",
			prot->name);
		return -ENOMEM;
	}
	return 0;
}

int proto_register(struct proto *prot, int alloc_slab)
{
	int ret = -ENOBUFS;

	if (prot->memory_allocated && !prot->sysctl_mem) {
		pr_err("%s: missing sysctl_mem\n", prot->name);
		return -EINVAL;
	}
	if (prot->memory_allocated && !prot->per_cpu_fw_alloc) {
		pr_err("%s: missing per_cpu_fw_alloc\n", prot->name);
		return -EINVAL;
	}
	if (alloc_slab) {
		prot->slab = kmem_cache_create_usercopy(prot->name,
					prot->obj_size, 0,
					SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
					prot->slab_flags,
					prot->useroffset, prot->usersize,
					NULL);

		if (prot->slab == NULL) {
			pr_crit("%s: Can't create sock SLAB cache!\n",
				prot->name);
			goto out;
		}

		if (req_prot_init(prot))
			goto out_free_request_sock_slab;

		if (tw_prot_init(prot))
			goto out_free_timewait_sock_slab;
	}

	mutex_lock(&proto_list_mutex);
	ret = assign_proto_idx(prot);
	if (ret) {
		mutex_unlock(&proto_list_mutex);
		goto out_free_timewait_sock_slab;
	}
	list_add(&prot->node, &proto_list);
	mutex_unlock(&proto_list_mutex);
	return ret;

out_free_timewait_sock_slab:
	if (alloc_slab)
		tw_prot_cleanup(prot->twsk_prot);
out_free_request_sock_slab:
	if (alloc_slab) {
		req_prot_cleanup(prot->rsk_prot);

		kmem_cache_destroy(prot->slab);
		prot->slab = NULL;
	}
out:
	return ret;
}
EXPORT_SYMBOL(proto_register);
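
/* Usage sketch (illustrative only): a protocol registers its struct
 * proto once at module init. "example_proto" is hypothetical; obj_size
 * must cover the protocol's socket structure.
 *
 *	static struct proto example_proto = {
 *		.name		= "EXAMPLE",
 *		.owner		= THIS_MODULE,
 *		.obj_size	= sizeof(struct sock),
 *	};
 *
 *	err = proto_register(&example_proto, 1);
 *	...
 *	proto_unregister(&example_proto);
 *
 * Passing alloc_slab == 1 creates the per-protocol kmem cache (plus the
 * request/timewait caches when rsk_prot/twsk_prot are set).
 */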

void proto_unregister(struct proto *prot)
{
	mutex_lock(&proto_list_mutex);
	release_proto_idx(prot);
	list_del(&prot->node);
	mutex_unlock(&proto_list_mutex);

	kmem_cache_destroy(prot->slab);
	prot->slab = NULL;

	req_prot_cleanup(prot->rsk_prot);
	tw_prot_cleanup(prot->twsk_prot);
}
EXPORT_SYMBOL(proto_unregister);

int sock_load_diag_module(int family, int protocol)
{
	if (!protocol) {
		if (!sock_is_registered(family))
			return -ENOENT;

		return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
				      NETLINK_SOCK_DIAG, family);
	}

#ifdef CONFIG_INET
	if (family == AF_INET &&
	    protocol != IPPROTO_RAW &&
	    protocol < MAX_INET_PROTOS &&
	    !rcu_access_pointer(inet_protos[protocol]))
		return -ENOENT;
#endif

	return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
			      NETLINK_SOCK_DIAG, family, protocol);
}
EXPORT_SYMBOL(sock_load_diag_module);

#ifdef CONFIG_PROC_FS
static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(proto_list_mutex)
{
	mutex_lock(&proto_list_mutex);
	return seq_list_start_head(&proto_list, *pos);
}

static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	return seq_list_next(v, &proto_list, pos);
}

static void proto_seq_stop(struct seq_file *seq, void *v)
	__releases(proto_list_mutex)
{
	mutex_unlock(&proto_list_mutex);
}

static char proto_method_implemented(const void *method)
{
	return method == NULL ? 'n' : 'y';
}

static long sock_prot_memory_allocated(struct proto *proto)
{
	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
}

static const char *sock_prot_memory_pressure(struct proto *proto)
{
	return proto->memory_pressure != NULL ?
	       proto_memory_pressure(proto) ? "yes" : "no" : "NI";
}

static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
{
	seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
		   proto->name,
		   proto->obj_size,
		   sock_prot_inuse_get(seq_file_net(seq), proto),
		   sock_prot_memory_allocated(proto),
		   sock_prot_memory_pressure(proto),
		   proto->max_header,
		   proto->slab == NULL ? "no" : "yes",
		   module_name(proto->owner),
		   proto_method_implemented(proto->close),
		   proto_method_implemented(proto->connect),
		   proto_method_implemented(proto->disconnect),
		   proto_method_implemented(proto->accept),
		   proto_method_implemented(proto->ioctl),
		   proto_method_implemented(proto->init),
		   proto_method_implemented(proto->destroy),
		   proto_method_implemented(proto->shutdown),
		   proto_method_implemented(proto->setsockopt),
		   proto_method_implemented(proto->getsockopt),
		   proto_method_implemented(proto->sendmsg),
		   proto_method_implemented(proto->recvmsg),
		   proto_method_implemented(proto->bind),
		   proto_method_implemented(proto->backlog_rcv),
		   proto_method_implemented(proto->hash),
		   proto_method_implemented(proto->unhash),
		   proto_method_implemented(proto->get_port),
		   proto_method_implemented(proto->enter_memory_pressure));
}

static int proto_seq_show(struct seq_file *seq, void *v)
{
	if (v == &proto_list)
		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
			   "protocol",
			   "size",
			   "sockets",
			   "memory",
			   "press",
			   "maxhdr",
			   "slab",
			   "module",
			   "cl co di ac io in de sh ss gs se re bi br ha uh gp em\n");
	else
		proto_seq_printf(seq, list_entry(v, struct proto, node));
	return 0;
}

static const struct seq_operations proto_seq_ops = {
	.start  = proto_seq_start,
	.next   = proto_seq_next,
	.stop   = proto_seq_stop,
	.show   = proto_seq_show,
};

static __net_init int proto_init_net(struct net *net)
{
	if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
			     sizeof(struct seq_net_private)))
		return -ENOMEM;

	return 0;
}

static __net_exit void proto_exit_net(struct net *net)
{
	remove_proc_entry("protocols", net->proc_net);
}

static __net_initdata struct pernet_operations proto_net_ops = {
	.init = proto_init_net,
	.exit = proto_exit_net,
};

static int __init proto_init(void)
{
	return register_pernet_subsys(&proto_net_ops);
}

subsys_initcall(proto_init);

#endif /* CONFIG_PROC_FS */

#ifdef CONFIG_NET_RX_BUSY_POLL
bool sk_busy_loop_end(void *p, unsigned long start_time)
{
	struct sock *sk = p;

	return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
	       sk_busy_loop_timeout(sk, start_time);
}
EXPORT_SYMBOL(sk_busy_loop_end);
#endif /* CONFIG_NET_RX_BUSY_POLL */

int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
{
	if (!sk->sk_prot->bind_add)
		return -EOPNOTSUPP;
	return sk->sk_prot->bind_add(sk, addr, addr_len);
}
EXPORT_SYMBOL(sock_bind_add);

/* Copy 'size' bytes from userspace and return `size` back to userspace */
int sock_ioctl_inout(struct sock *sk, unsigned int cmd,
		     void __user *arg, void *karg, size_t size)
{
	int ret;

	if (copy_from_user(karg, arg, size))
		return -EFAULT;

	ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg);
	if (ret)
		return ret;

	if (copy_to_user(arg, karg, size))
		return -EFAULT;

	return 0;
}
EXPORT_SYMBOL(sock_ioctl_inout);

/* This is the most common ioctl prep function, where the result (4 bytes) is
 * copied back to userspace if the ioctl() returns successfully. No input is
 * copied from userspace as input argument.
 */
static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg)
{
	int ret, karg = 0;

	ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &karg);
	if (ret)
		return ret;

	return put_user(karg, (int __user *)arg);
}

/* A wrapper around sock ioctls, which copies the data from userspace
 * (depending on the protocol/ioctl), and copies back the result to userspace.
 * The main motivation for this function is to pass kernel memory to the
 * protocol ioctl callbacks, instead of userspace memory.
 */
int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
{
	int rc = 1;

	if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET)
		rc = ipmr_sk_ioctl(sk, cmd, arg);
	else if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET6)
		rc = ip6mr_sk_ioctl(sk, cmd, arg);
	else if (sk_is_phonet(sk))
		rc = phonet_sk_ioctl(sk, cmd, arg);

	/* If ioctl was processed, returns its value */
	if (rc <= 0)
		return rc;

	/* Otherwise call the default handler */
	return sock_ioctl_out(sk, cmd, arg);
}
EXPORT_SYMBOL(sk_ioctl);