1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49#include <linux/ethtool.h>
50#include <linux/types.h>
51#include <linux/mm.h>
52#include <linux/capability.h>
53#include <linux/fcntl.h>
54#include <linux/socket.h>
55#include <linux/in.h>
56#include <linux/inet.h>
57#include <linux/netdevice.h>
58#include <linux/if_packet.h>
59#include <linux/wireless.h>
60#include <linux/kernel.h>
61#include <linux/kmod.h>
62#include <linux/slab.h>
63#include <linux/vmalloc.h>
64#include <net/net_namespace.h>
65#include <net/ip.h>
66#include <net/protocol.h>
67#include <linux/skbuff.h>
68#include <net/sock.h>
69#include <linux/errno.h>
70#include <linux/timer.h>
71#include <linux/uaccess.h>
72#include <asm/ioctls.h>
73#include <asm/page.h>
74#include <asm/cacheflush.h>
75#include <asm/io.h>
76#include <linux/proc_fs.h>
77#include <linux/seq_file.h>
78#include <linux/poll.h>
79#include <linux/module.h>
80#include <linux/init.h>
81#include <linux/mutex.h>
82#include <linux/if_vlan.h>
83#include <linux/virtio_net.h>
84#include <linux/errqueue.h>
85#include <linux/net_tstamp.h>
86#include <linux/percpu.h>
87#ifdef CONFIG_INET
88#include <net/inet_common.h>
89#endif
90#include <linux/bpf.h>
91#include <net/compat.h>
92
93#include "internal.h"
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155struct packet_mreq_max {
156 int mr_ifindex;
157 unsigned short mr_type;
158 unsigned short mr_alen;
159 unsigned char mr_address[MAX_ADDR_LEN];
160};
161
162union tpacket_uhdr {
163 struct tpacket_hdr *h1;
164 struct tpacket2_hdr *h2;
165 struct tpacket3_hdr *h3;
166 void *raw;
167};
168
169static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
170 int closing, int tx_ring);
171
172#define V3_ALIGNMENT (8)
173
174#define BLK_HDR_LEN (ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))
175
176#define BLK_PLUS_PRIV(sz_of_priv) \
177 (BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))
178
179#define BLOCK_STATUS(x) ((x)->hdr.bh1.block_status)
180#define BLOCK_NUM_PKTS(x) ((x)->hdr.bh1.num_pkts)
181#define BLOCK_O2FP(x) ((x)->hdr.bh1.offset_to_first_pkt)
182#define BLOCK_LEN(x) ((x)->hdr.bh1.blk_len)
183#define BLOCK_SNUM(x) ((x)->hdr.bh1.seq_num)
184#define BLOCK_O2PRIV(x) ((x)->offset_to_priv)
185
186struct packet_sock;
187static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
188 struct packet_type *pt, struct net_device *orig_dev);
189
190static void *packet_previous_frame(struct packet_sock *po,
191 struct packet_ring_buffer *rb,
192 int status);
193static void packet_increment_head(struct packet_ring_buffer *buff);
194static int prb_curr_blk_in_use(struct tpacket_block_desc *);
195static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
196 struct packet_sock *);
197static void prb_retire_current_block(struct tpacket_kbdq_core *,
198 struct packet_sock *, unsigned int status);
199static int prb_queue_frozen(struct tpacket_kbdq_core *);
200static void prb_open_block(struct tpacket_kbdq_core *,
201 struct tpacket_block_desc *);
202static void prb_retire_rx_blk_timer_expired(struct timer_list *);
203static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
204static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
205static void prb_clear_rxhash(struct tpacket_kbdq_core *,
206 struct tpacket3_hdr *);
207static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
208 struct tpacket3_hdr *);
209static void packet_flush_mclist(struct sock *sk);
210static u16 packet_pick_tx_queue(struct sk_buff *skb);
211
212struct packet_skb_cb {
213 union {
214 struct sockaddr_pkt pkt;
215 union {
216
217
218
219
220 unsigned int origlen;
221 struct sockaddr_ll ll;
222 };
223 } sa;
224};
225
226#define vio_le() virtio_legacy_is_little_endian()
227
228#define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb))
229
230#define GET_PBDQC_FROM_RB(x) ((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
231#define GET_PBLOCK_DESC(x, bid) \
232 ((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
233#define GET_CURR_PBLOCK_DESC_FROM_CORE(x) \
234 ((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
235#define GET_NEXT_PRB_BLK_NUM(x) \
236 (((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
237 ((x)->kactive_blk_num+1) : 0)
238
239static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
240static void __fanout_link(struct sock *sk, struct packet_sock *po);
241
242static int packet_direct_xmit(struct sk_buff *skb)
243{
244 return dev_direct_xmit(skb, packet_pick_tx_queue(skb));
245}
246
247static struct net_device *packet_cached_dev_get(struct packet_sock *po)
248{
249 struct net_device *dev;
250
251 rcu_read_lock();
252 dev = rcu_dereference(po->cached_dev);
253 if (likely(dev))
254 dev_hold(dev);
255 rcu_read_unlock();
256
257 return dev;
258}
259
260static void packet_cached_dev_assign(struct packet_sock *po,
261 struct net_device *dev)
262{
263 rcu_assign_pointer(po->cached_dev, dev);
264}
265
266static void packet_cached_dev_reset(struct packet_sock *po)
267{
268 RCU_INIT_POINTER(po->cached_dev, NULL);
269}
270
271static bool packet_use_direct_xmit(const struct packet_sock *po)
272{
273 return po->xmit == packet_direct_xmit;
274}
275
276static u16 packet_pick_tx_queue(struct sk_buff *skb)
277{
278 struct net_device *dev = skb->dev;
279 const struct net_device_ops *ops = dev->netdev_ops;
280 int cpu = raw_smp_processor_id();
281 u16 queue_index;
282
283#ifdef CONFIG_XPS
284 skb->sender_cpu = cpu + 1;
285#endif
286 skb_record_rx_queue(skb, cpu % dev->real_num_tx_queues);
287 if (ops->ndo_select_queue) {
288 queue_index = ops->ndo_select_queue(dev, skb, NULL);
289 queue_index = netdev_cap_txqueue(dev, queue_index);
290 } else {
291 queue_index = netdev_pick_tx(dev, skb, NULL);
292 }
293
294 return queue_index;
295}
296
297
298
299
300
301static void __register_prot_hook(struct sock *sk)
302{
303 struct packet_sock *po = pkt_sk(sk);
304
305 if (!po->running) {
306 if (po->fanout)
307 __fanout_link(sk, po);
308 else
309 dev_add_pack(&po->prot_hook);
310
311 sock_hold(sk);
312 po->running = 1;
313 }
314}
315
316static void register_prot_hook(struct sock *sk)
317{
318 lockdep_assert_held_once(&pkt_sk(sk)->bind_lock);
319 __register_prot_hook(sk);
320}
321
322
323
324
325
326
327
328static void __unregister_prot_hook(struct sock *sk, bool sync)
329{
330 struct packet_sock *po = pkt_sk(sk);
331
332 lockdep_assert_held_once(&po->bind_lock);
333
334 po->running = 0;
335
336 if (po->fanout)
337 __fanout_unlink(sk, po);
338 else
339 __dev_remove_pack(&po->prot_hook);
340
341 __sock_put(sk);
342
343 if (sync) {
344 spin_unlock(&po->bind_lock);
345 synchronize_net();
346 spin_lock(&po->bind_lock);
347 }
348}
349
350static void unregister_prot_hook(struct sock *sk, bool sync)
351{
352 struct packet_sock *po = pkt_sk(sk);
353
354 if (po->running)
355 __unregister_prot_hook(sk, sync);
356}
357
358static inline struct page * __pure pgv_to_page(void *addr)
359{
360 if (is_vmalloc_addr(addr))
361 return vmalloc_to_page(addr);
362 return virt_to_page(addr);
363}
364
365static void __packet_set_status(struct packet_sock *po, void *frame, int status)
366{
367 union tpacket_uhdr h;
368
369 h.raw = frame;
370 switch (po->tp_version) {
371 case TPACKET_V1:
372 h.h1->tp_status = status;
373 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
374 break;
375 case TPACKET_V2:
376 h.h2->tp_status = status;
377 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
378 break;
379 case TPACKET_V3:
380 h.h3->tp_status = status;
381 flush_dcache_page(pgv_to_page(&h.h3->tp_status));
382 break;
383 default:
384 WARN(1, "TPACKET version not supported.\n");
385 BUG();
386 }
387
388 smp_wmb();
389}
390
391static int __packet_get_status(const struct packet_sock *po, void *frame)
392{
393 union tpacket_uhdr h;
394
395 smp_rmb();
396
397 h.raw = frame;
398 switch (po->tp_version) {
399 case TPACKET_V1:
400 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
401 return h.h1->tp_status;
402 case TPACKET_V2:
403 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
404 return h.h2->tp_status;
405 case TPACKET_V3:
406 flush_dcache_page(pgv_to_page(&h.h3->tp_status));
407 return h.h3->tp_status;
408 default:
409 WARN(1, "TPACKET version not supported.\n");
410 BUG();
411 return 0;
412 }
413}
414
415static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec64 *ts,
416 unsigned int flags)
417{
418 struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
419
420 if (shhwtstamps &&
421 (flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
422 ktime_to_timespec64_cond(shhwtstamps->hwtstamp, ts))
423 return TP_STATUS_TS_RAW_HARDWARE;
424
425 if ((flags & SOF_TIMESTAMPING_SOFTWARE) &&
426 ktime_to_timespec64_cond(skb->tstamp, ts))
427 return TP_STATUS_TS_SOFTWARE;
428
429 return 0;
430}
431
432static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
433 struct sk_buff *skb)
434{
435 union tpacket_uhdr h;
436 struct timespec64 ts;
437 __u32 ts_status;
438
439 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
440 return 0;
441
442 h.raw = frame;
443
444
445
446
447
448
449
450 switch (po->tp_version) {
451 case TPACKET_V1:
452 h.h1->tp_sec = ts.tv_sec;
453 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
454 break;
455 case TPACKET_V2:
456 h.h2->tp_sec = ts.tv_sec;
457 h.h2->tp_nsec = ts.tv_nsec;
458 break;
459 case TPACKET_V3:
460 h.h3->tp_sec = ts.tv_sec;
461 h.h3->tp_nsec = ts.tv_nsec;
462 break;
463 default:
464 WARN(1, "TPACKET version not supported.\n");
465 BUG();
466 }
467
468
469 flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
470 smp_wmb();
471
472 return ts_status;
473}
474
475static void *packet_lookup_frame(const struct packet_sock *po,
476 const struct packet_ring_buffer *rb,
477 unsigned int position,
478 int status)
479{
480 unsigned int pg_vec_pos, frame_offset;
481 union tpacket_uhdr h;
482
483 pg_vec_pos = position / rb->frames_per_block;
484 frame_offset = position % rb->frames_per_block;
485
486 h.raw = rb->pg_vec[pg_vec_pos].buffer +
487 (frame_offset * rb->frame_size);
488
489 if (status != __packet_get_status(po, h.raw))
490 return NULL;
491
492 return h.raw;
493}
494
495static void *packet_current_frame(struct packet_sock *po,
496 struct packet_ring_buffer *rb,
497 int status)
498{
499 return packet_lookup_frame(po, rb, rb->head, status);
500}
501
502static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
503{
504 del_timer_sync(&pkc->retire_blk_timer);
505}
506
507static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
508 struct sk_buff_head *rb_queue)
509{
510 struct tpacket_kbdq_core *pkc;
511
512 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
513
514 spin_lock_bh(&rb_queue->lock);
515 pkc->delete_blk_timer = 1;
516 spin_unlock_bh(&rb_queue->lock);
517
518 prb_del_retire_blk_timer(pkc);
519}
520
521static void prb_setup_retire_blk_timer(struct packet_sock *po)
522{
523 struct tpacket_kbdq_core *pkc;
524
525 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
526 timer_setup(&pkc->retire_blk_timer, prb_retire_rx_blk_timer_expired,
527 0);
528 pkc->retire_blk_timer.expires = jiffies;
529}
530
531static int prb_calc_retire_blk_tmo(struct packet_sock *po,
532 int blk_size_in_bytes)
533{
534 struct net_device *dev;
535 unsigned int mbits, div;
536 struct ethtool_link_ksettings ecmd;
537 int err;
538
539 rtnl_lock();
540 dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
541 if (unlikely(!dev)) {
542 rtnl_unlock();
543 return DEFAULT_PRB_RETIRE_TOV;
544 }
545 err = __ethtool_get_link_ksettings(dev, &ecmd);
546 rtnl_unlock();
547 if (err)
548 return DEFAULT_PRB_RETIRE_TOV;
549
550
551
552
553 if (ecmd.base.speed < SPEED_1000 ||
554 ecmd.base.speed == SPEED_UNKNOWN)
555 return DEFAULT_PRB_RETIRE_TOV;
556
557 div = ecmd.base.speed / 1000;
558 mbits = (blk_size_in_bytes * 8) / (1024 * 1024);
559
560 if (div)
561 mbits /= div;
562
563 if (div)
564 return mbits + 1;
565 return mbits;
566}
567
568static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
569 union tpacket_req_u *req_u)
570{
571 p1->feature_req_word = req_u->req3.tp_feature_req_word;
572}
573
574static void init_prb_bdqc(struct packet_sock *po,
575 struct packet_ring_buffer *rb,
576 struct pgv *pg_vec,
577 union tpacket_req_u *req_u)
578{
579 struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb);
580 struct tpacket_block_desc *pbd;
581
582 memset(p1, 0x0, sizeof(*p1));
583
584 p1->knxt_seq_num = 1;
585 p1->pkbdq = pg_vec;
586 pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
587 p1->pkblk_start = pg_vec[0].buffer;
588 p1->kblk_size = req_u->req3.tp_block_size;
589 p1->knum_blocks = req_u->req3.tp_block_nr;
590 p1->hdrlen = po->tp_hdrlen;
591 p1->version = po->tp_version;
592 p1->last_kactive_blk_num = 0;
593 po->stats.stats3.tp_freeze_q_cnt = 0;
594 if (req_u->req3.tp_retire_blk_tov)
595 p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
596 else
597 p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
598 req_u->req3.tp_block_size);
599 p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
600 p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
601 rwlock_init(&p1->blk_fill_in_prog_lock);
602
603 p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
604 prb_init_ft_ops(p1, req_u);
605 prb_setup_retire_blk_timer(po);
606 prb_open_block(p1, pbd);
607}
608
609
610
611
612static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
613{
614 mod_timer(&pkc->retire_blk_timer,
615 jiffies + pkc->tov_in_jiffies);
616 pkc->last_kactive_blk_num = pkc->kactive_blk_num;
617}
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
643{
644 struct packet_sock *po =
645 from_timer(po, t, rx_ring.prb_bdqc.retire_blk_timer);
646 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
647 unsigned int frozen;
648 struct tpacket_block_desc *pbd;
649
650 spin_lock(&po->sk.sk_receive_queue.lock);
651
652 frozen = prb_queue_frozen(pkc);
653 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
654
655 if (unlikely(pkc->delete_blk_timer))
656 goto out;
657
658
659
660
661
662
663
664
665
666
667 if (BLOCK_NUM_PKTS(pbd)) {
668
669 write_lock(&pkc->blk_fill_in_prog_lock);
670 write_unlock(&pkc->blk_fill_in_prog_lock);
671 }
672
673 if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
674 if (!frozen) {
675 if (!BLOCK_NUM_PKTS(pbd)) {
676
677 goto refresh_timer;
678 }
679 prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
680 if (!prb_dispatch_next_block(pkc, po))
681 goto refresh_timer;
682 else
683 goto out;
684 } else {
685
686
687
688 if (prb_curr_blk_in_use(pbd)) {
689
690
691
692
693 goto refresh_timer;
694 } else {
695
696
697
698
699
700
701
702 prb_open_block(pkc, pbd);
703 goto out;
704 }
705 }
706 }
707
708refresh_timer:
709 _prb_refresh_rx_retire_blk_timer(pkc);
710
711out:
712 spin_unlock(&po->sk.sk_receive_queue.lock);
713}
714
715static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
716 struct tpacket_block_desc *pbd1, __u32 status)
717{
718
719
720#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
721 u8 *start, *end;
722
723 start = (u8 *)pbd1;
724
725
726 start += PAGE_SIZE;
727
728 end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
729 for (; start < end; start += PAGE_SIZE)
730 flush_dcache_page(pgv_to_page(start));
731
732 smp_wmb();
733#endif
734
735
736
737 BLOCK_STATUS(pbd1) = status;
738
739
740
741#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
742 start = (u8 *)pbd1;
743 flush_dcache_page(pgv_to_page(start));
744
745 smp_wmb();
746#endif
747}
748
749
750
751
752
753
754
755
756
757
758static void prb_close_block(struct tpacket_kbdq_core *pkc1,
759 struct tpacket_block_desc *pbd1,
760 struct packet_sock *po, unsigned int stat)
761{
762 __u32 status = TP_STATUS_USER | stat;
763
764 struct tpacket3_hdr *last_pkt;
765 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
766 struct sock *sk = &po->sk;
767
768 if (atomic_read(&po->tp_drops))
769 status |= TP_STATUS_LOSING;
770
771 last_pkt = (struct tpacket3_hdr *)pkc1->prev;
772 last_pkt->tp_next_offset = 0;
773
774
775 if (BLOCK_NUM_PKTS(pbd1)) {
776 h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
777 h1->ts_last_pkt.ts_nsec = last_pkt->tp_nsec;
778 } else {
779
780
781
782
783
784 struct timespec64 ts;
785 ktime_get_real_ts64(&ts);
786 h1->ts_last_pkt.ts_sec = ts.tv_sec;
787 h1->ts_last_pkt.ts_nsec = ts.tv_nsec;
788 }
789
790 smp_wmb();
791
792
793 prb_flush_block(pkc1, pbd1, status);
794
795 sk->sk_data_ready(sk);
796
797 pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
798}
799
800static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
801{
802 pkc->reset_pending_on_curr_blk = 0;
803}
804
805
806
807
808
809
810
811
812static void prb_open_block(struct tpacket_kbdq_core *pkc1,
813 struct tpacket_block_desc *pbd1)
814{
815 struct timespec64 ts;
816 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
817
818 smp_rmb();
819
820
821
822
823
824 BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
825 BLOCK_NUM_PKTS(pbd1) = 0;
826 BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
827
828 ktime_get_real_ts64(&ts);
829
830 h1->ts_first_pkt.ts_sec = ts.tv_sec;
831 h1->ts_first_pkt.ts_nsec = ts.tv_nsec;
832
833 pkc1->pkblk_start = (char *)pbd1;
834 pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
835
836 BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
837 BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;
838
839 pbd1->version = pkc1->version;
840 pkc1->prev = pkc1->nxt_offset;
841 pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
842
843 prb_thaw_queue(pkc1);
844 _prb_refresh_rx_retire_blk_timer(pkc1);
845
846 smp_wmb();
847}
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
873 struct packet_sock *po)
874{
875 pkc->reset_pending_on_curr_blk = 1;
876 po->stats.stats3.tp_freeze_q_cnt++;
877}
878
879#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))
880
881
882
883
884
885
886
887static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
888 struct packet_sock *po)
889{
890 struct tpacket_block_desc *pbd;
891
892 smp_rmb();
893
894
895 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
896
897
898 if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
899 prb_freeze_queue(pkc, po);
900 return NULL;
901 }
902
903
904
905
906
907
908 prb_open_block(pkc, pbd);
909 return (void *)pkc->nxt_offset;
910}
911
912static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
913 struct packet_sock *po, unsigned int status)
914{
915 struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
916
917
918 if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
919
920
921
922
923
924
925
926
927
928 if (!(status & TP_STATUS_BLK_TMO)) {
929
930 write_lock(&pkc->blk_fill_in_prog_lock);
931 write_unlock(&pkc->blk_fill_in_prog_lock);
932 }
933 prb_close_block(pkc, pbd, po, status);
934 return;
935 }
936}
937
938static int prb_curr_blk_in_use(struct tpacket_block_desc *pbd)
939{
940 return TP_STATUS_USER & BLOCK_STATUS(pbd);
941}
942
943static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
944{
945 return pkc->reset_pending_on_curr_blk;
946}
947
948static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
949 __releases(&pkc->blk_fill_in_prog_lock)
950{
951 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
952
953 read_unlock(&pkc->blk_fill_in_prog_lock);
954}
955
956static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
957 struct tpacket3_hdr *ppd)
958{
959 ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb);
960}
961
962static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
963 struct tpacket3_hdr *ppd)
964{
965 ppd->hv1.tp_rxhash = 0;
966}
967
968static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
969 struct tpacket3_hdr *ppd)
970{
971 if (skb_vlan_tag_present(pkc->skb)) {
972 ppd->hv1.tp_vlan_tci = skb_vlan_tag_get(pkc->skb);
973 ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto);
974 ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
975 } else {
976 ppd->hv1.tp_vlan_tci = 0;
977 ppd->hv1.tp_vlan_tpid = 0;
978 ppd->tp_status = TP_STATUS_AVAILABLE;
979 }
980}
981
982static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
983 struct tpacket3_hdr *ppd)
984{
985 ppd->hv1.tp_padding = 0;
986 prb_fill_vlan_info(pkc, ppd);
987
988 if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
989 prb_fill_rxhash(pkc, ppd);
990 else
991 prb_clear_rxhash(pkc, ppd);
992}
993
994static void prb_fill_curr_block(char *curr,
995 struct tpacket_kbdq_core *pkc,
996 struct tpacket_block_desc *pbd,
997 unsigned int len)
998 __acquires(&pkc->blk_fill_in_prog_lock)
999{
1000 struct tpacket3_hdr *ppd;
1001
1002 ppd = (struct tpacket3_hdr *)curr;
1003 ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
1004 pkc->prev = curr;
1005 pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
1006 BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
1007 BLOCK_NUM_PKTS(pbd) += 1;
1008 read_lock(&pkc->blk_fill_in_prog_lock);
1009 prb_run_all_ft_ops(pkc, ppd);
1010}
1011
1012
1013static void *__packet_lookup_frame_in_block(struct packet_sock *po,
1014 struct sk_buff *skb,
1015 unsigned int len
1016 )
1017{
1018 struct tpacket_kbdq_core *pkc;
1019 struct tpacket_block_desc *pbd;
1020 char *curr, *end;
1021
1022 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
1023 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1024
1025
1026 if (prb_queue_frozen(pkc)) {
1027
1028
1029
1030
1031 if (prb_curr_blk_in_use(pbd)) {
1032
1033 return NULL;
1034 } else {
1035
1036
1037
1038
1039
1040
1041 prb_open_block(pkc, pbd);
1042 }
1043 }
1044
1045 smp_mb();
1046 curr = pkc->nxt_offset;
1047 pkc->skb = skb;
1048 end = (char *)pbd + pkc->kblk_size;
1049
1050
1051 if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
1052 prb_fill_curr_block(curr, pkc, pbd, len);
1053 return (void *)curr;
1054 }
1055
1056
1057 prb_retire_current_block(pkc, po, 0);
1058
1059
1060 curr = (char *)prb_dispatch_next_block(pkc, po);
1061 if (curr) {
1062 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1063 prb_fill_curr_block(curr, pkc, pbd, len);
1064 return (void *)curr;
1065 }
1066
1067
1068
1069
1070
1071 return NULL;
1072}
1073
1074static void *packet_current_rx_frame(struct packet_sock *po,
1075 struct sk_buff *skb,
1076 int status, unsigned int len)
1077{
1078 char *curr = NULL;
1079 switch (po->tp_version) {
1080 case TPACKET_V1:
1081 case TPACKET_V2:
1082 curr = packet_lookup_frame(po, &po->rx_ring,
1083 po->rx_ring.head, status);
1084 return curr;
1085 case TPACKET_V3:
1086 return __packet_lookup_frame_in_block(po, skb, len);
1087 default:
1088 WARN(1, "TPACKET version not supported\n");
1089 BUG();
1090 return NULL;
1091 }
1092}
1093
1094static void *prb_lookup_block(const struct packet_sock *po,
1095 const struct packet_ring_buffer *rb,
1096 unsigned int idx,
1097 int status)
1098{
1099 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
1100 struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);
1101
1102 if (status != BLOCK_STATUS(pbd))
1103 return NULL;
1104 return pbd;
1105}
1106
1107static int prb_previous_blk_num(struct packet_ring_buffer *rb)
1108{
1109 unsigned int prev;
1110 if (rb->prb_bdqc.kactive_blk_num)
1111 prev = rb->prb_bdqc.kactive_blk_num-1;
1112 else
1113 prev = rb->prb_bdqc.knum_blocks-1;
1114 return prev;
1115}
1116
1117
1118static void *__prb_previous_block(struct packet_sock *po,
1119 struct packet_ring_buffer *rb,
1120 int status)
1121{
1122 unsigned int previous = prb_previous_blk_num(rb);
1123 return prb_lookup_block(po, rb, previous, status);
1124}
1125
1126static void *packet_previous_rx_frame(struct packet_sock *po,
1127 struct packet_ring_buffer *rb,
1128 int status)
1129{
1130 if (po->tp_version <= TPACKET_V2)
1131 return packet_previous_frame(po, rb, status);
1132
1133 return __prb_previous_block(po, rb, status);
1134}
1135
1136static void packet_increment_rx_head(struct packet_sock *po,
1137 struct packet_ring_buffer *rb)
1138{
1139 switch (po->tp_version) {
1140 case TPACKET_V1:
1141 case TPACKET_V2:
1142 return packet_increment_head(rb);
1143 case TPACKET_V3:
1144 default:
1145 WARN(1, "TPACKET version not supported.\n");
1146 BUG();
1147 return;
1148 }
1149}
1150
1151static void *packet_previous_frame(struct packet_sock *po,
1152 struct packet_ring_buffer *rb,
1153 int status)
1154{
1155 unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
1156 return packet_lookup_frame(po, rb, previous, status);
1157}
1158
1159static void packet_increment_head(struct packet_ring_buffer *buff)
1160{
1161 buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
1162}
1163
1164static void packet_inc_pending(struct packet_ring_buffer *rb)
1165{
1166 this_cpu_inc(*rb->pending_refcnt);
1167}
1168
1169static void packet_dec_pending(struct packet_ring_buffer *rb)
1170{
1171 this_cpu_dec(*rb->pending_refcnt);
1172}
1173
1174static unsigned int packet_read_pending(const struct packet_ring_buffer *rb)
1175{
1176 unsigned int refcnt = 0;
1177 int cpu;
1178
1179
1180 if (rb->pending_refcnt == NULL)
1181 return 0;
1182
1183 for_each_possible_cpu(cpu)
1184 refcnt += *per_cpu_ptr(rb->pending_refcnt, cpu);
1185
1186 return refcnt;
1187}
1188
1189static int packet_alloc_pending(struct packet_sock *po)
1190{
1191 po->rx_ring.pending_refcnt = NULL;
1192
1193 po->tx_ring.pending_refcnt = alloc_percpu(unsigned int);
1194 if (unlikely(po->tx_ring.pending_refcnt == NULL))
1195 return -ENOBUFS;
1196
1197 return 0;
1198}
1199
1200static void packet_free_pending(struct packet_sock *po)
1201{
1202 free_percpu(po->tx_ring.pending_refcnt);
1203}
1204
1205#define ROOM_POW_OFF 2
1206#define ROOM_NONE 0x0
1207#define ROOM_LOW 0x1
1208#define ROOM_NORMAL 0x2
1209
1210static bool __tpacket_has_room(const struct packet_sock *po, int pow_off)
1211{
1212 int idx, len;
1213
1214 len = READ_ONCE(po->rx_ring.frame_max) + 1;
1215 idx = READ_ONCE(po->rx_ring.head);
1216 if (pow_off)
1217 idx += len >> pow_off;
1218 if (idx >= len)
1219 idx -= len;
1220 return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
1221}
1222
1223static bool __tpacket_v3_has_room(const struct packet_sock *po, int pow_off)
1224{
1225 int idx, len;
1226
1227 len = READ_ONCE(po->rx_ring.prb_bdqc.knum_blocks);
1228 idx = READ_ONCE(po->rx_ring.prb_bdqc.kactive_blk_num);
1229 if (pow_off)
1230 idx += len >> pow_off;
1231 if (idx >= len)
1232 idx -= len;
1233 return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
1234}
1235
1236static int __packet_rcv_has_room(const struct packet_sock *po,
1237 const struct sk_buff *skb)
1238{
1239 const struct sock *sk = &po->sk;
1240 int ret = ROOM_NONE;
1241
1242 if (po->prot_hook.func != tpacket_rcv) {
1243 int rcvbuf = READ_ONCE(sk->sk_rcvbuf);
1244 int avail = rcvbuf - atomic_read(&sk->sk_rmem_alloc)
1245 - (skb ? skb->truesize : 0);
1246
1247 if (avail > (rcvbuf >> ROOM_POW_OFF))
1248 return ROOM_NORMAL;
1249 else if (avail > 0)
1250 return ROOM_LOW;
1251 else
1252 return ROOM_NONE;
1253 }
1254
1255 if (po->tp_version == TPACKET_V3) {
1256 if (__tpacket_v3_has_room(po, ROOM_POW_OFF))
1257 ret = ROOM_NORMAL;
1258 else if (__tpacket_v3_has_room(po, 0))
1259 ret = ROOM_LOW;
1260 } else {
1261 if (__tpacket_has_room(po, ROOM_POW_OFF))
1262 ret = ROOM_NORMAL;
1263 else if (__tpacket_has_room(po, 0))
1264 ret = ROOM_LOW;
1265 }
1266
1267 return ret;
1268}
1269
1270static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
1271{
1272 int pressure, ret;
1273
1274 ret = __packet_rcv_has_room(po, skb);
1275 pressure = ret != ROOM_NORMAL;
1276
1277 if (READ_ONCE(po->pressure) != pressure)
1278 WRITE_ONCE(po->pressure, pressure);
1279
1280 return ret;
1281}
1282
1283static void packet_rcv_try_clear_pressure(struct packet_sock *po)
1284{
1285 if (READ_ONCE(po->pressure) &&
1286 __packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
1287 WRITE_ONCE(po->pressure, 0);
1288}
1289
1290static void packet_sock_destruct(struct sock *sk)
1291{
1292 skb_queue_purge(&sk->sk_error_queue);
1293
1294 WARN_ON(atomic_read(&sk->sk_rmem_alloc));
1295 WARN_ON(refcount_read(&sk->sk_wmem_alloc));
1296
1297 if (!sock_flag(sk, SOCK_DEAD)) {
1298 pr_err("Attempt to release alive packet socket: %p\n", sk);
1299 return;
1300 }
1301
1302 sk_refcnt_debug_dec(sk);
1303}
1304
1305static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb)
1306{
1307 u32 *history = po->rollover->history;
1308 u32 victim, rxhash;
1309 int i, count = 0;
1310
1311 rxhash = skb_get_hash(skb);
1312 for (i = 0; i < ROLLOVER_HLEN; i++)
1313 if (READ_ONCE(history[i]) == rxhash)
1314 count++;
1315
1316 victim = prandom_u32() % ROLLOVER_HLEN;
1317
1318
1319 if (READ_ONCE(history[victim]) != rxhash)
1320 WRITE_ONCE(history[victim], rxhash);
1321
1322 return count > (ROLLOVER_HLEN >> 1);
1323}
1324
1325static unsigned int fanout_demux_hash(struct packet_fanout *f,
1326 struct sk_buff *skb,
1327 unsigned int num)
1328{
1329 return reciprocal_scale(__skb_get_hash_symmetric(skb), num);
1330}
1331
1332static unsigned int fanout_demux_lb(struct packet_fanout *f,
1333 struct sk_buff *skb,
1334 unsigned int num)
1335{
1336 unsigned int val = atomic_inc_return(&f->rr_cur);
1337
1338 return val % num;
1339}
1340
1341static unsigned int fanout_demux_cpu(struct packet_fanout *f,
1342 struct sk_buff *skb,
1343 unsigned int num)
1344{
1345 return smp_processor_id() % num;
1346}
1347
1348static unsigned int fanout_demux_rnd(struct packet_fanout *f,
1349 struct sk_buff *skb,
1350 unsigned int num)
1351{
1352 return prandom_u32_max(num);
1353}
1354
1355static unsigned int fanout_demux_rollover(struct packet_fanout *f,
1356 struct sk_buff *skb,
1357 unsigned int idx, bool try_self,
1358 unsigned int num)
1359{
1360 struct packet_sock *po, *po_next, *po_skip = NULL;
1361 unsigned int i, j, room = ROOM_NONE;
1362
1363 po = pkt_sk(rcu_dereference(f->arr[idx]));
1364
1365 if (try_self) {
1366 room = packet_rcv_has_room(po, skb);
1367 if (room == ROOM_NORMAL ||
1368 (room == ROOM_LOW && !fanout_flow_is_huge(po, skb)))
1369 return idx;
1370 po_skip = po;
1371 }
1372
1373 i = j = min_t(int, po->rollover->sock, num - 1);
1374 do {
1375 po_next = pkt_sk(rcu_dereference(f->arr[i]));
1376 if (po_next != po_skip && !READ_ONCE(po_next->pressure) &&
1377 packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) {
1378 if (i != j)
1379 po->rollover->sock = i;
1380 atomic_long_inc(&po->rollover->num);
1381 if (room == ROOM_LOW)
1382 atomic_long_inc(&po->rollover->num_huge);
1383 return i;
1384 }
1385
1386 if (++i == num)
1387 i = 0;
1388 } while (i != j);
1389
1390 atomic_long_inc(&po->rollover->num_failed);
1391 return idx;
1392}
1393
1394static unsigned int fanout_demux_qm(struct packet_fanout *f,
1395 struct sk_buff *skb,
1396 unsigned int num)
1397{
1398 return skb_get_queue_mapping(skb) % num;
1399}
1400
1401static unsigned int fanout_demux_bpf(struct packet_fanout *f,
1402 struct sk_buff *skb,
1403 unsigned int num)
1404{
1405 struct bpf_prog *prog;
1406 unsigned int ret = 0;
1407
1408 rcu_read_lock();
1409 prog = rcu_dereference(f->bpf_prog);
1410 if (prog)
1411 ret = bpf_prog_run_clear_cb(prog, skb) % num;
1412 rcu_read_unlock();
1413
1414 return ret;
1415}
1416
1417static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
1418{
1419 return f->flags & (flag >> 8);
1420}
1421
1422static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
1423 struct packet_type *pt, struct net_device *orig_dev)
1424{
1425 struct packet_fanout *f = pt->af_packet_priv;
1426 unsigned int num = READ_ONCE(f->num_members);
1427 struct net *net = read_pnet(&f->net);
1428 struct packet_sock *po;
1429 unsigned int idx;
1430
1431 if (!net_eq(dev_net(dev), net) || !num) {
1432 kfree_skb(skb);
1433 return 0;
1434 }
1435
1436 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
1437 skb = ip_check_defrag(net, skb, IP_DEFRAG_AF_PACKET);
1438 if (!skb)
1439 return 0;
1440 }
1441 switch (f->type) {
1442 case PACKET_FANOUT_HASH:
1443 default:
1444 idx = fanout_demux_hash(f, skb, num);
1445 break;
1446 case PACKET_FANOUT_LB:
1447 idx = fanout_demux_lb(f, skb, num);
1448 break;
1449 case PACKET_FANOUT_CPU:
1450 idx = fanout_demux_cpu(f, skb, num);
1451 break;
1452 case PACKET_FANOUT_RND:
1453 idx = fanout_demux_rnd(f, skb, num);
1454 break;
1455 case PACKET_FANOUT_QM:
1456 idx = fanout_demux_qm(f, skb, num);
1457 break;
1458 case PACKET_FANOUT_ROLLOVER:
1459 idx = fanout_demux_rollover(f, skb, 0, false, num);
1460 break;
1461 case PACKET_FANOUT_CBPF:
1462 case PACKET_FANOUT_EBPF:
1463 idx = fanout_demux_bpf(f, skb, num);
1464 break;
1465 }
1466
1467 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER))
1468 idx = fanout_demux_rollover(f, skb, idx, true, num);
1469
1470 po = pkt_sk(rcu_dereference(f->arr[idx]));
1471 return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
1472}
1473
1474DEFINE_MUTEX(fanout_mutex);
1475EXPORT_SYMBOL_GPL(fanout_mutex);
1476static LIST_HEAD(fanout_list);
1477static u16 fanout_next_id;
1478
1479static void __fanout_link(struct sock *sk, struct packet_sock *po)
1480{
1481 struct packet_fanout *f = po->fanout;
1482
1483 spin_lock(&f->lock);
1484 rcu_assign_pointer(f->arr[f->num_members], sk);
1485 smp_wmb();
1486 f->num_members++;
1487 if (f->num_members == 1)
1488 dev_add_pack(&f->prot_hook);
1489 spin_unlock(&f->lock);
1490}
1491
1492static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
1493{
1494 struct packet_fanout *f = po->fanout;
1495 int i;
1496
1497 spin_lock(&f->lock);
1498 for (i = 0; i < f->num_members; i++) {
1499 if (rcu_dereference_protected(f->arr[i],
1500 lockdep_is_held(&f->lock)) == sk)
1501 break;
1502 }
1503 BUG_ON(i >= f->num_members);
1504 rcu_assign_pointer(f->arr[i],
1505 rcu_dereference_protected(f->arr[f->num_members - 1],
1506 lockdep_is_held(&f->lock)));
1507 f->num_members--;
1508 if (f->num_members == 0)
1509 __dev_remove_pack(&f->prot_hook);
1510 spin_unlock(&f->lock);
1511}
1512
1513static bool match_fanout_group(struct packet_type *ptype, struct sock *sk)
1514{
1515 if (sk->sk_family != PF_PACKET)
1516 return false;
1517
1518 return ptype->af_packet_priv == pkt_sk(sk)->fanout;
1519}
1520
1521static void fanout_init_data(struct packet_fanout *f)
1522{
1523 switch (f->type) {
1524 case PACKET_FANOUT_LB:
1525 atomic_set(&f->rr_cur, 0);
1526 break;
1527 case PACKET_FANOUT_CBPF:
1528 case PACKET_FANOUT_EBPF:
1529 RCU_INIT_POINTER(f->bpf_prog, NULL);
1530 break;
1531 }
1532}
1533
1534static void __fanout_set_data_bpf(struct packet_fanout *f, struct bpf_prog *new)
1535{
1536 struct bpf_prog *old;
1537
1538 spin_lock(&f->lock);
1539 old = rcu_dereference_protected(f->bpf_prog, lockdep_is_held(&f->lock));
1540 rcu_assign_pointer(f->bpf_prog, new);
1541 spin_unlock(&f->lock);
1542
1543 if (old) {
1544 synchronize_net();
1545 bpf_prog_destroy(old);
1546 }
1547}
1548
1549static int fanout_set_data_cbpf(struct packet_sock *po, sockptr_t data,
1550 unsigned int len)
1551{
1552 struct bpf_prog *new;
1553 struct sock_fprog fprog;
1554 int ret;
1555
1556 if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
1557 return -EPERM;
1558
1559 ret = copy_bpf_fprog_from_user(&fprog, data, len);
1560 if (ret)
1561 return ret;
1562
1563 ret = bpf_prog_create_from_user(&new, &fprog, NULL, false);
1564 if (ret)
1565 return ret;
1566
1567 __fanout_set_data_bpf(po->fanout, new);
1568 return 0;
1569}
1570
1571static int fanout_set_data_ebpf(struct packet_sock *po, sockptr_t data,
1572 unsigned int len)
1573{
1574 struct bpf_prog *new;
1575 u32 fd;
1576
1577 if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
1578 return -EPERM;
1579 if (len != sizeof(fd))
1580 return -EINVAL;
1581 if (copy_from_sockptr(&fd, data, len))
1582 return -EFAULT;
1583
1584 new = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
1585 if (IS_ERR(new))
1586 return PTR_ERR(new);
1587
1588 __fanout_set_data_bpf(po->fanout, new);
1589 return 0;
1590}
1591
1592static int fanout_set_data(struct packet_sock *po, sockptr_t data,
1593 unsigned int len)
1594{
1595 switch (po->fanout->type) {
1596 case PACKET_FANOUT_CBPF:
1597 return fanout_set_data_cbpf(po, data, len);
1598 case PACKET_FANOUT_EBPF:
1599 return fanout_set_data_ebpf(po, data, len);
1600 default:
1601 return -EINVAL;
1602 }
1603}
1604
1605static void fanout_release_data(struct packet_fanout *f)
1606{
1607 switch (f->type) {
1608 case PACKET_FANOUT_CBPF:
1609 case PACKET_FANOUT_EBPF:
1610 __fanout_set_data_bpf(f, NULL);
1611 }
1612}
1613
1614static bool __fanout_id_is_free(struct sock *sk, u16 candidate_id)
1615{
1616 struct packet_fanout *f;
1617
1618 list_for_each_entry(f, &fanout_list, list) {
1619 if (f->id == candidate_id &&
1620 read_pnet(&f->net) == sock_net(sk)) {
1621 return false;
1622 }
1623 }
1624 return true;
1625}
1626
1627static bool fanout_find_new_id(struct sock *sk, u16 *new_id)
1628{
1629 u16 id = fanout_next_id;
1630
1631 do {
1632 if (__fanout_id_is_free(sk, id)) {
1633 *new_id = id;
1634 fanout_next_id = id + 1;
1635 return true;
1636 }
1637
1638 id++;
1639 } while (id != fanout_next_id);
1640
1641 return false;
1642}
1643
1644static int fanout_add(struct sock *sk, struct fanout_args *args)
1645{
1646 struct packet_rollover *rollover = NULL;
1647 struct packet_sock *po = pkt_sk(sk);
1648 u16 type_flags = args->type_flags;
1649 struct packet_fanout *f, *match;
1650 u8 type = type_flags & 0xff;
1651 u8 flags = type_flags >> 8;
1652 u16 id = args->id;
1653 int err;
1654
1655 switch (type) {
1656 case PACKET_FANOUT_ROLLOVER:
1657 if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
1658 return -EINVAL;
1659 break;
1660 case PACKET_FANOUT_HASH:
1661 case PACKET_FANOUT_LB:
1662 case PACKET_FANOUT_CPU:
1663 case PACKET_FANOUT_RND:
1664 case PACKET_FANOUT_QM:
1665 case PACKET_FANOUT_CBPF:
1666 case PACKET_FANOUT_EBPF:
1667 break;
1668 default:
1669 return -EINVAL;
1670 }
1671
1672 mutex_lock(&fanout_mutex);
1673
1674 err = -EALREADY;
1675 if (po->fanout)
1676 goto out;
1677
1678 if (type == PACKET_FANOUT_ROLLOVER ||
1679 (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)) {
1680 err = -ENOMEM;
1681 rollover = kzalloc(sizeof(*rollover), GFP_KERNEL);
1682 if (!rollover)
1683 goto out;
1684 atomic_long_set(&rollover->num, 0);
1685 atomic_long_set(&rollover->num_huge, 0);
1686 atomic_long_set(&rollover->num_failed, 0);
1687 }
1688
1689 if (type_flags & PACKET_FANOUT_FLAG_UNIQUEID) {
1690 if (id != 0) {
1691 err = -EINVAL;
1692 goto out;
1693 }
1694 if (!fanout_find_new_id(sk, &id)) {
1695 err = -ENOMEM;
1696 goto out;
1697 }
1698
1699 flags &= ~(PACKET_FANOUT_FLAG_UNIQUEID >> 8);
1700 }
1701
1702 match = NULL;
1703 list_for_each_entry(f, &fanout_list, list) {
1704 if (f->id == id &&
1705 read_pnet(&f->net) == sock_net(sk)) {
1706 match = f;
1707 break;
1708 }
1709 }
1710 err = -EINVAL;
1711 if (match) {
1712 if (match->flags != flags)
1713 goto out;
1714 if (args->max_num_members &&
1715 args->max_num_members != match->max_num_members)
1716 goto out;
1717 } else {
1718 if (args->max_num_members > PACKET_FANOUT_MAX)
1719 goto out;
1720 if (!args->max_num_members)
1721
1722 args->max_num_members = 256;
1723 err = -ENOMEM;
1724 match = kvzalloc(struct_size(match, arr, args->max_num_members),
1725 GFP_KERNEL);
1726 if (!match)
1727 goto out;
1728 write_pnet(&match->net, sock_net(sk));
1729 match->id = id;
1730 match->type = type;
1731 match->flags = flags;
1732 INIT_LIST_HEAD(&match->list);
1733 spin_lock_init(&match->lock);
1734 refcount_set(&match->sk_ref, 0);
1735 fanout_init_data(match);
1736 match->prot_hook.type = po->prot_hook.type;
1737 match->prot_hook.dev = po->prot_hook.dev;
1738 match->prot_hook.func = packet_rcv_fanout;
1739 match->prot_hook.af_packet_priv = match;
1740 match->prot_hook.id_match = match_fanout_group;
1741 match->max_num_members = args->max_num_members;
1742 list_add(&match->list, &fanout_list);
1743 }
1744 err = -EINVAL;
1745
1746 spin_lock(&po->bind_lock);
1747 if (po->running &&
1748 match->type == type &&
1749 match->prot_hook.type == po->prot_hook.type &&
1750 match->prot_hook.dev == po->prot_hook.dev) {
1751 err = -ENOSPC;
1752 if (refcount_read(&match->sk_ref) < match->max_num_members) {
1753 __dev_remove_pack(&po->prot_hook);
1754 po->fanout = match;
1755 po->rollover = rollover;
1756 rollover = NULL;
1757 refcount_set(&match->sk_ref, refcount_read(&match->sk_ref) + 1);
1758 __fanout_link(sk, po);
1759 err = 0;
1760 }
1761 }
1762 spin_unlock(&po->bind_lock);
1763
1764 if (err && !refcount_read(&match->sk_ref)) {
1765 list_del(&match->list);
1766 kvfree(match);
1767 }
1768
1769out:
1770 kfree(rollover);
1771 mutex_unlock(&fanout_mutex);
1772 return err;
1773}
1774
1775
1776
1777
1778
1779
1780static struct packet_fanout *fanout_release(struct sock *sk)
1781{
1782 struct packet_sock *po = pkt_sk(sk);
1783 struct packet_fanout *f;
1784
1785 mutex_lock(&fanout_mutex);
1786 f = po->fanout;
1787 if (f) {
1788 po->fanout = NULL;
1789
1790 if (refcount_dec_and_test(&f->sk_ref))
1791 list_del(&f->list);
1792 else
1793 f = NULL;
1794 }
1795 mutex_unlock(&fanout_mutex);
1796
1797 return f;
1798}
1799
1800static bool packet_extra_vlan_len_allowed(const struct net_device *dev,
1801 struct sk_buff *skb)
1802{
1803
1804
1805
1806
1807 if (unlikely(dev->type != ARPHRD_ETHER))
1808 return false;
1809
1810 skb_reset_mac_header(skb);
1811 return likely(eth_hdr(skb)->h_proto == htons(ETH_P_8021Q));
1812}
1813
1814static const struct proto_ops packet_ops;
1815
1816static const struct proto_ops packet_ops_spkt;
1817
1818static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
1819 struct packet_type *pt, struct net_device *orig_dev)
1820{
1821 struct sock *sk;
1822 struct sockaddr_pkt *spkt;
1823
1824
1825
1826
1827
1828
1829 sk = pt->af_packet_priv;
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842 if (skb->pkt_type == PACKET_LOOPBACK)
1843 goto out;
1844
1845 if (!net_eq(dev_net(dev), sock_net(sk)))
1846 goto out;
1847
1848 skb = skb_share_check(skb, GFP_ATOMIC);
1849 if (skb == NULL)
1850 goto oom;
1851
1852
1853 skb_dst_drop(skb);
1854
1855
1856 nf_reset_ct(skb);
1857
1858 spkt = &PACKET_SKB_CB(skb)->sa.pkt;
1859
1860 skb_push(skb, skb->data - skb_mac_header(skb));
1861
1862
1863
1864
1865
1866 spkt->spkt_family = dev->type;
1867 strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
1868 spkt->spkt_protocol = skb->protocol;
1869
1870
1871
1872
1873
1874
1875 if (sock_queue_rcv_skb(sk, skb) == 0)
1876 return 0;
1877
1878out:
1879 kfree_skb(skb);
1880oom:
1881 return 0;
1882}
1883
1884static void packet_parse_headers(struct sk_buff *skb, struct socket *sock)
1885{
1886 if ((!skb->protocol || skb->protocol == htons(ETH_P_ALL)) &&
1887 sock->type == SOCK_RAW) {
1888 skb_reset_mac_header(skb);
1889 skb->protocol = dev_parse_header_protocol(skb);
1890 }
1891
1892 skb_probe_transport_header(skb);
1893}
1894
1895
1896
1897
1898
1899
1900static int packet_sendmsg_spkt(struct socket *sock, struct msghdr *msg,
1901 size_t len)
1902{
1903 struct sock *sk = sock->sk;
1904 DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name);
1905 struct sk_buff *skb = NULL;
1906 struct net_device *dev;
1907 struct sockcm_cookie sockc;
1908 __be16 proto = 0;
1909 int err;
1910 int extra_len = 0;
1911
1912
1913
1914
1915
1916 if (saddr) {
1917 if (msg->msg_namelen < sizeof(struct sockaddr))
1918 return -EINVAL;
1919 if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
1920 proto = saddr->spkt_protocol;
1921 } else
1922 return -ENOTCONN;
1923
1924
1925
1926
1927
1928 saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
1929retry:
1930 rcu_read_lock();
1931 dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
1932 err = -ENODEV;
1933 if (dev == NULL)
1934 goto out_unlock;
1935
1936 err = -ENETDOWN;
1937 if (!(dev->flags & IFF_UP))
1938 goto out_unlock;
1939
1940
1941
1942
1943
1944
1945 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
1946 if (!netif_supports_nofcs(dev)) {
1947 err = -EPROTONOSUPPORT;
1948 goto out_unlock;
1949 }
1950 extra_len = 4;
1951 }
1952
1953 err = -EMSGSIZE;
1954 if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
1955 goto out_unlock;
1956
1957 if (!skb) {
1958 size_t reserved = LL_RESERVED_SPACE(dev);
1959 int tlen = dev->needed_tailroom;
1960 unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
1961
1962 rcu_read_unlock();
1963 skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
1964 if (skb == NULL)
1965 return -ENOBUFS;
1966
1967
1968
1969
1970 skb_reserve(skb, reserved);
1971 skb_reset_network_header(skb);
1972
1973
1974 if (hhlen) {
1975 skb->data -= hhlen;
1976 skb->tail -= hhlen;
1977 if (len < hhlen)
1978 skb_reset_network_header(skb);
1979 }
1980 err = memcpy_from_msg(skb_put(skb, len), msg, len);
1981 if (err)
1982 goto out_free;
1983 goto retry;
1984 }
1985
1986 if (!dev_validate_header(dev, skb->data, len)) {
1987 err = -EINVAL;
1988 goto out_unlock;
1989 }
1990 if (len > (dev->mtu + dev->hard_header_len + extra_len) &&
1991 !packet_extra_vlan_len_allowed(dev, skb)) {
1992 err = -EMSGSIZE;
1993 goto out_unlock;
1994 }
1995
1996 sockcm_init(&sockc, sk);
1997 if (msg->msg_controllen) {
1998 err = sock_cmsg_send(sk, msg, &sockc);
1999 if (unlikely(err))
2000 goto out_unlock;
2001 }
2002
2003 skb->protocol = proto;
2004 skb->dev = dev;
2005 skb->priority = sk->sk_priority;
2006 skb->mark = sk->sk_mark;
2007 skb->tstamp = sockc.transmit_time;
2008
2009 skb_setup_tx_timestamp(skb, sockc.tsflags);
2010
2011 if (unlikely(extra_len == 4))
2012 skb->no_fcs = 1;
2013
2014 packet_parse_headers(skb, sock);
2015
2016 dev_queue_xmit(skb);
2017 rcu_read_unlock();
2018 return len;
2019
2020out_unlock:
2021 rcu_read_unlock();
2022out_free:
2023 kfree_skb(skb);
2024 return err;
2025}
2026
2027static unsigned int run_filter(struct sk_buff *skb,
2028 const struct sock *sk,
2029 unsigned int res)
2030{
2031 struct sk_filter *filter;
2032
2033 rcu_read_lock();
2034 filter = rcu_dereference(sk->sk_filter);
2035 if (filter != NULL)
2036 res = bpf_prog_run_clear_cb(filter->prog, skb);
2037 rcu_read_unlock();
2038
2039 return res;
2040}
2041
2042static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb,
2043 size_t *len)
2044{
2045 struct virtio_net_hdr vnet_hdr;
2046
2047 if (*len < sizeof(vnet_hdr))
2048 return -EINVAL;
2049 *len -= sizeof(vnet_hdr);
2050
2051 if (virtio_net_hdr_from_skb(skb, &vnet_hdr, vio_le(), true, 0))
2052 return -EINVAL;
2053
2054 return memcpy_to_msg(msg, (void *)&vnet_hdr, sizeof(vnet_hdr));
2055}
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
2070 struct packet_type *pt, struct net_device *orig_dev)
2071{
2072 struct sock *sk;
2073 struct sockaddr_ll *sll;
2074 struct packet_sock *po;
2075 u8 *skb_head = skb->data;
2076 int skb_len = skb->len;
2077 unsigned int snaplen, res;
2078 bool is_drop_n_account = false;
2079
2080 if (skb->pkt_type == PACKET_LOOPBACK)
2081 goto drop;
2082
2083 sk = pt->af_packet_priv;
2084 po = pkt_sk(sk);
2085
2086 if (!net_eq(dev_net(dev), sock_net(sk)))
2087 goto drop;
2088
2089 skb->dev = dev;
2090
2091 if (dev_has_header(dev)) {
2092
2093
2094
2095
2096
2097
2098
2099 if (sk->sk_type != SOCK_DGRAM)
2100 skb_push(skb, skb->data - skb_mac_header(skb));
2101 else if (skb->pkt_type == PACKET_OUTGOING) {
2102
2103 skb_pull(skb, skb_network_offset(skb));
2104 }
2105 }
2106
2107 snaplen = skb->len;
2108
2109 res = run_filter(skb, sk, snaplen);
2110 if (!res)
2111 goto drop_n_restore;
2112 if (snaplen > res)
2113 snaplen = res;
2114
2115 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
2116 goto drop_n_acct;
2117
2118 if (skb_shared(skb)) {
2119 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
2120 if (nskb == NULL)
2121 goto drop_n_acct;
2122
2123 if (skb_head != skb->data) {
2124 skb->data = skb_head;
2125 skb->len = skb_len;
2126 }
2127 consume_skb(skb);
2128 skb = nskb;
2129 }
2130
2131 sock_skb_cb_check_size(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8);
2132
2133 sll = &PACKET_SKB_CB(skb)->sa.ll;
2134 sll->sll_hatype = dev->type;
2135 sll->sll_pkttype = skb->pkt_type;
2136 if (unlikely(po->origdev))
2137 sll->sll_ifindex = orig_dev->ifindex;
2138 else
2139 sll->sll_ifindex = dev->ifindex;
2140
2141 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
2142
2143
2144
2145
2146 PACKET_SKB_CB(skb)->sa.origlen = skb->len;
2147
2148 if (pskb_trim(skb, snaplen))
2149 goto drop_n_acct;
2150
2151 skb_set_owner_r(skb, sk);
2152 skb->dev = NULL;
2153 skb_dst_drop(skb);
2154
2155
2156 nf_reset_ct(skb);
2157
2158 spin_lock(&sk->sk_receive_queue.lock);
2159 po->stats.stats1.tp_packets++;
2160 sock_skb_set_dropcount(sk, skb);
2161 __skb_queue_tail(&sk->sk_receive_queue, skb);
2162 spin_unlock(&sk->sk_receive_queue.lock);
2163 sk->sk_data_ready(sk);
2164 return 0;
2165
2166drop_n_acct:
2167 is_drop_n_account = true;
2168 atomic_inc(&po->tp_drops);
2169 atomic_inc(&sk->sk_drops);
2170
2171drop_n_restore:
2172 if (skb_head != skb->data && skb_shared(skb)) {
2173 skb->data = skb_head;
2174 skb->len = skb_len;
2175 }
2176drop:
2177 if (!is_drop_n_account)
2178 consume_skb(skb);
2179 else
2180 kfree_skb(skb);
2181 return 0;
2182}
2183
2184static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
2185 struct packet_type *pt, struct net_device *orig_dev)
2186{
2187 struct sock *sk;
2188 struct packet_sock *po;
2189 struct sockaddr_ll *sll;
2190 union tpacket_uhdr h;
2191 u8 *skb_head = skb->data;
2192 int skb_len = skb->len;
2193 unsigned int snaplen, res;
2194 unsigned long status = TP_STATUS_USER;
2195 unsigned short macoff, hdrlen;
2196 unsigned int netoff;
2197 struct sk_buff *copy_skb = NULL;
2198 struct timespec64 ts;
2199 __u32 ts_status;
2200 bool is_drop_n_account = false;
2201 unsigned int slot_id = 0;
2202 bool do_vnet = false;
2203
2204
2205
2206
2207
2208 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32);
2209 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48);
2210
2211 if (skb->pkt_type == PACKET_LOOPBACK)
2212 goto drop;
2213
2214 sk = pt->af_packet_priv;
2215 po = pkt_sk(sk);
2216
2217 if (!net_eq(dev_net(dev), sock_net(sk)))
2218 goto drop;
2219
2220 if (dev_has_header(dev)) {
2221 if (sk->sk_type != SOCK_DGRAM)
2222 skb_push(skb, skb->data - skb_mac_header(skb));
2223 else if (skb->pkt_type == PACKET_OUTGOING) {
2224
2225 skb_pull(skb, skb_network_offset(skb));
2226 }
2227 }
2228
2229 snaplen = skb->len;
2230
2231 res = run_filter(skb, sk, snaplen);
2232 if (!res)
2233 goto drop_n_restore;
2234
2235
2236 if (__packet_rcv_has_room(po, skb) == ROOM_NONE) {
2237 atomic_inc(&po->tp_drops);
2238 goto drop_n_restore;
2239 }
2240
2241 if (skb->ip_summed == CHECKSUM_PARTIAL)
2242 status |= TP_STATUS_CSUMNOTREADY;
2243 else if (skb->pkt_type != PACKET_OUTGOING &&
2244 (skb->ip_summed == CHECKSUM_COMPLETE ||
2245 skb_csum_unnecessary(skb)))
2246 status |= TP_STATUS_CSUM_VALID;
2247
2248 if (snaplen > res)
2249 snaplen = res;
2250
2251 if (sk->sk_type == SOCK_DGRAM) {
2252 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
2253 po->tp_reserve;
2254 } else {
2255 unsigned int maclen = skb_network_offset(skb);
2256 netoff = TPACKET_ALIGN(po->tp_hdrlen +
2257 (maclen < 16 ? 16 : maclen)) +
2258 po->tp_reserve;
2259 if (po->has_vnet_hdr) {
2260 netoff += sizeof(struct virtio_net_hdr);
2261 do_vnet = true;
2262 }
2263 macoff = netoff - maclen;
2264 }
2265 if (netoff > USHRT_MAX) {
2266 atomic_inc(&po->tp_drops);
2267 goto drop_n_restore;
2268 }
2269 if (po->tp_version <= TPACKET_V2) {
2270 if (macoff + snaplen > po->rx_ring.frame_size) {
2271 if (po->copy_thresh &&
2272 atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
2273 if (skb_shared(skb)) {
2274 copy_skb = skb_clone(skb, GFP_ATOMIC);
2275 } else {
2276 copy_skb = skb_get(skb);
2277 skb_head = skb->data;
2278 }
2279 if (copy_skb)
2280 skb_set_owner_r(copy_skb, sk);
2281 }
2282 snaplen = po->rx_ring.frame_size - macoff;
2283 if ((int)snaplen < 0) {
2284 snaplen = 0;
2285 do_vnet = false;
2286 }
2287 }
2288 } else if (unlikely(macoff + snaplen >
2289 GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) {
2290 u32 nval;
2291
2292 nval = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len - macoff;
2293 pr_err_once("tpacket_rcv: packet too big, clamped from %u to %u. macoff=%u\n",
2294 snaplen, nval, macoff);
2295 snaplen = nval;
2296 if (unlikely((int)snaplen < 0)) {
2297 snaplen = 0;
2298 macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len;
2299 do_vnet = false;
2300 }
2301 }
2302 spin_lock(&sk->sk_receive_queue.lock);
2303 h.raw = packet_current_rx_frame(po, skb,
2304 TP_STATUS_KERNEL, (macoff+snaplen));
2305 if (!h.raw)
2306 goto drop_n_account;
2307
2308 if (po->tp_version <= TPACKET_V2) {
2309 slot_id = po->rx_ring.head;
2310 if (test_bit(slot_id, po->rx_ring.rx_owner_map))
2311 goto drop_n_account;
2312 __set_bit(slot_id, po->rx_ring.rx_owner_map);
2313 }
2314
2315 if (do_vnet &&
2316 virtio_net_hdr_from_skb(skb, h.raw + macoff -
2317 sizeof(struct virtio_net_hdr),
2318 vio_le(), true, 0)) {
2319 if (po->tp_version == TPACKET_V3)
2320 prb_clear_blk_fill_status(&po->rx_ring);
2321 goto drop_n_account;
2322 }
2323
2324 if (po->tp_version <= TPACKET_V2) {
2325 packet_increment_rx_head(po, &po->rx_ring);
2326
2327
2328
2329
2330
2331
2332 if (atomic_read(&po->tp_drops))
2333 status |= TP_STATUS_LOSING;
2334 }
2335
2336 po->stats.stats1.tp_packets++;
2337 if (copy_skb) {
2338 status |= TP_STATUS_COPY;
2339 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
2340 }
2341 spin_unlock(&sk->sk_receive_queue.lock);
2342
2343 skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
2344
2345
2346
2347
2348 ts_status = tpacket_get_timestamp(skb, &ts,
2349 po->tp_tstamp | SOF_TIMESTAMPING_SOFTWARE);
2350 if (!ts_status)
2351 ktime_get_real_ts64(&ts);
2352
2353 status |= ts_status;
2354
2355 switch (po->tp_version) {
2356 case TPACKET_V1:
2357 h.h1->tp_len = skb->len;
2358 h.h1->tp_snaplen = snaplen;
2359 h.h1->tp_mac = macoff;
2360 h.h1->tp_net = netoff;
2361 h.h1->tp_sec = ts.tv_sec;
2362 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
2363 hdrlen = sizeof(*h.h1);
2364 break;
2365 case TPACKET_V2:
2366 h.h2->tp_len = skb->len;
2367 h.h2->tp_snaplen = snaplen;
2368 h.h2->tp_mac = macoff;
2369 h.h2->tp_net = netoff;
2370 h.h2->tp_sec = ts.tv_sec;
2371 h.h2->tp_nsec = ts.tv_nsec;
2372 if (skb_vlan_tag_present(skb)) {
2373 h.h2->tp_vlan_tci = skb_vlan_tag_get(skb);
2374 h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto);
2375 status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
2376 } else {
2377 h.h2->tp_vlan_tci = 0;
2378 h.h2->tp_vlan_tpid = 0;
2379 }
2380 memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding));
2381 hdrlen = sizeof(*h.h2);
2382 break;
2383 case TPACKET_V3:
2384
2385
2386
2387 h.h3->tp_status |= status;
2388 h.h3->tp_len = skb->len;
2389 h.h3->tp_snaplen = snaplen;
2390 h.h3->tp_mac = macoff;
2391 h.h3->tp_net = netoff;
2392 h.h3->tp_sec = ts.tv_sec;
2393 h.h3->tp_nsec = ts.tv_nsec;
2394 memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding));
2395 hdrlen = sizeof(*h.h3);
2396 break;
2397 default:
2398 BUG();
2399 }
2400
2401 sll = h.raw + TPACKET_ALIGN(hdrlen);
2402 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
2403 sll->sll_family = AF_PACKET;
2404 sll->sll_hatype = dev->type;
2405 sll->sll_protocol = skb->protocol;
2406 sll->sll_pkttype = skb->pkt_type;
2407 if (unlikely(po->origdev))
2408 sll->sll_ifindex = orig_dev->ifindex;
2409 else
2410 sll->sll_ifindex = dev->ifindex;
2411
2412 smp_mb();
2413
2414#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
2415 if (po->tp_version <= TPACKET_V2) {
2416 u8 *start, *end;
2417
2418 end = (u8 *) PAGE_ALIGN((unsigned long) h.raw +
2419 macoff + snaplen);
2420
2421 for (start = h.raw; start < end; start += PAGE_SIZE)
2422 flush_dcache_page(pgv_to_page(start));
2423 }
2424 smp_wmb();
2425#endif
2426
2427 if (po->tp_version <= TPACKET_V2) {
2428 spin_lock(&sk->sk_receive_queue.lock);
2429 __packet_set_status(po, h.raw, status);
2430 __clear_bit(slot_id, po->rx_ring.rx_owner_map);
2431 spin_unlock(&sk->sk_receive_queue.lock);
2432 sk->sk_data_ready(sk);
2433 } else if (po->tp_version == TPACKET_V3) {
2434 prb_clear_blk_fill_status(&po->rx_ring);
2435 }
2436
2437drop_n_restore:
2438 if (skb_head != skb->data && skb_shared(skb)) {
2439 skb->data = skb_head;
2440 skb->len = skb_len;
2441 }
2442drop:
2443 if (!is_drop_n_account)
2444 consume_skb(skb);
2445 else
2446 kfree_skb(skb);
2447 return 0;
2448
2449drop_n_account:
2450 spin_unlock(&sk->sk_receive_queue.lock);
2451 atomic_inc(&po->tp_drops);
2452 is_drop_n_account = true;
2453
2454 sk->sk_data_ready(sk);
2455 kfree_skb(copy_skb);
2456 goto drop_n_restore;
2457}
2458
2459static void tpacket_destruct_skb(struct sk_buff *skb)
2460{
2461 struct packet_sock *po = pkt_sk(skb->sk);
2462
2463 if (likely(po->tx_ring.pg_vec)) {
2464 void *ph;
2465 __u32 ts;
2466
2467 ph = skb_zcopy_get_nouarg(skb);
2468 packet_dec_pending(&po->tx_ring);
2469
2470 ts = __packet_set_timestamp(po, ph, skb);
2471 __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
2472
2473 if (!packet_read_pending(&po->tx_ring))
2474 complete(&po->skb_completion);
2475 }
2476
2477 sock_wfree(skb);
2478}
2479
2480static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len)
2481{
2482 if ((vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
2483 (__virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
2484 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2 >
2485 __virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len)))
2486 vnet_hdr->hdr_len = __cpu_to_virtio16(vio_le(),
2487 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
2488 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2);
2489
2490 if (__virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len) > len)
2491 return -EINVAL;
2492
2493 return 0;
2494}
2495
2496static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len,
2497 struct virtio_net_hdr *vnet_hdr)
2498{
2499 if (*len < sizeof(*vnet_hdr))
2500 return -EINVAL;
2501 *len -= sizeof(*vnet_hdr);
2502
2503 if (!copy_from_iter_full(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter))
2504 return -EFAULT;
2505
2506 return __packet_snd_vnet_parse(vnet_hdr, *len);
2507}
2508
2509static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
2510 void *frame, struct net_device *dev, void *data, int tp_len,
2511 __be16 proto, unsigned char *addr, int hlen, int copylen,
2512 const struct sockcm_cookie *sockc)
2513{
2514 union tpacket_uhdr ph;
2515 int to_write, offset, len, nr_frags, len_max;
2516 struct socket *sock = po->sk.sk_socket;
2517 struct page *page;
2518 int err;
2519
2520 ph.raw = frame;
2521
2522 skb->protocol = proto;
2523 skb->dev = dev;
2524 skb->priority = po->sk.sk_priority;
2525 skb->mark = po->sk.sk_mark;
2526 skb->tstamp = sockc->transmit_time;
2527 skb_setup_tx_timestamp(skb, sockc->tsflags);
2528 skb_zcopy_set_nouarg(skb, ph.raw);
2529
2530 skb_reserve(skb, hlen);
2531 skb_reset_network_header(skb);
2532
2533 to_write = tp_len;
2534
2535 if (sock->type == SOCK_DGRAM) {
2536 err = dev_hard_header(skb, dev, ntohs(proto), addr,
2537 NULL, tp_len);
2538 if (unlikely(err < 0))
2539 return -EINVAL;
2540 } else if (copylen) {
2541 int hdrlen = min_t(int, copylen, tp_len);
2542
2543 skb_push(skb, dev->hard_header_len);
2544 skb_put(skb, copylen - dev->hard_header_len);
2545 err = skb_store_bits(skb, 0, data, hdrlen);
2546 if (unlikely(err))
2547 return err;
2548 if (!dev_validate_header(dev, skb->data, hdrlen))
2549 return -EINVAL;
2550
2551 data += hdrlen;
2552 to_write -= hdrlen;
2553 }
2554
2555 offset = offset_in_page(data);
2556 len_max = PAGE_SIZE - offset;
2557 len = ((to_write > len_max) ? len_max : to_write);
2558
2559 skb->data_len = to_write;
2560 skb->len += to_write;
2561 skb->truesize += to_write;
2562 refcount_add(to_write, &po->sk.sk_wmem_alloc);
2563
2564 while (likely(to_write)) {
2565 nr_frags = skb_shinfo(skb)->nr_frags;
2566
2567 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
2568 pr_err("Packet exceed the number of skb frags(%lu)\n",
2569 MAX_SKB_FRAGS);
2570 return -EFAULT;
2571 }
2572
2573 page = pgv_to_page(data);
2574 data += len;
2575 flush_dcache_page(page);
2576 get_page(page);
2577 skb_fill_page_desc(skb, nr_frags, page, offset, len);
2578 to_write -= len;
2579 offset = 0;
2580 len_max = PAGE_SIZE;
2581 len = ((to_write > len_max) ? len_max : to_write);
2582 }
2583
2584 packet_parse_headers(skb, sock);
2585
2586 return tp_len;
2587}
2588
2589static int tpacket_parse_header(struct packet_sock *po, void *frame,
2590 int size_max, void **data)
2591{
2592 union tpacket_uhdr ph;
2593 int tp_len, off;
2594
2595 ph.raw = frame;
2596
2597 switch (po->tp_version) {
2598 case TPACKET_V3:
2599 if (ph.h3->tp_next_offset != 0) {
2600 pr_warn_once("variable sized slot not supported");
2601 return -EINVAL;
2602 }
2603 tp_len = ph.h3->tp_len;
2604 break;
2605 case TPACKET_V2:
2606 tp_len = ph.h2->tp_len;
2607 break;
2608 default:
2609 tp_len = ph.h1->tp_len;
2610 break;
2611 }
2612 if (unlikely(tp_len > size_max)) {
2613 pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
2614 return -EMSGSIZE;
2615 }
2616
2617 if (unlikely(po->tp_tx_has_off)) {
2618 int off_min, off_max;
2619
2620 off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2621 off_max = po->tx_ring.frame_size - tp_len;
2622 if (po->sk.sk_type == SOCK_DGRAM) {
2623 switch (po->tp_version) {
2624 case TPACKET_V3:
2625 off = ph.h3->tp_net;
2626 break;
2627 case TPACKET_V2:
2628 off = ph.h2->tp_net;
2629 break;
2630 default:
2631 off = ph.h1->tp_net;
2632 break;
2633 }
2634 } else {
2635 switch (po->tp_version) {
2636 case TPACKET_V3:
2637 off = ph.h3->tp_mac;
2638 break;
2639 case TPACKET_V2:
2640 off = ph.h2->tp_mac;
2641 break;
2642 default:
2643 off = ph.h1->tp_mac;
2644 break;
2645 }
2646 }
2647 if (unlikely((off < off_min) || (off_max < off)))
2648 return -EINVAL;
2649 } else {
2650 off = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2651 }
2652
2653 *data = frame + off;
2654 return tp_len;
2655}
2656
2657static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
2658{
2659 struct sk_buff *skb = NULL;
2660 struct net_device *dev;
2661 struct virtio_net_hdr *vnet_hdr = NULL;
2662 struct sockcm_cookie sockc;
2663 __be16 proto;
2664 int err, reserve = 0;
2665 void *ph;
2666 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
2667 bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
2668 unsigned char *addr = NULL;
2669 int tp_len, size_max;
2670 void *data;
2671 int len_sum = 0;
2672 int status = TP_STATUS_AVAILABLE;
2673 int hlen, tlen, copylen = 0;
2674 long timeo = 0;
2675
2676 mutex_lock(&po->pg_vec_lock);
2677
2678
2679
2680
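	/* packet_sendmsg() checked tx_ring.pg_vec locklessly; confirm it
	 * again here under protection of pg_vec_lock.
	 */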
2681 if (unlikely(!po->tx_ring.pg_vec)) {
2682 err = -EBUSY;
2683 goto out;
2684 }
2685 if (likely(saddr == NULL)) {
2686 dev = packet_cached_dev_get(po);
2687 proto = READ_ONCE(po->num);
2688 } else {
2689 err = -EINVAL;
2690 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2691 goto out;
2692 if (msg->msg_namelen < (saddr->sll_halen
2693 + offsetof(struct sockaddr_ll,
2694 sll_addr)))
2695 goto out;
2696 proto = saddr->sll_protocol;
2697 dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
2698 if (po->sk.sk_socket->type == SOCK_DGRAM) {
2699 if (dev && msg->msg_namelen < dev->addr_len +
2700 offsetof(struct sockaddr_ll, sll_addr))
2701 goto out_put;
2702 addr = saddr->sll_addr;
2703 }
2704 }
2705
2706 err = -ENXIO;
2707 if (unlikely(dev == NULL))
2708 goto out;
2709 err = -ENETDOWN;
2710 if (unlikely(!(dev->flags & IFF_UP)))
2711 goto out_put;
2712
2713 sockcm_init(&sockc, &po->sk);
2714 if (msg->msg_controllen) {
2715 err = sock_cmsg_send(&po->sk, msg, &sockc);
2716 if (unlikely(err))
2717 goto out_put;
2718 }
2719
2720 if (po->sk.sk_socket->type == SOCK_RAW)
2721 reserve = dev->hard_header_len;
2722 size_max = po->tx_ring.frame_size
2723 - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
2724
2725 if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !po->has_vnet_hdr)
2726 size_max = dev->mtu + reserve + VLAN_HLEN;
2727
2728 reinit_completion(&po->skb_completion);
2729
2730 do {
2731 ph = packet_current_frame(po, &po->tx_ring,
2732 TP_STATUS_SEND_REQUEST);
2733 if (unlikely(ph == NULL)) {
2734 if (need_wait && skb) {
2735 timeo = sock_sndtimeo(&po->sk, msg->msg_flags & MSG_DONTWAIT);
2736 timeo = wait_for_completion_interruptible_timeout(&po->skb_completion, timeo);
2737 if (timeo <= 0) {
2738 err = !timeo ? -ETIMEDOUT : -ERESTARTSYS;
2739 goto out_put;
2740 }
2741 }
2742
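			/* check for additional frames */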
2743 continue;
2744 }
2745
2746 skb = NULL;
2747 tp_len = tpacket_parse_header(po, ph, size_max, &data);
2748 if (tp_len < 0)
2749 goto tpacket_error;
2750
2751 status = TP_STATUS_SEND_REQUEST;
2752 hlen = LL_RESERVED_SPACE(dev);
2753 tlen = dev->needed_tailroom;
2754 if (po->has_vnet_hdr) {
2755 vnet_hdr = data;
2756 data += sizeof(*vnet_hdr);
2757 tp_len -= sizeof(*vnet_hdr);
2758 if (tp_len < 0 ||
2759 __packet_snd_vnet_parse(vnet_hdr, tp_len)) {
2760 tp_len = -EINVAL;
2761 goto tpacket_error;
2762 }
2763 copylen = __virtio16_to_cpu(vio_le(),
2764 vnet_hdr->hdr_len);
2765 }
2766 copylen = max_t(int, copylen, dev->hard_header_len);
2767 skb = sock_alloc_send_skb(&po->sk,
2768 hlen + tlen + sizeof(struct sockaddr_ll) +
2769 (copylen - dev->hard_header_len),
2770 !need_wait, &err);
2771
2772 if (unlikely(skb == NULL)) {
2773
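			/* Allocation failed: if some frames were already sent,
			 * report the partial length instead of the error.
			 */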
2774 if (likely(len_sum > 0))
2775 err = len_sum;
2776 goto out_status;
2777 }
2778 tp_len = tpacket_fill_skb(po, skb, ph, dev, data, tp_len, proto,
2779 addr, hlen, copylen, &sockc);
2780 if (likely(tp_len >= 0) &&
2781 tp_len > dev->mtu + reserve &&
2782 !po->has_vnet_hdr &&
2783 !packet_extra_vlan_len_allowed(dev, skb))
2784 tp_len = -EMSGSIZE;
2785
2786 if (unlikely(tp_len < 0)) {
2787tpacket_error:
2788 if (po->tp_loss) {
2789 __packet_set_status(po, ph,
2790 TP_STATUS_AVAILABLE);
2791 packet_increment_head(&po->tx_ring);
2792 kfree_skb(skb);
2793 continue;
2794 } else {
2795 status = TP_STATUS_WRONG_FORMAT;
2796 err = tp_len;
2797 goto out_status;
2798 }
2799 }
2800
2801 if (po->has_vnet_hdr) {
2802 if (virtio_net_hdr_to_skb(skb, vnet_hdr, vio_le())) {
2803 tp_len = -EINVAL;
2804 goto tpacket_error;
2805 }
2806 virtio_net_hdr_set_proto(skb, vnet_hdr);
2807 }
2808
2809 skb->destructor = tpacket_destruct_skb;
2810 __packet_set_status(po, ph, TP_STATUS_SENDING);
2811 packet_inc_pending(&po->tx_ring);
2812
2813 status = TP_STATUS_SEND_REQUEST;
2814 err = po->xmit(skb);
2815 if (unlikely(err > 0)) {
2816 err = net_xmit_errno(err);
2817 if (err && __packet_get_status(po, ph) ==
2818 TP_STATUS_AVAILABLE) {
2819
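				/* skb was destructed already */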
2820 skb = NULL;
2821 goto out_status;
2822 }
2823
2824
2825
2826
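			/* skb was dropped but not destructed yet;
			 * treat it like congestion or err < 0
			 */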
2827 err = 0;
2828 }
2829 packet_increment_head(&po->tx_ring);
2830 len_sum += tp_len;
2831 } while (likely((ph != NULL) ||
2832
2833
2834
2835
2836
2837
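		/* Note: packet_read_pending() might be slow if we have
		 * to call it as it's per_cpu variable, but in fast-path
		 * we already short-circuit the loop with the first
		 * condition anyway.
		 */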
2838 (need_wait && packet_read_pending(&po->tx_ring))));
2839
2840 err = len_sum;
2841 goto out_put;
2842
2843out_status:
2844 __packet_set_status(po, ph, status);
2845 kfree_skb(skb);
2846out_put:
2847 dev_put(dev);
2848out:
2849 mutex_unlock(&po->pg_vec_lock);
2850 return err;
2851}
2852
2853static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
2854 size_t reserve, size_t len,
2855 size_t linear, int noblock,
2856 int *err)
2857{
2858 struct sk_buff *skb;
2859
2860
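	/* Under a page? Don't bother with paged skb. */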
2861 if (prepad + len < PAGE_SIZE || !linear)
2862 linear = len;
2863
2864 skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
2865 err, 0);
2866 if (!skb)
2867 return NULL;
2868
2869 skb_reserve(skb, reserve);
2870 skb_put(skb, linear);
2871 skb->data_len = len - linear;
2872 skb->len += len - linear;
2873
2874 return skb;
2875}
2876
2877static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
2878{
2879 struct sock *sk = sock->sk;
2880 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
2881 struct sk_buff *skb;
2882 struct net_device *dev;
2883 __be16 proto;
2884 unsigned char *addr = NULL;
2885 int err, reserve = 0;
2886 struct sockcm_cookie sockc;
2887 struct virtio_net_hdr vnet_hdr = { 0 };
2888 int offset = 0;
2889 struct packet_sock *po = pkt_sk(sk);
2890 bool has_vnet_hdr = false;
2891 int hlen, tlen, linear;
2892 int extra_len = 0;
2893
2894
2895
2896
2897
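	/*
	 *	Get and verify the address.
	 */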
2898 if (likely(saddr == NULL)) {
2899 dev = packet_cached_dev_get(po);
2900 proto = READ_ONCE(po->num);
2901 } else {
2902 err = -EINVAL;
2903 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2904 goto out;
2905 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
2906 goto out;
2907 proto = saddr->sll_protocol;
2908 dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
2909 if (sock->type == SOCK_DGRAM) {
2910 if (dev && msg->msg_namelen < dev->addr_len +
2911 offsetof(struct sockaddr_ll, sll_addr))
2912 goto out_unlock;
2913 addr = saddr->sll_addr;
2914 }
2915 }
2916
2917 err = -ENXIO;
2918 if (unlikely(dev == NULL))
2919 goto out_unlock;
2920 err = -ENETDOWN;
2921 if (unlikely(!(dev->flags & IFF_UP)))
2922 goto out_unlock;
2923
2924 sockcm_init(&sockc, sk);
2925 sockc.mark = sk->sk_mark;
2926 if (msg->msg_controllen) {
2927 err = sock_cmsg_send(sk, msg, &sockc);
2928 if (unlikely(err))
2929 goto out_unlock;
2930 }
2931
2932 if (sock->type == SOCK_RAW)
2933 reserve = dev->hard_header_len;
2934 if (po->has_vnet_hdr) {
2935 err = packet_snd_vnet_parse(msg, &len, &vnet_hdr);
2936 if (err)
2937 goto out_unlock;
2938 has_vnet_hdr = true;
2939 }
2940
2941 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
2942 if (!netif_supports_nofcs(dev)) {
2943 err = -EPROTONOSUPPORT;
2944 goto out_unlock;
2945 }
2946 extra_len = 4;
2947 }
2948
2949 err = -EMSGSIZE;
2950 if (!vnet_hdr.gso_type &&
2951 (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
2952 goto out_unlock;
2953
2954 err = -ENOBUFS;
2955 hlen = LL_RESERVED_SPACE(dev);
2956 tlen = dev->needed_tailroom;
2957 linear = __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len);
2958 linear = max(linear, min_t(int, len, dev->hard_header_len));
2959 skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, linear,
2960 msg->msg_flags & MSG_DONTWAIT, &err);
2961 if (skb == NULL)
2962 goto out_unlock;
2963
2964 skb_reset_network_header(skb);
2965
2966 err = -EINVAL;
2967 if (sock->type == SOCK_DGRAM) {
2968 offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len);
2969 if (unlikely(offset < 0))
2970 goto out_free;
2971 } else if (reserve) {
2972 skb_reserve(skb, -reserve);
2973 if (len < reserve + sizeof(struct ipv6hdr) &&
2974 dev->min_header_len != dev->hard_header_len)
2975 skb_reset_network_header(skb);
2976 }
2977
2978
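	/* Returns -EFAULT on error */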
2979 err = skb_copy_datagram_from_iter(skb, offset, &msg->msg_iter, len);
2980 if (err)
2981 goto out_free;
2982
2983 if (sock->type == SOCK_RAW &&
2984 !dev_validate_header(dev, skb->data, len)) {
2985 err = -EINVAL;
2986 goto out_free;
2987 }
2988
2989 skb_setup_tx_timestamp(skb, sockc.tsflags);
2990
2991 if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + extra_len) &&
2992 !packet_extra_vlan_len_allowed(dev, skb)) {
2993 err = -EMSGSIZE;
2994 goto out_free;
2995 }
2996
2997 skb->protocol = proto;
2998 skb->dev = dev;
2999 skb->priority = sk->sk_priority;
3000 skb->mark = sockc.mark;
3001 skb->tstamp = sockc.transmit_time;
3002
3003 if (has_vnet_hdr) {
3004 err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le());
3005 if (err)
3006 goto out_free;
3007 len += sizeof(vnet_hdr);
3008 virtio_net_hdr_set_proto(skb, &vnet_hdr);
3009 }
3010
3011 packet_parse_headers(skb, sock);
3012
3013 if (unlikely(extra_len == 4))
3014 skb->no_fcs = 1;
3015
3016 err = po->xmit(skb);
3017 if (err > 0 && (err = net_xmit_errno(err)) != 0)
3018 goto out_unlock;
3019
3020 dev_put(dev);
3021
3022 return len;
3023
3024out_free:
3025 kfree_skb(skb);
3026out_unlock:
3027 if (dev)
3028 dev_put(dev);
3029out:
3030 return err;
3031}
3032
3033static int packet_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
3034{
3035 struct sock *sk = sock->sk;
3036 struct packet_sock *po = pkt_sk(sk);
3037
3038
3039
3040
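	/* Reading tx_ring.pg_vec without holding pg_vec_lock is racy;
	 * tpacket_snd() redoes the check under the lock.
	 */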
3041 if (data_race(po->tx_ring.pg_vec))
3042 return tpacket_snd(po, msg);
3043
3044 return packet_snd(sock, msg, len);
3045}
3046
3047
3048
3049
3050
3051
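/*
 *	Close a PACKET socket. This is fairly simple. We immediately go
 *	to 'closed' state and remove our protocol entry in the device list.
 */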
3052static int packet_release(struct socket *sock)
3053{
3054 struct sock *sk = sock->sk;
3055 struct packet_sock *po;
3056 struct packet_fanout *f;
3057 struct net *net;
3058 union tpacket_req_u req_u;
3059
3060 if (!sk)
3061 return 0;
3062
3063 net = sock_net(sk);
3064 po = pkt_sk(sk);
3065
3066 mutex_lock(&net->packet.sklist_lock);
3067 sk_del_node_init_rcu(sk);
3068 mutex_unlock(&net->packet.sklist_lock);
3069
3070 preempt_disable();
3071 sock_prot_inuse_add(net, sk->sk_prot, -1);
3072 preempt_enable();
3073
3074 spin_lock(&po->bind_lock);
3075 unregister_prot_hook(sk, false);
3076 packet_cached_dev_reset(po);
3077
3078 if (po->prot_hook.dev) {
3079 dev_put(po->prot_hook.dev);
3080 po->prot_hook.dev = NULL;
3081 }
3082 spin_unlock(&po->bind_lock);
3083
3084 packet_flush_mclist(sk);
3085
3086 lock_sock(sk);
3087 if (po->rx_ring.pg_vec) {
3088 memset(&req_u, 0, sizeof(req_u));
3089 packet_set_ring(sk, &req_u, 1, 0);
3090 }
3091
3092 if (po->tx_ring.pg_vec) {
3093 memset(&req_u, 0, sizeof(req_u));
3094 packet_set_ring(sk, &req_u, 1, 1);
3095 }
3096 release_sock(sk);
3097
3098 f = fanout_release(sk);
3099
3100 synchronize_net();
3101
3102 kfree(po->rollover);
3103 if (f) {
3104 fanout_release_data(f);
3105 kvfree(f);
3106 }
3107
3108
3109
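	/*
	 *	Now the socket is dead. No more input will appear.
	 */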
3110 sock_orphan(sk);
3111 sock->sk = NULL;
3112
3113
3114
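	/* Purge queues */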
3115 skb_queue_purge(&sk->sk_receive_queue);
3116 packet_free_pending(po);
3117 sk_refcnt_debug_release(sk);
3118
3119 sock_put(sk);
3120 return 0;
3121}
3122
3123
3124
3125
3126
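/*
 *	Attach a packet hook.
 */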
3127static int packet_do_bind(struct sock *sk, const char *name, int ifindex,
3128 __be16 proto)
3129{
3130 struct packet_sock *po = pkt_sk(sk);
3131 struct net_device *dev_curr;
3132 __be16 proto_curr;
3133 bool need_rehook;
3134 struct net_device *dev = NULL;
3135 int ret = 0;
3136 bool unlisted = false;
3137
3138 lock_sock(sk);
3139 spin_lock(&po->bind_lock);
3140 rcu_read_lock();
3141
3142 if (po->fanout) {
3143 ret = -EINVAL;
3144 goto out_unlock;
3145 }
3146
3147 if (name) {
3148 dev = dev_get_by_name_rcu(sock_net(sk), name);
3149 if (!dev) {
3150 ret = -ENODEV;
3151 goto out_unlock;
3152 }
3153 } else if (ifindex) {
3154 dev = dev_get_by_index_rcu(sock_net(sk), ifindex);
3155 if (!dev) {
3156 ret = -ENODEV;
3157 goto out_unlock;
3158 }
3159 }
3160
3161 if (dev)
3162 dev_hold(dev);
3163
3164 proto_curr = po->prot_hook.type;
3165 dev_curr = po->prot_hook.dev;
3166
3167 need_rehook = proto_curr != proto || dev_curr != dev;
3168
3169 if (need_rehook) {
3170 if (po->running) {
3171 rcu_read_unlock();
3172
3173
3174
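			/* prevents packet_notifier() from calling
			 * register_prot_hook()
			 */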
3175 WRITE_ONCE(po->num, 0);
3176 __unregister_prot_hook(sk, true);
3177 rcu_read_lock();
3178 dev_curr = po->prot_hook.dev;
3179 if (dev)
3180 unlisted = !dev_get_by_index_rcu(sock_net(sk),
3181 dev->ifindex);
3182 }
3183
3184 BUG_ON(po->running);
3185 WRITE_ONCE(po->num, proto);
3186 po->prot_hook.type = proto;
3187
3188 if (unlikely(unlisted)) {
3189 dev_put(dev);
3190 po->prot_hook.dev = NULL;
3191 WRITE_ONCE(po->ifindex, -1);
3192 packet_cached_dev_reset(po);
3193 } else {
3194 po->prot_hook.dev = dev;
3195 WRITE_ONCE(po->ifindex, dev ? dev->ifindex : 0);
3196 packet_cached_dev_assign(po, dev);
3197 }
3198 }
3199 if (dev_curr)
3200 dev_put(dev_curr);
3201
3202 if (proto == 0 || !need_rehook)
3203 goto out_unlock;
3204
3205 if (!unlisted && (!dev || (dev->flags & IFF_UP))) {
3206 register_prot_hook(sk);
3207 } else {
3208 sk->sk_err = ENETDOWN;
3209 if (!sock_flag(sk, SOCK_DEAD))
3210 sk_error_report(sk);
3211 }
3212
3213out_unlock:
3214 rcu_read_unlock();
3215 spin_unlock(&po->bind_lock);
3216 release_sock(sk);
3217 return ret;
3218}
3219
3220
3221
3222
3223
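/*
 *	Bind a packet socket to a device.
 */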
3224static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
3225 int addr_len)
3226{
3227 struct sock *sk = sock->sk;
3228 char name[sizeof(uaddr->sa_data) + 1];
3229
3230
3231
3232
3233
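	/*
	 *	Check legality
	 */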
3234 if (addr_len != sizeof(struct sockaddr))
3235 return -EINVAL;
3236
3237
3238
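	/* uaddr->sa_data comes from user space, it's not guaranteed to be
	 * zero-terminated.
	 */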
3239 memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data));
3240 name[sizeof(uaddr->sa_data)] = 0;
3241
3242 return packet_do_bind(sk, name, 0, pkt_sk(sk)->num);
3243}
3244
3245static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
3246{
3247 struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
3248 struct sock *sk = sock->sk;
3249
3250
3251
3252
3253
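	/*
	 *	Check legality
	 */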
3254 if (addr_len < sizeof(struct sockaddr_ll))
3255 return -EINVAL;
3256 if (sll->sll_family != AF_PACKET)
3257 return -EINVAL;
3258
3259 return packet_do_bind(sk, NULL, sll->sll_ifindex,
3260 sll->sll_protocol ? : pkt_sk(sk)->num);
3261}
3262
3263static struct proto packet_proto = {
3264 .name = "PACKET",
3265 .owner = THIS_MODULE,
3266 .obj_size = sizeof(struct packet_sock),
3267};
3268
3269
3270
3271
3272
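/*
 *	Create a packet socket.
 */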
3273static int packet_create(struct net *net, struct socket *sock, int protocol,
3274 int kern)
3275{
3276 struct sock *sk;
3277 struct packet_sock *po;
3278 __be16 proto = (__force __be16)protocol;
3279 int err;
3280
3281 if (!ns_capable(net->user_ns, CAP_NET_RAW))
3282 return -EPERM;
3283 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
3284 sock->type != SOCK_PACKET)
3285 return -ESOCKTNOSUPPORT;
3286
3287 sock->state = SS_UNCONNECTED;
3288
3289 err = -ENOBUFS;
3290 sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern);
3291 if (sk == NULL)
3292 goto out;
3293
3294 sock->ops = &packet_ops;
3295 if (sock->type == SOCK_PACKET)
3296 sock->ops = &packet_ops_spkt;
3297
3298 sock_init_data(sock, sk);
3299
3300 po = pkt_sk(sk);
3301 init_completion(&po->skb_completion);
3302 sk->sk_family = PF_PACKET;
3303 po->num = proto;
3304 po->xmit = dev_queue_xmit;
3305
3306 err = packet_alloc_pending(po);
3307 if (err)
3308 goto out2;
3309
3310 packet_cached_dev_reset(po);
3311
3312 sk->sk_destruct = packet_sock_destruct;
3313 sk_refcnt_debug_inc(sk);
3314
3315
3316
3317
3318
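	/*
	 *	Attach a protocol block
	 */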
3319 spin_lock_init(&po->bind_lock);
3320 mutex_init(&po->pg_vec_lock);
3321 po->rollover = NULL;
3322 po->prot_hook.func = packet_rcv;
3323
3324 if (sock->type == SOCK_PACKET)
3325 po->prot_hook.func = packet_rcv_spkt;
3326
3327 po->prot_hook.af_packet_priv = sk;
3328
3329 if (proto) {
3330 po->prot_hook.type = proto;
3331 __register_prot_hook(sk);
3332 }
3333
3334 mutex_lock(&net->packet.sklist_lock);
3335 sk_add_node_tail_rcu(sk, &net->packet.sklist);
3336 mutex_unlock(&net->packet.sklist_lock);
3337
3338 preempt_disable();
3339 sock_prot_inuse_add(net, &packet_proto, 1);
3340 preempt_enable();
3341
3342 return 0;
3343out2:
3344 sk_free(sk);
3345out:
3346 return err;
3347}
3348
3349
3350
3351
3352
3353
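/*
 *	Pull a packet from our receive queue and hand it to the user.
 *	If necessary we block.
 */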
3354static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
3355 int flags)
3356{
3357 struct sock *sk = sock->sk;
3358 struct sk_buff *skb;
3359 int copied, err;
3360 int vnet_hdr_len = 0;
3361 unsigned int origlen = 0;
3362
3363 err = -EINVAL;
3364 if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
3365 goto out;
3366
3367#if 0
3368
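	/* What error should we return now? EUNATTACH? */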
3369 if (pkt_sk(sk)->ifindex < 0)
3370 return -ENODEV;
3371#endif
3372
3373 if (flags & MSG_ERRQUEUE) {
3374 err = sock_recv_errqueue(sk, msg, len,
3375 SOL_PACKET, PACKET_TX_TIMESTAMP);
3376 goto out;
3377 }
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
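	/*
	 *	Call the generic datagram receiver. This handles all sorts
	 *	of horrible races and re-entrancy so we can forget about it
	 *	in the protocol layers.
	 *
	 *	Now it will return ENETDOWN, if the device has just gone down,
	 *	but then it will block.
	 */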
3388 skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
3389
3390
3391
3392
3393
3394
3395
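	/*
	 *	An error occurred so return it. Because skb_recv_datagram()
	 *	handles the blocking we don't see and worry about blocking
	 *	retries.
	 */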
3396 if (skb == NULL)
3397 goto out;
3398
3399 packet_rcv_try_clear_pressure(pkt_sk(sk));
3400
3401 if (pkt_sk(sk)->has_vnet_hdr) {
3402 err = packet_rcv_vnet(msg, skb, &len);
3403 if (err)
3404 goto out_free;
3405 vnet_hdr_len = sizeof(struct virtio_net_hdr);
3406 }
3407
3408
3409
3410
3411
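	/* You lose any data beyond the buffer you gave. If it worries
	 * a user program they can ask the device for its MTU anyway.
	 */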
3412 copied = skb->len;
3413 if (copied > len) {
3414 copied = len;
3415 msg->msg_flags |= MSG_TRUNC;
3416 }
3417
3418 err = skb_copy_datagram_msg(skb, 0, msg, copied);
3419 if (err)
3420 goto out_free;
3421
3422 if (sock->type != SOCK_PACKET) {
3423 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
3424
3425
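		/* Original length was stored in sockaddr_ll fields */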
3426 origlen = PACKET_SKB_CB(skb)->sa.origlen;
3427 sll->sll_family = AF_PACKET;
3428 sll->sll_protocol = skb->protocol;
3429 }
3430
3431 sock_recv_ts_and_drops(msg, sk, skb);
3432
3433 if (msg->msg_name) {
3434 int copy_len;
3435
3436
3437
3438
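		/* If the address length field is there to be filled
		 * in, we fill it in now.
		 */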
3439 if (sock->type == SOCK_PACKET) {
3440 __sockaddr_check_size(sizeof(struct sockaddr_pkt));
3441 msg->msg_namelen = sizeof(struct sockaddr_pkt);
3442 copy_len = msg->msg_namelen;
3443 } else {
3444 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
3445
3446 msg->msg_namelen = sll->sll_halen +
3447 offsetof(struct sockaddr_ll, sll_addr);
3448 copy_len = msg->msg_namelen;
3449 if (msg->msg_namelen < sizeof(struct sockaddr_ll)) {
3450 memset(msg->msg_name +
3451 offsetof(struct sockaddr_ll, sll_addr),
3452 0, sizeof(sll->sll_addr));
3453 msg->msg_namelen = sizeof(struct sockaddr_ll);
3454 }
3455 }
3456 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa, copy_len);
3457 }
3458
3459 if (pkt_sk(sk)->auxdata) {
3460 struct tpacket_auxdata aux;
3461
3462 aux.tp_status = TP_STATUS_USER;
3463 if (skb->ip_summed == CHECKSUM_PARTIAL)
3464 aux.tp_status |= TP_STATUS_CSUMNOTREADY;
3465 else if (skb->pkt_type != PACKET_OUTGOING &&
3466 (skb->ip_summed == CHECKSUM_COMPLETE ||
3467 skb_csum_unnecessary(skb)))
3468 aux.tp_status |= TP_STATUS_CSUM_VALID;
3469
3470 aux.tp_len = origlen;
3471 aux.tp_snaplen = skb->len;
3472 aux.tp_mac = 0;
3473 aux.tp_net = skb_network_offset(skb);
3474 if (skb_vlan_tag_present(skb)) {
3475 aux.tp_vlan_tci = skb_vlan_tag_get(skb);
3476 aux.tp_vlan_tpid = ntohs(skb->vlan_proto);
3477 aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
3478 } else {
3479 aux.tp_vlan_tci = 0;
3480 aux.tp_vlan_tpid = 0;
3481 }
3482 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
3483 }
3484
3485
3486
3487
3488
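	/*
	 *	Free or return the buffer as appropriate. Again this
	 *	hides all the races and re-entrancy issues from us.
	 */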
3489 err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
3490
3491out_free:
3492 skb_free_datagram(sk, skb);
3493out:
3494 return err;
3495}
3496
3497static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
3498 int peer)
3499{
3500 struct net_device *dev;
3501 struct sock *sk = sock->sk;
3502
3503 if (peer)
3504 return -EOPNOTSUPP;
3505
3506 uaddr->sa_family = AF_PACKET;
3507 memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));
3508 rcu_read_lock();
3509 dev = dev_get_by_index_rcu(sock_net(sk), READ_ONCE(pkt_sk(sk)->ifindex));
3510 if (dev)
3511 strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));
3512 rcu_read_unlock();
3513
3514 return sizeof(*uaddr);
3515}
3516
3517static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
3518 int peer)
3519{
3520 struct net_device *dev;
3521 struct sock *sk = sock->sk;
3522 struct packet_sock *po = pkt_sk(sk);
3523 DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
3524 int ifindex;
3525
3526 if (peer)
3527 return -EOPNOTSUPP;
3528
3529 ifindex = READ_ONCE(po->ifindex);
3530 sll->sll_family = AF_PACKET;
3531 sll->sll_ifindex = ifindex;
3532 sll->sll_protocol = READ_ONCE(po->num);
3533 sll->sll_pkttype = 0;
3534 rcu_read_lock();
3535 dev = dev_get_by_index_rcu(sock_net(sk), ifindex);
3536 if (dev) {
3537 sll->sll_hatype = dev->type;
3538 sll->sll_halen = dev->addr_len;
3539 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
3540 } else {
3541 sll->sll_hatype = 0;
3542 sll->sll_halen = 0;
3543 }
3544 rcu_read_unlock();
3545
3546 return offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
3547}
3548
3549static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
3550 int what)
3551{
3552 switch (i->type) {
3553 case PACKET_MR_MULTICAST:
3554 if (i->alen != dev->addr_len)
3555 return -EINVAL;
3556 if (what > 0)
3557 return dev_mc_add(dev, i->addr);
3558 else
3559 return dev_mc_del(dev, i->addr);
3560 break;
3561 case PACKET_MR_PROMISC:
3562 return dev_set_promiscuity(dev, what);
3563 case PACKET_MR_ALLMULTI:
3564 return dev_set_allmulti(dev, what);
3565 case PACKET_MR_UNICAST:
3566 if (i->alen != dev->addr_len)
3567 return -EINVAL;
3568 if (what > 0)
3569 return dev_uc_add(dev, i->addr);
3570 else
3571 return dev_uc_del(dev, i->addr);
3572 break;
3573 default:
3574 break;
3575 }
3576 return 0;
3577}
3578
3579static void packet_dev_mclist_delete(struct net_device *dev,
3580 struct packet_mclist **mlp)
3581{
3582 struct packet_mclist *ml;
3583
3584 while ((ml = *mlp) != NULL) {
3585 if (ml->ifindex == dev->ifindex) {
3586 packet_dev_mc(dev, ml, -1);
3587 *mlp = ml->next;
3588 kfree(ml);
3589 } else
3590 mlp = &ml->next;
3591 }
3592}
3593
3594static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
3595{
3596 struct packet_sock *po = pkt_sk(sk);
3597 struct packet_mclist *ml, *i;
3598 struct net_device *dev;
3599 int err;
3600
3601 rtnl_lock();
3602
3603 err = -ENODEV;
3604 dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
3605 if (!dev)
3606 goto done;
3607
3608 err = -EINVAL;
3609 if (mreq->mr_alen > dev->addr_len)
3610 goto done;
3611
3612 err = -ENOBUFS;
3613 i = kmalloc(sizeof(*i), GFP_KERNEL);
3614 if (i == NULL)
3615 goto done;
3616
3617 err = 0;
3618 for (ml = po->mclist; ml; ml = ml->next) {
3619 if (ml->ifindex == mreq->mr_ifindex &&
3620 ml->type == mreq->mr_type &&
3621 ml->alen == mreq->mr_alen &&
3622 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3623 ml->count++;
3624
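				/* Entry already exists; free the unused new element. */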
3625 kfree(i);
3626 goto done;
3627 }
3628 }
3629
3630 i->type = mreq->mr_type;
3631 i->ifindex = mreq->mr_ifindex;
3632 i->alen = mreq->mr_alen;
3633 memcpy(i->addr, mreq->mr_address, i->alen);
3634 memset(i->addr + i->alen, 0, sizeof(i->addr) - i->alen);
3635 i->count = 1;
3636 i->next = po->mclist;
3637 po->mclist = i;
3638 err = packet_dev_mc(dev, i, 1);
3639 if (err) {
3640 po->mclist = i->next;
3641 kfree(i);
3642 }
3643
3644done:
3645 rtnl_unlock();
3646 return err;
3647}
3648
3649static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
3650{
3651 struct packet_mclist *ml, **mlp;
3652
3653 rtnl_lock();
3654
3655 for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
3656 if (ml->ifindex == mreq->mr_ifindex &&
3657 ml->type == mreq->mr_type &&
3658 ml->alen == mreq->mr_alen &&
3659 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3660 if (--ml->count == 0) {
3661 struct net_device *dev;
3662 *mlp = ml->next;
3663 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3664 if (dev)
3665 packet_dev_mc(dev, ml, -1);
3666 kfree(ml);
3667 }
3668 break;
3669 }
3670 }
3671 rtnl_unlock();
3672 return 0;
3673}
3674
3675static void packet_flush_mclist(struct sock *sk)
3676{
3677 struct packet_sock *po = pkt_sk(sk);
3678 struct packet_mclist *ml;
3679
3680 if (!po->mclist)
3681 return;
3682
3683 rtnl_lock();
3684 while ((ml = po->mclist) != NULL) {
3685 struct net_device *dev;
3686
3687 po->mclist = ml->next;
3688 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3689 if (dev != NULL)
3690 packet_dev_mc(dev, ml, -1);
3691 kfree(ml);
3692 }
3693 rtnl_unlock();
3694}
3695
3696static int
3697packet_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval,
3698 unsigned int optlen)
3699{
3700 struct sock *sk = sock->sk;
3701 struct packet_sock *po = pkt_sk(sk);
3702 int ret;
3703
3704 if (level != SOL_PACKET)
3705 return -ENOPROTOOPT;
3706
3707 switch (optname) {
3708 case PACKET_ADD_MEMBERSHIP:
3709 case PACKET_DROP_MEMBERSHIP:
3710 {
3711 struct packet_mreq_max mreq;
3712 int len = optlen;
3713 memset(&mreq, 0, sizeof(mreq));
3714 if (len < sizeof(struct packet_mreq))
3715 return -EINVAL;
3716 if (len > sizeof(mreq))
3717 len = sizeof(mreq);
3718 if (copy_from_sockptr(&mreq, optval, len))
3719 return -EFAULT;
3720 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
3721 return -EINVAL;
3722 if (optname == PACKET_ADD_MEMBERSHIP)
3723 ret = packet_mc_add(sk, &mreq);
3724 else
3725 ret = packet_mc_drop(sk, &mreq);
3726 return ret;
3727 }
3728
3729 case PACKET_RX_RING:
3730 case PACKET_TX_RING:
3731 {
3732 union tpacket_req_u req_u;
3733 int len;
3734
3735 lock_sock(sk);
3736 switch (po->tp_version) {
3737 case TPACKET_V1:
3738 case TPACKET_V2:
3739 len = sizeof(req_u.req);
3740 break;
3741 case TPACKET_V3:
3742 default:
3743 len = sizeof(req_u.req3);
3744 break;
3745 }
3746 if (optlen < len) {
3747 ret = -EINVAL;
3748 } else {
3749 if (copy_from_sockptr(&req_u.req, optval, len))
3750 ret = -EFAULT;
3751 else
3752 ret = packet_set_ring(sk, &req_u, 0,
3753 optname == PACKET_TX_RING);
3754 }
3755 release_sock(sk);
3756 return ret;
3757 }
3758 case PACKET_COPY_THRESH:
3759 {
3760 int val;
3761
3762 if (optlen != sizeof(val))
3763 return -EINVAL;
3764 if (copy_from_sockptr(&val, optval, sizeof(val)))
3765 return -EFAULT;
3766
3767 pkt_sk(sk)->copy_thresh = val;
3768 return 0;
3769 }
3770 case PACKET_VERSION:
3771 {
3772 int val;
3773
3774 if (optlen != sizeof(val))
3775 return -EINVAL;
3776 if (copy_from_sockptr(&val, optval, sizeof(val)))
3777 return -EFAULT;
3778 switch (val) {
3779 case TPACKET_V1:
3780 case TPACKET_V2:
3781 case TPACKET_V3:
3782 break;
3783 default:
3784 return -EINVAL;
3785 }
3786 lock_sock(sk);
3787 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3788 ret = -EBUSY;
3789 } else {
3790 po->tp_version = val;
3791 ret = 0;
3792 }
3793 release_sock(sk);
3794 return ret;
3795 }
3796 case PACKET_RESERVE:
3797 {
3798 unsigned int val;
3799
3800 if (optlen != sizeof(val))
3801 return -EINVAL;
3802 if (copy_from_sockptr(&val, optval, sizeof(val)))
3803 return -EFAULT;
3804 if (val > INT_MAX)
3805 return -EINVAL;
3806 lock_sock(sk);
3807 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3808 ret = -EBUSY;
3809 } else {
3810 po->tp_reserve = val;
3811 ret = 0;
3812 }
3813 release_sock(sk);
3814 return ret;
3815 }
3816 case PACKET_LOSS:
3817 {
3818 unsigned int val;
3819
3820 if (optlen != sizeof(val))
3821 return -EINVAL;
3822 if (copy_from_sockptr(&val, optval, sizeof(val)))
3823 return -EFAULT;
3824
3825 lock_sock(sk);
3826 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3827 ret = -EBUSY;
3828 } else {
3829 po->tp_loss = !!val;
3830 ret = 0;
3831 }
3832 release_sock(sk);
3833 return ret;
3834 }
3835 case PACKET_AUXDATA:
3836 {
3837 int val;
3838
3839 if (optlen < sizeof(val))
3840 return -EINVAL;
3841 if (copy_from_sockptr(&val, optval, sizeof(val)))
3842 return -EFAULT;
3843
3844 lock_sock(sk);
3845 po->auxdata = !!val;
3846 release_sock(sk);
3847 return 0;
3848 }
3849 case PACKET_ORIGDEV:
3850 {
3851 int val;
3852
3853 if (optlen < sizeof(val))
3854 return -EINVAL;
3855 if (copy_from_sockptr(&val, optval, sizeof(val)))
3856 return -EFAULT;
3857
3858 lock_sock(sk);
3859 po->origdev = !!val;
3860 release_sock(sk);
3861 return 0;
3862 }
3863 case PACKET_VNET_HDR:
3864 {
3865 int val;
3866
3867 if (sock->type != SOCK_RAW)
3868 return -EINVAL;
3869 if (optlen < sizeof(val))
3870 return -EINVAL;
3871 if (copy_from_sockptr(&val, optval, sizeof(val)))
3872 return -EFAULT;
3873
3874 lock_sock(sk);
3875 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3876 ret = -EBUSY;
3877 } else {
3878 po->has_vnet_hdr = !!val;
3879 ret = 0;
3880 }
3881 release_sock(sk);
3882 return ret;
3883 }
3884 case PACKET_TIMESTAMP:
3885 {
3886 int val;
3887
3888 if (optlen != sizeof(val))
3889 return -EINVAL;
3890 if (copy_from_sockptr(&val, optval, sizeof(val)))
3891 return -EFAULT;
3892
3893 po->tp_tstamp = val;
3894 return 0;
3895 }
3896 case PACKET_FANOUT:
3897 {
3898 struct fanout_args args = { 0 };
3899
3900 if (optlen != sizeof(int) && optlen != sizeof(args))
3901 return -EINVAL;
3902 if (copy_from_sockptr(&args, optval, optlen))
3903 return -EFAULT;
3904
3905 return fanout_add(sk, &args);
3906 }
3907 case PACKET_FANOUT_DATA:
3908 {
3909 if (!po->fanout)
3910 return -EINVAL;
3911
3912 return fanout_set_data(po, optval, optlen);
3913 }
3914 case PACKET_IGNORE_OUTGOING:
3915 {
3916 int val;
3917
3918 if (optlen != sizeof(val))
3919 return -EINVAL;
3920 if (copy_from_sockptr(&val, optval, sizeof(val)))
3921 return -EFAULT;
3922 if (val < 0 || val > 1)
3923 return -EINVAL;
3924
3925 po->prot_hook.ignore_outgoing = !!val;
3926 return 0;
3927 }
3928 case PACKET_TX_HAS_OFF:
3929 {
3930 unsigned int val;
3931
3932 if (optlen != sizeof(val))
3933 return -EINVAL;
3934 if (copy_from_sockptr(&val, optval, sizeof(val)))
3935 return -EFAULT;
3936
3937 lock_sock(sk);
3938 if (!po->rx_ring.pg_vec && !po->tx_ring.pg_vec)
3939 po->tp_tx_has_off = !!val;
3940
3941 release_sock(sk);
3942 return 0;
3943 }
3944 case PACKET_QDISC_BYPASS:
3945 {
3946 int val;
3947
3948 if (optlen != sizeof(val))
3949 return -EINVAL;
3950 if (copy_from_sockptr(&val, optval, sizeof(val)))
3951 return -EFAULT;
3952
3953 po->xmit = val ? packet_direct_xmit : dev_queue_xmit;
3954 return 0;
3955 }
3956 default:
3957 return -ENOPROTOOPT;
3958 }
3959}
3960
3961static int packet_getsockopt(struct socket *sock, int level, int optname,
3962 char __user *optval, int __user *optlen)
3963{
3964 int len;
3965 int val, lv = sizeof(val);
3966 struct sock *sk = sock->sk;
3967 struct packet_sock *po = pkt_sk(sk);
3968 void *data = &val;
3969 union tpacket_stats_u st;
3970 struct tpacket_rollover_stats rstats;
3971 int drops;
3972
3973 if (level != SOL_PACKET)
3974 return -ENOPROTOOPT;
3975
3976 if (get_user(len, optlen))
3977 return -EFAULT;
3978
3979 if (len < 0)
3980 return -EINVAL;
3981
3982 switch (optname) {
3983 case PACKET_STATISTICS:
3984 spin_lock_bh(&sk->sk_receive_queue.lock);
3985 memcpy(&st, &po->stats, sizeof(st));
3986 memset(&po->stats, 0, sizeof(po->stats));
3987 spin_unlock_bh(&sk->sk_receive_queue.lock);
3988 drops = atomic_xchg(&po->tp_drops, 0);
3989
3990 if (po->tp_version == TPACKET_V3) {
3991 lv = sizeof(struct tpacket_stats_v3);
3992 st.stats3.tp_drops = drops;
3993 st.stats3.tp_packets += drops;
3994 data = &st.stats3;
3995 } else {
3996 lv = sizeof(struct tpacket_stats);
3997 st.stats1.tp_drops = drops;
3998 st.stats1.tp_packets += drops;
3999 data = &st.stats1;
4000 }
4001
4002 break;
4003 case PACKET_AUXDATA:
4004 val = po->auxdata;
4005 break;
4006 case PACKET_ORIGDEV:
4007 val = po->origdev;
4008 break;
4009 case PACKET_VNET_HDR:
4010 val = po->has_vnet_hdr;
4011 break;
4012 case PACKET_VERSION:
4013 val = po->tp_version;
4014 break;
4015 case PACKET_HDRLEN:
4016 if (len > sizeof(int))
4017 len = sizeof(int);
4018 if (len < sizeof(int))
4019 return -EINVAL;
4020 if (copy_from_user(&val, optval, len))
4021 return -EFAULT;
4022 switch (val) {
4023 case TPACKET_V1:
4024 val = sizeof(struct tpacket_hdr);
4025 break;
4026 case TPACKET_V2:
4027 val = sizeof(struct tpacket2_hdr);
4028 break;
4029 case TPACKET_V3:
4030 val = sizeof(struct tpacket3_hdr);
4031 break;
4032 default:
4033 return -EINVAL;
4034 }
4035 break;
4036 case PACKET_RESERVE:
4037 val = po->tp_reserve;
4038 break;
4039 case PACKET_LOSS:
4040 val = po->tp_loss;
4041 break;
4042 case PACKET_TIMESTAMP:
4043 val = po->tp_tstamp;
4044 break;
4045 case PACKET_FANOUT:
4046 val = (po->fanout ?
4047 ((u32)po->fanout->id |
4048 ((u32)po->fanout->type << 16) |
4049 ((u32)po->fanout->flags << 24)) :
4050 0);
4051 break;
4052 case PACKET_IGNORE_OUTGOING:
4053 val = po->prot_hook.ignore_outgoing;
4054 break;
4055 case PACKET_ROLLOVER_STATS:
4056 if (!po->rollover)
4057 return -EINVAL;
4058 rstats.tp_all = atomic_long_read(&po->rollover->num);
4059 rstats.tp_huge = atomic_long_read(&po->rollover->num_huge);
4060 rstats.tp_failed = atomic_long_read(&po->rollover->num_failed);
4061 data = &rstats;
4062 lv = sizeof(rstats);
4063 break;
4064 case PACKET_TX_HAS_OFF:
4065 val = po->tp_tx_has_off;
4066 break;
4067 case PACKET_QDISC_BYPASS:
4068 val = packet_use_direct_xmit(po);
4069 break;
4070 default:
4071 return -ENOPROTOOPT;
4072 }
4073
4074 if (len > lv)
4075 len = lv;
4076 if (put_user(len, optlen))
4077 return -EFAULT;
4078 if (copy_to_user(optval, data, len))
4079 return -EFAULT;
4080 return 0;
4081}
4082
4083static int packet_notifier(struct notifier_block *this,
4084 unsigned long msg, void *ptr)
4085{
4086 struct sock *sk;
4087 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
4088 struct net *net = dev_net(dev);
4089
4090 rcu_read_lock();
4091 sk_for_each_rcu(sk, &net->packet.sklist) {
4092 struct packet_sock *po = pkt_sk(sk);
4093
4094 switch (msg) {
4095 case NETDEV_UNREGISTER:
4096 if (po->mclist)
4097 packet_dev_mclist_delete(dev, &po->mclist);
4098 fallthrough;
4099
4100 case NETDEV_DOWN:
4101 if (dev->ifindex == po->ifindex) {
4102 spin_lock(&po->bind_lock);
4103 if (po->running) {
4104 __unregister_prot_hook(sk, false);
4105 sk->sk_err = ENETDOWN;
4106 if (!sock_flag(sk, SOCK_DEAD))
4107 sk_error_report(sk);
4108 }
4109 if (msg == NETDEV_UNREGISTER) {
4110 packet_cached_dev_reset(po);
4111 WRITE_ONCE(po->ifindex, -1);
4112 if (po->prot_hook.dev)
4113 dev_put(po->prot_hook.dev);
4114 po->prot_hook.dev = NULL;
4115 }
4116 spin_unlock(&po->bind_lock);
4117 }
4118 break;
4119 case NETDEV_UP:
4120 if (dev->ifindex == po->ifindex) {
4121 spin_lock(&po->bind_lock);
4122 if (po->num)
4123 register_prot_hook(sk);
4124 spin_unlock(&po->bind_lock);
4125 }
4126 break;
4127 }
4128 }
4129 rcu_read_unlock();
4130 return NOTIFY_DONE;
4131}
4132
4133
4134static int packet_ioctl(struct socket *sock, unsigned int cmd,
4135 unsigned long arg)
4136{
4137 struct sock *sk = sock->sk;
4138
4139 switch (cmd) {
4140 case SIOCOUTQ:
4141 {
4142 int amount = sk_wmem_alloc_get(sk);
4143
4144 return put_user(amount, (int __user *)arg);
4145 }
4146 case SIOCINQ:
4147 {
4148 struct sk_buff *skb;
4149 int amount = 0;
4150
4151 spin_lock_bh(&sk->sk_receive_queue.lock);
4152 skb = skb_peek(&sk->sk_receive_queue);
4153 if (skb)
4154 amount = skb->len;
4155 spin_unlock_bh(&sk->sk_receive_queue.lock);
4156 return put_user(amount, (int __user *)arg);
4157 }
4158#ifdef CONFIG_INET
4159 case SIOCADDRT:
4160 case SIOCDELRT:
4161 case SIOCDARP:
4162 case SIOCGARP:
4163 case SIOCSARP:
4164 case SIOCGIFADDR:
4165 case SIOCSIFADDR:
4166 case SIOCGIFBRDADDR:
4167 case SIOCSIFBRDADDR:
4168 case SIOCGIFNETMASK:
4169 case SIOCSIFNETMASK:
4170 case SIOCGIFDSTADDR:
4171 case SIOCSIFDSTADDR:
4172 case SIOCSIFFLAGS:
4173 return inet_dgram_ops.ioctl(sock, cmd, arg);
4174#endif
4175
4176 default:
4177 return -ENOIOCTLCMD;
4178 }
4179 return 0;
4180}
4181
4182static __poll_t packet_poll(struct file *file, struct socket *sock,
4183 poll_table *wait)
4184{
4185 struct sock *sk = sock->sk;
4186 struct packet_sock *po = pkt_sk(sk);
4187 __poll_t mask = datagram_poll(file, sock, wait);
4188
4189 spin_lock_bh(&sk->sk_receive_queue.lock);
4190 if (po->rx_ring.pg_vec) {
4191 if (!packet_previous_rx_frame(po, &po->rx_ring,
4192 TP_STATUS_KERNEL))
4193 mask |= EPOLLIN | EPOLLRDNORM;
4194 }
4195 packet_rcv_try_clear_pressure(po);
4196 spin_unlock_bh(&sk->sk_receive_queue.lock);
4197 spin_lock_bh(&sk->sk_write_queue.lock);
4198 if (po->tx_ring.pg_vec) {
4199 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
4200 mask |= EPOLLOUT | EPOLLWRNORM;
4201 }
4202 spin_unlock_bh(&sk->sk_write_queue.lock);
4203 return mask;
4204}
4205
4206
4207
4208
4209
4210
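/* The vm_ops hooks below only track the number of active user-space
 * mappings of the ring buffers in po->mapped.
 */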
4211static void packet_mm_open(struct vm_area_struct *vma)
4212{
4213 struct file *file = vma->vm_file;
4214 struct socket *sock = file->private_data;
4215 struct sock *sk = sock->sk;
4216
4217 if (sk)
4218 atomic_inc(&pkt_sk(sk)->mapped);
4219}
4220
4221static void packet_mm_close(struct vm_area_struct *vma)
4222{
4223 struct file *file = vma->vm_file;
4224 struct socket *sock = file->private_data;
4225 struct sock *sk = sock->sk;
4226
4227 if (sk)
4228 atomic_dec(&pkt_sk(sk)->mapped);
4229}
4230
4231static const struct vm_operations_struct packet_mmap_ops = {
4232 .open = packet_mm_open,
4233 .close = packet_mm_close,
4234};
4235
4236static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
4237 unsigned int len)
4238{
4239 int i;
4240
4241 for (i = 0; i < len; i++) {
4242 if (likely(pg_vec[i].buffer)) {
4243 if (is_vmalloc_addr(pg_vec[i].buffer))
4244 vfree(pg_vec[i].buffer);
4245 else
4246 free_pages((unsigned long)pg_vec[i].buffer,
4247 order);
4248 pg_vec[i].buffer = NULL;
4249 }
4250 }
4251 kfree(pg_vec);
4252}
4253
4254static char *alloc_one_pg_vec_page(unsigned long order)
4255{
4256 char *buffer;
4257 gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
4258 __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
4259
4260 buffer = (char *) __get_free_pages(gfp_flags, order);
4261 if (buffer)
4262 return buffer;
4263
4264
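	/* __get_free_pages failed, fall back to vmalloc */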
4265 buffer = vzalloc(array_size((1 << order), PAGE_SIZE));
4266 if (buffer)
4267 return buffer;
4268
4269
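	/* vmalloc also failed; retry the page allocator without __GFP_NORETRY */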
4270 gfp_flags &= ~__GFP_NORETRY;
4271 buffer = (char *) __get_free_pages(gfp_flags, order);
4272 if (buffer)
4273 return buffer;
4274
4275
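	/* complete and utter failure */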
4276 return NULL;
4277}
4278
4279static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
4280{
4281 unsigned int block_nr = req->tp_block_nr;
4282 struct pgv *pg_vec;
4283 int i;
4284
4285 pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL | __GFP_NOWARN);
4286 if (unlikely(!pg_vec))
4287 goto out;
4288
4289 for (i = 0; i < block_nr; i++) {
4290 pg_vec[i].buffer = alloc_one_pg_vec_page(order);
4291 if (unlikely(!pg_vec[i].buffer))
4292 goto out_free_pgvec;
4293 }
4294
4295out:
4296 return pg_vec;
4297
4298out_free_pgvec:
4299 free_pg_vec(pg_vec, order, block_nr);
4300 pg_vec = NULL;
4301 goto out;
4302}
4303
4304static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
4305 int closing, int tx_ring)
4306{
4307 struct pgv *pg_vec = NULL;
4308 struct packet_sock *po = pkt_sk(sk);
4309 unsigned long *rx_owner_map = NULL;
4310 int was_running, order = 0;
4311 struct packet_ring_buffer *rb;
4312 struct sk_buff_head *rb_queue;
4313 __be16 num;
4314 int err;
4315
4316 struct tpacket_req *req = &req_u->req;
4317
4318 rb = tx_ring ? &po->tx_ring : &po->rx_ring;
4319 rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
4320
4321 err = -EBUSY;
4322 if (!closing) {
4323 if (atomic_read(&po->mapped))
4324 goto out;
4325 if (packet_read_pending(rb))
4326 goto out;
4327 }
4328
4329 if (req->tp_block_nr) {
4330 unsigned int min_frame_size;
4331
4332
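		/* Sanity tests and some calculations */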
4333 err = -EBUSY;
4334 if (unlikely(rb->pg_vec))
4335 goto out;
4336
4337 switch (po->tp_version) {
4338 case TPACKET_V1:
4339 po->tp_hdrlen = TPACKET_HDRLEN;
4340 break;
4341 case TPACKET_V2:
4342 po->tp_hdrlen = TPACKET2_HDRLEN;
4343 break;
4344 case TPACKET_V3:
4345 po->tp_hdrlen = TPACKET3_HDRLEN;
4346 break;
4347 }
4348
4349 err = -EINVAL;
4350 if (unlikely((int)req->tp_block_size <= 0))
4351 goto out;
4352 if (unlikely(!PAGE_ALIGNED(req->tp_block_size)))
4353 goto out;
4354 min_frame_size = po->tp_hdrlen + po->tp_reserve;
4355 if (po->tp_version >= TPACKET_V3 &&
4356 req->tp_block_size <
4357 BLK_PLUS_PRIV((u64)req_u->req3.tp_sizeof_priv) + min_frame_size)
4358 goto out;
4359 if (unlikely(req->tp_frame_size < min_frame_size))
4360 goto out;
4361 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
4362 goto out;
4363
4364 rb->frames_per_block = req->tp_block_size / req->tp_frame_size;
4365 if (unlikely(rb->frames_per_block == 0))
4366 goto out;
4367 if (unlikely(rb->frames_per_block > UINT_MAX / req->tp_block_nr))
4368 goto out;
4369 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
4370 req->tp_frame_nr))
4371 goto out;
4372
4373 err = -ENOMEM;
4374 order = get_order(req->tp_block_size);
4375 pg_vec = alloc_pg_vec(req, order);
4376 if (unlikely(!pg_vec))
4377 goto out;
4378 switch (po->tp_version) {
4379 case TPACKET_V3:
4380
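			/* Block transmit is not supported yet */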
4381 if (!tx_ring) {
4382 init_prb_bdqc(po, rb, pg_vec, req_u);
4383 } else {
4384 struct tpacket_req3 *req3 = &req_u->req3;
4385
4386 if (req3->tp_retire_blk_tov ||
4387 req3->tp_sizeof_priv ||
4388 req3->tp_feature_req_word) {
4389 err = -EINVAL;
4390 goto out_free_pg_vec;
4391 }
4392 }
4393 break;
4394 default:
4395 if (!tx_ring) {
4396 rx_owner_map = bitmap_alloc(req->tp_frame_nr,
4397 GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO);
4398 if (!rx_owner_map)
4399 goto out_free_pg_vec;
4400 }
4401 break;
4402 }
4403 }
4404
4405 else {
4406 err = -EINVAL;
4407 if (unlikely(req->tp_frame_nr))
4408 goto out;
4409 }
4410
4411
4412
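	/* Detach socket from network */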
4413 spin_lock(&po->bind_lock);
4414 was_running = po->running;
4415 num = po->num;
4416 if (was_running) {
4417 WRITE_ONCE(po->num, 0);
4418 __unregister_prot_hook(sk, false);
4419 }
4420 spin_unlock(&po->bind_lock);
4421
4422 synchronize_net();
4423
4424 err = -EBUSY;
4425 mutex_lock(&po->pg_vec_lock);
4426 if (closing || atomic_read(&po->mapped) == 0) {
4427 err = 0;
4428 spin_lock_bh(&rb_queue->lock);
4429 swap(rb->pg_vec, pg_vec);
4430 if (po->tp_version <= TPACKET_V2)
4431 swap(rb->rx_owner_map, rx_owner_map);
4432 rb->frame_max = (req->tp_frame_nr - 1);
4433 rb->head = 0;
4434 rb->frame_size = req->tp_frame_size;
4435 spin_unlock_bh(&rb_queue->lock);
4436
4437 swap(rb->pg_vec_order, order);
4438 swap(rb->pg_vec_len, req->tp_block_nr);
4439
4440 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
4441 po->prot_hook.func = (po->rx_ring.pg_vec) ?
4442 tpacket_rcv : packet_rcv;
4443 skb_queue_purge(rb_queue);
4444 if (atomic_read(&po->mapped))
4445 pr_err("packet_mmap: vma is busy: %d\n",
4446 atomic_read(&po->mapped));
4447 }
4448 mutex_unlock(&po->pg_vec_lock);
4449
4450 spin_lock(&po->bind_lock);
4451 if (was_running) {
4452 WRITE_ONCE(po->num, num);
4453 register_prot_hook(sk);
4454 }
4455 spin_unlock(&po->bind_lock);
4456 if (pg_vec && (po->tp_version > TPACKET_V2)) {
4457
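		/* Because we don't support block-based V3 on tx-ring */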
4458 if (!tx_ring)
4459 prb_shutdown_retire_blk_timer(po, rb_queue);
4460 }
4461
4462out_free_pg_vec:
4463 bitmap_free(rx_owner_map);
4464 if (pg_vec)
4465 free_pg_vec(pg_vec, order, req->tp_block_nr);
4466out:
4467 return err;
4468}
4469
4470static int packet_mmap(struct file *file, struct socket *sock,
4471 struct vm_area_struct *vma)
4472{
4473 struct sock *sk = sock->sk;
4474 struct packet_sock *po = pkt_sk(sk);
4475 unsigned long size, expected_size;
4476 struct packet_ring_buffer *rb;
4477 unsigned long start;
4478 int err = -EINVAL;
4479 int i;
4480
4481 if (vma->vm_pgoff)
4482 return -EINVAL;
4483
4484 mutex_lock(&po->pg_vec_lock);
4485
4486 expected_size = 0;
4487 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
4488 if (rb->pg_vec) {
4489 expected_size += rb->pg_vec_len
4490 * rb->pg_vec_pages
4491 * PAGE_SIZE;
4492 }
4493 }
4494
4495 if (expected_size == 0)
4496 goto out;
4497
4498 size = vma->vm_end - vma->vm_start;
4499 if (size != expected_size)
4500 goto out;
4501
4502 start = vma->vm_start;
4503 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
4504 if (rb->pg_vec == NULL)
4505 continue;
4506
4507 for (i = 0; i < rb->pg_vec_len; i++) {
4508 struct page *page;
4509 void *kaddr =