/*
 * Stochastic Fairness Queueing (SFQ) packet scheduler.
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/ipv6.h>
#include <linux/skbuff.h>
#include <linux/jhash.h>
#include <linux/slab.h>
#include <net/ip.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
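
/*
 * Overview: packets are hashed on their flow into one of
 * SFQ_HASH_DIVISOR buckets.  At most SFQ_DEPTH flows can hold packets
 * at any time; active flows are served round-robin, each receiving
 * "quantum" bytes of credit per round.  The hash is keyed with a
 * random perturbation that can be refreshed periodically so that no
 * pair of flows collides forever.
 */
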
#define SFQ_DEPTH		128
#define SFQ_HASH_DIVISOR	1024

/* This type must be able to hold at least 2 * SFQ_DEPTH values */
typedef unsigned char sfq_index;

struct sfq_head
{
	sfq_index	next;
	sfq_index	prev;
};

struct sfq_sched_data
{
/* Parameters */
	int		perturb_period;
	unsigned	quantum;	/* Allotment per round: MUST BE >= MTU */
	int		limit;

/* Variables */
	struct tcf_proto *filter_list;
	struct timer_list perturb_timer;
	u32		perturbation;
	sfq_index	tail;		/* Index of current slot in round */
	sfq_index	max_depth;	/* Maximal depth */

	sfq_index	ht[SFQ_HASH_DIVISOR];	/* Hash table */
	sfq_index	next[SFQ_DEPTH];	/* Active slots link */
	short		allot[SFQ_DEPTH];	/* Current allotment per slot */
	unsigned short	hash[SFQ_DEPTH];	/* Hash value indexed by slot */
	struct sk_buff_head	qs[SFQ_DEPTH];	/* Slot queue */
	struct sfq_head	dep[SFQ_DEPTH * 2];	/* Linked lists of slots, indexed by depth */
};

static __inline__ unsigned sfq_fold_hash(struct sfq_sched_data *q, u32 h, u32 h1)
{
	return jhash_2words(h, h1, q->perturbation) & (SFQ_HASH_DIVISOR - 1);
}

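/*
 * Default flow classifier (used when no tc filters are attached):
 * IPv4 and IPv6 packets are hashed on destination, source and
 * protocol, mixing in the first four bytes of the transport header
 * (the port pair for TCP/UDP-like protocols) when they are safely
 * readable.  Other packets fall back to hashing the route and socket
 * pointers.
 */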
static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb)
{
	u32 h, h2;

	switch (skb->protocol) {
	case htons(ETH_P_IP):
	{
		const struct iphdr *iph;

		if (!pskb_network_may_pull(skb, sizeof(*iph)))
			goto err;
		iph = ip_hdr(skb);
		h = (__force u32)iph->daddr;
		h2 = (__force u32)iph->saddr ^ iph->protocol;
		if (!(iph->frag_off & htons(IP_MF | IP_OFFSET)) &&
		    (iph->protocol == IPPROTO_TCP ||
		     iph->protocol == IPPROTO_UDP ||
		     iph->protocol == IPPROTO_UDPLITE ||
		     iph->protocol == IPPROTO_SCTP ||
		     iph->protocol == IPPROTO_DCCP ||
		     iph->protocol == IPPROTO_ESP) &&
		    pskb_network_may_pull(skb, iph->ihl * 4 + 4))
			h2 ^= *(((u32 *)iph) + iph->ihl);
		break;
	}
	case htons(ETH_P_IPV6):
	{
		struct ipv6hdr *iph;

		if (!pskb_network_may_pull(skb, sizeof(*iph)))
			goto err;
		iph = ipv6_hdr(skb);
		h = (__force u32)iph->daddr.s6_addr32[3];
		h2 = (__force u32)iph->saddr.s6_addr32[3] ^ iph->nexthdr;
		if ((iph->nexthdr == IPPROTO_TCP ||
		     iph->nexthdr == IPPROTO_UDP ||
		     iph->nexthdr == IPPROTO_UDPLITE ||
		     iph->nexthdr == IPPROTO_SCTP ||
		     iph->nexthdr == IPPROTO_DCCP ||
		     iph->nexthdr == IPPROTO_ESP) &&
		    pskb_network_may_pull(skb, sizeof(*iph) + 4))
			h2 ^= *(u32 *)&iph[1];
		break;
	}
	default:
err:
		h = (unsigned long)skb_dst(skb) ^ (__force u32)skb->protocol;
		h2 = (unsigned long)skb->sk;
	}

	return sfq_fold_hash(q, h, h2);
}

static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
				 int *qerr)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct tcf_result res;
	int result;

	if (TC_H_MAJ(skb->priority) == sch->handle &&
	    TC_H_MIN(skb->priority) > 0 &&
	    TC_H_MIN(skb->priority) <= SFQ_HASH_DIVISOR)
		return TC_H_MIN(skb->priority);

	if (!q->filter_list)
		return sfq_hash(q, skb) + 1;

	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	result = tc_classify(skb, q->filter_list, &res);
	if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
		switch (result) {
		case TC_ACT_STOLEN:
		case TC_ACT_QUEUED:
			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
			/* fall through */
		case TC_ACT_SHOT:
			return 0;
		}
#endif
		if (TC_H_MIN(res.classid) <= SFQ_HASH_DIVISOR)
			return TC_H_MIN(res.classid);
	}
	return 0;
}

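/*
 * Slot bookkeeping: dep[] links slots into doubly linked lists keyed by
 * queue depth.  Entry dep[qlen + SFQ_DEPTH] is the list head for all
 * slots currently holding qlen packets, which lets sfq_drop() find a
 * longest slot in O(1) and lets max_depth be maintained incrementally.
 */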
static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	int d = q->qs[x].qlen + SFQ_DEPTH;

	p = d;
	n = q->dep[d].next;
	q->dep[x].next = n;
	q->dep[x].prev = p;
	q->dep[p].next = q->dep[n].prev = x;
}

static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;

	n = q->dep[x].next;
	p = q->dep[x].prev;
	q->dep[p].next = n;
	q->dep[n].prev = p;

	if (n == p && q->max_depth == q->qs[x].qlen + 1)
		q->max_depth--;

	sfq_link(q, x);
}

static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	int d;

	n = q->dep[x].next;
	p = q->dep[x].prev;
	q->dep[p].next = n;
	q->dep[n].prev = p;
	d = q->qs[x].qlen;
	if (q->max_depth < d)
		q->max_depth = d;

	sfq_link(q, x);
}

static unsigned int sfq_drop(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	sfq_index d = q->max_depth;
	struct sk_buff *skb;
	unsigned int len;

	/* Queue is full: find a longest slot and drop a packet
	 * from its tail.
	 */
	if (d > 1) {
		sfq_index x = q->dep[d + SFQ_DEPTH].next;
		skb = q->qs[x].prev;
		len = qdisc_pkt_len(skb);
		__skb_unlink(skb, &q->qs[x]);
		kfree_skb(skb);
		sfq_dec(q, x);
		sch->q.qlen--;
		sch->qstats.drops++;
		sch->qstats.backlog -= len;
		return len;
	}

	if (d == 1) {
		/* All active slots hold exactly one packet: drop from the
		 * slot at the head of the round-robin list and unlink it.
		 */
		d = q->next[q->tail];
		q->next[q->tail] = q->next[d];
		q->allot[q->next[d]] += q->quantum;
		skb = q->qs[d].prev;
		len = qdisc_pkt_len(skb);
		__skb_unlink(skb, &q->qs[d]);
		kfree_skb(skb);
		sfq_dec(q, d);
		sch->q.qlen--;
		q->ht[q->hash[d]] = SFQ_DEPTH;
		sch->qstats.drops++;
		sch->qstats.backlog -= len;
		return len;
	}

	return 0;
}

static int
sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned int hash;
	sfq_index x;
	int uninitialized_var(ret);

	hash = sfq_classify(skb, sch, &ret);
	if (hash == 0) {
		if (ret & __NET_XMIT_BYPASS)
			sch->qstats.drops++;
		kfree_skb(skb);
		return ret;
	}
	hash--;

	x = q->ht[hash];
	if (x == SFQ_DEPTH) {
		/* This bucket has no slot yet: take an empty one
		 * from the depth-0 list.
		 */
		q->ht[hash] = x = q->dep[SFQ_DEPTH].next;
		q->hash[x] = hash;
	}

	/* If the selected slot already holds q->limit packets, all other
	 * slots are empty, so simply drop this packet.
	 */
	if (q->qs[x].qlen >= q->limit)
		return qdisc_drop(skb, sch);

	sch->qstats.backlog += qdisc_pkt_len(skb);
	__skb_queue_tail(&q->qs[x], skb);
	sfq_inc(q, x);
	if (q->qs[x].qlen == 1) {		/* The flow is new */
		if (q->tail == SFQ_DEPTH) {	/* It is the first flow */
			q->tail = x;
			q->next[x] = x;
			q->allot[x] = q->quantum;
		} else {
			q->next[x] = q->next[q->tail];
			q->next[q->tail] = x;
			q->tail = x;
		}
	}
	if (++sch->q.qlen <= q->limit) {
		sch->bstats.bytes += qdisc_pkt_len(skb);
		sch->bstats.packets++;
		return NET_XMIT_SUCCESS;
	}

	sfq_drop(sch);
	return NET_XMIT_CN;
}

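/*
 * Dequeue side: q->tail points at the most recently added active slot and
 * q->next[] chains the active slots into a circular list.  Each slot spends
 * its byte allotment before the round moves on; a slot that becomes empty
 * is unlinked from the list and its hash bucket is released.
 */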
static struct sk_buff *
sfq_peek(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	sfq_index a;

	/* No active slots */
	if (q->tail == SFQ_DEPTH)
		return NULL;

	a = q->next[q->tail];
	return skb_peek(&q->qs[a]);
}

static struct sk_buff *
sfq_dequeue(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;
	sfq_index a, old_a;

	/* No active slots */
	if (q->tail == SFQ_DEPTH)
		return NULL;

	a = old_a = q->next[q->tail];

	/* Grab packet */
	skb = __skb_dequeue(&q->qs[a]);
	sfq_dec(q, a);
	sch->q.qlen--;
	sch->qstats.backlog -= qdisc_pkt_len(skb);

	/* Is the slot empty? */
	if (q->qs[a].qlen == 0) {
		q->ht[q->hash[a]] = SFQ_DEPTH;
		a = q->next[a];
		if (a == old_a) {
			q->tail = SFQ_DEPTH;
			return skb;
		}
		q->next[q->tail] = a;
		q->allot[a] += q->quantum;
	} else if ((q->allot[a] -= qdisc_pkt_len(skb)) <= 0) {
		q->tail = a;
		a = q->next[a];
		q->allot[a] += q->quantum;
	}
	return skb;
}

static void
sfq_reset(struct Qdisc *sch)
{
	struct sk_buff *skb;

	while ((skb = sfq_dequeue(sch)) != NULL)
		kfree_skb(skb);
}

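/*
 * Periodic hash perturbation: picking a fresh random key forces flows to
 * be rehashed, so two flows that happen to collide in one hash bucket do
 * not share a slot forever.
 */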
static void sfq_perturbation(unsigned long arg)
{
	struct Qdisc *sch = (struct Qdisc *)arg;
	struct sfq_sched_data *q = qdisc_priv(sch);

	q->perturbation = net_random();

	if (q->perturb_period)
		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
}

static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct tc_sfq_qopt *ctl = nla_data(opt);
	unsigned int qlen;

	if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
		return -EINVAL;

	sch_tree_lock(sch);
	q->quantum = ctl->quantum ? : psched_mtu(qdisc_dev(sch));
	q->perturb_period = ctl->perturb_period * HZ;
	if (ctl->limit)
		q->limit = min_t(u32, ctl->limit, SFQ_DEPTH - 1);

	qlen = sch->q.qlen;
	while (sch->q.qlen > q->limit)
		sfq_drop(sch);
	qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);

	del_timer(&q->perturb_timer);
	if (q->perturb_period) {
		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
		q->perturbation = net_random();
	}
	sch_tree_unlock(sch);
	return 0;
}

static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	int i;

	q->perturb_timer.function = sfq_perturbation;
	q->perturb_timer.data = (unsigned long)sch;
	init_timer_deferrable(&q->perturb_timer);

	for (i = 0; i < SFQ_HASH_DIVISOR; i++)
		q->ht[i] = SFQ_DEPTH;

	for (i = 0; i < SFQ_DEPTH; i++) {
		skb_queue_head_init(&q->qs[i]);
		q->dep[i + SFQ_DEPTH].next = i + SFQ_DEPTH;
		q->dep[i + SFQ_DEPTH].prev = i + SFQ_DEPTH;
	}

	q->limit = SFQ_DEPTH - 1;
	q->max_depth = 0;
	q->tail = SFQ_DEPTH;
	if (opt == NULL) {
		q->quantum = psched_mtu(qdisc_dev(sch));
		q->perturb_period = 0;
		q->perturbation = net_random();
	} else {
		int err = sfq_change(sch, opt);
		if (err)
			return err;
	}

	for (i = 0; i < SFQ_DEPTH; i++)
		sfq_link(q, i);
	return 0;
}

static void sfq_destroy(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);

	tcf_destroy_chain(&q->filter_list);
	q->perturb_period = 0;
	del_timer_sync(&q->perturb_timer);
}

static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned char *b = skb_tail_pointer(skb);
	struct tc_sfq_qopt opt;

	opt.quantum = q->quantum;
	opt.perturb_period = q->perturb_period / HZ;

	opt.limit = q->limit;
	opt.divisor = SFQ_HASH_DIVISOR;
	opt.flows = q->limit;

	NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);

	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg)
{
	return NULL;
}

static unsigned long sfq_get(struct Qdisc *sch, u32 classid)
{
	return 0;
}

static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
			      u32 classid)
{
	return 0;
}

static void sfq_put(struct Qdisc *q, unsigned long cl)
{
}

static struct tcf_proto **sfq_find_tcf(struct Qdisc *sch, unsigned long cl)
{
	struct sfq_sched_data *q = qdisc_priv(sch);

	if (cl)
		return NULL;
	return &q->filter_list;
}

static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
			  struct sk_buff *skb, struct tcmsg *tcm)
{
	tcm->tcm_handle |= TC_H_MIN(cl);
	return 0;
}

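/*
 * Class statistics: class id "cl" maps to hash bucket cl - 1, so the dump
 * reports the queue length and remaining allotment of whichever slot that
 * bucket currently points at.
 */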
static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
				struct gnet_dump *d)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	sfq_index idx = q->ht[cl - 1];
	struct gnet_stats_queue qs = { .qlen = q->qs[idx].qlen };
	struct tc_sfq_xstats xstats = { .allot = q->allot[idx] };

	if (gnet_stats_copy_queue(d, &qs) < 0)
		return -1;
	return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
}

static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned int i;

	if (arg->stop)
		return;

	for (i = 0; i < SFQ_HASH_DIVISOR; i++) {
		if (q->ht[i] == SFQ_DEPTH ||
		    arg->count < arg->skip) {
			arg->count++;
			continue;
		}
		if (arg->fn(sch, i + 1, arg) < 0) {
			arg->stop = 1;
			break;
		}
		arg->count++;
	}
}

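/*
 * SFQ exposes its hash buckets as read-only classes: tc filters can be
 * attached to steer packets into a bucket, but classes cannot be created,
 * deleted or given child qdiscs.
 */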
static const struct Qdisc_class_ops sfq_class_ops = {
	.leaf		=	sfq_leaf,
	.get		=	sfq_get,
	.put		=	sfq_put,
	.tcf_chain	=	sfq_find_tcf,
	.bind_tcf	=	sfq_bind,
	.unbind_tcf	=	sfq_put,
	.dump		=	sfq_dump_class,
	.dump_stats	=	sfq_dump_class_stats,
	.walk		=	sfq_walk,
};

static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
	.cl_ops		=	&sfq_class_ops,
	.id		=	"sfq",
	.priv_size	=	sizeof(struct sfq_sched_data),
	.enqueue	=	sfq_enqueue,
	.dequeue	=	sfq_dequeue,
	.peek		=	sfq_peek,
	.drop		=	sfq_drop,
	.init		=	sfq_init,
	.reset		=	sfq_reset,
	.destroy	=	sfq_destroy,
	.change		=	NULL,
	.dump		=	sfq_dump,
	.owner		=	THIS_MODULE,
};

static int __init sfq_module_init(void)
{
	return register_qdisc(&sfq_qdisc_ops);
}

static void __exit sfq_module_exit(void)
{
	unregister_qdisc(&sfq_qdisc_ops);
}

module_init(sfq_module_init)
module_exit(sfq_module_exit)
MODULE_LICENSE("GPL");