/*
 * net/sched/sch_sfq.c	Stochastic Fairness Queueing discipline.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 */
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/ipv6.h>
#include <linux/skbuff.h>
#include <linux/jhash.h>
#include <linux/slab.h>
#include <net/ip.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
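
/*
 * Stochastic Fairness Queueing (SFQ) discipline, following the scheme
 * described by Paul E. McKenney, "Stochastic Fairness Queuing",
 * IEEE INFOCOM'90.  A short summary of what the code below implements:
 *
 * - Packets are hashed (source, destination, protocol and, when
 *   available, ports) into one of SFQ_HASH_DIVISOR buckets; each active
 *   bucket is mapped to one of SFQ_DEPTH packet slots (qs[]).
 * - Active slots are served round-robin, each slot receiving an
 *   allotment of one quantum of bytes per round.  Fairness is only
 *   statistical ("stochastic"): flows that collide in the hash share a
 *   slot.
 * - The hash is salted with a random perturbation which can be renewed
 *   every perturb_period seconds to break up long-lived collisions.
 *
 * The fixed sizes (128 slots, 1024 buckets) keep the per-qdisc state
 * small; sfq_index must be able to hold SFQ_DEPTH * 2 values, hence the
 * unsigned char typedef below.
 */
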
#define SFQ_DEPTH		128
#define SFQ_HASH_DIVISOR	1024

/* This type must be able to hold at least SFQ_DEPTH * 2 values */
typedef unsigned char sfq_index;

struct sfq_head
{
	sfq_index	next;
	sfq_index	prev;
};

struct sfq_sched_data
{
/* Parameters */
	int		perturb_period;
	unsigned	quantum;	/* Allotment per round: MUST BE >= MTU */
	int		limit;

/* Variables */
	struct tcf_proto *filter_list;
	struct timer_list perturb_timer;
	u32		perturbation;
	sfq_index	tail;		/* Index of current slot in round */
	sfq_index	max_depth;	/* Maximal depth */

	sfq_index	ht[SFQ_HASH_DIVISOR];	/* Hash table */
	sfq_index	next[SFQ_DEPTH];	/* Active slots link */
	short		allot[SFQ_DEPTH];	/* Current allotment per slot */
	unsigned short	hash[SFQ_DEPTH];	/* Hash value indexed by slots */
	struct sk_buff_head	qs[SFQ_DEPTH];	/* Slot queue */
	struct sfq_head	dep[SFQ_DEPTH * 2];	/* Linked lists of slots, indexed by depth */
};

static __inline__ unsigned sfq_fold_hash(struct sfq_sched_data *q, u32 h, u32 h1)
{
	return jhash_2words(h, h1, q->perturbation) & (SFQ_HASH_DIVISOR - 1);
}

static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb)
{
	u32 h, h2;

	switch (skb->protocol) {
	case htons(ETH_P_IP):
	{
		const struct iphdr *iph;
		int poff;

		if (!pskb_network_may_pull(skb, sizeof(*iph)))
			goto err;
		iph = ip_hdr(skb);
		h = (__force u32)iph->daddr;
		h2 = (__force u32)iph->saddr ^ iph->protocol;
		if (iph->frag_off & htons(IP_MF|IP_OFFSET))
			break;
		poff = proto_ports_offset(iph->protocol);
		if (poff >= 0 &&
		    pskb_network_may_pull(skb, iph->ihl * 4 + 4 + poff)) {
			iph = ip_hdr(skb);
			h2 ^= *(u32 *)((void *)iph + iph->ihl * 4 + poff);
		}
		break;
	}
	case htons(ETH_P_IPV6):
	{
		struct ipv6hdr *iph;
		int poff;

		if (!pskb_network_may_pull(skb, sizeof(*iph)))
			goto err;
		iph = ipv6_hdr(skb);
		h = (__force u32)iph->daddr.s6_addr32[3];
		h2 = (__force u32)iph->saddr.s6_addr32[3] ^ iph->nexthdr;
		poff = proto_ports_offset(iph->nexthdr);
		if (poff >= 0 &&
		    pskb_network_may_pull(skb, sizeof(*iph) + 4 + poff)) {
			iph = ipv6_hdr(skb);
			h2 ^= *(u32 *)((void *)iph + sizeof(*iph) + poff);
		}
		break;
	}
	default:
err:
		h = (unsigned long)skb_dst(skb) ^ (__force u32)skb->protocol;
		h2 = (unsigned long)skb->sk;
	}

	return sfq_fold_hash(q, h, h2);
}

static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
				 int *qerr)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct tcf_result res;
	int result;

	if (TC_H_MAJ(skb->priority) == sch->handle &&
	    TC_H_MIN(skb->priority) > 0 &&
	    TC_H_MIN(skb->priority) <= SFQ_HASH_DIVISOR)
		return TC_H_MIN(skb->priority);

	if (!q->filter_list)
		return sfq_hash(q, skb) + 1;

	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	result = tc_classify(skb, q->filter_list, &res);
	if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
		switch (result) {
		case TC_ACT_STOLEN:
		case TC_ACT_QUEUED:
			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
		case TC_ACT_SHOT:
			return 0;
		}
#endif
		if (TC_H_MIN(res.classid) <= SFQ_HASH_DIVISOR)
			return TC_H_MIN(res.classid);
	}
	return 0;
}

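/*
 * Slot bookkeeping used by sfq_link()/sfq_dec()/sfq_inc(): dep[] holds
 * one doubly linked list of slots per queue length d, with the list head
 * stored at index d + SFQ_DEPTH.  Every slot x is kept on the list that
 * matches qs[x].qlen, so dep[SFQ_DEPTH].next yields a free (empty) slot
 * when one exists, and dep[max_depth + SFQ_DEPTH].next yields one of the
 * longest slots, which is the victim picked by sfq_drop().
 */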
static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	int d = q->qs[x].qlen + SFQ_DEPTH;

	p = d;
	n = q->dep[d].next;
	q->dep[x].next = n;
	q->dep[x].prev = p;
	q->dep[p].next = q->dep[n].prev = x;
}

static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;

	n = q->dep[x].next;
	p = q->dep[x].prev;
	q->dep[p].next = n;
	q->dep[n].prev = p;

	if (n == p && q->max_depth == q->qs[x].qlen + 1)
		q->max_depth--;

	sfq_link(q, x);
}

static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	int d;

	n = q->dep[x].next;
	p = q->dep[x].prev;
	q->dep[p].next = n;
	q->dep[n].prev = p;
	d = q->qs[x].qlen;
	if (q->max_depth < d)
		q->max_depth = d;

	sfq_link(q, x);
}

static unsigned int sfq_drop(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	sfq_index d = q->max_depth;
	struct sk_buff *skb;
	unsigned int len;

	/* Queue is full: find the longest slot and drop the tail
	 * packet from it.
	 */
	if (d > 1) {
		sfq_index x = q->dep[d + SFQ_DEPTH].next;
		skb = q->qs[x].prev;
		len = qdisc_pkt_len(skb);
		__skb_unlink(skb, &q->qs[x]);
		kfree_skb(skb);
		sfq_dec(q, x);
		sch->q.qlen--;
		sch->qstats.drops++;
		sch->qstats.backlog -= len;
		return len;
	}

	if (d == 1) {
		/* All active slots hold exactly one packet: drop the
		 * packet from the slot right after the current tail.
		 */
		d = q->next[q->tail];
		q->next[q->tail] = q->next[d];
		skb = q->qs[d].prev;
		len = qdisc_pkt_len(skb);
		__skb_unlink(skb, &q->qs[d]);
		kfree_skb(skb);
		sfq_dec(q, d);
		sch->q.qlen--;
		q->ht[q->hash[d]] = SFQ_DEPTH;
		sch->qstats.drops++;
		sch->qstats.backlog -= len;
		return len;
	}

	return 0;
}

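/*
 * Enqueue in brief: classify the packet into a hash bucket (or let an
 * attached filter choose one), map the bucket to a slot through ht[],
 * allocating a free slot on first use, append the skb to that slot and,
 * if the slot just became active, link it into the round-robin ring with
 * a fresh allotment of one quantum.
 */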
static int
sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned int hash;
	sfq_index x;
	int uninitialized_var(ret);

	hash = sfq_classify(skb, sch, &ret);
	if (hash == 0) {
		if (ret & __NET_XMIT_BYPASS)
			sch->qstats.drops++;
		kfree_skb(skb);
		return ret;
	}
	hash--;

	x = q->ht[hash];
	if (x == SFQ_DEPTH) {
		q->ht[hash] = x = q->dep[SFQ_DEPTH].next;
		q->hash[x] = hash;
	}

	/* If the selected slot already holds q->limit packets, all other
	 * slots are empty, so dropping this packet is enough (we never
	 * need more than one drop).
	 */
	if (q->qs[x].qlen >= q->limit)
		return qdisc_drop(skb, sch);

	sch->qstats.backlog += qdisc_pkt_len(skb);
	__skb_queue_tail(&q->qs[x], skb);
	sfq_inc(q, x);
	if (q->qs[x].qlen == 1) {		/* slot just became active */
		if (q->tail == SFQ_DEPTH) {	/* ring was empty */
			q->next[x] = x;
		} else {
			q->next[x] = q->next[q->tail];
			q->next[q->tail] = x;
		}
		q->tail = x;
		q->allot[x] = q->quantum;
	}
	if (++sch->q.qlen <= q->limit) {
		sch->bstats.bytes += qdisc_pkt_len(skb);
		sch->bstats.packets++;
		return NET_XMIT_SUCCESS;
	}

	sfq_drop(sch);
	return NET_XMIT_CN;
}

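/*
 * Dequeue in brief: active slots form a circular list threaded through
 * next[], with q->tail pointing at the slot served last, so the head of
 * the round is q->next[q->tail].  Each dequeued packet is charged to
 * that slot's allot[]; once the allotment is exhausted the slot is
 * refilled with one quantum and rotated to the back of the ring, giving
 * every active flow roughly one quantum of bytes per round.
 */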
static struct sk_buff *
sfq_peek(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	sfq_index a;

	/* No active slots */
	if (q->tail == SFQ_DEPTH)
		return NULL;

	a = q->next[q->tail];
	return skb_peek(&q->qs[a]);
}

static struct sk_buff *
sfq_dequeue(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;
	sfq_index a, next_a;

	/* No active slots */
	if (q->tail == SFQ_DEPTH)
		return NULL;

	a = q->next[q->tail];

	/* Grab packet */
	skb = __skb_dequeue(&q->qs[a]);
	sfq_dec(q, a);
	sch->q.qlen--;
	sch->qstats.backlog -= qdisc_pkt_len(skb);

	/* Is the slot empty? */
	if (q->qs[a].qlen == 0) {
		q->ht[q->hash[a]] = SFQ_DEPTH;
		next_a = q->next[a];
		if (a == next_a) {
			q->tail = SFQ_DEPTH;
			return skb;
		}
		q->next[q->tail] = next_a;
	} else if ((q->allot[a] -= qdisc_pkt_len(skb)) <= 0) {
		q->allot[a] += q->quantum;
		q->tail = a;
	}
	return skb;
}

static void
sfq_reset(struct Qdisc *sch)
{
	struct sk_buff *skb;

	while ((skb = sfq_dequeue(sch)) != NULL)
		kfree_skb(skb);
}

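/*
 * Timer callback: choose a new random salt for the hash so that flows
 * which happened to collide in one bucket are (probably) spread apart
 * again.  The timer re-arms itself as long as perturb_period is
 * non-zero; packets already queued keep the slot they were hashed to.
 */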
static void sfq_perturbation(unsigned long arg)
{
	struct Qdisc *sch = (struct Qdisc *)arg;
	struct sfq_sched_data *q = qdisc_priv(sch);

	q->perturbation = net_random();

	if (q->perturb_period)
		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
}

static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct tc_sfq_qopt *ctl = nla_data(opt);
	unsigned int qlen;

	if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
		return -EINVAL;

	sch_tree_lock(sch);
	q->quantum = ctl->quantum ? : psched_mtu(qdisc_dev(sch));
	q->perturb_period = ctl->perturb_period * HZ;
	if (ctl->limit)
		q->limit = min_t(u32, ctl->limit, SFQ_DEPTH - 1);

	qlen = sch->q.qlen;
	while (sch->q.qlen > q->limit)
		sfq_drop(sch);
	qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);

	del_timer(&q->perturb_timer);
	if (q->perturb_period) {
		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
		q->perturbation = net_random();
	}
	sch_tree_unlock(sch);
	return 0;
}

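/*
 * Tunables handled by sfq_change() above and sfq_init() below: quantum
 * (bytes served per slot per round, defaulting to one MTU), perturb
 * (seconds between hash re-salting, 0 meaning never) and limit (packet
 * cap, at most SFQ_DEPTH - 1).  As an illustrative example of setting
 * them from user space (exact syntax depends on the iproute2 version):
 *
 *	tc qdisc add dev eth0 root sfq perturb 10 quantum 1514 limit 127
 */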
static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	int i;

	q->perturb_timer.function = sfq_perturbation;
	q->perturb_timer.data = (unsigned long)sch;
	init_timer_deferrable(&q->perturb_timer);

	for (i = 0; i < SFQ_HASH_DIVISOR; i++)
		q->ht[i] = SFQ_DEPTH;

	for (i = 0; i < SFQ_DEPTH; i++) {
		skb_queue_head_init(&q->qs[i]);
		q->dep[i + SFQ_DEPTH].next = i + SFQ_DEPTH;
		q->dep[i + SFQ_DEPTH].prev = i + SFQ_DEPTH;
	}

	q->limit = SFQ_DEPTH - 1;
	q->max_depth = 0;
	q->tail = SFQ_DEPTH;
	if (opt == NULL) {
		q->quantum = psched_mtu(qdisc_dev(sch));
		q->perturb_period = 0;
		q->perturbation = net_random();
	} else {
		int err = sfq_change(sch, opt);
		if (err)
			return err;
	}

	for (i = 0; i < SFQ_DEPTH; i++)
		sfq_link(q, i);
	return 0;
}

static void sfq_destroy(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);

	tcf_destroy_chain(&q->filter_list);
	q->perturb_period = 0;
	del_timer_sync(&q->perturb_timer);
}

static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned char *b = skb_tail_pointer(skb);
	struct tc_sfq_qopt opt;

	opt.quantum = q->quantum;
	opt.perturb_period = q->perturb_period / HZ;

	opt.limit = q->limit;
	opt.divisor = SFQ_HASH_DIVISOR;
	opt.flows = q->limit;

	NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);

	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

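/*
 * SFQ has no configurable classes: the "classes" exposed through the
 * class ops below are simply the hash buckets 1..SFQ_HASH_DIVISOR, made
 * visible so that filters can be attached and per-bucket queue length
 * and remaining allotment can be dumped.  leaf/get/bind/put are
 * therefore stubs; only tcf_chain, walk and the dump helpers do real
 * work.
 */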
static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg)
{
	return NULL;
}

static unsigned long sfq_get(struct Qdisc *sch, u32 classid)
{
	return 0;
}

static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
			      u32 classid)
{
	return 0;
}

static void sfq_put(struct Qdisc *q, unsigned long cl)
{
}

static struct tcf_proto **sfq_find_tcf(struct Qdisc *sch, unsigned long cl)
{
	struct sfq_sched_data *q = qdisc_priv(sch);

	if (cl)
		return NULL;
	return &q->filter_list;
}

static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
			  struct sk_buff *skb, struct tcmsg *tcm)
{
	tcm->tcm_handle |= TC_H_MIN(cl);
	return 0;
}

static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
				struct gnet_dump *d)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	sfq_index idx = q->ht[cl - 1];
	struct gnet_stats_queue qs = { .qlen = q->qs[idx].qlen };
	struct tc_sfq_xstats xstats = { .allot = q->allot[idx] };

	if (gnet_stats_copy_queue(d, &qs) < 0)
		return -1;
	return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
}

static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned int i;

	if (arg->stop)
		return;

	for (i = 0; i < SFQ_HASH_DIVISOR; i++) {
		if (q->ht[i] == SFQ_DEPTH ||
		    arg->count < arg->skip) {
			arg->count++;
			continue;
		}
		if (arg->fn(sch, i + 1, arg) < 0) {
			arg->stop = 1;
			break;
		}
		arg->count++;
	}
}

static const struct Qdisc_class_ops sfq_class_ops = {
	.leaf		= sfq_leaf,
	.get		= sfq_get,
	.put		= sfq_put,
	.tcf_chain	= sfq_find_tcf,
	.bind_tcf	= sfq_bind,
	.unbind_tcf	= sfq_put,
	.dump		= sfq_dump_class,
	.dump_stats	= sfq_dump_class_stats,
	.walk		= sfq_walk,
};

static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
	.cl_ops		= &sfq_class_ops,
	.id		= "sfq",
	.priv_size	= sizeof(struct sfq_sched_data),
	.enqueue	= sfq_enqueue,
	.dequeue	= sfq_dequeue,
	.peek		= sfq_peek,
	.drop		= sfq_drop,
	.init		= sfq_init,
	.reset		= sfq_reset,
	.destroy	= sfq_destroy,
	.change		= NULL,
	.dump		= sfq_dump,
	.owner		= THIS_MODULE,
};

static int __init sfq_module_init(void)
{
	return register_qdisc(&sfq_qdisc_ops);
}

static void __exit sfq_module_exit(void)
{
	unregister_qdisc(&sfq_qdisc_ops);
}

module_init(sfq_module_init)
module_exit(sfq_module_exit)
MODULE_LICENSE("GPL");