linux/net/ipv4/tcp_minisocks.c
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              Implementation of the Transmission Control Protocol (TCP).
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Mark Evans, <evansmp@uhura.aston.ac.uk>
 *              Corey Minyard <wf-rch!minyard@relay.EU.net>
 *              Florian La Roche, <flla@stud.uni-sb.de>
 *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *              Linus Torvalds, <torvalds@cs.helsinki.fi>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Matthew Dillon, <dillon@apollo.west.oic.com>
 *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *              Jorge Cwik, <jorge@laser.satlink.net>
 */

#include <linux/mm.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/sysctl.h>
#include <linux/workqueue.h>
#include <net/tcp.h>
#include <net/inet_common.h>
#include <net/xfrm.h>

int sysctl_tcp_syncookies __read_mostly = 1;
EXPORT_SYMBOL(sysctl_tcp_syncookies);

int sysctl_tcp_abort_on_overflow __read_mostly;

struct inet_timewait_death_row tcp_death_row = {
        .sysctl_max_tw_buckets = NR_FILE * 2,
        .period         = TCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS,
        .death_lock     = __SPIN_LOCK_UNLOCKED(tcp_death_row.death_lock),
        .hashinfo       = &tcp_hashinfo,
        .tw_timer       = TIMER_INITIALIZER(inet_twdr_hangman, 0,
                                            (unsigned long)&tcp_death_row),
        .twkill_work    = __WORK_INITIALIZER(tcp_death_row.twkill_work,
                                             inet_twdr_twkill_work),
        /* Short-time timewait calendar */
        .twcal_hand     = -1,
        .twcal_timer    = TIMER_INITIALIZER(inet_twdr_twcal_tick, 0,
                                            (unsigned long)&tcp_death_row),
};
EXPORT_SYMBOL_GPL(tcp_death_row);

static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
{
        if (seq == s_win)
                return true;
        if (after(end_seq, s_win) && before(seq, e_win))
                return true;
        return seq == e_win && seq == end_seq;
}
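
/*
 * A worked example of the acceptance rules above, with illustrative
 * numbers s_win == 1000 and e_win == 1500:
 *
 *      seq == 1000                     -> accepted (segment starts at the
 *                                         left window edge)
 *      seq == 900, end_seq == 1100     -> accepted (overlaps the window)
 *      seq == end_seq == 1500          -> accepted (zero-length segment
 *                                         exactly at the right edge)
 *      seq == 400, end_seq == 900      -> rejected (entirely below s_win)
 *
 * All comparisons are mod 2^32: before()/after() use signed 32-bit
 * differences, so the checks stay correct across sequence wraparound.
 */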

/*
 * * The main purpose of TIME-WAIT state is to close the connection
 *   gracefully, when one of the ends sits in LAST-ACK or CLOSING
 *   retransmitting FIN (and, probably, a tail of data) and one or more
 *   of our ACKs are lost.
 * * What is the TIME-WAIT timeout? It is associated with the maximal
 *   packet lifetime in the internet, which leads to the wrong conclusion
 *   that it is set to catch "old duplicate segments" wandering out of
 *   their path. That is not quite correct. This timeout is calculated
 *   so that it exceeds the maximal retransmission timeout by enough to
 *   allow for the loss of one (or more) segments sent by the peer and
 *   of our ACKs. This time may be calculated from the RTO.
 * * When a TIME-WAIT socket receives an RST, it means that the other end
 *   finally closed and we are allowed to kill TIME-WAIT too.
 * * The second purpose of TIME-WAIT is catching old duplicate segments.
 *   Well, certainly it is pure paranoia, but if we load TIME-WAIT
 *   with this semantics, we MUST NOT kill TIME-WAIT state with RSTs.
 * * If we invented some more clever way to catch duplicates
 *   (e.g. based on PAWS), we could truncate TIME-WAIT to several RTOs.
 *
 * The algorithm below is based on a FORMAL INTERPRETATION of the RFCs.
 * When you compare it to the RFCs, please read section SEGMENT ARRIVES
 * from the very beginning.
 *
 * NOTE. With recycling (and later with fin-wait-2) the TW bucket
 * is _not_ stateless. Strictly speaking, that means we must
 * spinlock it. I do not want to! The probability of misbehaviour
 * is ridiculously low and, it seems, we could use some mb() tricks
 * to avoid misread sequence numbers, states etc.  --ANK
 */
enum tcp_tw_status
tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
                           const struct tcphdr *th)
{
        struct tcp_options_received tmp_opt;
        const u8 *hash_location;
        struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
        bool paws_reject = false;

        tmp_opt.saw_tstamp = 0;
        if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
                tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL);

                if (tmp_opt.saw_tstamp) {
                        tmp_opt.ts_recent       = tcptw->tw_ts_recent;
                        tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
                        paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
                }
        }
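
        /* PAWS (RFC 1323): roughly, a segment carrying a timestamp value
         * older than the last one we saw (e.g. rcv_tsval == 480 when
         * tw_ts_recent == 500) is judged an old duplicate and rejected,
         * as long as the remembered stamp is itself fresh; RSTs get some
         * extra tolerance inside tcp_paws_reject().
         */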

        if (tw->tw_substate == TCP_FIN_WAIT2) {
                /* Just repeat all the checks of tcp_rcv_state_process() */

                /* Out of window, send ACK */
                if (paws_reject ||
                    !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
                                   tcptw->tw_rcv_nxt,
                                   tcptw->tw_rcv_nxt + tcptw->tw_rcv_wnd))
                        return TCP_TW_ACK;

                if (th->rst)
                        goto kill;

                if (th->syn && !before(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt))
                        goto kill_with_rst;

                /* Dup ACK? */
                if (!th->ack ||
                    !after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) ||
                    TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) {
                        inet_twsk_put(tw);
                        return TCP_TW_SUCCESS;
                }

                /* New data or FIN. If new data arrives after a half-duplex
                 * close, reset.
                 */
                if (!th->fin ||
                    TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) {
kill_with_rst:
                        inet_twsk_deschedule(tw, &tcp_death_row);
                        inet_twsk_put(tw);
                        return TCP_TW_RST;
                }

                /* FIN arrived, enter true time-wait state. */
                tw->tw_substate   = TCP_TIME_WAIT;
                tcptw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq;
                if (tmp_opt.saw_tstamp) {
                        tcptw->tw_ts_recent_stamp = get_seconds();
                        tcptw->tw_ts_recent       = tmp_opt.rcv_tsval;
                }

                if (tcp_death_row.sysctl_tw_recycle &&
                    tcptw->tw_ts_recent_stamp &&
                    tcp_tw_remember_stamp(tw))
                        inet_twsk_schedule(tw, &tcp_death_row, tw->tw_timeout,
                                           TCP_TIMEWAIT_LEN);
                else
                        inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
                                           TCP_TIMEWAIT_LEN);
                return TCP_TW_ACK;
        }

        /*
         *      Now real TIME-WAIT state.
         *
         *      RFC 1122:
         *      "When a connection is [...] on TIME-WAIT state [...]
         *      [a TCP] MAY accept a new SYN from the remote TCP to
         *      reopen the connection directly, if it:
         *
         *      (1)  assigns its initial sequence number for the new
         *      connection to be larger than the largest sequence
         *      number it used on the previous connection incarnation,
         *      and
         *
         *      (2)  returns to TIME-WAIT state if the SYN turns out
         *      to be an old duplicate".
         */

        if (!paws_reject &&
            (TCP_SKB_CB(skb)->seq == tcptw->tw_rcv_nxt &&
             (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) {
                /* An in-window segment can only be a reset or a bare ACK. */

                if (th->rst) {
                        /* This is TIME_WAIT assassination, in two flavors.
                         * Oh well... nobody has a sufficient solution to this
                         * protocol bug yet.
                         */
                        if (sysctl_tcp_rfc1337 == 0) {
kill:
                                inet_twsk_deschedule(tw, &tcp_death_row);
                                inet_twsk_put(tw);
                                return TCP_TW_SUCCESS;
                        }
                }
                inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
                                   TCP_TIMEWAIT_LEN);

                if (tmp_opt.saw_tstamp) {
                        tcptw->tw_ts_recent       = tmp_opt.rcv_tsval;
                        tcptw->tw_ts_recent_stamp = get_seconds();
                }

                inet_twsk_put(tw);
                return TCP_TW_SUCCESS;
        }

        /* Out of window segment.

           All such segments are ACKed immediately.

           The only exception is a new SYN. We accept it, if it is
           not an old duplicate and we are not in danger of being killed
           by delayed old duplicates. The RFC check, that it carries a
           newer sequence number, works at rates < 40Mbit/sec.
           However, if PAWS works, it is reliable and, even more,
           we may even relax the silly seq-space cutoff.

           RED-PEN: we violate the main RFC requirement: if this SYN
           turns out to be an old duplicate (i.e. we receive an RST in
           reply to our SYN-ACK), we must return the socket to time-wait
           state. It is not good, but not fatal yet.
         */

        if (th->syn && !th->rst && !th->ack && !paws_reject &&
            (after(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt) ||
             (tmp_opt.saw_tstamp &&
              (s32)(tcptw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) {
                u32 isn = tcptw->tw_snd_nxt + 65535 + 2;
                if (isn == 0)
                        isn++;
                TCP_SKB_CB(skb)->when = isn;
                return TCP_TW_SYN;
        }
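
        /* The offset of 65535 + 2 above places the new ISN beyond
         * tw_snd_nxt plus a full 64K window, so it is larger than any
         * sequence number the previous incarnation can have used; that
         * satisfies condition (1) of the RFC 1122 rule quoted earlier.
         * isn == 0 is skipped because the SYN processing path treats
         * when == 0 as "no ISN chosen" and would generate a fresh one.
         */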

        if (paws_reject)
                NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_PAWSESTABREJECTED);

        if (!th->rst) {
                /* In this case we must reset the TIMEWAIT timer.
                 *
                 * If it is an ACKless SYN, it may be both an old duplicate
                 * and a new good SYN with a random sequence number < rcv_nxt.
                 * Do not reschedule in the latter case.
                 */
                if (paws_reject || th->ack)
                        inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
                                           TCP_TIMEWAIT_LEN);

                /* Send ACK. Note, we do not put the bucket,
                 * it will be released by the caller.
                 */
                return TCP_TW_ACK;
        }
        inet_twsk_put(tw);
        return TCP_TW_SUCCESS;
}
EXPORT_SYMBOL(tcp_timewait_state_process);

/*
 * Move a socket to time-wait or dead fin-wait-2 state.
 */
void tcp_time_wait(struct sock *sk, int state, int timeo)
{
        struct inet_timewait_sock *tw = NULL;
        const struct inet_connection_sock *icsk = inet_csk(sk);
        const struct tcp_sock *tp = tcp_sk(sk);
        bool recycle_ok = false;

        if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)
                recycle_ok = tcp_remember_stamp(sk);

        if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets)
                tw = inet_twsk_alloc(sk, state);

        if (tw != NULL) {
                struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
                const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);
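                /* (x << 2) - (x >> 1) == 4x - x/2, i.e. rto here is
                 * 3.5 times the current retransmission timeout.
                 */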
                struct inet_sock *inet = inet_sk(sk);

                tw->tw_transparent      = inet->transparent;
                tw->tw_rcv_wscale       = tp->rx_opt.rcv_wscale;
                tcptw->tw_rcv_nxt       = tp->rcv_nxt;
                tcptw->tw_snd_nxt       = tp->snd_nxt;
                tcptw->tw_rcv_wnd       = tcp_receive_window(tp);
                tcptw->tw_ts_recent     = tp->rx_opt.ts_recent;
                tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;

#if IS_ENABLED(CONFIG_IPV6)
                if (tw->tw_family == PF_INET6) {
                        struct ipv6_pinfo *np = inet6_sk(sk);
                        struct inet6_timewait_sock *tw6;

                        tw->tw_ipv6_offset = inet6_tw_offset(sk->sk_prot);
                        tw6 = inet6_twsk((struct sock *)tw);
                        tw6->tw_v6_daddr = np->daddr;
                        tw6->tw_v6_rcv_saddr = np->rcv_saddr;
                        tw->tw_tclass = np->tclass;
                        tw->tw_ipv6only = np->ipv6only;
                }
#endif

#ifdef CONFIG_TCP_MD5SIG
                /*
                 * The timewait bucket does not have the key DB from the
                 * sock structure. We just make a quick copy of the
                 * md5 key being used (if indeed we are using one)
                 * so the timewait ack generating code has the key.
                 */
                do {
                        struct tcp_md5sig_key *key;
                        tcptw->tw_md5_key = NULL;
                        key = tp->af_specific->md5_lookup(sk, sk);
                        if (key != NULL) {
                                tcptw->tw_md5_key = kmemdup(key, sizeof(*key), GFP_ATOMIC);
                                if (tcptw->tw_md5_key && tcp_alloc_md5sig_pool(sk) == NULL)
                                        BUG();
                        }
                } while (0);
#endif

                /* Linkage updates. */
                __inet_twsk_hashdance(tw, sk, &tcp_hashinfo);

                /* Get the TIME_WAIT timeout firing. */
                if (timeo < rto)
                        timeo = rto;

                if (recycle_ok) {
                        tw->tw_timeout = rto;
                } else {
                        tw->tw_timeout = TCP_TIMEWAIT_LEN;
                        if (state == TCP_TIME_WAIT)
                                timeo = TCP_TIMEWAIT_LEN;
                }
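
                /* Net effect: with recycling and a remembered peer stamp,
                 * a TIME_WAIT bucket is reaped after about 3.5 * RTO;
                 * without recycling it waits the full TCP_TIMEWAIT_LEN
                 * (60 seconds). A dead FIN_WAIT_2 bucket waits the
                 * caller-supplied timeo, clamped up to at least rto.
                 */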
                inet_twsk_schedule(tw, &tcp_death_row, timeo,
                                   TCP_TIMEWAIT_LEN);
                inet_twsk_put(tw);
        } else {
                /* Sorry, if we're out of memory, just CLOSE this
                 * socket up.  We've got bigger problems than
                 * non-graceful socket closings.
                 */
                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPTIMEWAITOVERFLOW);
        }

        tcp_update_metrics(sk);
        tcp_done(sk);
}

void tcp_twsk_destructor(struct sock *sk)
{
#ifdef CONFIG_TCP_MD5SIG
        struct tcp_timewait_sock *twsk = tcp_twsk(sk);

        if (twsk->tw_md5_key) {
                tcp_free_md5sig_pool();
                kfree_rcu(twsk->tw_md5_key, rcu);
        }
#endif
}
EXPORT_SYMBOL_GPL(tcp_twsk_destructor);

static inline void TCP_ECN_openreq_child(struct tcp_sock *tp,
                                         struct request_sock *req)
{
        tp->ecn_flags = inet_rsk(req)->ecn_ok ? TCP_ECN_OK : 0;
}

/* This is not only more efficient than what we used to do, it eliminates
 * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
 *
 * Actually, we could avoid lots of memory writes here. tp of the listening
 * socket contains all the necessary default parameters.
 */
struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb)
{
        struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC);

        if (newsk != NULL) {
                const struct inet_request_sock *ireq = inet_rsk(req);
                struct tcp_request_sock *treq = tcp_rsk(req);
                struct inet_connection_sock *newicsk = inet_csk(newsk);
                struct tcp_sock *newtp = tcp_sk(newsk);
                struct tcp_sock *oldtp = tcp_sk(sk);
                struct tcp_cookie_values *oldcvp = oldtp->cookie_values;

                /* TCP Cookie Transactions require space for the cookie pair,
                 * as it differs for each connection.  There is no need to
                 * copy any s_data_payload stored at the original socket.
                 * Failure will prevent resuming the connection.
                 *
                 * Presumed copied, in order of appearance:
                 *      cookie_in_always, cookie_out_never
                 */
                if (oldcvp != NULL) {
                        struct tcp_cookie_values *newcvp =
                                kzalloc(sizeof(*newtp->cookie_values),
                                        GFP_ATOMIC);

                        if (newcvp != NULL) {
                                kref_init(&newcvp->kref);
                                newcvp->cookie_desired =
                                                oldcvp->cookie_desired;
                                newtp->cookie_values = newcvp;
                        } else {
                                /* Not Yet Implemented */
                                newtp->cookie_values = NULL;
                        }
                }

                /* Now setup tcp_sock */
                newtp->pred_flags = 0;

                newtp->rcv_wup = newtp->copied_seq =
                newtp->rcv_nxt = treq->rcv_isn + 1;

                newtp->snd_sml = newtp->snd_una =
                newtp->snd_nxt = newtp->snd_up =
                        treq->snt_isn + 1 + tcp_s_data_size(oldtp);
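                /* The +1 accounts for our SYN; tcp_s_data_size() adds the
                 * length of any cookie-transaction s_data payload carried
                 * on that SYN, so all send-side marks start just past it.
                 */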

                tcp_prequeue_init(newtp);
                INIT_LIST_HEAD(&newtp->tsq_node);

                tcp_init_wl(newtp, treq->rcv_isn);

                newtp->srtt = 0;
                newtp->mdev = TCP_TIMEOUT_INIT;
                newicsk->icsk_rto = TCP_TIMEOUT_INIT;

                newtp->packets_out = 0;
                newtp->retrans_out = 0;
                newtp->sacked_out = 0;
                newtp->fackets_out = 0;
                newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
                tcp_enable_early_retrans(newtp);

                /* So many TCP implementations out there (incorrectly) count the
                 * initial SYN frame in their delayed-ACK and congestion control
                 * algorithms that we must have the following bandaid to talk
                 * efficiently to them.  -DaveM
                 */
                newtp->snd_cwnd = TCP_INIT_CWND;
                newtp->snd_cwnd_cnt = 0;
                newtp->bytes_acked = 0;

                newtp->frto_counter = 0;
                newtp->frto_highmark = 0;

                if (newicsk->icsk_ca_ops != &tcp_init_congestion_ops &&
                    !try_module_get(newicsk->icsk_ca_ops->owner))
                        newicsk->icsk_ca_ops = &tcp_init_congestion_ops;

                tcp_set_ca_state(newsk, TCP_CA_Open);
                tcp_init_xmit_timers(newsk);
                skb_queue_head_init(&newtp->out_of_order_queue);
                newtp->write_seq = newtp->pushed_seq =
                        treq->snt_isn + 1 + tcp_s_data_size(oldtp);

                newtp->rx_opt.saw_tstamp = 0;

                newtp->rx_opt.dsack = 0;
                newtp->rx_opt.num_sacks = 0;

                newtp->urg_data = 0;

                if (sock_flag(newsk, SOCK_KEEPOPEN))
                        inet_csk_reset_keepalive_timer(newsk,
                                                       keepalive_time_when(newtp));

                newtp->rx_opt.tstamp_ok = ireq->tstamp_ok;
                if ((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) {
                        if (sysctl_tcp_fack)
                                tcp_enable_fack(newtp);
                }
                newtp->window_clamp = req->window_clamp;
                newtp->rcv_ssthresh = req->rcv_wnd;
                newtp->rcv_wnd = req->rcv_wnd;
                newtp->rx_opt.wscale_ok = ireq->wscale_ok;
                if (newtp->rx_opt.wscale_ok) {
                        newtp->rx_opt.snd_wscale = ireq->snd_wscale;
                        newtp->rx_opt.rcv_wscale = ireq->rcv_wscale;
                } else {
                        newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0;
                        newtp->window_clamp = min(newtp->window_clamp, 65535U);
                }
                newtp->snd_wnd = (ntohs(tcp_hdr(skb)->window) <<
                                  newtp->rx_opt.snd_wscale);
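                /* e.g. a raw header window of 512 with snd_wscale == 7
                 * advertises 512 << 7 == 65536 bytes (RFC 1323 window
                 * scaling; wscale == 0 leaves the value unchanged).
                 */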
                newtp->max_window = newtp->snd_wnd;

                if (newtp->rx_opt.tstamp_ok) {
                        newtp->rx_opt.ts_recent = req->ts_recent;
                        newtp->rx_opt.ts_recent_stamp = get_seconds();
                        newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
                } else {
                        newtp->rx_opt.ts_recent_stamp = 0;
                        newtp->tcp_header_len = sizeof(struct tcphdr);
                }
#ifdef CONFIG_TCP_MD5SIG
                newtp->md5sig_info = NULL;      /*XXX*/
                if (newtp->af_specific->md5_lookup(sk, newsk))
                        newtp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
#endif
                if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len)
                        newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;
                newtp->rx_opt.mss_clamp = req->mss;
                TCP_ECN_openreq_child(newtp, req);

                TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS);
        }
        return newsk;
}
EXPORT_SYMBOL(tcp_create_openreq_child);

/*
 *      Process an incoming packet for SYN_RECV sockets represented
 *      as a request_sock.
 */

struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
                           struct request_sock *req,
                           struct request_sock **prev)
{
        struct tcp_options_received tmp_opt;
        const u8 *hash_location;
        struct sock *child;
        const struct tcphdr *th = tcp_hdr(skb);
        __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
        bool paws_reject = false;

        tmp_opt.saw_tstamp = 0;
        if (th->doff > (sizeof(struct tcphdr)>>2)) {
                tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL);

                if (tmp_opt.saw_tstamp) {
                        tmp_opt.ts_recent = req->ts_recent;
                        /* We do not store the true stamp, but it is not
                         * required; it can be estimated (approximately)
                         * from other data.
                         */
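                        /* The estimated age, (TCP_TIMEOUT_INIT / HZ)
                         * << req->retrans seconds, doubles with each
                         * retransmission, mirroring the exponential
                         * backoff of the SYN-ACK timer.
                         */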
                        tmp_opt.ts_recent_stamp = get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->retrans);
                        paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
                }
        }

        /* Check for pure retransmitted SYN. */
        if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn &&
            flg == TCP_FLAG_SYN &&
            !paws_reject) {
                /*
                 * RFC793 draws (incorrectly! It was fixed in RFC1122)
                 * this case on figure 6 and figure 8, but the formal
                 * protocol description says NOTHING.
                 * To be more exact, it says that we should send an ACK,
                 * because this segment (at least, if it has no data)
                 * is out of window.
                 *
                 *  CONCLUSION: RFC793 (even with RFC1122) DOES NOT
                 *  describe the SYN-RECV state. All the description
                 *  is wrong, we cannot believe it and should
                 *  rely only on common sense and implementation
                 *  experience.
                 *
                 * Enforce "SYN-ACK" according to figure 8, figure 6
                 * of RFC793, fixed by RFC1122.
                 */
                req->rsk_ops->rtx_syn_ack(sk, req, NULL);
                return NULL;
        }

        /* Further, reproduce the section "SEGMENT ARRIVES"
           for state SYN-RECEIVED of RFC793.
           It is broken, however: the only case in which it does not
           work is when SYNs are crossed.

           You would think that SYN crossing is impossible here, since
           we should have a SYN_SENT socket (from connect()) on our end,
           but this is not true if the crossed SYNs were sent to both
           ends by a malicious third party.  We must defend against this,
           and to do that we first verify the ACK (as per RFC793, page
           36) and reset if it is invalid.  Is this a true full defense?
           To convince ourselves, let us consider a way in which the ACK
           test can still pass in this 'malicious crossed SYNs' case.
           Malicious sender sends identical SYNs (and thus identical sequence
           numbers) to both A and B:

                A: gets SYN, seq=7
                B: gets SYN, seq=7

           By our good fortune, both A and B select the same initial
           send sequence number of seven :-)

                A: sends SYN|ACK, seq=7, ack_seq=8
                B: sends SYN|ACK, seq=7, ack_seq=8

           So we are now A eating this SYN|ACK, the ACK test passes.  So
           does the sequence test, the SYN is truncated, and thus we
           consider it a bare ACK.

           If icsk->icsk_accept_queue.rskq_defer_accept is set, we silently
           drop this bare ACK.  Otherwise, we create an established
           connection.  Both ends (listening sockets) accept the new
           incoming connection and try to talk to each other. 8-)

           Note: This case is both harmless and rare.  The probability is
           about the same as us discovering intelligent life on another
           planet tomorrow.

           But generally, we should (the RFC lies!) accept an ACK
           from a SYNACK both here and in tcp_rcv_state_process().
           tcp_rcv_state_process() does not, hence, we do not too.

           Note that the case is absolutely generic:
           we cannot optimize anything here without
           violating the protocol. All the checks must be made
           before an attempt to create a socket.
         */

        /* RFC793 page 36: "If the connection is in any non-synchronized state ...
         *                  and the incoming segment acknowledges something not yet
         *                  sent (the segment carries an unacceptable ACK) ...
         *                  a reset is sent."
         *
         * Invalid ACK: reset will be sent by the listening socket.
         */
        if ((flg & TCP_FLAG_ACK) &&
            (TCP_SKB_CB(skb)->ack_seq !=
             tcp_rsk(req)->snt_isn + 1 + tcp_s_data_size(tcp_sk(sk))))
                return sk;

        /* Also, it would not be a bad idea to check rcv_tsecr, which
         * is essentially an ACK extension; too-early or too-late values
         * should cause a reset in unsynchronized states.
         */

        /* RFC793: "first check sequence number". */

        if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
                                          tcp_rsk(req)->rcv_isn + 1, tcp_rsk(req)->rcv_isn + 1 + req->rcv_wnd)) {
                /* Out of window: send ACK and drop. */
                if (!(flg & TCP_FLAG_RST))
                        req->rsk_ops->send_ack(sk, skb, req);
                if (paws_reject)
                        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
                return NULL;
        }

        /* In sequence, PAWS is OK. */

        if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_isn + 1))
                req->ts_recent = tmp_opt.rcv_tsval;

        if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) {
                /* Truncate SYN, it is out of window starting
                   at tcp_rsk(req)->rcv_isn + 1. */
                flg &= ~TCP_FLAG_SYN;
        }

        /* RFC793: "second check the RST bit" and
         *         "fourth, check the SYN bit"
         */
        if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN)) {
                TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
                goto embryonic_reset;
        }

        /* ACK sequence verified above, just make sure ACK is
         * set.  If ACK not set, just silently drop the packet.
         */
        if (!(flg & TCP_FLAG_ACK))
                return NULL;

        /* While TCP_DEFER_ACCEPT is active, drop bare ACK. */
        if (req->retrans < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
            TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
                inet_rsk(req)->acked = 1;
                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP);
                return NULL;
        }
        if (tmp_opt.saw_tstamp && tmp_opt.rcv_tsecr)
                tcp_rsk(req)->snt_synack = tmp_opt.rcv_tsecr;
        else if (req->retrans) /* don't take an RTT sample if
                                  retransmitted && no timestamps */
                tcp_rsk(req)->snt_synack = 0;

        /* OK, the ACK is valid, create the big socket and
         * feed this segment to it. It will repeat all
         * the tests. THIS SEGMENT MUST MOVE THE SOCKET TO
         * ESTABLISHED STATE. If it is dropped after the
         * socket is created, expect trouble.
         */
        child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
        if (child == NULL)
                goto listen_overflow;

        inet_csk_reqsk_queue_unlink(sk, req, prev);
        inet_csk_reqsk_queue_removed(sk, req);

        inet_csk_reqsk_queue_add(sk, req, child);
        return child;

listen_overflow:
        if (!sysctl_tcp_abort_on_overflow) {
                inet_rsk(req)->acked = 1;
                return NULL;
        }

embryonic_reset:
        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS);
        if (!(flg & TCP_FLAG_RST))
                req->rsk_ops->send_reset(sk, skb);

        inet_csk_reqsk_queue_drop(sk, req, prev);
        return NULL;
}
EXPORT_SYMBOL(tcp_check_req);

/*
 * Queue the segment on the new socket if the new socket is active,
 * otherwise we just short-circuit this and continue with
 * the new socket.
 */

int tcp_child_process(struct sock *parent, struct sock *child,
                      struct sk_buff *skb)
{
        int ret = 0;
        int state = child->sk_state;

        if (!sock_owned_by_user(child)) {
                ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb),
                                            skb->len);
                /* Wakeup parent, send SIGIO */
                if (state == TCP_SYN_RECV && child->sk_state != state)
                        parent->sk_data_ready(parent, 0);
        } else {
                /* Alas, it is possible again, because we do the lookup
                 * in the main socket hash table and the lock on the
                 * listening socket no longer protects us.
                 */
                __sk_add_backlog(child, skb);
        }

        bh_unlock_sock(child);
        sock_put(child);
        return ret;
}
EXPORT_SYMBOL(tcp_child_process);