/*
 * Initial TCP retransmission timeouts, expressed as multiples of HZ
 * (HZ is defined elsewhere; presumably the kernel tick rate).
 * Each directive must sit on its own line: a second #define on the same
 * line would be absorbed into the first macro's replacement text.
 */
#define TCP_TIMEOUT_INIT ((unsigned)(1*HZ))	/* RFC2988bis initial RTO value	*/
#define TCP_TIMEOUT_FALLBACK ((unsigned)(3*HZ))	/* RFC 1122 initial RTO value, now
						 * used as a fallback RTO for the
						 * initial data transmission if no
						 * valid RTT sample has been acquired,
						 * most likely due to retrans in 3WHS.
						 */
使用iptables对ack包进行过滤:
*filter
:INPUT DROP [0:0]
:FORWARD DROP [0:0]
:OUTPUT DROP [0:0]
-A OUTPUT -p tcp --tcp-flags SYN,ACK SYN,ACK -j ACCEPT
-A INPUT -p tcp --tcp-flags SYN SYN -j ACCEPT
/* This function calculates a "timeout" which is equivalent to the timeout of a * TCP connection after "boundary" unsucessful, exponentially backed-off * retransmissions with an initial RTO of TCP_RTO_MIN or TCP_TIMEOUT_INIT if * syn_set flag is set. */ staticinlineboolretransmits_timed_out(struct sock *sk, unsignedint boundary, unsignedint timeout, bool syn_set) { unsignedint linear_backoff_thresh, start_ts;
/*
 * Bounds and initial values for the TCP retransmission timeout (RTO),
 * expressed as multiples of HZ (HZ is defined elsewhere; presumably the
 * kernel tick rate).
 * Each directive must sit on its own line: a second #define on the same
 * line would be absorbed into the first macro's replacement text.
 */
#define TCP_RTO_MAX	((unsigned)(120*HZ))
#define TCP_RTO_MIN	((unsigned)(HZ/5))
#define TCP_TIMEOUT_INIT ((unsigned)(1*HZ))	/* RFC2988bis initial RTO value	*/
#define TCP_TIMEOUT_FALLBACK ((unsigned)(3*HZ))	/* RFC 1122 initial RTO value, now
						 * used as a fallback RTO for the
						 * initial data transmission if no
						 * valid RTT sample has been acquired,
						 * most likely due to retrans in 3WHS.
						 */
[root@localhost.localdomain ~]# grep CONFIG_HZ /boot/config-2.6.32-358.el6.x86_64
# CONFIG_HZ_100 is not set
# CONFIG_HZ_250 is not set
# CONFIG_HZ_300 is not set
CONFIG_HZ_1000=y
CONFIG_HZ=1000
[root@localhost.localdomain ~]# cat /proc/sys/net/ipv4/tcp_syn_retries
5
[root@localhost.localdomain ~]# cat /proc/sys/net/ipv4/tcp_retries2
15
staticinlinevoidtcp_ack_update_rtt(struct sock *sk, constint flag, const s32 seq_rtt) { conststructtcp_sock *tp = tcp_sk(sk); /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */ if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) tcp_ack_saw_tstamp(sk, flag); elseif (seq_rtt >= 0) tcp_ack_no_tstamp(sk, seq_rtt, flag); }
/* Read draft-ietf-tcplw-high-performance before mucking * with this code. (Supersedes RFC1323) */ staticvoidtcp_ack_saw_tstamp(struct sock *sk, int flag) { /* RTTM Rule: A TSecr value received in a segment is used to * update the averaged RTT measurement only if the segment * acknowledges some new data, i.e., only if it advances the * left edge of the send window. * * See draft-ietf-tcplw-high-performance-00, section 3.3. * 1998/04/10 Andrey V. Savochkin <saw@msu.ru> * * Changed: reset backoff as soon as we see the first valid sample. * If we do not, we get strongly overestimated rto. With timestamps * samples are accepted even from very old segments: f.e., when rtt=1 * increases to 8, we retransmit 5 times and after 8 seconds delayed * answer arrives rto becomes 120 seconds! If at least one of segments * in window is lost... Voila. --ANK (010210) */ structtcp_sock *tp = tcp_sk(sk);
staticvoidtcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, int flag) { /* We don't have a timestamp. Can only use * packets that are not retransmitted to determine * rtt estimates. Also, we must not reset the * backoff for rto until we get a non-retransmitted * packet. This allows us to deal with a situation * where the network delay has increased suddenly. * I.e. Karn's algorithm. (SIGCOMM '87, p5.) */
/* Called to compute a smoothed rtt estimate. The data fed to this * routine either comes from timestamps, or from segments that were * known _not_ to have been retransmitted [see Karn/Partridge * Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88 * piece by Van Jacobson. * NOTE: the next three routines used to be one big routine. * To save cycles in the RFC 1323 implementation it was better to break * it up into three procedures. -- erics */ staticvoidtcp_rtt_estimator(struct sock *sk, const __u32 mrtt) { structtcp_sock *tp = tcp_sk(sk); long m = mrtt; /* RTT */ //mrtt就是刚才得到的seq_rtt,新得到的RTT值
/* The following amusing code comes from Jacobson's * article in SIGCOMM '88. Note that rtt and mdev * are scaled versions of rtt and mean deviation. * This is designed to be as fast as possible * m stands for "measurement". * * On a 1990 paper the rto value is changed to: * RTO = rtt + 4 * mdev * * Funny. This algorithm seems to be very broken. * These formulae increase RTO, when it should be decreased, increase * too slowly, when it should be increased quickly, decrease too quickly * etc. I guess in BSD RTO takes ONE value, so that it is absolutely * does not matter how to _calculate_ it. Seems, it was trap * that VJ failed to avoid. 8) */ if (m == 0) m = 1; if (tp->srtt != 0) { m -= (tp->srtt >> 3); /* m is now error in rtt est */ tp->srtt += m; /* rtt = 7/8 rtt + 1/8 new */ if (m < 0) { m = -m; /* m is now abs(error) */ m -= (tp->mdev >> 2); /* similar update on mdev */ /* This is similar to one of Eifel findings. * Eifel blocks mdev updates when rtt decreases. * This solution is a bit different: we use finer gain * for mdev in this case (alpha*beta). * Like Eifel it also prevents growth of rto, * but also it limits too fast rto decreases, * happening in pure Eifel. */ if (m > 0) m >>= 3; } else { m -= (tp->mdev >> 2); /* similar update on mdev */ } tp->mdev += m; /* mdev = 3/4 mdev + 1/4 new */ if (tp->mdev > tp->mdev_max) { tp->mdev_max = tp->mdev; if (tp->mdev_max > tp->rttvar) tp->rttvar = tp->mdev_max; } if (after(tp->snd_una, tp->rtt_seq)) { if (tp->mdev_max < tp->rttvar) tp->rttvar -= (tp->rttvar - tp->mdev_max) >> 2; tp->rtt_seq = tp->snd_nxt; tp->mdev_max = tcp_rto_min(sk); } } else { /* no previous measure. */ tp->srtt = m << 3; /* take the measured time to be rtt */ tp->mdev = m << 1; /* make sure rto = 3*rtt */ tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk)); tp->rtt_seq = tp->snd_nxt; } }
这个函数位于net/ipv4/tcp_input.c内,需要重点讨论一下
第一次被调用时,649行的判断发现tp->srtt尚未生成(值为0),于是进入680行的else分支
1 2 3 4 5 6 7
649 if (tp->srtt != 0) {
...
680 } else {
...
682     tp->srtt = m << 3; /* take the measured time to be rtt */
683     tp->mdev = m << 1; /* make sure rto = 3*rtt */
...
此时
srtt = 8m,mdev = 2m
而后tcp_rtt_estimator再被调用时
1 2 3
649 if (tp->srtt != 0) {
650     m -= (tp->srtt >> 3); /* m is now error in rtt est */
651     tp->srtt += m; /* rtt = 7/8 rtt + 1/8 new */
此时
tp->srtt 中保存的是 8 倍的 SRTT,因此 m' = m - SRTT,8·SRTT' = 8·SRTT + m' = 8·SRTT + m - SRTT
两边同除以 8:SRTT' = 7/8·SRTT + 1/8·m,符合公式1
公式1证明完毕
往下看
1 2 3 4 5 6 7
652 if (m < 0) {
653     m = -m; /* m is now abs(error) */
654     m -= (tp->mdev >> 2); /* similar update on mdev */
665 } else {
666     m -= (tp->mdev >> 2); /* similar update on mdev */
667 }
668 tp->mdev += m; /* mdev = 3/4 mdev + 1/4 new */