深度之下的广度才是有效的

前言

关于超时重传，以前介绍过 TCP 三次握手过程中的 SYN 和 SYN/ACK 超时重传，简述如下：

客户端的 SYN 报文最大重传次数由 tcp_syn_retries 内核参数控制；
服务器端的 SYN/ACK 报文最大重传次数由 tcp_synack_retries 内核参数控制。

那么对于 TCP 数据传输过程中的超时重传，它的重传次数又是什么样的，本篇简单说明一下。

TCP 内核参数

在 Linux 中，实际有两个 TCP 内核参数是有关于数据传输的超时重传，分别是 tcp_retries1 和 tcp_retries2，说明如下：

# sysctl -a | grep tcp_retries
net.ipv4.tcp_retries1 = 3
net.ipv4.tcp_retries2 = 15
#

tcp_retries1 (integer; default: 3; since Linux 2.2)
      The number of times TCP will attempt to retransmit a
      packet on an established connection normally, without the
      extra effort of getting the network layers involved.  Once
      we exceed this number of retransmits, we first have the
      network layer update the route if possible before each new
      retransmit.  The default is the RFC specified minimum of 3.

tcp_retries2 (integer; default: 15; since Linux 2.2)
      The maximum number of times a TCP packet is retransmitted
      in established state before giving up.  The default value
      is 15, which corresponds to a duration of approximately
      between 13 to 30 minutes, depending on the retransmission
      timeout.  The RFC 1122 specified minimum limit of 100
      seconds is typically deemed too short.

tcp_retries1，默认值为 3 ，表示尝试重新传输数据包的次数，如果超过了这个重传次数，会让网络层在每次新的重传之前更新路由。

tcp_retries2，默认值为 15 ，表示 TCP 数据段可以重传的最大次数，如果超过了这个重传次数，则会直接关闭 TCP 连接。

但实际上来说，真正的超时判断是基于时间的，而不是简单地计算重传次数，也就是说并不是一定要超过 3 或 15 次才采取进一步措施。

源码中的相关定义，参考如下。

#define TCP_RETR1  3  /*         * This is how many retries it does before it         * tries to figure out if the gateway is         * down. Minimal RFC value is 3; it corresponds         * to ~3sec-8min depending on RTO.         */#define TCP_RETR2  15  /*         * This should take at least         * 90 minutes to time out.         * RFC1122 says that the limit is 100 sec.         * 15 is ~13-30min depending on RTO.         */

TCP 数据段超时重传

TCP 在重传定时器超时后，会调用函数 tcp_retransmit_timer() 进行重传，在重传之前需要判断重传的次数，具体判断主要在函数 tcp_write_timeout() 中进行，该函数主要分为两部分，分别处理 SYN_SENT/SYN_RECV 状态，以及其他状态，其中后者涉及到 ipv4.sysctl_tcp_retries1 和 ipv4.sysctl_tcp_retries2 的处理。

/* A write timeout has occurred. Process the after effects. */static int tcp_write_timeout(struct sock *sk){  struct inet_connection_sock *icsk = inet_csk(sk);  struct tcp_sock *tp = tcp_sk(sk);  struct net *net = sock_net(sk);  bool expired, do_reset;  int retry_until;  if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {    if (icsk->icsk_retransmits) {      dst_negative_advice(sk);    } else {      sk_rethink_txhash(sk);    }    retry_until = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries;    expired = icsk->icsk_retransmits >= retry_until;  } else {    if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1, 0)) {      /* Black hole detection */      tcp_mtu_probing(icsk, sk);      dst_negative_advice(sk);    } else {      sk_rethink_txhash(sk);    }    retry_until = net->ipv4.sysctl_tcp_retries2;    if (sock_flag(sk, SOCK_DEAD)) {      const bool alive = icsk->icsk_rto < TCP_RTO_MAX;      retry_until = tcp_orphan_retries(sk, alive);      do_reset = alive ||        !retransmits_timed_out(sk, retry_until, 0);      if (tcp_out_of_resources(sk, do_reset))        return 1;    }    expired = retransmits_timed_out(sk, retry_until,            icsk->icsk_user_timeout);  }  tcp_fastopen_active_detect_blackhole(sk, expired);  if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RTO_CB_FLAG))    tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RTO_CB,          icsk->icsk_retransmits,          icsk->icsk_rto, (int)expired);  if (expired) {    /* Has it gone just too far? */    tcp_write_err(sk);    return 1;  }  return 0;}

tcp_retries1

涉及 tcp_retries1 的代码部分，retransmits_timed_out 函数检查重传是否已经超时，参数 net->ipv4.sysctl_tcp_retries1 是系统设置的第一阶段重试次数，函数会据此换算出一个超时时间，如果重传已持续的时间达到或超过该超时时间，函数返回 true，然后启动 MTU 探测，更新路由缓存或触发其他优化，用以避免由于路由选路变化带来的问题。

    if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1, 0)) {      /* Black hole detection */      tcp_mtu_probing(icsk, sk);      dst_negative_advice(sk);    } else {      sk_rethink_txhash(sk);    }

tcp_retries2

涉及 tcp_retries2 的代码部分，设置重试次数上限为系统配置的 net->ipv4.sysctl_tcp_retries2 值，同样是调用 retransmits_timed_out 函数检查重传是否已经超时，如果重传超时则放弃连接。

    retry_until = net->ipv4.sysctl_tcp_retries2;    ...    expired = retransmits_timed_out(sk, retry_until,            icsk->icsk_user_timeout);  if (expired) {    /* Has it gone just too far? */    tcp_write_err(sk);    return 1;  }

retransmits_timed_out

可以看到 tcp_retries1 和 tcp_retries2 的处理均调用了 retransmits_timed_out 函数，分别以 net->ipv4.sysctl_tcp_retries1 和 net->ipv4.sysctl_tcp_retries2 的值作为 boundary，而在 timeout 值为 0 的情况下，则调用 tcp_model_timeout 计算出一个 timeout 值，最终计算当前时间与 retrans_stamp 所记录的起始时间戳的差值，如果这个差值大于或等于计算出的超时时间，则认为已超时。

/** *  retransmits_timed_out() - returns true if this connection has timed out *  @sk:       The current socket *  @boundary: max number of retransmissions *  @timeout:  A custom timeout value. *             If set to 0 the default timeout is calculated and used. *             Using TCP_RTO_MIN and the number of unsuccessful retransmits. * * The default "timeout" value this function can calculate and use * is equivalent to the timeout of a TCP Connection * after "boundary" unsuccessful, exponentially backed-off * retransmissions with an initial RTO of TCP_RTO_MIN. */static bool retransmits_timed_out(struct sock *sk,          unsigned int boundary,          unsigned int timeout){  unsigned int start_ts;  if (!inet_csk(sk)->icsk_retransmits)    return false;  start_ts = tcp_sk(sk)->retrans_stamp;  if (likely(timeout == 0)) {    unsigned int rto_base = TCP_RTO_MIN;    if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))      rto_base = tcp_timeout_init(sk);    timeout = tcp_model_timeout(sk, boundary, rto_base);  }  return (s32)(tcp_time_stamp(tcp_sk(sk)) - start_ts - timeout) >= 0;}

static unsigned int tcp_model_timeout(struct sock *sk,              unsigned int boundary,              unsigned int rto_base){  unsigned int linear_backoff_thresh, timeout;  linear_backoff_thresh = ilog2(TCP_RTO_MAX / rto_base);  if (boundary <= linear_backoff_thresh)    timeout = ((2 << boundary) - 1) * rto_base;  else    timeout = ((2 << linear_backoff_thresh) - 1) * rto_base +      (boundary - linear_backoff_thresh) * TCP_RTO_MAX;  return jiffies_to_msecs(timeout);}

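可见在 rto_base = TCP_RTO_MIN（也就是 200ms）的情况下，linear_backoff_thresh = ilog2(120000 / 200) = 9：tcp_retries1 值 3 不超过该阈值，超时时间为 ((2 << 3) - 1) × 200ms = 3 秒；而 tcp_retries2 值 15 超过该阈值，超时时间为 ((2 << 9) - 1) × 200ms + (15 - 9) × 120 秒 = 204.6 秒 + 720 秒 = 924.6 秒。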

重传次数	RTO(ms)	Timeout 超时时间
1	200	0.2 秒
2	400	0.6 秒
3	800	1.4 秒
4	1600	3.0 秒
5	3200	6.2 秒
6	6400	12.6 秒
7	12800	25.4 秒
8	25600	51.0 秒
9	51200	102.2 秒
10	102400	204.6 秒
11	120000	324.6 秒
12	120000	444.6 秒
13	120000	564.6 秒
14	120000	684.6 秒
15	120000	804.6 秒
16	120000	924.6 秒

那么根据实际 RTT 的大小不同，RTO 值也会动态变化，在 tcp_retries1 默认值 3 和 tcp_retries2 值 10 的情况下：

1. 如果 RTT 比较小，那么 RTO 值就为最小值的限制 200ms，所以表现出来的现象就是重传次数与设置值一致，例如默认值下重传了 3 次或 15 次（tcp_retries2 为 10 时则是 10 次），最终超过了 timeout 值，进行 MTU 探测和更新路由缓存，或者放弃连接。

2. 如果 RTT 比较大，那么 RTO 值也会相对较大，如果是 800 ms，重传了 2 次后进行 MTU 探测和更新路由缓存，而当重传了 8 次就放弃连接。

由于绝大多数场景下 RTT/RTO 的值并不会过大，对于像 tcp_retries2 超时放弃连接这样的情况，从重传次数上来看，实际与设置值并没有太大区别。

重传次数	RTO(ms)	Timeout 超时
1	800	0.8 秒
2	1600	2.4 秒
3	3200	5.6 秒
4	6400	12.0 秒
5	12800	24.8 秒
6	25600	50.4 秒
7	51200	101.6 秒
8	102400	204.0 秒
9	120000	324.0 秒
10	120000	444.0 秒

综上所述，Linux 并不是直接用 tcp_retries1 和 tcp_retries2 的值来限制重传次数的，而是用计算得到的一个 timeout 值来进行判断，与 RTT 和 RTO 相关。

实验测试

超时重传

测试脚本如下，模拟服务器端场景

# cat tcp_rto_000.pkt 0   socket(..., SOCK_STREAM, IPPROTO_TCP) = 3+0  setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0+0  bind(3, ..., ...) = 0+0  listen(3, 1) = 0+0 < S 0:0(0) win 10000 <mss 1000>+0 > S. 0:0(0) ack 1 <...>+0.01 < . 1:1(0) ack 1 win 10000+0 accept(3, ..., ...) = 4+0.01 write(4, ..., 1000) = 1000+0 `sleep 10000`#

tcpdump 捕获数据包示意如下，15 次超时重传。

# packetdrill tcp_rto_000.pkt## tcpdump -i any -nn port 8080tcpdump: data link type LINUX_SLL2tcpdump: verbose output suppressed, use -v[v]... for full protocol decodelistening on any, link-type LINUX_SLL2 (Linux cooked v2), snapshot length 262144 bytes22:45:53.082336 tun0  In  IP 192.0.2.1.40917 > 192.168.30.42.8080: Flags [S], seq 0, win 10000, options [mss 1000], length 022:45:53.082377 tun0  Out IP 192.168.30.42.8080 > 192.0.2.1.40917: Flags [S.], seq 1490837262, ack 1, win 64240, options [mss 1460], length 022:45:53.092458 tun0  In  IP 192.0.2.1.40917 > 192.168.30.42.8080: Flags [.], ack 1, win 10000, length 022:45:53.102585 tun0  Out IP 192.168.30.42.8080 > 192.0.2.1.40917: Flags [P.], seq 1:1001, ack 1, win 64240, length 1000: HTTP22:45:53.318048 tun0  Out IP 192.168.30.42.8080 > 192.0.2.1.40917: Flags [P.], seq 1:1001, ack 1, win 64240, length 1000: HTTP22:45:53.758056 tun0  Out IP 192.168.30.42.8080 > 192.0.2.1.40917: Flags [P.], seq 1:1001, ack 1, win 64240, length 1000: HTTP22:45:54.622040 tun0  Out IP 192.168.30.42.8080 > 192.0.2.1.40917: Flags [P.], seq 1:1001, ack 1, win 64240, length 1000: HTTP22:45:56.350034 tun0  Out IP 192.168.30.42.8080 > 192.0.2.1.40917: Flags [P.], seq 1:1001, ack 1, win 64240, length 1000: HTTP22:45:59.966044 tun0  Out IP 192.168.30.42.8080 > 192.0.2.1.40917: Flags [P.], seq 1:1001, ack 1, win 64240, length 1000: HTTP22:46:06.878048 tun0  Out IP 192.168.30.42.8080 > 192.0.2.1.40917: Flags [P.], seq 1:1001, ack 1, win 64240, length 1000: HTTP22:46:20.702046 tun0  Out IP 192.168.30.42.8080 > 192.0.2.1.40917: Flags [P.], seq 1:1001, ack 1, win 64240, length 1000: HTTP22:46:49.886076 tun0  Out IP 192.168.30.42.8080 > 192.0.2.1.40917: Flags [P.], seq 1:1001, ack 1, win 64240, length 1000: HTTP22:47:45.182052 tun0  Out IP 192.168.30.42.8080 > 192.0.2.1.40917: Flags [P.], seq 1:1001, ack 1, win 64240, length 1000: HTTP22:49:35.774056 tun0  Out IP 192.168.30.42.8080 > 192.0.2.1.40917: Flags [P.], seq 1:1001, ack 1, win 64240, length 
1000: HTTP22:51:36.606061 tun0  Out IP 192.168.30.42.8080 > 192.0.2.1.40917: Flags [P.], seq 1:1001, ack 1, win 64240, length 1000: HTTP22:53:37.438094 tun0  Out IP 192.168.30.42.8080 > 192.0.2.1.40917: Flags [P.], seq 1:1001, ack 1, win 64240, length 1000: HTTP22:55:38.270054 tun0  Out IP 192.168.30.42.8080 > 192.0.2.1.40917: Flags [P.], seq 1:1001, ack 1, win 64240, length 1000: HTTP22:57:39.102074 tun0  Out IP 192.168.30.42.8080 > 192.0.2.1.40917: Flags [P.], seq 1:1001, ack 1, win 64240, length 1000: HTTP22:59:39.934113 tun0  Out IP 192.168.30.42.8080 > 192.0.2.1.40917: Flags [P.], seq 1:1001, ack 1, win 64240, length 1000: HTTP#

ss socket 状态示意如下。

# ss -anto | grep 8080# ss -anto | grep 8080LISTEN    0      1      192.168.222.37:8080          0.0.0.0:*                             ESTAB     0      1000   192.168.222.37:8080        192.0.2.1:49435 timer:(on,3.200ms,4)       # ss -anto | grep 8080LISTEN    0      1      192.168.222.37:8080          0.0.0.0:*                             ESTAB     0      1000   192.168.222.37:8080        192.0.2.1:49435 timer:(on,1.900ms,5)    # ss -anto | grep 8080LISTEN    0      1      192.168.222.37:8080          0.0.0.0:*                             ESTAB     0      1000   192.168.222.37:8080        192.0.2.1:49435 timer:(on,1.908ms,12)   # ss -anto | grep 8080LISTEN    0      1      192.168.222.37:8080          0.0.0.0:*                             ESTAB     0      1000   192.168.222.37:8080        192.0.2.1:49435 timer:(on,1min57sec,13) # ss -anto | grep 8080LISTEN    0      1      192.168.222.37:8080          0.0.0.0:*                             ESTAB     0      1000   192.168.222.37:8080        192.0.2.1:49435 timer:(on,1min32sec,14)   # ss -anto | grep 8080LISTEN    0      1      192.168.222.37:8080          0.0.0.0:*                                                      # ss -anto | grep 8080LISTEN    0      1      192.168.222.37:8080          0.0.0.0:*                              #

Wireshark 数据包跟踪文件示意如下，15 次超时重传。

tcp_retries1

通过修改 net.ipv4.tcp_mtu_probing 值为 1，开启 MTU 探测功能。

# sysctl -w net.ipv4.tcp_mtu_probing=1
net.ipv4.tcp_mtu_probing = 1

仍基于上个实验中的 packetdrill 脚本进行测试，tcpdump 捕获数据包示意如下，在 3 次超时重传后可观察到 MSS 变化，动态调整逐步降低。

（此处原为 tcpdump 抓包输出的代码块，其内容在文本提取过程中丢失，被错误地替换成了重复的 TCP_RETR1/TCP_RETR2 宏定义，需参考原文补全。）

Wireshark 数据包跟踪文件示意如下，仍然是 15 次超时重传，但是 TCP Len 长度从 1000、500、250 一直降到最后 48 字节。

tcp_retries2

通过修改 net.ipv4.tcp_retries2 值为 10，调整 tcp_retries2 超时重传次数。

# sysctl -w net.ipv4.tcp_retries2=10
net.ipv4.tcp_retries2 = 10

测试脚本修改如下，模拟服务器端场景，增加模拟网卡延迟，从而增大 RTT，最终使得初始 RTO 增大。

（此处原为修改后的 packetdrill 测试脚本代码块，其内容在文本提取过程中丢失，被错误地替换成了重复的 TCP_RETR1/TCP_RETR2 宏定义，需参考原文补全。）

tcpdump 捕获数据包示意如下，只进行了 8 次超时重传，并不是 tcp_retries2 所设置的值 10 次。

（此处原为 tcpdump 抓包输出的代码块，其内容在文本提取过程中丢失，被错误地替换成了重复的 TCP_RETR1/TCP_RETR2 宏定义，需参考原文补全。）

ss socket 状态示意如下。

（此处原为 ss 命令输出的代码块，其内容在文本提取过程中丢失，被错误地替换成了重复的 TCP_RETR1/TCP_RETR2 宏定义，需参考原文补全。）