From 846998ae87a80b0fd45b4cf5cf001a159d746f27 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Thu, 4 Aug 2005 19:52:01 -0700
Subject: [PATCH] tcp: fix TSO sizing bugs

MSS changes can be lost since we preemptively initialize the tso_segs count
for an SKB before we %100 commit to sending it out.

So, by the time we send it out, the tso_size information can be stale due
to PMTU events.  This mucks up all of the logic in our send engine, and can
even result in the BUG() triggering in tcp_tso_should_defer().

Another problem we have is that we're storing the tp->mss_cache, not the
SACK block normalized MSS, as the tso_size.  That's wrong too.

Signed-off-by: David S. Miller <davem@davemloft.net>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 net/ipv4/tcp_output.c | 56 +++++++++++++++++++++++++--------------------------
 1 file changed, 28 insertions(+), 28 deletions(-)

(limited to 'net/ipv4/tcp_output.c')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index e3f8ea1bfa9c..e118b4b5b326 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -403,11 +403,9 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
 		sk->sk_send_head = skb;
 }
 
-static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb)
+static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now)
 {
-	struct tcp_sock *tp = tcp_sk(sk);
-
-	if (skb->len <= tp->mss_cache ||
+	if (skb->len <= mss_now ||
 	    !(sk->sk_route_caps & NETIF_F_TSO)) {
 		/* Avoid the costly divide in the normal
 		 * non-TSO case.
@@ -417,10 +415,10 @@ static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb)
 	} else {
 		unsigned int factor;
 
-		factor = skb->len + (tp->mss_cache - 1);
-		factor /= tp->mss_cache;
+		factor = skb->len + (mss_now - 1);
+		factor /= mss_now;
 		skb_shinfo(skb)->tso_segs = factor;
-		skb_shinfo(skb)->tso_size = tp->mss_cache;
+		skb_shinfo(skb)->tso_size = mss_now;
 	}
 }
 
@@ -429,7 +427,7 @@ static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb)
  * packet to the list.  This won't be called frequently, I hope. 
  * Remember, these are still headerless SKBs at this point.
  */
-static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
+static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss_now)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *buff;
@@ -492,8 +490,8 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
 	}
 
 	/* Fix up tso_factor for both original and new SKB.  */
-	tcp_set_skb_tso_segs(sk, skb);
-	tcp_set_skb_tso_segs(sk, buff);
+	tcp_set_skb_tso_segs(sk, skb, mss_now);
+	tcp_set_skb_tso_segs(sk, buff, mss_now);
 
 	if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) {
 		tp->lost_out += tcp_skb_pcount(skb);
@@ -569,7 +567,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
 	 * factor and mss.
 	 */
 	if (tcp_skb_pcount(skb) > 1)
-		tcp_set_skb_tso_segs(sk, skb);
+		tcp_set_skb_tso_segs(sk, skb, tcp_current_mss(sk, 1));
 
 	return 0;
 }
@@ -734,12 +732,14 @@ static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, struct sk_buff *sk
 /* This must be invoked the first time we consider transmitting
  * SKB onto the wire.
  */
-static inline int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb)
+static inline int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now)
 {
 	int tso_segs = tcp_skb_pcount(skb);
 
-	if (!tso_segs) {
-		tcp_set_skb_tso_segs(sk, skb);
+	if (!tso_segs ||
+	    (tso_segs > 1 &&
+	     skb_shinfo(skb)->tso_size != mss_now)) {
+		tcp_set_skb_tso_segs(sk, skb, mss_now);
 		tso_segs = tcp_skb_pcount(skb);
 	}
 	return tso_segs;
@@ -817,7 +817,7 @@ static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb,
 	struct tcp_sock *tp = tcp_sk(sk);
 	unsigned int cwnd_quota;
 
-	tcp_init_tso_segs(sk, skb);
+	tcp_init_tso_segs(sk, skb, cur_mss);
 
 	if (!tcp_nagle_test(tp, skb, cur_mss, nonagle))
 		return 0;
@@ -854,7 +854,7 @@ int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp)
  * know that all the data is in scatter-gather pages, and that the
  * packet has never been sent out before (and thus is not cloned).
  */
-static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len)
+static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, unsigned int mss_now)
 {
 	struct sk_buff *buff;
 	int nlen = skb->len - len;
@@ -887,8 +887,8 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len)
 	skb_split(skb, buff, len);
 
 	/* Fix up tso_factor for both original and new SKB.  */
-	tcp_set_skb_tso_segs(sk, skb);
-	tcp_set_skb_tso_segs(sk, buff);
+	tcp_set_skb_tso_segs(sk, skb, mss_now);
+	tcp_set_skb_tso_segs(sk, buff, mss_now);
 
 	/* Link BUFF into the send queue. */
 	skb_header_release(buff);
@@ -976,7 +976,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
 	if (unlikely(!skb))
 		return 0;
 
-	tso_segs = tcp_init_tso_segs(sk, skb);
+	tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
 	cwnd_quota = tcp_cwnd_test(tp, skb);
 	if (unlikely(!cwnd_quota))
 		goto out;
@@ -1006,11 +1006,11 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
 					limit = skb->len - trim;
 			}
 			if (skb->len > limit) {
-				if (tso_fragment(sk, skb, limit))
+				if (tso_fragment(sk, skb, limit, mss_now))
 					break;
 			}
 		} else if (unlikely(skb->len > mss_now)) {
-			if (unlikely(tcp_fragment(sk, skb,  mss_now)))
+			if (unlikely(tcp_fragment(sk, skb,  mss_now, mss_now)))
 				break;
 		}
 
@@ -1039,7 +1039,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
 		skb = sk->sk_send_head;
 		if (!skb)
 			break;
-		tso_segs = tcp_init_tso_segs(sk, skb);
+		tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
 	}
 
 	if (likely(sent_pkts)) {
@@ -1076,7 +1076,7 @@ void tcp_push_one(struct sock *sk, unsigned int mss_now)
 
 	BUG_ON(!skb || skb->len < mss_now);
 
-	tso_segs = tcp_init_tso_segs(sk, skb);
+	tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
 	cwnd_quota = tcp_snd_test(sk, skb, mss_now, TCP_NAGLE_PUSH);
 
 	if (likely(cwnd_quota)) {
@@ -1093,11 +1093,11 @@ void tcp_push_one(struct sock *sk, unsigned int mss_now)
 					limit = skb->len - trim;
 			}
 			if (skb->len > limit) {
-				if (unlikely(tso_fragment(sk, skb, limit)))
+				if (unlikely(tso_fragment(sk, skb, limit, mss_now)))
 					return;
 			}
 		} else if (unlikely(skb->len > mss_now)) {
-			if (unlikely(tcp_fragment(sk, skb, mss_now)))
+			if (unlikely(tcp_fragment(sk, skb, mss_now, mss_now)))
 				return;
 		}
 
@@ -1388,7 +1388,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 		int old_factor = tcp_skb_pcount(skb);
 		int new_factor;
 
-		if (tcp_fragment(sk, skb, cur_mss))
+		if (tcp_fragment(sk, skb, cur_mss, cur_mss))
 			return -ENOMEM; /* We'll try again later. */
 
 		/* New SKB created, account for it. */
@@ -1991,7 +1991,7 @@ int tcp_write_wakeup(struct sock *sk)
 			    skb->len > mss) {
 				seg_size = min(seg_size, mss);
 				TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
-				if (tcp_fragment(sk, skb, seg_size))
+				if (tcp_fragment(sk, skb, seg_size, mss))
 					return -1;
 				/* SWS override triggered forced fragmentation.
 				 * Disable TSO, the connection is too sick. */
@@ -2000,7 +2000,7 @@ int tcp_write_wakeup(struct sock *sk)
 					sk->sk_route_caps &= ~NETIF_F_TSO;
 				}
 			} else if (!tcp_skb_pcount(skb))
-				tcp_set_skb_tso_segs(sk, skb);
+				tcp_set_skb_tso_segs(sk, skb, mss);
 
 			TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
 			TCP_SKB_CB(skb)->when = tcp_time_stamp;
-- 
cgit v1.2.1


From b68e9f857271189bd7a59b74c99890de9195b0e1 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Thu, 4 Aug 2005 19:52:02 -0700
Subject: [PATCH] tcp: fix TSO cwnd caching bug

tcp_write_xmit caches the cwnd value indirectly in cwnd_quota.  When
tcp_transmit_skb reduces the cwnd because of tcp_enter_cwr, the cached
value becomes invalid.

This patch ensures that the cwnd value is always reread after each
tcp_transmit_skb call.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 net/ipv4/tcp_output.c | 34 +++++++++-------------------------
 1 file changed, 9 insertions(+), 25 deletions(-)

(limited to 'net/ipv4/tcp_output.c')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index e118b4b5b326..7d076f0db100 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -972,19 +972,18 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
 	if (unlikely(sk->sk_state == TCP_CLOSE))
 		return 0;
 
-	skb = sk->sk_send_head;
-	if (unlikely(!skb))
-		return 0;
-
-	tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
-	cwnd_quota = tcp_cwnd_test(tp, skb);
-	if (unlikely(!cwnd_quota))
-		goto out;
-
 	sent_pkts = 0;
-	while (likely(tcp_snd_wnd_test(tp, skb, mss_now))) {
+	while ((skb = sk->sk_send_head)) {
+		tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
 		BUG_ON(!tso_segs);
 
+		cwnd_quota = tcp_cwnd_test(tp, skb);
+		if (!cwnd_quota)
+			break;
+
+		if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now)))
+			break;
+
 		if (tso_segs == 1) {
 			if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
 						     (tcp_skb_is_last(sk, skb) ?
@@ -1026,27 +1025,12 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
 
 		tcp_minshall_update(tp, mss_now, skb);
 		sent_pkts++;
-
-		/* Do not optimize this to use tso_segs. If we chopped up
-		 * the packet above, tso_segs will no longer be valid.
-		 */
-		cwnd_quota -= tcp_skb_pcount(skb);
-
-		BUG_ON(cwnd_quota < 0);
-		if (!cwnd_quota)
-			break;
-
-		skb = sk->sk_send_head;
-		if (!skb)
-			break;
-		tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
 	}
 
 	if (likely(sent_pkts)) {
 		tcp_cwnd_validate(sk, tp);
 		return 0;
 	}
-out:
 	return !tp->packets_out && sk->sk_send_head;
 }
 
-- 
cgit v1.2.1


From b5da623ae9be680ea59f268eeb339f0acb2d88c4 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Wed, 10 Aug 2005 18:32:36 -0700
Subject: [TCP]: Adjust {p,f}ackets_out correctly in tcp_retransmit_skb()

Well I've only found one potential cause for the assertion
failure in tcp_mark_head_lost.  First of all, this can only
occur if cnt > 1 since tp->packets_out is never zero here.
If it did hit zero we'd have much bigger problems.

So cnt is equal to fackets_out - reordering.  Normally
fackets_out is less than packets_out.  The only reason
I've found that might cause fackets_out to exceed packets_out
is if tcp_fragment is called from tcp_retransmit_skb with a
TSO skb and the current MSS is greater than the MSS stored
in the TSO skb.  This might occur as the result of an expiring
dst entry.

In that case, packets_out may decrease (line 1380-1381 in
tcp_output.c).  However, fackets_out is unchanged which means
that it may in fact exceed packets_out.

Previously tcp_retrans_try_collapse was the only place where
packets_out can go down and it takes care of this by decrementing
fackets_out.

So we should make sure that fackets_out is reduced by an appropriate
amount here as well.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

(limited to 'net/ipv4/tcp_output.c')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 7d076f0db100..3ed6fc15815b 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1370,15 +1370,21 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 
 	if (skb->len > cur_mss) {
 		int old_factor = tcp_skb_pcount(skb);
-		int new_factor;
+		int diff;
 
 		if (tcp_fragment(sk, skb, cur_mss, cur_mss))
 			return -ENOMEM; /* We'll try again later. */
 
 		/* New SKB created, account for it. */
-		new_factor = tcp_skb_pcount(skb);
-		tp->packets_out -= old_factor - new_factor;
-		tp->packets_out += tcp_skb_pcount(skb->next);
+		diff = old_factor - tcp_skb_pcount(skb) -
+		       tcp_skb_pcount(skb->next);
+		tp->packets_out -= diff;
+
+		if (diff > 0) {
+			tp->fackets_out -= diff;
+			if ((int)tp->fackets_out < 0)
+				tp->fackets_out = 0;
+		}
 	}
 
 	/* Collapse two adjacent packets if worthwhile and we can. */
-- 
cgit v1.2.1