Dave Jones 2012-10-02 15:47:25 -04:00
parent 5b27c793de
commit 9e634cfc4b
8 changed files with 12 additions and 743 deletions

File: af_netlink-credentials-cve-2012-3520.patch

@ -1,114 +0,0 @@
Subject: [PATCH] af_netlink: force credentials passing [CVE-2012-3520]
From: Eric Dumazet <eric.dumazet@gmail.com>
To: David Miller <davem@davemloft.net>
Cc: netdev <netdev@vger.kernel.org>, Petr Matousek <pmatouse@redhat.com>,
Florian Weimer <fweimer@redhat.com>,
Pablo Neira Ayuso <pablo@netfilter.org>
Content-Type: text/plain; charset="UTF-8"
Date: Tue, 21 Aug 2012 18:21:17 +0200
Message-ID: <1345566077.5158.530.camel@edumazet-glaptop>
Mime-Version: 1.0
Content-Transfer-Encoding: 7bit
Sender: netdev-owner@vger.kernel.org
Precedence: bulk
List-ID: <netdev.vger.kernel.org>
X-Mailing-List: netdev@vger.kernel.org
X-RedHat-Spam-Score: -6.999 (BAYES_00,DKIM_ADSP_CUSTOM_MED,DKIM_SIGNED,FREEMAIL_FROM,RCVD_IN_DNSWL_HI,RP_MATCHES_RCVD,T_DKIM_INVALID)
X-Scanned-By: MIMEDefang 2.68 on 10.5.11.22
X-Scanned-By: MIMEDefang 2.68 on 10.5.110.16
Status: RO
Content-Length: 3042
Lines: 91
From: Eric Dumazet <edumazet@google.com>
Pablo Neira Ayuso discovered that avahi and
potentially NetworkManager accept spoofed Netlink messages because of a
kernel bug. The kernel passes all-zero SCM_CREDENTIALS ancillary data
to the receiver if the sender did not provide such data, instead of not
including any such data at all or including the correct data from the
peer (as is the case with AF_UNIX).
This bug was introduced in commit 16e572626961
(af_unix: dont send SCM_CREDENTIALS by default)
This patch forces passing credentials for netlink, as
before the regression.
Another fix would be to not add SCM_CREDENTIALS in
netlink messages if not provided by the sender, but it
might break some programs.
With help from Florian Weimer & Petr Matousek
This issue is designated as CVE-2012-3520
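For readers unfamiliar with the mechanism: a receiver that enables
SO_PASSCRED gets the sender's credentials as SCM_CREDENTIALS ancillary
data. The sketch below is not part of the patch; it is a minimal
userspace illustration of the kind of check a daemon performs on a
netlink socket, and of why an all-zero struct ucred (pid 0, uid 0)
handed out by the buggy kernel can be mistaken for a privileged sender.

/* Minimal sketch: receive one rtnetlink message and inspect the
 * SCM_CREDENTIALS ancillary data attached to it. */
#define _GNU_SOURCE             /* struct ucred, SCM_CREDENTIALS */
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>

int main(void)
{
	int one = 1;
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
	struct sockaddr_nl addr = { .nl_family = AF_NETLINK,
				    .nl_groups = RTMGRP_LINK };
	char buf[4096], cbuf[CMSG_SPACE(sizeof(struct ucred))];
	struct iovec iov = { buf, sizeof(buf) };
	struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1,
			      .msg_control = cbuf,
			      .msg_controllen = sizeof(cbuf) };
	struct cmsghdr *cmsg;

	setsockopt(fd, SOL_SOCKET, SO_PASSCRED, &one, sizeof(one));
	bind(fd, (struct sockaddr *)&addr, sizeof(addr));
	if (recvmsg(fd, &msg, 0) < 0)
		return 1;

	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
		if (cmsg->cmsg_level == SOL_SOCKET &&
		    cmsg->cmsg_type == SCM_CREDENTIALS) {
			struct ucred cred;

			memcpy(&cred, CMSG_DATA(cmsg), sizeof(cred));
			/* pid 0 / uid 0 is what the pre-fix kernel reported
			 * when the sender supplied no credentials */
			printf("sender pid=%d uid=%u gid=%u\n",
			       cred.pid, cred.uid, cred.gid);
		}
	}
	return 0;
}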
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Petr Matousek <pmatouse@redhat.com>
Cc: Florian Weimer <fweimer@redhat.com>
Cc: Pablo Neira Ayuso <pablo@netfilter.org>
---
include/net/scm.h | 4 +++-
net/netlink/af_netlink.c | 2 +-
net/unix/af_unix.c | 4 ++--
3 files changed, 6 insertions(+), 4 deletions(-)
diff --git a/include/net/scm.h b/include/net/scm.h
index 079d788..7dc0854 100644
--- a/include/net/scm.h
+++ b/include/net/scm.h
@@ -70,9 +70,11 @@ static __inline__ void scm_destroy(struct scm_cookie *scm)
}
static __inline__ int scm_send(struct socket *sock, struct msghdr *msg,
- struct scm_cookie *scm)
+ struct scm_cookie *scm, bool forcecreds)
{
memset(scm, 0, sizeof(*scm));
+ if (forcecreds)
+ scm_set_cred(scm, task_tgid(current), current_cred());
unix_get_peersec_dgram(sock, scm);
if (msg->msg_controllen <= 0)
return 0;
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 5463969..1445d73 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -1362,7 +1362,7 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock,
if (NULL == siocb->scm)
siocb->scm = &scm;
- err = scm_send(sock, msg, siocb->scm);
+ err = scm_send(sock, msg, siocb->scm, true);
if (err < 0)
return err;
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index e4768c1..c5ee4ff 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1450,7 +1450,7 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock,
if (NULL == siocb->scm)
siocb->scm = &tmp_scm;
wait_for_unix_gc();
- err = scm_send(sock, msg, siocb->scm);
+ err = scm_send(sock, msg, siocb->scm, false);
if (err < 0)
return err;
@@ -1619,7 +1619,7 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
if (NULL == siocb->scm)
siocb->scm = &tmp_scm;
wait_for_unix_gc();
- err = scm_send(sock, msg, siocb->scm);
+ err = scm_send(sock, msg, siocb->scm, false);
if (err < 0)
return err;
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html

File: drm-radeon-force-dma32-to-fix-regression-rs4xx-rs6xx.patch

@ -1,35 +0,0 @@
From 4a2b6662c3632176b4fdf012243dd3751367bf1f Mon Sep 17 00:00:00 2001
From: Jerome Glisse <jglisse@redhat.com>
Date: Tue, 28 Aug 2012 16:50:22 -0400
Subject: [PATCH] drm/radeon: force dma32 to fix regression rs4xx,rs6xx,rs740
It seems some of those IGPs dislike non-dma32 pages despite what the
documentation says. This fixes a regression introduced when we started
allowing non-dma32 pages. It seems to affect only some revisions of those
IGP chips; since we don't know which ones, force dma32 for all of them.
https://bugzilla.redhat.com/show_bug.cgi?id=785375
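For context, the need_dma32 flag that this hunk forces on is consumed a
few lines further down in radeon_device_init() when the PCI DMA mask is
chosen. Roughly (paraphrased from that function, not part of this diff):

	dma_bits = rdev->need_dma32 ? 32 : 40;
	r = pci_set_dma_mask(rdev->pdev, DMA_BIT_MASK(dma_bits));
	if (r) {
		/* fall back to 32-bit DMA if the wider mask is rejected */
		rdev->need_dma32 = true;
		dma_bits = 32;
		printk(KERN_WARNING "radeon: No suitable DMA available.\n");
	}

so forcing need_dma32 simply keeps every GPU-visible allocation below the
4GB boundary on the affected IGPs.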
Signed-off-by: Jerome Glisse <jglisse@redhat.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
drivers/gpu/drm/radeon/radeon_device.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/radeon/radeon_device.c b/drivers/gpu/drm/radeon/radeon_device.c
index d2e2438..33da8bf 100644
--- a/drivers/gpu/drm/radeon/radeon_device.c
+++ b/drivers/gpu/drm/radeon/radeon_device.c
@@ -1051,7 +1051,7 @@ int radeon_device_init(struct radeon_device *rdev,
if (rdev->flags & RADEON_IS_AGP)
rdev->need_dma32 = true;
if ((rdev->flags & RADEON_IS_PCI) &&
- (rdev->family < CHIP_RS400))
+ (rdev->family <= CHIP_RS740))
rdev->need_dma32 = true;
dma_bits = rdev->need_dma32 ? 32 : 40;
--
1.7.11.4

File: kernel.spec

@ -54,7 +54,7 @@ Summary: The Linux kernel
# For non-released -rc kernels, this will be appended after the rcX and
# gitX tags, so a 3 here would become part of release "0.rcX.gitX.3"
#
-%global baserelease 3
+%global baserelease 1
%global fedora_build %{baserelease}
# base_sublevel is the kernel version we're starting with and patching
@ -66,7 +66,7 @@ Summary: The Linux kernel
%if 0%{?released_kernel}
# Do we have a -stable update to apply?
-%define stable_update 11
+%define stable_update 12
# Is it a -stable RC?
%define stable_rc 0
# Set rpm version accordingly
@ -638,7 +638,6 @@ Patch800: linux-2.6-crash-driver.patch
# DRM
#atch1700: drm-edid-try-harder-to-fix-up-broken-headers.patch
-Patch1701: drm-radeon-force-dma32-to-fix-regression-rs4xx-rs6xx.patch
# intel drm is all merged upstream
Patch1824: drm-intel-next.patch
@ -707,16 +706,6 @@ Patch22056: crypto-aesni-intel-fix-wrong-kfree-pointer.patch
#rhbz 714271
Patch22060: CPU-hotplug-cpusets-suspend-Dont-modify-cpusets-during.patch
-#rhbz 820039 843554
-Patch22061: rds-set-correct-msg_namelen.patch
-#rhbz 845558 844714
-Patch22070: net-Allow-driver-to-limit-number-of-GSO-segments-per-skb.patch
-Patch22071: sfc-Fix-maximum-number-of-TSO-segments-and-minimum-TX-queue-size.patch
-Patch22072: tcp-Apply-device-TSO-segment-limit-earlier.patch
-Patch22075: af_netlink-credentials-cve-2012-3520.patch
# END OF PATCH DEFINITIONS
%endif
@ -1283,7 +1272,6 @@ ApplyPatch linux-2.6-e1000-ich9-montevina.patch
# DRM core
#ApplyPatch drm-edid-try-harder-to-fix-up-broken-headers.patch
-ApplyPatch drm-radeon-force-dma32-to-fix-regression-rs4xx-rs6xx.patch
# Intel DRM
ApplyOptionalPatch drm-intel-next.patch
@ -1339,16 +1327,6 @@ ApplyPatch crypto-aesni-intel-fix-wrong-kfree-pointer.patch
#rhbz 714271
ApplyPatch CPU-hotplug-cpusets-suspend-Dont-modify-cpusets-during.patch
-#rhbz 820039 843554
-ApplyPatch rds-set-correct-msg_namelen.patch
-#rhbz 845558 844714
-ApplyPatch net-Allow-driver-to-limit-number-of-GSO-segments-per-skb.patch
-ApplyPatch sfc-Fix-maximum-number-of-TSO-segments-and-minimum-TX-queue-size.patch
-ApplyPatch tcp-Apply-device-TSO-segment-limit-earlier.patch
-ApplyPatch af_netlink-credentials-cve-2012-3520.patch
# END OF PATCH APPLICATIONS
%endif
@ -2049,6 +2027,15 @@ fi
# and build.
%changelog
+* Tue Oct 02 2012 Dave Jones <davej@redhat.com> 3.4.12-1
+- Linux v3.4.12
+ merged: drm-radeon-force-dma32-to-fix-regression-rs4xx-rs6xx.patch
+ merged: rds-set-correct-msg_namelen.patch
+ merged: net-Allow-driver-to-limit-number-of-GSO-segments-per-skb.patch
+ merged: sfc-Fix-maximum-number-of-TSO-segments-and-minimum-TX-queue-size.patch
+ merged: tcp-Apply-device-TSO-segment-limit-earlier.patch
+ merged: af_netlink-credentials-cve-2012-3520.patch
* Fri Sep 28 2012 Josh Boyer <jwboyer@redhat.com> - 3.4.11-3
- Split out kernel-tools-libs (rhbz 859943)

File: net-Allow-driver-to-limit-number-of-GSO-segments-per-skb.patch

@ -1,70 +0,0 @@
From 30b678d844af3305cda5953467005cebb5d7b687 Mon Sep 17 00:00:00 2001
From: Ben Hutchings <bhutchings@solarflare.com>
Date: Mon, 30 Jul 2012 15:57:00 +0000
Subject: [PATCH] net: Allow driver to limit number of GSO segments per skb
A peer (or local user) may cause TCP to use a nominal MSS of as little
as 88 (actual MSS of 76 with timestamps). Given that we have a
sufficiently prodigious local sender and the peer ACKs quickly enough,
it is nevertheless possible to grow the window for such a connection
to the point that we will try to send just under 64K at once. This
results in a single skb that expands to 861 segments.
In some drivers with TSO support, such an skb will require hundreds of
DMA descriptors; a substantial fraction of a TX ring or even more than
a full ring. The TX queue selected for the skb may stall and trigger
the TX watchdog repeatedly (since the problem skb will be retried
after the TX reset). This particularly affects sfc, for which the
issue is designated as CVE-2012-3412.
Therefore:
1. Add the field net_device::gso_max_segs holding the device-specific
limit.
2. In netif_skb_features(), if the number of segments is too high then
mask out GSO features to force fall back to software GSO.
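As an illustration of how a driver would use the new field (hypothetical
driver; foo_limit_tso(), FOO_TXQ_ENTRIES and FOO_DESCS_PER_SEG are
made-up names, not part of this patch), a TSO-capable NIC could cap
gso_max_segs so that one worst-case skb can never swallow more than a
quarter of its TX ring:

#include <linux/netdevice.h>

#define FOO_TXQ_ENTRIES   512	/* descriptors per TX ring (assumed) */
#define FOO_DESCS_PER_SEG 2	/* header + payload descriptor per segment */

static void foo_limit_tso(struct net_device *net_dev)
{
	/* at most 1/4 of the ring per skb: 512 / (4 * 2) = 64 segments */
	net_dev->gso_max_segs = FOO_TXQ_ENTRIES / (4 * FOO_DESCS_PER_SEG);
}

With that in place, the netif_skb_features() change in this patch masks
out the GSO feature bits whenever skb_shinfo(skb)->gso_segs exceeds the
limit, so oversized skbs are segmented in software instead of stalling
the ring.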
Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
include/linux/netdevice.h | 2 ++
net/core/dev.c | 4 ++++
2 files changed, 6 insertions(+), 0 deletions(-)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index eb06e58..a9db4f3 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1300,6 +1300,8 @@ struct net_device {
/* for setting kernel sock attribute on TCP connection setup */
#define GSO_MAX_SIZE 65536
unsigned int gso_max_size;
+#define GSO_MAX_SEGS 65535
+ u16 gso_max_segs;
#ifdef CONFIG_DCB
/* Data Center Bridging netlink ops */
diff --git a/net/core/dev.c b/net/core/dev.c
index 0cb3fe8..f91abf8 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2134,6 +2134,9 @@ netdev_features_t netif_skb_features(struct sk_buff *skb)
__be16 protocol = skb->protocol;
netdev_features_t features = skb->dev->features;
+ if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
+ features &= ~NETIF_F_GSO_MASK;
+
if (protocol == htons(ETH_P_8021Q)) {
struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
protocol = veh->h_vlan_encapsulated_proto;
@@ -5986,6 +5989,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
dev_net_set(dev, &init_net);
dev->gso_max_size = GSO_MAX_SIZE;
+ dev->gso_max_segs = GSO_MAX_SEGS;
INIT_LIST_HEAD(&dev->napi_list);
INIT_LIST_HEAD(&dev->unreg_list);
--
1.7.7.6

File: rds-set-correct-msg_namelen.patch

@ -1,219 +0,0 @@
From 06b6a1cf6e776426766298d055bb3991957d90a7 Mon Sep 17 00:00:00 2001
From: Weiping Pan <wpan@redhat.com>
Date: Mon, 23 Jul 2012 10:37:48 +0800
Subject: [PATCH] rds: set correct msg_namelen
Jay Fenlason (fenlason@redhat.com) found a bug: recvfrom() on an RDS
socket can return the contents of random kernel memory to userspace if it
is called with an address length larger than sizeof(struct sockaddr_in).
rds_recvmsg() also fails to set the addr_len parameter properly before
returning, but that is just a bug.
There are also a number of cases where recvfrom() can return an entirely bogus
address. Anything in rds_recvmsg() that returns a non-negative value but does
not go through the "sin = (struct sockaddr_in *)msg->msg_name;" code path
at the end of the while(1) loop will return up to 128 bytes of kernel memory
to userspace.
I wrote two test programs to reproduce this bug; you will see that in
rds_server, fromAddr is overwritten and the adjacent sock_fd is clobbered.
Yes, it is the programmer's fault to set msg_namelen incorrectly, but it is
better to make the kernel copy the real address length to user space in
such cases.
How to run the test programs?
I tested them on a 32-bit x86 system, 3.5.0-rc7.
1. Compile:
gcc -o rds_client rds_client.c
gcc -o rds_server rds_server.c
2. Run ./rds_server on one console.
3. Run ./rds_client on another console.
4. You will see something like:
server is waiting to receive data...
old socket fd=3
server received data from client:data from client
msg.msg_namelen=32
new socket fd=-1067277685
sendmsg()
: Bad file descriptor
/***************** rds_client.c ********************/
/* Headers needed to build this test program stand-alone; AF_RDS may not
 * be exposed by older libc headers, hence the fallback define. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

#ifndef AF_RDS
#define AF_RDS 21
#endif

int main(void)
{
int sock_fd;
struct sockaddr_in serverAddr;
struct sockaddr_in toAddr;
char recvBuffer[128] = "data from client";
struct msghdr msg;
struct iovec iov;
sock_fd = socket(AF_RDS, SOCK_SEQPACKET, 0);
if (sock_fd < 0) {
perror("create socket error\n");
exit(1);
}
memset(&serverAddr, 0, sizeof(serverAddr));
serverAddr.sin_family = AF_INET;
serverAddr.sin_addr.s_addr = inet_addr("127.0.0.1");
serverAddr.sin_port = htons(4001);
if (bind(sock_fd, (struct sockaddr*)&serverAddr, sizeof(serverAddr)) < 0) {
perror("bind() error\n");
close(sock_fd);
exit(1);
}
memset(&toAddr, 0, sizeof(toAddr));
toAddr.sin_family = AF_INET;
toAddr.sin_addr.s_addr = inet_addr("127.0.0.1");
toAddr.sin_port = htons(4000);
msg.msg_name = &toAddr;
msg.msg_namelen = sizeof(toAddr);
msg.msg_iov = &iov;
msg.msg_iovlen = 1;
msg.msg_iov->iov_base = recvBuffer;
msg.msg_iov->iov_len = strlen(recvBuffer) + 1;
msg.msg_control = 0;
msg.msg_controllen = 0;
msg.msg_flags = 0;
if (sendmsg(sock_fd, &msg, 0) == -1) {
perror("sendto() error\n");
close(sock_fd);
exit(1);
}
printf("client send data:%s\n", recvBuffer);
memset(recvBuffer, '\0', 128);
msg.msg_name = &toAddr;
msg.msg_namelen = sizeof(toAddr);
msg.msg_iov = &iov;
msg.msg_iovlen = 1;
msg.msg_iov->iov_base = recvBuffer;
msg.msg_iov->iov_len = 128;
msg.msg_control = 0;
msg.msg_controllen = 0;
msg.msg_flags = 0;
if (recvmsg(sock_fd, &msg, 0) == -1) {
perror("recvmsg() error\n");
close(sock_fd);
exit(1);
}
printf("receive data from server:%s\n", recvBuffer);
close(sock_fd);
return 0;
}
/***************** rds_server.c ********************/
/* Headers needed to build this test program stand-alone; AF_RDS may not
 * be exposed by older libc headers, hence the fallback define. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

#ifndef AF_RDS
#define AF_RDS 21
#endif

int main(void)
{
struct sockaddr_in fromAddr;
int sock_fd;
struct sockaddr_in serverAddr;
unsigned int addrLen;
char recvBuffer[128];
struct msghdr msg;
struct iovec iov;
sock_fd = socket(AF_RDS, SOCK_SEQPACKET, 0);
if(sock_fd < 0) {
perror("create socket error\n");
exit(0);
}
memset(&serverAddr, 0, sizeof(serverAddr));
serverAddr.sin_family = AF_INET;
serverAddr.sin_addr.s_addr = inet_addr("127.0.0.1");
serverAddr.sin_port = htons(4000);
if (bind(sock_fd, (struct sockaddr*)&serverAddr, sizeof(serverAddr)) < 0) {
perror("bind error\n");
close(sock_fd);
exit(1);
}
printf("server is waiting to receive data...\n");
msg.msg_name = &fromAddr;
/*
* Add 16 to sizeof(fromAddr), i.e. 32 in total. Note that fromAddr is
* declared right before sock_fd: recvmsg() will overwrite sock_fd,
* because the kernel copies 32 bytes back to userspace.
*
* If you just use sizeof(fromAddr), it works fine.
*/
msg.msg_namelen = sizeof(fromAddr) + 16;
/* msg.msg_namelen = sizeof(fromAddr); */
msg.msg_iov = &iov;
msg.msg_iovlen = 1;
msg.msg_iov->iov_base = recvBuffer;
msg.msg_iov->iov_len = 128;
msg.msg_control = 0;
msg.msg_controllen = 0;
msg.msg_flags = 0;
while (1) {
printf("old socket fd=%d\n", sock_fd);
if (recvmsg(sock_fd, &msg, 0) == -1) {
perror("recvmsg() error\n");
close(sock_fd);
exit(1);
}
printf("server received data from client:%s\n", recvBuffer);
printf("msg.msg_namelen=%d\n", msg.msg_namelen);
printf("new socket fd=%d\n", sock_fd);
strcat(recvBuffer, "--data from server");
if (sendmsg(sock_fd, &msg, 0) == -1) {
perror("sendmsg()\n");
close(sock_fd);
exit(1);
}
}
close(sock_fd);
return 0;
}
Signed-off-by: Weiping Pan <wpan@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
net/rds/recv.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/net/rds/recv.c b/net/rds/recv.c
index 5c6e9f1..9f0f17c 100644
--- a/net/rds/recv.c
+++ b/net/rds/recv.c
@@ -410,6 +410,8 @@ int rds_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
rdsdebug("size %zu flags 0x%x timeo %ld\n", size, msg_flags, timeo);
+ msg->msg_namelen = 0;
+
if (msg_flags & MSG_OOB)
goto out;
@@ -485,6 +487,7 @@ int rds_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
sin->sin_port = inc->i_hdr.h_sport;
sin->sin_addr.s_addr = inc->i_saddr;
memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+ msg->msg_namelen = sizeof(*sin);
}
break;
}
--
1.7.11.2

File: sfc-Fix-maximum-number-of-TSO-segments-and-minimum-TX-queue-size.patch

@ -1,156 +0,0 @@
From 7e6d06f0de3f74ca929441add094518ae332257c Mon Sep 17 00:00:00 2001
From: Ben Hutchings <bhutchings@solarflare.com>
Date: Mon, 30 Jul 2012 15:57:44 +0000
Subject: [PATCH] sfc: Fix maximum number of TSO segments and minimum TX queue
size
Currently an skb requiring TSO may not fit within a minimum-size TX
queue. The TX queue selected for the skb may stall and trigger the TX
watchdog repeatedly (since the problem skb will be retried after the
TX reset). This issue is designated as CVE-2012-3412.
Set the maximum number of TSO segments for our devices to 100. This
should make no difference to behaviour unless the actual MSS is less
than about 700. Increase the minimum TX queue size accordingly to
allow for 2 worst-case skbs, so that there will definitely be space
to add an skb after we wake a queue.
To avoid invalidating existing configurations, change
efx_ethtool_set_ringparam() to fix up values that are too small rather
than returning -EINVAL.
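To make those numbers concrete (rough arithmetic under assumed values:
4KiB pages, so MAX_SKB_FRAGS is 17, and ignoring the Falcon/A1 and PCIe
page-boundary workarounds shown in efx_tx_max_skb_descs() below):

/* Illustrative stand-alone calculation, not part of the patch. */
#include <stdio.h>

#define EFX_TSO_MAX_SEGS 100
#define MAX_SKB_FRAGS    17	/* assumed: 65536 / 4096 + 1 */

int main(void)
{
	unsigned int max_descs = EFX_TSO_MAX_SEGS * 2 + MAX_SKB_FRAGS;
	unsigned int txq_min = 2 * max_descs;	/* EFX_TXQ_MIN_ENT */

	printf("worst-case descriptors per TSO skb: %u\n", max_descs); /* 217 */
	printf("minimum TX queue entries: %u\n", txq_min);             /* 434 */
	printf("segment cap starts to matter below MSS ~%u\n",
	       65536 / EFX_TSO_MAX_SEGS);                              /* ~655 */
	return 0;
}

which is consistent with the "less than about 700" figure above.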
Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
drivers/net/ethernet/sfc/efx.c | 6 ++++++
drivers/net/ethernet/sfc/efx.h | 14 ++++++++++----
drivers/net/ethernet/sfc/ethtool.c | 16 +++++++++++-----
drivers/net/ethernet/sfc/tx.c | 19 +++++++++++++++++++
4 files changed, 46 insertions(+), 9 deletions(-)
diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c
index 70554a1..65a8d49 100644
--- a/drivers/net/ethernet/sfc/efx.c
+++ b/drivers/net/ethernet/sfc/efx.c
@@ -1503,6 +1503,11 @@ static int efx_probe_all(struct efx_nic *efx)
goto fail2;
}
+ BUILD_BUG_ON(EFX_DEFAULT_DMAQ_SIZE < EFX_RXQ_MIN_ENT);
+ if (WARN_ON(EFX_DEFAULT_DMAQ_SIZE < EFX_TXQ_MIN_ENT(efx))) {
+ rc = -EINVAL;
+ goto fail3;
+ }
efx->rxq_entries = efx->txq_entries = EFX_DEFAULT_DMAQ_SIZE;
rc = efx_probe_filters(efx);
@@ -2070,6 +2075,7 @@ static int efx_register_netdev(struct efx_nic *efx)
net_dev->irq = efx->pci_dev->irq;
net_dev->netdev_ops = &efx_netdev_ops;
SET_ETHTOOL_OPS(net_dev, &efx_ethtool_ops);
+ net_dev->gso_max_segs = EFX_TSO_MAX_SEGS;
rtnl_lock();
diff --git a/drivers/net/ethernet/sfc/efx.h b/drivers/net/ethernet/sfc/efx.h
index be8f915..70755c9 100644
--- a/drivers/net/ethernet/sfc/efx.h
+++ b/drivers/net/ethernet/sfc/efx.h
@@ -30,6 +30,7 @@ extern netdev_tx_t
efx_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb);
extern void efx_xmit_done(struct efx_tx_queue *tx_queue, unsigned int index);
extern int efx_setup_tc(struct net_device *net_dev, u8 num_tc);
+extern unsigned int efx_tx_max_skb_descs(struct efx_nic *efx);
/* RX */
extern int efx_probe_rx_queue(struct efx_rx_queue *rx_queue);
@@ -52,10 +53,15 @@ extern void efx_schedule_slow_fill(struct efx_rx_queue *rx_queue);
#define EFX_MAX_EVQ_SIZE 16384UL
#define EFX_MIN_EVQ_SIZE 512UL
-/* The smallest [rt]xq_entries that the driver supports. Callers of
- * efx_wake_queue() assume that they can subsequently send at least one
- * skb. Falcon/A1 may require up to three descriptors per skb_frag. */
-#define EFX_MIN_RING_SIZE (roundup_pow_of_two(2 * 3 * MAX_SKB_FRAGS))
+/* Maximum number of TCP segments we support for soft-TSO */
+#define EFX_TSO_MAX_SEGS 100
+
+/* The smallest [rt]xq_entries that the driver supports. RX minimum
+ * is a bit arbitrary. For TX, we must have space for at least 2
+ * TSO skbs.
+ */
+#define EFX_RXQ_MIN_ENT 128U
+#define EFX_TXQ_MIN_ENT(efx) (2 * efx_tx_max_skb_descs(efx))
/* Filters */
extern int efx_probe_filters(struct efx_nic *efx);
diff --git a/drivers/net/ethernet/sfc/ethtool.c b/drivers/net/ethernet/sfc/ethtool.c
index 10536f9..8cba2df 100644
--- a/drivers/net/ethernet/sfc/ethtool.c
+++ b/drivers/net/ethernet/sfc/ethtool.c
@@ -680,21 +680,27 @@ static int efx_ethtool_set_ringparam(struct net_device *net_dev,
struct ethtool_ringparam *ring)
{
struct efx_nic *efx = netdev_priv(net_dev);
+ u32 txq_entries;
if (ring->rx_mini_pending || ring->rx_jumbo_pending ||
ring->rx_pending > EFX_MAX_DMAQ_SIZE ||
ring->tx_pending > EFX_MAX_DMAQ_SIZE)
return -EINVAL;
- if (ring->rx_pending < EFX_MIN_RING_SIZE ||
- ring->tx_pending < EFX_MIN_RING_SIZE) {
+ if (ring->rx_pending < EFX_RXQ_MIN_ENT) {
netif_err(efx, drv, efx->net_dev,
- "TX and RX queues cannot be smaller than %ld\n",
- EFX_MIN_RING_SIZE);
+ "RX queues cannot be smaller than %u\n",
+ EFX_RXQ_MIN_ENT);
return -EINVAL;
}
- return efx_realloc_channels(efx, ring->rx_pending, ring->tx_pending);
+ txq_entries = max(ring->tx_pending, EFX_TXQ_MIN_ENT(efx));
+ if (txq_entries != ring->tx_pending)
+ netif_warn(efx, drv, efx->net_dev,
+ "increasing TX queue size to minimum of %u\n",
+ txq_entries);
+
+ return efx_realloc_channels(efx, ring->rx_pending, txq_entries);
}
static int efx_ethtool_set_pauseparam(struct net_device *net_dev,
diff --git a/drivers/net/ethernet/sfc/tx.c b/drivers/net/ethernet/sfc/tx.c
index 9b225a7..1871343 100644
--- a/drivers/net/ethernet/sfc/tx.c
+++ b/drivers/net/ethernet/sfc/tx.c
@@ -119,6 +119,25 @@ efx_max_tx_len(struct efx_nic *efx, dma_addr_t dma_addr)
return len;
}
+unsigned int efx_tx_max_skb_descs(struct efx_nic *efx)
+{
+ /* Header and payload descriptor for each output segment, plus
+ * one for every input fragment boundary within a segment
+ */
+ unsigned int max_descs = EFX_TSO_MAX_SEGS * 2 + MAX_SKB_FRAGS;
+
+ /* Possibly one more per segment for the alignment workaround */
+ if (EFX_WORKAROUND_5391(efx))
+ max_descs += EFX_TSO_MAX_SEGS;
+
+ /* Possibly more for PCIe page boundaries within input fragments */
+ if (PAGE_SIZE > EFX_PAGE_SIZE)
+ max_descs += max_t(unsigned int, MAX_SKB_FRAGS,
+ DIV_ROUND_UP(GSO_MAX_SIZE, EFX_PAGE_SIZE));
+
+ return max_descs;
+}
+
/*
* Add a socket buffer to a TX queue
*
--
1.7.7.6

File: sources

@ -1,2 +1,2 @@
967f72983655e2479f951195953e8480 linux-3.4.tar.xz
-2149df47fc96fec05787bf0197fb7b16 patch-3.4.11.xz
+4b80842c0edc3d7f37185716160687d0 patch-3.4.12.xz

File: tcp-Apply-device-TSO-segment-limit-earlier.patch

@ -1,124 +0,0 @@
From 1485348d2424e1131ea42efc033cbd9366462b01 Mon Sep 17 00:00:00 2001
From: Ben Hutchings <bhutchings@solarflare.com>
Date: Mon, 30 Jul 2012 16:11:42 +0000
Subject: [PATCH] tcp: Apply device TSO segment limit earlier
Cache the device gso_max_segs in sock::sk_gso_max_segs and use it to
limit the size of TSO skbs. This avoids the need to fall back to
software GSO for local TCP senders.
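In effect the per-skb budget for a local sender becomes
min(sk_gso_max_size, sk_gso_max_segs * mss), which is what the
tcp_tso_should_defer() hunk below computes. A small stand-alone
illustration (assumed values: gso_max_size 65536 and the sfc limit of
100 segments; not part of the patch):

#include <stdio.h>

static unsigned int tso_budget(unsigned int gso_max_size,
			       unsigned int gso_max_segs,
			       unsigned int mss)
{
	unsigned int by_segs = gso_max_segs * mss;

	return gso_max_size < by_segs ? gso_max_size : by_segs;
}

int main(void)
{
	/* Normal Ethernet MSS: the segment cap changes nothing. */
	printf("mss 1448 -> %u bytes per TSO skb\n",
	       tso_budget(65536, 100, 1448));
	/* Tiny MSS: TCP itself now builds smaller skbs instead of the
	 * device layer falling back to software GSO. */
	printf("mss 76   -> %u bytes per TSO skb\n",
	       tso_budget(65536, 100, 76));
	return 0;
}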
Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
include/net/sock.h | 2 ++
net/core/sock.c | 1 +
net/ipv4/tcp.c | 4 +++-
net/ipv4/tcp_cong.c | 3 ++-
net/ipv4/tcp_output.c | 21 ++++++++++++---------
5 files changed, 20 insertions(+), 11 deletions(-)
--- linux-3.4.noarch.orig/include/net/sock.h
+++ linux-3.4.noarch/include/net/sock.h
@@ -216,6 +216,7 @@ struct cg_proto;
* @sk_route_nocaps: forbidden route capabilities (e.g NETIF_F_GSO_MASK)
* @sk_gso_type: GSO type (e.g. %SKB_GSO_TCPV4)
* @sk_gso_max_size: Maximum GSO segment size to build
+ * @sk_gso_max_segs: Maximum number of GSO segments
* @sk_lingertime: %SO_LINGER l_linger setting
* @sk_backlog: always used with the per-socket spinlock held
* @sk_callback_lock: used with the callbacks in the end of this struct
@@ -335,6 +336,7 @@ struct sock {
netdev_features_t sk_route_nocaps;
int sk_gso_type;
unsigned int sk_gso_max_size;
+ u16 sk_gso_max_segs;
int sk_rcvlowat;
unsigned long sk_lingertime;
struct sk_buff_head sk_error_queue;
--- linux-3.4.noarch.orig/net/core/sock.c
+++ linux-3.4.noarch/net/core/sock.c
@@ -1411,6 +1411,7 @@ void sk_setup_caps(struct sock *sk, stru
} else {
sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
sk->sk_gso_max_size = dst->dev->gso_max_size;
+ sk->sk_gso_max_segs = dst->dev->gso_max_segs;
}
}
}
--- linux-3.4.noarch.orig/net/ipv4/tcp.c
+++ linux-3.4.noarch/net/ipv4/tcp.c
@@ -740,7 +740,9 @@ static unsigned int tcp_xmit_size_goal(s
old_size_goal + mss_now > xmit_size_goal)) {
xmit_size_goal = old_size_goal;
} else {
- tp->xmit_size_goal_segs = xmit_size_goal / mss_now;
+ tp->xmit_size_goal_segs =
+ min_t(u16, xmit_size_goal / mss_now,
+ sk->sk_gso_max_segs);
xmit_size_goal = tp->xmit_size_goal_segs * mss_now;
}
}
--- linux-3.4.noarch.orig/net/ipv4/tcp_cong.c
+++ linux-3.4.noarch/net/ipv4/tcp_cong.c
@@ -291,7 +291,8 @@ int tcp_is_cwnd_limited(const struct soc
left = tp->snd_cwnd - in_flight;
if (sk_can_gso(sk) &&
left * sysctl_tcp_tso_win_divisor < tp->snd_cwnd &&
- left * tp->mss_cache < sk->sk_gso_max_size)
+ left * tp->mss_cache < sk->sk_gso_max_size &&
+ left < sk->sk_gso_max_segs)
return 1;
return left <= tcp_max_tso_deferred_mss(tp);
}
--- linux-3.4.noarch.orig/net/ipv4/tcp_output.c
+++ linux-3.4.noarch/net/ipv4/tcp_output.c
@@ -1318,21 +1318,21 @@ static void tcp_cwnd_validate(struct soc
* when we would be allowed to send the split-due-to-Nagle skb fully.
*/
static unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_buff *skb,
- unsigned int mss_now, unsigned int cwnd)
+ unsigned int mss_now, unsigned int max_segs)
{
const struct tcp_sock *tp = tcp_sk(sk);
- u32 needed, window, cwnd_len;
+ u32 needed, window, max_len;
window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
- cwnd_len = mss_now * cwnd;
+ max_len = mss_now * max_segs;
- if (likely(cwnd_len <= window && skb != tcp_write_queue_tail(sk)))
- return cwnd_len;
+ if (likely(max_len <= window && skb != tcp_write_queue_tail(sk)))
+ return max_len;
needed = min(skb->len, window);
- if (cwnd_len <= needed)
- return cwnd_len;
+ if (max_len <= needed)
+ return max_len;
return needed - needed % mss_now;
}
@@ -1560,7 +1560,8 @@ static int tcp_tso_should_defer(struct s
limit = min(send_win, cong_win);
/* If a full-sized TSO skb can be sent, do it. */
- if (limit >= sk->sk_gso_max_size)
+ if (limit >= min_t(unsigned int, sk->sk_gso_max_size,
+ sk->sk_gso_max_segs * tp->mss_cache))
goto send_now;
/* Middle in queue won't get any more data, full sendable already? */
@@ -1786,7 +1787,9 @@ static int tcp_write_xmit(struct sock *s
limit = mss_now;
if (tso_segs > 1 && !tcp_urg_mode(tp))
limit = tcp_mss_split_point(sk, skb, mss_now,
- cwnd_quota);
+ min_t(unsigned int,
+ cwnd_quota,
+ sk->sk_gso_max_segs));
if (skb->len > limit &&
unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))