Linux v4.3.3

2015-12-15 07:31:12 -05:00 · 2015-12-15 07:31:12 -05:00 · e7ca3b90d2
parent be9160e268
commit e7ca3b90d2
5 changed files with 5 additions and 709 deletions
--- a/Btrfs-fix-truncation-of-compressed-and-inlined-exten.patch
+++ b/Btrfs-fix-truncation-of-compressed-and-inlined-exten.patch
@ -1,288 +0,0 @@
-From 0305cd5f7fca85dae392b9ba85b116896eb7c1c7 Mon Sep 17 00:00:00 2001
-From: Filipe Manana <fdmanana@suse.com>
-Date: Fri, 16 Oct 2015 12:34:25 +0100
-Subject: [PATCH] Btrfs: fix truncation of compressed and inlined extents
-
-When truncating a file to a smaller size which consists of an inline
-extent that is compressed, we did not discard (or make unusable) the
-data between the new file size and the old file size, wasting metadata
-space and allowing for the truncated data to be leaked and the data
-corruption/loss mentioned below.
-We were also not correctly decrementing the number of bytes used by the
-inode, we were setting it to zero, giving a wrong report for callers of
-the stat(2) syscall. The fsck tool also reported an error about a mismatch
-between the nbytes of the file versus the real space used by the file.
-
-Now because we weren't discarding the truncated region of the file, it
-was possible for a caller of the clone ioctl to actually read the data
-that was truncated, allowing for a security breach without requiring root
-access to the system, using only standard filesystem operations. The
-scenario is the following:
-
-   1) User A creates a file which consists of an inline and compressed
-      extent with a size of 2000 bytes - the file is not accessible to
-      any other users (no read, write or execution permission for anyone
-      else);
-
-   2) The user truncates the file to a size of 1000 bytes;
-
-   3) User A makes the file world readable;
-
-   4) User B creates a file consisting of an inline extent of 2000 bytes;
-
-   5) User B issues a clone operation from user A's file into its own
-      file (using a length argument of 0, clone the whole range);
-
-   6) User B now gets to see the 1000 bytes that user A truncated from
-      its file before it made its file world readable. User B also lost
-      the bytes in the range [1000, 2000[ bytes from its own file, but
-      that might be ok if his/her intention was reading stale data from
-      user A that was never supposed to be public.
-
-Note that this contrasts with the case where we truncate a file from 2000
-bytes to 1000 bytes and then truncate it back from 1000 to 2000 bytes. In
-this case reading any byte from the range [1000, 2000[ will return a value
-of 0x00, instead of the original data.
-
-This problem exists since the clone ioctl was added and happens both with
-and without my recent data loss and file corruption fixes for the clone
-ioctl (patch "Btrfs: fix file corruption and data loss after cloning
-inline extents").
-
-So fix this by truncating the compressed inline extents as we do for the
-non-compressed case, which involves decompressing, if the data isn't already
-in the page cache, compressing the truncated version of the extent, writing
-the compressed content into the inline extent and then truncate it.
-
-The following test case for fstests reproduces the problem. In order for
-the test to pass both this fix and my previous fix for the clone ioctl
-that forbids cloning a smaller inline extent into a larger one,
-which is titled "Btrfs: fix file corruption and data loss after cloning
-inline extents", are needed. Without that other fix the test fails in a
-different way that does not leak the truncated data, instead part of
-destination file gets replaced with zeroes (because the destination file
-has a larger inline extent than the source).
-
-  seq=`basename $0`
-  seqres=$RESULT_DIR/$seq
-  echo "QA output created by $seq"
-  tmp=/tmp/$$
-  status=1	# failure is the default!
-  trap "_cleanup; exit \$status" 0 1 2 3 15
-
-  _cleanup()
-  {
-      rm -f $tmp.*
-  }
-
-  # get standard environment, filters and checks
-  . ./common/rc
-  . ./common/filter
-
-  # real QA test starts here
-  _need_to_be_root
-  _supported_fs btrfs
-  _supported_os Linux
-  _require_scratch
-  _require_cloner
-
-  rm -f $seqres.full
-
-  _scratch_mkfs >>$seqres.full 2>&1
-  _scratch_mount "-o compress"
-
-  # Create our test files. File foo is going to be the source of a clone operation
-  # and consists of a single inline extent with an uncompressed size of 512 bytes,
-  # while file bar consists of a single inline extent with an uncompressed size of
-  # 256 bytes. For our test's purpose, it's important that file bar has an inline
-  # extent with a size smaller than foo's inline extent.
-  $XFS_IO_PROG -f -c "pwrite -S 0xa1 0 128"   \
-          -c "pwrite -S 0x2a 128 384" \
-          $SCRATCH_MNT/foo | _filter_xfs_io
-  $XFS_IO_PROG -f -c "pwrite -S 0xbb 0 256" $SCRATCH_MNT/bar | _filter_xfs_io
-
-  # Now durably persist all metadata and data. We do this to make sure that we get
-  # on disk an inline extent with a size of 512 bytes for file foo.
-  sync
-
-  # Now truncate our file foo to a smaller size. Because it consists of a
-  # compressed and inline extent, btrfs did not shrink the inline extent to the
-  # new size (if the extent was not compressed, btrfs would shrink it to 128
-  # bytes), it only updates the inode's i_size to 128 bytes.
-  $XFS_IO_PROG -c "truncate 128" $SCRATCH_MNT/foo
-
-  # Now clone foo's inline extent into bar.
-  # This clone operation should fail with errno EOPNOTSUPP because the source
-  # file consists only of an inline extent and the file's size is smaller than
-  # the inline extent of the destination (128 bytes < 256 bytes). However the
-  # clone ioctl was not prepared to deal with a file that has a size smaller
-  # than the size of its inline extent (something that happens only for compressed
-  # inline extents), resulting in copying the full inline extent from the source
-  # file into the destination file.
-  #
-  # Note that btrfs' clone operation for inline extents consists of removing the
-  # inline extent from the destination inode and copy the inline extent from the
-  # source inode into the destination inode, meaning that if the destination
-  # inode's inline extent is larger (N bytes) than the source inode's inline
-  # extent (M bytes), some bytes (N - M bytes) will be lost from the destination
-  # file. Btrfs could copy the source inline extent's data into the destination's
-  # inline extent so that we would not lose any data, but that's currently not
-  # done due to the complexity that would be needed to deal with such cases
-  # (especially when one or both extents are compressed), returning EOPNOTSUPP, as
-  # it's normally not a very common case to clone very small files (only case
-  # where we get inline extents) and copying inline extents does not save any
-  # space (unlike for normal, non-inlined extents).
-  $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/foo $SCRATCH_MNT/bar
-
-  # Now because the above clone operation used to succeed, and due to foo's inline
-  # extent not being shrunk by the truncate operation, our file bar got the whole
-  # inline extent copied from foo, making us lose the last 128 bytes from bar
-  # which got replaced by the bytes in range [128, 256[ from foo before foo was
-  # truncated - in other words, data loss from bar and being able to read old and
-  # stale data from foo that should not be possible to read anymore through normal
-  # filesystem operations. Contrast with the case where we truncate a file from a
-  # size N to a smaller size M, truncate it back to size N and then read the range
-  # [M, N[, we should always get the value 0x00 for all the bytes in that range.
-
-  # We expected the clone operation to fail with errno EOPNOTSUPP and therefore
-  # not modify our file's bar data/metadata. So its content should be 256 bytes
-  # long with all bytes having the value 0xbb.
-  #
-  # Without the btrfs bug fix, the clone operation succeeded and resulted in
-  # leaking truncated data from foo, the bytes that belonged to its range
-  # [128, 256[, and losing data from bar in that same range. So reading the
-  # file gave us the following content:
-  #
-  # 0000000 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1
-  # *
-  # 0000200 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a
-  # *
-  # 0000400
-  echo "File bar's content after the clone operation:"
-  od -t x1 $SCRATCH_MNT/bar
-
-  # Also because the foo's inline extent was not shrunk by the truncate
-  # operation, btrfs' fsck, which is run by the fstests framework every time a
-  # test completes, failed reporting the following error:
-  #
-  #  root 5 inode 257 errors 400, nbytes wrong
-
-  status=0
-  exit
-
-Cc: stable@vger.kernel.org
-Signed-off-by: Filipe Manana <fdmanana@suse.com>
---
- fs/btrfs/inode.c | 82 ++++++++++++++++++++++++++++++++++++++++++++++----------
- 1 file changed, 68 insertions(+), 14 deletions(-)
-
-diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
-index 208db4e835f0..cbb4286490a1 100644
--- a/fs/btrfs/inode.c
-+++ b/fs/btrfs/inode.c
-@@ -4217,6 +4217,47 @@ static int truncate_space_check(struct btrfs_trans_handle *trans,
- 
- }
- 
-+static int truncate_inline_extent(struct inode *inode,
-+				  struct btrfs_path *path,
-+				  struct btrfs_key *found_key,
-+				  const u64 item_end,
-+				  const u64 new_size)
-+{
-+	struct extent_buffer *leaf = path->nodes[0];
-+	int slot = path->slots[0];
-+	struct btrfs_file_extent_item *fi;
-+	u32 size = (u32)(new_size - found_key->offset);
-+	struct btrfs_root *root = BTRFS_I(inode)->root;
-+
-+	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
-+
-+	if (btrfs_file_extent_compression(leaf, fi) != BTRFS_COMPRESS_NONE) {
-+		loff_t offset = new_size;
-+		loff_t page_end = ALIGN(offset, PAGE_CACHE_SIZE);
-+
-+		/*
-+		 * Zero out the remaining of the last page of our inline extent,
-+		 * instead of directly truncating our inline extent here - that
-+		 * would be much more complex (decompressing all the data, then
-+		 * compressing the truncated data, which might be bigger than
-+		 * the size of the inline extent, resize the extent, etc).
-+		 * We release the path because to get the page we might need to
-+		 * read the extent item from disk (data not in the page cache).
-+		 */
-+		btrfs_release_path(path);
-+		return btrfs_truncate_page(inode, offset, page_end - offset, 0);
-+	}
-+
-+	btrfs_set_file_extent_ram_bytes(leaf, fi, size);
-+	size = btrfs_file_extent_calc_inline_size(size);
-+	btrfs_truncate_item(root, path, size, 1);
-+
-+	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
-+		inode_sub_bytes(inode, item_end + 1 - new_size);
-+
-+	return 0;
-+}
-+
- /*
-  * this can truncate away extent items, csum items and directory items.
-  * It starts at a high offset and removes keys until it can't find
-@@ -4411,27 +4452,40 @@ search_again:
- 			 * special encodings
- 			 */
- 			if (!del_item &&
-			    btrfs_file_extent_compression(leaf, fi) == 0 &&
- 			    btrfs_file_extent_encryption(leaf, fi) == 0 &&
- 			    btrfs_file_extent_other_encoding(leaf, fi) == 0) {
-				u32 size = new_size - found_key.offset;
-
-				if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
-					inode_sub_bytes(inode, item_end + 1 -
-							new_size);
- 
- 				/*
-				 * update the ram bytes to properly reflect
-				 * the new size of our item
-+				 * Need to release path in order to truncate a
-+				 * compressed extent. So delete any accumulated
-+				 * extent items so far.
- 				 */
-				btrfs_set_file_extent_ram_bytes(leaf, fi, size);
-				size =
-				    btrfs_file_extent_calc_inline_size(size);
-				btrfs_truncate_item(root, path, size, 1);
-+				if (btrfs_file_extent_compression(leaf, fi) !=
-+				    BTRFS_COMPRESS_NONE && pending_del_nr) {
-+					err = btrfs_del_items(trans, root, path,
-+							      pending_del_slot,
-+							      pending_del_nr);
-+					if (err) {
-+						btrfs_abort_transaction(trans,
-+									root,
-+									err);
-+						goto error;
-+					}
-+					pending_del_nr = 0;
-+				}
-+
-+				err = truncate_inline_extent(inode, path,
-+							     &found_key,
-+							     item_end,
-+							     new_size);
-+				if (err) {
-+					btrfs_abort_transaction(trans,
-+								root, err);
-+					goto error;
-+				}
- 			} else if (test_bit(BTRFS_ROOT_REF_COWS,
- 					    &root->state)) {
-				inode_sub_bytes(inode, item_end + 1 -
-						found_key.offset);
-+				inode_sub_bytes(inode, item_end + 1 - new_size);
- 			}
- 		}
- delete:
-- 
-2.5.0
-
--- a/RDS-fix-race-condition-when-sending-a-message-on-unb.patch
+++ b/RDS-fix-race-condition-when-sending-a-message-on-unb.patch
@ -1,77 +0,0 @@
-From 8e92c2b0cb50a31e2956760498bc8cdb72993fb3 Mon Sep 17 00:00:00 2001
-From: Quentin Casasnovas <quentin.casasnovas@oracle.com>
-Date: Fri, 16 Oct 2015 17:11:42 +0200
-Subject: [PATCH] RDS: fix race condition when sending a message on unbound
- socket.
-
-Sasha's found a NULL pointer dereference in the RDS connection code when
-sending a message to an apparently unbound socket.  The problem is caused
-by the code checking if the socket is bound in rds_sendmsg(), which checks
-the rs_bound_addr field without taking a lock on the socket.  This opens a
-race where rs_bound_addr is temporarily set but where the transport is not
-in rds_bind(), leading to a NULL pointer dereference when trying to
-dereference 'trans' in __rds_conn_create().
-
-Vegard wrote a reproducer for this issue, so kindly ask him to share if
-you're interested.
-
-I cannot reproduce the NULL pointer dereference using Vegard's reproducer
-with this patch, whereas I could without.
-
-Complete earlier incomplete fix to CVE-2015-6937:
-
-  74e98eb08588 ("RDS: verify the underlying transport exists before creating a connection")
-
-Signed-off-by: Quentin Casasnovas <quentin.casasnovas@oracle.com>
-Reviewed-by: Vegard Nossum <vegard.nossum@oracle.com>
-Reviewed-by: Sasha Levin <sasha.levin@oracle.com>
-Cc: Vegard Nossum <vegard.nossum@oracle.com>
-Cc: Sasha Levin <sasha.levin@oracle.com>
-Cc: Chien Yen <chien.yen@oracle.com>
-Cc: Santosh Shilimkar <santosh.shilimkar@oracle.com>
-Cc: David S. Miller <davem@davemloft.net>
-Cc: stable@vger.kernel.org
---
- net/rds/connection.c | 6 ------
- net/rds/send.c       | 4 +++-
- 2 files changed, 3 insertions(+), 7 deletions(-)
-
-diff --git a/net/rds/connection.c b/net/rds/connection.c
-index 49adeef8090c..9b2de5e67d79 100644
--- a/net/rds/connection.c
-+++ b/net/rds/connection.c
-@@ -190,12 +190,6 @@ new_conn:
- 		}
- 	}
- 
-	if (trans == NULL) {
-		kmem_cache_free(rds_conn_slab, conn);
-		conn = ERR_PTR(-ENODEV);
-		goto out;
-	}
-
- 	conn->c_trans = trans;
- 
- 	ret = trans->conn_alloc(conn, gfp);
-diff --git a/net/rds/send.c b/net/rds/send.c
-index 4df61a515b83..859de6f32521 100644
--- a/net/rds/send.c
-+++ b/net/rds/send.c
-@@ -1009,11 +1009,13 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
- 		release_sock(sk);
- 	}
- 
-	/* racing with another thread binding seems ok here */
-+	lock_sock(sk);
- 	if (daddr == 0 || rs->rs_bound_addr == 0) {
-+		release_sock(sk);
- 		ret = -ENOTCONN; /* XXX not a great errno */
- 		goto out;
- 	}
-+	release_sock(sk);
- 
- 	if (payload_len > rds_sk_sndbuf(rs)) {
- 		ret = -EMSGSIZE;
-- 
-2.4.3
-
--- a/kernel.spec
+++ b/kernel.spec
@ -52,7 +52,7 @@ Summary: The Linux kernel
 %if 0%{?released_kernel}

 # Do we have a -stable update to apply?
-%define stable_update 2
+%define stable_update 3
 # Set rpm version accordingly
 %if 0%{?stable_update}
 %define stablerev %{stable_update}
@ -592,9 +592,6 @@ Patch503: drm-i915-turn-off-wc-mmaps.patch

 Patch508: kexec-uefi-copy-secure_boot-flag-in-boot-params.patch

-#CVE-2015-7990 rhbz 1276437 1276438
-Patch524: RDS-fix-race-condition-when-sending-a-message-on-unb.patch
-
 #CVE-2015-7799 rhbz 1271134 1271135
 Patch512: isdn_ppp-Add-checks-for-allocation-failure-in-isdn_p.patch
 Patch513: ppp-slip-Validate-VJ-compression-slot-parameters-com.patch
@ -613,9 +610,6 @@ Patch556: netfilter-ipset-Fix-extension-alignment.patch
 Patch557: netfilter-ipset-Fix-hash-type-expiration.patch
 Patch558: netfilter-ipset-Fix-hash-type-expire-release-empty-h.patch

-#CVE-2015-8374 rhbz 1286261 1286262
-Patch565: Btrfs-fix-truncation-of-compressed-and-inlined-exten.patch
-
 #rhbz 1284059
 Patch566: KEYS-Fix-handling-of-stored-error-in-a-negatively-in.patch

@ -634,9 +628,6 @@ Patch571: ideapad-laptop-Add-Lenovo-ideapad-Y700-17ISK-to-no_h.patch
 #rhbz 1288687
 Patch572: alua_fix.patch

-#CVE-2013-7446 rhbz 1282688 1282712
-Patch573: unix-avoid-use-after-free-in-ep_remove_wait_queue.patch
-
 #CVE-XXXX-XXXX rhbz 1291329 1291332
 Patch574: ovl-fix-permission-checking-for-setattr.patch

@ -2086,6 +2077,9 @@ fi
 #
 # 
 %changelog
+* Tue Dec 15 2015 Josh Boyer <jwboyer@fedoraproject.org>
+- Linux v4.3.3
+
 * Mon Dec 14 2015 Josh Boyer <jwboyer@fedoraproject.org>
 - CVE-2015-7550 Race between read and revoke keys (rhbz 1291197 1291198)
 - CVE-XXXX-XXXX permission bypass on overlayfs (rhbz 1291329 1291332)
--- a/2
+++ b/2
@ -1,3 +1,3 @@
 58b35794eee3b6d52ce7be39357801e7  linux-4.3.tar.xz
 7c516c9528b9f9aac0136944b0200b7e  perf-man-4.3.tar.gz
-3a465c7cf55ec9dbf2d72d9292aa5fde  patch-4.3.2.xz
+d3235b3640ae6ac1ab579171943fda4b  patch-4.3.3.xz
--- a/unix-avoid-use-after-free-in-ep_remove_wait_queue.patch
+++ b/unix-avoid-use-after-free-in-ep_remove_wait_queue.patch
@ -1,333 +0,0 @@
-From a46b9d2bac864f3ef6b21eb96864ddd88794222d Mon Sep 17 00:00:00 2001
-From: Rainer Weikusat <rweikusat@mobileactivedefense.com>
-Date: Fri, 20 Nov 2015 22:07:23 +0000
-Subject: [PATCH 05/43] unix: avoid use-after-free in ep_remove_wait_queue
-
-[ Upstream commit 7d267278a9ece963d77eefec61630223fce08c6c ]
-
-Rainer Weikusat <rweikusat@mobileactivedefense.com> writes:
-An AF_UNIX datagram socket being the client in an n:1 association with
-some server socket is only allowed to send messages to the server if the
-receive queue of this socket contains at most sk_max_ack_backlog
-datagrams. This implies that prospective writers might be forced to go
-to sleep despite none of the message presently enqueued on the server
-receive queue were sent by them. In order to ensure that these will be
-woken up once space becomes again available, the present unix_dgram_poll
-routine does a second sock_poll_wait call with the peer_wait wait queue
-of the server socket as queue argument (unix_dgram_recvmsg does a wake
-up on this queue after a datagram was received). This is inherently
-problematic because the server socket is only guaranteed to remain alive
-for as long as the client still holds a reference to it. In case the
-connection is dissolved via connect or by the dead peer detection logic
-in unix_dgram_sendmsg, the server socket may be freed despite "the
-polling mechanism" (in particular, epoll) still has a pointer to the
-corresponding peer_wait queue. There's no way to forcibly deregister a
-wait queue with epoll.
-
-Based on an idea by Jason Baron, the patch below changes the code such
-that a wait_queue_t belonging to the client socket is enqueued on the
-peer_wait queue of the server whenever the peer receive queue full
-condition is detected by either a sendmsg or a poll. A wake up on the
-peer queue is then relayed to the ordinary wait queue of the client
-socket via wake function. The connection to the peer wait queue is again
-dissolved if either a wake up is about to be relayed or the client
-socket reconnects or a dead peer is detected or the client socket is
-itself closed. This enables removing the second sock_poll_wait from
-unix_dgram_poll, thus avoiding the use-after-free, while still ensuring
-that no blocked writer sleeps forever.
-
-Signed-off-by: Rainer Weikusat <rweikusat@mobileactivedefense.com>
-Fixes: ec0d215f9420 ("af_unix: fix 'poll for write'/connected DGRAM sockets")
-Reviewed-by: Jason Baron <jbaron@akamai.com>
-Signed-off-by: David S. Miller <davem@davemloft.net>
---
- include/net/af_unix.h |   1 +
- net/unix/af_unix.c    | 183 ++++++++++++++++++++++++++++++++++++++++++++------
- 2 files changed, 165 insertions(+), 19 deletions(-)
-
-diff --git a/include/net/af_unix.h b/include/net/af_unix.h
-index b36d837..2a91a05 100644
--- a/include/net/af_unix.h
-+++ b/include/net/af_unix.h
-@@ -62,6 +62,7 @@ struct unix_sock {
- #define UNIX_GC_CANDIDATE	0
- #define UNIX_GC_MAYBE_CYCLE	1
- 	struct socket_wq	peer_wq;
-+	wait_queue_t		peer_wake;
- };
- 
- static inline struct unix_sock *unix_sk(const struct sock *sk)
-diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
-index 42ab2cc..153b2f2 100644
--- a/net/unix/af_unix.c
-+++ b/net/unix/af_unix.c
-@@ -326,6 +326,118 @@ found:
- 	return s;
- }
- 
-+/* Support code for asymmetrically connected dgram sockets
-+ *
-+ * If a datagram socket is connected to a socket not itself connected
-+ * to the first socket (eg, /dev/log), clients may only enqueue more
-+ * messages if the present receive queue of the server socket is not
-+ * "too large". This means there's a second writeability condition
-+ * poll and sendmsg need to test. The dgram recv code will do a wake
-+ * up on the peer_wait wait queue of a socket upon reception of a
-+ * datagram which needs to be propagated to sleeping would-be writers
-+ * since these might not have sent anything so far. This can't be
-+ * accomplished via poll_wait because the lifetime of the server
-+ * socket might be less than that of its clients if these break their
-+ * association with it or if the server socket is closed while clients
-+ * are still connected to it and there's no way to inform "a polling
-+ * implementation" that it should let go of a certain wait queue
-+ *
-+ * In order to propagate a wake up, a wait_queue_t of the client
-+ * socket is enqueued on the peer_wait queue of the server socket
-+ * whose wake function does a wake_up on the ordinary client socket
-+ * wait queue. This connection is established whenever a write (or
-+ * poll for write) hit the flow control condition and broken when the
-+ * association to the server socket is dissolved or after a wake up
-+ * was relayed.
-+ */
-+
-+static int unix_dgram_peer_wake_relay(wait_queue_t *q, unsigned mode, int flags,
-+				      void *key)
-+{
-+	struct unix_sock *u;
-+	wait_queue_head_t *u_sleep;
-+
-+	u = container_of(q, struct unix_sock, peer_wake);
-+
-+	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
-+			    q);
-+	u->peer_wake.private = NULL;
-+
-+	/* relaying can only happen while the wq still exists */
-+	u_sleep = sk_sleep(&u->sk);
-+	if (u_sleep)
-+		wake_up_interruptible_poll(u_sleep, key);
-+
-+	return 0;
-+}
-+
-+static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
-+{
-+	struct unix_sock *u, *u_other;
-+	int rc;
-+
-+	u = unix_sk(sk);
-+	u_other = unix_sk(other);
-+	rc = 0;
-+	spin_lock(&u_other->peer_wait.lock);
-+
-+	if (!u->peer_wake.private) {
-+		u->peer_wake.private = other;
-+		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);
-+
-+		rc = 1;
-+	}
-+
-+	spin_unlock(&u_other->peer_wait.lock);
-+	return rc;
-+}
-+
-+static void unix_dgram_peer_wake_disconnect(struct sock *sk,
-+					    struct sock *other)
-+{
-+	struct unix_sock *u, *u_other;
-+
-+	u = unix_sk(sk);
-+	u_other = unix_sk(other);
-+	spin_lock(&u_other->peer_wait.lock);
-+
-+	if (u->peer_wake.private == other) {
-+		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
-+		u->peer_wake.private = NULL;
-+	}
-+
-+	spin_unlock(&u_other->peer_wait.lock);
-+}
-+
-+static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
-+						   struct sock *other)
-+{
-+	unix_dgram_peer_wake_disconnect(sk, other);
-+	wake_up_interruptible_poll(sk_sleep(sk),
-+				   POLLOUT |
-+				   POLLWRNORM |
-+				   POLLWRBAND);
-+}
-+
-+/* preconditions:
-+ *	- unix_peer(sk) == other
-+ *	- association is stable
-+ */
-+static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
-+{
-+	int connected;
-+
-+	connected = unix_dgram_peer_wake_connect(sk, other);
-+
-+	if (unix_recvq_full(other))
-+		return 1;
-+
-+	if (connected)
-+		unix_dgram_peer_wake_disconnect(sk, other);
-+
-+	return 0;
-+}
-+
- static inline int unix_writable(struct sock *sk)
- {
- 	return (atomic_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
-@@ -430,6 +542,8 @@ static void unix_release_sock(struct sock *sk, int embrion)
- 			skpair->sk_state_change(skpair);
- 			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
- 		}
-+
-+		unix_dgram_peer_wake_disconnect(sk, skpair);
- 		sock_put(skpair); /* It may now die */
- 		unix_peer(sk) = NULL;
- 	}
-@@ -665,6 +779,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern)
- 	INIT_LIST_HEAD(&u->link);
- 	mutex_init(&u->readlock); /* single task reading lock */
- 	init_waitqueue_head(&u->peer_wait);
-+	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
- 	unix_insert_socket(unix_sockets_unbound(sk), sk);
- out:
- 	if (sk == NULL)
-@@ -1032,6 +1147,8 @@ restart:
- 	if (unix_peer(sk)) {
- 		struct sock *old_peer = unix_peer(sk);
- 		unix_peer(sk) = other;
-+		unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
-+
- 		unix_state_double_unlock(sk, other);
- 
- 		if (other != old_peer)
-@@ -1471,6 +1588,7 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
- 	struct scm_cookie scm;
- 	int max_level;
- 	int data_len = 0;
-+	int sk_locked;
- 
- 	wait_for_unix_gc();
- 	err = scm_send(sock, msg, &scm, false);
-@@ -1549,12 +1667,14 @@ restart:
- 		goto out_free;
- 	}
- 
-+	sk_locked = 0;
- 	unix_state_lock(other);
-+restart_locked:
- 	err = -EPERM;
- 	if (!unix_may_send(sk, other))
- 		goto out_unlock;
- 
-	if (sock_flag(other, SOCK_DEAD)) {
-+	if (unlikely(sock_flag(other, SOCK_DEAD))) {
- 		/*
- 		 *	Check with 1003.1g - what should
- 		 *	datagram error
-@@ -1562,10 +1682,14 @@ restart:
- 		unix_state_unlock(other);
- 		sock_put(other);
- 
-+		if (!sk_locked)
-+			unix_state_lock(sk);
-+
- 		err = 0;
-		unix_state_lock(sk);
- 		if (unix_peer(sk) == other) {
- 			unix_peer(sk) = NULL;
-+			unix_dgram_peer_wake_disconnect_wakeup(sk, other);
-+
- 			unix_state_unlock(sk);
- 
- 			unix_dgram_disconnected(sk, other);
-@@ -1591,21 +1715,38 @@ restart:
- 			goto out_unlock;
- 	}
- 
-	if (unix_peer(other) != sk && unix_recvq_full(other)) {
-		if (!timeo) {
-			err = -EAGAIN;
-			goto out_unlock;
-+	if (unlikely(unix_peer(other) != sk && unix_recvq_full(other))) {
-+		if (timeo) {
-+			timeo = unix_wait_for_peer(other, timeo);
-+
-+			err = sock_intr_errno(timeo);
-+			if (signal_pending(current))
-+				goto out_free;
-+
-+			goto restart;
- 		}
- 
-		timeo = unix_wait_for_peer(other, timeo);
-+		if (!sk_locked) {
-+			unix_state_unlock(other);
-+			unix_state_double_lock(sk, other);
-+		}
- 
-		err = sock_intr_errno(timeo);
-		if (signal_pending(current))
-			goto out_free;
-+		if (unix_peer(sk) != other ||
-+		    unix_dgram_peer_wake_me(sk, other)) {
-+			err = -EAGAIN;
-+			sk_locked = 1;
-+			goto out_unlock;
-+		}
- 
-		goto restart;
-+		if (!sk_locked) {
-+			sk_locked = 1;
-+			goto restart_locked;
-+		}
- 	}
- 
-+	if (unlikely(sk_locked))
-+		unix_state_unlock(sk);
-+
- 	if (sock_flag(other, SOCK_RCVTSTAMP))
- 		__net_timestamp(skb);
- 	maybe_add_creds(skb, sock, other);
-@@ -1619,6 +1760,8 @@ restart:
- 	return len;
- 
- out_unlock:
-+	if (sk_locked)
-+		unix_state_unlock(sk);
- 	unix_state_unlock(other);
- out_free:
- 	kfree_skb(skb);
-@@ -2475,14 +2618,16 @@ static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
- 		return mask;
- 
- 	writable = unix_writable(sk);
-	other = unix_peer_get(sk);
-	if (other) {
-		if (unix_peer(other) != sk) {
-			sock_poll_wait(file, &unix_sk(other)->peer_wait, wait);
-			if (unix_recvq_full(other))
-				writable = 0;
-		}
-		sock_put(other);
-+	if (writable) {
-+		unix_state_lock(sk);
-+
-+		other = unix_peer(sk);
-+		if (other && unix_peer(other) != sk &&
-+		    unix_recvq_full(other) &&
-+		    unix_dgram_peer_wake_me(sk, other))
-+			writable = 0;
-+
-+		unix_state_unlock(sk);
- 	}
- 
- 	if (writable)
-- 
-2.1.0
-