From 47059d930f0e002ff851beea87d738146804726d Mon Sep 17 00:00:00 2001
From: Wang Shilong <wangsl.fnst@cn.fujitsu.com>
Date: Thu, 3 Jul 2014 18:22:07 +0800
Subject: Btrfs: make defragment work with nodatacow option

Btrfs defragment will utilize COW feature, which means this
did not work for nodatacow option, this problem was detected
by xfstests generic/018 with nodatacow mount option.

Fix this problem by forcing cow for a extent with state
@EXTETN_DEFRAG setting.

Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com>
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/btrfs_inode.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs/btrfs/btrfs_inode.h')

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 43527fd78825..fd879418fd42 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -120,6 +120,12 @@ struct btrfs_inode {
 	 */
 	u64 delalloc_bytes;
 
+	/*
+	 * total number of bytes pending defrag, used by stat to check whether
+	 * it needs COW.
+	 */
+	u64 defrag_bytes;
+
 	/*
 	 * the size of the file stored in the metadata on disk.  data=ordered
 	 * means the in-memory i_size might be larger than the size on disk
-- 
cgit v1.2.1


From 23ea8e5a07673127d05cb5cf6f9914d7a53e0847 Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Fri, 12 Sep 2014 18:43:54 +0800
Subject: Btrfs: load checksum data once when submitting a direct read io

The current code would load checksum data for several times when we split
a whole direct read io because of the limit of the raid stripe, it would
make us search the csum tree for several times. In fact, it just wasted time,
and made the contention of the csum tree root be more serious. This patch
improves this problem by loading the data at once.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/btrfs_inode.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs/btrfs/btrfs_inode.h')

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index fd879418fd42..8bea70e02a3d 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -263,7 +263,6 @@ struct btrfs_dio_private {
 
 	/* dio_bio came from fs/direct-io.c */
 	struct bio *dio_bio;
-	u8 csum[0];
 };
 
 /*
-- 
cgit v1.2.1


From c1dc08967f69c6b5067f8302c600f6628123f3bf Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Fri, 12 Sep 2014 18:43:56 +0800
Subject: Btrfs: do file data check by sub-bio's self

Direct IO splits the original bio to several sub-bios because of the limit of
raid stripe, and the filesystem will wait for all sub-bios and then run final
end io process.

But it was very hard to implement the data repair when dio read failure happens,
because at the final end io function, we didn't know which mirror the data was
read from. So in order to implement the data repair, we have to move the file data
check in the final end io function to the sub-bio end io function, in which we can
get the mirror number of the device we access. This patch did this work as the
first step of the direct io data repair implementation.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/btrfs_inode.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'fs/btrfs/btrfs_inode.h')

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 8bea70e02a3d..4d309471294e 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -245,8 +245,11 @@ static inline int btrfs_inode_in_log(struct inode *inode, u64 generation)
 	return 0;
 }
 
+#define BTRFS_DIO_ORIG_BIO_SUBMITTED	0x1
+
 struct btrfs_dio_private {
 	struct inode *inode;
+	unsigned long flags;
 	u64 logical_offset;
 	u64 disk_bytenr;
 	u64 bytes;
@@ -263,6 +266,12 @@ struct btrfs_dio_private {
 
 	/* dio_bio came from fs/direct-io.c */
 	struct bio *dio_bio;
+
+	/*
+	 * The original bio may be splited to several sub-bios, this is
+	 * done during endio of sub-bios
+	 */
+	int (*subio_endio)(struct inode *, struct btrfs_io_bio *);
 };
 
 /*
-- 
cgit v1.2.1


From 8b110e393c5a6e72d50fcdf9fa7ed8b647cfdfc9 Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Fri, 12 Sep 2014 18:44:03 +0800
Subject: Btrfs: implement repair function when direct read fails

This patch implement data repair function when direct read fails.

The detail of the implementation is:
- When we find the data is not right, we try to read the data from the other
  mirror.
- When the io on the mirror ends, we will insert the endio work into the
  dedicated btrfs workqueue, not common read endio workqueue, because the
  original endio work is still blocked in the btrfs endio workqueue, if we
  insert the endio work of the io on the mirror into that workqueue, deadlock
  would happen.
- After we get right data, we write it back to the corrupted mirror.
- And if the data on the new mirror is still corrupted, we will try next
  mirror until we read right data or all the mirrors are traversed.
- After the above work, we set the uptodate flag according to the result.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/btrfs_inode.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/btrfs/btrfs_inode.h')

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 4d309471294e..7a7521c87c88 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -271,7 +271,7 @@ struct btrfs_dio_private {
 	 * The original bio may be splited to several sub-bios, this is
 	 * done during endio of sub-bios
 	 */
-	int (*subio_endio)(struct inode *, struct btrfs_io_bio *);
+	int (*subio_endio)(struct inode *, struct btrfs_io_bio *, int);
 };
 
 /*
-- 
cgit v1.2.1


From 656f30dba7ab8179c9a2e04293b0c7b383fa9ce9 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Fri, 26 Sep 2014 12:25:56 +0100
Subject: Btrfs: be aware of btree inode write errors to avoid fs corruption

While we have a transaction ongoing, the VM might decide at any time
to call btree_inode->i_mapping->a_ops->writepages(), which will start
writeback of dirty pages belonging to btree nodes/leafs. This call
might return an error or the writeback might finish with an error
before we attempt to commit the running transaction. If this happens,
we might have no way of knowing that such error happened when we are
committing the transaction - because the pages might no longer be
marked dirty nor tagged for writeback (if a subsequent modification
to the extent buffer didn't happen before the transaction commit) which
makes filemap_fdata[write|wait]_range unable to find such pages (even
if they're marked with SetPageError).
So if this happens we must abort the transaction, otherwise we commit
a super block with btree roots that point to btree nodes/leafs whose
content on disk is invalid - either garbage or the content of some
node/leaf from a past generation that got cowed or deleted and is no
longer valid (for this later case we end up getting error messages like
"parent transid verify failed on 10826481664 wanted 25748 found 29562"
when reading btree nodes/leafs from disk).

Note that setting and checking AS_EIO/AS_ENOSPC in the btree inode's
i_mapping would not be enough because we need to distinguish between
log tree extents (not fatal) vs non-log tree extents (fatal) and
because the next call to filemap_fdatawait_range() will catch and clear
such errors in the mapping - and that call might be from a log sync and
not from a transaction commit, which means we would not know about the
error at transaction commit time. Also, checking for the eb flag
EXTENT_BUFFER_IOERR at transaction commit time isn't done and would
not be completely reliable, as the eb might be removed from memory and
read back when trying to get it, which clears that flag right before
reading the eb's pages from disk, making us not know about the previous
write error.

Using the new 3 flags for the btree inode also makes us achieve the
goal of AS_EIO/AS_ENOSPC when writepages() returns success, started
writeback for all dirty pages and before filemap_fdatawait_range() is
called, the writeback for all dirty pages had already finished with
errors - because we were not using AS_EIO/AS_ENOSPC,
filemap_fdatawait_range() would return success, as it could not know
that writeback errors happened (the pages were no longer tagged for
writeback).

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/btrfs_inode.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'fs/btrfs/btrfs_inode.h')

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 7a7521c87c88..8a42adb4e5ed 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -44,6 +44,17 @@
 #define BTRFS_INODE_IN_DELALLOC_LIST		9
 #define BTRFS_INODE_READDIO_NEED_LOCK		10
 #define BTRFS_INODE_HAS_PROPS		        11
+/*
+ * The following 3 bits are meant only for the btree inode.
+ * When any of them is set, it means an error happened while writing an
+ * extent buffer belonging to:
+ * 1) a non-log btree
+ * 2) a log btree and first log sub-transaction
+ * 3) a log btree and second log sub-transaction
+ */
+#define BTRFS_INODE_BTREE_ERR		        12
+#define BTRFS_INODE_BTREE_LOG1_ERR		13
+#define BTRFS_INODE_BTREE_LOG2_ERR		14
 
 /* in memory btrfs inode */
 struct btrfs_inode {
-- 
cgit v1.2.1