9db63cb5df
gluster 4K block size fixes (bz #1737256)
370 lines
15 KiB
Diff
370 lines
15 KiB
Diff
From: Nir Soffer <nirsof@gmail.com>
|
|
Date: Tue, 27 Aug 2019 04:05:27 +0300
|
|
Subject: [PATCH] block: posix: Always allocate the first block
|
|
|
|
When creating an image with preallocation "off" or "falloc", the first
|
|
block of the image is typically not allocated. When using Gluster
|
|
storage backed by XFS filesystem, reading this block using direct I/O
|
|
succeeds regardless of request length, fooling alignment detection.
|
|
|
|
In this case we fall back to a safe value (4096) instead of the optimal
|
|
value (512), which may lead to unneeded data copying when aligning
|
|
requests. Allocating the first block avoids the fallback.
|
|
|
|
Since we allocate the first block even with preallocation=off, we no
|
|
longer create images with zero disk size:
|
|
|
|
$ ./qemu-img create -f raw test.raw 1g
|
|
Formatting 'test.raw', fmt=raw size=1073741824
|
|
|
|
$ ls -lhs test.raw
|
|
4.0K -rw-r--r--. 1 nsoffer nsoffer 1.0G Aug 16 23:48 test.raw
|
|
|
|
And converting the image requires an additional cluster:
|
|
|
|
$ ./qemu-img measure -f raw -O qcow2 test.raw
|
|
required size: 458752
|
|
fully allocated size: 1074135040
|
|
|
|
When using a format like vmdk with multiple files per image, we allocate
|
|
one block per file:
|
|
|
|
$ ./qemu-img create -f vmdk -o subformat=twoGbMaxExtentFlat test.vmdk 4g
|
|
Formatting 'test.vmdk', fmt=vmdk size=4294967296 compat6=off hwversion=undefined subformat=twoGbMaxExtentFlat
|
|
|
|
$ ls -lhs test*.vmdk
|
|
4.0K -rw-r--r--. 1 nsoffer nsoffer 2.0G Aug 27 03:23 test-f001.vmdk
|
|
4.0K -rw-r--r--. 1 nsoffer nsoffer 2.0G Aug 27 03:23 test-f002.vmdk
|
|
4.0K -rw-r--r--. 1 nsoffer nsoffer 353 Aug 27 03:23 test.vmdk
|
|
|
|
I did a quick performance test for copying disks with qemu-img convert to
|
|
a new raw target image on Gluster storage with a sector size of 512 bytes:
|
|
|
|
for i in $(seq 10); do
|
|
rm -f dst.raw
|
|
sleep 10
|
|
time ./qemu-img convert -f raw -O raw -t none -T none src.raw dst.raw
|
|
done
|
|
|
|
Here is a table comparing the total time spent:
|
|
|
|
Type Before(s) After(s) Diff(%)
|
|
---------------------------------------
|
|
real 530.028 469.123 -11.4
|
|
user 17.204 10.768 -37.4
|
|
sys 17.881 7.011 -60.7
|
|
|
|
We can see a very clear improvement in CPU usage.
|
|
|
|
Signed-off-by: Nir Soffer <nsoffer@redhat.com>
|
|
Message-id: 20190827010528.8818-2-nsoffer@redhat.com
|
|
Reviewed-by: Max Reitz <mreitz@redhat.com>
|
|
Signed-off-by: Max Reitz <mreitz@redhat.com>
|
|
(cherry picked from commit 3a20013fbb26d2a1bd11ef148eefdb1508783787)
|
|
---
|
|
block/file-posix.c | 51 ++++++++++++++++++++++++++++++++
|
|
tests/qemu-iotests/059.out | 2 +-
|
|
tests/qemu-iotests/150.out | 11 -------
|
|
tests/qemu-iotests/150.out.qcow2 | 11 +++++++
|
|
tests/qemu-iotests/150.out.raw | 12 ++++++++
|
|
tests/qemu-iotests/175 | 19 ++++++++----
|
|
tests/qemu-iotests/175.out | 8 ++---
|
|
tests/qemu-iotests/178.out.qcow2 | 4 +--
|
|
tests/qemu-iotests/221.out | 12 +++++---
|
|
tests/qemu-iotests/253.out | 12 +++++---
|
|
10 files changed, 110 insertions(+), 32 deletions(-)
|
|
delete mode 100644 tests/qemu-iotests/150.out
|
|
create mode 100644 tests/qemu-iotests/150.out.qcow2
|
|
create mode 100644 tests/qemu-iotests/150.out.raw
|
|
|
|
diff --git a/block/file-posix.c b/block/file-posix.c
|
|
index b8b4dad553..8ea98896ce 100644
|
|
--- a/block/file-posix.c
|
|
+++ b/block/file-posix.c
|
|
@@ -1749,6 +1749,43 @@ static int handle_aiocb_discard(void *opaque)
|
|
return ret;
|
|
}
|
|
|
|
+/*
|
|
+ * Help alignment probing by allocating the first block.
|
|
+ *
|
|
+ * When reading with direct I/O from unallocated area on Gluster backed by XFS,
|
|
+ * reading succeeds regardless of request length. In this case we fallback to
|
|
+ * safe alignment which is not optimal. Allocating the first block avoids this
|
|
+ * fallback.
|
|
+ *
|
|
+ * fd may be opened with O_DIRECT, but we don't know the buffer alignment or
|
|
+ * request alignment, so we use safe values.
|
|
+ *
|
|
+ * Returns: 0 on success, -errno on failure. Since this is an optimization,
|
|
+ * caller may ignore failures.
|
|
+ */
|
|
+static int allocate_first_block(int fd, size_t max_size)
|
|
+{
|
|
+ size_t write_size = (max_size < MAX_BLOCKSIZE)
|
|
+ ? BDRV_SECTOR_SIZE
|
|
+ : MAX_BLOCKSIZE;
|
|
+ size_t max_align = MAX(MAX_BLOCKSIZE, getpagesize());
|
|
+ void *buf;
|
|
+ ssize_t n;
|
|
+ int ret;
|
|
+
|
|
+ buf = qemu_memalign(max_align, write_size);
|
|
+ memset(buf, 0, write_size);
|
|
+
|
|
+ do {
|
|
+ n = pwrite(fd, buf, write_size, 0);
|
|
+ } while (n == -1 && errno == EINTR);
|
|
+
|
|
+ ret = (n == -1) ? -errno : 0;
|
|
+
|
|
+ qemu_vfree(buf);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
static int handle_aiocb_truncate(void *opaque)
|
|
{
|
|
RawPosixAIOData *aiocb = opaque;
|
|
@@ -1788,6 +1825,17 @@ static int handle_aiocb_truncate(void *opaque)
|
|
/* posix_fallocate() doesn't set errno. */
|
|
error_setg_errno(errp, -result,
|
|
"Could not preallocate new data");
|
|
+ } else if (current_length == 0) {
|
|
+ /*
|
|
+ * posix_fallocate() uses fallocate() if the filesystem
|
|
+ * supports it, or fallback to manually writing zeroes. If
|
|
+ * fallocate() was used, unaligned reads from the fallocated
|
|
+ * area in raw_probe_alignment() will succeed, hence we need to
|
|
+ * allocate the first block.
|
|
+ *
|
|
+ * Optimize future alignment probing; ignore failures.
|
|
+ */
|
|
+ allocate_first_block(fd, offset);
|
|
}
|
|
} else {
|
|
result = 0;
|
|
@@ -1849,6 +1897,9 @@ static int handle_aiocb_truncate(void *opaque)
|
|
if (ftruncate(fd, offset) != 0) {
|
|
result = -errno;
|
|
error_setg_errno(errp, -result, "Could not resize file");
|
|
+ } else if (current_length == 0 && offset > current_length) {
|
|
+ /* Optimize future alignment probing; ignore failures. */
|
|
+ allocate_first_block(fd, offset);
|
|
}
|
|
return result;
|
|
default:
|
|
diff --git a/tests/qemu-iotests/059.out b/tests/qemu-iotests/059.out
|
|
index 4fab42a28c..fe3f861f3c 100644
|
|
--- a/tests/qemu-iotests/059.out
|
|
+++ b/tests/qemu-iotests/059.out
|
|
@@ -27,7 +27,7 @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824000 subformat=twoGbMax
|
|
image: TEST_DIR/t.vmdk
|
|
file format: vmdk
|
|
virtual size: 0.977 TiB (1073741824000 bytes)
|
|
-disk size: 16 KiB
|
|
+disk size: 1.97 MiB
|
|
Format specific information:
|
|
cid: XXXXXXXX
|
|
parent cid: XXXXXXXX
|
|
diff --git a/tests/qemu-iotests/150.out b/tests/qemu-iotests/150.out
|
|
deleted file mode 100644
|
|
index 2a54e8dcfa..0000000000
|
|
--- a/tests/qemu-iotests/150.out
|
|
+++ /dev/null
|
|
@@ -1,11 +0,0 @@
|
|
-QA output created by 150
|
|
-
|
|
-=== Mapping sparse conversion ===
|
|
-
|
|
-Offset Length File
|
|
-
|
|
-=== Mapping non-sparse conversion ===
|
|
-
|
|
-Offset Length File
|
|
-0 0x100000 TEST_DIR/t.IMGFMT
|
|
-*** done
|
|
diff --git a/tests/qemu-iotests/150.out.qcow2 b/tests/qemu-iotests/150.out.qcow2
|
|
new file mode 100644
|
|
index 0000000000..2a54e8dcfa
|
|
--- /dev/null
|
|
+++ b/tests/qemu-iotests/150.out.qcow2
|
|
@@ -0,0 +1,11 @@
|
|
+QA output created by 150
|
|
+
|
|
+=== Mapping sparse conversion ===
|
|
+
|
|
+Offset Length File
|
|
+
|
|
+=== Mapping non-sparse conversion ===
|
|
+
|
|
+Offset Length File
|
|
+0 0x100000 TEST_DIR/t.IMGFMT
|
|
+*** done
|
|
diff --git a/tests/qemu-iotests/150.out.raw b/tests/qemu-iotests/150.out.raw
|
|
new file mode 100644
|
|
index 0000000000..3cdc7727a5
|
|
--- /dev/null
|
|
+++ b/tests/qemu-iotests/150.out.raw
|
|
@@ -0,0 +1,12 @@
|
|
+QA output created by 150
|
|
+
|
|
+=== Mapping sparse conversion ===
|
|
+
|
|
+Offset Length File
|
|
+0 0x1000 TEST_DIR/t.IMGFMT
|
|
+
|
|
+=== Mapping non-sparse conversion ===
|
|
+
|
|
+Offset Length File
|
|
+0 0x100000 TEST_DIR/t.IMGFMT
|
|
+*** done
|
|
diff --git a/tests/qemu-iotests/175 b/tests/qemu-iotests/175
|
|
index 51e62c8276..7ba28b3c1b 100755
|
|
--- a/tests/qemu-iotests/175
|
|
+++ b/tests/qemu-iotests/175
|
|
@@ -37,14 +37,16 @@ trap "_cleanup; exit \$status" 0 1 2 3 15
|
|
# the file size. This function hides the resulting difference in the
|
|
# stat -c '%b' output.
|
|
# Parameter 1: Number of blocks an empty file occupies
|
|
-# Parameter 2: Image size in bytes
|
|
+# Parameter 2: Minimal number of blocks in an image
|
|
+# Parameter 3: Image size in bytes
|
|
_filter_blocks()
|
|
{
|
|
extra_blocks=$1
|
|
- img_size=$2
|
|
+ min_blocks=$2
|
|
+ img_size=$3
|
|
|
|
- sed -e "s/blocks=$extra_blocks\\(\$\\|[^0-9]\\)/nothing allocated/" \
|
|
- -e "s/blocks=$((extra_blocks + img_size / 512))\\(\$\\|[^0-9]\\)/everything allocated/"
|
|
+ sed -e "s/blocks=$min_blocks\\(\$\\|[^0-9]\\)/min allocation/" \
|
|
+ -e "s/blocks=$((extra_blocks + img_size / 512))\\(\$\\|[^0-9]\\)/max allocation/"
|
|
}
|
|
|
|
# get standard environment, filters and checks
|
|
@@ -60,16 +62,21 @@ size=$((1 * 1024 * 1024))
|
|
touch "$TEST_DIR/empty"
|
|
extra_blocks=$(stat -c '%b' "$TEST_DIR/empty")
|
|
|
|
+# We always write the first byte; check how many blocks this filesystem
|
|
+# allocates to match empty image allocation.
|
|
+printf "\0" > "$TEST_DIR/empty"
|
|
+min_blocks=$(stat -c '%b' "$TEST_DIR/empty")
|
|
+
|
|
echo
|
|
echo "== creating image with default preallocation =="
|
|
_make_test_img $size | _filter_imgfmt
|
|
-stat -c "size=%s, blocks=%b" $TEST_IMG | _filter_blocks $extra_blocks $size
|
|
+stat -c "size=%s, blocks=%b" $TEST_IMG | _filter_blocks $extra_blocks $min_blocks $size
|
|
|
|
for mode in off full falloc; do
|
|
echo
|
|
echo "== creating image with preallocation $mode =="
|
|
IMGOPTS=preallocation=$mode _make_test_img $size | _filter_imgfmt
|
|
- stat -c "size=%s, blocks=%b" $TEST_IMG | _filter_blocks $extra_blocks $size
|
|
+ stat -c "size=%s, blocks=%b" $TEST_IMG | _filter_blocks $extra_blocks $min_blocks $size
|
|
done
|
|
|
|
# success, all done
|
|
diff --git a/tests/qemu-iotests/175.out b/tests/qemu-iotests/175.out
|
|
index 6d9a5ed84e..263e521262 100644
|
|
--- a/tests/qemu-iotests/175.out
|
|
+++ b/tests/qemu-iotests/175.out
|
|
@@ -2,17 +2,17 @@ QA output created by 175
|
|
|
|
== creating image with default preallocation ==
|
|
Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576
|
|
-size=1048576, nothing allocated
|
|
+size=1048576, min allocation
|
|
|
|
== creating image with preallocation off ==
|
|
Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 preallocation=off
|
|
-size=1048576, nothing allocated
|
|
+size=1048576, min allocation
|
|
|
|
== creating image with preallocation full ==
|
|
Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 preallocation=full
|
|
-size=1048576, everything allocated
|
|
+size=1048576, max allocation
|
|
|
|
== creating image with preallocation falloc ==
|
|
Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 preallocation=falloc
|
|
-size=1048576, everything allocated
|
|
+size=1048576, max allocation
|
|
*** done
|
|
diff --git a/tests/qemu-iotests/178.out.qcow2 b/tests/qemu-iotests/178.out.qcow2
|
|
index 55a8dc926f..9e7d8c44df 100644
|
|
--- a/tests/qemu-iotests/178.out.qcow2
|
|
+++ b/tests/qemu-iotests/178.out.qcow2
|
|
@@ -101,7 +101,7 @@ converted image file size in bytes: 196608
|
|
== raw input image with data (human) ==
|
|
|
|
Formatting 'TEST_DIR/t.qcow2', fmt=IMGFMT size=1073741824
|
|
-required size: 393216
|
|
+required size: 458752
|
|
fully allocated size: 1074135040
|
|
wrote 512/512 bytes at offset 512
|
|
512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
|
|
@@ -257,7 +257,7 @@ converted image file size in bytes: 196608
|
|
|
|
Formatting 'TEST_DIR/t.qcow2', fmt=IMGFMT size=1073741824
|
|
{
|
|
- "required": 393216,
|
|
+ "required": 458752,
|
|
"fully-allocated": 1074135040
|
|
}
|
|
wrote 512/512 bytes at offset 512
|
|
diff --git a/tests/qemu-iotests/221.out b/tests/qemu-iotests/221.out
|
|
index 9f9dd52bb0..dca024a0c3 100644
|
|
--- a/tests/qemu-iotests/221.out
|
|
+++ b/tests/qemu-iotests/221.out
|
|
@@ -3,14 +3,18 @@ QA output created by 221
|
|
=== Check mapping of unaligned raw image ===
|
|
|
|
Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=65537
|
|
-[{ "start": 0, "length": 66048, "depth": 0, "zero": true, "data": false, "offset": OFFSET}]
|
|
-[{ "start": 0, "length": 66048, "depth": 0, "zero": true, "data": false, "offset": OFFSET}]
|
|
+[{ "start": 0, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": OFFSET},
|
|
+{ "start": 4096, "length": 61952, "depth": 0, "zero": true, "data": false, "offset": OFFSET}]
|
|
+[{ "start": 0, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": OFFSET},
|
|
+{ "start": 4096, "length": 61952, "depth": 0, "zero": true, "data": false, "offset": OFFSET}]
|
|
wrote 1/1 bytes at offset 65536
|
|
1 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
|
|
-[{ "start": 0, "length": 65536, "depth": 0, "zero": true, "data": false, "offset": OFFSET},
|
|
+[{ "start": 0, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": OFFSET},
|
|
+{ "start": 4096, "length": 61440, "depth": 0, "zero": true, "data": false, "offset": OFFSET},
|
|
{ "start": 65536, "length": 1, "depth": 0, "zero": false, "data": true, "offset": OFFSET},
|
|
{ "start": 65537, "length": 511, "depth": 0, "zero": true, "data": false, "offset": OFFSET}]
|
|
-[{ "start": 0, "length": 65536, "depth": 0, "zero": true, "data": false, "offset": OFFSET},
|
|
+[{ "start": 0, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": OFFSET},
|
|
+{ "start": 4096, "length": 61440, "depth": 0, "zero": true, "data": false, "offset": OFFSET},
|
|
{ "start": 65536, "length": 1, "depth": 0, "zero": false, "data": true, "offset": OFFSET},
|
|
{ "start": 65537, "length": 511, "depth": 0, "zero": true, "data": false, "offset": OFFSET}]
|
|
*** done
|
|
diff --git a/tests/qemu-iotests/253.out b/tests/qemu-iotests/253.out
|
|
index 607c0baa0b..3d08b305d7 100644
|
|
--- a/tests/qemu-iotests/253.out
|
|
+++ b/tests/qemu-iotests/253.out
|
|
@@ -3,12 +3,16 @@ QA output created by 253
|
|
=== Check mapping of unaligned raw image ===
|
|
|
|
Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048575
|
|
-[{ "start": 0, "length": 1048576, "depth": 0, "zero": true, "data": false, "offset": OFFSET}]
|
|
-[{ "start": 0, "length": 1048576, "depth": 0, "zero": true, "data": false, "offset": OFFSET}]
|
|
+[{ "start": 0, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": OFFSET},
|
|
+{ "start": 4096, "length": 1044480, "depth": 0, "zero": true, "data": false, "offset": OFFSET}]
|
|
+[{ "start": 0, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": OFFSET},
|
|
+{ "start": 4096, "length": 1044480, "depth": 0, "zero": true, "data": false, "offset": OFFSET}]
|
|
wrote 65535/65535 bytes at offset 983040
|
|
63.999 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
|
|
-[{ "start": 0, "length": 983040, "depth": 0, "zero": true, "data": false, "offset": OFFSET},
|
|
+[{ "start": 0, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": OFFSET},
|
|
+{ "start": 4096, "length": 978944, "depth": 0, "zero": true, "data": false, "offset": OFFSET},
|
|
{ "start": 983040, "length": 65536, "depth": 0, "zero": false, "data": true, "offset": OFFSET}]
|
|
-[{ "start": 0, "length": 983040, "depth": 0, "zero": true, "data": false, "offset": OFFSET},
|
|
+[{ "start": 0, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": OFFSET},
|
|
+{ "start": 4096, "length": 978944, "depth": 0, "zero": true, "data": false, "offset": OFFSET},
|
|
{ "start": 983040, "length": 65536, "depth": 0, "zero": false, "data": true, "offset": OFFSET}]
|
|
*** done
|