All of lore.kernel.org
 help / color / mirror / Atom feed
From: Timofey Titovets <nefelim4ag@gmail.com>
To: linux-btrfs@vger.kernel.org
Subject: [RFC PATCH V2] btrfs/ioctl.c: extent_same - Use inode as src which, close to disk beginning
Date: Wed, 21 Oct 2015 04:11:22 +0300	[thread overview]
Message-ID: <5626E63A.2090608@gmail.com> (raw)

[-- Attachment #1: Type: text/plain, Size: 6681 bytes --]

It's just a proof of concept, and i hope to see feedback/ideas/review 
about it.
---
While deduplication,
Btrfs produce extent and file fragmentation
But it's can be optimized by compute - which inode data placed a closest 
to beginning of hdd
It's allow to:
1. Performance boost on hdd (beginning of disk faster then end)
2. Make sparse only on tail of fs, what can give boost later for
balancing and resizing operations

New function:
static u64 btrfs_avg_disko(struct inode *inode,
                         const u64 off, const u64 olen_aligned);

It normalize offsets with data lengths, by represent it like offsets of 
blocks
It return average data offset of all "pagesized" blocks in given range 
for inode
Function cloned from btrfs_clone()

Changes from V1:
         Added new function which compute "normal" offset

Signed-off-by: Timofey Titovets <nefelim4ag@gmail.com>
---
  fs/btrfs/ioctl.c | 147 
++++++++++++++++++++++++++++++++++++++++++++++++++++++-
  1 file changed, 145 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 3e3e613..17e5313 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -86,6 +86,9 @@ struct btrfs_ioctl_received_subvol_args_32 {
  #endif


+static u64 btrfs_avg_disko(struct inode *inode,
+                        const u64 off, const u64 olen_aligned);
+
  static int btrfs_clone(struct inode *src, struct inode *inode,
                 u64 off, u64 olen, u64 olen_aligned, u64 destoff,
                 int no_time_update);
@@ -3074,8 +3077,20 @@ static int btrfs_extent_same(struct inode *src, 
u64 loff, u64 olen,

      /* pass original length for comparison so we stay within i_size */
      ret = btrfs_cmp_data(src, loff, dst, dst_loff, olen, &cmp);
-    if (ret == 0)
-        ret = btrfs_clone(src, dst, loff, olen, len, dst_loff, 1);
+    if (ret == 0) {
+        /* prefer inode with lowest offset as source for clone*/
+        u64 src_weight;
+        u64 dst_weight;
+        src_weight = btrfs_avg_disko(src, off, olen);
+        dst_weight = btrfs_avg_disko(dest, dst_loff, olen);
+        /* if one of weight == 0 -> fallback */
+        if (dest_weight == 0)
+            src_weight = 0;
+        if (src_weight > dest_weight)
+            ret = btrfs_clone(dst, src, dst_loff, olen, len, loff, 1);
+        else
+            ret = btrfs_clone(src, dst, loff, olen, len, dst_loff, 1);
+    }

      if (same_inode)
          unlock_extent(&BTRFS_I(src)->io_tree, same_lock_start,
@@ -3329,6 +3344,134 @@ static void clone_update_extent_map(struct inode 
*inode,
  }

  /**
+ * btrfs_avg_disko() - return avg data offset weight for inode
+ *
+ * @inode: Inode
+ * @off: Offset for computing
+ * @olen_aligned: Block-aligned len of data
+ *
+ * Computing avg address place of data, allow to heuristically
+ * determine where on the disk placed most fragment of data
+ */
+static u64 btrfs_avg_disko(struct inode *inode,
+                    const u64 off, const u64 olen_aligned)
+{
+    struct btrfs_root *root = BTRFS_I(inode)->root;
+    struct btrfs_path *path = NULL;
+    struct extent_buffer *leaf;
+    char *buf = NULL;
+    struct btrfs_key key;
+    u32 nritems;
+    int slot;
+    int no_quota;
+    double sum = 0;
+    u64 ret = 0;
+    u64 counter = 0;
+
+    buf = vmalloc(root->nodesize);
+    if (!buf)
+        return ret;
+
+    path = btrfs_alloc_path();
+    if (!path) {
+        vfree(buf);
+        return ret;
+    }
+
+    path->reada = 2;
+    /* clone data */
+    key.objectid = btrfs_ino(inode);
+    key.type = BTRFS_EXTENT_DATA_KEY;
+    key.offset = off;
+
+    while (1) {
+        u64 next_key_min_offset = key.offset + 1;
+
+        /*
+         * note the key will change type as we walk through the
+         * tree.
+         */
+        path->leave_spinning = 1;
+        ret = btrfs_search_slot(NULL, BTRFS_I(inode)->root, &key, 
path,    0, 0);
+        if (ret < 0)
+            goto out;
+        /*
+         * First search, if no extent item that starts at offset off was
+         * found but the previous item is an extent item, it's possible
+         * it might overlap our target range, therefore process it.
+         */
+        if (key.offset == off && ret > 0 && path->slots[0] > 0) {
+            btrfs_item_key_to_cpu(path->nodes[0], &key,
+                          path->slots[0] - 1);
+            if (key.type == BTRFS_EXTENT_DATA_KEY)
+                path->slots[0]--;
+        }
+
+        nritems = btrfs_header_nritems(path->nodes[0]);
+process_slot:
+        no_quota = 1;
+        if (path->slots[0] >= nritems) {
+            ret = btrfs_next_leaf(BTRFS_I(inode)->root, path);
+            if (ret < 0)
+                goto out;
+            if (ret > 0)
+                break;
+            nritems = btrfs_header_nritems(path->nodes[0]);
+        }
+        leaf = path->nodes[0];
+        slot = path->slots[0];
+
+        btrfs_item_key_to_cpu(leaf, &key, slot);
+        if (key.type > BTRFS_EXTENT_DATA_KEY ||
+            key.objectid != btrfs_ino(inode))
+            break;
+
+        if (key.type == BTRFS_EXTENT_DATA_KEY) {
+            struct btrfs_file_extent_item *extent;
+            int type;
+            u64 disko = 0;
+            u64 diskl = 0;
+            u64 datal = 0;
+
+            extent = btrfs_item_ptr(leaf, slot,    struct 
btrfs_file_extent_item);
+            type = btrfs_file_extent_type(leaf, extent);
+            if (type == BTRFS_FILE_EXTENT_REG ||
+                type == BTRFS_FILE_EXTENT_PREALLOC) {
+                disko = btrfs_file_extent_disk_bytenr(leaf, extent);
+                diskl = btrfs_file_extent_disk_num_bytes(leaf, extent);
+                datal = btrfs_file_extent_num_bytes(leaf, extent);
+            }
+
+            /*
+             * The first search might have left us at an extent
+             * item that ends before our target range's start, can
+             * happen if we have holes and NO_HOLES feature enabled.
+             */
+            if (key.offset + datal <= off) {
+                path->slots[0]++;
+                goto process_slot;
+            } else if (key.offset >= off + olen_aligned) {
+                break;
+            }
+            next_key_min_offset = key.offset + datal;
+
+            for (;diskl >= 0; diskl -= pagesize) {
+                sum += disko+diskl;
+                counter++;
+            }
+
+            btrfs_release_path(path);
+            path->leave_spinning = 0;
+    }
+
+out:
+    ret = sum/counter;
+    btrfs_free_path(path);
+    vfree(buf);
+    return ret;
+}
+
+/**
   * btrfs_clone() - clone a range from inode file to another
   *
   * @src: Inode to clone from
-- 
2.6.1



[-- Attachment #2: 0001-btrfs-ioctl.c-extent_same-Use-inode-as-src-which-clo.patch --]
[-- Type: text/x-patch, Size: 5892 bytes --]

>From 0fcd8c8f03064b9d7ed89606f3eff65ffc53cbf5 Mon Sep 17 00:00:00 2001
From: Timofey Titovets <nefelim4ag@gmail.com>
Date: Wed, 21 Oct 2015 04:10:13 +0300
Subject: [RFC PATCH V2] btrfs/ioctl.c: extent_same - Use inode as src which
 close to disk beginning

While deduplication,
Btrfs produce extent and file fragmentation
But it's can be optimized by compute - which inode data placed a closest to beginning of hdd
It's allow to:
1. Performance boost on hdd (beginning of disk faster then end)
2. Make sparse only on tail of fs, what can give boost later for
balancing and resizing operations

New function:
static u64 btrfs_avg_disko(struct inode *inode,
                        const u64 off, const u64 olen_aligned);

It normalize offsets with data lenghts, by represent it like offsets of blocks
It return average data offset of all "pagesized" blocks in given range for inode

Signed-off-by: Timofey Titovets <nefelim4ag@gmail.com>
---
 fs/btrfs/ioctl.c | 147 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 145 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 3e3e613..cd0ecd3 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -86,6 +86,9 @@ struct btrfs_ioctl_received_subvol_args_32 {
 #endif
 
 
+static u64 btrfs_avg_disko(struct inode *inode,
+						const u64 off, const u64 olen_aligned);
+
 static int btrfs_clone(struct inode *src, struct inode *inode,
 		       u64 off, u64 olen, u64 olen_aligned, u64 destoff,
 		       int no_time_update);
@@ -3074,8 +3077,20 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
 
 	/* pass original length for comparison so we stay within i_size */
 	ret = btrfs_cmp_data(src, loff, dst, dst_loff, olen, &cmp);
-	if (ret == 0)
-		ret = btrfs_clone(src, dst, loff, olen, len, dst_loff, 1);
+	if (ret == 0) {
+		/* prefer inode with lowest offset as source for clone*/
+		u64 src_weight;
+		u64 dst_weight;
+		src_weight = btrfs_avg_disko(src, off, olen);
+		dst_weight = btrfs_avg_disko(dest, dst_loff, olen);
+		/* if one of weight == 0 -> fallback */
+		if (dest_weight == 0)
+			src_weight = 0;
+		if (src_weight > dest_weight)
+			ret = btrfs_clone(dst, src, dst_loff, olen, len, loff, 1);
+		else
+			ret = btrfs_clone(src, dst, loff, olen, len, dst_loff, 1);
+	}
 
 	if (same_inode)
 		unlock_extent(&BTRFS_I(src)->io_tree, same_lock_start,
@@ -3329,6 +3344,134 @@ static void clone_update_extent_map(struct inode *inode,
 }
 
 /**
+ * btrfs_avg_disko() - return avg data offset weight for inode
+ *
+ * @inode: Inode
+ * @off: Offset for computing
+ * @olen_aligned: Block-aligned len of data
+ *
+ * Computing average offset of data, allow to heuristically
+ * determine where on the disk placed most percent of data fragments
+ */
+static u64 btrfs_avg_disko(struct inode *inode,
+					const u64 off, const u64 olen_aligned)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_path *path = NULL;
+	struct extent_buffer *leaf;
+	char *buf = NULL;
+	struct btrfs_key key;
+	u32 nritems;
+	int slot;
+	int no_quota;
+	double sum = 0;
+	u64 ret = 0;
+	u64 counter = 0;
+
+	buf = vmalloc(root->nodesize);
+	if (!buf)
+		return ret;
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		vfree(buf);
+		return ret;
+	}
+
+	path->reada = 2;
+	/* clone data */
+	key.objectid = btrfs_ino(inode);
+	key.type = BTRFS_EXTENT_DATA_KEY;
+	key.offset = off;
+
+	while (1) {
+		u64 next_key_min_offset = key.offset + 1;
+
+		/*
+		 * note the key will change type as we walk through the
+		 * tree.
+		 */
+		path->leave_spinning = 1;
+		ret = btrfs_search_slot(NULL, BTRFS_I(inode)->root, &key, path,	0, 0);
+		if (ret < 0)
+			goto out;
+		/*
+		 * First search, if no extent item that starts at offset off was
+		 * found but the previous item is an extent item, it's possible
+		 * it might overlap our target range, therefore process it.
+		 */
+		if (key.offset == off && ret > 0 && path->slots[0] > 0) {
+			btrfs_item_key_to_cpu(path->nodes[0], &key,
+					      path->slots[0] - 1);
+			if (key.type == BTRFS_EXTENT_DATA_KEY)
+				path->slots[0]--;
+		}
+
+		nritems = btrfs_header_nritems(path->nodes[0]);
+process_slot:
+		no_quota = 1;
+		if (path->slots[0] >= nritems) {
+			ret = btrfs_next_leaf(BTRFS_I(inode)->root, path);
+			if (ret < 0)
+				goto out;
+			if (ret > 0)
+				break;
+			nritems = btrfs_header_nritems(path->nodes[0]);
+		}
+		leaf = path->nodes[0];
+		slot = path->slots[0];
+
+		btrfs_item_key_to_cpu(leaf, &key, slot);
+		if (key.type > BTRFS_EXTENT_DATA_KEY ||
+		    key.objectid != btrfs_ino(inode))
+			break;
+
+		if (key.type == BTRFS_EXTENT_DATA_KEY) {
+			struct btrfs_file_extent_item *extent;
+			int type;
+			u64 disko = 0;
+			u64 diskl = 0;
+			u64 datal = 0;
+
+			extent = btrfs_item_ptr(leaf, slot,	struct btrfs_file_extent_item);
+			type = btrfs_file_extent_type(leaf, extent);
+			if (type == BTRFS_FILE_EXTENT_REG ||
+			    type == BTRFS_FILE_EXTENT_PREALLOC) {
+				disko = btrfs_file_extent_disk_bytenr(leaf, extent);
+				diskl = btrfs_file_extent_disk_num_bytes(leaf, extent);
+				datal = btrfs_file_extent_num_bytes(leaf, extent);
+			}
+
+			/*
+			 * The first search might have left us at an extent
+			 * item that ends before our target range's start, can
+			 * happen if we have holes and NO_HOLES feature enabled.
+			 */
+			if (key.offset + datal <= off) {
+				path->slots[0]++;
+				goto process_slot;
+			} else if (key.offset >= off + olen_aligned) {
+				break;
+			}
+			next_key_min_offset = key.offset + datal;
+
+			for (;diskl >= 0; diskl -= pagesize) {
+				sum += disko+diskl;
+				counter++;
+			}
+
+			btrfs_release_path(path);
+			path->leave_spinning = 0;
+	}
+
+out:
+	ret = sum/counter;
+	btrfs_free_path(path);
+	vfree(buf);
+	return ret;
+}
+
+/**
  * btrfs_clone() - clone a range from inode file to another
  *
  * @src: Inode to clone from
-- 
2.6.1


             reply	other threads:[~2015-10-21  1:11 UTC|newest]

Thread overview: 3+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2015-10-21  1:11 Timofey Titovets [this message]
2015-10-21  8:52 ` [RFC PATCH V2] btrfs/ioctl.c: extent_same - Use inode as src which, close to disk beginning Timofey Titovets
  -- strict thread matches above, loose matches on Subject: below --
2015-10-21  0:59 Timofey Titovets

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=5626E63A.2090608@gmail.com \
    --to=nefelim4ag@gmail.com \
    --cc=linux-btrfs@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.