linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Luis Chamberlain <mcgrof@kernel.org>
To: david@redhat.com, tglx@linutronix.de, hch@lst.de,
	patches@lists.linux.dev, linux-modules@vger.kernel.org,
	linux-mm@kvack.org, linux-kernel@vger.kernel.org,
	pmladek@suse.com, petr.pavlu@suse.com, prarit@redhat.com,
	torvalds@linux-foundation.org, lennart@poettering.net
Cc: gregkh@linuxfoundation.org, rafael@kernel.org, song@kernel.org,
	lucas.de.marchi@gmail.com, lucas.demarchi@intel.com,
	christophe.leroy@csgroup.eu, peterz@infradead.org,
	rppt@kernel.org, dave@stgolabs.net, willy@infradead.org,
	vbabka@suse.cz, mhocko@suse.com, dave.hansen@linux.intel.com,
	colin.i.king@gmail.com, jim.cromie@gmail.com,
	catalin.marinas@arm.com, jbaron@akamai.com,
	rick.p.edgecombe@intel.com, yujie.liu@intel.com,
	mcgrof@kernel.org
Subject: [PATCH 1/2] fs/kernel_read_file: add support for duplicate detection
Date: Wed, 24 May 2023 14:36:19 -0700	[thread overview]
Message-ID: <20230524213620.3509138-2-mcgrof@kernel.org> (raw)
In-Reply-To: <20230524213620.3509138-1-mcgrof@kernel.org>

Add support for a new call which allows to detect duplicate requests
for each inode passed. This enables users to avoid having to incur
a whole vmalloc() for duplicates inodes with kernel_read_file().
Support to avoid duplicates is desirable in-kernel since changing
existing userspace or kernel users to account for these duplicates
would otherwise be difficult to implement, and the measured impact
of amount of wasted memory due to duplicates with vmalloc is observed
to be high.

This currently goes disabled because no user exists yet which wants
this enabled. Kernel code which needs this enabled should select the
new CONFIG_KREAD_UNIQ, otherwise the API falls back to the existing
kernel_read_file_from_fd().

If we later need to have some code enable CONFIG_KREAD_UNIQ but some
not we can have the feature be enabled per enum kernel_read_file_id id.
For now this should cover the main future use case with modules and
allow easily to disable / enable this feature with just one future
kconfig option.

Contrary to kernel_read_file_from_fd() users of thew new API will
use kread_uniq_fd(), keep track of the inode internally, and once
done use kread_uniq_fd_free() once the inode is no longer used.

Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 fs/Kconfig                       |   3 +
 fs/kernel_read_file.c            | 124 +++++++++++++++++++++++++++++++
 include/linux/kernel_read_file.h |  14 ++++
 3 files changed, 141 insertions(+)

diff --git a/fs/Kconfig b/fs/Kconfig
index cc07a0cd3172..0a78657b00d5 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -18,6 +18,9 @@ config VALIDATE_FS_PARSER
 config FS_IOMAP
 	bool
 
+config KREAD_UNIQ
+	bool
+
 # old blockdev_direct_IO implementation.  Use iomap for new code instead
 config LEGACY_DIRECT_IO
 	bool
diff --git a/fs/kernel_read_file.c b/fs/kernel_read_file.c
index 5d826274570c..a066e2f239e8 100644
--- a/fs/kernel_read_file.c
+++ b/fs/kernel_read_file.c
@@ -1,9 +1,12 @@
 // SPDX-License-Identifier: GPL-2.0-only
+#define pr_fmt(fmt) "kread: " fmt
+
 #include <linux/fs.h>
 #include <linux/fs_struct.h>
 #include <linux/kernel_read_file.h>
 #include <linux/security.h>
 #include <linux/vmalloc.h>
+#include <linux/fdtable.h>
 
 /**
  * kernel_read_file() - read file contents into a kernel buffer
@@ -187,3 +190,124 @@ ssize_t kernel_read_file_from_fd(int fd, loff_t offset, void **buf,
 	return ret;
 }
 EXPORT_SYMBOL_GPL(kernel_read_file_from_fd);
+
+#ifdef CONFIG_KREAD_UNIQ
+static DEFINE_MUTEX(kread_dup_mutex);
+static LIST_HEAD(kread_dup_reqs);
+
+struct kread_dup_req {
+	struct list_head list;
+	unsigned long i_ino;
+};
+
+static struct kread_dup_req *kread_dup_request_lookup(unsigned long i_ino)
+{
+	struct kread_dup_req *kread_req;
+
+	list_for_each_entry_rcu(kread_req, &kread_dup_reqs, list,
+				lockdep_is_held(&kread_dup_mutex)) {
+		if (kread_req->i_ino == i_ino)
+			return kread_req;
+        }
+
+	return NULL;
+}
+
+static struct kread_dup_req *kread_dup_request_new(char *name, unsigned long i_ino)
+{
+	struct kread_dup_req *kread_req, *new_kread_req;
+
+	/*
+	 * Pre-allocate the entry in case we have to use it later
+	 * to avoid contention with the mutex.
+	 */
+	new_kread_req = kzalloc(sizeof(*new_kread_req), GFP_KERNEL);
+	if (!new_kread_req)
+		return false;
+
+	new_kread_req->i_ino = i_ino;
+
+	kread_req = kread_dup_request_lookup(i_ino);
+	if (!kread_req) {
+		/*
+		 * There was no duplicate, just add the request so we can
+		 * keep tab on duplicates later.
+		 */
+		pr_debug("accepted request for i_ino %lu for: %s\n", i_ino, name);
+		return new_kread_req;
+	}
+
+	/* We are dealing with a duplicate request now */
+
+	kfree(new_kread_req);
+
+	pr_debug("duplicate request on i_ino %lu for: %s\n", i_ino, name);
+
+	return NULL;
+}
+
+ssize_t kread_uniq_fd(int fd, loff_t offset, void **buf, unsigned long *i_ino,
+		      size_t buf_size, size_t *file_size, enum kernel_read_file_id id)
+{
+	struct fd f = fdget(fd);
+	ssize_t ret = -EBADF;
+	char *name, *path;
+	struct kread_dup_req *req;
+
+	if (!f.file || !(f.file->f_mode & FMODE_READ))
+		goto out;
+
+	path = kzalloc(PATH_MAX, GFP_KERNEL);
+	if (!path)
+		return -ENOMEM;
+
+	name = file_path(f.file, path, PATH_MAX);
+	if (IS_ERR(name)) {
+		ret = PTR_ERR(name);
+		goto out_mem;
+	}
+
+	*i_ino = file_inode(f.file)->i_ino;
+
+	mutex_lock(&kread_dup_mutex);
+	req = kread_dup_request_new(name, *i_ino);
+	if (!req) {
+		mutex_unlock(&kread_dup_mutex);
+		ret = -EBUSY;
+		goto out_mem;
+	}
+
+	ret = kernel_read_file(f.file, offset, buf, buf_size, file_size, id);
+	if (ret < 0)
+		kfree(req);
+	else
+		list_add_rcu(&req->list, &kread_dup_reqs);
+	mutex_unlock(&kread_dup_mutex);
+out_mem:
+	kfree(path);
+out:
+	fdput(f);
+	return ret;
+}
+
+void kread_uniq_fd_free(unsigned long i_ino)
+{
+	struct kread_dup_req *kread_req;
+
+	if (!i_ino)
+		return;
+
+	mutex_lock(&kread_dup_mutex);
+
+	kread_req = kread_dup_request_lookup(i_ino);
+	if (!kread_req)
+		goto out;
+
+	list_del_rcu(&kread_req->list);
+	synchronize_rcu();
+
+out:
+	mutex_unlock(&kread_dup_mutex);
+	kfree(kread_req);
+}
+#endif /* CONFIG_KREAD_UNIQ */
diff --git a/include/linux/kernel_read_file.h b/include/linux/kernel_read_file.h
index 90451e2e12bd..884985b0dc88 100644
--- a/include/linux/kernel_read_file.h
+++ b/include/linux/kernel_read_file.h
@@ -51,5 +51,19 @@ ssize_t kernel_read_file_from_fd(int fd, loff_t offset,
 				 void **buf, size_t buf_size,
 				 size_t *file_size,
 				 enum kernel_read_file_id id);
+#ifdef CONFIG_KREAD_UNIQ
+ssize_t kread_uniq_fd(int fd, loff_t offset, void **buf, unsigned long *i_ino,
+		      size_t buf_size, size_t *file_size, enum kernel_read_file_id id);
+void kread_uniq_fd_free(unsigned long i_ino);
+#else
+static inline ssize_t kread_uniq_fd(int fd, loff_t offset, void **buf, unsigned long *i_ino,
+				    size_t buf_size, size_t *file_size, enum kernel_read_file_id id)
+{
+	return kernel_read_file_from_fd(fd, offset, buf, buf_size, file_size, id);
+}
+static inline void kread_uniq_fd_free(unsigned long i_ino)
+{
+}
+#endif /* CONFIG_KREAD_UNIQ */
 
 #endif /* _LINUX_KERNEL_READ_FILE_H */
-- 
2.39.2


  reply	other threads:[~2023-05-24 21:36 UTC|newest]

Thread overview: 53+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-05-24 21:36 [PATCH 0/2] module: avoid all memory pressure due to duplicates Luis Chamberlain
2023-05-24 21:36 ` Luis Chamberlain [this message]
2023-05-24 21:52   ` [PATCH 1/2] fs/kernel_read_file: add support for duplicate detection Linus Torvalds
2023-05-24 21:56     ` Linus Torvalds
2023-05-24 22:07       ` Luis Chamberlain
2023-05-25  4:00     ` Linus Torvalds
2023-05-25 18:08       ` Luis Chamberlain
2023-05-25 18:35         ` Luis Chamberlain
2023-05-25 18:50         ` Linus Torvalds
2023-05-25 19:32           ` Luis Chamberlain
2023-05-25  7:01     ` Christian Brauner
2023-05-24 21:36 ` [PATCH 2/2] module: add support to avoid duplicates early on load Luis Chamberlain
2023-05-25 11:40   ` Petr Pavlu
2023-05-25 16:07     ` Linus Torvalds
2023-05-25 16:42       ` Greg KH
2023-05-25 18:22         ` Luis Chamberlain
2023-05-25 17:52       ` Linus Torvalds
2023-05-25 18:45       ` Lucas De Marchi
2023-05-25 21:12         ` Linus Torvalds
2023-05-25 22:02           ` Luis Chamberlain
2023-05-26  1:39             ` Linus Torvalds
2023-05-29  8:58               ` Johan Hovold
2023-05-29 11:00                 ` Linus Torvalds
2023-05-29 12:44                   ` Johan Hovold
2023-05-29 15:18                     ` Johan Hovold
2023-05-30  1:55                       ` Linus Torvalds
2023-05-30  9:40                         ` Johan Hovold
2023-06-05 12:25                           ` Johan Hovold
2023-05-30 16:22                         ` Luis Chamberlain
2023-05-30 17:16                           ` Lucas De Marchi
2023-05-30 19:41                             ` Luis Chamberlain
2023-05-30 22:17                               ` Linus Torvalds
2023-05-31  5:30                                 ` Lucas De Marchi
2023-05-31  0:31                           ` Luis Chamberlain
2023-05-31  7:51                           ` David Hildenbrand
2023-05-31 16:57                             ` Luis Chamberlain
2023-06-02 15:19                               ` David Hildenbrand
2023-06-02 16:04                                 ` Luis Chamberlain
2023-06-05 11:26                                   ` David Hildenbrand
2023-06-05 15:17                                     ` Luis Chamberlain
2023-06-05 15:28                                       ` Luis Chamberlain
2023-06-28 18:52                                         ` Luis Chamberlain
2023-06-28 20:14                                           ` Linus Torvalds
2023-06-28 22:07                                             ` Linus Torvalds
2023-06-28 23:17                                               ` Linus Torvalds
2023-06-29  0:18                                                 ` Luis Chamberlain
2023-06-02 16:06                                 ` Linus Torvalds
2023-06-02 16:37                                   ` David Hildenbrand
2023-05-30 22:45                         ` Dan Williams
2023-06-04 14:26                         ` Rudi Heitbaum
2023-05-29 17:47                     ` Linus Torvalds
2023-05-30 10:01                       ` Johan Hovold
2023-05-25 16:54     ` Lucas De Marchi

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20230524213620.3509138-2-mcgrof@kernel.org \
    --to=mcgrof@kernel.org \
    --cc=catalin.marinas@arm.com \
    --cc=christophe.leroy@csgroup.eu \
    --cc=colin.i.king@gmail.com \
    --cc=dave.hansen@linux.intel.com \
    --cc=dave@stgolabs.net \
    --cc=david@redhat.com \
    --cc=gregkh@linuxfoundation.org \
    --cc=hch@lst.de \
    --cc=jbaron@akamai.com \
    --cc=jim.cromie@gmail.com \
    --cc=lennart@poettering.net \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=linux-modules@vger.kernel.org \
    --cc=lucas.de.marchi@gmail.com \
    --cc=lucas.demarchi@intel.com \
    --cc=mhocko@suse.com \
    --cc=patches@lists.linux.dev \
    --cc=peterz@infradead.org \
    --cc=petr.pavlu@suse.com \
    --cc=pmladek@suse.com \
    --cc=prarit@redhat.com \
    --cc=rafael@kernel.org \
    --cc=rick.p.edgecombe@intel.com \
    --cc=rppt@kernel.org \
    --cc=song@kernel.org \
    --cc=tglx@linutronix.de \
    --cc=torvalds@linux-foundation.org \
    --cc=vbabka@suse.cz \
    --cc=willy@infradead.org \
    --cc=yujie.liu@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).