All of lore.kernel.org
 help / color / mirror / Atom feed
* Subject: [PATCH V7 2/4] mm: frontswap: core code
@ 2011-08-23 14:58 ` Dan Magenheimer
  0 siblings, 0 replies; 18+ messages in thread
From: Dan Magenheimer @ 2011-08-23 14:58 UTC (permalink / raw)
  To: linux-kernel, linux-mm, jeremy, hughd, ngupta, konrad.wilk,
	JBeulich, kurt.hackel, npiggin, akpm, riel, hannes, matthew,
	chris.mason, dan.magenheimer, sjenning, jackdachef, cyclonusj

From: Dan Magenheimer <dan.magenheimer@oracle.com>
Subject: [PATCH V7 2/4] mm: frontswap: core code

This second patch of four in this frontswap series provides the core code
for frontswap that interfaces between the hooks in the swap subsystem and
a frontswap backend via frontswap_ops.

Two new files are added: mm/frontswap.c and include/linux/frontswap.h

Credits: Frontswap_ops design derived from Jeremy Fitzhardinge
design for tmem; sysfs code modelled after mm/ksm.c

[v7: rebase to 3.0-rc3]
[v7: JBeulich@novell.com: new static inlines resolve to no-ops if not config'd]
[v7: JBeulich@novell.com: avoid redundant shifts/divides for *_bit lib calls]
[v6: rebase to 3.1-rc1]
[v6: lliubbo@gmail.com: fix null pointer deref if vzalloc fails]
[v6: konrad.wilk@oracl.com: various checks and code clarifications/comments]
[v5: no change from v4]
[v4: rebase to 2.6.39]
Signed-off-by: Dan Magenheimer <dan.magenheimer@oracle.com>
Reviewed-by: Konrad Wilk <konrad.wilk@oracle.com>
Acked-by: Jan Beulich <JBeulich@novell.com>
Acked-by: Seth Jennings <sjenning@linux.vnet.ibm.com>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Nitin Gupta <ngupta@vflare.org>
Cc: Matthew Wilcox <matthew@wil.cx>
Cc: Chris Mason <chris.mason@oracle.com>
Cc: Rik Riel <riel@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>

--- linux/include/linux/frontswap.h	1969-12-31 17:00:00.000000000 -0700
+++ frontswap/include/linux/frontswap.h	2011-08-23 08:22:14.645692424 -0600
@@ -0,0 +1,125 @@
+#ifndef _LINUX_FRONTSWAP_H
+#define _LINUX_FRONTSWAP_H
+
+#include <linux/swap.h>
+#include <linux/mm.h>
+
+struct frontswap_ops {
+	void (*init)(unsigned);
+	int (*put_page)(unsigned, pgoff_t, struct page *);
+	int (*get_page)(unsigned, pgoff_t, struct page *);
+	void (*flush_page)(unsigned, pgoff_t);
+	void (*flush_area)(unsigned);
+};
+
+extern int frontswap_enabled;
+extern struct frontswap_ops
+	frontswap_register_ops(struct frontswap_ops *ops);
+extern void frontswap_shrink(unsigned long);
+extern unsigned long frontswap_curr_pages(void);
+
+extern void __frontswap_init(unsigned type);
+extern int __frontswap_put_page(struct page *page);
+extern int __frontswap_get_page(struct page *page);
+extern void __frontswap_flush_page(unsigned, pgoff_t);
+extern void __frontswap_flush_area(unsigned);
+
+#ifdef CONFIG_FRONTSWAP
+
+static inline int frontswap_test(struct swap_info_struct *sis, pgoff_t offset)
+{
+	int ret = 0;
+
+	if (frontswap_enabled && sis->frontswap_map)
+		ret = test_bit(offset, sis->frontswap_map);
+	return ret;
+}
+
+static inline void frontswap_set(struct swap_info_struct *sis, pgoff_t offset)
+{
+	if (frontswap_enabled && sis->frontswap_map)
+		set_bit(offset, sis->frontswap_map);
+}
+
+static inline void frontswap_clear(struct swap_info_struct *sis, pgoff_t offset)
+{
+	if (frontswap_enabled && sis->frontswap_map)
+		clear_bit(offset, sis->frontswap_map);
+}
+
+static inline void frontswap_map_set(struct swap_info_struct *p,
+				     unsigned long *map)
+{
+	p->frontswap_map = map;
+}
+
+static inline unsigned long *frontswap_map_get(struct swap_info_struct *p)
+{
+	return p->frontswap_map;
+}
+#else
+/* all inline routines become no-ops and all externs are ignored */
+
+#define frontswap_enabled (0)
+
+static inline int frontswap_test(struct swap_info_struct *sis, pgoff_t offset)
+{
+	return 0;
+}
+
+static inline void frontswap_set(struct swap_info_struct *sis, pgoff_t offset)
+{
+}
+
+static inline void frontswap_clear(struct swap_info_struct *sis, pgoff_t offset)
+{
+}
+
+static inline void frontswap_map_set(struct swap_info_struct *p,
+				     unsigned long *map)
+{
+}
+
+static inline unsigned long *frontswap_map_get(struct swap_info_struct *p)
+{
+	return NULL;
+}
+#endif
+
+static inline int frontswap_put_page(struct page *page)
+{
+	int ret = -1;
+
+	if (frontswap_enabled)
+		ret = __frontswap_put_page(page);
+	return ret;
+}
+
+static inline int frontswap_get_page(struct page *page)
+{
+	int ret = -1;
+
+	if (frontswap_enabled)
+		ret = __frontswap_get_page(page);
+	return ret;
+}
+
+static inline void frontswap_flush_page(unsigned type, pgoff_t offset)
+{
+	if (frontswap_enabled)
+		__frontswap_flush_page(type, offset);
+}
+
+static inline void frontswap_flush_area(unsigned type)
+{
+	if (frontswap_enabled)
+		__frontswap_flush_area(type);
+}
+
+static inline void frontswap_init(unsigned type)
+{
+	if (frontswap_enabled)
+		__frontswap_init(type);
+}
+
+#endif /* _LINUX_FRONTSWAP_H */
--- linux/mm/frontswap.c	1969-12-31 17:00:00.000000000 -0700
+++ frontswap/mm/frontswap.c	2011-08-23 08:20:09.774813309 -0600
@@ -0,0 +1,346 @@
+/*
+ * Frontswap frontend
+ *
+ * This code provides the generic "frontend" layer to call a matching
+ * "backend" driver implementation of frontswap.  See
+ * Documentation/vm/frontswap.txt for more information.
+ *
+ * Copyright (C) 2009-2010 Oracle Corp.  All rights reserved.
+ * Author: Dan Magenheimer
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ */
+
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/sysctl.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/proc_fs.h>
+#include <linux/security.h>
+#include <linux/capability.h>
+#include <linux/module.h>
+#include <linux/uaccess.h>
+#include <linux/frontswap.h>
+#include <linux/swapfile.h>
+
+/*
+ * frontswap_ops is set by frontswap_register_ops to contain the pointers
+ * to the frontswap "backend" implementation functions.
+ */
+static struct frontswap_ops frontswap_ops;
+
+/*
+ * This global enablement flag reduces overhead on systems where frontswap_ops
+ * has not been registered, so is preferred to the slower alternative: a
+ * function call that checks a non-global.
+ */
+int frontswap_enabled;
+EXPORT_SYMBOL(frontswap_enabled);
+
+/* useful stats available in /sys/kernel/mm/frontswap */
+static unsigned long frontswap_gets;
+static unsigned long frontswap_succ_puts;
+static unsigned long frontswap_failed_puts;
+static unsigned long frontswap_flushes;
+
+/*
+ * register operations for frontswap, returning previous thus allowing
+ * detection of multiple backends and possible nesting
+ */
+struct frontswap_ops frontswap_register_ops(struct frontswap_ops *ops)
+{
+	struct frontswap_ops old = frontswap_ops;
+
+	frontswap_ops = *ops;
+	frontswap_enabled = 1;
+	return old;
+}
+EXPORT_SYMBOL(frontswap_register_ops);
+
+/* Called when a swap device is swapon'd */
+void __frontswap_init(unsigned type)
+{
+	struct swap_info_struct *sis = swap_info[type];
+
+	BUG_ON(sis == NULL);
+	if (sis->frontswap_map == NULL)
+		return;
+	if (frontswap_enabled)
+		(*frontswap_ops.init)(type);
+}
+EXPORT_SYMBOL(__frontswap_init);
+
+/*
+ * "Put" data from a page to frontswap and associate it with the page's
+ * swaptype and offset.  Page must be locked and in the swap cache.
+ * If frontswap already contains a page with matching swaptype and
+ * offset, the frontswap implementation may either overwrite the data
+ * and return success or flush the page from frontswap and return failure
+ */
+int __frontswap_put_page(struct page *page)
+{
+	int ret = -1, dup = 0;
+	swp_entry_t entry = { .val = page_private(page), };
+	int type = swp_type(entry);
+	struct swap_info_struct *sis = swap_info[type];
+	pgoff_t offset = swp_offset(entry);
+
+	BUG_ON(!PageLocked(page));
+	BUG_ON(sis == NULL);
+	if (frontswap_test(sis, offset))
+		dup = 1;
+	ret = (*frontswap_ops.put_page)(type, offset, page);
+	if (ret == 0) {
+		frontswap_set(sis, offset);
+		frontswap_succ_puts++;
+		if (!dup)
+			sis->frontswap_pages++;
+	} else if (dup) {
+		/*
+		  failed dup always results in automatic flush of
+		  the (older) page from frontswap
+		 */
+		frontswap_clear(sis, offset);
+		sis->frontswap_pages--;
+		frontswap_failed_puts++;
+	} else
+		frontswap_failed_puts++;
+	return ret;
+}
+EXPORT_SYMBOL(__frontswap_put_page);
+
+/*
+ * "Get" data from frontswap associated with swaptype and offset that were
+ * specified when the data was put to frontswap and use it to fill the
+ * specified page with data. Page must be locked and in the swap cache
+ */
+int __frontswap_get_page(struct page *page)
+{
+	int ret = -1;
+	swp_entry_t entry = { .val = page_private(page), };
+	int type = swp_type(entry);
+	struct swap_info_struct *sis = swap_info[type];
+	pgoff_t offset = swp_offset(entry);
+
+	BUG_ON(!PageLocked(page));
+	BUG_ON(sis == NULL);
+	if (frontswap_test(sis, offset))
+		ret = (*frontswap_ops.get_page)(type, offset, page);
+	if (ret == 0)
+		frontswap_gets++;
+	return ret;
+}
+EXPORT_SYMBOL(__frontswap_get_page);
+
+/*
+ * Flush any data from frontswap associated with the specified swaptype
+ * and offset so that a subsequent "get" will fail.
+ */
+void __frontswap_flush_page(unsigned type, pgoff_t offset)
+{
+	struct swap_info_struct *sis = swap_info[type];
+
+	BUG_ON(sis == NULL);
+	if (frontswap_test(sis, offset)) {
+		(*frontswap_ops.flush_page)(type, offset);
+		sis->frontswap_pages--;
+		frontswap_clear(sis, offset);
+		frontswap_flushes++;
+	}
+}
+EXPORT_SYMBOL(__frontswap_flush_page);
+
+/*
+ * Flush all data from frontswap associated with all offsets for the
+ * specified swaptype so that subsequent "gets" on this type will fail.
+ */
+void __frontswap_flush_area(unsigned type)
+{
+	struct swap_info_struct *sis = swap_info[type];
+
+	BUG_ON(sis == NULL);
+	if (sis->frontswap_map == NULL)
+		return;
+	(*frontswap_ops.flush_area)(type);
+	sis->frontswap_pages = 0;
+	bitmap_zero(sis->frontswap_map, sis->max); /* map holds sis->max bits */
+}
+EXPORT_SYMBOL(__frontswap_flush_area);
+
+/*
+ * Frontswap, like a true swap device, may unnecessarily retain pages
+ * under certain circumstances; "shrink" frontswap is essentially a
+ * "partial swapoff" and works by calling try_to_unuse to attempt to
+ * unuse enough frontswap pages to attempt to -- subject to memory
+ * constraints -- reduce the number of pages in frontswap
+ */
+void frontswap_shrink(unsigned long target_pages)
+{
+	int wrapped = 0;
+	bool locked = false;
+
+	/* try a few times to maximize chance of try_to_unuse success */
+	for (wrapped = 0; wrapped < 3; wrapped++) {
+
+		struct swap_info_struct *si = NULL;
+		unsigned long total_pages = 0, total_pages_to_unuse;
+		unsigned long pages = 0, pages_to_unuse = 0;
+		int type;
+
+		/*
+		 * we don't want to hold swap_lock while doing a very
+		 * lengthy try_to_unuse, but swap_list may change
+		 * so restart scan from swap_list.head each time
+		 */
+		spin_lock(&swap_lock);
+		locked = true;
+		total_pages = 0;
+		for (type = swap_list.head; type >= 0; type = si->next) {
+			si = swap_info[type];
+			total_pages += si->frontswap_pages;
+		}
+		if (total_pages <= target_pages)
+			goto out;
+		total_pages_to_unuse = total_pages - target_pages;
+		for (type = swap_list.head; type >= 0; type = si->next) {
+			si = swap_info[type];
+			if (total_pages_to_unuse < si->frontswap_pages)
+				pages = pages_to_unuse = total_pages_to_unuse;
+			else {
+				pages = si->frontswap_pages;
+				pages_to_unuse = 0; /* unuse all */
+			}
+			if (security_vm_enough_memory_kern(pages))
+				continue;
+			vm_unacct_memory(pages);
+			break;
+		}
+		if (type < 0)
+			goto out;
+		locked = false;
+		spin_unlock(&swap_lock);
+		try_to_unuse(type, true, pages_to_unuse);
+	}
+
+out:
+	if (locked)
+		spin_unlock(&swap_lock);
+	return;
+}
+EXPORT_SYMBOL(frontswap_shrink);
+
+/*
+ * count and return the number of frontswap pages across all
+ * swap devices.  This is exported so that a kernel module can
+ * determine current usage without reading sysfs.
+ */
+unsigned long frontswap_curr_pages(void)
+{
+	int type;
+	unsigned long totalpages = 0;
+	struct swap_info_struct *si = NULL;
+
+	spin_lock(&swap_lock);
+	for (type = swap_list.head; type >= 0; type = si->next) {
+		si = swap_info[type];
+		if (si != NULL)
+			totalpages += si->frontswap_pages;
+	}
+	spin_unlock(&swap_lock);
+	return totalpages;
+}
+EXPORT_SYMBOL(frontswap_curr_pages);
+
+#ifdef CONFIG_SYSFS
+
+/* see Documentation/ABI/xxx/sysfs-kernel-mm-frontswap */
+
+#define FRONTSWAP_ATTR_RO(_name) \
+	static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
+#define FRONTSWAP_ATTR(_name) \
+	static struct kobj_attribute _name##_attr = \
+		__ATTR(_name, 0644, _name##_show, _name##_store)
+
+static ssize_t curr_pages_show(struct kobject *kobj,
+			       struct kobj_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%lu\n", frontswap_curr_pages());
+}
+
+static ssize_t curr_pages_store(struct kobject *kobj,
+			       struct kobj_attribute *attr,
+			       const char *buf, size_t count)
+{
+	unsigned long target_pages;
+
+	if (strict_strtoul(buf, 10, &target_pages))
+		return -EINVAL;
+
+	frontswap_shrink(target_pages);
+
+	return count;
+}
+FRONTSWAP_ATTR(curr_pages);
+
+static ssize_t succ_puts_show(struct kobject *kobj,
+			       struct kobj_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%lu\n", frontswap_succ_puts);
+}
+FRONTSWAP_ATTR_RO(succ_puts);
+
+static ssize_t failed_puts_show(struct kobject *kobj,
+			       struct kobj_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%lu\n", frontswap_failed_puts);
+}
+FRONTSWAP_ATTR_RO(failed_puts);
+
+static ssize_t gets_show(struct kobject *kobj,
+			       struct kobj_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%lu\n", frontswap_gets);
+}
+FRONTSWAP_ATTR_RO(gets);
+
+static ssize_t flushes_show(struct kobject *kobj,
+			       struct kobj_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%lu\n", frontswap_flushes);
+}
+FRONTSWAP_ATTR_RO(flushes);
+
+static struct attribute *frontswap_attrs[] = {
+	&curr_pages_attr.attr,
+	&succ_puts_attr.attr,
+	&failed_puts_attr.attr,
+	&gets_attr.attr,
+	&flushes_attr.attr,
+	NULL,
+};
+
+static struct attribute_group frontswap_attr_group = {
+	.attrs = frontswap_attrs,
+	.name = "frontswap",
+};
+
+#endif /* CONFIG_SYSFS */
+
+static int __init init_frontswap(void)
+{
+	int err = 0;
+
+#ifdef CONFIG_SYSFS
+	err = sysfs_create_group(mm_kobj, &frontswap_attr_group);
+#endif /* CONFIG_SYSFS */
+	return err;
+}
+
+static void __exit exit_frontswap(void)
+{
+	frontswap_shrink(0UL);
+}
+
+module_init(init_frontswap);
+module_exit(exit_frontswap);

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Subject: [PATCH V7 2/4] mm: frontswap: core code
@ 2011-08-23 14:58 ` Dan Magenheimer
  0 siblings, 0 replies; 18+ messages in thread
From: Dan Magenheimer @ 2011-08-23 14:58 UTC (permalink / raw)
  To: linux-kernel, linux-mm, jeremy, hughd, ngupta, konrad.wilk,
	JBeulich, kurt.hackel, npiggin, akpm, riel, hannes, matthew,
	chris.mason, dan.magenheimer, sjenning, jackdachef, cyclonusj

From: Dan Magenheimer <dan.magenheimer@oracle.com>
Subject: [PATCH V7 2/4] mm: frontswap: core code

This second patch of four in this frontswap series provides the core code
for frontswap that interfaces between the hooks in the swap subsystem and
a frontswap backend via frontswap_ops.

Two new files are added: mm/frontswap.c and include/linux/frontswap.h

Credits: Frontswap_ops design derived from Jeremy Fitzhardinge
design for tmem; sysfs code modelled after mm/ksm.c

[v7: rebase to 3.0-rc3]
[v7: JBeulich@novell.com: new static inlines resolve to no-ops if not config'd]
[v7: JBeulich@novell.com: avoid redundant shifts/divides for *_bit lib calls]
[v6: rebase to 3.1-rc1]
[v6: lliubbo@gmail.com: fix null pointer deref if vzalloc fails]
[v6: konrad.wilk@oracl.com: various checks and code clarifications/comments]
[v5: no change from v4]
[v4: rebase to 2.6.39]
Signed-off-by: Dan Magenheimer <dan.magenheimer@oracle.com>
Reviewed-by: Konrad Wilk <konrad.wilk@oracle.com>
Acked-by: Jan Beulich <JBeulich@novell.com>
Acked-by: Seth Jennings <sjenning@linux.vnet.ibm.com>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Nitin Gupta <ngupta@vflare.org>
Cc: Matthew Wilcox <matthew@wil.cx>
Cc: Chris Mason <chris.mason@oracle.com>
Cc: Rik Riel <riel@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>

--- linux/include/linux/frontswap.h	1969-12-31 17:00:00.000000000 -0700
+++ frontswap/include/linux/frontswap.h	2011-08-23 08:22:14.645692424 -0600
@@ -0,0 +1,125 @@
+#ifndef _LINUX_FRONTSWAP_H
+#define _LINUX_FRONTSWAP_H
+
+#include <linux/swap.h>
+#include <linux/mm.h>
+
+struct frontswap_ops {
+	void (*init)(unsigned);
+	int (*put_page)(unsigned, pgoff_t, struct page *);
+	int (*get_page)(unsigned, pgoff_t, struct page *);
+	void (*flush_page)(unsigned, pgoff_t);
+	void (*flush_area)(unsigned);
+};
+
+extern int frontswap_enabled;
+extern struct frontswap_ops
+	frontswap_register_ops(struct frontswap_ops *ops);
+extern void frontswap_shrink(unsigned long);
+extern unsigned long frontswap_curr_pages(void);
+
+extern void __frontswap_init(unsigned type);
+extern int __frontswap_put_page(struct page *page);
+extern int __frontswap_get_page(struct page *page);
+extern void __frontswap_flush_page(unsigned, pgoff_t);
+extern void __frontswap_flush_area(unsigned);
+
+#ifdef CONFIG_FRONTSWAP
+
+static inline int frontswap_test(struct swap_info_struct *sis, pgoff_t offset)
+{
+	int ret = 0;
+
+	if (frontswap_enabled && sis->frontswap_map)
+		ret = test_bit(offset, sis->frontswap_map);
+	return ret;
+}
+
+static inline void frontswap_set(struct swap_info_struct *sis, pgoff_t offset)
+{
+	if (frontswap_enabled && sis->frontswap_map)
+		set_bit(offset, sis->frontswap_map);
+}
+
+static inline void frontswap_clear(struct swap_info_struct *sis, pgoff_t offset)
+{
+	if (frontswap_enabled && sis->frontswap_map)
+		clear_bit(offset, sis->frontswap_map);
+}
+
+static inline void frontswap_map_set(struct swap_info_struct *p,
+				     unsigned long *map)
+{
+	p->frontswap_map = map;
+}
+
+static inline unsigned long *frontswap_map_get(struct swap_info_struct *p)
+{
+	return p->frontswap_map;
+}
+#else
+/* all inline routines become no-ops and all externs are ignored */
+
+#define frontswap_enabled (0)
+
+static inline int frontswap_test(struct swap_info_struct *sis, pgoff_t offset)
+{
+	return 0;
+}
+
+static inline void frontswap_set(struct swap_info_struct *sis, pgoff_t offset)
+{
+}
+
+static inline void frontswap_clear(struct swap_info_struct *sis, pgoff_t offset)
+{
+}
+
+static inline void frontswap_map_set(struct swap_info_struct *p,
+				     unsigned long *map)
+{
+}
+
+static inline unsigned long *frontswap_map_get(struct swap_info_struct *p)
+{
+	return NULL;
+}
+#endif
+
+static inline int frontswap_put_page(struct page *page)
+{
+	int ret = -1;
+
+	if (frontswap_enabled)
+		ret = __frontswap_put_page(page);
+	return ret;
+}
+
+static inline int frontswap_get_page(struct page *page)
+{
+	int ret = -1;
+
+	if (frontswap_enabled)
+		ret = __frontswap_get_page(page);
+	return ret;
+}
+
+static inline void frontswap_flush_page(unsigned type, pgoff_t offset)
+{
+	if (frontswap_enabled)
+		__frontswap_flush_page(type, offset);
+}
+
+static inline void frontswap_flush_area(unsigned type)
+{
+	if (frontswap_enabled)
+		__frontswap_flush_area(type);
+}
+
+static inline void frontswap_init(unsigned type)
+{
+	if (frontswap_enabled)
+		__frontswap_init(type);
+}
+
+#endif /* _LINUX_FRONTSWAP_H */
--- linux/mm/frontswap.c	1969-12-31 17:00:00.000000000 -0700
+++ frontswap/mm/frontswap.c	2011-08-23 08:20:09.774813309 -0600
@@ -0,0 +1,346 @@
+/*
+ * Frontswap frontend
+ *
+ * This code provides the generic "frontend" layer to call a matching
+ * "backend" driver implementation of frontswap.  See
+ * Documentation/vm/frontswap.txt for more information.
+ *
+ * Copyright (C) 2009-2010 Oracle Corp.  All rights reserved.
+ * Author: Dan Magenheimer
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ */
+
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/sysctl.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/proc_fs.h>
+#include <linux/security.h>
+#include <linux/capability.h>
+#include <linux/module.h>
+#include <linux/uaccess.h>
+#include <linux/frontswap.h>
+#include <linux/swapfile.h>
+
+/*
+ * frontswap_ops is set by frontswap_register_ops to contain the pointers
+ * to the frontswap "backend" implementation functions.
+ */
+static struct frontswap_ops frontswap_ops;
+
+/*
+ * This global enablement flag reduces overhead on systems where frontswap_ops
+ * has not been registered, so is preferred to the slower alternative: a
+ * function call that checks a non-global.
+ */
+int frontswap_enabled;
+EXPORT_SYMBOL(frontswap_enabled);
+
+/* useful stats available in /sys/kernel/mm/frontswap */
+static unsigned long frontswap_gets;
+static unsigned long frontswap_succ_puts;
+static unsigned long frontswap_failed_puts;
+static unsigned long frontswap_flushes;
+
+/*
+ * register operations for frontswap, returning previous thus allowing
+ * detection of multiple backends and possible nesting
+ */
+struct frontswap_ops frontswap_register_ops(struct frontswap_ops *ops)
+{
+	struct frontswap_ops old = frontswap_ops;
+
+	frontswap_ops = *ops;
+	frontswap_enabled = 1;
+	return old;
+}
+EXPORT_SYMBOL(frontswap_register_ops);
+
+/* Called when a swap device is swapon'd */
+void __frontswap_init(unsigned type)
+{
+	struct swap_info_struct *sis = swap_info[type];
+
+	BUG_ON(sis == NULL);
+	if (sis->frontswap_map == NULL)
+		return;
+	if (frontswap_enabled)
+		(*frontswap_ops.init)(type);
+}
+EXPORT_SYMBOL(__frontswap_init);
+
+/*
+ * "Put" data from a page to frontswap and associate it with the page's
+ * swaptype and offset.  Page must be locked and in the swap cache.
+ * If frontswap already contains a page with matching swaptype and
+ * offset, the frontswap implementation may either overwrite the data
+ * and return success or flush the page from frontswap and return failure
+ */
+int __frontswap_put_page(struct page *page)
+{
+	int ret = -1, dup = 0;
+	swp_entry_t entry = { .val = page_private(page), };
+	int type = swp_type(entry);
+	struct swap_info_struct *sis = swap_info[type];
+	pgoff_t offset = swp_offset(entry);
+
+	BUG_ON(!PageLocked(page));
+	BUG_ON(sis == NULL);
+	if (frontswap_test(sis, offset))
+		dup = 1;
+	ret = (*frontswap_ops.put_page)(type, offset, page);
+	if (ret == 0) {
+		frontswap_set(sis, offset);
+		frontswap_succ_puts++;
+		if (!dup)
+			sis->frontswap_pages++;
+	} else if (dup) {
+		/*
+		  failed dup always results in automatic flush of
+		  the (older) page from frontswap
+		 */
+		frontswap_clear(sis, offset);
+		sis->frontswap_pages--;
+		frontswap_failed_puts++;
+	} else
+		frontswap_failed_puts++;
+	return ret;
+}
+EXPORT_SYMBOL(__frontswap_put_page);
+
+/*
+ * "Get" data from frontswap associated with swaptype and offset that were
+ * specified when the data was put to frontswap and use it to fill the
+ * specified page with data. Page must be locked and in the swap cache
+ */
+int __frontswap_get_page(struct page *page)
+{
+	int ret = -1;
+	swp_entry_t entry = { .val = page_private(page), };
+	int type = swp_type(entry);
+	struct swap_info_struct *sis = swap_info[type];
+	pgoff_t offset = swp_offset(entry);
+
+	BUG_ON(!PageLocked(page));
+	BUG_ON(sis == NULL);
+	if (frontswap_test(sis, offset))
+		ret = (*frontswap_ops.get_page)(type, offset, page);
+	if (ret == 0)
+		frontswap_gets++;
+	return ret;
+}
+EXPORT_SYMBOL(__frontswap_get_page);
+
+/*
+ * Flush any data from frontswap associated with the specified swaptype
+ * and offset so that a subsequent "get" will fail.
+ */
+void __frontswap_flush_page(unsigned type, pgoff_t offset)
+{
+	struct swap_info_struct *sis = swap_info[type];
+
+	BUG_ON(sis == NULL);
+	if (frontswap_test(sis, offset)) {
+		(*frontswap_ops.flush_page)(type, offset);
+		sis->frontswap_pages--;
+		frontswap_clear(sis, offset);
+		frontswap_flushes++;
+	}
+}
+EXPORT_SYMBOL(__frontswap_flush_page);
+
+/*
+ * Flush all data from frontswap associated with all offsets for the
+ * specified swaptype so that subsequent "gets" on this type will fail.
+ */
+void __frontswap_flush_area(unsigned type)
+{
+	struct swap_info_struct *sis = swap_info[type];
+
+	BUG_ON(sis == NULL);
+	if (sis->frontswap_map == NULL)
+		return;
+	(*frontswap_ops.flush_area)(type);
+	sis->frontswap_pages = 0;
+	bitmap_zero(sis->frontswap_map, sis->max); /* map holds sis->max bits */
+}
+EXPORT_SYMBOL(__frontswap_flush_area);
+
+/*
+ * Frontswap, like a true swap device, may unnecessarily retain pages
+ * under certain circumstances; "shrink" frontswap is essentially a
+ * "partial swapoff" and works by calling try_to_unuse to attempt to
+ * unuse enough frontswap pages to attempt to -- subject to memory
+ * constraints -- reduce the number of pages in frontswap
+ */
+void frontswap_shrink(unsigned long target_pages)
+{
+	int wrapped = 0;
+	bool locked = false;
+
+	/* try a few times to maximize chance of try_to_unuse success */
+	for (wrapped = 0; wrapped < 3; wrapped++) {
+
+		struct swap_info_struct *si = NULL;
+		unsigned long total_pages = 0, total_pages_to_unuse;
+		unsigned long pages = 0, pages_to_unuse = 0;
+		int type;
+
+		/*
+		 * we don't want to hold swap_lock while doing a very
+		 * lengthy try_to_unuse, but swap_list may change
+		 * so restart scan from swap_list.head each time
+		 */
+		spin_lock(&swap_lock);
+		locked = true;
+		total_pages = 0;
+		for (type = swap_list.head; type >= 0; type = si->next) {
+			si = swap_info[type];
+			total_pages += si->frontswap_pages;
+		}
+		if (total_pages <= target_pages)
+			goto out;
+		total_pages_to_unuse = total_pages - target_pages;
+		for (type = swap_list.head; type >= 0; type = si->next) {
+			si = swap_info[type];
+			if (total_pages_to_unuse < si->frontswap_pages)
+				pages = pages_to_unuse = total_pages_to_unuse;
+			else {
+				pages = si->frontswap_pages;
+				pages_to_unuse = 0; /* unuse all */
+			}
+			if (security_vm_enough_memory_kern(pages))
+				continue;
+			vm_unacct_memory(pages);
+			break;
+		}
+		if (type < 0)
+			goto out;
+		locked = false;
+		spin_unlock(&swap_lock);
+		try_to_unuse(type, true, pages_to_unuse);
+	}
+
+out:
+	if (locked)
+		spin_unlock(&swap_lock);
+	return;
+}
+EXPORT_SYMBOL(frontswap_shrink);
+
+/*
+ * count and return the number of frontswap pages across all
+ * swap devices.  This is exported so that a kernel module can
+ * determine current usage without reading sysfs.
+ */
+unsigned long frontswap_curr_pages(void)
+{
+	int type;
+	unsigned long totalpages = 0;
+	struct swap_info_struct *si = NULL;
+
+	spin_lock(&swap_lock);
+	for (type = swap_list.head; type >= 0; type = si->next) {
+		si = swap_info[type];
+		if (si != NULL)
+			totalpages += si->frontswap_pages;
+	}
+	spin_unlock(&swap_lock);
+	return totalpages;
+}
+EXPORT_SYMBOL(frontswap_curr_pages);
+
+#ifdef CONFIG_SYSFS
+
+/* see Documentation/ABI/xxx/sysfs-kernel-mm-frontswap */
+
+#define FRONTSWAP_ATTR_RO(_name) \
+	static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
+#define FRONTSWAP_ATTR(_name) \
+	static struct kobj_attribute _name##_attr = \
+		__ATTR(_name, 0644, _name##_show, _name##_store)
+
+static ssize_t curr_pages_show(struct kobject *kobj,
+			       struct kobj_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%lu\n", frontswap_curr_pages());
+}
+
+static ssize_t curr_pages_store(struct kobject *kobj,
+			       struct kobj_attribute *attr,
+			       const char *buf, size_t count)
+{
+	unsigned long target_pages;
+
+	if (strict_strtoul(buf, 10, &target_pages))
+		return -EINVAL;
+
+	frontswap_shrink(target_pages);
+
+	return count;
+}
+FRONTSWAP_ATTR(curr_pages);
+
+static ssize_t succ_puts_show(struct kobject *kobj,
+			       struct kobj_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%lu\n", frontswap_succ_puts);
+}
+FRONTSWAP_ATTR_RO(succ_puts);
+
+static ssize_t failed_puts_show(struct kobject *kobj,
+			       struct kobj_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%lu\n", frontswap_failed_puts);
+}
+FRONTSWAP_ATTR_RO(failed_puts);
+
+static ssize_t gets_show(struct kobject *kobj,
+			       struct kobj_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%lu\n", frontswap_gets);
+}
+FRONTSWAP_ATTR_RO(gets);
+
+static ssize_t flushes_show(struct kobject *kobj,
+			       struct kobj_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%lu\n", frontswap_flushes);
+}
+FRONTSWAP_ATTR_RO(flushes);
+
+static struct attribute *frontswap_attrs[] = {
+	&curr_pages_attr.attr,
+	&succ_puts_attr.attr,
+	&failed_puts_attr.attr,
+	&gets_attr.attr,
+	&flushes_attr.attr,
+	NULL,
+};
+
+static struct attribute_group frontswap_attr_group = {
+	.attrs = frontswap_attrs,
+	.name = "frontswap",
+};
+
+#endif /* CONFIG_SYSFS */
+
+static int __init init_frontswap(void)
+{
+	int err = 0;
+
+#ifdef CONFIG_SYSFS
+	err = sysfs_create_group(mm_kobj, &frontswap_attr_group);
+#endif /* CONFIG_SYSFS */
+	return err;
+}
+
+static void __exit exit_frontswap(void)
+{
+	frontswap_shrink(0UL);
+}
+
+module_init(init_frontswap);
+module_exit(exit_frontswap);

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: Subject: [PATCH V7 2/4] mm: frontswap: core code
  2011-08-23 14:58 ` Dan Magenheimer
@ 2011-08-25  6:05   ` KAMEZAWA Hiroyuki
  -1 siblings, 0 replies; 18+ messages in thread
From: KAMEZAWA Hiroyuki @ 2011-08-25  6:05 UTC (permalink / raw)
  To: Dan Magenheimer
  Cc: linux-kernel, linux-mm, jeremy, hughd, ngupta, konrad.wilk,
	JBeulich, kurt.hackel, npiggin, akpm, riel, hannes, matthew,
	chris.mason, sjenning, jackdachef, cyclonusj

On Tue, 23 Aug 2011 07:58:15 -0700
Dan Magenheimer <dan.magenheimer@oracle.com> wrote:

> From: Dan Magenheimer <dan.magenheimer@oracle.com>
> Subject: [PATCH V7 2/4] mm: frontswap: core code
> 
> This second patch of four in this frontswap series provides the core code
> for frontswap that interfaces between the hooks in the swap subsystem and
> a frontswap backend via frontswap_ops.
> 
> Two new files are added: mm/frontswap.c and include/linux/frontswap.h
> 
> Credits: Frontswap_ops design derived from Jeremy Fitzhardinge
> design for tmem; sysfs code modelled after mm/ksm.c
> 
> [v7: rebase to 3.0-rc3]
> [v7: JBeulich@novell.com: new static inlines resolve to no-ops if not config'd]
> [v7: JBeulich@novell.com: avoid redundant shifts/divides for *_bit lib calls]
> [v6: rebase to 3.1-rc1]
> [v6: lliubbo@gmail.com: fix null pointer deref if vzalloc fails]
> [v6: konrad.wilk@oracl.com: various checks and code clarifications/comments]
> [v5: no change from v4]
> [v4: rebase to 2.6.39]
> Signed-off-by: Dan Magenheimer <dan.magenheimer@oracle.com>
> Reviewed-by: Konrad Wilk <konrad.wilk@oracle.com>
> Acked-by: Jan Beulich <JBeulich@novell.com>
> Acked-by: Seth Jennings <sjenning@linux.vnet.ibm.com>
> Cc: Jeremy Fitzhardinge <jeremy@goop.org>
> Cc: Hugh Dickins <hughd@google.com>
> Cc: Johannes Weiner <hannes@cmpxchg.org>
> Cc: Nitin Gupta <ngupta@vflare.org>
> Cc: Matthew Wilcox <matthew@wil.cx>
> Cc: Chris Mason <chris.mason@oracle.com>
> Cc: Rik Riel <riel@redhat.com>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> 

I think you should add diffstat...and add include changes to Makefile
in the same patch.

I have small questions.

> --- linux/include/linux/frontswap.h	1969-12-31 17:00:00.000000000 -0700
> +++ frontswap/include/linux/frontswap.h	2011-08-23 08:22:14.645692424 -0600
> @@ -0,0 +1,125 @@
> +#ifndef _LINUX_FRONTSWAP_H
> +#define _LINUX_FRONTSWAP_H
> +
> +#include <linux/swap.h>
> +#include <linux/mm.h>
> +
> +struct frontswap_ops {
> +	void (*init)(unsigned);
> +	int (*put_page)(unsigned, pgoff_t, struct page *);
> +	int (*get_page)(unsigned, pgoff_t, struct page *);
> +	void (*flush_page)(unsigned, pgoff_t);
> +	void (*flush_area)(unsigned);
> +};
> +
> +extern int frontswap_enabled;
> +extern struct frontswap_ops
> +	frontswap_register_ops(struct frontswap_ops *ops);
> +extern void frontswap_shrink(unsigned long);
> +extern unsigned long frontswap_curr_pages(void);
> +
> +extern void __frontswap_init(unsigned type);
> +extern int __frontswap_put_page(struct page *page);
> +extern int __frontswap_get_page(struct page *page);
> +extern void __frontswap_flush_page(unsigned, pgoff_t);
> +extern void __frontswap_flush_area(unsigned);
> +
> +#ifdef CONFIG_FRONTSWAP
> +
> +static inline int frontswap_test(struct swap_info_struct *sis, pgoff_t offset)
> +{
> +	int ret = 0;
> +
> +	if (frontswap_enabled && sis->frontswap_map)
> +		ret = test_bit(offset, sis->frontswap_map);
> +	return ret;
> +}
> +
> +static inline void frontswap_set(struct swap_info_struct *sis, pgoff_t offset)
> +{
> +	if (frontswap_enabled && sis->frontswap_map)
> +		set_bit(offset, sis->frontswap_map);
> +}
> +
> +static inline void frontswap_clear(struct swap_info_struct *sis, pgoff_t offset)
> +{
> +	if (frontswap_enabled && sis->frontswap_map)
> +		clear_bit(offset, sis->frontswap_map);
> +}
> +
> +static inline void frontswap_map_set(struct swap_info_struct *p,
> +				     unsigned long *map)
> +{
> +	p->frontswap_map = map;
> +}
> +
> +static inline unsigned long *frontswap_map_get(struct swap_info_struct *p)
> +{
> +	return p->frontswap_map;
> +}
> +#else
> +/* all inline routines become no-ops and all externs are ignored */
> +
> +#define frontswap_enabled (0)
> +
> +static inline int frontswap_test(struct swap_info_struct *sis, pgoff_t offset)
> +{
> +	return 0;
> +}
> +
> +static inline void frontswap_set(struct swap_info_struct *sis, pgoff_t offset)
> +{
> +}
> +
> +static inline void frontswap_clear(struct swap_info_struct *sis, pgoff_t offset)
> +{
> +}
> +
> +static inline void frontswap_map_set(struct swap_info_struct *p,
> +				     unsigned long *map)
> +{
> +}
> +
> +static inline unsigned long *frontswap_map_get(struct swap_info_struct *p)
> +{
> +	return NULL;
> +}
> +#endif
> +
> +static inline int frontswap_put_page(struct page *page)
> +{
> +	int ret = -1;
> +
> +	if (frontswap_enabled)
> +		ret = __frontswap_put_page(page);
> +	return ret;
> +}
> +
> +static inline int frontswap_get_page(struct page *page)
> +{
> +	int ret = -1;
> +
> +	if (frontswap_enabled)
> +		ret = __frontswap_get_page(page);
> +	return ret;
> +}
> +
> +static inline void frontswap_flush_page(unsigned type, pgoff_t offset)
> +{
> +	if (frontswap_enabled)
> +		__frontswap_flush_page(type, offset);
> +}
> +
> +static inline void frontswap_flush_area(unsigned type)
> +{
> +	if (frontswap_enabled)
> +		__frontswap_flush_area(type);
> +}
> +
> +static inline void frontswap_init(unsigned type)
> +{
> +	if (frontswap_enabled)
> +		__frontswap_init(type);
> +}
> +
> +#endif /* _LINUX_FRONTSWAP_H */
> --- linux/mm/frontswap.c	1969-12-31 17:00:00.000000000 -0700
> +++ frontswap/mm/frontswap.c	2011-08-23 08:20:09.774813309 -0600
> @@ -0,0 +1,346 @@
> +/*
> + * Frontswap frontend
> + *
> + * This code provides the generic "frontend" layer to call a matching
> + * "backend" driver implementation of frontswap.  See
> + * Documentation/vm/frontswap.txt for more information.
> + *
> + * Copyright (C) 2009-2010 Oracle Corp.  All rights reserved.
> + * Author: Dan Magenheimer
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2.
> + */
> +
> +#include <linux/mm.h>
> +#include <linux/mman.h>
> +#include <linux/sysctl.h>
> +#include <linux/swap.h>
> +#include <linux/swapops.h>
> +#include <linux/proc_fs.h>
> +#include <linux/security.h>
> +#include <linux/capability.h>
> +#include <linux/module.h>
> +#include <linux/uaccess.h>
> +#include <linux/frontswap.h>
> +#include <linux/swapfile.h>
> +
> +/*
> + * frontswap_ops is set by frontswap_register_ops to contain the pointers
> + * to the frontswap "backend" implementation functions.
> + */
> +static struct frontswap_ops frontswap_ops;
> +

Hmm, only one frontswap_ops can be registered to the system ?
Then...why it required to be registered ? This just comes from problem in
coding ?

BTW, Do I have a chance to implement frontswap accounting per cgroup
(under memcg) ? Or do I need an enable/disable switch for frontswap per memcg ?
Do you think it is worth to do ?



> +/*
> + * This global enablement flag reduces overhead on systems where frontswap_ops
> + * has not been registered, so is preferred to the slower alternative: a
> + * function call that checks a non-global.
> + */
> +int frontswap_enabled;
> +EXPORT_SYMBOL(frontswap_enabled);
> +
> +/* useful stats available in /sys/kernel/mm/frontswap */
> +static unsigned long frontswap_gets;
> +static unsigned long frontswap_succ_puts;
> +static unsigned long frontswap_failed_puts;
> +static unsigned long frontswap_flushes;
> +

What lock guard these ? swap_lock ?

> +/*
> + * register operations for frontswap, returning previous thus allowing
> + * detection of multiple backends and possible nesting
> + */
> +struct frontswap_ops frontswap_register_ops(struct frontswap_ops *ops)
> +{
> +	struct frontswap_ops old = frontswap_ops;
> +
> +	frontswap_ops = *ops;
> +	frontswap_enabled = 1;
> +	return old;
> +}
> +EXPORT_SYMBOL(frontswap_register_ops);
> +

No lock ? and there is no unregister_ops() ?


> +/* Called when a swap device is swapon'd */
> +void __frontswap_init(unsigned type)
> +{
> +	struct swap_info_struct *sis = swap_info[type];
> +
> +	BUG_ON(sis == NULL);
> +	if (sis->frontswap_map == NULL)
> +		return;
> +	if (frontswap_enabled)
> +		(*frontswap_ops.init)(type);
> +}
> +EXPORT_SYMBOL(__frontswap_init);
> +
> +/*
> + * "Put" data from a page to frontswap and associate it with the page's
> + * swaptype and offset.  Page must be locked and in the swap cache.
> + * If frontswap already contains a page with matching swaptype and
> + * offset, the frontswap implmentation may either overwrite the data
> + * and return success or flush the page from frontswap and return failure
> + */

What lock should be held to guard global variables ? swap_lock ?


> +int __frontswap_put_page(struct page *page)
> +{
> +	int ret = -1, dup = 0;
> +	swp_entry_t entry = { .val = page_private(page), };
> +	int type = swp_type(entry);
> +	struct swap_info_struct *sis = swap_info[type];
> +	pgoff_t offset = swp_offset(entry);
> +
> +	BUG_ON(!PageLocked(page));
> +	BUG_ON(sis == NULL);
> +	if (frontswap_test(sis, offset))
> +		dup = 1;
> +	ret = (*frontswap_ops.put_page)(type, offset, page);
> +	if (ret == 0) {
> +		frontswap_set(sis, offset);
> +		frontswap_succ_puts++;
> +		if (!dup)
> +			sis->frontswap_pages++;
> +	} else if (dup) {
> +		/*
> +		  failed dup always results in automatic flush of
> +		  the (older) page from frontswap
> +		 */
> +		frontswap_clear(sis, offset);
> +		sis->frontswap_pages--;
> +		frontswap_failed_puts++;
> +	} else
> +		frontswap_failed_puts++;
> +	return ret;
> +}
> +EXPORT_SYMBOL(__frontswap_put_page);
> +
> +/*
> + * "Get" data from frontswap associated with swaptype and offset that were
> + * specified when the data was put to frontswap and use it to fill the
> + * specified page with data. Page must be locked and in the swap cache
> + */
> +int __frontswap_get_page(struct page *page)
> +{
> +	int ret = -1;
> +	swp_entry_t entry = { .val = page_private(page), };
> +	int type = swp_type(entry);
> +	struct swap_info_struct *sis = swap_info[type];
> +	pgoff_t offset = swp_offset(entry);
> +
> +	BUG_ON(!PageLocked(page));
> +	BUG_ON(sis == NULL);
> +	if (frontswap_test(sis, offset))
> +		ret = (*frontswap_ops.get_page)(type, offset, page);
> +	if (ret == 0)
> +		frontswap_gets++;
> +	return ret;
> +}
> +EXPORT_SYMBOL(__frontswap_get_page);
> +
> +/*
> + * Flush any data from frontswap associated with the specified swaptype
> + * and offset so that a subsequent "get" will fail.
> + */
> +void __frontswap_flush_page(unsigned type, pgoff_t offset)
> +{
> +	struct swap_info_struct *sis = swap_info[type];
> +
> +	BUG_ON(sis == NULL);
> +	if (frontswap_test(sis, offset)) {
> +		(*frontswap_ops.flush_page)(type, offset);
> +		sis->frontswap_pages--;
> +		frontswap_clear(sis, offset);
> +		frontswap_flushes++;
> +	}
> +}
> +EXPORT_SYMBOL(__frontswap_flush_page);
> +
> +/*
> + * Flush all data from frontswap associated with all offsets for the
> + * specified swaptype.
> + */
> +void __frontswap_flush_area(unsigned type)
> +{
> +	struct swap_info_struct *sis = swap_info[type];
> +
> +	BUG_ON(sis == NULL);
> +	if (sis->frontswap_map == NULL)
> +		return;
> +	(*frontswap_ops.flush_area)(type);
> +	sis->frontswap_pages = 0;
> +	memset(sis->frontswap_map, 0, sis->max / sizeof(long));
> +}
> +EXPORT_SYMBOL(__frontswap_flush_area);
> +
> +/*
> + * Frontswap, like a true swap device, may unnecessarily retain pages
> + * under certain circumstances; "shrink" frontswap is essentially a
> + * "partial swapoff" and works by calling try_to_unuse to attempt to
> + * unuse enough frontswap pages to attempt to -- subject to memory
> + * constraints -- reduce the number of pages in frontswap
> + */
> +void frontswap_shrink(unsigned long target_pages)
> +{
> +	int wrapped = 0;
> +	bool locked = false;
> +
> +	/* try a few times to maximize chance of try_to_unuse success */
> +	for (wrapped = 0; wrapped < 3; wrapped++) {
> +
> +		struct swap_info_struct *si = NULL;
> +		unsigned long total_pages = 0, total_pages_to_unuse;
> +		unsigned long pages = 0, pages_to_unuse = 0;
> +		int type;
> +
> +		/*
> +		 * we don't want to hold swap_lock while doing a very
> +		 * lengthy try_to_unuse, but swap_list may change
> +		 * so restart scan from swap_list.head each time
> +		 */
> +		spin_lock(&swap_lock);
> +		locked = true;
> +		total_pages = 0;
> +		for (type = swap_list.head; type >= 0; type = si->next) {
> +			si = swap_info[type];
> +			total_pages += si->frontswap_pages;
> +		}
> +		if (total_pages <= target_pages)
> +			goto out;
> +		total_pages_to_unuse = total_pages - target_pages;
> +		for (type = swap_list.head; type >= 0; type = si->next) {
> +			si = swap_info[type];
> +			if (total_pages_to_unuse < si->frontswap_pages)
> +				pages = pages_to_unuse = total_pages_to_unuse;
> +			else {
> +				pages = si->frontswap_pages;
> +				pages_to_unuse = 0; /* unuse all */
> +			}
> +			if (security_vm_enough_memory_kern(pages))
> +				continue;
> +			vm_unacct_memory(pages);
> +			break;
> +		}
> +		if (type < 0)
> +			goto out;
> +		locked = false;
> +		spin_unlock(&swap_lock);
> +		try_to_unuse(type, true, pages_to_unuse);
> +	}
> +
> +out:
> +	if (locked)
> +		spin_unlock(&swap_lock);
> +	return;
> +}
> +EXPORT_SYMBOL(frontswap_shrink);
> +
> +/*
> + * count and return the number of pages frontswap pages across all
> + * swap devices.  This is exported so that a kernel module can
> + * determine current usage without reading sysfs.
> + */
> +unsigned long frontswap_curr_pages(void)
> +{
> +	int type;
> +	unsigned long totalpages = 0;
> +	struct swap_info_struct *si = NULL;
> +
> +	spin_lock(&swap_lock);
> +	for (type = swap_list.head; type >= 0; type = si->next) {
> +		si = swap_info[type];
> +		if (si != NULL)
> +			totalpages += si->frontswap_pages;
> +	}
> +	spin_unlock(&swap_lock);
> +	return totalpages;
> +}
> +EXPORT_SYMBOL(frontswap_curr_pages);
> +
> +#ifdef CONFIG_SYSFS
> +
> +/* see Documentation/ABI/xxx/sysfs-kernel-mm-frontswap */
> +
> +#define FRONTSWAP_ATTR_RO(_name) \
> +	static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
> +#define FRONTSWAP_ATTR(_name) \
> +	static struct kobj_attribute _name##_attr = \
> +		__ATTR(_name, 0644, _name##_show, _name##_store)
> +
> +static ssize_t curr_pages_show(struct kobject *kobj,
> +			       struct kobj_attribute *attr, char *buf)
> +{
> +	return sprintf(buf, "%lu\n", frontswap_curr_pages());
> +}
> +
> +static ssize_t curr_pages_store(struct kobject *kobj,
> +			       struct kobj_attribute *attr,
> +			       const char *buf, size_t count)
> +{
> +	unsigned long target_pages;
> +
> +	if (strict_strtoul(buf, 10, &target_pages))
> +		return -EINVAL;
> +
> +	frontswap_shrink(target_pages);
> +
> +	return count;
> +}
> +FRONTSWAP_ATTR(curr_pages);
> +
> +static ssize_t succ_puts_show(struct kobject *kobj,
> +			       struct kobj_attribute *attr, char *buf)
> +{
> +	return sprintf(buf, "%lu\n", frontswap_succ_puts);
> +}
> +FRONTSWAP_ATTR_RO(succ_puts);
> +
> +static ssize_t failed_puts_show(struct kobject *kobj,
> +			       struct kobj_attribute *attr, char *buf)
> +{
> +	return sprintf(buf, "%lu\n", frontswap_failed_puts);
> +}
> +FRONTSWAP_ATTR_RO(failed_puts);
> +
> +static ssize_t gets_show(struct kobject *kobj,
> +			       struct kobj_attribute *attr, char *buf)
> +{
> +	return sprintf(buf, "%lu\n", frontswap_gets);
> +}
> +FRONTSWAP_ATTR_RO(gets);
> +
> +static ssize_t flushes_show(struct kobject *kobj,
> +			       struct kobj_attribute *attr, char *buf)
> +{
> +	return sprintf(buf, "%lu\n", frontswap_flushes);
> +}
> +FRONTSWAP_ATTR_RO(flushes);
> +
> +static struct attribute *frontswap_attrs[] = {
> +	&curr_pages_attr.attr,
> +	&succ_puts_attr.attr,
> +	&failed_puts_attr.attr,
> +	&gets_attr.attr,
> +	&flushes_attr.attr,
> +	NULL,
> +};
> +
> +static struct attribute_group frontswap_attr_group = {
> +	.attrs = frontswap_attrs,
> +	.name = "frontswap",
> +};
> +
> +#endif /* CONFIG_SYSFS */
> +
> +static int __init init_frontswap(void)
> +{
> +	int err = 0;
> +
> +#ifdef CONFIG_SYSFS
> +	err = sysfs_create_group(mm_kobj, &frontswap_attr_group);
> +#endif /* CONFIG_SYSFS */
> +	return err;
> +}
> +
> +static void __exit exit_frontswap(void)
> +{
> +	frontswap_shrink(0UL);
> +}
> +
> +module_init(init_frontswap);
> +module_exit(exit_frontswap);


I'm sorry if I missed it in the code, but.... is an implementation of frontswap_ops included
in this patch set ? If not, how can this work be tested ?

Thanks,
-Kame









^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: Subject: [PATCH V7 2/4] mm: frontswap: core code
@ 2011-08-25  6:05   ` KAMEZAWA Hiroyuki
  0 siblings, 0 replies; 18+ messages in thread
From: KAMEZAWA Hiroyuki @ 2011-08-25  6:05 UTC (permalink / raw)
  To: Dan Magenheimer
  Cc: linux-kernel, linux-mm, jeremy, hughd, ngupta, konrad.wilk,
	JBeulich, kurt.hackel, npiggin, akpm, riel, hannes, matthew,
	chris.mason, sjenning, jackdachef, cyclonusj

On Tue, 23 Aug 2011 07:58:15 -0700
Dan Magenheimer <dan.magenheimer@oracle.com> wrote:

> From: Dan Magenheimer <dan.magenheimer@oracle.com>
> Subject: [PATCH V7 2/4] mm: frontswap: core code
> 
> This second patch of four in this frontswap series provides the core code
> for frontswap that interfaces between the hooks in the swap subsystem and
> a frontswap backend via frontswap_ops.
> 
> Two new files are added: mm/frontswap.c and include/linux/frontswap.h
> 
> Credits: Frontswap_ops design derived from Jeremy Fitzhardinge
> design for tmem; sysfs code modelled after mm/ksm.c
> 
> [v7: rebase to 3.0-rc3]
> [v7: JBeulich@novell.com: new static inlines resolve to no-ops if not config'd]
> [v7: JBeulich@novell.com: avoid redundant shifts/divides for *_bit lib calls]
> [v6: rebase to 3.1-rc1]
> [v6: lliubbo@gmail.com: fix null pointer deref if vzalloc fails]
> [v6: konrad.wilk@oracl.com: various checks and code clarifications/comments]
> [v5: no change from v4]
> [v4: rebase to 2.6.39]
> Signed-off-by: Dan Magenheimer <dan.magenheimer@oracle.com>
> Reviewed-by: Konrad Wilk <konrad.wilk@oracle.com>
> Acked-by: Jan Beulich <JBeulich@novell.com>
> Acked-by: Seth Jennings <sjenning@linux.vnet.ibm.com>
> Cc: Jeremy Fitzhardinge <jeremy@goop.org>
> Cc: Hugh Dickins <hughd@google.com>
> Cc: Johannes Weiner <hannes@cmpxchg.org>
> Cc: Nitin Gupta <ngupta@vflare.org>
> Cc: Matthew Wilcox <matthew@wil.cx>
> Cc: Chris Mason <chris.mason@oracle.com>
> Cc: Rik Riel <riel@redhat.com>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> 

I think you should add diffstat...and add include changes to Makefile
in the same patch.

I have small questions.

> --- linux/include/linux/frontswap.h	1969-12-31 17:00:00.000000000 -0700
> +++ frontswap/include/linux/frontswap.h	2011-08-23 08:22:14.645692424 -0600
> @@ -0,0 +1,125 @@
> +#ifndef _LINUX_FRONTSWAP_H
> +#define _LINUX_FRONTSWAP_H
> +
> +#include <linux/swap.h>
> +#include <linux/mm.h>
> +
> +struct frontswap_ops {
> +	void (*init)(unsigned);
> +	int (*put_page)(unsigned, pgoff_t, struct page *);
> +	int (*get_page)(unsigned, pgoff_t, struct page *);
> +	void (*flush_page)(unsigned, pgoff_t);
> +	void (*flush_area)(unsigned);
> +};
> +
> +extern int frontswap_enabled;
> +extern struct frontswap_ops
> +	frontswap_register_ops(struct frontswap_ops *ops);
> +extern void frontswap_shrink(unsigned long);
> +extern unsigned long frontswap_curr_pages(void);
> +
> +extern void __frontswap_init(unsigned type);
> +extern int __frontswap_put_page(struct page *page);
> +extern int __frontswap_get_page(struct page *page);
> +extern void __frontswap_flush_page(unsigned, pgoff_t);
> +extern void __frontswap_flush_area(unsigned);
> +
> +#ifdef CONFIG_FRONTSWAP
> +
> +static inline int frontswap_test(struct swap_info_struct *sis, pgoff_t offset)
> +{
> +	int ret = 0;
> +
> +	if (frontswap_enabled && sis->frontswap_map)
> +		ret = test_bit(offset, sis->frontswap_map);
> +	return ret;
> +}
> +
> +static inline void frontswap_set(struct swap_info_struct *sis, pgoff_t offset)
> +{
> +	if (frontswap_enabled && sis->frontswap_map)
> +		set_bit(offset, sis->frontswap_map);
> +}
> +
> +static inline void frontswap_clear(struct swap_info_struct *sis, pgoff_t offset)
> +{
> +	if (frontswap_enabled && sis->frontswap_map)
> +		clear_bit(offset, sis->frontswap_map);
> +}
> +
> +static inline void frontswap_map_set(struct swap_info_struct *p,
> +				     unsigned long *map)
> +{
> +	p->frontswap_map = map;
> +}
> +
> +static inline unsigned long *frontswap_map_get(struct swap_info_struct *p)
> +{
> +	return p->frontswap_map;
> +}
> +#else
> +/* all inline routines become no-ops and all externs are ignored */
> +
> +#define frontswap_enabled (0)
> +
> +static inline int frontswap_test(struct swap_info_struct *sis, pgoff_t offset)
> +{
> +	return 0;
> +}
> +
> +static inline void frontswap_set(struct swap_info_struct *sis, pgoff_t offset)
> +{
> +}
> +
> +static inline void frontswap_clear(struct swap_info_struct *sis, pgoff_t offset)
> +{
> +}
> +
> +static inline void frontswap_map_set(struct swap_info_struct *p,
> +				     unsigned long *map)
> +{
> +}
> +
> +static inline unsigned long *frontswap_map_get(struct swap_info_struct *p)
> +{
> +	return NULL;
> +}
> +#endif
> +
> +static inline int frontswap_put_page(struct page *page)
> +{
> +	int ret = -1;
> +
> +	if (frontswap_enabled)
> +		ret = __frontswap_put_page(page);
> +	return ret;
> +}
> +
> +static inline int frontswap_get_page(struct page *page)
> +{
> +	int ret = -1;
> +
> +	if (frontswap_enabled)
> +		ret = __frontswap_get_page(page);
> +	return ret;
> +}
> +
> +static inline void frontswap_flush_page(unsigned type, pgoff_t offset)
> +{
> +	if (frontswap_enabled)
> +		__frontswap_flush_page(type, offset);
> +}
> +
> +static inline void frontswap_flush_area(unsigned type)
> +{
> +	if (frontswap_enabled)
> +		__frontswap_flush_area(type);
> +}
> +
> +static inline void frontswap_init(unsigned type)
> +{
> +	if (frontswap_enabled)
> +		__frontswap_init(type);
> +}
> +
> +#endif /* _LINUX_FRONTSWAP_H */
> --- linux/mm/frontswap.c	1969-12-31 17:00:00.000000000 -0700
> +++ frontswap/mm/frontswap.c	2011-08-23 08:20:09.774813309 -0600
> @@ -0,0 +1,346 @@
> +/*
> + * Frontswap frontend
> + *
> + * This code provides the generic "frontend" layer to call a matching
> + * "backend" driver implementation of frontswap.  See
> + * Documentation/vm/frontswap.txt for more information.
> + *
> + * Copyright (C) 2009-2010 Oracle Corp.  All rights reserved.
> + * Author: Dan Magenheimer
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2.
> + */
> +
> +#include <linux/mm.h>
> +#include <linux/mman.h>
> +#include <linux/sysctl.h>
> +#include <linux/swap.h>
> +#include <linux/swapops.h>
> +#include <linux/proc_fs.h>
> +#include <linux/security.h>
> +#include <linux/capability.h>
> +#include <linux/module.h>
> +#include <linux/uaccess.h>
> +#include <linux/frontswap.h>
> +#include <linux/swapfile.h>
> +
> +/*
> + * frontswap_ops is set by frontswap_register_ops to contain the pointers
> + * to the frontswap "backend" implementation functions.
> + */
> +static struct frontswap_ops frontswap_ops;
> +

Hmm, only one frontswap_ops can be registered to the system ?
Then...why it required to be registered ? This just comes from problem in
coding ?

BTW, Do I have a chance to implement frontswap accounting per cgroup
(under memcg) ? Or do I need an enable/disable switch for frontswap per memcg ?
Do you think it is worth to do ?



> +/*
> + * This global enablement flag reduces overhead on systems where frontswap_ops
> + * has not been registered, so is preferred to the slower alternative: a
> + * function call that checks a non-global.
> + */
> +int frontswap_enabled;
> +EXPORT_SYMBOL(frontswap_enabled);
> +
> +/* useful stats available in /sys/kernel/mm/frontswap */
> +static unsigned long frontswap_gets;
> +static unsigned long frontswap_succ_puts;
> +static unsigned long frontswap_failed_puts;
> +static unsigned long frontswap_flushes;
> +

What lock guard these ? swap_lock ?

> +/*
> + * register operations for frontswap, returning previous thus allowing
> + * detection of multiple backends and possible nesting
> + */
> +struct frontswap_ops frontswap_register_ops(struct frontswap_ops *ops)
> +{
> +	struct frontswap_ops old = frontswap_ops;
> +
> +	frontswap_ops = *ops;
> +	frontswap_enabled = 1;
> +	return old;
> +}
> +EXPORT_SYMBOL(frontswap_register_ops);
> +

No lock ? and there is no unregister_ops() ?


> +/* Called when a swap device is swapon'd */
> +void __frontswap_init(unsigned type)
> +{
> +	struct swap_info_struct *sis = swap_info[type];
> +
> +	BUG_ON(sis == NULL);
> +	if (sis->frontswap_map == NULL)
> +		return;
> +	if (frontswap_enabled)
> +		(*frontswap_ops.init)(type);
> +}
> +EXPORT_SYMBOL(__frontswap_init);
> +
> +/*
> + * "Put" data from a page to frontswap and associate it with the page's
> + * swaptype and offset.  Page must be locked and in the swap cache.
> + * If frontswap already contains a page with matching swaptype and
> + * offset, the frontswap implmentation may either overwrite the data
> + * and return success or flush the page from frontswap and return failure
> + */

What lock should be held to guard global variables ? swap_lock ?


> +int __frontswap_put_page(struct page *page)
> +{
> +	int ret = -1, dup = 0;
> +	swp_entry_t entry = { .val = page_private(page), };
> +	int type = swp_type(entry);
> +	struct swap_info_struct *sis = swap_info[type];
> +	pgoff_t offset = swp_offset(entry);
> +
> +	BUG_ON(!PageLocked(page));
> +	BUG_ON(sis == NULL);
> +	if (frontswap_test(sis, offset))
> +		dup = 1;
> +	ret = (*frontswap_ops.put_page)(type, offset, page);
> +	if (ret == 0) {
> +		frontswap_set(sis, offset);
> +		frontswap_succ_puts++;
> +		if (!dup)
> +			sis->frontswap_pages++;
> +	} else if (dup) {
> +		/*
> +		  failed dup always results in automatic flush of
> +		  the (older) page from frontswap
> +		 */
> +		frontswap_clear(sis, offset);
> +		sis->frontswap_pages--;
> +		frontswap_failed_puts++;
> +	} else
> +		frontswap_failed_puts++;
> +	return ret;
> +}
> +EXPORT_SYMBOL(__frontswap_put_page);
> +
> +/*
> + * "Get" data from frontswap associated with swaptype and offset that were
> + * specified when the data was put to frontswap and use it to fill the
> + * specified page with data. Page must be locked and in the swap cache
> + */
> +int __frontswap_get_page(struct page *page)
> +{
> +	int ret = -1;
> +	swp_entry_t entry = { .val = page_private(page), };
> +	int type = swp_type(entry);
> +	struct swap_info_struct *sis = swap_info[type];
> +	pgoff_t offset = swp_offset(entry);
> +
> +	BUG_ON(!PageLocked(page));
> +	BUG_ON(sis == NULL);
> +	if (frontswap_test(sis, offset))
> +		ret = (*frontswap_ops.get_page)(type, offset, page);
> +	if (ret == 0)
> +		frontswap_gets++;
> +	return ret;
> +}
> +EXPORT_SYMBOL(__frontswap_get_page);
> +
> +/*
> + * Flush any data from frontswap associated with the specified swaptype
> + * and offset so that a subsequent "get" will fail.
> + */
> +void __frontswap_flush_page(unsigned type, pgoff_t offset)
> +{
> +	struct swap_info_struct *sis = swap_info[type];
> +
> +	BUG_ON(sis == NULL);
> +	if (frontswap_test(sis, offset)) {
> +		(*frontswap_ops.flush_page)(type, offset);
> +		sis->frontswap_pages--;
> +		frontswap_clear(sis, offset);
> +		frontswap_flushes++;
> +	}
> +}
> +EXPORT_SYMBOL(__frontswap_flush_page);
> +
> +/*
> + * Flush all data from frontswap associated with all offsets for the
> + * specified swaptype.
> + */
> +void __frontswap_flush_area(unsigned type)
> +{
> +	struct swap_info_struct *sis = swap_info[type];
> +
> +	BUG_ON(sis == NULL);
> +	if (sis->frontswap_map == NULL)
> +		return;
> +	(*frontswap_ops.flush_area)(type);
> +	sis->frontswap_pages = 0;
> +	memset(sis->frontswap_map, 0, sis->max / sizeof(long));
> +}
> +EXPORT_SYMBOL(__frontswap_flush_area);
> +
> +/*
> + * Frontswap, like a true swap device, may unnecessarily retain pages
> + * under certain circumstances; "shrink" frontswap is essentially a
> + * "partial swapoff" and works by calling try_to_unuse to attempt to
> + * unuse enough frontswap pages to attempt to -- subject to memory
> + * constraints -- reduce the number of pages in frontswap
> + */
> +void frontswap_shrink(unsigned long target_pages)
> +{
> +	int wrapped = 0;
> +	bool locked = false;
> +
> +	/* try a few times to maximize chance of try_to_unuse success */
> +	for (wrapped = 0; wrapped < 3; wrapped++) {
> +
> +		struct swap_info_struct *si = NULL;
> +		unsigned long total_pages = 0, total_pages_to_unuse;
> +		unsigned long pages = 0, pages_to_unuse = 0;
> +		int type;
> +
> +		/*
> +		 * we don't want to hold swap_lock while doing a very
> +		 * lengthy try_to_unuse, but swap_list may change
> +		 * so restart scan from swap_list.head each time
> +		 */
> +		spin_lock(&swap_lock);
> +		locked = true;
> +		total_pages = 0;
> +		for (type = swap_list.head; type >= 0; type = si->next) {
> +			si = swap_info[type];
> +			total_pages += si->frontswap_pages;
> +		}
> +		if (total_pages <= target_pages)
> +			goto out;
> +		total_pages_to_unuse = total_pages - target_pages;
> +		for (type = swap_list.head; type >= 0; type = si->next) {
> +			si = swap_info[type];
> +			if (total_pages_to_unuse < si->frontswap_pages)
> +				pages = pages_to_unuse = total_pages_to_unuse;
> +			else {
> +				pages = si->frontswap_pages;
> +				pages_to_unuse = 0; /* unuse all */
> +			}
> +			if (security_vm_enough_memory_kern(pages))
> +				continue;
> +			vm_unacct_memory(pages);
> +			break;
> +		}
> +		if (type < 0)
> +			goto out;
> +		locked = false;
> +		spin_unlock(&swap_lock);
> +		try_to_unuse(type, true, pages_to_unuse);
> +	}
> +
> +out:
> +	if (locked)
> +		spin_unlock(&swap_lock);
> +	return;
> +}
> +EXPORT_SYMBOL(frontswap_shrink);
> +
> +/*
> + * count and return the number of pages frontswap pages across all
> + * swap devices.  This is exported so that a kernel module can
> + * determine current usage without reading sysfs.
> + */
> +unsigned long frontswap_curr_pages(void)
> +{
> +	int type;
> +	unsigned long totalpages = 0;
> +	struct swap_info_struct *si = NULL;
> +
> +	spin_lock(&swap_lock);
> +	for (type = swap_list.head; type >= 0; type = si->next) {
> +		si = swap_info[type];
> +		if (si != NULL)
> +			totalpages += si->frontswap_pages;
> +	}
> +	spin_unlock(&swap_lock);
> +	return totalpages;
> +}
> +EXPORT_SYMBOL(frontswap_curr_pages);
> +
> +#ifdef CONFIG_SYSFS
> +
> +/* see Documentation/ABI/xxx/sysfs-kernel-mm-frontswap */
> +
> +#define FRONTSWAP_ATTR_RO(_name) \
> +	static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
> +#define FRONTSWAP_ATTR(_name) \
> +	static struct kobj_attribute _name##_attr = \
> +		__ATTR(_name, 0644, _name##_show, _name##_store)
> +
> +static ssize_t curr_pages_show(struct kobject *kobj,
> +			       struct kobj_attribute *attr, char *buf)
> +{
> +	return sprintf(buf, "%lu\n", frontswap_curr_pages());
> +}
> +
> +static ssize_t curr_pages_store(struct kobject *kobj,
> +			       struct kobj_attribute *attr,
> +			       const char *buf, size_t count)
> +{
> +	unsigned long target_pages;
> +
> +	if (strict_strtoul(buf, 10, &target_pages))
> +		return -EINVAL;
> +
> +	frontswap_shrink(target_pages);
> +
> +	return count;
> +}
> +FRONTSWAP_ATTR(curr_pages);
> +
> +static ssize_t succ_puts_show(struct kobject *kobj,
> +			       struct kobj_attribute *attr, char *buf)
> +{
> +	return sprintf(buf, "%lu\n", frontswap_succ_puts);
> +}
> +FRONTSWAP_ATTR_RO(succ_puts);
> +
> +static ssize_t failed_puts_show(struct kobject *kobj,
> +			       struct kobj_attribute *attr, char *buf)
> +{
> +	return sprintf(buf, "%lu\n", frontswap_failed_puts);
> +}
> +FRONTSWAP_ATTR_RO(failed_puts);
> +
> +static ssize_t gets_show(struct kobject *kobj,
> +			       struct kobj_attribute *attr, char *buf)
> +{
> +	return sprintf(buf, "%lu\n", frontswap_gets);
> +}
> +FRONTSWAP_ATTR_RO(gets);
> +
> +static ssize_t flushes_show(struct kobject *kobj,
> +			       struct kobj_attribute *attr, char *buf)
> +{
> +	return sprintf(buf, "%lu\n", frontswap_flushes);
> +}
> +FRONTSWAP_ATTR_RO(flushes);
> +
> +static struct attribute *frontswap_attrs[] = {
> +	&curr_pages_attr.attr,
> +	&succ_puts_attr.attr,
> +	&failed_puts_attr.attr,
> +	&gets_attr.attr,
> +	&flushes_attr.attr,
> +	NULL,
> +};
> +
> +static struct attribute_group frontswap_attr_group = {
> +	.attrs = frontswap_attrs,
> +	.name = "frontswap",
> +};
> +
> +#endif /* CONFIG_SYSFS */
> +
> +static int __init init_frontswap(void)
> +{
> +	int err = 0;
> +
> +#ifdef CONFIG_SYSFS
> +	err = sysfs_create_group(mm_kobj, &frontswap_attr_group);
> +#endif /* CONFIG_SYSFS */
> +	return err;
> +}
> +
> +static void __exit exit_frontswap(void)
> +{
> +	frontswap_shrink(0UL);
> +}
> +
> +module_init(init_frontswap);
> +module_exit(exit_frontswap);


I'm sorry if I missed it in the code, but.... is an implementation of frontswap_ops included
in this patch set ? If not, how can this work be tested ?

Thanks,
-Kame








--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: Subject: [PATCH V7 2/4] mm: frontswap: core code
  2011-08-25  6:05   ` KAMEZAWA Hiroyuki
@ 2011-08-25 13:29     ` Seth Jennings
  -1 siblings, 0 replies; 18+ messages in thread
From: Seth Jennings @ 2011-08-25 13:29 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Dan Magenheimer, linux-kernel, linux-mm, jeremy, hughd, ngupta,
	konrad.wilk, JBeulich, kurt.hackel, npiggin, akpm, riel, hannes,
	matthew, chris.mason, jackdachef, cyclonusj

On 08/25/2011 01:05 AM, KAMEZAWA Hiroyuki wrote:
> On Tue, 23 Aug 2011 07:58:15 -0700
> Dan Magenheimer <dan.magenheimer@oracle.com> wrote:
> 
>> From: Dan Magenheimer <dan.magenheimer@oracle.com>
>> Subject: [PATCH V7 2/4] mm: frontswap: core code
>>
>> This second patch of four in this frontswap series provides the core code
>> for frontswap that interfaces between the hooks in the swap subsystem and
>> a frontswap backend via frontswap_ops.
>>
>> Two new files are added: mm/frontswap.c and include/linux/frontswap.h
>>
>> Credits: Frontswap_ops design derived from Jeremy Fitzhardinge
>> design for tmem; sysfs code modelled after mm/ksm.c
>>
>> [v7: rebase to 3.0-rc3]
>> [v7: JBeulich@novell.com: new static inlines resolve to no-ops if not config'd]
>> [v7: JBeulich@novell.com: avoid redundant shifts/divides for *_bit lib calls]
>> [v6: rebase to 3.1-rc1]
>> [v6: lliubbo@gmail.com: fix null pointer deref if vzalloc fails]
>> [v6: konrad.wilk@oracl.com: various checks and code clarifications/comments]
>> [v5: no change from v4]
>> [v4: rebase to 2.6.39]
>> Signed-off-by: Dan Magenheimer <dan.magenheimer@oracle.com>
>> Reviewed-by: Konrad Wilk <konrad.wilk@oracle.com>
>> Acked-by: Jan Beulich <JBeulich@novell.com>
>> Acked-by: Seth Jennings <sjenning@linux.vnet.ibm.com>
>> Cc: Jeremy Fitzhardinge <jeremy@goop.org>
>> Cc: Hugh Dickins <hughd@google.com>
>> Cc: Johannes Weiner <hannes@cmpxchg.org>
>> Cc: Nitin Gupta <ngupta@vflare.org>
>> Cc: Matthew Wilcox <matthew@wil.cx>
>> Cc: Chris Mason <chris.mason@oracle.com>
>> Cc: Rik Riel <riel@redhat.com>
>> Cc: Andrew Morton <akpm@linux-foundation.org>
>>
<cut>
> 
> 
> I'm sorry if I miss codes but.... is an implementation of frontswap.ops included
> in this patch set ? Or how to test the work ?

The zcache driver (in drivers/staging/zcache) is the one that registers frontswap ops.

You can test frontswap by enabling CONFIG_FRONTSWAP and CONFIG_ZCACHE, and putting 
"zcache" in the kernel boot parameters.

> 
> Thanks,
> -Kame
> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: Subject: [PATCH V7 2/4] mm: frontswap: core code
@ 2011-08-25 13:29     ` Seth Jennings
  0 siblings, 0 replies; 18+ messages in thread
From: Seth Jennings @ 2011-08-25 13:29 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Dan Magenheimer, linux-kernel, linux-mm, jeremy, hughd, ngupta,
	konrad.wilk, JBeulich, kurt.hackel, npiggin, akpm, riel, hannes,
	matthew, chris.mason, jackdachef, cyclonusj

On 08/25/2011 01:05 AM, KAMEZAWA Hiroyuki wrote:
> On Tue, 23 Aug 2011 07:58:15 -0700
> Dan Magenheimer <dan.magenheimer@oracle.com> wrote:
> 
>> From: Dan Magenheimer <dan.magenheimer@oracle.com>
>> Subject: [PATCH V7 2/4] mm: frontswap: core code
>>
>> This second patch of four in this frontswap series provides the core code
>> for frontswap that interfaces between the hooks in the swap subsystem and
>> a frontswap backend via frontswap_ops.
>>
>> Two new files are added: mm/frontswap.c and include/linux/frontswap.h
>>
>> Credits: Frontswap_ops design derived from Jeremy Fitzhardinge
>> design for tmem; sysfs code modelled after mm/ksm.c
>>
>> [v7: rebase to 3.0-rc3]
>> [v7: JBeulich@novell.com: new static inlines resolve to no-ops if not config'd]
>> [v7: JBeulich@novell.com: avoid redundant shifts/divides for *_bit lib calls]
>> [v6: rebase to 3.1-rc1]
>> [v6: lliubbo@gmail.com: fix null pointer deref if vzalloc fails]
>> [v6: konrad.wilk@oracl.com: various checks and code clarifications/comments]
>> [v5: no change from v4]
>> [v4: rebase to 2.6.39]
>> Signed-off-by: Dan Magenheimer <dan.magenheimer@oracle.com>
>> Reviewed-by: Konrad Wilk <konrad.wilk@oracle.com>
>> Acked-by: Jan Beulich <JBeulich@novell.com>
>> Acked-by: Seth Jennings <sjenning@linux.vnet.ibm.com>
>> Cc: Jeremy Fitzhardinge <jeremy@goop.org>
>> Cc: Hugh Dickins <hughd@google.com>
>> Cc: Johannes Weiner <hannes@cmpxchg.org>
>> Cc: Nitin Gupta <ngupta@vflare.org>
>> Cc: Matthew Wilcox <matthew@wil.cx>
>> Cc: Chris Mason <chris.mason@oracle.com>
>> Cc: Rik Riel <riel@redhat.com>
>> Cc: Andrew Morton <akpm@linux-foundation.org>
>>
<cut>
> 
> 
> I'm sorry if I miss codes but.... is an implementation of frontswap.ops included
> in this patch set ? Or how to test the work ?

The zcache driver (in drivers/staging/zcache) is the one that registers frontswap ops.

You can test frontswap by enabling CONFIG_FRONTSWAP and CONFIG_ZCACHE, and putting 
"zcache" in the kernel boot parameters.

> 
> Thanks,
> -Kame
> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* RE: Subject: [PATCH V7 2/4] mm: frontswap: core code
  2011-08-25  6:05   ` KAMEZAWA Hiroyuki
@ 2011-08-25 17:37     ` Dan Magenheimer
  -1 siblings, 0 replies; 18+ messages in thread
From: Dan Magenheimer @ 2011-08-25 17:37 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: linux-kernel, linux-mm, jeremy, hughd, ngupta, Konrad Wilk,
	JBeulich, Kurt Hackel, npiggin, akpm, riel, hannes, matthew,
	Chris Mason, sjenning, jackdachef, cyclonusj

> From: KAMEZAWA Hiroyuki [mailto:kamezawa.hiroyu@jp.fujitsu.com]
> Subject: Re: Subject: [PATCH V7 2/4] mm: frontswap: core code
> 
> > From: Dan Magenheimer <dan.magenheimer@oracle.com>
> > Subject: [PATCH V7 2/4] mm: frontswap: core code
> >
> > This second patch of four in this frontswap series provides the core code
> 
> I think you should add diffstat...

The diffstat is in [PATCH V7 0/4] for the whole patchset.  I didn't
think a separate diffstat for each patch in the patchset was necessary?

> and add include changes to Makefile in the same patch.

The Makefile change must be in a patch after the patch that
creates frontswap.o or the build will fail.

> I have small questions.
> 
> > +/*
> > + * frontswap_ops is set by frontswap_register_ops to contain the pointers
> > + * to the frontswap "backend" implementation functions.
> > + */
> > +static struct frontswap_ops frontswap_ops;
> > +
> 
> Hmm, only one frontswap_ops can be registered to the system ?
> Then...why it required to be registered ? This just comes from problem in
> coding ?

Yes, currently only one frontswap_ops can be registered.  However there
are different users (zcache and Xen tmem) that will register different
callbacks for the frontswap_ops.  A future enhancement may allow "chaining"
(see https://lkml.org/lkml/2011/6/3/202 which describes chaining for
cleancache).

> BTW, Do I have a chance to implement frontswap accounting per cgroup
> (under memcg) ? Or Do I need to enable/disale switch for frontswap per memcg ?
> Do you think it is worth to do ?

I'm not very familiar with cgroups or memcg but I think it may be possible
to implement transcendent memory with cgroup as the "guest" and the default
cgroup as the "host" to allow for more memory elasticity for cgroups.
(See http://lwn.net/Articles/454795/ for a good overview of all of
transcendent memory.)

> > +/*
> > + * This global enablement flag reduces overhead on systems where frontswap_ops
> > + * has not been registered, so is preferred to the slower alternative: a
> > + * function call that checks a non-global.
> > + */
> > +int frontswap_enabled;
> > +EXPORT_SYMBOL(frontswap_enabled);
> > +
> > +/* useful stats available in /sys/kernel/mm/frontswap */
> > +static unsigned long frontswap_gets;
> > +static unsigned long frontswap_succ_puts;
> > +static unsigned long frontswap_failed_puts;
> > +static unsigned long frontswap_flushes;
> > +
> 
> What lock guard these ? swap_lock ?

These are informational statistics so do not need to be protected
by a lock or an atomic-type.  If an increment is lost due to a cpu
race, it is not a problem.

> > +/*
> > + * register operations for frontswap, returning previous thus allowing
> > + * detection of multiple backends and possible nesting
> > + */
> > +struct frontswap_ops frontswap_register_ops(struct frontswap_ops *ops)
> > +{
> > +	struct frontswap_ops old = frontswap_ops;
> > +
> > +	frontswap_ops = *ops;
> > +	frontswap_enabled = 1;
> > +	return old;
> > +}
> > +EXPORT_SYMBOL(frontswap_register_ops);
> > +
> 
> No lock ? and there is no unregister_ops() ?

Right now only one "backend" can register with frontswap.  Existing
backends (zcache and Xen tmem) only register when enabled via a
kernel parameter.  In the future, there will need to be better
ways to do this, but I think this is sufficient for now.

So since only one backend can register, no lock is needed and
no unregister is needed yet.

> > +/* Called when a swap device is swapon'd */
> > +void __frontswap_init(unsigned type)
> > +{
> > +	struct swap_info_struct *sis = swap_info[type];
> > +
> > +	BUG_ON(sis == NULL);
> > +	if (sis->frontswap_map == NULL)
> > +		return;
> > +	if (frontswap_enabled)
> > +		(*frontswap_ops.init)(type);
> > +}
> > +EXPORT_SYMBOL(__frontswap_init);
> > +
> > +/*
> > + * "Put" data from a page to frontswap and associate it with the page's
> > + * swaptype and offset.  Page must be locked and in the swap cache.
> > + * If frontswap already contains a page with matching swaptype and
> > + * offset, the frontswap implementation may either overwrite the data
> > + * and return success or flush the page from frontswap and return failure
> > + */
> 
> What lock should be held to guard global variables ? swap_lock ?

Which global variables do you mean and in what routines?  I think the
page lock is required for put/get (as documented in the comments)
but not the swap_lock.

Thanks,
Dan

^ permalink raw reply	[flat|nested] 18+ messages in thread

* RE: Subject: [PATCH V7 2/4] mm: frontswap: core code
@ 2011-08-25 17:37     ` Dan Magenheimer
  0 siblings, 0 replies; 18+ messages in thread
From: Dan Magenheimer @ 2011-08-25 17:37 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: linux-kernel, linux-mm, jeremy, hughd, ngupta, Konrad Wilk,
	JBeulich, Kurt Hackel, npiggin, akpm, riel, hannes, matthew,
	Chris Mason, sjenning, jackdachef, cyclonusj

> From: KAMEZAWA Hiroyuki [mailto:kamezawa.hiroyu@jp.fujitsu.com]
> Subject: Re: Subject: [PATCH V7 2/4] mm: frontswap: core code
> 
> > From: Dan Magenheimer <dan.magenheimer@oracle.com>
> > Subject: [PATCH V7 2/4] mm: frontswap: core code
> >
> > This second patch of four in this frontswap series provides the core code
> 
> I think you should add diffstat...

The diffstat is in [PATCH V7 0/4] for the whole patchset.  I didn't
think a separate diffstat for each patch in the patchset was necessary?

> and add include changes to Makefile in the same patch.

The Makefile change must be in a patch after the patch that
creates frontswap.o or the build will fail.

> I have small questions.
> 
> > +/*
> > + * frontswap_ops is set by frontswap_register_ops to contain the pointers
> > + * to the frontswap "backend" implementation functions.
> > + */
> > +static struct frontswap_ops frontswap_ops;
> > +
> 
> Hmm, only one frontswap_ops can be registered to the system ?
> Then...why it required to be registered ? This just comes from problem in
> coding ?

Yes, currently only one frontswap_ops can be registered.  However there
are different users (zcache and Xen tmem) that will register different
callbacks for the frontswap_ops.  A future enhancement may allow "chaining"
(see https://lkml.org/lkml/2011/6/3/202 which describes chaining for
cleancache).

> BTW, Do I have a chance to implement frontswap accounting per cgroup
> (under memcg) ? Or Do I need to enable/disable switch for frontswap per memcg ?
> Do you think it is worth to do ?

I'm not very familiar with cgroups or memcg but I think it may be possible
to implement transcendent memory with cgroup as the "guest" and the default
cgroup as the "host" to allow for more memory elasticity for cgroups.
(See http://lwn.net/Articles/454795/ for a good overview of all of
transcendent memory.)

> > +/*
> > + * This global enablement flag reduces overhead on systems where frontswap_ops
> > + * has not been registered, so is preferred to the slower alternative: a
> > + * function call that checks a non-global.
> > + */
> > +int frontswap_enabled;
> > +EXPORT_SYMBOL(frontswap_enabled);
> > +
> > +/* useful stats available in /sys/kernel/mm/frontswap */
> > +static unsigned long frontswap_gets;
> > +static unsigned long frontswap_succ_puts;
> > +static unsigned long frontswap_failed_puts;
> > +static unsigned long frontswap_flushes;
> > +
> 
> What lock guard these ? swap_lock ?

These are informational statistics so do not need to be protected
by a lock or an atomic-type.  If an increment is lost due to a cpu
race, it is not a problem.

> > +/*
> > + * register operations for frontswap, returning previous thus allowing
> > + * detection of multiple backends and possible nesting
> > + */
> > +struct frontswap_ops frontswap_register_ops(struct frontswap_ops *ops)
> > +{
> > +	struct frontswap_ops old = frontswap_ops;
> > +
> > +	frontswap_ops = *ops;
> > +	frontswap_enabled = 1;
> > +	return old;
> > +}
> > +EXPORT_SYMBOL(frontswap_register_ops);
> > +
> 
> No lock ? and there is no unregister_ops() ?

Right now only one "backend" can register with frontswap.  Existing
backends (zcache and Xen tmem) only register when enabled via a
kernel parameter.  In the future, there will need to be better
ways to do this, but I think this is sufficient for now.

So since only one backend can register, no lock is needed and
no unregister is needed yet.

> > +/* Called when a swap device is swapon'd */
> > +void __frontswap_init(unsigned type)
> > +{
> > +	struct swap_info_struct *sis = swap_info[type];
> > +
> > +	BUG_ON(sis == NULL);
> > +	if (sis->frontswap_map == NULL)
> > +		return;
> > +	if (frontswap_enabled)
> > +		(*frontswap_ops.init)(type);
> > +}
> > +EXPORT_SYMBOL(__frontswap_init);
> > +
> > +/*
> > + * "Put" data from a page to frontswap and associate it with the page's
> > + * swaptype and offset.  Page must be locked and in the swap cache.
> > + * If frontswap already contains a page with matching swaptype and
> > + * offset, the frontswap implementation may either overwrite the data
> > + * and return success or flush the page from frontswap and return failure
> > + */
> 
> What lock should be held to guard global variables ? swap_lock ?

Which global variables do you mean and in what routines?  I think the
page lock is required for put/get (as documented in the comments)
but not the swap_lock.

Thanks,
Dan

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* RE: Subject: [PATCH V7 2/4] mm: frontswap: core code
  2011-08-25 13:29     ` Seth Jennings
@ 2011-08-25 17:52       ` Dan Magenheimer
  -1 siblings, 0 replies; 18+ messages in thread
From: Dan Magenheimer @ 2011-08-25 17:52 UTC (permalink / raw)
  To: Seth Jennings, KAMEZAWA Hiroyuki
  Cc: linux-kernel, linux-mm, jeremy, hughd, ngupta, Konrad Wilk,
	JBeulich, Kurt Hackel, npiggin, akpm, riel, hannes, matthew,
	Chris Mason, jackdachef, cyclonusj

> From: Seth Jennings [mailto:sjenning@linux.vnet.ibm.com]
> To: KAMEZAWA Hiroyuki
> 
> On 08/25/2011 01:05 AM, KAMEZAWA Hiroyuki wrote:
> <cut>
> >
> >
> > I'm sorry if I miss codes but.... is an implementation of frontswap.ops included
> > in this patch set ? Or how to test the work ?
> 
> The zcache driver (in drivers/staging/zcache) is the one that registers frontswap ops.
> 
> You can test frontswap by enabling CONFIG_FRONTSWAP and CONFIG_ZCACHE, and putting
> "zcache" in the kernel boot parameters.

Also see Xen tmem (in drivers/xen).  I am also working on a related project
called RAMster that uses frontswap.  And someone has started code for KVM
to work with transcendent memory (including frontswap).  But for now zcache
is the only non-virtualization in-kernel user for frontswap.

Dan

P.S. A recent build fix for zcache is necessary for it to work without
manual modification to the zcache Makefile.
See 8c70aac04e01a08b7eca204312946206d1c1baac in Linus's tree.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* RE: Subject: [PATCH V7 2/4] mm: frontswap: core code
@ 2011-08-25 17:52       ` Dan Magenheimer
  0 siblings, 0 replies; 18+ messages in thread
From: Dan Magenheimer @ 2011-08-25 17:52 UTC (permalink / raw)
  To: Seth Jennings, KAMEZAWA Hiroyuki
  Cc: linux-kernel, linux-mm, jeremy, hughd, ngupta, Konrad Wilk,
	JBeulich, Kurt Hackel, npiggin, akpm, riel, hannes, matthew,
	Chris Mason, jackdachef, cyclonusj

> From: Seth Jennings [mailto:sjenning@linux.vnet.ibm.com]
> To: KAMEZAWA Hiroyuki
> 
> On 08/25/2011 01:05 AM, KAMEZAWA Hiroyuki wrote:
> <cut>
> >
> >
> > I'm sorry if I miss codes but.... is an implementation of frontswap.ops included
> > in this patch set ? Or how to test the work ?
> 
> The zcache driver (in drivers/staging/zcache) is the one that registers frontswap ops.
> 
> You can test frontswap by enabling CONFIG_FRONTSWAP and CONFIG_ZCACHE, and putting
> "zcache" in the kernel boot parameters.

Also see Xen tmem (in drivers/xen).  I am also working on a related project
called RAMster that uses frontswap.  And someone has started code for KVM
to work with transcendent memory (including frontswap).  But for now zcache
is the only non-virtualization in-kernel user for frontswap.

Dan

P.S. A recent build fix for zcache is necessary for it to work without
manual modification to the zcache Makefile.
See 8c70aac04e01a08b7eca204312946206d1c1baac in Linus's tree.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: Subject: [PATCH V7 2/4] mm: frontswap: core code
  2011-08-25 17:37     ` Dan Magenheimer
@ 2011-08-26  0:16       ` KAMEZAWA Hiroyuki
  -1 siblings, 0 replies; 18+ messages in thread
From: KAMEZAWA Hiroyuki @ 2011-08-26  0:16 UTC (permalink / raw)
  To: Dan Magenheimer
  Cc: linux-kernel, linux-mm, jeremy, hughd, ngupta, Konrad Wilk,
	JBeulich, Kurt Hackel, npiggin, akpm, riel, hannes, matthew,
	Chris Mason, sjenning, jackdachef, cyclonusj

On Thu, 25 Aug 2011 10:37:05 -0700 (PDT)
Dan Magenheimer <dan.magenheimer@oracle.com> wrote:

> > From: KAMEZAWA Hiroyuki [mailto:kamezawa.hiroyu@jp.fujitsu.com]
> > Subject: Re: Subject: [PATCH V7 2/4] mm: frontswap: core code

> > BTW, Do I have a chance to implement frontswap accounting per cgroup
> > (under memcg) ? Or Do I need to enable/disable switch for frontswap per memcg ?
> > Do you think it is worth to do ?
> 
> I'm not very familiar with cgroups or memcg but I think it may be possible
> to implement transcendent memory with cgroup as the "guest" and the default
> cgroup as the "host" to allow for more memory elasticity for cgroups.
> (See http://lwn.net/Articles/454795/ for a good overview of all of
> transcendent memory.)
> 
Ok, I'll see it.

I just wonder following case.

Assume 2 memcgs.
	memcg X: memory limit = 300M.
	memcg Y: memory limit = 300M.

This limitation is done for performance isolation.
When using frontswap, X and Y can cause resource confliction in frontswap and
performance of X and Y cannot be predictable.


> > > +/*
> > > + * This global enablement flag reduces overhead on systems where frontswap_ops
> > > + * has not been registered, so is preferred to the slower alternative: a
> > > + * function call that checks a non-global.
> > > + */
> > > +int frontswap_enabled;
> > > +EXPORT_SYMBOL(frontswap_enabled);
> > > +
> > > +/* useful stats available in /sys/kernel/mm/frontswap */
> > > +static unsigned long frontswap_gets;
> > > +static unsigned long frontswap_succ_puts;
> > > +static unsigned long frontswap_failed_puts;
> > > +static unsigned long frontswap_flushes;
> > > +
> > 
> > What lock guard these ? swap_lock ?
> 
> These are informational statistics so do not need to be protected
> by a lock or an atomic-type.  If an increment is lost due to a cpu
> race, it is not a problem.
> 

Hmm...Personally, I don't like incorrect counters. Could you add comments ?
Or How about using percpu_counter ? (see lib/percpu_counter.c)


> > > +/* Called when a swap device is swapon'd */
> > > +void __frontswap_init(unsigned type)
> > > +{
> > > +	struct swap_info_struct *sis = swap_info[type];
> > > +
> > > +	BUG_ON(sis == NULL);
> > > +	if (sis->frontswap_map == NULL)
> > > +		return;
> > > +	if (frontswap_enabled)
> > > +		(*frontswap_ops.init)(type);
> > > +}
> > > +EXPORT_SYMBOL(__frontswap_init);
> > > +
> > > +/*
> > > + * "Put" data from a page to frontswap and associate it with the page's
> > > + * swaptype and offset.  Page must be locked and in the swap cache.
> > > + * If frontswap already contains a page with matching swaptype and
> > > + * offset, the frontswap implementation may either overwrite the data
> > > + * and return success or flush the page from frontswap and return failure
> > > + */
> > 
> > What lock should be held to guard global variables ? swap_lock ?
> 
> Which global variables do you mean and in what routines?  I think the
> page lock is required for put/get (as documented in the comments)
> but not the swap_lock.
> 

My concern was race in counters. Even you allow race in frontswap_succ_puts++,

Don't you need some lock for
	sis->frontswap_pages++
	sis->frontswap_pages--
?

Thanks,
-Kame






^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: Subject: [PATCH V7 2/4] mm: frontswap: core code
@ 2011-08-26  0:16       ` KAMEZAWA Hiroyuki
  0 siblings, 0 replies; 18+ messages in thread
From: KAMEZAWA Hiroyuki @ 2011-08-26  0:16 UTC (permalink / raw)
  To: Dan Magenheimer
  Cc: linux-kernel, linux-mm, jeremy, hughd, ngupta, Konrad Wilk,
	JBeulich, Kurt Hackel, npiggin, akpm, riel, hannes, matthew,
	Chris Mason, sjenning, jackdachef, cyclonusj

On Thu, 25 Aug 2011 10:37:05 -0700 (PDT)
Dan Magenheimer <dan.magenheimer@oracle.com> wrote:

> > From: KAMEZAWA Hiroyuki [mailto:kamezawa.hiroyu@jp.fujitsu.com]
> > Subject: Re: Subject: [PATCH V7 2/4] mm: frontswap: core code

> > BTW, Do I have a chance to implement frontswap accounting per cgroup
> > (under memcg) ? Or Do I need to enable/disable switch for frontswap per memcg ?
> > Do you think it is worth to do ?
> 
> I'm not very familiar with cgroups or memcg but I think it may be possible
> to implement transcendent memory with cgroup as the "guest" and the default
> cgroup as the "host" to allow for more memory elasticity for cgroups.
> (See http://lwn.net/Articles/454795/ for a good overview of all of
> transcendent memory.)
> 
Ok, I'll see it.

I just wonder following case.

Assume 2 memcgs.
	memcg X: memory limit = 300M.
	memcg Y: memory limit = 300M.

This limitation is done for performance isolation.
When using frontswap, X and Y can cause resource confliction in frontswap and
performance of X and Y cannot be predictable.


> > > +/*
> > > + * This global enablement flag reduces overhead on systems where frontswap_ops
> > > + * has not been registered, so is preferred to the slower alternative: a
> > > + * function call that checks a non-global.
> > > + */
> > > +int frontswap_enabled;
> > > +EXPORT_SYMBOL(frontswap_enabled);
> > > +
> > > +/* useful stats available in /sys/kernel/mm/frontswap */
> > > +static unsigned long frontswap_gets;
> > > +static unsigned long frontswap_succ_puts;
> > > +static unsigned long frontswap_failed_puts;
> > > +static unsigned long frontswap_flushes;
> > > +
> > 
> > What lock guard these ? swap_lock ?
> 
> These are informational statistics so do not need to be protected
> by a lock or an atomic-type.  If an increment is lost due to a cpu
> race, it is not a problem.
> 

Hmm...Personally, I don't like incorrect counters. Could you add comments ?
Or How about using percpu_counter ? (see lib/percpu_counter.c)


> > > +/* Called when a swap device is swapon'd */
> > > +void __frontswap_init(unsigned type)
> > > +{
> > > +	struct swap_info_struct *sis = swap_info[type];
> > > +
> > > +	BUG_ON(sis == NULL);
> > > +	if (sis->frontswap_map == NULL)
> > > +		return;
> > > +	if (frontswap_enabled)
> > > +		(*frontswap_ops.init)(type);
> > > +}
> > > +EXPORT_SYMBOL(__frontswap_init);
> > > +
> > > +/*
> > > + * "Put" data from a page to frontswap and associate it with the page's
> > > + * swaptype and offset.  Page must be locked and in the swap cache.
> > > + * If frontswap already contains a page with matching swaptype and
> > > + * offset, the frontswap implementation may either overwrite the data
> > > + * and return success or flush the page from frontswap and return failure
> > > + */
> > 
> > What lock should be held to guard global variables ? swap_lock ?
> 
> Which global variables do you mean and in what routines?  I think the
> page lock is required for put/get (as documented in the comments)
> but not the swap_lock.
> 

My concern was race in counters. Even you allow race in frontswap_succ_puts++,

Don't you need some lock for
	sis->frontswap_pages++
	sis->frontswap_pages--
?

Thanks,
-Kame





--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* RE: Subject: [PATCH V7 2/4] mm: frontswap: core code
  2011-08-26  0:16       ` KAMEZAWA Hiroyuki
@ 2011-08-26 14:28         ` Dan Magenheimer
  -1 siblings, 0 replies; 18+ messages in thread
From: Dan Magenheimer @ 2011-08-26 14:28 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: linux-kernel, linux-mm, jeremy, hughd, ngupta, Konrad Wilk,
	JBeulich, Kurt Hackel, npiggin, akpm, riel, hannes, matthew,
	Chris Mason, sjenning, jackdachef, cyclonusj

> From: KAMEZAWA Hiroyuki [mailto:kamezawa.hiroyu@jp.fujitsu.com]
> Subject: Re: Subject: [PATCH V7 2/4] mm: frontswap: core code
> 
> On Thu, 25 Aug 2011 10:37:05 -0700 (PDT)
> Dan Magenheimer <dan.magenheimer@oracle.com> wrote:
> 
> > > From: KAMEZAWA Hiroyuki [mailto:kamezawa.hiroyu@jp.fujitsu.com]
> > > Subject: Re: Subject: [PATCH V7 2/4] mm: frontswap: core code
> 
> > > BTW, Do I have a chance to implement frontswap accounting per cgroup
> > > (under memcg) ? Or Do I need to enable/disable switch for frontswap per memcg ?
> > > Do you think it is worth to do ?
> >
> > I'm not very familiar with cgroups or memcg but I think it may be possible
> > to implement transcendent memory with cgroup as the "guest" and the default
> > cgroup as the "host" to allow for more memory elasticity for cgroups.
> > (See http://lwn.net/Articles/454795/ for a good overview of all of
> > transcendent memory.)
> >
> Ok, I'll see it.
> 
> I just wonder following case.
> 
> Assume 2 memcgs.
> 	memcg X: memory limit = 300M.
> 	memcg Y: memory limit = 300M.
> 
> This limitation is done for performance isolation.
> When using frontswap, X and Y can cause resource confliction in frontswap and
> performance of X and Y cannot be predictable.

> > These are informational statistics so do not need to be protected
> > by a lock or an atomic-type.  If an increment is lost due to a cpu
> > race, it is not a problem.
> 
> Hmm...Personally, I don't like incorrect counters. Could you add comments ?
> Or How anout using percpu_counter ? (see lib/percpu_counter.c)

Since the exact values of these counters is not required
by any code (just information for userland), I think I will
just add a comment.

> > > What lock should be held to guard global variables ? swap_lock ?
> >
> > Which global variables do you mean and in what routines?  I think the
> > page lock is required for put/get (as documented in the comments)
> > but not the swap_lock.
> 
> My concern was race in counters. Even you allow race in frontswap_succ_puts++,
> 
> Don't you need some lock for
> 	sis->frontswap_pages++
> 	sis->frontswap_pages--

Hmmm... OK, you've convinced me.  If this counter should be one and
a race leaves it as zero, I think data corruption could result on
a swapoff or partial swapoff.  And after thinking about it, I
think I also need to check for locking on frontswap_set/clear
as I don't think these bitfield modifiers are atomic.

Thanks for pointing this out.  Good catch!  I will need to
play with this and test it so probably will not submit V8 until
next week as today is a vacation day for me.

Dan

^ permalink raw reply	[flat|nested] 18+ messages in thread

* RE: Subject: [PATCH V7 2/4] mm: frontswap: core code
@ 2011-08-26 14:28         ` Dan Magenheimer
  0 siblings, 0 replies; 18+ messages in thread
From: Dan Magenheimer @ 2011-08-26 14:28 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: linux-kernel, linux-mm, jeremy, hughd, ngupta, Konrad Wilk,
	JBeulich, Kurt Hackel, npiggin, akpm, riel, hannes, matthew,
	Chris Mason, sjenning, jackdachef, cyclonusj

> From: KAMEZAWA Hiroyuki [mailto:kamezawa.hiroyu@jp.fujitsu.com]
> Subject: Re: Subject: [PATCH V7 2/4] mm: frontswap: core code
> 
> On Thu, 25 Aug 2011 10:37:05 -0700 (PDT)
> Dan Magenheimer <dan.magenheimer@oracle.com> wrote:
> 
> > > From: KAMEZAWA Hiroyuki [mailto:kamezawa.hiroyu@jp.fujitsu.com]
> > > Subject: Re: Subject: [PATCH V7 2/4] mm: frontswap: core code
> 
> > > BTW, Do I have a chance to implement frontswap accounting per cgroup
> > > (under memcg) ? Or Do I need to enable/disable switch for frontswap per memcg ?
> > > Do you think it is worth to do ?
> >
> > I'm not very familiar with cgroups or memcg but I think it may be possible
> > to implement transcendent memory with cgroup as the "guest" and the default
> > cgroup as the "host" to allow for more memory elasticity for cgroups.
> > (See http://lwn.net/Articles/454795/ for a good overview of all of
> > transcendent memory.)
> >
> Ok, I'll see it.
> 
> I just wonder following case.
> 
> Assume 2 memcgs.
> 	memcg X: memory limit = 300M.
> 	memcg Y: memory limit = 300M.
> 
> This limitation is done for performance isolation.
> When using frontswap, X and Y can cause resource confliction in frontswap and
> performance of X and Y cannot be predictable.

> > These are informational statistics so do not need to be protected
> > by a lock or an atomic-type.  If an increment is lost due to a cpu
> > race, it is not a problem.
> 
> Hmm...Personally, I don't like incorrect counters. Could you add comments ?
> Or How anout using percpu_counter ? (see lib/percpu_counter.c)

Since the exact values of these counters is not required
by any code (just information for userland), I think I will
just add a comment.

> > > What lock should be held to guard global variables ? swap_lock ?
> >
> > Which global variables do you mean and in what routines?  I think the
> > page lock is required for put/get (as documented in the comments)
> > but not the swap_lock.
> 
> My concern was race in counters. Even you allow race in frontswap_succ_puts++,
> 
> Don't you need some lock for
> 	sis->frontswap_pages++
> 	sis->frontswap_pages--

Hmmm... OK, you've convinced me.  If this counter should be one and
a race leaves it as zero, I think data corruption could result on
a swapoff or partial swapoff.  And after thinking about it, I
think I also need to check for locking on frontswap_set/clear
as I don't think these bitfield modifiers are atomic.

Thanks for pointing this out.  Good catch!  I will need to
play with this and test it so probably will not submit V8 until
next week as today is a vacation day for me.

Dan

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* RE: Subject: [PATCH V7 2/4] mm: frontswap: core code
  2011-08-26  0:16       ` KAMEZAWA Hiroyuki
@ 2011-08-26 14:53         ` Dan Magenheimer
  -1 siblings, 0 replies; 18+ messages in thread
From: Dan Magenheimer @ 2011-08-26 14:53 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: linux-kernel, linux-mm, jeremy, hughd, ngupta, Konrad Wilk,
	JBeulich, Kurt Hackel, npiggin, akpm, riel, hannes, matthew,
	Chris Mason, sjenning, jackdachef, cyclonusj

> From: KAMEZAWA Hiroyuki [mailto:kamezawa.hiroyu@jp.fujitsu.com]
> Subject: Re: Subject: [PATCH V7 2/4] mm: frontswap: core code
> 
> On Thu, 25 Aug 2011 10:37:05 -0700 (PDT)
> Dan Magenheimer <dan.magenheimer@oracle.com> wrote:
> 
> > > From: KAMEZAWA Hiroyuki [mailto:kamezawa.hiroyu@jp.fujitsu.com]
> > > Subject: Re: Subject: [PATCH V7 2/4] mm: frontswap: core code
> 
> > > BTW, Do I have a chance to implement frontswap accounting per cgroup
> > > (under memcg) ? Or Do I need to enable/disable switch for frontswap per memcg ?
> > > Do you think it is worth to do ?
> >
> > I'm not very familiar with cgroups or memcg but I think it may be possible
> > to implement transcendent memory with cgroup as the "guest" and the default
> > cgroup as the "host" to allow for more memory elasticity for cgroups.
> > (See http://lwn.net/Articles/454795/ for a good overview of all of
> > transcendent memory.)
> >
> Ok, I'll see it.
> 
> I just wonder about the following case.
> 
> Assume 2 memcgs.
> 	memcg X: memory limit = 300M.
> 	memcg Y: memory limit = 300M.
> 
> This limitation is done for performance isolation.
> When using frontswap, X and Y can cause resource confliction in frontswap and
> performance of X and Y cannot be predictable.

Oops, sorry for the extra reply, but I realize I cut/paste to
reply to this part and neglected to reply.

IMHO, it is impossible to do both dynamic resource optimization and
performance isolation.  So if the purpose of containers is for performance
isolation you need to partition ALL resources, including CPUs (and
not even split threads) and I/O devices.  And even then there will
be unpredictable use of some shared system resource (such as maybe
a QPI link or PCI bus).  If you are not partitioning all resources,
then RAM is just another resource that should be dynamically optimized
which may result in performance variations.  With strict policies,
maybe some quality-of-service guarantees can be made.

But that's just my opinion...

Dan

^ permalink raw reply	[flat|nested] 18+ messages in thread

* RE: Subject: [PATCH V7 2/4] mm: frontswap: core code
@ 2011-08-26 14:53         ` Dan Magenheimer
  0 siblings, 0 replies; 18+ messages in thread
From: Dan Magenheimer @ 2011-08-26 14:53 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: linux-kernel, linux-mm, jeremy, hughd, ngupta, Konrad Wilk,
	JBeulich, Kurt Hackel, npiggin, akpm, riel, hannes, matthew,
	Chris Mason, sjenning, jackdachef, cyclonusj

> From: KAMEZAWA Hiroyuki [mailto:kamezawa.hiroyu@jp.fujitsu.com]
> Subject: Re: Subject: [PATCH V7 2/4] mm: frontswap: core code
> 
> On Thu, 25 Aug 2011 10:37:05 -0700 (PDT)
> Dan Magenheimer <dan.magenheimer@oracle.com> wrote:
> 
> > > From: KAMEZAWA Hiroyuki [mailto:kamezawa.hiroyu@jp.fujitsu.com]
> > > Subject: Re: Subject: [PATCH V7 2/4] mm: frontswap: core code
> 
> > > BTW, Do I have a chance to implement frontswap accounting per cgroup
> > > (under memcg) ? Or Do I need to enable/disable switch for frontswap per memcg ?
> > > Do you think it is worth to do ?
> >
> > I'm not very familiar with cgroups or memcg but I think it may be possible
> > to implement transcendent memory with cgroup as the "guest" and the default
> > cgroup as the "host" to allow for more memory elasticity for cgroups.
> > (See http://lwn.net/Articles/454795/ for a good overview of all of
> > transcendent memory.)
> >
> Ok, I'll see it.
> 
> I just wonder about the following case.
> 
> Assume 2 memcgs.
> 	memcg X: memory limit = 300M.
> 	memcg Y: memory limit = 300M.
> 
> This limitation is done for performance isolation.
> When using frontswap, X and Y can cause resource confliction in frontswap and
> performance of X and Y cannot be predictable.

Oops, sorry for the extra reply, but I realize I cut/paste to
reply to this part and neglected to reply.

IMHO, it is impossible to do both dynamic resource optimization and
performance isolation.  So if the purpose of containers is for performance
isolation you need to partition ALL resources, including CPUs (and
not even split threads) and I/O devices.  And even then there will
be unpredictable use of some shared system resource (such as maybe
a QPI link or PCI bus).  If you are not partitioning all resources,
then RAM is just another resource that should be dynamically optimized
which may result in performance variations.  With strict policies,
maybe some quality-of-service guarantees can be made.

But that's just my opinion...

Dan

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* RE: Subject: [PATCH V7 2/4] mm: frontswap: core code
  2011-08-26 14:28         ` Dan Magenheimer
@ 2011-08-29 15:47           ` Dan Magenheimer
  -1 siblings, 0 replies; 18+ messages in thread
From: Dan Magenheimer @ 2011-08-29 15:47 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: linux-kernel, linux-mm, jeremy, hughd, ngupta, Konrad Wilk,
	JBeulich, Kurt Hackel, npiggin, akpm, riel, hannes, matthew,
	Chris Mason, sjenning, jackdachef, cyclonusj

> > My concern was race in counters. Even you allow race in frontswap_succ_puts++,
> >
> > Don't you need some lock for
> > 	sis->frontswap_pages++
> > 	sis->frontswap_pages--
> 
> Hmmm... OK, you've convinced me.  If this counter should be one and
> a race leaves it as zero, I think data corruption could result on
> a swapoff or partial swapoff.  And after thinking about it, I
> think I also need to check for locking on frontswap_set/clear
> as I don't think these bitfield modifiers are atomic.
> 
> Thanks for pointing this out.  Good catch!  I will need to
> play with this and test it so probably will not submit V8 until
> next week as today is a vacation day for me.

Silly me: Of course set_bit and clear_bit ARE atomic.  I will
post V8 later today with the only changes being frontswap_pages
is now a type atomic_t.

Thanks again for catching this, Kame!

Thanks,
Dan

^ permalink raw reply	[flat|nested] 18+ messages in thread

* RE: Subject: [PATCH V7 2/4] mm: frontswap: core code
@ 2011-08-29 15:47           ` Dan Magenheimer
  0 siblings, 0 replies; 18+ messages in thread
From: Dan Magenheimer @ 2011-08-29 15:47 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: linux-kernel, linux-mm, jeremy, hughd, ngupta, Konrad Wilk,
	JBeulich, Kurt Hackel, npiggin, akpm, riel, hannes, matthew,
	Chris Mason, sjenning, jackdachef, cyclonusj

> > My concern was race in counters. Even you allow race in frontswap_succ_puts++,
> >
> > Don't you need some lock for
> > 	sis->frontswap_pages++
> > 	sis->frontswap_pages--
> 
> Hmmm... OK, you've convinced me.  If this counter should be one and
> a race leaves it as zero, I think data corruption could result on
> a swapoff or partial swapoff.  And after thinking about it, I
> think I also need to check for locking on frontswap_set/clear
> as I don't think these bitfield modifiers are atomic.
> 
> Thanks for pointing this out.  Good catch!  I will need to
> play with this and test it so probably will not submit V8 until
> next week as today is a vacation day for me.

Silly me: Of course set_bit and clear_bit ARE atomic.  I will
post V8 later today with the only changes being frontswap_pages
is now a type atomic_t.

Thanks again for catching this, Kame!

Thanks,
Dan

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2011-08-29 15:49 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-08-23 14:58 Subject: [PATCH V7 2/4] mm: frontswap: core code Dan Magenheimer
2011-08-23 14:58 ` Dan Magenheimer
2011-08-25  6:05 ` KAMEZAWA Hiroyuki
2011-08-25  6:05   ` KAMEZAWA Hiroyuki
2011-08-25 13:29   ` Seth Jennings
2011-08-25 13:29     ` Seth Jennings
2011-08-25 17:52     ` Dan Magenheimer
2011-08-25 17:52       ` Dan Magenheimer
2011-08-25 17:37   ` Dan Magenheimer
2011-08-25 17:37     ` Dan Magenheimer
2011-08-26  0:16     ` KAMEZAWA Hiroyuki
2011-08-26  0:16       ` KAMEZAWA Hiroyuki
2011-08-26 14:28       ` Dan Magenheimer
2011-08-26 14:28         ` Dan Magenheimer
2011-08-29 15:47         ` Dan Magenheimer
2011-08-29 15:47           ` Dan Magenheimer
2011-08-26 14:53       ` Dan Magenheimer
2011-08-26 14:53         ` Dan Magenheimer

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.