[PATCH v2] staging: zcache: support multiple clients, prep for KVM and RAMster
From: Dan Magenheimer @ 2011-06-30 19:01 UTC
  To: Greg Kroah-Hartman
  Cc: Marcus Klemm, Seth Jennings, Konrad Wilk, linux-kernel, devel,
	kvm, linux-mm

Hi Greg --

I think this patch is now ready for staging-next and for merging when
the 3.1 window opens.  Please let me know if you need any logistics
done differently.

Thanks,
Dan

===

From: Dan Magenheimer <dan.magenheimer@oracle.com>
Subject: staging: zcache: support multiple clients, prep for KVM and RAMster

This is version 2 of an update to zcache, incorporating feedback from the list.
This patch adds support to the in-kernel transcendent memory ("tmem") code
and the zcache driver for multiple clients, which will be needed for both
RAMster and KVM support.  It also adds additional tmem callbacks to support
RAMster and corresponding no-op stubs in the zcache driver.  In v2, I've
also taken the liberty of adding some additional sysfs variables to
both surface information and allow policy control.  Those experimenting
with zcache should find them useful.

Signed-off-by: Dan Magenheimer <dan.magenheimer@oracle.com>

[v2: konrad.wilk@oracle.com: fix bools, add check for NULL, fix a comment]
[v2: sjenning@linux.vnet.ibm.com: add info/tunables for poor compression]
[v2: marcusklemm@googlemail.com: add tunable for max persistent pages]
Cc: Nitin Gupta <ngupta@vflare.org>
Cc: linux-mm@kvack.org
Cc: kvm@vger.kernel.org

===
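
[Illustration only, not part of the patch.]  A rough sketch of how a
future RAMster or KVM backend might exercise the new multi-client
interface; the function below is hypothetical, and it assumes
zcache_new_client()/zcache_new_pool() (currently internal to zcache.c)
would be made callable by such a backend:

	/* Illustrative sketch only; example_new_client_pools() is hypothetical. */
	static int example_new_client_pools(uint16_t cli_id)
	{
		int poolid;

		if (zcache_new_client(cli_id))	/* 0 on success, -1 on failure */
			return -ENOMEM;
		/* one persistent (frontswap-style) pool for this client */
		poolid = zcache_new_pool(cli_id, TMEM_POOL_PERSIST);
		if (poolid < 0)
			return -ENOMEM;
		/* the local kernel itself continues to use LOCAL_CLIENT */
		poolid = zcache_new_pool(LOCAL_CLIENT, 0);
		return poolid < 0 ? -ENOMEM : 0;
	}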

Diffstat:
 drivers/staging/zcache/tmem.c            |  100 +++-
 drivers/staging/zcache/tmem.h            |   23 
 drivers/staging/zcache/zcache.c          |  512 +++++++++++++++++----
 3 files changed, 520 insertions(+), 115 deletions(-)


diff -Napur linux-3.0-rc4/drivers/staging/zcache/tmem.c linux-3.0-rc4-zcache/drivers/staging/zcache/tmem.c
--- linux-3.0-rc4/drivers/staging/zcache/tmem.c	2011-06-20 21:25:46.000000000 -0600
+++ linux-3.0-rc4-zcache/drivers/staging/zcache/tmem.c	2011-06-30 12:40:00.443197550 -0600
@@ -142,6 +142,7 @@ static void tmem_obj_init(struct tmem_ob
 	obj->oid = *oidp;
 	obj->objnode_count = 0;
 	obj->pampd_count = 0;
+	(*tmem_pamops.new_obj)(obj);
 	SET_SENTINEL(obj, OBJ);
 	while (*new) {
 		BUG_ON(RB_EMPTY_NODE(*new));
@@ -274,7 +275,7 @@ static void tmem_objnode_free(struct tme
 /*
  * lookup index in object and return associated pampd (or NULL if not found)
  */
-static void *tmem_pampd_lookup_in_obj(struct tmem_obj *obj, uint32_t index)
+static void **__tmem_pampd_lookup_in_obj(struct tmem_obj *obj, uint32_t index)
 {
 	unsigned int height, shift;
 	struct tmem_objnode **slot = NULL;
@@ -303,9 +304,33 @@ static void *tmem_pampd_lookup_in_obj(st
 		height--;
 	}
 out:
+	return slot != NULL ? (void **)slot : NULL;
+}
+
+static void *tmem_pampd_lookup_in_obj(struct tmem_obj *obj, uint32_t index)
+{
+	struct tmem_objnode **slot;
+
+	slot = (struct tmem_objnode **)__tmem_pampd_lookup_in_obj(obj, index);
 	return slot != NULL ? *slot : NULL;
 }
 
+static void *tmem_pampd_replace_in_obj(struct tmem_obj *obj, uint32_t index,
+					void *new_pampd)
+{
+	struct tmem_objnode **slot;
+	void *ret = NULL;
+
+	slot = (struct tmem_objnode **)__tmem_pampd_lookup_in_obj(obj, index);
+	if ((slot != NULL) && (*slot != NULL)) {
+		void *old_pampd = *(void **)slot;
+		*(void **)slot = new_pampd;
+		(*tmem_pamops.free)(old_pampd, obj->pool, NULL, 0);
+		ret = new_pampd;
+	}
+	return ret;
+}
+
 static int tmem_pampd_add_to_obj(struct tmem_obj *obj, uint32_t index,
 					void *pampd)
 {
@@ -456,7 +481,7 @@ static void tmem_objnode_node_destroy(st
 			if (ht == 1) {
 				obj->pampd_count--;
 				(*tmem_pamops.free)(objnode->slots[i],
-								obj->pool);
+						obj->pool, NULL, 0);
 				objnode->slots[i] = NULL;
 				continue;
 			}
@@ -473,7 +498,7 @@ static void tmem_pampd_destroy_all_in_ob
 		return;
 	if (obj->objnode_tree_height == 0) {
 		obj->pampd_count--;
-		(*tmem_pamops.free)(obj->objnode_tree_root, obj->pool);
+		(*tmem_pamops.free)(obj->objnode_tree_root, obj->pool, NULL, 0);
 	} else {
 		tmem_objnode_node_destroy(obj, obj->objnode_tree_root,
 					obj->objnode_tree_height);
@@ -481,6 +506,7 @@ static void tmem_pampd_destroy_all_in_ob
 		obj->objnode_tree_height = 0;
 	}
 	obj->objnode_tree_root = NULL;
+	(*tmem_pamops.free_obj)(obj->pool, obj);
 }
 
 /*
@@ -503,15 +529,13 @@ static void tmem_pampd_destroy_all_in_ob
  * always flushes for simplicity.
  */
 int tmem_put(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index,
-		struct page *page)
+		char *data, size_t size, bool raw, bool ephemeral)
 {
 	struct tmem_obj *obj = NULL, *objfound = NULL, *objnew = NULL;
 	void *pampd = NULL, *pampd_del = NULL;
 	int ret = -ENOMEM;
-	bool ephemeral;
 	struct tmem_hashbucket *hb;
 
-	ephemeral = is_ephemeral(pool);
 	hb = &pool->hashbucket[tmem_oid_hash(oidp)];
 	spin_lock(&hb->lock);
 	obj = objfound = tmem_obj_find(hb, oidp);
@@ -521,7 +545,7 @@ int tmem_put(struct tmem_pool *pool, str
 			/* if found, is a dup put, flush the old one */
 			pampd_del = tmem_pampd_delete_from_obj(obj, index);
 			BUG_ON(pampd_del != pampd);
-			(*tmem_pamops.free)(pampd, pool);
+			(*tmem_pamops.free)(pampd, pool, oidp, index);
 			if (obj->pampd_count == 0) {
 				objnew = obj;
 				objfound = NULL;
@@ -538,7 +562,8 @@ int tmem_put(struct tmem_pool *pool, str
 	}
 	BUG_ON(obj == NULL);
 	BUG_ON(((objnew != obj) && (objfound != obj)) || (objnew == objfound));
-	pampd = (*tmem_pamops.create)(obj->pool, &obj->oid, index, page);
+	pampd = (*tmem_pamops.create)(data, size, raw, ephemeral,
+					obj->pool, &obj->oid, index);
 	if (unlikely(pampd == NULL))
 		goto free;
 	ret = tmem_pampd_add_to_obj(obj, index, pampd);
@@ -551,7 +576,7 @@ delete_and_free:
 	(void)tmem_pampd_delete_from_obj(obj, index);
 free:
 	if (pampd)
-		(*tmem_pamops.free)(pampd, pool);
+		(*tmem_pamops.free)(pampd, pool, NULL, 0);
 	if (objnew) {
 		tmem_obj_free(objnew, hb);
 		(*tmem_hostops.obj_free)(objnew, pool);
@@ -573,41 +598,52 @@ out:
  * "put" done with the same handle).
 
  */
-int tmem_get(struct tmem_pool *pool, struct tmem_oid *oidp,
-				uint32_t index, struct page *page)
+int tmem_get(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index,
+		char *data, size_t *size, bool raw, int get_and_free)
 {
 	struct tmem_obj *obj;
 	void *pampd;
 	bool ephemeral = is_ephemeral(pool);
 	uint32_t ret = -1;
 	struct tmem_hashbucket *hb;
+	bool free = (get_and_free == 1) || ((get_and_free == 0) && ephemeral);
+	bool lock_held = false;
 
 	hb = &pool->hashbucket[tmem_oid_hash(oidp)];
 	spin_lock(&hb->lock);
+	lock_held = true;
 	obj = tmem_obj_find(hb, oidp);
 	if (obj == NULL)
 		goto out;
-	ephemeral = is_ephemeral(pool);
-	if (ephemeral)
+	if (free)
 		pampd = tmem_pampd_delete_from_obj(obj, index);
 	else
 		pampd = tmem_pampd_lookup_in_obj(obj, index);
 	if (pampd == NULL)
 		goto out;
-	ret = (*tmem_pamops.get_data)(page, pampd, pool);
-	if (ret < 0)
-		goto out;
-	if (ephemeral) {
-		(*tmem_pamops.free)(pampd, pool);
+	if (free) {
 		if (obj->pampd_count == 0) {
 			tmem_obj_free(obj, hb);
 			(*tmem_hostops.obj_free)(obj, pool);
 			obj = NULL;
 		}
 	}
+	if (tmem_pamops.is_remote(pampd)) {
+		lock_held = false;
+		spin_unlock(&hb->lock);
+	}
+	if (free)
+		ret = (*tmem_pamops.get_data_and_free)(
+				data, size, raw, pampd, pool, oidp, index);
+	else
+		ret = (*tmem_pamops.get_data)(
+				data, size, raw, pampd, pool, oidp, index);
+	if (ret < 0)
+		goto out;
 	ret = 0;
 out:
-	spin_unlock(&hb->lock);
+	if (lock_held)
+		spin_unlock(&hb->lock);
 	return ret;
 }
 
@@ -632,7 +668,7 @@ int tmem_flush_page(struct tmem_pool *po
 	pampd = tmem_pampd_delete_from_obj(obj, index);
 	if (pampd == NULL)
 		goto out;
-	(*tmem_pamops.free)(pampd, pool);
+	(*tmem_pamops.free)(pampd, pool, oidp, index);
 	if (obj->pampd_count == 0) {
 		tmem_obj_free(obj, hb);
 		(*tmem_hostops.obj_free)(obj, pool);
@@ -645,6 +681,30 @@ out:
 }
 
 /*
+ * If a page in tmem matches the handle, replace the page so that any
+ * subsequent "get" gets the new page.  Returns 0 if
+ * there was a page to replace, else returns -1.
+ */
+int tmem_replace(struct tmem_pool *pool, struct tmem_oid *oidp,
+			uint32_t index, void *new_pampd)
+{
+	struct tmem_obj *obj;
+	int ret = -1;
+	struct tmem_hashbucket *hb;
+
+	hb = &pool->hashbucket[tmem_oid_hash(oidp)];
+	spin_lock(&hb->lock);
+	obj = tmem_obj_find(hb, oidp);
+	if (obj == NULL)
+		goto out;
+	new_pampd = tmem_pampd_replace_in_obj(obj, index, new_pampd);
+	ret = (*tmem_pamops.replace_in_obj)(new_pampd, obj);
+out:
+	spin_unlock(&hb->lock);
+	return ret;
+}
+
+/*
  * "Flush" all pages in tmem matching this oid.
  */
 int tmem_flush_object(struct tmem_pool *pool, struct tmem_oid *oidp)
diff -Napur linux-3.0-rc4/drivers/staging/zcache/tmem.h linux-3.0-rc4-zcache/drivers/staging/zcache/tmem.h
--- linux-3.0-rc4/drivers/staging/zcache/tmem.h	2011-06-20 21:25:46.000000000 -0600
+++ linux-3.0-rc4-zcache/drivers/staging/zcache/tmem.h	2011-06-30 12:39:00.833064393 -0600
@@ -147,6 +147,7 @@ struct tmem_obj {
 	unsigned int objnode_tree_height;
 	unsigned long objnode_count;
 	long pampd_count;
+	void *extra; /* for private use by pampd implementation */
 	DECL_SENTINEL
 };
 
@@ -166,10 +167,18 @@ struct tmem_objnode {
 
 /* pampd abstract datatype methods provided by the PAM implementation */
 struct tmem_pamops {
-	void *(*create)(struct tmem_pool *, struct tmem_oid *, uint32_t,
-			struct page *);
-	int (*get_data)(struct page *, void *, struct tmem_pool *);
-	void (*free)(void *, struct tmem_pool *);
+	void *(*create)(char *, size_t, bool, int,
+			struct tmem_pool *, struct tmem_oid *, uint32_t);
+	int (*get_data)(char *, size_t *, bool, void *, struct tmem_pool *,
+				struct tmem_oid *, uint32_t);
+	int (*get_data_and_free)(char *, size_t *, bool, void *,
+				struct tmem_pool *, struct tmem_oid *,
+				uint32_t);
+	void (*free)(void *, struct tmem_pool *, struct tmem_oid *, uint32_t);
+	void (*free_obj)(struct tmem_pool *, struct tmem_obj *);
+	bool (*is_remote)(void *);
+	void (*new_obj)(struct tmem_obj *);
+	int (*replace_in_obj)(void *, struct tmem_obj *);
 };
 extern void tmem_register_pamops(struct tmem_pamops *m);
 
@@ -184,9 +193,11 @@ extern void tmem_register_hostops(struct
 
 /* core tmem accessor functions */
 extern int tmem_put(struct tmem_pool *, struct tmem_oid *, uint32_t index,
-			struct page *page);
+			char *, size_t, bool, bool);
 extern int tmem_get(struct tmem_pool *, struct tmem_oid *, uint32_t index,
-			struct page *page);
+			char *, size_t *, bool, int);
+extern int tmem_replace(struct tmem_pool *, struct tmem_oid *, uint32_t index,
+			void *);
 extern int tmem_flush_page(struct tmem_pool *, struct tmem_oid *,
 			uint32_t index);
 extern int tmem_flush_object(struct tmem_pool *, struct tmem_oid *);
diff -Napur linux-3.0-rc4/drivers/staging/zcache/zcache.c linux-3.0-rc4-zcache/drivers/staging/zcache/zcache.c
--- linux-3.0-rc4/drivers/staging/zcache/zcache.c	2011-06-20 21:25:46.000000000 -0600
+++ linux-3.0-rc4-zcache/drivers/staging/zcache/zcache.c	2011-06-30 12:39:00.836064399 -0600
@@ -49,6 +49,33 @@
 	(__GFP_FS | __GFP_NORETRY | __GFP_NOWARN | __GFP_NOMEMALLOC)
 #endif
 
+#define MAX_POOLS_PER_CLIENT 16
+
+#define MAX_CLIENTS 16
+#define LOCAL_CLIENT ((uint16_t)-1)
+struct zcache_client {
+	struct tmem_pool *tmem_pools[MAX_POOLS_PER_CLIENT];
+	struct xv_pool *xvpool;
+	bool allocated;
+	atomic_t refcount;
+};
+
+static struct zcache_client zcache_host;
+static struct zcache_client zcache_clients[MAX_CLIENTS];
+
+static inline uint16_t get_client_id_from_client(struct zcache_client *cli)
+{
+	BUG_ON(cli == NULL);
+	if (cli == &zcache_host)
+		return LOCAL_CLIENT;
+	return cli - &zcache_clients[0];
+}
+
+static inline bool is_local_client(struct zcache_client *cli)
+{
+	return cli == &zcache_host;
+}
+
 /**********
  * Compression buddies ("zbud") provides for packing two (or, possibly
  * in the future, more) compressed ephemeral pages into a single "raw"
@@ -72,7 +99,8 @@
 #define ZBUD_MAX_BUDS 2
 
 struct zbud_hdr {
-	uint32_t pool_id;
+	uint16_t client_id;
+	uint16_t pool_id;
 	struct tmem_oid oid;
 	uint32_t index;
 	uint16_t size; /* compressed size in bytes, zero means unused */
@@ -120,6 +148,7 @@ static unsigned long zcache_zbud_curr_zb
 static unsigned long zcache_zbud_cumul_zpages;
 static unsigned long zcache_zbud_cumul_zbytes;
 static unsigned long zcache_compress_poor;
+static unsigned long zcache_mean_compress_poor;
 
 /* forward references */
 static void *zcache_get_free_page(void);
@@ -294,7 +323,8 @@ static void zbud_free_and_delist(struct 
 	}
 }
 
-static struct zbud_hdr *zbud_create(uint32_t pool_id, struct tmem_oid *oid,
+static struct zbud_hdr *zbud_create(uint16_t client_id, uint16_t pool_id,
+					struct tmem_oid *oid,
 					uint32_t index, struct page *page,
 					void *cdata, unsigned size)
 {
@@ -353,6 +383,7 @@ init_zh:
 	zh->index = index;
 	zh->oid = *oid;
 	zh->pool_id = pool_id;
+	zh->client_id = client_id;
 	/* can wait to copy the data until the list locks are dropped */
 	spin_unlock(&zbud_budlists_spinlock);
 
@@ -407,7 +438,8 @@ static unsigned long zcache_evicted_raw_
 static unsigned long zcache_evicted_buddied_pages;
 static unsigned long zcache_evicted_unbuddied_pages;
 
-static struct tmem_pool *zcache_get_pool_by_id(uint32_t poolid);
+static struct tmem_pool *zcache_get_pool_by_id(uint16_t cli_id,
+						uint16_t poolid);
 static void zcache_put_pool(struct tmem_pool *pool);
 
 /*
@@ -417,7 +449,8 @@ static void zbud_evict_zbpg(struct zbud_
 {
 	struct zbud_hdr *zh;
 	int i, j;
-	uint32_t pool_id[ZBUD_MAX_BUDS], index[ZBUD_MAX_BUDS];
+	uint32_t pool_id[ZBUD_MAX_BUDS], client_id[ZBUD_MAX_BUDS];
+	uint32_t index[ZBUD_MAX_BUDS];
 	struct tmem_oid oid[ZBUD_MAX_BUDS];
 	struct tmem_pool *pool;
 
@@ -426,6 +459,7 @@ static void zbud_evict_zbpg(struct zbud_
 	for (i = 0, j = 0; i < ZBUD_MAX_BUDS; i++) {
 		zh = &zbpg->buddy[i];
 		if (zh->size) {
+			client_id[j] = zh->client_id;
 			pool_id[j] = zh->pool_id;
 			oid[j] = zh->oid;
 			index[j] = zh->index;
@@ -435,7 +469,7 @@ static void zbud_evict_zbpg(struct zbud_
 	}
 	spin_unlock(&zbpg->lock);
 	for (i = 0; i < j; i++) {
-		pool = zcache_get_pool_by_id(pool_id[i]);
+		pool = zcache_get_pool_by_id(client_id[i], pool_id[i]);
 		if (pool != NULL) {
 			tmem_flush_page(pool, &oid[i], index[i]);
 			zcache_put_pool(pool);
@@ -602,7 +636,23 @@ struct zv_hdr {
 	DECL_SENTINEL
 };
 
-static const int zv_max_page_size = (PAGE_SIZE / 8) * 7;
+/* rudimentary policy limits */
+/* total number of persistent pages may not exceed this percentage */
+static unsigned int zv_page_count_policy_percent = 75;
+/*
+ * byte count defining poor compression; pages with greater zsize will be
+ * rejected
+ */
+static unsigned int zv_max_zsize = (PAGE_SIZE / 8) * 7;
+/*
+ * byte count defining poor *mean* compression; pages with greater zsize
+ * will be rejected until sufficient better-compressed pages are accepted
+ * driving the mean below this threshold
+ */
+static unsigned int zv_max_mean_zsize = (PAGE_SIZE / 8) * 5;
+
+static unsigned long zv_curr_dist_counts[NCHUNKS+1];
+static unsigned long zv_cumul_dist_counts[NCHUNKS+1];
 
 static struct zv_hdr *zv_create(struct xv_pool *xvpool, uint32_t pool_id,
 				struct tmem_oid *oid, uint32_t index,
@@ -611,13 +661,17 @@ static struct zv_hdr *zv_create(struct x
 	struct page *page;
 	struct zv_hdr *zv = NULL;
 	uint32_t offset;
+	int alloc_size = clen + sizeof(struct zv_hdr);
+	int chunks = (alloc_size + (CHUNK_SIZE - 1)) >> CHUNK_SHIFT;
 	int ret;
 
 	BUG_ON(!irqs_disabled());
-	ret = xv_malloc(xvpool, clen + sizeof(struct zv_hdr),
+	ret = xv_malloc(xvpool, alloc_size,
 			&page, &offset, ZCACHE_GFP_MASK);
 	if (unlikely(ret))
 		goto out;
+	zv_curr_dist_counts[chunks]++;
+	zv_cumul_dist_counts[chunks]++;
 	zv = kmap_atomic(page, KM_USER0) + offset;
 	zv->index = index;
 	zv->oid = *oid;
@@ -634,11 +688,13 @@ static void zv_free(struct xv_pool *xvpo
 	unsigned long flags;
 	struct page *page;
 	uint32_t offset;
-	uint16_t size;
+	uint16_t size = xv_get_object_size(zv);
+	int chunks = (size + (CHUNK_SIZE - 1)) >> CHUNK_SHIFT;
 
 	ASSERT_SENTINEL(zv, ZVH);
-	size = xv_get_object_size(zv) - sizeof(*zv);
-	BUG_ON(size == 0 || size > zv_max_page_size);
+	zv_curr_dist_counts[chunks]--;
+	size -= sizeof(*zv);
+	BUG_ON(size == 0);
 	INVERT_SENTINEL(zv, ZVH);
 	page = virt_to_page(zv);
 	offset = (unsigned long)zv & ~PAGE_MASK;
@@ -656,7 +712,7 @@ static void zv_decompress(struct page *p
 
 	ASSERT_SENTINEL(zv, ZVH);
 	size = xv_get_object_size(zv) - sizeof(*zv);
-	BUG_ON(size == 0 || size > zv_max_page_size);
+	BUG_ON(size == 0);
 	to_va = kmap_atomic(page, KM_USER0);
 	ret = lzo1x_decompress_safe((char *)zv + sizeof(*zv),
 					size, to_va, &clen);
@@ -665,6 +721,159 @@ static void zv_decompress(struct page *p
 	BUG_ON(clen != PAGE_SIZE);
 }
 
+#ifdef CONFIG_SYSFS
+/*
+ * show a distribution of compression stats for zv pages.
+ */
+
+static int zv_curr_dist_counts_show(char *buf)
+{
+	unsigned long i, n, chunks = 0, sum_total_chunks = 0;
+	char *p = buf;
+
+	for (i = 0; i <= NCHUNKS - 1; i++) {
+		n = zv_curr_dist_counts[i];
+		p += sprintf(p, "%lu ", n);
+		chunks += n;
+		sum_total_chunks += i * n;
+	}
+	p += sprintf(p, "mean:%lu\n",
+		chunks == 0 ? 0 : sum_total_chunks / chunks);
+	return p - buf;
+}
+
+static int zv_cumul_dist_counts_show(char *buf)
+{
+	unsigned long i, n, chunks = 0, sum_total_chunks = 0;
+	char *p = buf;
+
+	for (i = 0; i <= NCHUNKS - 1; i++) {
+		n = zv_cumul_dist_counts[i];
+		p += sprintf(p, "%lu ", n);
+		chunks += n;
+		sum_total_chunks += i * n;
+	}
+	p += sprintf(p, "mean:%lu\n",
+		chunks == 0 ? 0 : sum_total_chunks / chunks);
+	return p - buf;
+}
+
+/*
+ * setting zv_max_zsize via sysfs causes all persistent (e.g. swap)
+ * pages that don't compress to less than this value (including metadata
+ * overhead) to be rejected.  We don't allow the value to get too close
+ * to PAGE_SIZE.
+ */
+static ssize_t zv_max_zsize_show(struct kobject *kobj,
+				    struct kobj_attribute *attr,
+				    char *buf)
+{
+	return sprintf(buf, "%u\n", zv_max_zsize);
+}
+
+static ssize_t zv_max_zsize_store(struct kobject *kobj,
+				    struct kobj_attribute *attr,
+				    const char *buf, size_t count)
+{
+	unsigned long val;
+	int err;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	err = strict_strtoul(buf, 10, &val);
+	if (err || (val == 0) || (val > (PAGE_SIZE / 8) * 7))
+		return -EINVAL;
+	zv_max_zsize = val;
+	return count;
+}
+
+/*
+ * setting zv_max_mean_zsize via sysfs causes all persistent (e.g. swap)
+ * pages that don't compress to less than this value (including metadata
+ * overhead) to be rejected UNLESS the mean compression is also smaller
+ * than this value.  In other words, we are load-balancing-by-zsize the
+ * accepted pages.  Again, we don't allow the value to get too close
+ * to PAGE_SIZE.
+ */
+static ssize_t zv_max_mean_zsize_show(struct kobject *kobj,
+				    struct kobj_attribute *attr,
+				    char *buf)
+{
+	return sprintf(buf, "%u\n", zv_max_mean_zsize);
+}
+
+static ssize_t zv_max_mean_zsize_store(struct kobject *kobj,
+				    struct kobj_attribute *attr,
+				    const char *buf, size_t count)
+{
+	unsigned long val;
+	int err;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	err = strict_strtoul(buf, 10, &val);
+	if (err || (val == 0) || (val > (PAGE_SIZE / 8) * 7))
+		return -EINVAL;
+	zv_max_mean_zsize = val;
+	return count;
+}
+
+/*
+ * setting zv_page_count_policy_percent via sysfs sets an upper bound of
+ * persistent (e.g. swap) pages that will be retained according to:
+ *     (zv_page_count_policy_percent * totalram_pages) / 100
+ * when that limit is reached, further puts will be rejected (until
+ * some pages have been flushed).  Note that, due to compression,
+ * this number may exceed 100; it defaults to 75 and we set an
+ * arbitrary limit of 150.  A poor choice will almost certainly result
+ * in OOMs, so this value should only be changed prudently.
+ */
+static ssize_t zv_page_count_policy_percent_show(struct kobject *kobj,
+						 struct kobj_attribute *attr,
+						 char *buf)
+{
+	return sprintf(buf, "%u\n", zv_page_count_policy_percent);
+}
+
+static ssize_t zv_page_count_policy_percent_store(struct kobject *kobj,
+						  struct kobj_attribute *attr,
+						  const char *buf, size_t count)
+{
+	unsigned long val;
+	int err;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	err = strict_strtoul(buf, 10, &val);
+	if (err || (val == 0) || (val > 150))
+		return -EINVAL;
+	zv_page_count_policy_percent = val;
+	return count;
+}
+
+static struct kobj_attribute zcache_zv_max_zsize_attr = {
+		.attr = { .name = "zv_max_zsize", .mode = 0644 },
+		.show = zv_max_zsize_show,
+		.store = zv_max_zsize_store,
+};
+
+static struct kobj_attribute zcache_zv_max_mean_zsize_attr = {
+		.attr = { .name = "zv_max_mean_zsize", .mode = 0644 },
+		.show = zv_max_mean_zsize_show,
+		.store = zv_max_mean_zsize_store,
+};
+
+static struct kobj_attribute zcache_zv_page_count_policy_percent_attr = {
+		.attr = { .name = "zv_page_count_policy_percent",
+			  .mode = 0644 },
+		.show = zv_page_count_policy_percent_show,
+		.store = zv_page_count_policy_percent_store,
+};
+#endif
+
 /*
  * zcache core code starts here
  */
@@ -677,36 +886,70 @@ static unsigned long zcache_flobj_found;
 static unsigned long zcache_failed_eph_puts;
 static unsigned long zcache_failed_pers_puts;
 
-#define MAX_POOLS_PER_CLIENT 16
-
-static struct {
-	struct tmem_pool *tmem_pools[MAX_POOLS_PER_CLIENT];
-	struct xv_pool *xvpool;
-} zcache_client;
-
 /*
  * Tmem operations assume the poolid implies the invoking client.
- * Zcache only has one client (the kernel itself), so translate
- * the poolid into the tmem_pool allocated for it.  A KVM version
+ * Zcache only has one client (the kernel itself): LOCAL_CLIENT.
+ * RAMster has each client numbered by cluster node, and a KVM version
  * of zcache would have one client per guest and each client might
  * have a poolid==N.
  */
-static struct tmem_pool *zcache_get_pool_by_id(uint32_t poolid)
+static struct tmem_pool *zcache_get_pool_by_id(uint16_t cli_id, uint16_t poolid)
 {
 	struct tmem_pool *pool = NULL;
+	struct zcache_client *cli = NULL;
 
-	if (poolid >= 0) {
-		pool = zcache_client.tmem_pools[poolid];
+	if (cli_id == LOCAL_CLIENT)
+		cli = &zcache_host;
+	else {
+		if (cli_id >= MAX_CLIENTS)
+			goto out;
+		cli = &zcache_clients[cli_id];
+		if (cli == NULL)
+			goto out;
+		atomic_inc(&cli->refcount);
+	}
+	if (poolid < MAX_POOLS_PER_CLIENT) {
+		pool = cli->tmem_pools[poolid];
 		if (pool != NULL)
 			atomic_inc(&pool->refcount);
 	}
+out:
 	return pool;
 }
 
 static void zcache_put_pool(struct tmem_pool *pool)
 {
-	if (pool != NULL)
-		atomic_dec(&pool->refcount);
+	struct zcache_client *cli = NULL;
+
+	if (pool == NULL)
+		BUG();
+	cli = pool->client;
+	atomic_dec(&pool->refcount);
+	atomic_dec(&cli->refcount);
+}
+
+int zcache_new_client(uint16_t cli_id)
+{
+	struct zcache_client *cli = NULL;
+	int ret = -1;
+
+	if (cli_id == LOCAL_CLIENT)
+		cli = &zcache_host;
+	else if ((unsigned int)cli_id < MAX_CLIENTS)
+		cli = &zcache_clients[cli_id];
+	if (cli == NULL)
+		goto out;
+	if (cli->allocated)
+		goto out;
+	cli->allocated = 1;
+#ifdef CONFIG_FRONTSWAP
+	cli->xvpool = xv_create_pool();
+	if (cli->xvpool == NULL)
+		goto out;
+#endif
+	ret = 0;
+out:
+	return ret;
 }
 
 /* counters for debugging */
@@ -901,48 +1144,59 @@ static unsigned long zcache_curr_pers_pa
 /* forward reference */
 static int zcache_compress(struct page *from, void **out_va, size_t *out_len);
 
-static void *zcache_pampd_create(struct tmem_pool *pool, struct tmem_oid *oid,
-				 uint32_t index, struct page *page)
+static void *zcache_pampd_create(char *data, size_t size, bool raw, int eph,
+				struct tmem_pool *pool, struct tmem_oid *oid,
+				 uint32_t index)
 {
 	void *pampd = NULL, *cdata;
 	size_t clen;
 	int ret;
-	bool ephemeral = is_ephemeral(pool);
 	unsigned long count;
+	struct page *page = virt_to_page(data);
+	struct zcache_client *cli = pool->client;
+	uint16_t client_id = get_client_id_from_client(cli);
+	unsigned long zv_mean_zsize;
+	unsigned long curr_pers_pampd_count;
 
-	if (ephemeral) {
+	if (eph) {
 		ret = zcache_compress(page, &cdata, &clen);
 		if (ret == 0)
-
 			goto out;
 		if (clen == 0 || clen > zbud_max_buddy_size()) {
 			zcache_compress_poor++;
 			goto out;
 		}
-		pampd = (void *)zbud_create(pool->pool_id, oid, index,
-						page, cdata, clen);
+		pampd = (void *)zbud_create(client_id, pool->pool_id, oid,
+						index, page, cdata, clen);
 		if (pampd != NULL) {
 			count = atomic_inc_return(&zcache_curr_eph_pampd_count);
 			if (count > zcache_curr_eph_pampd_count_max)
 				zcache_curr_eph_pampd_count_max = count;
 		}
 	} else {
-		/*
-		 * FIXME: This is all the "policy" there is for now.
-		 * 3/4 totpages should allow ~37% of RAM to be filled with
-		 * compressed frontswap pages
-		 */
-		if (atomic_read(&zcache_curr_pers_pampd_count) >
-							3 * totalram_pages / 4)
+		curr_pers_pampd_count =
+			atomic_read(&zcache_curr_pers_pampd_count);
+		if (curr_pers_pampd_count >
+		    (zv_page_count_policy_percent * totalram_pages) / 100)
 			goto out;
 		ret = zcache_compress(page, &cdata, &clen);
 		if (ret == 0)
 			goto out;
-		if (clen > zv_max_page_size) {
+		/* reject if compression is too poor */
+		if (clen > zv_max_zsize) {
 			zcache_compress_poor++;
 			goto out;
 		}
-		pampd = (void *)zv_create(zcache_client.xvpool, pool->pool_id,
+		/* reject if mean compression is too poor */
+		if ((clen > zv_max_mean_zsize) && (curr_pers_pampd_count > 0)) {
+			zv_mean_zsize = xv_get_total_size_bytes(cli->xvpool) /
+						curr_pers_pampd_count;
+			if (zv_mean_zsize > zv_max_mean_zsize) {
+				zcache_mean_compress_poor++;
+				goto out;
+			}
+		}
+		pampd = (void *)zv_create(cli->xvpool, pool->pool_id,
 						oid, index, cdata, clen);
 		if (pampd == NULL)
 			goto out;
@@ -958,15 +1212,31 @@ out:
  * fill the pageframe corresponding to the struct page with the data
  * from the passed pampd
  */
-static int zcache_pampd_get_data(struct page *page, void *pampd,
-						struct tmem_pool *pool)
+static int zcache_pampd_get_data(char *data, size_t *bufsize, bool raw,
+					void *pampd, struct tmem_pool *pool,
+					struct tmem_oid *oid, uint32_t index)
 {
 	int ret = 0;
 
-	if (is_ephemeral(pool))
-		ret = zbud_decompress(page, pampd);
-	else
-		zv_decompress(page, pampd);
+	BUG_ON(is_ephemeral(pool));
+	zv_decompress(virt_to_page(data), pampd);
+	return ret;
+}
+
+/*
+ * fill the pageframe corresponding to the struct page with the data
+ * from the passed pampd
+ */
+static int zcache_pampd_get_data_and_free(char *data, size_t *bufsize, bool raw,
+					void *pampd, struct tmem_pool *pool,
+					struct tmem_oid *oid, uint32_t index)
+{
+	int ret = 0;
+
+	BUG_ON(!is_ephemeral(pool));
+	zbud_decompress(virt_to_page(data), pampd);
+	zbud_free_and_delist((struct zbud_hdr *)pampd);
+	atomic_dec(&zcache_curr_eph_pampd_count);
 	return ret;
 }
 
@@ -974,23 +1244,49 @@ static int zcache_pampd_get_data(struct 
  * free the pampd and remove it from any zcache lists
  * pampd must no longer be pointed to from any tmem data structures!
  */
-static void zcache_pampd_free(void *pampd, struct tmem_pool *pool)
+static void zcache_pampd_free(void *pampd, struct tmem_pool *pool,
+				struct tmem_oid *oid, uint32_t index)
 {
+	struct zcache_client *cli = pool->client;
+
 	if (is_ephemeral(pool)) {
 		zbud_free_and_delist((struct zbud_hdr *)pampd);
 		atomic_dec(&zcache_curr_eph_pampd_count);
 		BUG_ON(atomic_read(&zcache_curr_eph_pampd_count) < 0);
 	} else {
-		zv_free(zcache_client.xvpool, (struct zv_hdr *)pampd);
+		zv_free(cli->xvpool, (struct zv_hdr *)pampd);
 		atomic_dec(&zcache_curr_pers_pampd_count);
 		BUG_ON(atomic_read(&zcache_curr_pers_pampd_count) < 0);
 	}
 }
 
+static void zcache_pampd_free_obj(struct tmem_pool *pool, struct tmem_obj *obj)
+{
+}
+
+static void zcache_pampd_new_obj(struct tmem_obj *obj)
+{
+}
+
+static int zcache_pampd_replace_in_obj(void *pampd, struct tmem_obj *obj)
+{
+	return -1;
+}
+
+static bool zcache_pampd_is_remote(void *pampd)
+{
+	return 0;
+}
+
 static struct tmem_pamops zcache_pamops = {
 	.create = zcache_pampd_create,
 	.get_data = zcache_pampd_get_data,
+	.get_data_and_free = zcache_pampd_get_data_and_free,
 	.free = zcache_pampd_free,
+	.free_obj = zcache_pampd_free_obj,
+	.new_obj = zcache_pampd_new_obj,
+	.replace_in_obj = zcache_pampd_replace_in_obj,
+	.is_remote = zcache_pampd_is_remote,
 };
 
 /*
@@ -1122,6 +1418,7 @@ ZCACHE_SYSFS_RO(put_to_flush);
 ZCACHE_SYSFS_RO(aborted_preload);
 ZCACHE_SYSFS_RO(aborted_shrink);
 ZCACHE_SYSFS_RO(compress_poor);
+ZCACHE_SYSFS_RO(mean_compress_poor);
 ZCACHE_SYSFS_RO_ATOMIC(zbud_curr_raw_pages);
 ZCACHE_SYSFS_RO_ATOMIC(zbud_curr_zpages);
 ZCACHE_SYSFS_RO_ATOMIC(curr_obj_count);
@@ -1130,6 +1427,10 @@ ZCACHE_SYSFS_RO_CUSTOM(zbud_unbuddied_li
 			zbud_show_unbuddied_list_counts);
 ZCACHE_SYSFS_RO_CUSTOM(zbud_cumul_chunk_counts,
 			zbud_show_cumul_chunk_counts);
+ZCACHE_SYSFS_RO_CUSTOM(zv_curr_dist_counts,
+			zv_curr_dist_counts_show);
+ZCACHE_SYSFS_RO_CUSTOM(zv_cumul_dist_counts,
+			zv_cumul_dist_counts_show);
 
 static struct attribute *zcache_attrs[] = {
 	&zcache_curr_obj_count_attr.attr,
@@ -1143,6 +1444,7 @@ static struct attribute *zcache_attrs[] 
 	&zcache_failed_eph_puts_attr.attr,
 	&zcache_failed_pers_puts_attr.attr,
 	&zcache_compress_poor_attr.attr,
+	&zcache_mean_compress_poor_attr.attr,
 	&zcache_zbud_curr_raw_pages_attr.attr,
 	&zcache_zbud_curr_zpages_attr.attr,
 	&zcache_zbud_curr_zbytes_attr.attr,
@@ -1160,6 +1462,11 @@ static struct attribute *zcache_attrs[] 
 	&zcache_aborted_shrink_attr.attr,
 	&zcache_zbud_unbuddied_list_counts_attr.attr,
 	&zcache_zbud_cumul_chunk_counts_attr.attr,
+	&zcache_zv_curr_dist_counts_attr.attr,
+	&zcache_zv_cumul_dist_counts_attr.attr,
+	&zcache_zv_max_zsize_attr.attr,
+	&zcache_zv_max_mean_zsize_attr.attr,
+	&zcache_zv_page_count_policy_percent_attr.attr,
 	NULL,
 };
 
@@ -1212,19 +1519,20 @@ static struct shrinker zcache_shrinker =
  * zcache shims between cleancache/frontswap ops and tmem
  */
 
-static int zcache_put_page(int pool_id, struct tmem_oid *oidp,
+static int zcache_put_page(int cli_id, int pool_id, struct tmem_oid *oidp,
 				uint32_t index, struct page *page)
 {
 	struct tmem_pool *pool;
 	int ret = -1;
 
 	BUG_ON(!irqs_disabled());
-	pool = zcache_get_pool_by_id(pool_id);
+	pool = zcache_get_pool_by_id(cli_id, pool_id);
 	if (unlikely(pool == NULL))
 		goto out;
 	if (!zcache_freeze && zcache_do_preload(pool) == 0) {
 		/* preload does preempt_disable on success */
-		ret = tmem_put(pool, oidp, index, page);
+		ret = tmem_put(pool, oidp, index, page_address(page),
+				PAGE_SIZE, 0, is_ephemeral(pool));
 		if (ret < 0) {
 			if (is_ephemeral(pool))
 				zcache_failed_eph_puts++;
@@ -1244,25 +1552,28 @@ out:
 	return ret;
 }
 
-static int zcache_get_page(int pool_id, struct tmem_oid *oidp,
+static int zcache_get_page(int cli_id, int pool_id, struct tmem_oid *oidp,
 				uint32_t index, struct page *page)
 {
 	struct tmem_pool *pool;
 	int ret = -1;
 	unsigned long flags;
+	size_t size = PAGE_SIZE;
 
 	local_irq_save(flags);
-	pool = zcache_get_pool_by_id(pool_id);
+	pool = zcache_get_pool_by_id(cli_id, pool_id);
 	if (likely(pool != NULL)) {
 		if (atomic_read(&pool->obj_count) > 0)
-			ret = tmem_get(pool, oidp, index, page);
+			ret = tmem_get(pool, oidp, index, page_address(page),
+					&size, 0, is_ephemeral(pool));
 		zcache_put_pool(pool);
 	}
 	local_irq_restore(flags);
 	return ret;
 }
 
-static int zcache_flush_page(int pool_id, struct tmem_oid *oidp, uint32_t index)
+static int zcache_flush_page(int cli_id, int pool_id,
+				struct tmem_oid *oidp, uint32_t index)
 {
 	struct tmem_pool *pool;
 	int ret = -1;
@@ -1270,7 +1581,7 @@ static int zcache_flush_page(int pool_id
 
 	local_irq_save(flags);
 	zcache_flush_total++;
-	pool = zcache_get_pool_by_id(pool_id);
+	pool = zcache_get_pool_by_id(cli_id, pool_id);
 	if (likely(pool != NULL)) {
 		if (atomic_read(&pool->obj_count) > 0)
 			ret = tmem_flush_page(pool, oidp, index);
@@ -1282,7 +1593,8 @@ static int zcache_flush_page(int pool_id
 	return ret;
 }
 
-static int zcache_flush_object(int pool_id, struct tmem_oid *oidp)
+static int zcache_flush_object(int cli_id, int pool_id,
+				struct tmem_oid *oidp)
 {
 	struct tmem_pool *pool;
 	int ret = -1;
@@ -1290,7 +1602,7 @@ static int zcache_flush_object(int pool_
 
 	local_irq_save(flags);
 	zcache_flobj_total++;
-	pool = zcache_get_pool_by_id(pool_id);
+	pool = zcache_get_pool_by_id(cli_id, pool_id);
 	if (likely(pool != NULL)) {
 		if (atomic_read(&pool->obj_count) > 0)
 			ret = tmem_flush_object(pool, oidp);
@@ -1302,34 +1614,52 @@ static int zcache_flush_object(int pool_
 	return ret;
 }
 
-static int zcache_destroy_pool(int pool_id)
+static int zcache_destroy_pool(int cli_id, int pool_id)
 {
 	struct tmem_pool *pool = NULL;
+	struct zcache_client *cli = NULL;
 	int ret = -1;
 
 	if (pool_id < 0)
 		goto out;
-	pool = zcache_client.tmem_pools[pool_id];
+	if (cli_id == LOCAL_CLIENT)
+		cli = &zcache_host;
+	else if ((unsigned int)cli_id < MAX_CLIENTS)
+		cli = &zcache_clients[cli_id];
+	if (cli == NULL)
+		goto out;
+	atomic_inc(&cli->refcount);
+	pool = cli->tmem_pools[pool_id];
 	if (pool == NULL)
 		goto out;
-	zcache_client.tmem_pools[pool_id] = NULL;
+	cli->tmem_pools[pool_id] = NULL;
 	/* wait for pool activity on other cpus to quiesce */
 	while (atomic_read(&pool->refcount) != 0)
 		;
+	atomic_dec(&cli->refcount);
 	local_bh_disable();
 	ret = tmem_destroy_pool(pool);
 	local_bh_enable();
 	kfree(pool);
-	pr_info("zcache: destroyed pool id=%d\n", pool_id);
+	pr_info("zcache: destroyed pool id=%d, cli_id=%d\n",
+			pool_id, cli_id);
 out:
 	return ret;
 }
 
-static int zcache_new_pool(uint32_t flags)
+static int zcache_new_pool(uint16_t cli_id, uint32_t flags)
 {
 	int poolid = -1;
 	struct tmem_pool *pool;
+	struct zcache_client *cli = NULL;
 
+	if (cli_id == LOCAL_CLIENT)
+		cli = &zcache_host;
+	else if ((unsigned int)cli_id < MAX_CLIENTS)
+		cli = &zcache_clients[cli_id];
+	if (cli == NULL)
+		goto out;
+	atomic_inc(&cli->refcount);
 	pool = kmalloc(sizeof(struct tmem_pool), GFP_KERNEL);
 	if (pool == NULL) {
 		pr_info("zcache: pool creation failed: out of memory\n");
@@ -1337,7 +1667,7 @@ static int zcache_new_pool(uint32_t flag
 	}
 
 	for (poolid = 0; poolid < MAX_POOLS_PER_CLIENT; poolid++)
-		if (zcache_client.tmem_pools[poolid] == NULL)
+		if (cli->tmem_pools[poolid] == NULL)
 			break;
 	if (poolid >= MAX_POOLS_PER_CLIENT) {
 		pr_info("zcache: pool creation failed: max exceeded\n");
@@ -1346,14 +1676,16 @@ static int zcache_new_pool(uint32_t flag
 		goto out;
 	}
 	atomic_set(&pool->refcount, 0);
-	pool->client = &zcache_client;
+	pool->client = cli;
 	pool->pool_id = poolid;
 	tmem_new_pool(pool, flags);
-	zcache_client.tmem_pools[poolid] = pool;
-	pr_info("zcache: created %s tmem pool, id=%d\n",
+	cli->tmem_pools[poolid] = pool;
+	pr_info("zcache: created %s tmem pool, id=%d, client=%d\n",
 		flags & TMEM_POOL_PERSIST ? "persistent" : "ephemeral",
-		poolid);
+		poolid, cli_id);
 out:
+	if (cli != NULL)
+		atomic_dec(&cli->refcount);
 	return poolid;
 }
 
@@ -1374,7 +1706,7 @@ static void zcache_cleancache_put_page(i
 	struct tmem_oid oid = *(struct tmem_oid *)&key;
 
 	if (likely(ind == index))
-		(void)zcache_put_page(pool_id, &oid, index, page);
+		(void)zcache_put_page(LOCAL_CLIENT, pool_id, &oid, index, page);
 }
 
 static int zcache_cleancache_get_page(int pool_id,
@@ -1386,7 +1718,7 @@ static int zcache_cleancache_get_page(in
 	int ret = -1;
 
 	if (likely(ind == index))
-		ret = zcache_get_page(pool_id, &oid, index, page);
+		ret = zcache_get_page(LOCAL_CLIENT, pool_id, &oid, index, page);
 	return ret;
 }
 
@@ -1398,7 +1730,7 @@ static void zcache_cleancache_flush_page
 	struct tmem_oid oid = *(struct tmem_oid *)&key;
 
 	if (likely(ind == index))
-		(void)zcache_flush_page(pool_id, &oid, ind);
+		(void)zcache_flush_page(LOCAL_CLIENT, pool_id, &oid, ind);
 }
 
 static void zcache_cleancache_flush_inode(int pool_id,
@@ -1406,13 +1738,13 @@ static void zcache_cleancache_flush_inod
 {
 	struct tmem_oid oid = *(struct tmem_oid *)&key;
 
-	(void)zcache_flush_object(pool_id, &oid);
+	(void)zcache_flush_object(LOCAL_CLIENT, pool_id, &oid);
 }
 
 static void zcache_cleancache_flush_fs(int pool_id)
 {
 	if (pool_id >= 0)
-		(void)zcache_destroy_pool(pool_id);
+		(void)zcache_destroy_pool(LOCAL_CLIENT, pool_id);
 }
 
 static int zcache_cleancache_init_fs(size_t pagesize)
@@ -1420,7 +1752,7 @@ static int zcache_cleancache_init_fs(siz
 	BUG_ON(sizeof(struct cleancache_filekey) !=
 				sizeof(struct tmem_oid));
 	BUG_ON(pagesize != PAGE_SIZE);
-	return zcache_new_pool(0);
+	return zcache_new_pool(LOCAL_CLIENT, 0);
 }
 
 static int zcache_cleancache_init_shared_fs(char *uuid, size_t pagesize)
@@ -1429,7 +1761,7 @@ static int zcache_cleancache_init_shared
 	BUG_ON(sizeof(struct cleancache_filekey) !=
 				sizeof(struct tmem_oid));
 	BUG_ON(pagesize != PAGE_SIZE);
-	return zcache_new_pool(0);
+	return zcache_new_pool(LOCAL_CLIENT, 0);
 }
 
 static struct cleancache_ops zcache_cleancache_ops = {
@@ -1483,8 +1815,8 @@ static int zcache_frontswap_put_page(uns
 	BUG_ON(!PageLocked(page));
 	if (likely(ind64 == ind)) {
 		local_irq_save(flags);
-		ret = zcache_put_page(zcache_frontswap_poolid, &oid,
-					iswiz(ind), page);
+		ret = zcache_put_page(LOCAL_CLIENT, zcache_frontswap_poolid,
+					&oid, iswiz(ind), page);
 		local_irq_restore(flags);
 	}
 	return ret;
@@ -1502,8 +1834,8 @@ static int zcache_frontswap_get_page(uns
 
 	BUG_ON(!PageLocked(page));
 	if (likely(ind64 == ind))
-		ret = zcache_get_page(zcache_frontswap_poolid, &oid,
-					iswiz(ind), page);
+		ret = zcache_get_page(LOCAL_CLIENT, zcache_frontswap_poolid,
+					&oid, iswiz(ind), page);
 	return ret;
 }
 
@@ -1515,8 +1847,8 @@ static void zcache_frontswap_flush_page(
 	struct tmem_oid oid = oswiz(type, ind);
 
 	if (likely(ind64 == ind))
-		(void)zcache_flush_page(zcache_frontswap_poolid, &oid,
-					iswiz(ind));
+		(void)zcache_flush_page(LOCAL_CLIENT, zcache_frontswap_poolid,
+					&oid, iswiz(ind));
 }
 
 /* flush all pages from the passed swaptype */
@@ -1527,7 +1859,8 @@ static void zcache_frontswap_flush_area(
 
 	for (ind = SWIZ_MASK; ind >= 0; ind--) {
 		oid = oswiz(type, ind);
-		(void)zcache_flush_object(zcache_frontswap_poolid, &oid);
+		(void)zcache_flush_object(LOCAL_CLIENT,
+						zcache_frontswap_poolid, &oid);
 	}
 }
 
@@ -1535,7 +1868,8 @@ static void zcache_frontswap_init(unsign
 {
 	/* a single tmem poolid is used for all frontswap "types" (swapfiles) */
 	if (zcache_frontswap_poolid < 0)
-		zcache_frontswap_poolid = zcache_new_pool(TMEM_POOL_PERSIST);
+		zcache_frontswap_poolid =
+			zcache_new_pool(LOCAL_CLIENT, TMEM_POOL_PERSIST);
 }
 
 static struct frontswap_ops zcache_frontswap_ops = {
@@ -1624,6 +1958,11 @@ static int __init zcache_init(void)
 				sizeof(struct tmem_objnode), 0, 0, NULL);
 	zcache_obj_cache = kmem_cache_create("zcache_obj",
 				sizeof(struct tmem_obj), 0, 0, NULL);
+	ret = zcache_new_client(LOCAL_CLIENT);
+	if (ret) {
+		pr_err("zcache: can't create client\n");
+		goto out;
+	}
 #endif
 #ifdef CONFIG_CLEANCACHE
 	if (zcache_enabled && use_cleancache) {
@@ -1642,11 +1981,6 @@ static int __init zcache_init(void)
 	if (zcache_enabled && use_frontswap) {
 		struct frontswap_ops old_ops;
 
-		zcache_client.xvpool = xv_create_pool();
-		if (zcache_client.xvpool == NULL) {
-			pr_err("zcache: can't create xvpool\n");
-			goto out;
-		}
 		old_ops = zcache_frontswap_register_ops();
 		pr_info("zcache: frontswap enabled using kernel "
 			"transcendent memory and xvmalloc\n");

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [PATCH v2] staging: zcache: support multiple clients, prep for KVM and RAMster
@ 2011-06-30 19:01 ` Dan Magenheimer
  0 siblings, 0 replies; 25+ messages in thread
From: Dan Magenheimer @ 2011-06-30 19:01 UTC (permalink / raw)
  To: Greg Kroah-Hartman
  Cc: Marcus Klemm, Seth Jennings, Konrad Wilk, linux-kernel, devel,
	kvm, linux-mm

Hi Greg --

I think this patch is now ready for staging-next and for merging when
the 3.1 window opens.  Please let me know if you need any logistics
done differently.

Thanks,
Dan

===

>From: Dan Magenheimer <dan.magenheimer@oracle.com>
Subject: staging: zcache: support multiple clients, prep for KVM and RAMster

This is version 2 of an update to zcache, incorporating feedback from the list.
This patch adds support to the in-kernel transcendent memory ("tmem") code
and the zcache driver for multiple clients, which will be needed for both
RAMster and KVM support.  It also adds additional tmem callbacks to support
RAMster and corresponding no-op stubs in the zcache driver.  In v2, I've
also taken the liberty of adding some additional sysfs variables to
both surface information and allow policy control.  Those experimenting
with zcache should find them useful.

Signed-off-by: Dan Magenheimer <dan.magenheimer@oracle.com>

[v2: konrad.wilk@oracle.com: fix bools, add check for NULL, fix a comment]
[v2: sjenning@linux.vnet.ibm.com: add info/tunables for poor compression]
[v2: marcusklemm@googlemail.com: add tunable for max persistent pages]
Cc: Nitin Gupta <ngupta@vflare.org>
Cc: linux-mm@kvack.org
Cc: kvm@vger.kernel.org

===

Diffstat:
 drivers/staging/zcache/tmem.c            |  100 +++-
 drivers/staging/zcache/tmem.h            |   23 
 drivers/staging/zcache/zcache.c          |  512 +++++++++++++++++----
 3 files changed, 520 insertions(+), 115 deletions(-)


diff -Napur linux-3.0-rc4/drivers/staging/zcache/tmem.c linux-3.0-rc4-zcache/drivers/staging/zcache/tmem.c
--- linux-3.0-rc4/drivers/staging/zcache/tmem.c	2011-06-20 21:25:46.000000000 -0600
+++ linux-3.0-rc4-zcache/drivers/staging/zcache/tmem.c	2011-06-30 12:40:00.443197550 -0600
@@ -142,6 +142,7 @@ static void tmem_obj_init(struct tmem_ob
 	obj->oid = *oidp;
 	obj->objnode_count = 0;
 	obj->pampd_count = 0;
+	(*tmem_pamops.new_obj)(obj);
 	SET_SENTINEL(obj, OBJ);
 	while (*new) {
 		BUG_ON(RB_EMPTY_NODE(*new));
@@ -274,7 +275,7 @@ static void tmem_objnode_free(struct tme
 /*
  * lookup index in object and return associated pampd (or NULL if not found)
  */
-static void *tmem_pampd_lookup_in_obj(struct tmem_obj *obj, uint32_t index)
+static void **__tmem_pampd_lookup_in_obj(struct tmem_obj *obj, uint32_t index)
 {
 	unsigned int height, shift;
 	struct tmem_objnode **slot = NULL;
@@ -303,9 +304,33 @@ static void *tmem_pampd_lookup_in_obj(st
 		height--;
 	}
 out:
+	return slot != NULL ? (void **)slot : NULL;
+}
+
+static void *tmem_pampd_lookup_in_obj(struct tmem_obj *obj, uint32_t index)
+{
+	struct tmem_objnode **slot;
+
+	slot = (struct tmem_objnode **)__tmem_pampd_lookup_in_obj(obj, index);
 	return slot != NULL ? *slot : NULL;
 }
 
+static void *tmem_pampd_replace_in_obj(struct tmem_obj *obj, uint32_t index,
+					void *new_pampd)
+{
+	struct tmem_objnode **slot;
+	void *ret = NULL;
+
+	slot = (struct tmem_objnode **)__tmem_pampd_lookup_in_obj(obj, index);
+	if ((slot != NULL) && (*slot != NULL)) {
+		void *old_pampd = *(void **)slot;
+		*(void **)slot = new_pampd;
+		(*tmem_pamops.free)(old_pampd, obj->pool, NULL, 0);
+		ret = new_pampd;
+	}
+	return ret;
+}
+
 static int tmem_pampd_add_to_obj(struct tmem_obj *obj, uint32_t index,
 					void *pampd)
 {
@@ -456,7 +481,7 @@ static void tmem_objnode_node_destroy(st
 			if (ht == 1) {
 				obj->pampd_count--;
 				(*tmem_pamops.free)(objnode->slots[i],
-								obj->pool);
+						obj->pool, NULL, 0);
 				objnode->slots[i] = NULL;
 				continue;
 			}
@@ -473,7 +498,7 @@ static void tmem_pampd_destroy_all_in_ob
 		return;
 	if (obj->objnode_tree_height == 0) {
 		obj->pampd_count--;
-		(*tmem_pamops.free)(obj->objnode_tree_root, obj->pool);
+		(*tmem_pamops.free)(obj->objnode_tree_root, obj->pool, NULL, 0);
 	} else {
 		tmem_objnode_node_destroy(obj, obj->objnode_tree_root,
 					obj->objnode_tree_height);
@@ -481,6 +506,7 @@ static void tmem_pampd_destroy_all_in_ob
 		obj->objnode_tree_height = 0;
 	}
 	obj->objnode_tree_root = NULL;
+	(*tmem_pamops.free_obj)(obj->pool, obj);
 }
 
 /*
@@ -503,15 +529,13 @@ static void tmem_pampd_destroy_all_in_ob
  * always flushes for simplicity.
  */
 int tmem_put(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index,
-		struct page *page)
+		char *data, size_t size, bool raw, bool ephemeral)
 {
 	struct tmem_obj *obj = NULL, *objfound = NULL, *objnew = NULL;
 	void *pampd = NULL, *pampd_del = NULL;
 	int ret = -ENOMEM;
-	bool ephemeral;
 	struct tmem_hashbucket *hb;
 
-	ephemeral = is_ephemeral(pool);
 	hb = &pool->hashbucket[tmem_oid_hash(oidp)];
 	spin_lock(&hb->lock);
 	obj = objfound = tmem_obj_find(hb, oidp);
@@ -521,7 +545,7 @@ int tmem_put(struct tmem_pool *pool, str
 			/* if found, is a dup put, flush the old one */
 			pampd_del = tmem_pampd_delete_from_obj(obj, index);
 			BUG_ON(pampd_del != pampd);
-			(*tmem_pamops.free)(pampd, pool);
+			(*tmem_pamops.free)(pampd, pool, oidp, index);
 			if (obj->pampd_count == 0) {
 				objnew = obj;
 				objfound = NULL;
@@ -538,7 +562,8 @@ int tmem_put(struct tmem_pool *pool, str
 	}
 	BUG_ON(obj == NULL);
 	BUG_ON(((objnew != obj) && (objfound != obj)) || (objnew == objfound));
-	pampd = (*tmem_pamops.create)(obj->pool, &obj->oid, index, page);
+	pampd = (*tmem_pamops.create)(data, size, raw, ephemeral,
+					obj->pool, &obj->oid, index);
 	if (unlikely(pampd == NULL))
 		goto free;
 	ret = tmem_pampd_add_to_obj(obj, index, pampd);
@@ -551,7 +576,7 @@ delete_and_free:
 	(void)tmem_pampd_delete_from_obj(obj, index);
 free:
 	if (pampd)
-		(*tmem_pamops.free)(pampd, pool);
+		(*tmem_pamops.free)(pampd, pool, NULL, 0);
 	if (objnew) {
 		tmem_obj_free(objnew, hb);
 		(*tmem_hostops.obj_free)(objnew, pool);
@@ -573,41 +598,52 @@ out:
  * "put" done with the same handle).
 
  */
-int tmem_get(struct tmem_pool *pool, struct tmem_oid *oidp,
-				uint32_t index, struct page *page)
+int tmem_get(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index,
+		char *data, size_t *size, bool raw, int get_and_free)
 {
 	struct tmem_obj *obj;
 	void *pampd;
 	bool ephemeral = is_ephemeral(pool);
 	uint32_t ret = -1;
 	struct tmem_hashbucket *hb;
+	bool free = (get_and_free == 1) || ((get_and_free == 0) && ephemeral);
+	bool lock_held = false;
 
 	hb = &pool->hashbucket[tmem_oid_hash(oidp)];
 	spin_lock(&hb->lock);
+	lock_held = true;
 	obj = tmem_obj_find(hb, oidp);
 	if (obj == NULL)
 		goto out;
-	ephemeral = is_ephemeral(pool);
-	if (ephemeral)
+	if (free)
 		pampd = tmem_pampd_delete_from_obj(obj, index);
 	else
 		pampd = tmem_pampd_lookup_in_obj(obj, index);
 	if (pampd == NULL)
 		goto out;
-	ret = (*tmem_pamops.get_data)(page, pampd, pool);
-	if (ret < 0)
-		goto out;
-	if (ephemeral) {
-		(*tmem_pamops.free)(pampd, pool);
+	if (free) {
 		if (obj->pampd_count == 0) {
 			tmem_obj_free(obj, hb);
 			(*tmem_hostops.obj_free)(obj, pool);
 			obj = NULL;
 		}
 	}
+	if (tmem_pamops.is_remote(pampd)) {
+		lock_held = false;
+		spin_unlock(&hb->lock);
+	}
+	if (free)
+		ret = (*tmem_pamops.get_data_and_free)(
+				data, size, raw, pampd, pool, oidp, index);
+	else
+		ret = (*tmem_pamops.get_data)(
+				data, size, raw, pampd, pool, oidp, index);
+	if (ret < 0)
+		goto out;
 	ret = 0;
 out:
-	spin_unlock(&hb->lock);
+	if (lock_held)
+		spin_unlock(&hb->lock);
 	return ret;
 }
 
@@ -632,7 +668,7 @@ int tmem_flush_page(struct tmem_pool *po
 	pampd = tmem_pampd_delete_from_obj(obj, index);
 	if (pampd == NULL)
 		goto out;
-	(*tmem_pamops.free)(pampd, pool);
+	(*tmem_pamops.free)(pampd, pool, oidp, index);
 	if (obj->pampd_count == 0) {
 		tmem_obj_free(obj, hb);
 		(*tmem_hostops.obj_free)(obj, pool);
@@ -645,6 +681,30 @@ out:
 }
 
 /*
+ * If a page in tmem matches the handle, replace the page so that any
+ * subsequent "get" gets the new page.  Returns 0 if
+ * there was a page to replace, else returns -1.
+ */
+int tmem_replace(struct tmem_pool *pool, struct tmem_oid *oidp,
+			uint32_t index, void *new_pampd)
+{
+	struct tmem_obj *obj;
+	int ret = -1;
+	struct tmem_hashbucket *hb;
+
+	hb = &pool->hashbucket[tmem_oid_hash(oidp)];
+	spin_lock(&hb->lock);
+	obj = tmem_obj_find(hb, oidp);
+	if (obj == NULL)
+		goto out;
+	new_pampd = tmem_pampd_replace_in_obj(obj, index, new_pampd);
+	ret = (*tmem_pamops.replace_in_obj)(new_pampd, obj);
+out:
+	spin_unlock(&hb->lock);
+	return ret;
+}
+
+/*
  * "Flush" all pages in tmem matching this oid.
  */
 int tmem_flush_object(struct tmem_pool *pool, struct tmem_oid *oidp)
diff -Napur linux-3.0-rc4/drivers/staging/zcache/tmem.h linux-3.0-rc4-zcache/drivers/staging/zcache/tmem.h
--- linux-3.0-rc4/drivers/staging/zcache/tmem.h	2011-06-20 21:25:46.000000000 -0600
+++ linux-3.0-rc4-zcache/drivers/staging/zcache/tmem.h	2011-06-30 12:39:00.833064393 -0600
@@ -147,6 +147,7 @@ struct tmem_obj {
 	unsigned int objnode_tree_height;
 	unsigned long objnode_count;
 	long pampd_count;
+	void *extra; /* for private use by pampd implementation */
 	DECL_SENTINEL
 };
 
@@ -166,10 +167,18 @@ struct tmem_objnode {
 
 /* pampd abstract datatype methods provided by the PAM implementation */
 struct tmem_pamops {
-	void *(*create)(struct tmem_pool *, struct tmem_oid *, uint32_t,
-			struct page *);
-	int (*get_data)(struct page *, void *, struct tmem_pool *);
-	void (*free)(void *, struct tmem_pool *);
+	void *(*create)(char *, size_t, bool, int,
+			struct tmem_pool *, struct tmem_oid *, uint32_t);
+	int (*get_data)(char *, size_t *, bool, void *, struct tmem_pool *,
+				struct tmem_oid *, uint32_t);
+	int (*get_data_and_free)(char *, size_t *, bool, void *,
+				struct tmem_pool *, struct tmem_oid *,
+				uint32_t);
+	void (*free)(void *, struct tmem_pool *, struct tmem_oid *, uint32_t);
+	void (*free_obj)(struct tmem_pool *, struct tmem_obj *);
+	bool (*is_remote)(void *);
+	void (*new_obj)(struct tmem_obj *);
+	int (*replace_in_obj)(void *, struct tmem_obj *);
 };
 extern void tmem_register_pamops(struct tmem_pamops *m);
 
@@ -184,9 +193,11 @@ extern void tmem_register_hostops(struct
 
 /* core tmem accessor functions */
 extern int tmem_put(struct tmem_pool *, struct tmem_oid *, uint32_t index,
-			struct page *page);
+			char *, size_t, bool, bool);
 extern int tmem_get(struct tmem_pool *, struct tmem_oid *, uint32_t index,
-			struct page *page);
+			char *, size_t *, bool, int);
+extern int tmem_replace(struct tmem_pool *, struct tmem_oid *, uint32_t index,
+			void *);
 extern int tmem_flush_page(struct tmem_pool *, struct tmem_oid *,
 			uint32_t index);
 extern int tmem_flush_object(struct tmem_pool *, struct tmem_oid *);
diff -Napur linux-3.0-rc4/drivers/staging/zcache/zcache.c linux-3.0-rc4-zcache/drivers/staging/zcache/zcache.c
--- linux-3.0-rc4/drivers/staging/zcache/zcache.c	2011-06-20 21:25:46.000000000 -0600
+++ linux-3.0-rc4-zcache/drivers/staging/zcache/zcache.c	2011-06-30 12:39:00.836064399 -0600
@@ -49,6 +49,33 @@
 	(__GFP_FS | __GFP_NORETRY | __GFP_NOWARN | __GFP_NOMEMALLOC)
 #endif
 
+#define MAX_POOLS_PER_CLIENT 16
+
+#define MAX_CLIENTS 16
+#define LOCAL_CLIENT ((uint16_t)-1)
+struct zcache_client {
+	struct tmem_pool *tmem_pools[MAX_POOLS_PER_CLIENT];
+	struct xv_pool *xvpool;
+	bool allocated;
+	atomic_t refcount;
+};
+
+static struct zcache_client zcache_host;
+static struct zcache_client zcache_clients[MAX_CLIENTS];
+
+static inline uint16_t get_client_id_from_client(struct zcache_client *cli)
+{
+	BUG_ON(cli == NULL);
+	if (cli == &zcache_host)
+		return LOCAL_CLIENT;
+	return cli - &zcache_clients[0];
+}
+
+static inline bool is_local_client(struct zcache_client *cli)
+{
+	return cli == &zcache_host;
+}
+
 /**********
  * Compression buddies ("zbud") provides for packing two (or, possibly
  * in the future, more) compressed ephemeral pages into a single "raw"
@@ -72,7 +99,8 @@
 #define ZBUD_MAX_BUDS 2
 
 struct zbud_hdr {
-	uint32_t pool_id;
+	uint16_t client_id;
+	uint16_t pool_id;
 	struct tmem_oid oid;
 	uint32_t index;
 	uint16_t size; /* compressed size in bytes, zero means unused */
@@ -120,6 +148,7 @@ static unsigned long zcache_zbud_curr_zb
 static unsigned long zcache_zbud_cumul_zpages;
 static unsigned long zcache_zbud_cumul_zbytes;
 static unsigned long zcache_compress_poor;
+static unsigned long zcache_mean_compress_poor;
 
 /* forward references */
 static void *zcache_get_free_page(void);
@@ -294,7 +323,8 @@ static void zbud_free_and_delist(struct 
 	}
 }
 
-static struct zbud_hdr *zbud_create(uint32_t pool_id, struct tmem_oid *oid,
+static struct zbud_hdr *zbud_create(uint16_t client_id, uint16_t pool_id,
+					struct tmem_oid *oid,
 					uint32_t index, struct page *page,
 					void *cdata, unsigned size)
 {
@@ -353,6 +383,7 @@ init_zh:
 	zh->index = index;
 	zh->oid = *oid;
 	zh->pool_id = pool_id;
+	zh->client_id = client_id;
 	/* can wait to copy the data until the list locks are dropped */
 	spin_unlock(&zbud_budlists_spinlock);
 
@@ -407,7 +438,8 @@ static unsigned long zcache_evicted_raw_
 static unsigned long zcache_evicted_buddied_pages;
 static unsigned long zcache_evicted_unbuddied_pages;
 
-static struct tmem_pool *zcache_get_pool_by_id(uint32_t poolid);
+static struct tmem_pool *zcache_get_pool_by_id(uint16_t cli_id,
+						uint16_t poolid);
 static void zcache_put_pool(struct tmem_pool *pool);
 
 /*
@@ -417,7 +449,8 @@ static void zbud_evict_zbpg(struct zbud_
 {
 	struct zbud_hdr *zh;
 	int i, j;
-	uint32_t pool_id[ZBUD_MAX_BUDS], index[ZBUD_MAX_BUDS];
+	uint32_t pool_id[ZBUD_MAX_BUDS], client_id[ZBUD_MAX_BUDS];
+	uint32_t index[ZBUD_MAX_BUDS];
 	struct tmem_oid oid[ZBUD_MAX_BUDS];
 	struct tmem_pool *pool;
 
@@ -426,6 +459,7 @@ static void zbud_evict_zbpg(struct zbud_
 	for (i = 0, j = 0; i < ZBUD_MAX_BUDS; i++) {
 		zh = &zbpg->buddy[i];
 		if (zh->size) {
+			client_id[j] = zh->client_id;
 			pool_id[j] = zh->pool_id;
 			oid[j] = zh->oid;
 			index[j] = zh->index;
@@ -435,7 +469,7 @@ static void zbud_evict_zbpg(struct zbud_
 	}
 	spin_unlock(&zbpg->lock);
 	for (i = 0; i < j; i++) {
-		pool = zcache_get_pool_by_id(pool_id[i]);
+		pool = zcache_get_pool_by_id(client_id[i], pool_id[i]);
 		if (pool != NULL) {
 			tmem_flush_page(pool, &oid[i], index[i]);
 			zcache_put_pool(pool);
@@ -602,7 +636,23 @@ struct zv_hdr {
 	DECL_SENTINEL
 };
 
-static const int zv_max_page_size = (PAGE_SIZE / 8) * 7;
+/* rudimentary policy limits */
+/* total number of persistent pages may not exceed this percentage */
+static unsigned int zv_page_count_policy_percent = 75;
+/*
+ * byte count defining poor compression; pages with greater zsize will be
+ * rejected
+ */
+static unsigned int zv_max_zsize = (PAGE_SIZE / 8) * 7;
+/*
+ * byte count defining poor *mean* compression; pages with greater zsize
+ * will be rejected until sufficient better-compressed pages are accepted
+ * driving the man below this threshold
+ */
+static unsigned int zv_max_mean_zsize = (PAGE_SIZE / 8) * 5;
+
+static unsigned long zv_curr_dist_counts[NCHUNKS+1];
+static unsigned long zv_cumul_dist_counts[NCHUNKS+1];
 
 static struct zv_hdr *zv_create(struct xv_pool *xvpool, uint32_t pool_id,
 				struct tmem_oid *oid, uint32_t index,
@@ -611,13 +661,17 @@ static struct zv_hdr *zv_create(struct x
 	struct page *page;
 	struct zv_hdr *zv = NULL;
 	uint32_t offset;
+	int alloc_size = clen + sizeof(struct zv_hdr);
+	int chunks = (alloc_size + (CHUNK_SIZE - 1)) >> CHUNK_SHIFT;
 	int ret;
 
 	BUG_ON(!irqs_disabled());
-	ret = xv_malloc(xvpool, clen + sizeof(struct zv_hdr),
+	ret = xv_malloc(xvpool, alloc_size,
 			&page, &offset, ZCACHE_GFP_MASK);
 	if (unlikely(ret))
 		goto out;
+	zv_curr_dist_counts[chunks]++;
+	zv_cumul_dist_counts[chunks]++;
 	zv = kmap_atomic(page, KM_USER0) + offset;
 	zv->index = index;
 	zv->oid = *oid;
@@ -634,11 +688,13 @@ static void zv_free(struct xv_pool *xvpo
 	unsigned long flags;
 	struct page *page;
 	uint32_t offset;
-	uint16_t size;
+	uint16_t size = xv_get_object_size(zv);
+	int chunks = (size + (CHUNK_SIZE - 1)) >> CHUNK_SHIFT;
 
 	ASSERT_SENTINEL(zv, ZVH);
-	size = xv_get_object_size(zv) - sizeof(*zv);
-	BUG_ON(size == 0 || size > zv_max_page_size);
+	zv_curr_dist_counts[chunks]--;
+	size -= sizeof(*zv);
+	BUG_ON(size == 0);
 	INVERT_SENTINEL(zv, ZVH);
 	page = virt_to_page(zv);
 	offset = (unsigned long)zv & ~PAGE_MASK;
@@ -656,7 +712,7 @@ static void zv_decompress(struct page *p
 
 	ASSERT_SENTINEL(zv, ZVH);
 	size = xv_get_object_size(zv) - sizeof(*zv);
-	BUG_ON(size == 0 || size > zv_max_page_size);
+	BUG_ON(size == 0);
 	to_va = kmap_atomic(page, KM_USER0);
 	ret = lzo1x_decompress_safe((char *)zv + sizeof(*zv),
 					size, to_va, &clen);
@@ -665,6 +721,159 @@ static void zv_decompress(struct page *p
 	BUG_ON(clen != PAGE_SIZE);
 }
 
+#ifdef CONFIG_SYSFS
+/*
+ * show a distribution of compression stats for zv pages.
+ */
+
+static int zv_curr_dist_counts_show(char *buf)
+{
+	unsigned long i, n, chunks = 0, sum_total_chunks = 0;
+	char *p = buf;
+
+	for (i = 0; i <= NCHUNKS - 1; i++) {
+		n = zv_curr_dist_counts[i];
+		p += sprintf(p, "%lu ", n);
+		chunks += n;
+		sum_total_chunks += i * n;
+	}
+	p += sprintf(p, "mean:%lu\n",
+		chunks == 0 ? 0 : sum_total_chunks / chunks);
+	return p - buf;
+}
+
+static int zv_cumul_dist_counts_show(char *buf)
+{
+	unsigned long i, n, chunks = 0, sum_total_chunks = 0;
+	char *p = buf;
+
+	for (i = 0; i <= NCHUNKS - 1; i++) {
+		n = zv_cumul_dist_counts[i];
+		p += sprintf(p, "%lu ", n);
+		chunks += n;
+		sum_total_chunks += i * n;
+	}
+	p += sprintf(p, "mean:%lu\n",
+		chunks == 0 ? 0 : sum_total_chunks / chunks);
+	return p - buf;
+}
+
+/*
+ * setting zv_max_zsize via sysfs causes all persistent (e.g. swap)
+ * pages that don't compress to less than this value (including metadata
+ * overhead) to be rejected.  We don't allow the value to get too close
+ * to PAGE_SIZE.
+ */
+static ssize_t zv_max_zsize_show(struct kobject *kobj,
+				    struct kobj_attribute *attr,
+				    char *buf)
+{
+	return sprintf(buf, "%u\n", zv_max_zsize);
+}
+
+static ssize_t zv_max_zsize_store(struct kobject *kobj,
+				    struct kobj_attribute *attr,
+				    const char *buf, size_t count)
+{
+	unsigned long val;
+	int err;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	err = strict_strtoul(buf, 10, &val);
+	if (err || (val == 0) || (val > (PAGE_SIZE / 8) * 7))
+		return -EINVAL;
+	zv_max_zsize = val;
+	return count;
+}
+
+/*
+ * setting zv_max_mean_zsize via sysfs causes all persistent (e.g. swap)
+ * pages that don't compress to less than this value (including metadata
+ * overhead) to be rejected UNLESS the mean compression is also smaller
+ * than this value.  In other words, we are load-balancing-by-zsize the
+ * accepted pages.  Again, we don't allow the value to get too close
+ * to PAGE_SIZE.
+ */
+static ssize_t zv_max_mean_zsize_show(struct kobject *kobj,
+				    struct kobj_attribute *attr,
+				    char *buf)
+{
+	return sprintf(buf, "%u\n", zv_max_mean_zsize);
+}
+
+static ssize_t zv_max_mean_zsize_store(struct kobject *kobj,
+				    struct kobj_attribute *attr,
+				    const char *buf, size_t count)
+{
+	unsigned long val;
+	int err;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	err = strict_strtoul(buf, 10, &val);
+	if (err || (val == 0) || (val > (PAGE_SIZE / 8) * 7))
+		return -EINVAL;
+	zv_max_mean_zsize = val;
+	return count;
+}
+
+/*
+ * setting zv_page_count_policy_percent via sysfs sets an upper bound of
+ * persistent (e.g. swap) pages that will be retained according to:
+ *     (zv_page_count_policy_percent * totalram_pages) / 100
+ * when that limit is reached, further puts will be rejected (until
+ * some pages have been flushed).  Note that, due to compression,
+ * this number may exceed 100; it defaults to 75 and we set an
+ * arbitrary limit of 150.  A poor choice will almost certainly result
+ * in OOMs, so this value should only be changed prudently.
+ */
+static ssize_t zv_page_count_policy_percent_show(struct kobject *kobj,
+						 struct kobj_attribute *attr,
+						 char *buf)
+{
+	return sprintf(buf, "%u\n", zv_page_count_policy_percent);
+}
+
+static ssize_t zv_page_count_policy_percent_store(struct kobject *kobj,
+						  struct kobj_attribute *attr,
+						  const char *buf, size_t count)
+{
+	unsigned long val;
+	int err;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	err = strict_strtoul(buf, 10, &val);
+	if (err || (val == 0) || (val > 150))
+		return -EINVAL;
+	zv_page_count_policy_percent = val;
+	return count;
+}
+
+static struct kobj_attribute zcache_zv_max_zsize_attr = {
+		.attr = { .name = "zv_max_zsize", .mode = 0644 },
+		.show = zv_max_zsize_show,
+		.store = zv_max_zsize_store,
+};
+
+static struct kobj_attribute zcache_zv_max_mean_zsize_attr = {
+		.attr = { .name = "zv_max_mean_zsize", .mode = 0644 },
+		.show = zv_max_mean_zsize_show,
+		.store = zv_max_mean_zsize_store,
+};
+
+static struct kobj_attribute zcache_zv_page_count_policy_percent_attr = {
+		.attr = { .name = "zv_page_count_policy_percent",
+			  .mode = 0644 },
+		.show = zv_page_count_policy_percent_show,
+		.store = zv_page_count_policy_percent_store,
+};
+#endif
+
 /*
  * zcache core code starts here
  */
@@ -677,36 +886,70 @@ static unsigned long zcache_flobj_found;
 static unsigned long zcache_failed_eph_puts;
 static unsigned long zcache_failed_pers_puts;
 
-#define MAX_POOLS_PER_CLIENT 16
-
-static struct {
-	struct tmem_pool *tmem_pools[MAX_POOLS_PER_CLIENT];
-	struct xv_pool *xvpool;
-} zcache_client;
-
 /*
  * Tmem operations assume the poolid implies the invoking client.
- * Zcache only has one client (the kernel itself), so translate
- * the poolid into the tmem_pool allocated for it.  A KVM version
+ * Zcache only has one client (the kernel itself): LOCAL_CLIENT.
+ * RAMster has each client numbered by cluster node, and a KVM version
  * of zcache would have one client per guest and each client might
  * have a poolid==N.
  */
-static struct tmem_pool *zcache_get_pool_by_id(uint32_t poolid)
+static struct tmem_pool *zcache_get_pool_by_id(uint16_t cli_id, uint16_t poolid)
 {
 	struct tmem_pool *pool = NULL;
+	struct zcache_client *cli = NULL;
 
-	if (poolid >= 0) {
-		pool = zcache_client.tmem_pools[poolid];
+	if (cli_id == LOCAL_CLIENT)
+		cli = &zcache_host;
+	else {
+		if (cli_id >= MAX_CLIENTS)
+			goto out;
+		cli = &zcache_clients[cli_id];
+		if (cli == NULL)
+			goto out;
+		atomic_inc(&cli->refcount);
+	}
+	if (poolid < MAX_POOLS_PER_CLIENT) {
+		pool = cli->tmem_pools[poolid];
 		if (pool != NULL)
 			atomic_inc(&pool->refcount);
 	}
+out:
 	return pool;
 }
 
 static void zcache_put_pool(struct tmem_pool *pool)
 {
-	if (pool != NULL)
-		atomic_dec(&pool->refcount);
+	struct zcache_client *cli = NULL;
+
+	if (pool == NULL)
+		BUG();
+	cli = pool->client;
+	atomic_dec(&pool->refcount);
+	atomic_dec(&cli->refcount);
+}
+
+int zcache_new_client(uint16_t cli_id)
+{
+	struct zcache_client *cli = NULL;
+	int ret = -1;
+
+	if (cli_id == LOCAL_CLIENT)
+		cli = &zcache_host;
+	else if ((unsigned int)cli_id < MAX_CLIENTS)
+		cli = &zcache_clients[cli_id];
+	if (cli == NULL)
+		goto out;
+	if (cli->allocated)
+		goto out;
+	cli->allocated = 1;
+#ifdef CONFIG_FRONTSWAP
+	cli->xvpool = xv_create_pool();
+	if (cli->xvpool == NULL)
+		goto out;
+#endif
+	ret = 0;
+out:
+	return ret;
 }
 
 /* counters for debugging */
@@ -901,48 +1144,59 @@ static unsigned long zcache_curr_pers_pa
 /* forward reference */
 static int zcache_compress(struct page *from, void **out_va, size_t *out_len);
 
-static void *zcache_pampd_create(struct tmem_pool *pool, struct tmem_oid *oid,
-				 uint32_t index, struct page *page)
+static void *zcache_pampd_create(char *data, size_t size, bool raw, int eph,
+				struct tmem_pool *pool, struct tmem_oid *oid,
+				 uint32_t index)
 {
 	void *pampd = NULL, *cdata;
 	size_t clen;
 	int ret;
-	bool ephemeral = is_ephemeral(pool);
 	unsigned long count;
+	struct page *page = virt_to_page(data);
+	struct zcache_client *cli = pool->client;
+	uint16_t client_id = get_client_id_from_client(cli);
+	unsigned long zv_mean_zsize;
+	unsigned long curr_pers_pampd_count;
 
-	if (ephemeral) {
+	if (eph) {
 		ret = zcache_compress(page, &cdata, &clen);
 		if (ret == 0)
-
 			goto out;
 		if (clen == 0 || clen > zbud_max_buddy_size()) {
 			zcache_compress_poor++;
 			goto out;
 		}
-		pampd = (void *)zbud_create(pool->pool_id, oid, index,
-						page, cdata, clen);
+		pampd = (void *)zbud_create(client_id, pool->pool_id, oid,
+						index, page, cdata, clen);
 		if (pampd != NULL) {
 			count = atomic_inc_return(&zcache_curr_eph_pampd_count);
 			if (count > zcache_curr_eph_pampd_count_max)
 				zcache_curr_eph_pampd_count_max = count;
 		}
 	} else {
-		/*
-		 * FIXME: This is all the "policy" there is for now.
-		 * 3/4 totpages should allow ~37% of RAM to be filled with
-		 * compressed frontswap pages
-		 */
-		if (atomic_read(&zcache_curr_pers_pampd_count) >
-							3 * totalram_pages / 4)
+		curr_pers_pampd_count =
+			atomic_read(&zcache_curr_pers_pampd_count);
+		if (curr_pers_pampd_count >
+		    (zv_page_count_policy_percent * totalram_pages) / 100)
 			goto out;
 		ret = zcache_compress(page, &cdata, &clen);
 		if (ret == 0)
 			goto out;
-		if (clen > zv_max_page_size) {
+		/* reject if compression is too poor */
+		if (clen > zv_max_zsize) {
 			zcache_compress_poor++;
 			goto out;
 		}
-		pampd = (void *)zv_create(zcache_client.xvpool, pool->pool_id,
+		/* reject if mean compression is too poor */
+		if ((clen > zv_max_mean_zsize) && (curr_pers_pampd_count > 0)) {
+			zv_mean_zsize = xv_get_total_size_bytes(cli->xvpool) /
+						curr_pers_pampd_count;
+			if (zv_mean_zsize > zv_max_mean_zsize) {
+				zcache_mean_compress_poor++;
+				goto out;
+			}
+		}
+		pampd = (void *)zv_create(cli->xvpool, pool->pool_id,
 						oid, index, cdata, clen);
 		if (pampd == NULL)
 			goto out;
@@ -958,15 +1212,31 @@ out:
  * fill the pageframe corresponding to the struct page with the data
  * from the passed pampd
  */
-static int zcache_pampd_get_data(struct page *page, void *pampd,
-						struct tmem_pool *pool)
+static int zcache_pampd_get_data(char *data, size_t *bufsize, bool raw,
+					void *pampd, struct tmem_pool *pool,
+					struct tmem_oid *oid, uint32_t index)
 {
 	int ret = 0;
 
-	if (is_ephemeral(pool))
-		ret = zbud_decompress(page, pampd);
-	else
-		zv_decompress(page, pampd);
+	BUG_ON(is_ephemeral(pool));
+	zv_decompress(virt_to_page(data), pampd);
+	return ret;
+}
+
+/*
+ * fill the pageframe corresponding to the struct page with the data
+ * from the passed pampd, then free the pampd and delist it
+ */
+static int zcache_pampd_get_data_and_free(char *data, size_t *bufsize, bool raw,
+					void *pampd, struct tmem_pool *pool,
+					struct tmem_oid *oid, uint32_t index)
+{
+	int ret = 0;
+
+	BUG_ON(!is_ephemeral(pool));
+	zbud_decompress(virt_to_page(data), pampd);
+	zbud_free_and_delist((struct zbud_hdr *)pampd);
+	atomic_dec(&zcache_curr_eph_pampd_count);
 	return ret;
 }
 
@@ -974,23 +1244,49 @@ static int zcache_pampd_get_data(struct 
  * free the pampd and remove it from any zcache lists
  * pampd must no longer be pointed to from any tmem data structures!
  */
-static void zcache_pampd_free(void *pampd, struct tmem_pool *pool)
+static void zcache_pampd_free(void *pampd, struct tmem_pool *pool,
+				struct tmem_oid *oid, uint32_t index)
 {
+	struct zcache_client *cli = pool->client;
+
 	if (is_ephemeral(pool)) {
 		zbud_free_and_delist((struct zbud_hdr *)pampd);
 		atomic_dec(&zcache_curr_eph_pampd_count);
 		BUG_ON(atomic_read(&zcache_curr_eph_pampd_count) < 0);
 	} else {
-		zv_free(zcache_client.xvpool, (struct zv_hdr *)pampd);
+		zv_free(cli->xvpool, (struct zv_hdr *)pampd);
 		atomic_dec(&zcache_curr_pers_pampd_count);
 		BUG_ON(atomic_read(&zcache_curr_pers_pampd_count) < 0);
 	}
 }
 
+static void zcache_pampd_free_obj(struct tmem_pool *pool, struct tmem_obj *obj)
+{
+}
+
+static void zcache_pampd_new_obj(struct tmem_obj *obj)
+{
+}
+
+static int zcache_pampd_replace_in_obj(void *pampd, struct tmem_obj *obj)
+{
+	return -1;
+}
+
+static bool zcache_pampd_is_remote(void *pampd)
+{
+	return 0;
+}
+
 static struct tmem_pamops zcache_pamops = {
 	.create = zcache_pampd_create,
 	.get_data = zcache_pampd_get_data,
+	.get_data_and_free = zcache_pampd_get_data_and_free,
 	.free = zcache_pampd_free,
+	.free_obj = zcache_pampd_free_obj,
+	.new_obj = zcache_pampd_new_obj,
+	.replace_in_obj = zcache_pampd_replace_in_obj,
+	.is_remote = zcache_pampd_is_remote,
 };
 
 /*
@@ -1122,6 +1418,7 @@ ZCACHE_SYSFS_RO(put_to_flush);
 ZCACHE_SYSFS_RO(aborted_preload);
 ZCACHE_SYSFS_RO(aborted_shrink);
 ZCACHE_SYSFS_RO(compress_poor);
+ZCACHE_SYSFS_RO(mean_compress_poor);
 ZCACHE_SYSFS_RO_ATOMIC(zbud_curr_raw_pages);
 ZCACHE_SYSFS_RO_ATOMIC(zbud_curr_zpages);
 ZCACHE_SYSFS_RO_ATOMIC(curr_obj_count);
@@ -1130,6 +1427,10 @@ ZCACHE_SYSFS_RO_CUSTOM(zbud_unbuddied_li
 			zbud_show_unbuddied_list_counts);
 ZCACHE_SYSFS_RO_CUSTOM(zbud_cumul_chunk_counts,
 			zbud_show_cumul_chunk_counts);
+ZCACHE_SYSFS_RO_CUSTOM(zv_curr_dist_counts,
+			zv_curr_dist_counts_show);
+ZCACHE_SYSFS_RO_CUSTOM(zv_cumul_dist_counts,
+			zv_cumul_dist_counts_show);
 
 static struct attribute *zcache_attrs[] = {
 	&zcache_curr_obj_count_attr.attr,
@@ -1143,6 +1444,7 @@ static struct attribute *zcache_attrs[] 
 	&zcache_failed_eph_puts_attr.attr,
 	&zcache_failed_pers_puts_attr.attr,
 	&zcache_compress_poor_attr.attr,
+	&zcache_mean_compress_poor_attr.attr,
 	&zcache_zbud_curr_raw_pages_attr.attr,
 	&zcache_zbud_curr_zpages_attr.attr,
 	&zcache_zbud_curr_zbytes_attr.attr,
@@ -1160,6 +1462,11 @@ static struct attribute *zcache_attrs[] 
 	&zcache_aborted_shrink_attr.attr,
 	&zcache_zbud_unbuddied_list_counts_attr.attr,
 	&zcache_zbud_cumul_chunk_counts_attr.attr,
+	&zcache_zv_curr_dist_counts_attr.attr,
+	&zcache_zv_cumul_dist_counts_attr.attr,
+	&zcache_zv_max_zsize_attr.attr,
+	&zcache_zv_max_mean_zsize_attr.attr,
+	&zcache_zv_page_count_policy_percent_attr.attr,
 	NULL,
 };
 
@@ -1212,19 +1519,20 @@ static struct shrinker zcache_shrinker =
  * zcache shims between cleancache/frontswap ops and tmem
  */
 
-static int zcache_put_page(int pool_id, struct tmem_oid *oidp,
+static int zcache_put_page(int cli_id, int pool_id, struct tmem_oid *oidp,
 				uint32_t index, struct page *page)
 {
 	struct tmem_pool *pool;
 	int ret = -1;
 
 	BUG_ON(!irqs_disabled());
-	pool = zcache_get_pool_by_id(pool_id);
+	pool = zcache_get_pool_by_id(cli_id, pool_id);
 	if (unlikely(pool == NULL))
 		goto out;
 	if (!zcache_freeze && zcache_do_preload(pool) == 0) {
 		/* preload does preempt_disable on success */
-		ret = tmem_put(pool, oidp, index, page);
+		ret = tmem_put(pool, oidp, index, page_address(page),
+				PAGE_SIZE, 0, is_ephemeral(pool));
 		if (ret < 0) {
 			if (is_ephemeral(pool))
 				zcache_failed_eph_puts++;
@@ -1244,25 +1552,28 @@ out:
 	return ret;
 }
 
-static int zcache_get_page(int pool_id, struct tmem_oid *oidp,
+static int zcache_get_page(int cli_id, int pool_id, struct tmem_oid *oidp,
 				uint32_t index, struct page *page)
 {
 	struct tmem_pool *pool;
 	int ret = -1;
 	unsigned long flags;
+	size_t size = PAGE_SIZE;
 
 	local_irq_save(flags);
-	pool = zcache_get_pool_by_id(pool_id);
+	pool = zcache_get_pool_by_id(cli_id, pool_id);
 	if (likely(pool != NULL)) {
 		if (atomic_read(&pool->obj_count) > 0)
-			ret = tmem_get(pool, oidp, index, page);
+			ret = tmem_get(pool, oidp, index, page_address(page),
+					&size, 0, is_ephemeral(pool));
 		zcache_put_pool(pool);
 	}
 	local_irq_restore(flags);
 	return ret;
 }
 
-static int zcache_flush_page(int pool_id, struct tmem_oid *oidp, uint32_t index)
+static int zcache_flush_page(int cli_id, int pool_id,
+				struct tmem_oid *oidp, uint32_t index)
 {
 	struct tmem_pool *pool;
 	int ret = -1;
@@ -1270,7 +1581,7 @@ static int zcache_flush_page(int pool_id
 
 	local_irq_save(flags);
 	zcache_flush_total++;
-	pool = zcache_get_pool_by_id(pool_id);
+	pool = zcache_get_pool_by_id(cli_id, pool_id);
 	if (likely(pool != NULL)) {
 		if (atomic_read(&pool->obj_count) > 0)
 			ret = tmem_flush_page(pool, oidp, index);
@@ -1282,7 +1593,8 @@ static int zcache_flush_page(int pool_id
 	return ret;
 }
 
-static int zcache_flush_object(int pool_id, struct tmem_oid *oidp)
+static int zcache_flush_object(int cli_id, int pool_id,
+				struct tmem_oid *oidp)
 {
 	struct tmem_pool *pool;
 	int ret = -1;
@@ -1290,7 +1602,7 @@ static int zcache_flush_object(int pool_
 
 	local_irq_save(flags);
 	zcache_flobj_total++;
-	pool = zcache_get_pool_by_id(pool_id);
+	pool = zcache_get_pool_by_id(cli_id, pool_id);
 	if (likely(pool != NULL)) {
 		if (atomic_read(&pool->obj_count) > 0)
 			ret = tmem_flush_object(pool, oidp);
@@ -1302,34 +1614,52 @@ static int zcache_flush_object(int pool_
 	return ret;
 }
 
-static int zcache_destroy_pool(int pool_id)
+static int zcache_destroy_pool(int cli_id, int pool_id)
 {
 	struct tmem_pool *pool = NULL;
+	struct zcache_client *cli = NULL;
 	int ret = -1;
 
 	if (pool_id < 0)
 		goto out;
-	pool = zcache_client.tmem_pools[pool_id];
+	if (cli_id == LOCAL_CLIENT)
+		cli = &zcache_host;
+	else if ((unsigned int)cli_id < MAX_CLIENTS)
+		cli = &zcache_clients[cli_id];
+	if (cli == NULL)
+		goto out;
+	atomic_inc(&cli->refcount);
+	pool = cli->tmem_pools[pool_id];
 	if (pool == NULL)
 		goto out;
-	zcache_client.tmem_pools[pool_id] = NULL;
+	cli->tmem_pools[pool_id] = NULL;
 	/* wait for pool activity on other cpus to quiesce */
 	while (atomic_read(&pool->refcount) != 0)
 		;
+	atomic_dec(&cli->refcount);
 	local_bh_disable();
 	ret = tmem_destroy_pool(pool);
 	local_bh_enable();
 	kfree(pool);
-	pr_info("zcache: destroyed pool id=%d\n", pool_id);
+	pr_info("zcache: destroyed pool id=%d, cli_id=%d\n",
+			pool_id, cli_id);
 out:
 	return ret;
 }
 
-static int zcache_new_pool(uint32_t flags)
+static int zcache_new_pool(uint16_t cli_id, uint32_t flags)
 {
 	int poolid = -1;
 	struct tmem_pool *pool;
+	struct zcache_client *cli = NULL;
 
+	if (cli_id == LOCAL_CLIENT)
+		cli = &zcache_host;
+	else if ((unsigned int)cli_id < MAX_CLIENTS)
+		cli = &zcache_clients[cli_id];
+	if (cli == NULL)
+		goto out;
+	atomic_inc(&cli->refcount);
 	pool = kmalloc(sizeof(struct tmem_pool), GFP_KERNEL);
 	if (pool == NULL) {
 		pr_info("zcache: pool creation failed: out of memory\n");
@@ -1337,7 +1667,7 @@ static int zcache_new_pool(uint32_t flag
 	}
 
 	for (poolid = 0; poolid < MAX_POOLS_PER_CLIENT; poolid++)
-		if (zcache_client.tmem_pools[poolid] == NULL)
+		if (cli->tmem_pools[poolid] == NULL)
 			break;
 	if (poolid >= MAX_POOLS_PER_CLIENT) {
 		pr_info("zcache: pool creation failed: max exceeded\n");
@@ -1346,14 +1676,16 @@ static int zcache_new_pool(uint32_t flag
 		goto out;
 	}
 	atomic_set(&pool->refcount, 0);
-	pool->client = &zcache_client;
+	pool->client = cli;
 	pool->pool_id = poolid;
 	tmem_new_pool(pool, flags);
-	zcache_client.tmem_pools[poolid] = pool;
-	pr_info("zcache: created %s tmem pool, id=%d\n",
+	cli->tmem_pools[poolid] = pool;
+	pr_info("zcache: created %s tmem pool, id=%d, client=%d\n",
 		flags & TMEM_POOL_PERSIST ? "persistent" : "ephemeral",
-		poolid);
+		poolid, cli_id);
 out:
+	if (cli != NULL)
+		atomic_dec(&cli->refcount);
 	return poolid;
 }
 
@@ -1374,7 +1706,7 @@ static void zcache_cleancache_put_page(i
 	struct tmem_oid oid = *(struct tmem_oid *)&key;
 
 	if (likely(ind == index))
-		(void)zcache_put_page(pool_id, &oid, index, page);
+		(void)zcache_put_page(LOCAL_CLIENT, pool_id, &oid, index, page);
 }
 
 static int zcache_cleancache_get_page(int pool_id,
@@ -1386,7 +1718,7 @@ static int zcache_cleancache_get_page(in
 	int ret = -1;
 
 	if (likely(ind == index))
-		ret = zcache_get_page(pool_id, &oid, index, page);
+		ret = zcache_get_page(LOCAL_CLIENT, pool_id, &oid, index, page);
 	return ret;
 }
 
@@ -1398,7 +1730,7 @@ static void zcache_cleancache_flush_page
 	struct tmem_oid oid = *(struct tmem_oid *)&key;
 
 	if (likely(ind == index))
-		(void)zcache_flush_page(pool_id, &oid, ind);
+		(void)zcache_flush_page(LOCAL_CLIENT, pool_id, &oid, ind);
 }
 
 static void zcache_cleancache_flush_inode(int pool_id,
@@ -1406,13 +1738,13 @@ static void zcache_cleancache_flush_inod
 {
 	struct tmem_oid oid = *(struct tmem_oid *)&key;
 
-	(void)zcache_flush_object(pool_id, &oid);
+	(void)zcache_flush_object(LOCAL_CLIENT, pool_id, &oid);
 }
 
 static void zcache_cleancache_flush_fs(int pool_id)
 {
 	if (pool_id >= 0)
-		(void)zcache_destroy_pool(pool_id);
+		(void)zcache_destroy_pool(LOCAL_CLIENT, pool_id);
 }
 
 static int zcache_cleancache_init_fs(size_t pagesize)
@@ -1420,7 +1752,7 @@ static int zcache_cleancache_init_fs(siz
 	BUG_ON(sizeof(struct cleancache_filekey) !=
 				sizeof(struct tmem_oid));
 	BUG_ON(pagesize != PAGE_SIZE);
-	return zcache_new_pool(0);
+	return zcache_new_pool(LOCAL_CLIENT, 0);
 }
 
 static int zcache_cleancache_init_shared_fs(char *uuid, size_t pagesize)
@@ -1429,7 +1761,7 @@ static int zcache_cleancache_init_shared
 	BUG_ON(sizeof(struct cleancache_filekey) !=
 				sizeof(struct tmem_oid));
 	BUG_ON(pagesize != PAGE_SIZE);
-	return zcache_new_pool(0);
+	return zcache_new_pool(LOCAL_CLIENT, 0);
 }
 
 static struct cleancache_ops zcache_cleancache_ops = {
@@ -1483,8 +1815,8 @@ static int zcache_frontswap_put_page(uns
 	BUG_ON(!PageLocked(page));
 	if (likely(ind64 == ind)) {
 		local_irq_save(flags);
-		ret = zcache_put_page(zcache_frontswap_poolid, &oid,
-					iswiz(ind), page);
+		ret = zcache_put_page(LOCAL_CLIENT, zcache_frontswap_poolid,
+					&oid, iswiz(ind), page);
 		local_irq_restore(flags);
 	}
 	return ret;
@@ -1502,8 +1834,8 @@ static int zcache_frontswap_get_page(uns
 
 	BUG_ON(!PageLocked(page));
 	if (likely(ind64 == ind))
-		ret = zcache_get_page(zcache_frontswap_poolid, &oid,
-					iswiz(ind), page);
+		ret = zcache_get_page(LOCAL_CLIENT, zcache_frontswap_poolid,
+					&oid, iswiz(ind), page);
 	return ret;
 }
 
@@ -1515,8 +1847,8 @@ static void zcache_frontswap_flush_page(
 	struct tmem_oid oid = oswiz(type, ind);
 
 	if (likely(ind64 == ind))
-		(void)zcache_flush_page(zcache_frontswap_poolid, &oid,
-					iswiz(ind));
+		(void)zcache_flush_page(LOCAL_CLIENT, zcache_frontswap_poolid,
+					&oid, iswiz(ind));
 }
 
 /* flush all pages from the passed swaptype */
@@ -1527,7 +1859,8 @@ static void zcache_frontswap_flush_area(
 
 	for (ind = SWIZ_MASK; ind >= 0; ind--) {
 		oid = oswiz(type, ind);
-		(void)zcache_flush_object(zcache_frontswap_poolid, &oid);
+		(void)zcache_flush_object(LOCAL_CLIENT,
+						zcache_frontswap_poolid, &oid);
 	}
 }
 
@@ -1535,7 +1868,8 @@ static void zcache_frontswap_init(unsign
 {
 	/* a single tmem poolid is used for all frontswap "types" (swapfiles) */
 	if (zcache_frontswap_poolid < 0)
-		zcache_frontswap_poolid = zcache_new_pool(TMEM_POOL_PERSIST);
+		zcache_frontswap_poolid =
+			zcache_new_pool(LOCAL_CLIENT, TMEM_POOL_PERSIST);
 }
 
 static struct frontswap_ops zcache_frontswap_ops = {
@@ -1624,6 +1958,11 @@ static int __init zcache_init(void)
 				sizeof(struct tmem_objnode), 0, 0, NULL);
 	zcache_obj_cache = kmem_cache_create("zcache_obj",
 				sizeof(struct tmem_obj), 0, 0, NULL);
+	ret = zcache_new_client(LOCAL_CLIENT);
+	if (ret) {
+		pr_err("zcache: can't create client\n");
+		goto out;
+	}
 #endif
 #ifdef CONFIG_CLEANCACHE
 	if (zcache_enabled && use_cleancache) {
@@ -1642,11 +1981,6 @@ static int __init zcache_init(void)
 	if (zcache_enabled && use_frontswap) {
 		struct frontswap_ops old_ops;
 
-		zcache_client.xvpool = xv_create_pool();
-		if (zcache_client.xvpool == NULL) {
-			pr_err("zcache: can't create xvpool\n");
-			goto out;
-		}
 		old_ops = zcache_frontswap_register_ops();
 		pr_info("zcache: frontswap enabled using kernel "
 			"transcendent memory and xvmalloc\n");


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v2] staging: zcache: support multiple clients, prep for KVM and RAMster
  2011-06-30 19:01 ` Dan Magenheimer
@ 2011-06-30 19:23   ` Greg KH
  -1 siblings, 0 replies; 25+ messages in thread
From: Greg KH @ 2011-06-30 19:23 UTC (permalink / raw)
  To: Dan Magenheimer
  Cc: Marcus Klemm, Seth Jennings, Konrad Wilk, linux-kernel, devel,
	kvm, linux-mm

On Thu, Jun 30, 2011 at 12:01:08PM -0700, Dan Magenheimer wrote:
> Hi Greg --
> 
> I think this patch is now ready for staging-next and for merging when
> the 3.1 window opens.  Please let me know if you need any logistics
> done differently.

Ok, thanks, I'll queue it up later this week for 3.1

greg k-h

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v2] staging: zcache: support multiple clients, prep for KVM and RAMster
  2011-06-30 19:01 ` Dan Magenheimer
  (?)
@ 2011-06-30 22:40   ` Dan Carpenter
  -1 siblings, 0 replies; 25+ messages in thread
From: Dan Carpenter @ 2011-06-30 22:40 UTC (permalink / raw)
  To: Dan Magenheimer
  Cc: Greg Kroah-Hartman, Marcus Klemm, kvm, Konrad Wilk, linux-kernel,
	linux-mm, Seth Jennings, devel

On Thu, Jun 30, 2011 at 12:01:08PM -0700, Dan Magenheimer wrote:
> +static int zv_curr_dist_counts_show(char *buf)
> +{
> +	unsigned long i, n, chunks = 0, sum_total_chunks = 0;
> +	char *p = buf;
> +
> +	for (i = 0; i <= NCHUNKS - 1; i++) {

It's more common to write the condition as:  i < NCHUNKS.


> +		n = zv_curr_dist_counts[i];

zv_curr_dist_counts has NCHUNKS + 1 elements so we never
display the final element.  I don't know this code, so I could be
wrong but I think that we could make zv_curr_dist_counts only hold
NCHUNKS elements.
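
To be concrete, this is the sort of thing I was picturing (an untested
sketch: just the code from your patch with the array size and the loop
condition changed; as I said, I may be missing a reason the extra slot
is needed):

        /* hypothetical: size the arrays to match the loop bound */
        static unsigned long zv_curr_dist_counts[NCHUNKS];
        static unsigned long zv_cumul_dist_counts[NCHUNKS];

        /* ...and visit every element with the usual idiom */
        for (i = 0; i < NCHUNKS; i++) {
                n = zv_curr_dist_counts[i];
                p += sprintf(p, "%lu ", n);
                chunks += n;
                sum_total_chunks += i * n;
        }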

> +		p += sprintf(p, "%lu ", n);
> +		chunks += n;
> +		sum_total_chunks += i * n;
> +	}
> +	p += sprintf(p, "mean:%lu\n",
> +		chunks == 0 ? 0 : sum_total_chunks / chunks);
> +	return p - buf;
> +}
> +
> +static int zv_cumul_dist_counts_show(char *buf)
> +{
> +	unsigned long i, n, chunks = 0, sum_total_chunks = 0;
> +	char *p = buf;
> +
> +	for (i = 0; i <= NCHUNKS - 1; i++) {
> +		n = zv_cumul_dist_counts[i];

Same situation.

> +		p += sprintf(p, "%lu ", n);
> +		chunks += n;
> +		sum_total_chunks += i * n;
> +	}
> +	p += sprintf(p, "mean:%lu\n",
> +		chunks == 0 ? 0 : sum_total_chunks / chunks);
> +	return p - buf;
> +}
> +

regards,
dan carpenter

^ permalink raw reply	[flat|nested] 25+ messages in thread

* RE: [PATCH v2] staging: zcache: support multiple clients, prep for KVM and RAMster
  2011-06-30 22:40   ` Dan Carpenter
@ 2011-06-30 23:28     ` Dan Magenheimer
  -1 siblings, 0 replies; 25+ messages in thread
From: Dan Magenheimer @ 2011-06-30 23:28 UTC (permalink / raw)
  To: Dan Carpenter
  Cc: Greg Kroah-Hartman, Marcus Klemm, kvm, Konrad Wilk, linux-kernel,
	linux-mm, Seth Jennings, devel

> > +	for (i = 0; i <= NCHUNKS - 1; i++) {
> 
> It's more common to write the condition as:  i < NCHUNKS.
> 
> > +		n = zv_curr_dist_counts[i];
> 
> zv_curr_dist_counts has NCHUNKS + 1 elements so we never
> display the final element.  I don't know this code, so I could be
> wrong but I think that we could make zv_curr_dist_counts only hold
> NCHUNKS elements.
> 
> > +	for (i = 0; i <= NCHUNKS - 1; i++) {
> 
> Same situation.

Hi Dan --

Thanks for the careful review.  You're right... some
of this was leftover from debugging an off-by-one error,
though the code as is still works.

OTOH, there's a good chance that much of this sysfs
code will disappear before zcache would get promoted
out of staging, since it is to help those experimenting
with zcache to get more insight into what the underlying
compression/accept-reject algorithms are doing.

So I hope you (and GregKH) are OK that another version is
not necessary at this time to fix these.

Thanks,
Dan

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v2] staging: zcache: support multiple clients, prep for KVM and RAMster
  2011-06-30 23:28     ` Dan Magenheimer
@ 2011-07-01  8:38       ` Dan Carpenter
  -1 siblings, 0 replies; 25+ messages in thread
From: Dan Carpenter @ 2011-07-01  8:38 UTC (permalink / raw)
  To: Dan Magenheimer
  Cc: Greg Kroah-Hartman, Marcus Klemm, kvm, Konrad Wilk, linux-kernel,
	linux-mm, Seth Jennings, devel

On Thu, Jun 30, 2011 at 04:28:14PM -0700, Dan Magenheimer wrote:
> Hi Dan --
> 
> Thanks for the careful review.  You're right... some
> of this was leftover from debugging an off-by-one error,
> though the code as is still works.
> 
> OTOH, there's a good chance that much of this sysfs
> code will disappear before zcache would get promoted
> out of staging, since it is to help those experimenting
> with zcache to get more insight into what the underlying
> compression/accept-reject algorithms are doing.
> 
> So I hope you (and GregKH) are OK that another version is
> not necessary at this time to fix these.

Off by one errors are kind of insidious.  People cut and paste them
and they spread.  If someone adds a new list of chunks then there
are now two examples that are correct and two which have an extra
element, so it's 50/50 that he'll copy the right one.

Btw, looking at it again, this seems like maybe a similar issue in
zbud_evict_zbpg():

   515          /* now try freeing unbuddied pages, starting with least space avail */
   516          for (i = 0; i < MAX_CHUNK; i++) {
   517  retry_unbud_list_i:


MAX_CHUNKS is NCHUNKS - 1.  Shouldn't that be i < NCHUNKS so that we
reach the last element in the list?

regards,
dan carpenter


^ permalink raw reply	[flat|nested] 25+ messages in thread

* RE: [PATCH v2] staging: zcache: support multiple clients, prep for KVM and RAMster
  2011-07-01  8:38       ` Dan Carpenter
@ 2011-07-01 14:31         ` Dan Magenheimer
  -1 siblings, 0 replies; 25+ messages in thread
From: Dan Magenheimer @ 2011-07-01 14:31 UTC (permalink / raw)
  To: Dan Carpenter
  Cc: Greg Kroah-Hartman, Marcus Klemm, kvm, Konrad Wilk, linux-kernel,
	linux-mm, Seth Jennings, devel

> From: Dan Carpenter [mailto:error27@gmail.com]
> 
> On Thu, Jun 30, 2011 at 04:28:14PM -0700, Dan Magenheimer wrote:
> > Hi Dan --
> >
> > Thanks for the careful review.  You're right... some
> > of this was leftover from debugging an off-by-one error,
> > though the code as is still works.
> >
> > OTOH, there's a good chance that much of this sysfs
> > code will disappear before zcache would get promoted
> > out of staging, since it is to help those experimenting
> > with zcache to get more insight into what the underlying
> > compression/accept-reject algorithms are doing.
> >
> > So I hope you (and GregKH) are OK that another version is
> > not necessary at this time to fix these.
> 
> Off by one errors are kind of insidious.  People cut and paste them
> and they spread.  If someone adds a new list of chunks then there
> are now two examples that are correct and two which have an extra
> element, so it's 50/50 that he'll copy the right one.

True, but these are NOT off-by-one errors... they are
correct-but-slightly-ugly code snippets.  (To clarify, I said
the *ugliness* arose when debugging an off-by-one error.)

Patches always welcome, and I agree that these should be
fixed eventually, assuming the code doesn't go away completely
first... I'm simply stating the position
that going through another test/submit cycle to fix
correct-but-slightly-ugly code which is present only to
surface information for experiments is not high on my priority
list right now... unless GregKH says he won't accept the patch.
 
> Btw, looking at it again, this seems like maybe a similar issue in
> zbud_evict_zbpg():
> 
>    516          for (i = 0; i < MAX_CHUNK; i++) {
>    517  retry_unbud_list_i:
> 
> 
> MAX_CHUNKS is NCHUNKS - 1.  Shouldn't that be i < NCHUNKS so that we
> reach the last element in the list?

No, the last element in that list is unused.  There is a comment
to that effect someplace in the code.  (These lists are keeping
track of pages with "chunks" of available space and the last
entry would have no available space so is always empty.)

Thanks again for your interest... are you using zcache?

Dan

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v2] staging: zcache: support multiple clients, prep for KVM and RAMster
  2011-07-01 14:31         ` Dan Magenheimer
@ 2011-07-01 16:58           ` Dan Carpenter
  -1 siblings, 0 replies; 25+ messages in thread
From: Dan Carpenter @ 2011-07-01 16:58 UTC (permalink / raw)
  To: Dan Magenheimer
  Cc: Greg Kroah-Hartman, Marcus Klemm, kvm, Konrad Wilk, linux-kernel,
	linux-mm, Seth Jennings, devel

On Fri, Jul 01, 2011 at 07:31:54AM -0700, Dan Magenheimer wrote:
> > Off by one errors are kind of insidious.  People cut and paste them
> > and they spread.  If someone adds a new list of chunks then there
> > are now two examples that are correct and two which have an extra
> > element, so it's 50/50 that he'll copy the right one.
> 
> True, but these are NOT off-by-one errors... they are
> correct-but-slightly-ugly code snippets.  (To clarify, I said
> the *ugliness* arose when debugging an off-by-one error.)
> 

What I meant was the new arrays are *one* element too large.

> Patches always welcome, and I agree that these should be
> fixed eventually, assuming the code doesn't go away completely
> first... I'm simply stating the position
> that going through another test/submit cycle to fix
> correct-but-slightly-ugly code which is present only to
> surface information for experiments is not high on my priority
> list right now... unless GregKH says he won't accept the patch.
>  
> > Btw, looking at it again, this seems like maybe a similar issue in
> > zbud_evict_zbpg():
> > 
> >    516          for (i = 0; i < MAX_CHUNK; i++) {
> >    517  retry_unbud_list_i:
> > 
> > 
> > MAX_CHUNKS is NCHUNKS - 1.  Shouldn't that be i < NCHUNKS so that we
> > reach the last element in the list?
> 
> No, the last element in that list is unused.  There is a comment
> to that effect someplace in the code.  (These lists are keeping
> track of pages with "chunks" of available space and the last
> entry would have no available space so is always empty.)

The comment says that the first element isn't used.  Perhaps the
comment is out of date and now it's the last element that isn't
used.  To me, it makes sense to have an unused first element, but it
doesn't make sense to have an unused last element.  Why not just
make the array smaller?

Also if the last element of the original arrays isn't used, then
does that mean the last *two* elements of the new arrays aren't
used?

Getting array sizes wrong is not a "correct-but-slightly-ugly"
thing.  *grumble* *grumble* *grumble*.  But it doesn't crash the
system so I'm fine with it going in as is...

> 
> Thanks again for your interest... are you using zcache?

No.  I was just on the driver-devel list reviewing patches at
random.

regards,
dan carpenter


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v2] staging: zcache: support multiple clients, prep for KVM and RAMster
  2011-07-01 16:58           ` Dan Carpenter
@ 2011-07-06  3:30             ` Greg KH
  -1 siblings, 0 replies; 25+ messages in thread
From: Greg KH @ 2011-07-06  3:30 UTC (permalink / raw)
  To: Dan Carpenter
  Cc: Dan Magenheimer, Greg Kroah-Hartman, Marcus Klemm, kvm,
	Konrad Wilk, linux-kernel, linux-mm, Seth Jennings, devel

On Fri, Jul 01, 2011 at 07:58:45PM +0300, Dan Carpenter wrote:
> On Fri, Jul 01, 2011 at 07:31:54AM -0700, Dan Magenheimer wrote:
> > > Off by one errors are kind of insidious.  People cut and paste them
> > > and they spread.  If someone adds a new list of chunks then there
> > > are now two examples that are correct and two which have an extra
> > > element, so it's 50/50 that he'll copy the right one.
> > 
> > True, but these are NOT off-by-one errors... they are
> > correct-but-slightly-ugly code snippets.  (To clarify, I said
> > the *ugliness* arose when debugging an off-by-one error.)
> > 
> 
> What I meant was the new arrays are *one* element too large.
> 
> > Patches always welcome, and I agree that these should be
> > fixed eventually, assuming the code doesn't go away completely
> > first... I'm simply stating the position
> > that going through another test/submit cycle to fix
> > correct-but-slightly-ugly code which is present only to
> > surface information for experiments is not high on my priority
> > list right now... unless GregKH says he won't accept the patch.
> >  
> > > Btw, looking at it again, this seems like maybe a similar issue in
> > > zbud_evict_zbpg():
> > > 
> > >    516          for (i = 0; i < MAX_CHUNK; i++) {
> > >    517  retry_unbud_list_i:
> > > 
> > > 
> > > MAX_CHUNKS is NCHUNKS - 1.  Shouldn't that be i < NCHUNKS so that we
> > > reach the last element in the list?
> > 
> > No, the last element in that list is unused.  There is a comment
> > to that effect someplace in the code.  (These lists are keeping
> > track of pages with "chunks" of available space and the last
> > entry would have no available space so is always empty.)
> 
> The comment says that the first element isn't used.  Perhaps the
> comment is out of date and now it's the last element that isn't
> used.  To me, it makes sense to have an unused first element, but it
> doesn't make sense to have an unused last element.  Why not just
> make the array smaller?
> 
> Also if the last element of the original arrays isn't used, then
> does that mean the last *two* elements of the new arrays aren't
> used?
> 
> Getting array sizes wrong is not a "correct-but-slightly-ugly"
> thing.  *grumble* *grumble* *grumble*.  But it doesn't crash the
> system so I'm fine with it going in as is...

I'm not.  Please fix this up.  I'll not accept it until it is.

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v2] staging: zcache: support multiple clients, prep for KVM and RAMster
  2011-06-30 19:01 ` Dan Magenheimer
@ 2011-08-10 14:21   ` Seth Jennings
  -1 siblings, 0 replies; 25+ messages in thread
From: Seth Jennings @ 2011-08-10 14:21 UTC (permalink / raw)
  To: Dan Magenheimer
  Cc: Greg Kroah-Hartman, Marcus Klemm, Konrad Wilk, linux-kernel,
	devel, kvm, linux-mm, Brian King, Robert Jennings

On 06/30/2011 02:01 PM, Dan Magenheimer wrote:
> Hi Greg --
> 
> I think this patch is now ready for staging-next and for merging when
> the 3.1 window opens.  Please let me know if you need any logistics
> done differently.
> 
> Thanks,
> Dan
> 
> ===
> 
>> From: Dan Magenheimer <dan.magenheimer@oracle.com>
> Subject: staging: zcache: support multiple clients, prep for KVM and RAMster
> 
> This is version 2 of an update to zcache, incorporating feedback from the list.
> This patch adds support to the in-kernel transcendent memory ("tmem") code
> and the zcache driver for multiple clients, which will be needed for both
> RAMster and KVM support.  It also adds additional tmem callbacks to support
> RAMster and corresponding no-op stubs in the zcache driver.  In v2, I've
> also taken the liberty of adding some additional sysfs variables to
> both surface information and allow policy control.  Those experimenting
> with zcache should find them useful.
> 
> Signed-off-by: Dan Magenheimer <dan.magenheimer@oracle.com>
> 
> [v2: konrad.wilk@oracle.com: fix bools, add check for NULL, fix a comment]
> [v2: sjenning@linux.vnet.ibm.com: add info/tunables for poor compression]
> [v2: marcusklemm@googlemail.com: add tunable for max persistent pages]
> Cc: Nitin Gupta <ngupta@vflare.org>
> Cc: linux-mm@kvack.org
> Cc: kvm@vger.kernel.org
> 
> ===
> 
> Diffstat:
>  drivers/staging/zcache/tmem.c            |  100 +++-
>  drivers/staging/zcache/tmem.h            |   23 
>  drivers/staging/zcache/zcache.c          |  512 +++++++++++++++++----
>  3 files changed, 520 insertions(+), 115 deletions(-)
> 
> 
<cut>
> @@ -901,48 +1144,59 @@ static unsigned long zcache_curr_pers_pa
>  /* forward reference */
>  static int zcache_compress(struct page *from, void **out_va, size_t *out_len);
>  
> -static void *zcache_pampd_create(struct tmem_pool *pool, struct tmem_oid *oid,
> -				 uint32_t index, struct page *page)
> +static void *zcache_pampd_create(char *data, size_t size, bool raw, int eph,
> +				struct tmem_pool *pool, struct tmem_oid *oid,
> +				 uint32_t index)
>  {
>  	void *pampd = NULL, *cdata;
>  	size_t clen;
>  	int ret;
> -	bool ephemeral = is_ephemeral(pool);
>  	unsigned long count;
> +	struct page *page = virt_to_page(data);

With zcache_put_page() modified to pass page_address(page) instead of the 
actual page structure, in combination with the function signature changes 
to tmem_put() and zcache_pampd_create(), zcache_pampd_create() tries to 
(re)derive the page structure from the virtual address.  However, if the 
original page is a high memory page (or any unmapped page), this 
virt_to_page() fails because the page_address() in zcache_put_page()
returned NULL.
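
To make the failure mode concrete, here is a minimal sketch (not the
zcache code itself) of the round trip that goes wrong, assuming a
32-bit kernel with CONFIG_HIGHMEM and a highmem page that is not
currently kmapped:

#include <linux/mm.h>
#include <linux/highmem.h>

/*
 * Illustration only: page_address() returns NULL for an unmapped
 * highmem page, and virt_to_page() is only defined for lowmem
 * (direct-mapped) virtual addresses, so the second call trips the
 * BUG in __phys_addr() when CONFIG_DEBUG_VIRTUAL is set.
 */
static struct page *broken_round_trip(struct page *page)
{
	char *va = page_address(page);	/* NULL for unmapped highmem */

	return virt_to_page(va);	/* invalid: va is not a lowmem address */
}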

With CONFIG_DEBUG_VIRTUAL set, the BUG message is this:
==========
[  101.347711] ------------[ cut here ]------------
[  101.348030] kernel BUG at arch/x86/mm/physaddr.c:51!
[  101.348030] invalid opcode: 0000 [#1] DEBUG_PAGEALLOC
[  101.348030] Modules linked in:
[  101.348030] 
[  101.348030] Pid: 20, comm: kswapd0 Not tainted 3.1.0-rc1+ #229 Bochs Bochs
[  101.348030] EIP: 0060:[<c1058c9a>] EFLAGS: 00010013 CPU: 0
[  101.348030] EIP is at __phys_addr+0x1a/0x50
[  101.348030] EAX: 00000000 EBX: f5e64000 ECX: 00000000 EDX: 00001000
[  101.348030] ESI: f6ffd000 EDI: 00000000 EBP: f6353c10 ESP: f6353c10
[  101.348030]  DS: 007b ES: 007b FS: 0000 GS: 0000 SS: 0068
[  101.348030] Process kswapd0 (pid: 20, ti=f6352000 task=f69b9c20 task.ti=f6352000)
[  101.348030] Stack:
[  101.348030]  f6353c60 c12e4114 00000000 00000001 00000000 c12e5682 00000000 00000046
[  101.348030]  00000000 f5e65668 f5e65658 f5e65654 f580f000 f6353c60 c13904f5 00000000
[  101.348030]  f5e65654 f5e64000 f5eab000 00000000 f6353cb0 c12e5713 00000000 f5e64000
[  101.348030] Call Trace:
[  101.348030]  [<c12e4114>] zcache_pampd_create+0x14/0x6a0
[  101.348030]  [<c12e5682>] ? tmem_put+0x52/0x3f0
[  101.348030]  [<c13904f5>] ? _raw_spin_lock+0x45/0x50
[  101.348030]  [<c12e5713>] tmem_put+0xe3/0x3f0
[  101.348030]  [<c10f5358>] ? page_address+0xb8/0xe0
[  101.348030]  [<c12e498b>] zcache_frontswap_put_page+0x1eb/0x2e0
[  101.348030]  [<c110798b>] __frontswap_put_page+0x6b/0x110
[  101.348030]  [<c1103ccf>] swap_writepage+0x8f/0xf0
[  101.348030]  [<c10ee377>] shmem_writepage+0x1a7/0x1d0
[  101.348030]  [<c10ea907>] shrink_page_list+0x3f7/0x7c0
[  101.348030]  [<c10eafdd>] shrink_inactive_list+0x12d/0x360
[  101.348030]  [<c10eb5f8>] shrink_zone+0x3e8/0x530
[  101.348030]  [<c10ebc92>] kswapd+0x552/0x940
[  101.348030]  [<c1084460>] ? wake_up_bit+0x30/0x30
[  101.348030]  [<c10eb740>] ? shrink_zone+0x530/0x530
[  101.348030]  [<c10840e4>] kthread+0x74/0x80
[  101.348030]  [<c1084070>] ? __init_kthread_worker+0x60/0x60
[  101.348030]  [<c1397736>] kernel_thread_helper+0x6/0xd
[  101.348030] Code: 00 c0 ff 81 eb 00 20 00 00 39 d8 72 cd eb ae 66 90 55 3d ff ff ff bf 89 e5 76 10 80 3d 10 88 53 c1 00 75 09 05 00 00 00 40 5d c3 <0f> 0b 8b 15 b0 6a 9e c1 81 c2 00 00 80 00 39 d0 72 e7 8b 15 a8 
[  101.348030] EIP: [<c1058c9a>] __phys_addr+0x1a/0x50 SS:ESP 0068:f6353c10
==========

This crash is hit every time a high memory page is swapped out.

I have no solution right now other than to revert this patch and
restore the original signatures.

What was the rationale for the signature changes?

--
Seth

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v2] staging: zcache: support multiple clients, prep for KVM and RAMster
  2011-08-10 14:21   ` Seth Jennings
@ 2011-08-10 14:44     ` Seth Jennings
  -1 siblings, 0 replies; 25+ messages in thread
From: Seth Jennings @ 2011-08-10 14:44 UTC (permalink / raw)
  To: Dan Magenheimer
  Cc: Greg Kroah-Hartman, Marcus Klemm, Konrad Wilk, linux-kernel,
	devel, kvm, linux-mm, Brian King, Robert Jennings

On 08/10/2011 09:21 AM, Seth Jennings wrote:
> On 06/30/2011 02:01 PM, Dan Magenheimer wrote:
>> Hi Greg --
>>
>> I think this patch is now ready for staging-next and for merging when
>> the 3.1 window opens.  Please let me know if you need any logistics
>> done differently.
>>
>> Thanks,
>> Dan
>>
>> ===
>>
>>> From: Dan Magenheimer <dan.magenheimer@oracle.com>
>> Subject: staging: zcache: support multiple clients, prep for KVM and RAMster
>>
>> This is version 2 of an update to zcache, incorporating feedback from the list.
>> This patch adds support to the in-kernel transcendent memory ("tmem") code
>> and the zcache driver for multiple clients, which will be needed for both
>> RAMster and KVM support.  It also adds additional tmem callbacks to support
>> RAMster and corresponding no-op stubs in the zcache driver.  In v2, I've
>> also taken the liberty of adding some additional sysfs variables to
>> both surface information and allow policy control.  Those experimenting
>> with zcache should find them useful.
>>
>> Signed-off-by: Dan Magenheimer <dan.magenheimer@oracle.com>
>>
>> [v2: konrad.wilk@oracle.com: fix bools, add check for NULL, fix a comment]
>> [v2: sjenning@linux.vnet.ibm.com: add info/tunables for poor compression]
>> [v2: marcusklemm@googlemail.com: add tunable for max persistent pages]
>> Cc: Nitin Gupta <ngupta@vflare.org>
>> Cc: linux-mm@kvack.org
>> Cc: kvm@vger.kernel.org
>>
>> ===
>>
>> Diffstat:
>>  drivers/staging/zcache/tmem.c            |  100 +++-
>>  drivers/staging/zcache/tmem.h            |   23 
>>  drivers/staging/zcache/zcache.c          |  512 +++++++++++++++++----
>>  3 files changed, 520 insertions(+), 115 deletions(-)
>>
>>
> <cut>
>> @@ -901,48 +1144,59 @@ static unsigned long zcache_curr_pers_pa
>>  /* forward reference */
>>  static int zcache_compress(struct page *from, void **out_va, size_t *out_len);
>>  
>> -static void *zcache_pampd_create(struct tmem_pool *pool, struct tmem_oid *oid,
>> -				 uint32_t index, struct page *page)
>> +static void *zcache_pampd_create(char *data, size_t size, bool raw, int eph,
>> +				struct tmem_pool *pool, struct tmem_oid *oid,
>> +				 uint32_t index)
>>  {
>>  	void *pampd = NULL, *cdata;
>>  	size_t clen;
>>  	int ret;
>> -	bool ephemeral = is_ephemeral(pool);
>>  	unsigned long count;
>> +	struct page *page = virt_to_page(data);
> 
> With zcache_put_page() modified to pass page_address(page) instead of the 
> actual page structure, in combination with the function signature changes 
> to tmem_put() and zcache_pampd_create(), zcache_pampd_create() tries to 
> (re)derive the page structure from the virtual address.  However, if the 
> original page is a high memory page (or any unmapped page), this 
> virt_to_page() fails because the page_address() in zcache_put_page()
> returned NULL.
> 
> With CONFIG_DEBUG_VIRTUAL set, the BUG message is this:
> ==========
> [  101.347711] ------------[ cut here ]------------
> [  101.348030] kernel BUG at arch/x86/mm/physaddr.c:51!
> [  101.348030] invalid opcode: 0000 [#1] DEBUG_PAGEALLOC
> [  101.348030] Modules linked in:
> [  101.348030] 
> [  101.348030] Pid: 20, comm: kswapd0 Not tainted 3.1.0-rc1+ #229 Bochs Bochs
> [  101.348030] EIP: 0060:[<c1058c9a>] EFLAGS: 00010013 CPU: 0
> [  101.348030] EIP is at __phys_addr+0x1a/0x50
> [  101.348030] EAX: 00000000 EBX: f5e64000 ECX: 00000000 EDX: 00001000
> [  101.348030] ESI: f6ffd000 EDI: 00000000 EBP: f6353c10 ESP: f6353c10
> [  101.348030]  DS: 007b ES: 007b FS: 0000 GS: 0000 SS: 0068
> [  101.348030] Process kswapd0 (pid: 20, ti=f6352000 task=f69b9c20 task.ti=f6352000)
> [  101.348030] Stack:
> [  101.348030]  f6353c60 c12e4114 00000000 00000001 00000000 c12e5682 00000000 00000046
> [  101.348030]  00000000 f5e65668 f5e65658 f5e65654 f580f000 f6353c60 c13904f5 00000000
> [  101.348030]  f5e65654 f5e64000 f5eab000 00000000 f6353cb0 c12e5713 00000000 f5e64000
> [  101.348030] Call Trace:
> [  101.348030]  [<c12e4114>] zcache_pampd_create+0x14/0x6a0
> [  101.348030]  [<c12e5682>] ? tmem_put+0x52/0x3f0
> [  101.348030]  [<c13904f5>] ? _raw_spin_lock+0x45/0x50
> [  101.348030]  [<c12e5713>] tmem_put+0xe3/0x3f0
> [  101.348030]  [<c10f5358>] ? page_address+0xb8/0xe0
> [  101.348030]  [<c12e498b>] zcache_frontswap_put_page+0x1eb/0x2e0
> [  101.348030]  [<c110798b>] __frontswap_put_page+0x6b/0x110
> [  101.348030]  [<c1103ccf>] swap_writepage+0x8f/0xf0
> [  101.348030]  [<c10ee377>] shmem_writepage+0x1a7/0x1d0
> [  101.348030]  [<c10ea907>] shrink_page_list+0x3f7/0x7c0
> [  101.348030]  [<c10eafdd>] shrink_inactive_list+0x12d/0x360
> [  101.348030]  [<c10eb5f8>] shrink_zone+0x3e8/0x530
> [  101.348030]  [<c10ebc92>] kswapd+0x552/0x940
> [  101.348030]  [<c1084460>] ? wake_up_bit+0x30/0x30
> [  101.348030]  [<c10eb740>] ? shrink_zone+0x530/0x530
> [  101.348030]  [<c10840e4>] kthread+0x74/0x80
> [  101.348030]  [<c1084070>] ? __init_kthread_worker+0x60/0x60
> [  101.348030]  [<c1397736>] kernel_thread_helper+0x6/0xd
> [  101.348030] Code: 00 c0 ff 81 eb 00 20 00 00 39 d8 72 cd eb ae 66 90 55 3d ff ff ff bf 89 e5 76 10 80 3d 10 88 53 c1 00 75 09 05 00 00 00 40 5d c3 <0f> 0b 8b 15 b0 6a 9e c1 81 c2 00 00 80 00 39 d0 72 e7 8b 15 a8 
> [  101.348030] EIP: [<c1058c9a>] __phys_addr+0x1a/0x50 SS:ESP 0068:f6353c10
> ==========
> 
> This crash is hit every time a high memory page is swapped out.
> 
> I have no solution right now other than to revert this patch and
> restore the original signatures.
> 

Sorry for the noise, but I noticed right after I sent this that
the tmem layer doesn't DO anything with the data parameter. So
a possible solution is to just pass the page pointer instead of
the virtual address.  After all, pointers are pointers.
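
To spell out why that works (a minimal sketch, not the zcache code):
tmem only stores and forwards the "data" pointer and never dereferences
it as a character buffer, so the pointer value survives the char *
detour unchanged.

#include <linux/mm_types.h>

/*
 * Illustration only: the cast to char * and back is lossless because
 * nothing in between interprets the bytes behind the pointer.
 */
static struct page *opaque_round_trip(struct page *page)
{
	char *opaque = (char *)page;	/* what zcache_put_page() passes down */

	return (struct page *)opaque;	/* what zcache_pampd_create() gets back */
}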

--- a/drivers/staging/zcache/zcache.c
+++ b/drivers/staging/zcache/zcache.c
@@ -1153,7 +1153,7 @@ static void *zcache_pampd_create(char *data, size_t size, 
        size_t clen;
        int ret;
        unsigned long count;
-       struct page *page = virt_to_page(data);
+       struct page *page = (struct page *)(data);
        struct zcache_client *cli = pool->client;
        uint16_t client_id = get_client_id_from_client(cli);
        unsigned long zv_mean_zsize;
@@ -1220,7 +1220,7 @@ static int zcache_pampd_get_data(char *data, size_t *bufsi
        int ret = 0;
 
        BUG_ON(is_ephemeral(pool));
-       zv_decompress(virt_to_page(data), pampd);
+       zv_decompress((struct page *)(data), pampd);
        return ret;
 }
 
@@ -1532,7 +1532,7 @@ static int zcache_put_page(int cli_id, int pool_id, struct
                goto out;
        if (!zcache_freeze && zcache_do_preload(pool) == 0) {
                /* preload does preempt_disable on success */
-               ret = tmem_put(pool, oidp, index, page_address(page),
+               ret = tmem_put(pool, oidp, index, (char *)(page),
                                PAGE_SIZE, 0, is_ephemeral(pool));
                if (ret < 0) {
                        if (is_ephemeral(pool))
@@ -1565,7 +1565,7 @@ static int zcache_get_page(int cli_id, int pool_id, struct
        pool = zcache_get_pool_by_id(cli_id, pool_id);
        if (likely(pool != NULL)) {
                if (atomic_read(&pool->obj_count) > 0)
-                       ret = tmem_get(pool, oidp, index, page_address(page),
+                       ret = tmem_get(pool, oidp, index, (char *)(page),
                                        &size, 0, is_ephemeral(pool));
                zcache_put_pool(pool);
        }

I tested this and it works.

Dan, does this mess anything else up?

> What was the rationale for the signature changes?
> 
> --
> Seth
> 


^ permalink raw reply	[flat|nested] 25+ messages in thread

* RE: [PATCH v2] staging: zcache: support multiple clients, prep for KVM and RAMster
  2011-08-10 14:44     ` Seth Jennings
@ 2011-08-10 15:08       ` Dan Magenheimer
  -1 siblings, 0 replies; 25+ messages in thread
From: Dan Magenheimer @ 2011-08-10 15:08 UTC (permalink / raw)
  To: Seth Jennings
  Cc: Greg Kroah-Hartman, Marcus Klemm, Konrad Wilk, linux-kernel,
	devel, kvm, linux-mm, Brian King, Robert Jennings

> From: Seth Jennings [mailto:sjenning@linux.vnet.ibm.com]
> Subject: Re: [PATCH v2] staging: zcache: support multiple clients, prep for KVM and RAMster
> 
> > This crash is hit every time a high memory page is swapped out.
> >
> > I have no solution right now other than to revert this patch and
> > restore the original signatures.

Hi Seth --

Thanks for your testing.  I haven't done much testing on 32-bit.

> Sorry for the noise, but I noticed right after I sent this that
> the tmem layer doesn't DO anything with the data parameter. So
> a possible solution is to just pass the page pointer instead of
> the virtual address.  After all, pointers are pointers.

Yes, this looks like a good patch.

> --- a/drivers/staging/zcache/zcache.c
> +++ b/drivers/staging/zcache/zcache.c
> @@ -1153,7 +1153,7 @@ static void *zcache_pampd_create(char *data, size_t size,
>         size_t clen;
>         int ret;
>         unsigned long count;
> -       struct page *page = virt_to_page(data);
> +       struct page *page = (struct page *)(data);
>         struct zcache_client *cli = pool->client;
>         uint16_t client_id = get_client_id_from_client(cli);
>         unsigned long zv_mean_zsize;
> @@ -1220,7 +1220,7 @@ static int zcache_pampd_get_data(char *data, size_t *bufsi
>         int ret = 0;
> 
>         BUG_ON(is_ephemeral(pool));
> -       zv_decompress(virt_to_page(data), pampd);
> +       zv_decompress((struct page *)(data), pampd);
>         return ret;
>  }
> 
> @@ -1532,7 +1532,7 @@ static int zcache_put_page(int cli_id, int pool_id, struct
>                 goto out;
>         if (!zcache_freeze && zcache_do_preload(pool) == 0) {
>                 /* preload does preempt_disable on success */
> -               ret = tmem_put(pool, oidp, index, page_address(page),
> +               ret = tmem_put(pool, oidp, index, (char *)(page),
>                                 PAGE_SIZE, 0, is_ephemeral(pool));
>                 if (ret < 0) {
>                         if (is_ephemeral(pool))
> @@ -1565,7 +1565,7 @@ static int zcache_get_page(int cli_id, int pool_id, struct
>         pool = zcache_get_pool_by_id(cli_id, pool_id);
>         if (likely(pool != NULL)) {
>                 if (atomic_read(&pool->obj_count) > 0)
> -                       ret = tmem_get(pool, oidp, index, page_address(page),
> +                       ret = tmem_get(pool, oidp, index, (char *)(page),
>                                         &size, 0, is_ephemeral(pool));
>                 zcache_put_pool(pool);
>         }
> 
> I tested this and it works.
> 
> Dan, does this mess anything else up?

Acked-by: Dan Magenheimer <dan.magenheimer@oracle.com>

> > What was the rationale for the signature changes?
> > Seth

The change on the tmem side allows tmem to handle pre-compressed pages,
which is useful to RAMster and possibly for KVM.  The new "raw"
parameter identifies that case, but for zcache "raw" is always zero so
your solution looks fine.
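
In other words (a minimal sketch with hypothetical helpers, not the
actual zcache code), the split inside a pampd_create-style hook would
look like this:

#include <linux/types.h>
#include <linux/mm_types.h>

/* Hypothetical helpers, declared only to keep the sketch self-contained. */
static void *store_precompressed(char *cdata, size_t clen);
static void *compress_and_store(struct page *page);

static void *pampd_create_sketch(char *data, size_t size, bool raw)
{
	if (raw)
		/* RAMster/KVM case: "data" already holds compressed bytes */
		return store_precompressed(data, size);

	/* zcache case (raw is always zero): "data" carries a struct page * */
	return compress_and_store((struct page *)data);
}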

Seth, could you submit an "official" patch (i.e. proper subject field,
signed-off-by) and I will ack that and ask GregKH to queue it up for
a 3.1-rc?

Subject something like: staging: zcache: fix highmem crash on 32-bit

Thanks,
Dan

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v2] staging: zcache: support multiple clients, prep for KVM and RAMster
  2011-08-10 15:08       ` Dan Magenheimer
@ 2011-08-10 15:40         ` Seth Jennings
  -1 siblings, 0 replies; 25+ messages in thread
From: Seth Jennings @ 2011-08-10 15:40 UTC (permalink / raw)
  To: Dan Magenheimer
  Cc: Greg Kroah-Hartman, Marcus Klemm, Konrad Wilk, linux-kernel,
	devel, kvm, linux-mm, Brian King, Robert Jennings

On 08/10/2011 10:08 AM, Dan Magenheimer wrote:
>> From: Seth Jennings [mailto:sjenning@linux.vnet.ibm.com]
>> Subject: Re: [PATCH v2] staging: zcache: support multiple clients, prep for KVM and RAMster
>>
>>> This crash is hit every time a high memory page is swapped out.
>>>
>>> I have no solution right now other than to revert this patch and
>>> restore the original signatures.
> 
> Hi Seth --
> 
> Thanks for your testing.  I haven't done much testing on 32-bit.
> 
>> Sorry for the noise, but I noticed right after I sent this that
>> the tmem layer doesn't DO anything with the data parameter. So
>> a possible solution is to just pass the page pointer instead of
>> the virtual address.  After all, pointers are pointers.
> 
> Yes, this looks like a good patch.
> 
>> --- a/drivers/staging/zcache/zcache.c
>> +++ b/drivers/staging/zcache/zcache.c
>> @@ -1153,7 +1153,7 @@ static void *zcache_pampd_create(char *data, size_t size,
>>         size_t clen;
>>         int ret;
>>         unsigned long count;
>> -       struct page *page = virt_to_page(data);
>> +       struct page *page = (struct page *)(data);
>>         struct zcache_client *cli = pool->client;
>>         uint16_t client_id = get_client_id_from_client(cli);
>>         unsigned long zv_mean_zsize;
>> @@ -1220,7 +1220,7 @@ static int zcache_pampd_get_data(char *data, size_t *bufsi
>>         int ret = 0;
>>
>>         BUG_ON(is_ephemeral(pool));
>> -       zv_decompress(virt_to_page(data), pampd);
>> +       zv_decompress((struct page *)(data), pampd);
>>         return ret;
>>  }
>>
>> @@ -1532,7 +1532,7 @@ static int zcache_put_page(int cli_id, int pool_id, struct
>>                 goto out;
>>         if (!zcache_freeze && zcache_do_preload(pool) == 0) {
>>                 /* preload does preempt_disable on success */
>> -               ret = tmem_put(pool, oidp, index, page_address(page),
>> +               ret = tmem_put(pool, oidp, index, (char *)(page),
>>                                 PAGE_SIZE, 0, is_ephemeral(pool));
>>                 if (ret < 0) {
>>                         if (is_ephemeral(pool))
>> @@ -1565,7 +1565,7 @@ static int zcache_get_page(int cli_id, int pool_id, struct
>>         pool = zcache_get_pool_by_id(cli_id, pool_id);
>>         if (likely(pool != NULL)) {
>>                 if (atomic_read(&pool->obj_count) > 0)
>> -                       ret = tmem_get(pool, oidp, index, page_address(page),
>> +                       ret = tmem_get(pool, oidp, index, (char *)(page),
>>                                         &size, 0, is_ephemeral(pool));
>>                 zcache_put_pool(pool);
>>         }
>>
>> I tested this and it works.
>>
>> Dan, does this mess anything else up?
> 
> Acked-by: Dan Magenheimer <dan.magenheimer@oracle.com>
> 
>>> What was the rationale for the signature changes?
>>> Seth
> 
> The change on the tmem side allows tmem to handle pre-compressed pages,
> which is useful to RAMster and possibly for KVM.  The new "raw"
> parameter identifies that case, but for zcache "raw" is always zero so
> your solution looks fine.
> 
> Seth, could you submit an "official" patch (i.e. proper subject field,
> signed-off-by) and I will ack that and ask GregKH to queue it up for
> a 3.1-rc?

Will do. I'm actually about to send out a set of 3 patches for zcache.
There are a Makefile issue and a 32-bit link-time issue that I have
found as well.  Hopefully, I'll send them out today.

> 
> Subject something like: staging: zcache: fix highmem crash on 32-bit
> 
> Thanks,
> Dan
> 


^ permalink raw reply	[flat|nested] 25+ messages in thread

end of thread, other threads:[~2011-08-10 15:42 UTC | newest]

Thread overview: 25+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-06-30 19:01 [PATCH v2] staging: zcache: support multiple clients, prep for KVM and RAMster Dan Magenheimer
2011-06-30 19:01 ` Dan Magenheimer
2011-06-30 19:23 ` Greg KH
2011-06-30 19:23   ` Greg KH
2011-06-30 22:40 ` Dan Carpenter
2011-06-30 22:40   ` Dan Carpenter
2011-06-30 22:40   ` Dan Carpenter
2011-06-30 23:28   ` Dan Magenheimer
2011-06-30 23:28     ` Dan Magenheimer
2011-07-01  8:38     ` Dan Carpenter
2011-07-01  8:38       ` Dan Carpenter
2011-07-01 14:31       ` Dan Magenheimer
2011-07-01 14:31         ` Dan Magenheimer
2011-07-01 16:58         ` Dan Carpenter
2011-07-01 16:58           ` Dan Carpenter
2011-07-06  3:30           ` Greg KH
2011-07-06  3:30             ` Greg KH
2011-08-10 14:21 ` Seth Jennings
2011-08-10 14:21   ` Seth Jennings
2011-08-10 14:44   ` Seth Jennings
2011-08-10 14:44     ` Seth Jennings
2011-08-10 15:08     ` Dan Magenheimer
2011-08-10 15:08       ` Dan Magenheimer
2011-08-10 15:40       ` Seth Jennings
2011-08-10 15:40         ` Seth Jennings