From mboxrd@z Thu Jan 1 00:00:00 1970
Return-Path:
Received: from mx141.netapp.com ([216.240.21.12]:19857 "EHLO mx141.netapp.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1751982AbeCMRhA (ORCPT ); Tue, 13 Mar 2018 13:37:00 -0400
Subject: [RFC 4/7] zuf: zuf-rootfs && zuf-core
To: Boaz Harrosh , linux-fsdevel
References:
CC: Ric Wheeler , Miklos Szeredi , Steve French , Steven Whitehouse , Jeff Moyer , Sage Weil , Jan Kara , Amir Goldstein , Andy Rudof , Anna Schumaker , Amit Golander , Sagi Manole , Shachar Sharon
From: Boaz Harrosh
Message-ID:
Date: Tue, 13 Mar 2018 19:36:22 +0200
MIME-Version: 1.0
In-Reply-To:
Content-Type: text/plain; charset=utf-8
Content-Transfer-Encoding: 7bit
Sender: linux-fsdevel-owner@vger.kernel.org
List-ID:

zuf-core establishes the communication channels with the zus UM Server.

zuf-root is a pseudo FS through which the zus communicates: it registers
new file-systems and receives new mount requests.

This patch brings up that special FS and the core communication
mechanics, which are the novelty of this code submission.

The zuf-rootfs (-t zuf) is by default mounted on /sys/fs/zuf. If an
admin wants to run more server applications (note that each server
application supports many types of FSs), he/she can mount a second
instance of -t zuf and point the new server to it. (Otherwise a second
instance attempting to communicate with a busy zuf will fail.)

TODO: How to trigger a first mount on module_load. Currently the admin
needs to manually run "mount -t zuf none /sys/fs/zuf".

Signed-off-by: Boaz Harrosh
---
 fs/zuf/Makefile   |   4 +
 fs/zuf/_extern.h  |  47 +++++
 fs/zuf/_pr.h      |  53 ++++++
 fs/zuf/relay.h    |  86 +++++++++
 fs/zuf/super.c    |  21 +++
 fs/zuf/zuf-core.c | 517 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/zuf/zuf-root.c | 330 ++++++++++++++++++++++++++++++++++
 fs/zuf/zuf.h      |  90 ++++++++++
 fs/zuf/zus_api.h  | 108 ++++++++++++
 9 files changed, 1256 insertions(+)
 create mode 100644 fs/zuf/_extern.h
 create mode 100644 fs/zuf/_pr.h
 create mode 100644 fs/zuf/relay.h
 create mode 100644 fs/zuf/super.c
 create mode 100644 fs/zuf/zuf-core.c
 create mode 100644 fs/zuf/zuf-root.c
 create mode 100644 fs/zuf/zuf.h

diff --git a/fs/zuf/Makefile b/fs/zuf/Makefile
index 7e4e51f..d00940c 100644
--- a/fs/zuf/Makefile
+++ b/fs/zuf/Makefile
@@ -10,5 +10,9 @@
 obj-$(CONFIG_ZUF) += zuf.o
 
+# ZUF core
+zuf-y += zuf-core.o zuf-root.o
+
 # Main FS
+zuf-y += super.o
 zuf-y += module.o
diff --git a/fs/zuf/_extern.h b/fs/zuf/_extern.h
new file mode 100644
index 0000000..e490043
--- /dev/null
+++ b/fs/zuf/_extern.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2018 NetApp Inc. All rights reserved.
+ *
+ * ZUFS-License: GPL-2.0 OR BSD-3-Clause. See module.c for LICENSE details.
+ * + * Authors: + * Boaz Harrosh + * Sagi Manole " + */ + +#ifndef __ZUF_EXTERN_H__ +#define __ZUF_EXTERN_H__ +/* + * DO NOT INCLUDE this file directly, it is included by zuf.h + * It is here because zuf.h got to big + */ + +/* + * extern functions declarations + */ + +/* super.c */ +struct dentry *zuf_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data); + +/* zuf-core.c */ +int zufs_zts_init(struct zuf_root_info *zri); /* Some private types in core */ +void zufs_zts_fini(struct zuf_root_info *zri); + +long zufs_ioc(struct file *filp, unsigned int cmd, ulong arg); +int zufs_dispatch_mount(struct zuf_root_info *zri, struct zus_fs_info *zus_zfi, + struct zufs_ioc_mount *zim); +int zufs_dispatch_umount(struct zuf_root_info *zri, + struct zus_sb_info *zus_sbi); + +int zufs_dispatch(struct zuf_root_info *zri, struct zufs_ioc_hdr *hdr, + struct page **pages, uint nump); + +int zuf_zt_mmap(struct file *file, struct vm_area_struct *vma); + +void zufs_zt_release(struct file *filp); +void zufs_mounter_release(struct file *filp); + +/* zuf-root.c */ +int zuf_register_fs(struct super_block *sb, struct zufs_ioc_register_fs *rfs); + +#endif /*ndef __ZUF_EXTERN_H__*/ diff --git a/fs/zuf/_pr.h b/fs/zuf/_pr.h new file mode 100644 index 0000000..39c4622 --- /dev/null +++ b/fs/zuf/_pr.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2018 NetApp Inc. All rights reserved. + * + * ZUFS-License: GPL-2.0 OR BSD-3-Clause. See module.c for LICENSE details. + * + * Authors: + * Boaz Harrosh + * Sagi Manole " + */ + +#ifndef __ZUF_PR_H__ +#define __ZUF_PR_H__ + +#ifdef pr_fmt +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#endif + +/* + * Debug code + */ +#define zuf_err(s, args ...) pr_err("[%s:%d] " s, __func__, \ + __LINE__, ## args) +#define zuf_err_cnd(silent, s, args ...) \ + do {if (!silent) \ + pr_err("[%s:%d] " s, __func__, __LINE__, ## args); \ + } while (0) +#define zuf_warn(s, args ...) pr_warn("[%s:%d] " s, __func__, \ + __LINE__, ## args) +#define zuf_warn_cnd(silent, s, args ...) \ + do {if (!silent) \ + pr_warn("[%s:%d] " s, __func__, __LINE__, ## args); \ + } while (0) +#define zuf_info(s, args ...) pr_info("~info~ " s, ## args) + +#define zuf_chan_debug(c, s, args...) pr_debug(c " [%s:%d] " s, __func__, \ + __LINE__, ## args) + +/* ~~~ channel prints ~~~ */ +#define zuf_dbg_err(s, args ...) zuf_chan_debug("error", s, ##args) +#define zuf_dbg_vfs(s, args ...) zuf_chan_debug("vfs ", s, ##args) +#define zuf_dbg_rw(s, args ...) zuf_chan_debug("rw ", s, ##args) +#define zuf_dbg_t1(s, args ...) zuf_chan_debug("t1 ", s, ##args) +#define zuf_dbg_verbose(s, args ...) zuf_chan_debug("d-oto", s, ##args) +#define zuf_dbg_xattr(s, args ...) zuf_chan_debug("xattr", s, ##args) +#define zuf_dbg_acl(s, args ...) zuf_chan_debug("acl ", s, ##args) +#define zuf_dbg_t2(s, args ...) zuf_chan_debug("t2dbg", s, ##args) +#define zuf_dbg_t2_rw(s, args ...) zuf_chan_debug("t2grw", s, ##args) +#define zuf_dbg_core(s, args ...) zuf_chan_debug("core ", s, ##args) +#define zuf_dbg_mmap(s, args ...) zuf_chan_debug("mmap ", s, ##args) +#define zuf_dbg_zus(s, args ...) zuf_chan_debug("zusdg", s, ##args) + +#endif /* define __ZUF_PR_H__ */ diff --git a/fs/zuf/relay.h b/fs/zuf/relay.h new file mode 100644 index 0000000..490a193 --- /dev/null +++ b/fs/zuf/relay.h @@ -0,0 +1,86 @@ +/* + * Relay scheduler-object Header file. + * + * Copyright (c) 2018 NetApp Inc. All rights reserved. + * + * ZUFS-License: GPL-2.0 OR BSD-3-Clause. See module.c for LICENSE details. 
+ * + * Authors: + * Boaz Harrosh + */ + +#ifndef __RELAY_H__ +#define __RELAY_H__ + +/* ~~~~ Relay ~~~~ */ +struct relay { + wait_queue_head_t fss_wq; + bool fss_wakeup; + bool fss_waiting; + + wait_queue_head_t app_wq; + bool app_wakeup; + bool app_waiting; +}; + +static inline void relay_init(struct relay *relay) +{ + init_waitqueue_head(&relay->fss_wq); + init_waitqueue_head(&relay->app_wq); +} + +static inline void relay_fss_waiting_grab(struct relay *relay) +{ + relay->fss_waiting = true; +} + +static inline bool relay_is_app_waiting(struct relay *relay) +{ + return relay->app_waiting; +} + +static inline void relay_app_wakeup(struct relay *relay) +{ + relay->app_waiting = false; + + relay->app_wakeup = true; + wake_up(&relay->app_wq); +} + +static inline int relay_fss_wait(struct relay *relay) +{ + int err; + + relay->fss_wakeup = false; + err = wait_event_interruptible(relay->fss_wq, relay->fss_wakeup); + + relay->fss_waiting = false; + return err; +} + +static inline bool relay_is_fss_waiting(struct relay *relay) +{ + return relay->fss_waiting; +} + +static inline void relay_fss_wakeup(struct relay *relay) +{ + relay->fss_wakeup = true; + wake_up(&relay->fss_wq); +} + +static inline int relay_fss_wakeup_app_wait(struct relay *relay, + spinlock_t *spinlock) +{ + relay->app_waiting = true; + + relay_fss_wakeup(relay); + + relay->app_wakeup = false; + if (spinlock) + spin_unlock(spinlock); + + return wait_event_interruptible(relay->app_wq, relay->app_wakeup); +} + +#endif /* ifndef __RELAY_H__ */ diff --git a/fs/zuf/super.c b/fs/zuf/super.c new file mode 100644 index 0000000..6e176a5 --- /dev/null +++ b/fs/zuf/super.c @@ -0,0 +1,21 @@ +/* + * BRIEF DESCRIPTION + * + * Super block operations. + * + * Copyright (c) 2018 NetApp Inc. All rights reserved. + * + * ZUFS-License: GPL-2.0 OR BSD-3-Clause. See module.c for LICENSE details. + * + * Authors: + * Boaz Harrosh + * Sagi Manole " + */ + +#include "zuf.h" + +struct dentry *zuf_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data) +{ + return ERR_PTR(-ENOTSUPP); +} diff --git a/fs/zuf/zuf-core.c b/fs/zuf/zuf-core.c new file mode 100644 index 0000000..12a23f1 --- /dev/null +++ b/fs/zuf/zuf-core.c @@ -0,0 +1,517 @@ +/* + * BRIEF DESCRIPTION + * + * Ioctl operations. + * + * Copyright (c) 2018 NetApp Inc. All rights reserved. + * + * ZUFS-License: GPL-2.0 OR BSD-3-Clause. See module.c for LICENSE details. + * + * Authors: + * Boaz Harrosh + */ + +#include +#include +#include +#include + +#include "zuf.h" + +static struct page *g_drain_p = NULL; + +struct zufs_thread { + struct zuf_special_file hdr; + struct relay relay; + struct file *file; + struct vm_area_struct *vma; + int no; + + /* Next operation*/ + struct zufs_ioc_hdr *next_opt; + struct page **pages; + uint nump; +} ____cacheline_aligned; + +static int _zt_from_f(struct file *filp, int cpu, struct zufs_thread **ztp) +{ + struct zuf_root_info *zri = ZRI(filp->f_inode->i_sb); + + if ((cpu < 0) || (zri->_max_zts <= cpu)) { + zuf_err("fatal\n"); + return -ERANGE; + } + + *ztp = &zri->_all_zt[cpu]; + return 0; +} + +int zufs_zts_init(struct zuf_root_info *zri) +{ + zri->_max_zts = num_online_cpus(); + + zri->_all_zt = kcalloc(zri->_max_zts, sizeof(struct zufs_thread), + GFP_KERNEL); + if (unlikely(!zri->_all_zt)) + return -ENOMEM; + + g_drain_p = alloc_page(GFP_KERNEL); + if (!g_drain_p) { + zuf_err("!!! 
failed to alloc g_drain_p\n"); + return -ENOMEM; + } + + return 0; +} + +void zufs_zts_fini(struct zuf_root_info *zri) +{ + if (g_drain_p) { + __free_page(g_drain_p); + g_drain_p = NULL; + } + kfree(zri->_all_zt); + zri->_all_zt = NULL; +} + +static int _zu_register_fs(struct file *file, void *parg) +{ + struct zufs_ioc_register_fs rfs; + int err; + + err = copy_from_user(&rfs, parg, sizeof(rfs)); + if (unlikely(err)) { + zuf_err("=>%d\n", err); + return err; + } + + err = zuf_register_fs(file->f_inode->i_sb, &rfs); + if (err) + zuf_err("=>%d\n", err); + err = put_user(err, (int *)parg); + return err; +} + +/* ~~~~ mounting ~~~~*/ +int zufs_dispatch_mount(struct zuf_root_info *zri, struct zus_fs_info *zus_zfi, + struct zufs_ioc_mount *zim) +{ + zim->zus_zfi = zus_zfi; + zim->num_cpu = zri->_max_zts; + + if (unlikely(!zri->mount.file)) { + zuf_err("Server not up\n"); + zim->hdr.err = -EIO; + return zim->hdr.err; + } + + for (;;) { + bool fss_waiting; + /* It is OK to wait if user storms mounts */ + spin_lock(&zri->mount.lock); + fss_waiting = relay_is_fss_waiting(&zri->mount.relay); + if (fss_waiting) + break; + + spin_unlock(&zri->mount.lock); + if (unlikely(!zri->mount.file)) { + zuf_err("Server died\n"); + zim->hdr.err = -EIO; + break; + } + zuf_dbg_verbose("waiting\n"); + msleep(100); + } + + zri->mount.zim = zim; + relay_fss_wakeup_app_wait(&zri->mount.relay, &zri->mount.lock); + + return zim->hdr.err; +} + +int zufs_dispatch_umount(struct zuf_root_info *zri, struct zus_sb_info *zus_sbi) +{ + struct zufs_ioc_mount zim = { + .is_umounting = true, + .zus_sbi = zus_sbi, + }; + + return zufs_dispatch_mount(zri, NULL, &zim); +} + +static int _zu_mount(struct file *file, void *parg) +{ + struct super_block *sb = file->f_inode->i_sb; + struct zuf_root_info *zri = ZRI(sb); + bool waiting_for_reply; + struct zufs_ioc_mount *zim; + int err; + + spin_lock(&zri->mount.lock); + + if (unlikely(!file->private_data)) { + /* First time register this file as the mount-thread owner */ + zri->mount.zsf.type = zlfs_e_mout_thread; + zri->mount.file = file; + file->private_data = &zri->mount; + } else if (unlikely(file->private_data != &zri->mount)) { + zuf_err("Say what?? %p != %p\n", + file->private_data, &zri->mount); + return -EIO; + } + + relay_fss_waiting_grab(&zri->mount.relay); + zim = zri->mount.zim; + zri->mount.zim = NULL; + waiting_for_reply = zim && relay_is_app_waiting(&zri->mount.relay); + + spin_unlock(&zri->mount.lock); + + if (waiting_for_reply) { + zim->hdr.err = copy_from_user(zim, parg, sizeof(*zim)); + relay_app_wakeup(&zri->mount.relay); + if (unlikely(zim->hdr.err)) { + zuf_err("=>%d\n", zim->hdr.err); + return zim->hdr.err; + } + } + + /* This gets to sleep until a mount comes */ + err = relay_fss_wait(&zri->mount.relay); + if (unlikely(err || !zri->mount.zim)) { + struct zufs_ioc_hdr *hdr = parg; + + /* Released by _zu_break INTER or crash */ + zuf_warn("_zu_break? 
%p => %d\n", zri->mount.zim, err); + put_user(ZUS_OP_BREAK, &hdr->operation); + put_user(EIO, &hdr->err); + return err; + } + + err = copy_to_user(parg, zri->mount.zim, sizeof(*zri->mount.zim)); + if (unlikely(err)) + zuf_err("=>%d\n", err); + return err; +} + +void zufs_mounter_release(struct file *file) +{ + struct zuf_root_info *zri = ZRI(file->f_inode->i_sb); + + zuf_warn("closed fu=%d au=%d fw=%d aw=%d\n", + zri->mount.relay.fss_wakeup, zri->mount.relay.app_wakeup, + zri->mount.relay.fss_waiting, zri->mount.relay.app_waiting); + + spin_lock(&zri->mount.lock); + zri->mount.file = NULL; + if (relay_is_app_waiting(&zri->mount.relay)) { + zuf_err("server emergency exit while IO\n"); + + if (zri->mount.zim) + zri->mount.zim->hdr.err = -EIO; + spin_unlock(&zri->mount.lock); + + relay_app_wakeup(&zri->mount.relay); + msleep(1000); /* crap */ + } else { + if (zri->mount.zim) + zri->mount.zim->hdr.err = 0; + spin_unlock(&zri->mount.lock); + } +} + +static int _map_pages(struct zufs_thread *zt, struct page **pages, uint nump, + bool zap) +{ + int p, err; + pgprot_t prot; + + if (!(zt->vma && pages && nump)) + return 0; + + prot = pgprot_modify(prot, PAGE_SHARED); + for (p = 0; p < nump; ++p) { + ulong zt_addr = zt->vma->vm_start + p * PAGE_SIZE; + ulong pfn = page_to_pfn(zap ? g_drain_p : pages[p]); + + err = vm_insert_pfn_prot(zt->vma, zt_addr, pfn, prot); + if (unlikely(err)) { + zuf_err("zuf: remap_pfn_range => %d p=0x%x start=0x%lx\n", + err, p, zt->vma->vm_start); + return err; + } + } + return 0; +} + +static void _unmap_pages(struct zufs_thread *zt, struct page **pages, uint nump) +{ + if (!(zt->vma && pages && nump)) + return; + + zt->pages = NULL; + zt->nump = 0; + + /* Punch in a drain page for this CPU */ + _map_pages(zt, pages, nump, true); +} + +static int _zu_init(struct file *file, void *parg) +{ + struct zufs_thread *zt; + int cpu = smp_processor_id(); + struct zufs_ioc_init zi_init; + int err; + + err = copy_from_user(&zi_init, parg, sizeof(zi_init)); + if (unlikely(err)) { + zuf_err("=>%d\n", err); + return err; + } + + zuf_warn("[%d] aff=0x%lx\n", cpu, zi_init.affinity); + + zi_init.hdr.err = _zt_from_f(file, cpu, &zt); + if (unlikely(zi_init.hdr.err)) { + zuf_err("=>%d\n", err); + goto out; + } + + if (zt->file) { + zuf_err("[%d] thread already set\n", cpu); + memset(zt, 0, sizeof(*zt)); + } + + relay_init(&zt->relay); + zt->hdr.type = zlfs_e_zt; + zt->file = file; + zt->no = cpu; + + file->private_data = &zt->hdr; +out: + err = copy_to_user(parg, &zi_init, sizeof(zi_init)); + if (err) + zuf_err("=>%d\n", err); + return err; +} + +struct zufs_thread *_zt_from_f_private(struct file *file) +{ + struct zuf_special_file *zsf = file->private_data; + + WARN_ON(zsf->type != zlfs_e_zt); + return container_of(zsf, struct zufs_thread, hdr); +} + +/* Caller checks that file->private_data != NULL */ +void zufs_zt_release(struct file *file) +{ + struct zufs_thread *zt = _zt_from_f_private(file); + + if (unlikely(zt->file != file)) + zuf_err("What happened zt->file(%p) != file(%p)\n", + zt->file, file); + + zuf_warn("[%d] closed fu=%d au=%d fw=%d aw=%d\n", + zt->no, zt->relay.fss_wakeup, zt->relay.app_wakeup, + zt->relay.fss_waiting, zt->relay.app_waiting); + + if (relay_is_app_waiting(&zt->relay)) { + zuf_err("server emergency exit while IO\n"); + + /* NOTE: Do not call _unmap_pages the vma is gone */ + + zt->next_opt->err = -EIO; + zt->file = NULL; + + relay_app_wakeup(&zt->relay); + msleep(1000); /* crap */ + } + + memset(zt, 0, sizeof(*zt)); +} + +static int _zu_wait(struct file 
*file, void *parg) +{ + struct zufs_thread *zt; + int cpu = smp_processor_id(); + int err; + + err = _zt_from_f(file, cpu, &zt); + if (unlikely(err)) + goto err; + + if (!zt->file || file != zt->file) { + zuf_err("fatal\n"); + err = -E2BIG; + goto err; + } + + relay_fss_waiting_grab(&zt->relay); + + if (relay_is_app_waiting(&zt->relay)) { + _unmap_pages(zt, zt->pages, zt->nump); + + get_user(zt->next_opt->err, (int *)parg); + if (zt->next_opt->out_len) { + void *rply = (void *)zt->next_opt + + zt->next_opt->out_start; + void *from = parg + zt->next_opt->out_start; + + err = copy_from_user(rply, from, zt->next_opt->out_len); + } + zt->next_opt = NULL; + + relay_app_wakeup(&zt->relay); + } + + err = relay_fss_wait(&zt->relay); + + if (zt->next_opt && zt->next_opt->operation < ZUS_OP_BREAK) { + /* call map here at the zuf thread so we need no locks */ + _map_pages(zt, zt->pages, zt->nump, false); + err = copy_to_user(parg, zt->next_opt, zt->next_opt->in_len); + } else { + struct zufs_ioc_hdr *hdr = parg; + + /* This Means we were released by _zu_break */ + zuf_warn("_zu_break? %p => %d\n", zt->next_opt, err); + put_user(ZUS_OP_BREAK, &hdr->operation); + put_user(err, &hdr->err); + } + + return err; + +err: + put_user(err, (int *)parg); + return err; +} + +int zufs_dispatch(struct zuf_root_info *zri, struct zufs_ioc_hdr *hdr, + struct page **pages, uint nump) +{ + int cpu = smp_processor_id(); + struct zufs_thread *zt; + + if ((cpu < 0) || (zri->_max_zts <= cpu)) + return -ERANGE; + zt = &zri->_all_zt[cpu]; + + if (unlikely(!zt->file)) + return -EIO; + + while (!relay_is_fss_waiting(&zt->relay)) { + mb(); + if (unlikely(!zt->file)) + return -EIO; + zuf_dbg_err("[%d] can this be\n", cpu); + /* FIXME: Do something much smarter */ + msleep(10); + mb(); + } + + zt->next_opt = hdr; + zt->pages = pages; + zt->nump = nump; + + relay_fss_wakeup_app_wait(&zt->relay, NULL); + + return zt->file ? 
hdr->err : -EIO; +} + +static int _zu_break(struct file *filp, void *parg) +{ + struct zuf_root_info *zri = ZRI(filp->f_inode->i_sb); + int i; + + zuf_dbg_core("enter\n"); + mb(); /* TODO how to schedule on all CPU's */ + + for (i = 0; i < zri->_max_zts; ++i) { + struct zufs_thread *zt = &zri->_all_zt[i]; + + if (unlikely(!(zt && zt->file))) + continue; + relay_fss_wakeup(&zt->relay); + } + + if (zri->mount.file) + relay_fss_wakeup(&zri->mount.relay); + + zuf_dbg_core("exit\n"); + return 0; +} + +long zufs_ioc(struct file *file, unsigned int cmd, ulong arg) +{ + void __user *parg = (void __user *)arg; + + switch (cmd) { + case ZU_IOC_REGISTER_FS: + return _zu_register_fs(file, parg); + case ZU_IOC_MOUNT: + return _zu_mount(file, parg); + case ZU_IOC_INIT_THREAD: + return _zu_init(file, parg); + case ZU_IOC_WAIT_OPT: + return _zu_wait(file, parg); + case ZU_IOC_BREAK_ALL: + return _zu_break(file, parg); + default: + zuf_err("%d %ld\n", cmd, ZU_IOC_WAIT_OPT); + return -ENOTTY; + } +} + +static int zuf_file_fault(struct vm_fault *vmf) +{ + zuf_err("should not fault\n"); + return VM_FAULT_SIGBUS; +} + +static void zuf_mmap_open(struct vm_area_struct *vma) +{ + struct file *file = vma->vm_file; + struct zufs_thread *zt = _zt_from_f_private(file); + + zuf_dbg_vfs("[%ld] start=0x%lx end=0x%lx flags=0x%lx page_prot=0x%lx\n", + file->f_mapping->host->i_ino, vma->vm_start, vma->vm_end, + vma->vm_flags, pgprot_val(vma->vm_page_prot)); + zt->vma = vma; +} + +static void zuf_mmap_close(struct vm_area_struct *vma) +{ + struct file *file = vma->vm_file; + struct zufs_thread *zt = _zt_from_f_private(file); + + zuf_dbg_vfs("[%ld] start=0x%lx end=0x%lx flags=0x%lx page_prot=0x%lx\n", + file->f_mapping->host->i_ino, vma->vm_start, vma->vm_end, + vma->vm_flags, pgprot_val(vma->vm_page_prot)); + + zt->vma = NULL; +} + +static const struct vm_operations_struct zuf_vm_ops = { + .fault = zuf_file_fault, + .open = zuf_mmap_open, + .close = zuf_mmap_close, +}; + +int zuf_zt_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct zufs_thread *zt = _zt_from_f_private(file); + + /* Tell Kernel We will only access on a single core */ + vma->vm_flags |= VM_LOCAL_CPU; + vma->vm_flags |= VM_PFNMAP; + vma->vm_ops = &zuf_vm_ops; + + zt->vma = vma; + + zuf_dbg_core("[%ld] start=0x%lx end=0x%lx flags=0x%lx page_prot=0x%lx\n", + file->f_mapping->host->i_ino, vma->vm_start, vma->vm_end, + vma->vm_flags, pgprot_val(vma->vm_page_prot)); + + return 0; +} diff --git a/fs/zuf/zuf-root.c b/fs/zuf/zuf-root.c new file mode 100644 index 0000000..8102d3a --- /dev/null +++ b/fs/zuf/zuf-root.c @@ -0,0 +1,330 @@ +/* + * ZUF Root filesystem. + * + * Copyright (c) 2018 NetApp Inc. All rights reserved. + * + * ZUFS-License: GPL-2.0 OR BSD-3-Clause. See module.c for LICENSE details. + * + * ZUF core is mounted on a small specialized FS that + * provides the communication with the mount thread, zuf multy-channel + * communication [ZTs], and the pmem devices. + * Subsequently all FS super_blocks are children of this root, and point + * to it. All using the same zuf communication multy-channel. + * + * [ + * TODO: + * Multiple servers can run on Multiple mounted roots. Each registering + * their own FSTYPEs. 
Admin should make sure that the FSTYPEs do not + * overlap + * ] + * + * Authors: + * Boaz Harrosh + */ + +#include +#include +#include +#include +#include +#include + +#include "zuf.h" + +/* ~~~~ Register/Unregister FS-types ~~~~ */ +#ifdef CONFIG_LOCKDEP + +/* + * NOTE: When CONFIG_LOCKDEP is on the register_filesystem complains when + * the fstype object is from a kmalloc. Because of some lockdep_keys not + * being const_obj something. + * + * So in this case we have maximum of 16 fstypes system wide + * (Total for all mounted zuf_root(s)). This way we can have them + * in const_obj memory below at g_fs_array + */ + +enum { MAX_LOCKDEP_FSs = 16 }; +static uint g_fs_next; +static struct zuf_fs_type g_fs_array[MAX_LOCKDEP_FSs]; + +static struct zuf_fs_type *_fs_type_alloc(void) +{ + if (MAX_LOCKDEP_FSs <= g_fs_next) + return NULL; + + return &g_fs_array[g_fs_next++]; +} + +static void _fs_type_free(struct zuf_fs_type *zft) +{ + if (zft == &g_fs_array[0]) + g_fs_next = 0; +} + +#else /* !CONFIG_LOCKDEP*/ +static struct zuf_fs_type *_fs_type_alloc(void) +{ + return kcalloc(1, sizeof(struct zuf_fs_type), GFP_KERNEL); +} + +static void _fs_type_free(zuf_fs_type *zft) +{ + kfree(zft); +} +#endif /*CONFIG_LOCKDEP*/ + +int zuf_register_fs(struct super_block *sb, struct zufs_ioc_register_fs *rfs) +{ + struct zuf_fs_type *zft = _fs_type_alloc(); + + if (unlikely(!zft)) + return -ENOMEM; + + /* Original vfs file type */ + zft->vfs_fst.owner = THIS_MODULE; + zft->vfs_fst.name = kstrdup(rfs->rfi.fsname, GFP_KERNEL); + zft->vfs_fst.mount = zuf_mount, + zft->vfs_fst.kill_sb = kill_block_super, + + /* ZUS info about this FS */ + zft->rfi = rfs->rfi; + zft->zus_zfi = rfs->zus_zfi; + INIT_LIST_HEAD(&zft->list); + /* Back pointer to our communication channels */ + zft->zri = ZRI(sb); + + zuf_add_fs_type(zft->zri, zft); + zuf_info("register_filesystem [%s]\n", zft->vfs_fst.name); + return register_filesystem(&zft->vfs_fst); +} + +void _unregister_fs(struct zuf_root_info *zri) +{ + struct zuf_fs_type *zft, *n; + + list_for_each_entry_safe_reverse(zft, n, &zri->fst_list, list) { + unregister_filesystem(&zft->vfs_fst); + list_del_init(&zft->list); + _fs_type_free(zft); + } +} + +int zufr_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct zuf_special_file *zsf = file->private_data; + + switch (zsf->type) { + case zlfs_e_zt: + return zuf_zt_mmap(file, vma); + default: + zuf_err("type=%d\n", zsf->type); + return -ENOTTY; + } +} + +static int zufr_release(struct inode *inode, struct file *file) +{ + struct zuf_special_file *zsf = file->private_data; + + if (!zsf) + return 0; + + switch (zsf->type) { + case zlfs_e_zt: + zufs_zt_release(file); + return 0; + case zlfs_e_mout_thread: { + struct zuf_root_info *zri = ZRI(inode->i_sb); + + zufs_mounter_release(file); + _unregister_fs(zri); + return 0; + } + case zlfs_e_pmem: + /* NOTHING to clean for pmem file yet */ + /* zufs_pmem_release(file);*/ + return 0; + default: + return 0; + } +} + +static int zufr_unlink(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + + drop_nlink(inode); + return 0; +} + +static const struct inode_operations zufr_inode_operations; +static const struct file_operations zufr_file_dir_operations = { + .open = dcache_dir_open, + .release = dcache_dir_close, + .llseek = dcache_dir_lseek, + .read = generic_read_dir, + .iterate_shared = dcache_readdir, + .fsync = noop_fsync, + .unlocked_ioctl = zufs_ioc, +}; +static const struct file_operations zufr_file_reg_operations = { + .fsync = noop_fsync, + 
.unlocked_ioctl = zufs_ioc, + .mmap = zufr_mmap, + .release = zufr_release, +}; + +static int zufr_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + struct zuf_root_info *zri = ZRI(dir->i_sb); + struct inode *inode; + int err; + + inode = new_inode(dir->i_sb); + if (!inode) + return -ENOMEM; + + inode->i_ino = ++zri->next_ino; /* none atomic only one mount thread */ + inode->i_blocks = inode->i_size = 0; + inode->i_ctime = inode->i_mtime = current_kernel_time(); + inode->i_atime = inode->i_ctime; + inode_init_owner(inode, dir, mode); + + inode->i_op = &zufr_inode_operations; + inode->i_fop = &zufr_file_reg_operations; + + err = insert_inode_locked(inode); + if (unlikely(err)) { + zuf_err("[%ld] insert_inode_locked => %d\n", inode->i_ino, err); + goto fail; + } + d_tmpfile(dentry, inode); + unlock_new_inode(inode); + return 0; + +fail: + clear_nlink(inode); + make_bad_inode(inode); + iput(inode); + return err; +} + +static void zufr_put_super(struct super_block *sb) +{ + struct zuf_root_info *zri = ZRI(sb); + + zufs_zts_fini(zri); + _unregister_fs(zri); + + zuf_info("zuf_root umount\n"); +} + +static void zufr_evict_inode(struct inode *inode) +{ + clear_inode(inode); +} + +static const struct inode_operations zufr_inode_operations = { + .lookup = simple_lookup, + + .tmpfile = zufr_tmpfile, + .unlink = zufr_unlink, +}; +static const struct super_operations zufr_super_operations = { + .statfs = simple_statfs, + + .evict_inode = zufr_evict_inode, + .put_super = zufr_put_super, +}; + +#define ZUFR_SUPER_MAGIC 0x1717 + +static int zufr_fill_super(struct super_block *sb, void *data, int silent) +{ + static struct tree_descr zufr_files[] = {{""}}; + struct zuf_root_info *zri; + struct inode *root_i; + int err; + + zri = kzalloc(sizeof(*zri), GFP_KERNEL); + if (!zri) { + zuf_err_cnd(silent, + "Not enough memory to allocate zuf_root_info\n"); + return -ENOMEM; + } + + err = simple_fill_super(sb, ZUFR_SUPER_MAGIC, zufr_files); + if (unlikely(err)) + return err; + + sb->s_op = &zufr_super_operations; + sb->s_fs_info = zri; + zri->sb = sb; + + root_i = sb->s_root->d_inode; + root_i->i_fop = &zufr_file_dir_operations; + root_i->i_op = &zufr_inode_operations; + + spin_lock_init(&zri->mount.lock); + relay_init(&zri->mount.relay); + INIT_LIST_HEAD(&zri->fst_list); + INIT_LIST_HEAD(&zri->pmem_list); + + err = zufs_zts_init(zri); + if (unlikely(err)) + return err; /* put will be called we have a root */ + + return 0; +} + +static struct dentry *zufr_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *data) +{ + struct dentry *ret = mount_single(fs_type, flags, data, zufr_fill_super); + + zuf_info("zuf_root mount => %p\n", ret); + return ret; +} + +static struct file_system_type zufr_type = { + .owner = THIS_MODULE, + .name = "zuf", + .mount = zufr_mount, + .kill_sb = kill_litter_super, +}; + +/* Create an /sys/fs/zuf/ directory. 
to mount on */ +static struct kset *zufr_kset; + +int __init zuf_root_init(void) +{ + int err; + + zufr_kset = kset_create_and_add("zuf", NULL, fs_kobj); + if (!zufr_kset) { + err = -ENOMEM; + goto un_inodecache; + } + + err = register_filesystem(&zufr_type); + if (unlikely(err)) + goto un_kset; + + return 0; + +un_kset: + kset_unregister(zufr_kset); +un_inodecache: + return err; +} + +void __exit zuf_root_exit(void) +{ + unregister_filesystem(&zufr_type); + kset_unregister(zufr_kset); +} + +module_init(zuf_root_init) +module_exit(zuf_root_exit) diff --git a/fs/zuf/zuf.h b/fs/zuf/zuf.h new file mode 100644 index 0000000..15516d0 --- /dev/null +++ b/fs/zuf/zuf.h @@ -0,0 +1,90 @@ +/* + * BRIEF DESCRIPTION + * + * Definitions for the ZUF filesystem. + * + * Copyright (c) 2018 NetApp Inc. All rights reserved. + * + * ZUFS-License: GPL-2.0 OR BSD-3-Clause. See module.c for LICENSE details. + * + * Authors: + * Boaz Harrosh + * Sagi Manole " + */ + +#ifndef __ZUF_H +#define __ZUF_H + +#include +#include +#include +#include +#include +#include +#include + +#include "zus_api.h" + +#include "relay.h" +#include "_pr.h" + +enum zlfs_e_special_file { + zlfs_e_zt = 1, + zlfs_e_mout_thread, + zlfs_e_pmem, +}; + +struct zuf_special_file { + enum zlfs_e_special_file type; +}; + +/* This is the zuf-root.c mini filesystem */ +struct zuf_root_info { + struct __mount_thread_info { + struct zuf_special_file zsf; + spinlock_t lock; + struct relay relay; + struct zufs_ioc_mount *zim; + struct file *file; + } mount; + + ulong next_ino; + + uint _max_zts; + struct zufs_thread *_all_zt; + + struct super_block *sb; + struct list_head fst_list; + + uint next_pmem_id; + struct list_head pmem_list; +}; + +static inline struct zuf_root_info *ZRI(struct super_block *sb) +{ + struct zuf_root_info *zri = sb->s_fs_info; + + WARN_ON(zri->sb != sb); + return zri; +} + +struct zuf_fs_type { + struct file_system_type vfs_fst; + struct zus_fs_info *zus_zfi; + struct register_fs_info rfi; + struct zuf_root_info *zri; + + struct list_head list; +}; + +static inline void zuf_add_fs_type(struct zuf_root_info *zri, + struct zuf_fs_type *zft) +{ + /* Unlocked for now only one mount-thread with zus */ + list_add(&zft->list, &zri->fst_list); +} + +/* Keep this include last thing in file */ +#include "_extern.h" + +#endif /* __ZUF_H */ diff --git a/fs/zuf/zus_api.h b/fs/zuf/zus_api.h index d6ccc85..19ce326 100644 --- a/fs/zuf/zus_api.h +++ b/fs/zuf/zus_api.h @@ -66,4 +66,112 @@ #endif /* ndef __KERNEL__ */ +/** + * zufs dual port memory + * This is a special type of offset to either memory or persistent-memory, + * that is designed to be used in the interface mechanism between userspace + * and kernel, and can be accessed by both. Note that user must use the + * appropriate accessors to translate to a pointer. 
+ */
+typedef __u64 zu_dpp_t;
+
+/* ~~~~~ ZUFS API ioctl commands ~~~~~ */
+enum {
+	ZUS_API_MAP_MAX_PAGES = 1024,
+	ZUS_API_MAP_MAX_SIZE = ZUS_API_MAP_MAX_PAGES * PAGE_SIZE,
+};
+
+struct zufs_ioc_hdr {
+	__u32 err;	/* IN/OUT must be first */
+	__u16 in_start;	/* Not used always 0 */
+	__u16 in_len;	/* How much to be copied *to* user mode */
+	__u16 out_start;/* start of output parameters */
+	__u16 out_len;	/* How much to be copied *from* user mode */
+	__u32 operation;/* One of e_zufs_operation */
+	__u32 offset;	/* Start of user buffer in ZT mmap */
+	__u32 len;	/* Len of user buffer in ZT mmap */
+};
+
+/* Register FS */
+/* A cookie from user-mode given in register_fs_info */
+struct zus_fs_info;
+struct zufs_ioc_register_fs {
+	struct zufs_ioc_hdr hdr;
+	struct zus_fs_info *zus_zfi;
+	struct register_fs_info {
+		/* IN */
+		char fsname[16];	/* Only 4 chars and a NUL please */
+		__u32 FS_magic;		/* This is the FS's version && magic */
+		__u32 FS_ver_major;	/* on disk, not the zuf-to-zus version*/
+		__u32 FS_ver_minor;	/* (See also struct zufs_dev_table) */
+
+		__u8 acl_on;
+		__u8 notused[3];
+		__u64 dt_offset;
+
+		__u32 s_time_gran;
+		__u32 def_mode;
+		__u64 s_maxbytes;
+
+	} rfi;
+};
+#define ZU_IOC_REGISTER_FS _IOWR('S', 10, struct zufs_ioc_register_fs)
+
+/* A cookie from user-mode returned by mount */
+struct zus_sb_info;
+
+/* zus cookie per inode */
+struct zus_inode_info;
+
+/* mount / umount */
+struct zufs_ioc_mount {
+	struct zufs_ioc_hdr hdr;
+	/* IN */
+	struct zus_fs_info *zus_zfi;
+	uint num_cpu;
+	uint pmem_kern_id;
+	__u8 is_umounting;
+
+	/* OUT */
+	struct zus_sb_info *zus_sbi;
+	/* mount is also iget of root */
+	struct zus_inode_info *zus_ii;
+	zu_dpp_t _zi;
+
+	/* More mount info */
+	__u32 s_blocksize_bits;
+};
+#define ZU_IOC_MOUNT _IOWR('S', 12, struct zufs_ioc_mount)
+
+/* ZT init */
+struct zufs_ioc_init {
+	struct zufs_ioc_hdr hdr;
+	ulong affinity;	/* IN */
+};
+#define ZU_IOC_INIT_THREAD _IOWR('S', 20, struct zufs_ioc_init)
+
+/* break_all (Server telling kernel to clean) */
+struct zufs_ioc_break_all {
+	struct zufs_ioc_hdr hdr;
+};
+#define ZU_IOC_BREAK_ALL _IOWR('S', 22, struct zufs_ioc_break_all)
+
+enum { ZUFS_MAX_COMMAND_BUFF = (PAGE_SIZE - sizeof(struct zufs_ioc_hdr)) };
+struct zufs_ioc_wait_operation {
+	struct zufs_ioc_hdr hdr;
+	char opt_buff[ZUFS_MAX_COMMAND_BUFF];
+};
+#define ZU_IOC_WAIT_OPT _IOWR('S', 21, struct zufs_ioc_wait_operation)
+
+/* ~~~ all the permutations of zufs_ioc_wait_operation ~~~ */
+/* These are the possible operations sent from Kernel to the Server in the
+ * return of the ZU_IOC_WAIT_OPT.
+ */
+enum e_zufs_operation {
+	ZUS_OP_NULL = 0,
+
+	ZUS_OP_BREAK,	/* Kernel telling Server to exit */
+	ZUS_OP_MAX_OPT,
+};
+
 #endif /* _LINUX_ZUFS_API_H */
-- 
2.5.5
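
[Illustrative note, not part of the patch]

To make the dispatch contract concrete, here is a rough sketch of how a
later, FS-side caller could relay an operation to the server through
zufs_dispatch(). ZUS_OP_EXAMPLE, struct zufs_ioc_example and
zuf_example_op() are hypothetical names used only for illustration; only
the zufs_ioc_hdr fields and the zufs_dispatch() prototype come from this
patch.

/* Hypothetical sketch -- presumes "zuf.h"; the names above are made up */
struct zufs_ioc_example {
	struct zufs_ioc_hdr hdr;
	/* IN */
	__u64 what;
	/* OUT */
	__u64 result;
};

static int zuf_example_op(struct zuf_root_info *zri, __u64 what, __u64 *result)
{
	struct zufs_ioc_example ioc = {
		.hdr.operation	= ZUS_OP_EXAMPLE,	/* hypothetical op code */
		.hdr.in_len	= sizeof(ioc),		/* copied *to* the server */
		.hdr.out_start	= offsetof(struct zufs_ioc_example, result),
		.hdr.out_len	= sizeof(ioc.result),	/* copied back on reply */
		.what		= what,
	};
	int err;

	/* Wakes the ZT bound to this CPU and sleeps until the server replies */
	err = zufs_dispatch(zri, &ioc.hdr, NULL, 0);
	if (unlikely(err))
		return err;
	if (unlikely(ioc.hdr.err))
		return ioc.hdr.err;

	*result = ioc.result;
	return 0;
}

The in_len/out_start/out_len split mirrors what _zu_wait() does with the
command buffer: in_len bytes go out to the server on the next
ZU_IOC_WAIT_OPT, and out_len bytes starting at out_start are copied back
when the server re-enters that ioctl with its reply.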
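
[Illustrative note, not part of the patch]

For the user-mode half, a minimal sketch of a zus ZT loop driving the
ioctls declared in zus_api.h. The O_TMPFILE open on the zuf root, the
mmap length, and reading zufs_ioc_init.affinity as a cpu mask are
assumptions here, not things this patch defines; the thread is also
assumed to already be pinned to the CPU it serves, since zuf-core keys
ZTs by smp_processor_id().

/* Hypothetical user-space sketch, assuming zus_api.h is usable as-is here */
#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <sys/mman.h>

#include "zus_api.h"

static int zt_thread(unsigned int cpu)
{
	struct zufs_ioc_init zi = { .affinity = 1UL << cpu }; /* assumed mask */
	struct zufs_ioc_wait_operation wop = {};
	void *opt_map;
	int fd;

	/* zuf-root implements ->tmpfile, so each ZT can get a private file */
	fd = open("/sys/fs/zuf", O_TMPFILE | O_RDWR, 0600);
	if (fd < 0)
		return -errno;

	/* Bind this file to the ZT slot of the CPU we run on */
	if (ioctl(fd, ZU_IOC_INIT_THREAD, &zi) || zi.hdr.err)
		return zi.hdr.err ? (int)zi.hdr.err : -errno;

	/* Per-ZT window the kernel maps application pages into (size assumed) */
	opt_map = mmap(NULL, ZUS_API_MAP_MAX_SIZE, PROT_READ | PROT_WRITE,
		       MAP_SHARED, fd, 0);
	if (opt_map == MAP_FAILED)
		return -errno;

	for (;;) {
		/* Sleeps in _zu_wait() until zufs_dispatch() posts an op;
		 * the reply to the previous op rides in on the same buffer.
		 */
		if (ioctl(fd, ZU_IOC_WAIT_OPT, &wop))
			return -errno;
		if (wop.hdr.operation == ZUS_OP_BREAK)
			break;	/* ZU_IOC_BREAK_ALL: the kernel wants us out */

		/* ... execute wop.hdr.operation against the FS here ... */
		wop.hdr.err = 0;
	}
	return 0;
}

On ZU_IOC_BREAK_ALL the kernel wakes every ZT plus the mount thread,
which is how _zu_break() above lets the server drain and exit.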