All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/3] generic channel for multi-process communication
@ 2017-11-30 18:44 Jianfeng Tan
  2017-11-30 18:44 ` [PATCH 1/3] eal: add " Jianfeng Tan
                   ` (9 more replies)
  0 siblings, 10 replies; 88+ messages in thread
From: Jianfeng Tan @ 2017-11-30 18:44 UTC (permalink / raw)
  To: dev
  Cc: anatoly.burakov, bruce.richardson, konstantin.ananyev, thomas,
	Jianfeng Tan

This patchset adds a generic channel for multi-process (primary/secondary)
communication.

Patch 1: address the purpose and how-to;
Patch 2: add a synchronous way for those messages which need a response immediately.
Patch 3: Rework vfio to use this generic communication channel.

Jianfeng Tan (3):
  eal: add channel for multi-process communication
  eal: add synchronous multi-process communication
  vfio: use the generic multi-process channel

 lib/librte_eal/common/eal_common_proc.c        | 546 +++++++++++++++++++++++++
 lib/librte_eal/common/eal_filesystem.h         |  18 +
 lib/librte_eal/common/eal_private.h            |  10 +
 lib/librte_eal/common/include/rte_eal.h        |  71 ++++
 lib/librte_eal/linuxapp/eal/eal.c              |  23 +-
 lib/librte_eal/linuxapp/eal/eal_vfio.c         | 139 ++-----
 lib/librte_eal/linuxapp/eal/eal_vfio.h         |  15 +-
 lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c | 416 +++----------------
 lib/librte_eal/rte_eal_version.map             |  22 +
 9 files changed, 785 insertions(+), 475 deletions(-)

-- 
2.7.4

^ permalink raw reply	[flat|nested] 88+ messages in thread

* [PATCH 1/3] eal: add channel for multi-process communication
  2017-11-30 18:44 [PATCH 0/3] generic channel for multi-process communication Jianfeng Tan
@ 2017-11-30 18:44 ` Jianfeng Tan
  2017-12-11 11:04   ` Burakov, Anatoly
  2017-12-11 16:43   ` Ananyev, Konstantin
  2017-11-30 18:44 ` [PATCH 2/3] eal: add synchronous " Jianfeng Tan
                   ` (8 subsequent siblings)
  9 siblings, 2 replies; 88+ messages in thread
From: Jianfeng Tan @ 2017-11-30 18:44 UTC (permalink / raw)
  To: dev
  Cc: anatoly.burakov, bruce.richardson, konstantin.ananyev, thomas,
	Jianfeng Tan

Previously, there were three channels for multi-process
(i.e., primary/secondary) communication.
  1. Config-file based channel, in which, the primary process writes
     info into a pre-defined config file, and the secondary process
     reads info out.
  2. vfio submodule has its own channel based on unix socket for the
     secondary process to get container fd and group fd from the
     primary process.
  3. pdump submodule also has its own channel based on unix socket for
     packet dump.

It'll be good to have a generic communication channel for multi-process
communication to accommodate the requirements including:
  a. Secondary wants to send info to primary, for example, secondary
     would like to send request (about some specific vdev to primary).
  b. Sending info at any time, instead of just initialization time.
  c. Share FDs with the other side, for vdev like vhost, related FDs
     (memory region, kick) should be shared.
  d. A send message request needs the other side to respond immediately.

This patch proposes to create a communication channel, as a unix
socket connection, for the above requirements. Primary will listen on
the unix socket; secondary will connect to this socket to talk.

Three new APIs are added:

  1. rte_eal_mp_action_register is used to register an action,
     indexed by a string, if the calling component wants to
     respond to the messages from the corresponding component in
     its primary process or secondary processes.
  2. rte_eal_mp_action_unregister is used to unregister the action
     if the calling component does not want to respond to the messages.
  3. rte_eal_mp_sendmsg is used to send a message.

Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
---
 lib/librte_eal/common/eal_common_proc.c | 497 ++++++++++++++++++++++++++++++++
 lib/librte_eal/common/eal_filesystem.h  |  18 ++
 lib/librte_eal/common/eal_private.h     |  10 +
 lib/librte_eal/common/include/rte_eal.h |  68 +++++
 lib/librte_eal/linuxapp/eal/eal.c       |   9 +
 lib/librte_eal/rte_eal_version.map      |  22 ++
 6 files changed, 624 insertions(+)

diff --git a/lib/librte_eal/common/eal_common_proc.c b/lib/librte_eal/common/eal_common_proc.c
index 60526ca..5d0a095 100644
--- a/lib/librte_eal/common/eal_common_proc.c
+++ b/lib/librte_eal/common/eal_common_proc.c
@@ -33,8 +33,21 @@
 #include <stdio.h>
 #include <fcntl.h>
 #include <stdlib.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/epoll.h>
+#include <limits.h>
+#include <unistd.h>
+#include <sys/un.h>
+#include <errno.h>
+#include <pthread.h>
+
+#include <rte_log.h>
 #include <rte_eal.h>
+#include <rte_lcore.h>
+#include <rte_common.h>
 
+#include "eal_private.h"
 #include "eal_filesystem.h"
 #include "eal_internal_cfg.h"
 
@@ -59,3 +72,487 @@ rte_eal_primary_proc_alive(const char *config_file_path)
 
 	return !!ret;
 }
+
+struct action_entry {
+	TAILQ_ENTRY(action_entry) next;      /**< Next attached action entry */
+
+#define MAX_ACTION_NAME_LEN	64
+	char action_name[MAX_ACTION_NAME_LEN];
+	rte_eal_mp_t action;
+};
+
+/** Double linked list of actions. */
+TAILQ_HEAD(action_entry_list, action_entry);
+
+static struct action_entry_list action_entry_list =
+	TAILQ_HEAD_INITIALIZER(action_entry_list);
+
+static struct action_entry *
+find_action_entry_by_name(const char *name)
+{
+	int len = strlen(name);
+	struct action_entry *entry;
+
+	TAILQ_FOREACH(entry, &action_entry_list, next) {
+		if (strncmp(entry->action_name, name, len) == 0)
+			break;
+	}
+
+	return entry;
+}
+
+int
+rte_eal_mp_action_register(const char *action_name, rte_eal_mp_t action)
+{
+	struct action_entry *entry = malloc(sizeof(struct action_entry));
+
+	if (entry == NULL)
+		return -ENOMEM;
+
+	if (find_action_entry_by_name(action_name) != NULL)
+		return -EEXIST;
+
+	strncpy(entry->action_name, action_name, MAX_ACTION_NAME_LEN);
+	entry->action = action;
+	TAILQ_INSERT_TAIL(&action_entry_list, entry, next);
+	return 0;
+}
+
+void
+rte_eal_mp_action_unregister(const char *name)
+{
+	struct action_entry *entry = find_action_entry_by_name(name);
+
+	TAILQ_REMOVE(&action_entry_list, entry, next);
+	free(entry);
+}
+
+/* The maximum amount of fd for one recvmsg/sendmsg */
+#define SCM_MAX_FD		253
+#define MAX_SECONDARY_PROCS	8
+#define MAX_MESSAGE_LENGTH	1024
+
+struct mp_fds {
+	int efd;
+
+	union {
+		/* fds for primary process */
+		struct {
+			int listen;
+			/* fds used to send msg to secondary process(es) */
+			int secondaries[MAX_SECONDARY_PROCS];
+		};
+
+		/* fds for secondary process */
+		struct {
+			/* fds used to send msg to the primary process */
+			int primary;
+		};
+	};
+};
+
+static struct mp_fds mp_fds;
+
+struct msg_hdr {
+	char action_name[MAX_ACTION_NAME_LEN];
+	int fds_num;
+	int len_params;
+	char params[0];
+} __rte_packed;
+
+static int
+add_sec_proc(int fd)
+{
+	int i;
+
+	for (i = 0; i < MAX_SECONDARY_PROCS; ++i)
+		if (mp_fds.secondaries[i] == -1)
+			break;
+
+	if (i >= MAX_SECONDARY_PROCS)
+		return -1;
+
+	mp_fds.secondaries[i] = fd;
+
+	return i;
+}
+
+static void
+del_sec_proc(int fd)
+{
+	int i;
+
+	for (i = 0; i < MAX_SECONDARY_PROCS; ++i) {
+		if (mp_fds.secondaries[i] == fd) {
+			mp_fds.secondaries[i] = -1;
+			break;
+		}
+	}
+}
+
+static int
+read_msg(int sockfd, char *buf, int buflen, int *fds, int fds_num)
+{
+	struct iovec iov;
+	struct msghdr msgh;
+	size_t fdsize = fds_num * sizeof(int);
+	char control[CMSG_SPACE(fdsize)];
+	struct cmsghdr *cmsg;
+	struct msg_hdr *hdr = (struct msg_hdr *)buf;
+	int ret, total;
+
+	/* read msg_hdr */
+	memset(&msgh, 0, sizeof(msgh));
+	iov.iov_base = hdr;
+	iov.iov_len  = sizeof(*hdr);
+
+	msgh.msg_iov = &iov;
+	msgh.msg_iovlen = 1;
+	msgh.msg_control = control;
+	msgh.msg_controllen = sizeof(control);
+
+	ret = recvmsg(sockfd, &msgh, 0);
+	if (ret != sizeof(struct msg_hdr)) {
+		RTE_LOG(ERR, EAL, "recvmsg failed\n");
+		return ret;
+	}
+
+	if (msgh.msg_flags & (MSG_TRUNC | MSG_CTRUNC)) {
+		RTE_LOG(ERR, EAL, "truncted msg\n");
+		return -1;
+	}
+	total = ret;
+
+	/* read auxiliary FDs if any */
+	for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL;
+		cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
+		if ((cmsg->cmsg_level == SOL_SOCKET) &&
+			(cmsg->cmsg_type == SCM_RIGHTS)) {
+			memcpy(fds, CMSG_DATA(cmsg), fdsize);
+			break;
+		}
+	}
+
+	/* read params */
+	if (hdr->len_params) {
+		if (hdr->len_params > buflen - (int)sizeof(*hdr))
+			rte_exit(EXIT_FAILURE, "params too long\n");
+
+		ret = read(sockfd, &hdr->params, hdr->len_params);
+		if (ret != hdr->len_params)
+			rte_exit(EXIT_FAILURE, "failed to recv params\n");
+
+		total += ret;
+	}
+
+	RTE_LOG(INFO, EAL, "read msg: %s, %d\n", hdr->action_name,
+		(int)sizeof(*hdr) + hdr->len_params);
+	return total;
+}
+
+static int
+process_msg(int fd)
+{
+	int len;
+	int params_len;
+	char buf[MAX_MESSAGE_LENGTH];
+	int fds[SCM_MAX_FD];
+	struct msg_hdr *hdr;
+	struct action_entry *entry;
+
+	len = read_msg(fd, buf, MAX_MESSAGE_LENGTH, fds, SCM_MAX_FD);
+	if (len <= 0) {
+		RTE_LOG(ERR, EAL, "failed to read message: %s\n",
+			strerror(errno));
+		return -1;
+	}
+
+	hdr = (struct msg_hdr *) buf;
+
+	entry = find_action_entry_by_name(hdr->action_name);
+	if (entry == NULL) {
+		RTE_LOG(ERR, EAL, "cannot find action by: %s\n",
+			hdr->action_name);
+		return -1;
+	}
+
+	params_len = len - sizeof(struct msg_hdr);
+	
+	return entry->action(hdr->params, params_len, fds, hdr->fds_num);
+}
+
+static int
+add_secondary(void)
+{
+	int fd;
+	struct epoll_event ev;
+
+	while (1) {
+		fd = accept(mp_fds.listen, NULL, NULL);
+		if (fd < 0 && errno == EAGAIN)
+			break;
+		else if (fd < 0) {
+			RTE_LOG(ERR, EAL, "primary failed to accept: %s\n",
+				strerror(errno));
+			return -1;
+		}
+
+		ev.events = EPOLLIN | EPOLLRDHUP;
+		ev.data.fd = fd;
+		if (epoll_ctl(mp_fds.efd, EPOLL_CTL_ADD, fd, &ev) < 0) {
+			RTE_LOG(ERR, EAL, "failed to add secondary: %s\n",
+				strerror(errno));
+			break;
+		}
+		if (add_sec_proc(fd) < 0) {
+			RTE_LOG(ERR, EAL, "too many secondary processes\n");
+			close(fd);
+			break;
+		}
+	}
+
+	return 0;
+}
+
+static void *
+mp_handler(void *arg __rte_unused)
+{
+	int fd;
+	int i, n;
+	struct epoll_event ev;
+	struct epoll_event *events;
+	int is_primary = rte_eal_process_type() == RTE_PROC_PRIMARY;
+
+	ev.events = EPOLLIN | EPOLLRDHUP;
+	ev.data.fd = (is_primary) ? mp_fds.listen : mp_fds.primary;
+	if (epoll_ctl(mp_fds.efd, EPOLL_CTL_ADD, ev.data.fd, &ev) < 0) {
+		RTE_LOG(ERR, EAL, "failed to epoll_ctl: %s\n",
+			strerror(errno));
+		exit(EXIT_FAILURE);
+	}
+
+	events = calloc(20, sizeof ev);
+
+	while (1) {
+		n = epoll_wait(mp_fds.efd, events, 20, -1);
+		for (i = 0; i < n; i++) {
+			if (is_primary && events[i].data.fd == mp_fds.listen) {
+				if (events[i].events != EPOLLIN) {
+					RTE_LOG(ERR, EAL, "what happens?\n");
+					exit(EXIT_FAILURE);
+				}
+
+				if (add_secondary() < 0)
+					break;
+
+				continue;
+			}
+
+			fd = events[i].data.fd;
+
+			if ((events[i].events & EPOLLIN)) {
+				if (process_msg(fd) < 0) {
+					RTE_LOG(ERR, EAL,
+						"failed to process msg\n");
+					if (!is_primary)
+						exit(EXIT_FAILURE);
+				}
+				continue;
+			}
+
+			/* EPOLLERR, EPOLLHUP, etc */
+			if (is_primary) {
+				RTE_LOG(ERR, EAL, "secondary exit: %d\n", fd);
+				epoll_ctl(mp_fds.efd, EPOLL_CTL_DEL, fd, NULL);
+				del_sec_proc(fd);
+				close(fd);
+			} else {
+				RTE_LOG(ERR, EAL, "primary exits, so do I\n");
+				/* Exit secondary when primary exits? */
+				exit(EXIT_FAILURE);
+			}
+		}
+	}
+
+	return NULL;
+}
+
+int
+rte_eal_mp_channel_init(void)
+{
+	int i, fd, ret;
+	const char *path;
+	struct sockaddr_un un;
+	pthread_t tid;
+	char thread_name[RTE_MAX_THREAD_NAME_LEN];
+
+	mp_fds.efd = epoll_create1(0);
+	if (mp_fds.efd < 0) {
+		RTE_LOG(ERR, EAL, "epoll_create1 failed\n");
+		return -1;
+	}
+
+	fd = socket(AF_UNIX, SOCK_STREAM, 0);
+	if (fd < 0) {
+		RTE_LOG(ERR, EAL, "Failed to create unix socket\n");
+		return -1;
+	}
+
+	memset(&un, 0, sizeof(un));
+	un.sun_family = AF_UNIX;
+	path = eal_mp_unix_path();
+	strncpy(un.sun_path, path, sizeof(un.sun_path));
+	un.sun_path[sizeof(un.sun_path) - 1] = '\0';
+
+	if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+		for (i = 0; i < MAX_SECONDARY_PROCS; ++i)
+			mp_fds.secondaries[i] = -1;
+
+		if (fcntl(fd, F_SETFL, O_NONBLOCK) < 0) {
+			RTE_LOG(ERR, EAL, "cannot set nonblocking mode\n");
+			close(fd);
+			return -1;
+		}
+
+		/* The file still exists since last run */
+		unlink(path);
+
+		ret = bind(fd, (struct sockaddr *)&un, sizeof(un));
+		if (ret < 0) {
+			RTE_LOG(ERR, EAL, "failed to bind to %s: %s\n",
+				path, strerror(errno));
+			close(fd);
+			return -1;
+		}
+		RTE_LOG(INFO, EAL, "primary bind to %s\n", path);
+
+		ret = listen(fd, 1024);
+		if (ret < 0) {
+			RTE_LOG(ERR, EAL, "failed to listen: %s\n",
+				strerror(errno));
+			close(fd);
+			return -1;
+		}
+		mp_fds.listen = fd;
+	} else {
+		ret = connect(fd, (struct sockaddr *)&un, sizeof(un));
+		if (ret < 0) {
+			RTE_LOG(ERR, EAL, "failed to connect primary\n");
+			return -1;
+		}
+		mp_fds.primary = fd;
+	}
+
+	ret = pthread_create(&tid, NULL, mp_handler, NULL);
+	if (ret < 0) {
+		RTE_LOG(ERR, EAL, "failed to create thead: %s\n",
+			strerror(errno));
+		close(fd);
+		close(mp_fds.efd);
+		return -1;
+	}
+
+	snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN,
+		 "rte_mp_handle");
+	ret = rte_thread_setname(tid, thread_name);
+	if (ret < 0) {
+		RTE_LOG(ERR, EAL, "failed to set thead name\n");
+		close(fd);
+		close(mp_fds.efd);
+		return -1;
+	}
+
+	return 0;
+}
+
+static int
+send_msg(int fd, struct msghdr *p_msgh)
+{
+	int ret;
+
+	do {
+		ret = sendmsg(fd, p_msgh, 0);
+	} while (ret < 0 && errno == EINTR);
+
+	if (ret < 0)
+		RTE_LOG(ERR, EAL, "failed to send msg: %s\n", strerror(errno));
+
+	return ret;
+}
+
+int
+rte_eal_mp_sendmsg(const char *action_name,
+		   const void *params,
+		   int len_params,
+		   int fds[],
+		   int fds_num)
+{
+	int i;
+	int ret = 0;
+	struct msghdr msgh;
+	struct iovec iov;
+	size_t fd_size = fds_num * sizeof(int);
+	char control[CMSG_SPACE(fd_size)];
+	struct cmsghdr *cmsg;
+	struct msg_hdr *msg;
+	int len_msg;
+
+	if (fds_num > SCM_MAX_FD) {
+		RTE_LOG(ERR, EAL,
+			"Cannot send more than %d FDs\n", SCM_MAX_FD);
+		return -E2BIG;
+	}
+
+	len_msg = sizeof(struct msg_hdr) + len_params;
+	if (len_msg > MAX_MESSAGE_LENGTH) {
+		RTE_LOG(ERR, EAL, "Message is too long\n");
+		return -ENOMEM;
+	}
+
+	RTE_LOG(INFO, EAL, "send msg: %s, %d\n", action_name, len_msg);
+
+	msg = malloc(len_msg);
+	if (!msg) {
+		RTE_LOG(ERR, EAL, "Cannot alloc memory for msg\n");
+		return -ENOMEM;
+	}
+	memset(msg, 0, len_msg);
+	strcpy(msg->action_name, action_name);
+	msg->fds_num = fds_num;
+	msg->len_params = len_params;
+	memcpy(msg->params, params, len_params);
+
+	memset(&msgh, 0, sizeof(msgh));
+	memset(control, 0, sizeof(control));
+
+	iov.iov_base = (uint8_t *)msg;
+	iov.iov_len = len_msg;
+
+	msgh.msg_iov = &iov;
+	msgh.msg_iovlen = 1;
+	msgh.msg_control = control;
+	msgh.msg_controllen = sizeof(control);
+
+	cmsg = CMSG_FIRSTHDR(&msgh);
+	cmsg->cmsg_len = CMSG_LEN(fd_size);
+	cmsg->cmsg_level = SOL_SOCKET;
+	cmsg->cmsg_type = SCM_RIGHTS;
+	memcpy(CMSG_DATA(cmsg), fds, fd_size);
+
+	if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+		for (i = 0; i < MAX_SECONDARY_PROCS; ++i) {
+			if (mp_fds.secondaries[i] == -1)
+				continue;
+
+			ret = send_msg(mp_fds.secondaries[i], &msgh);
+			if (ret < 0)
+				break;
+		}
+	} else {
+		ret = send_msg(mp_fds.primary, &msgh);
+	}
+
+	free(msg);
+
+	return ret;
+}
diff --git a/lib/librte_eal/common/eal_filesystem.h b/lib/librte_eal/common/eal_filesystem.h
index 8acbd99..3d9514f 100644
--- a/lib/librte_eal/common/eal_filesystem.h
+++ b/lib/librte_eal/common/eal_filesystem.h
@@ -67,6 +67,24 @@ eal_runtime_config_path(void)
 	return buffer;
 }
 
+/** Path of primary/secondary communication unix socket file. */
+#define MP_UNIX_PATH_FMT "%s/.%s_unix"
+static inline const char *
+eal_mp_unix_path(void)
+{
+	static char buffer[PATH_MAX]; /* static so auto-zeroed */
+	const char *directory = default_config_dir;
+	const char *home_dir = getenv("HOME");
+
+	if (getuid() != 0 && home_dir != NULL)
+		directory = home_dir;
+	snprintf(buffer, sizeof(buffer) - 1, MP_UNIX_PATH_FMT,
+		 directory, internal_config.hugefile_prefix);
+
+	return buffer;
+
+}
+
 /** Path of hugepage info file. */
 #define HUGEPAGE_INFO_FMT "%s/.%s_hugepage_info"
 
diff --git a/lib/librte_eal/common/eal_private.h b/lib/librte_eal/common/eal_private.h
index 462226f..60944f2 100644
--- a/lib/librte_eal/common/eal_private.h
+++ b/lib/librte_eal/common/eal_private.h
@@ -224,4 +224,14 @@ int rte_eal_hugepage_attach(void);
  */
 struct rte_bus *rte_bus_find_by_device_name(const char *str);
 
+/**
+ * Create the unix channel for primary/secondary communication.
+ *
+ * @return
+ *   0 on success;
+ *   (<0) on failure.
+ */
+
+int rte_eal_mp_channel_init(void);
+
 #endif /* _EAL_PRIVATE_H_ */
diff --git a/lib/librte_eal/common/include/rte_eal.h b/lib/librte_eal/common/include/rte_eal.h
index 8e4e71c..8776bcf 100644
--- a/lib/librte_eal/common/include/rte_eal.h
+++ b/lib/librte_eal/common/include/rte_eal.h
@@ -215,6 +215,74 @@ int rte_eal_init(int argc, char **argv);
 int rte_eal_primary_proc_alive(const char *config_file_path);
 
 /**
+ * Action function typedef used by other components.
+ *
+ * As we create unix socket channel for primary/secondary communication, use
+ * this function typedef to register action for coming messages.
+ */
+typedef int (*rte_eal_mp_t)(const void *params, int len,
+			    int fds[], int fds_num);
+/**
+ * Register an action function for primary/secondary communication.
+ *
+ * Call this function to register an action, if the calling component wants
+ * to respond to the messages from the corresponding component in its primary
+ * process or secondary processes.
+ *
+ * @param action_name
+ *   The action_name argument serves as the unique key to find the action.
+ *
+ * @param action
+ *   The action argument is the function pointer to the action function.
+ *
+ * @return
+ *  - 0 on success.
+ *  - (<0) on failure.
+ */
+int rte_eal_mp_action_register(const char *action_name, rte_eal_mp_t action);
+/**
+ * Unregister an action function for primary/secondary communication.
+ *
+ * Call this function to unregister an action if the calling component does
+ * not want to respond to the messages from the corresponding component in its
+ * primary process or secondary processes.
+ *
+ * @param name
+ *   The name argument serves as the unique key to find the action.
+ *
+ */
+void rte_eal_mp_action_unregister(const char *name);
+
+/**
+ * Send a message to the primary process or the secondary processes.
+ *
+ * This function will send a message which will be handled by the action
+ * identified by action_name in the process on the other side.
+ *
+ * @param action_name
+ *   The action_name argument is used to identify which action will be used.
+ *
+ * @param params
+ *   The params argument contains the customized message.
+ *
+ * @param len_params
+ *   The len_params argument is the length of the customized message.
+ *
+ * @param fds
+ *   The fds argument is an array of fds sent with sendmsg.
+ *
+ * @param fds_num
+ *   The fds_num argument is number of fds to be sent with sendmsg.
+ *
+ * @return
+ *  - (>=0) on success.
+ *  - (<0) on failure.
+ */
+int
+rte_eal_mp_sendmsg(const char *action_name, const void *params,
+		   int len_params, int fds[], int fds_num);
+
+/**
  * Usage function typedef used by the application usage function.
  *
  * Use this function typedef to define and call rte_set_application_usage_hook()
diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index 229eec9..a84eab4 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -896,6 +896,15 @@ rte_eal_init(int argc, char **argv)
 
 	eal_check_mem_on_local_socket();
 
+	if (rte_eal_mp_channel_init() < 0) {
+		rte_eal_init_alert("failed to init mp channel\n");
+		rte_errno = EFAULT;
+		return -1;
+	}
+
+	if (eal_plugins_init() < 0)
+		rte_eal_init_alert("Cannot init plugins\n");
+
 	eal_thread_init_master(rte_config.master_lcore);
 
 	ret = eal_thread_dump_affinity(cpuset, RTE_CPU_AFFINITY_STR_LEN);
diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
index f4f46c1..6762397 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -235,4 +235,26 @@ EXPERIMENTAL {
 	rte_service_set_stats_enable;
 	rte_service_start_with_defaults;
 
+} DPDK_17.08;
+
+DPDK_17.11 {
+	global:
+
+	rte_bus_get_iommu_class;
+	rte_eal_iova_mode;
+	rte_eal_mbuf_default_mempool_ops;
+	rte_lcore_has_role;
+	rte_memcpy_ptr;
+	rte_pci_get_iommu_class;
+	rte_pci_match;
+
+} DPDK_17.08;
+
+DPDK_18.02 {
+	global:
+
+	rte_eal_mp_action_register;
+	rte_eal_mp_action_unregister;
+	rte_eal_mp_sendmsg;
+
 } DPDK_17.11;
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 88+ messages in thread

* [PATCH 2/3] eal: add synchronous multi-process communication
  2017-11-30 18:44 [PATCH 0/3] generic channel for multi-process communication Jianfeng Tan
  2017-11-30 18:44 ` [PATCH 1/3] eal: add " Jianfeng Tan
@ 2017-11-30 18:44 ` Jianfeng Tan
  2017-12-11 11:39   ` Burakov, Anatoly
  2017-11-30 18:44 ` [PATCH 3/3] vfio: use the generic multi-process channel Jianfeng Tan
                   ` (7 subsequent siblings)
  9 siblings, 1 reply; 88+ messages in thread
From: Jianfeng Tan @ 2017-11-30 18:44 UTC (permalink / raw)
  To: dev
  Cc: anatoly.burakov, bruce.richardson, konstantin.ananyev, thomas,
	Jianfeng Tan

We need a synchronous way for multi-process communication, that
is to say we need an immediate response after we send a message
to the other side.

We will stop the mp_handler thread, and after sending the message,
the sending thread will wait there for the response and process
it.

Suggested-by: Anatoly Burakov <anatoly.burakov@intel.com>
Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
---
 lib/librte_eal/common/eal_common_proc.c | 53 +++++++++++++++++++++++++++++++--
 lib/librte_eal/common/include/rte_eal.h |  5 +++-
 2 files changed, 55 insertions(+), 3 deletions(-)

diff --git a/lib/librte_eal/common/eal_common_proc.c b/lib/librte_eal/common/eal_common_proc.c
index 5d0a095..65ebaf2 100644
--- a/lib/librte_eal/common/eal_common_proc.c
+++ b/lib/librte_eal/common/eal_common_proc.c
@@ -30,6 +30,8 @@
  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
+#define _GNU_SOURCE
+
 #include <stdio.h>
 #include <fcntl.h>
 #include <stdlib.h>
@@ -41,6 +43,8 @@
 #include <sys/un.h>
 #include <errno.h>
 #include <pthread.h>
+#include <sys/eventfd.h>
+#include <signal.h>
 
 #include <rte_log.h>
 #include <rte_eal.h>
@@ -134,6 +138,7 @@ rte_eal_mp_action_unregister(const char *name)
 
 struct mp_fds {
 	int efd;
+	int evfd; /* eventfd used for pausing mp_handler thread */
 
 	union {
 		/* fds for primary process */
@@ -331,6 +336,13 @@ mp_handler(void *arg __rte_unused)
 		exit(EXIT_FAILURE);
 	}
 
+	ev.data.fd = mp_fds.evfd;
+	if (epoll_ctl(mp_fds.efd, EPOLL_CTL_ADD, ev.data.fd, &ev) < 0) {
+		RTE_LOG(ERR, EAL, "epoll_ctl failed: %s\n",
+			strerror(errno));
+		exit(EXIT_FAILURE);
+	}
+
 	events = calloc(20, sizeof ev);
 
 	while (1) {
@@ -348,6 +360,14 @@ mp_handler(void *arg __rte_unused)
 				continue;
 			}
 
+			if (events[i].data.fd == mp_fds.evfd) {
+				RTE_LOG(INFO, EAL, "mp_handler thread will pause\n");
+				pause();
+				RTE_LOG(INFO, EAL, "mp_handler thread stops pausing\n");
+
+				continue;
+			}
+
 			fd = events[i].data.fd;
 
 			if ((events[i].events & EPOLLIN)) {
@@ -377,13 +397,14 @@ mp_handler(void *arg __rte_unused)
 	return NULL;
 }
 
+static pthread_t tid;
+
 int
 rte_eal_mp_channel_init(void)
 {
 	int i, fd, ret;
 	const char *path;
 	struct sockaddr_un un;
-	pthread_t tid;
 	char thread_name[RTE_MAX_THREAD_NAME_LEN];
 
 	mp_fds.efd = epoll_create1(0);
@@ -462,6 +483,8 @@ rte_eal_mp_channel_init(void)
 		return -1;
 	}
 
+	mp_fds.evfd = eventfd(0, 0);
+
 	return 0;
 }
 
@@ -485,7 +508,8 @@ rte_eal_mp_sendmsg(const char *action_name,
 		   const void *params,
 		   int len_params,
 		   int fds[],
-		   int fds_num)
+		   int fds_num,
+		   int need_ack)
 {
 	int i;
 	int ret = 0;
@@ -511,6 +535,11 @@ rte_eal_mp_sendmsg(const char *action_name,
 
 	RTE_LOG(INFO, EAL, "send msg: %s, %d\n", action_name, len_msg);
 
+	if (need_ack) {
+		// stop mp_handler thread.
+		eventfd_write(mp_fds.evfd, (eventfd_t)1);
+	}
+
 	msg = malloc(len_msg);
 	if (!msg) {
 		RTE_LOG(ERR, EAL, "Cannot alloc memory for msg\n");
@@ -547,12 +576,32 @@ rte_eal_mp_sendmsg(const char *action_name,
 			ret = send_msg(mp_fds.secondaries[i], &msgh);
 			if (ret < 0)
 				break;
+
+			if (need_ack) {
+				/* We will hang there until the other side
+				 * responses and what if other side is sending
+				 * msg at the same time?
+				 */
+				process_msg(mp_fds.secondaries[i]);
+			}
 		}
 	} else {
 		ret = send_msg(mp_fds.primary, &msgh);
+
+		if (ret > 0 && need_ack) {
+			// We will hang there until the other side responses
+			ret = process_msg(mp_fds.primary);
+		}
 	}
 
 	free(msg);
 
+	if (need_ack) {
+		// start mp_handler thread.
+		union sigval value;
+
+		pthread_sigqueue(tid, 0, value);
+	}
+
 	return ret;
 }
diff --git a/lib/librte_eal/common/include/rte_eal.h b/lib/librte_eal/common/include/rte_eal.h
index 8776bcf..9875cae 100644
--- a/lib/librte_eal/common/include/rte_eal.h
+++ b/lib/librte_eal/common/include/rte_eal.h
@@ -274,13 +274,16 @@ void rte_eal_mp_action_unregister(const char *name);
  * @param fds_num
  *   The fds_num argument is number of fds to be sent with sendmsg.
  *
+ * @param need_ack
+ *   If non-zero, wait for and process the other side's response after sending.
+ *
  * @return
  *  - (>=0) on success.
  *  - (<0) on failure.
  */
 int
 rte_eal_mp_sendmsg(const char *action_name, const void *params,
-		   int len_params, int fds[], int fds_num);
+		   int len_params, int fds[], int fds_num, int need_ack);
 
 /**
  * Usage function typedef used by the application usage function.
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 88+ messages in thread

* [PATCH 3/3] vfio: use the generic multi-process channel
  2017-11-30 18:44 [PATCH 0/3] generic channel for multi-process communication Jianfeng Tan
  2017-11-30 18:44 ` [PATCH 1/3] eal: add " Jianfeng Tan
  2017-11-30 18:44 ` [PATCH 2/3] eal: add synchronous " Jianfeng Tan
@ 2017-11-30 18:44 ` Jianfeng Tan
  2017-12-11 12:01   ` Burakov, Anatoly
  2017-12-11  9:59 ` [PATCH 0/3] generic channel for multi-process communication Burakov, Anatoly
                   ` (6 subsequent siblings)
  9 siblings, 1 reply; 88+ messages in thread
From: Jianfeng Tan @ 2017-11-30 18:44 UTC (permalink / raw)
  To: dev
  Cc: anatoly.burakov, bruce.richardson, konstantin.ananyev, thomas,
	Jianfeng Tan

Previously, vfio has its own channel for the secondary process to
get container fd and group fd from the primary process.

This patch changes to use the generic mp channel.

Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
---
 lib/librte_eal/linuxapp/eal/eal.c              |  14 +-
 lib/librte_eal/linuxapp/eal/eal_vfio.c         | 139 +++------
 lib/librte_eal/linuxapp/eal/eal_vfio.h         |  15 +-
 lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c | 416 ++++---------------------
 4 files changed, 109 insertions(+), 475 deletions(-)

diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index a84eab4..93824bf 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -713,18 +713,8 @@ static int rte_eal_vfio_setup(void)
 		return -1;
 	vfio_enabled = rte_vfio_is_enabled("vfio");
 
-	if (vfio_enabled) {
-
-		/* if we are primary process, create a thread to communicate with
-		 * secondary processes. the thread will use a socket to wait for
-		 * requests from secondary process to send open file descriptors,
-		 * because VFIO does not allow multiple open descriptors on a group or
-		 * VFIO container.
-		 */
-		if (internal_config.process_type == RTE_PROC_PRIMARY &&
-				vfio_mp_sync_setup() < 0)
-			return -1;
-	}
+	if (vfio_enabled && vfio_mp_sync_setup() < 0)
+		return -1;
 
 	return 0;
 }
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c b/lib/librte_eal/linuxapp/eal/eal_vfio.c
index 58f0123..dbea350 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.c
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.c
@@ -68,9 +68,11 @@ int
 vfio_get_group_fd(int iommu_group_no)
 {
 	int i;
+	int ret;
 	int vfio_group_fd;
 	char filename[PATH_MAX];
 	struct vfio_group *cur_grp;
+	struct vfio_mp_param p;
 
 	/* check if we already have the group descriptor open */
 	for (i = 0; i < VFIO_MAX_GROUPS; i++)
@@ -129,51 +131,21 @@ vfio_get_group_fd(int iommu_group_no)
 		vfio_cfg.vfio_active_groups++;
 		return vfio_group_fd;
 	}
-	/* if we're in a secondary process, request group fd from the primary
-	 * process via our socket
-	 */
-	else {
-		int socket_fd, ret;
+	/* For secondary process, request group fd from the primary */
 
-		socket_fd = vfio_mp_sync_connect_to_primary();
+	p.req = SOCKET_REQ_GROUP;
+	p.group_no = iommu_group_no;
 
-		if (socket_fd < 0) {
-			RTE_LOG(ERR, EAL, "  cannot connect to primary process!\n");
-			return -1;
-		}
-		if (vfio_mp_sync_send_request(socket_fd, SOCKET_REQ_GROUP) < 0) {
-			RTE_LOG(ERR, EAL, "  cannot request container fd!\n");
-			close(socket_fd);
-			return -1;
-		}
-		if (vfio_mp_sync_send_request(socket_fd, iommu_group_no) < 0) {
-			RTE_LOG(ERR, EAL, "  cannot send group number!\n");
-			close(socket_fd);
-			return -1;
-		}
-		ret = vfio_mp_sync_receive_request(socket_fd);
-		switch (ret) {
-		case SOCKET_NO_FD:
-			close(socket_fd);
-			return 0;
-		case SOCKET_OK:
-			vfio_group_fd = vfio_mp_sync_receive_fd(socket_fd);
-			/* if we got the fd, store it and return it */
-			if (vfio_group_fd > 0) {
-				close(socket_fd);
-				cur_grp->group_no = iommu_group_no;
-				cur_grp->fd = vfio_group_fd;
-				vfio_cfg.vfio_active_groups++;
-				return vfio_group_fd;
-			}
-			/* fall-through on error */
-		default:
-			RTE_LOG(ERR, EAL, "  cannot get container fd!\n");
-			close(socket_fd);
-			return -1;
-		}
+	ret = rte_eal_mp_sendmsg("vfio", &p, sizeof(p), NULL, 0, 1);
+	if (ret < 0) {
+		RTE_LOG(ERR, EAL, "  cannot request group fd!\n");
+		cur_grp->group_no = -1;
+	} else {
+		cur_grp->group_no = iommu_group_no;
+		vfio_cfg.vfio_active_groups++;
 	}
-	return -1;
+
+	return ret;
 }
 
 
@@ -229,11 +201,12 @@ int
 clear_group(int vfio_group_fd)
 {
 	int i;
-	int socket_fd, ret;
+	struct vfio_mp_param p;
+
+	i = get_vfio_group_idx(vfio_group_fd);
 
 	if (internal_config.process_type == RTE_PROC_PRIMARY) {
 
-		i = get_vfio_group_idx(vfio_group_fd);
 		if (i < 0)
 			return -1;
 		vfio_cfg.vfio_groups[i].group_no = -1;
@@ -243,44 +216,20 @@ clear_group(int vfio_group_fd)
 		return 0;
 	}
 
-	/* This is just for SECONDARY processes */
-	socket_fd = vfio_mp_sync_connect_to_primary();
-
-	if (socket_fd < 0) {
-		RTE_LOG(ERR, EAL, "  cannot connect to primary process!\n");
-		return -1;
-	}
+	p.req = SOCKET_CLR_GROUP;
+	p.group_no = vfio_cfg.vfio_groups[i].group_no;
 
-	if (vfio_mp_sync_send_request(socket_fd, SOCKET_CLR_GROUP) < 0) {
-		RTE_LOG(ERR, EAL, "  cannot request container fd!\n");
-		close(socket_fd);
+	if (rte_eal_mp_sendmsg("vfio", &p, sizeof(p), NULL, 0, 1) < 0) {
+		RTE_LOG(ERR, EAL, "request primary to clear group fd, failed!\n");
 		return -1;
 	}
 
-	if (vfio_mp_sync_send_request(socket_fd, vfio_group_fd) < 0) {
-		RTE_LOG(ERR, EAL, "  cannot send group fd!\n");
-		close(socket_fd);
-		return -1;
-	}
+	vfio_cfg.vfio_groups[i].group_no = -1;
+	vfio_cfg.vfio_groups[i].fd = -1;
+	vfio_cfg.vfio_groups[i].devices = 0;
+	vfio_cfg.vfio_active_groups--;
 
-	ret = vfio_mp_sync_receive_request(socket_fd);
-	switch (ret) {
-	case SOCKET_NO_FD:
-		RTE_LOG(ERR, EAL, "  BAD VFIO group fd!\n");
-		close(socket_fd);
-		break;
-	case SOCKET_OK:
-		close(socket_fd);
-		return 0;
-	case SOCKET_ERR:
-		RTE_LOG(ERR, EAL, "  Socket error\n");
-		close(socket_fd);
-		break;
-	default:
-		RTE_LOG(ERR, EAL, "  UNKNOWN reply, %d\n", ret);
-		close(socket_fd);
-	}
-	return -1;
+	return 0;
 }
 
 int
@@ -590,6 +539,7 @@ int
 vfio_get_container_fd(void)
 {
 	int ret, vfio_container_fd;
+	struct vfio_mp_param p;
 
 	/* if we're in a primary process, try to open the container */
 	if (internal_config.process_type == RTE_PROC_PRIMARY) {
@@ -620,34 +570,17 @@ vfio_get_container_fd(void)
 		}
 
 		return vfio_container_fd;
-	} else {
-		/*
-		 * if we're in a secondary process, request container fd from the
-		 * primary process via our socket
-		 */
-		int socket_fd;
-
-		socket_fd = vfio_mp_sync_connect_to_primary();
-		if (socket_fd < 0) {
-			RTE_LOG(ERR, EAL, "  cannot connect to primary process!\n");
-			return -1;
-		}
-		if (vfio_mp_sync_send_request(socket_fd, SOCKET_REQ_CONTAINER) < 0) {
-			RTE_LOG(ERR, EAL, "  cannot request container fd!\n");
-			close(socket_fd);
-			return -1;
-		}
-		vfio_container_fd = vfio_mp_sync_receive_fd(socket_fd);
-		if (vfio_container_fd < 0) {
-			RTE_LOG(ERR, EAL, "  cannot get container fd!\n");
-			close(socket_fd);
-			return -1;
-		}
-		close(socket_fd);
-		return vfio_container_fd;
 	}
 
-	return -1;
+	/* For secondary process, request container fd from primary process */
+
+	p.req = SOCKET_REQ_CONTAINER;
+
+	ret = rte_eal_mp_sendmsg("vfio", &p, sizeof(p), NULL, 0, 1);
+	if (ret < 0)
+		RTE_LOG(ERR, EAL, "  cannot request container fd!\n");
+
+	return ret;
 }
 
 int
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.h b/lib/librte_eal/linuxapp/eal/eal_vfio.h
index ba7892b..7907c22 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.h
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.h
@@ -117,15 +117,6 @@ struct vfio_iommu_spapr_tce_info {
 #define VFIO_MAX_GROUPS 64
 
 /*
- * Function prototypes for VFIO multiprocess sync functions
- */
-int vfio_mp_sync_send_request(int socket, int req);
-int vfio_mp_sync_receive_request(int socket);
-int vfio_mp_sync_send_fd(int socket, int fd);
-int vfio_mp_sync_receive_fd(int socket);
-int vfio_mp_sync_connect_to_primary(void);
-
-/*
  * we don't need to store device fd's anywhere since they can be obtained from
  * the group fd via an ioctl() call.
  */
@@ -190,6 +181,12 @@ int vfio_mp_sync_setup(void);
 #define SOCKET_NO_FD 0x1
 #define SOCKET_ERR 0xFF
 
+struct vfio_mp_param {
+	int req;
+	int result;
+	int group_no;
+};
+
 #endif /* VFIO_PRESENT */
 
 #endif /* EAL_VFIO_H_ */
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c b/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
index b53ed7e..dfba58f 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
@@ -1,7 +1,7 @@
 /*-
  *   BSD LICENSE
  *
- *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   Copyright(c) 2010-2017 Intel Corporation. All rights reserved.
  *   All rights reserved.
  *
  *   Redistribution and use in source and binary forms, with or without
@@ -31,31 +31,11 @@
  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#include <string.h>
-#include <fcntl.h>
-#include <sys/socket.h>
-#include <pthread.h>
-
-/* sys/un.h with __USE_MISC uses strlen, which is unsafe */
-#ifdef __USE_MISC
-#define REMOVED_USE_MISC
-#undef __USE_MISC
-#endif
-#include <sys/un.h>
-/* make sure we redefine __USE_MISC only if it was previously undefined */
-#ifdef REMOVED_USE_MISC
-#define __USE_MISC
-#undef REMOVED_USE_MISC
-#endif
-
 #include <rte_log.h>
-#include <rte_eal_memconfig.h>
-#include <rte_malloc.h>
 #include <rte_vfio.h>
+#include <rte_eal.h>
 
-#include "eal_filesystem.h"
 #include "eal_vfio.h"
-#include "eal_thread.h"
 
 /**
  * @file
@@ -66,360 +46,94 @@
 
 #ifdef VFIO_PRESENT
 
-#define SOCKET_PATH_FMT "%s/.%s_mp_socket"
-#define CMSGLEN (CMSG_LEN(sizeof(int)))
-#define FD_TO_CMSGHDR(fd, chdr) \
-		do {\
-			(chdr).cmsg_len = CMSGLEN;\
-			(chdr).cmsg_level = SOL_SOCKET;\
-			(chdr).cmsg_type = SCM_RIGHTS;\
-			memcpy((chdr).__cmsg_data, &(fd), sizeof(fd));\
-		} while (0)
-#define CMSGHDR_TO_FD(chdr, fd) \
-			memcpy(&(fd), (chdr).__cmsg_data, sizeof(fd))
-
-static pthread_t socket_thread;
-static int mp_socket_fd;
-
-
-/* get socket path (/var/run if root, $HOME otherwise) */
-static void
-get_socket_path(char *buffer, int bufsz)
-{
-	const char *dir = "/var/run";
-	const char *home_dir = getenv("HOME");
-
-	if (getuid() != 0 && home_dir != NULL)
-		dir = home_dir;
-
-	/* use current prefix as file path */
-	snprintf(buffer, bufsz, SOCKET_PATH_FMT, dir,
-			internal_config.hugefile_prefix);
-}
-
-
-
-/*
- * data flow for socket comm protocol:
- * 1. client sends SOCKET_REQ_CONTAINER or SOCKET_REQ_GROUP
- * 1a. in case of SOCKET_REQ_GROUP, client also then sends group number
- * 2. server receives message
- * 2a. in case of invalid group, SOCKET_ERR is sent back to client
- * 2b. in case of unbound group, SOCKET_NO_FD is sent back to client
- * 2c. in case of valid group, SOCKET_OK is sent and immediately followed by fd
- *
- * in case of any error, socket is closed.
- */
-
-/* send a request, return -1 on error */
-int
-vfio_mp_sync_send_request(int socket, int req)
-{
-	struct msghdr hdr;
-	struct iovec iov;
-	int buf;
-	int ret;
-
-	memset(&hdr, 0, sizeof(hdr));
-
-	buf = req;
-
-	hdr.msg_iov = &iov;
-	hdr.msg_iovlen = 1;
-	iov.iov_base = (char *) &buf;
-	iov.iov_len = sizeof(buf);
-
-	ret = sendmsg(socket, &hdr, 0);
-	if (ret < 0)
-		return -1;
-	return 0;
-}
-
-/* receive a request and return it */
-int
-vfio_mp_sync_receive_request(int socket)
-{
-	int buf;
-	struct msghdr hdr;
-	struct iovec iov;
-	int ret, req;
-
-	memset(&hdr, 0, sizeof(hdr));
-
-	buf = SOCKET_ERR;
-
-	hdr.msg_iov = &iov;
-	hdr.msg_iovlen = 1;
-	iov.iov_base = (char *) &buf;
-	iov.iov_len = sizeof(buf);
-
-	ret = recvmsg(socket, &hdr, 0);
-	if (ret < 0)
-		return -1;
-
-	req = buf;
-
-	return req;
-}
-
-/* send OK in message, fd in control message */
-int
-vfio_mp_sync_send_fd(int socket, int fd)
-{
-	int buf;
-	struct msghdr hdr;
-	struct cmsghdr *chdr;
-	char chdr_buf[CMSGLEN];
-	struct iovec iov;
-	int ret;
-
-	chdr = (struct cmsghdr *) chdr_buf;
-	memset(chdr, 0, sizeof(chdr_buf));
-	memset(&hdr, 0, sizeof(hdr));
-
-	hdr.msg_iov = &iov;
-	hdr.msg_iovlen = 1;
-	iov.iov_base = (char *) &buf;
-	iov.iov_len = sizeof(buf);
-	hdr.msg_control = chdr;
-	hdr.msg_controllen = CMSGLEN;
-
-	buf = SOCKET_OK;
-	FD_TO_CMSGHDR(fd, *chdr);
-
-	ret = sendmsg(socket, &hdr, 0);
-	if (ret < 0)
-		return -1;
-	return 0;
-}
-
-/* receive OK in message, fd in control message */
-int
-vfio_mp_sync_receive_fd(int socket)
-{
-	int buf;
-	struct msghdr hdr;
-	struct cmsghdr *chdr;
-	char chdr_buf[CMSGLEN];
-	struct iovec iov;
-	int ret, req, fd;
-
-	buf = SOCKET_ERR;
-
-	chdr = (struct cmsghdr *) chdr_buf;
-	memset(chdr, 0, sizeof(chdr_buf));
-	memset(&hdr, 0, sizeof(hdr));
-
-	hdr.msg_iov = &iov;
-	hdr.msg_iovlen = 1;
-	iov.iov_base = (char *) &buf;
-	iov.iov_len = sizeof(buf);
-	hdr.msg_control = chdr;
-	hdr.msg_controllen = CMSGLEN;
-
-	ret = recvmsg(socket, &hdr, 0);
-	if (ret < 0)
-		return -1;
-
-	req = buf;
-
-	if (req != SOCKET_OK)
-		return -1;
-
-	CMSGHDR_TO_FD(*chdr, fd);
-
-	return fd;
-}
-
-/* connect socket_fd in secondary process to the primary process's socket */
-int
-vfio_mp_sync_connect_to_primary(void)
+static int
+vfio_mp_primary(const void *params, int len,
+		int fd[] __rte_unused, int fds_num __rte_unused)
 {
-	struct sockaddr_un addr;
-	socklen_t sockaddr_len;
-	int socket_fd;
+	int fds[1];
+	const struct vfio_mp_param *p = params;
+	struct vfio_mp_param r;
 
-	/* set up a socket */
-	socket_fd = socket(AF_UNIX, SOCK_SEQPACKET, 0);
-	if (socket_fd < 0) {
-		RTE_LOG(ERR, EAL, "Failed to create socket!\n");
+	if (len != sizeof(*p)) {
+		RTE_LOG(ERR, EAL, "vfio received invalid message!\n");
 		return -1;
 	}
 
-	get_socket_path(addr.sun_path, sizeof(addr.sun_path));
-	addr.sun_family = AF_UNIX;
-
-	sockaddr_len = sizeof(struct sockaddr_un);
-
-	if (connect(socket_fd, (struct sockaddr *) &addr, sockaddr_len) == 0)
-		return socket_fd;
-
-	/* if connect failed */
-	close(socket_fd);
-	return -1;
-}
-
-
-
-/*
- * socket listening thread for primary process
- */
-static __attribute__((noreturn)) void *
-vfio_mp_sync_thread(void __rte_unused * arg)
-{
-	int ret, fd, vfio_data;
-
-	/* wait for requests on the socket */
-	for (;;) {
-		int conn_sock;
-		struct sockaddr_un addr;
-		socklen_t sockaddr_len = sizeof(addr);
-
-		/* this is a blocking call */
-		conn_sock = accept(mp_socket_fd, (struct sockaddr *) &addr,
-				&sockaddr_len);
-
-		/* just restart on error */
-		if (conn_sock == -1)
-			continue;
-
-		/* set socket to linger after close */
-		struct linger l;
-		l.l_onoff = 1;
-		l.l_linger = 60;
-
-		if (setsockopt(conn_sock, SOL_SOCKET, SO_LINGER, &l, sizeof(l)) < 0)
-			RTE_LOG(WARNING, EAL, "Cannot set SO_LINGER option "
-					"on listen socket (%s)\n", strerror(errno));
-
-		ret = vfio_mp_sync_receive_request(conn_sock);
-
-		switch (ret) {
-		case SOCKET_REQ_CONTAINER:
-			fd = vfio_get_container_fd();
-			if (fd < 0)
-				vfio_mp_sync_send_request(conn_sock, SOCKET_ERR);
-			else
-				vfio_mp_sync_send_fd(conn_sock, fd);
-			if (fd >= 0)
-				close(fd);
-			break;
-		case SOCKET_REQ_GROUP:
-			/* wait for group number */
-			vfio_data = vfio_mp_sync_receive_request(conn_sock);
-			if (vfio_data < 0) {
-				close(conn_sock);
-				continue;
-			}
-
-			fd = vfio_get_group_fd(vfio_data);
-
-			if (fd < 0)
-				vfio_mp_sync_send_request(conn_sock, SOCKET_ERR);
+	switch (p->req) {
+	case SOCKET_REQ_GROUP:
+		r.req = SOCKET_REQ_GROUP;
+		r.group_no = p->group_no;
+		fds[0] = vfio_get_group_fd(p->group_no);
+		if (fds[0] < 0) {
+			r.result = SOCKET_ERR;
+			rte_eal_mp_sendmsg("vfio", &r, sizeof(r), NULL, 0, 0);
+		} else if (fds[0] == 0) {
 			/* if VFIO group exists but isn't bound to VFIO driver */
-			else if (fd == 0)
-				vfio_mp_sync_send_request(conn_sock, SOCKET_NO_FD);
+			r.result = SOCKET_NO_FD;
+			rte_eal_mp_sendmsg("vfio", &r, sizeof(r), NULL, 0, 0);
+		} else {
 			/* if group exists and is bound to VFIO driver */
-			else {
-				vfio_mp_sync_send_request(conn_sock, SOCKET_OK);
-				vfio_mp_sync_send_fd(conn_sock, fd);
-			}
-			break;
-		case SOCKET_CLR_GROUP:
-			/* wait for group fd */
-			vfio_data = vfio_mp_sync_receive_request(conn_sock);
-			if (vfio_data < 0) {
-				close(conn_sock);
-				continue;
-			}
-
-			ret = clear_group(vfio_data);
-
-			if (ret < 0)
-				vfio_mp_sync_send_request(conn_sock, SOCKET_NO_FD);
-			else
-				vfio_mp_sync_send_request(conn_sock, SOCKET_OK);
-			break;
-		default:
-			vfio_mp_sync_send_request(conn_sock, SOCKET_ERR);
-			break;
+			r.result = SOCKET_OK;
+			rte_eal_mp_sendmsg("vfio", &r, sizeof(r), fds, 1, 0);
 		}
-		close(conn_sock);
+		break;
+	case SOCKET_REQ_CONTAINER:
+		r.req = SOCKET_REQ_CONTAINER;
+		fds[0] = vfio_get_container_fd();
+		rte_eal_mp_sendmsg("vfio", &r, sizeof(r), fds, 1, 0);
+		break;
+	default:
+		RTE_LOG(ERR, EAL, "vfio received invalid message!\n");
+		return -1;
 	}
+
+	return 0;
 }
 
 static int
-vfio_mp_sync_socket_setup(void)
+vfio_mp_secondary(const void *params, int len, int fds[],
+		  int fds_num __rte_unused)
 {
-	int ret, socket_fd;
-	struct sockaddr_un addr;
-	socklen_t sockaddr_len;
-
-	/* set up a socket */
-	socket_fd = socket(AF_UNIX, SOCK_SEQPACKET, 0);
-	if (socket_fd < 0) {
-		RTE_LOG(ERR, EAL, "Failed to create socket!\n");
-		return -1;
-	}
-
-	get_socket_path(addr.sun_path, sizeof(addr.sun_path));
-	addr.sun_family = AF_UNIX;
-
-	sockaddr_len = sizeof(struct sockaddr_un);
+	const struct vfio_mp_param *p = params;
 
-	unlink(addr.sun_path);
-
-	ret = bind(socket_fd, (struct sockaddr *) &addr, sockaddr_len);
-	if (ret) {
-		RTE_LOG(ERR, EAL, "Failed to bind socket: %s!\n", strerror(errno));
-		close(socket_fd);
+	if (len != sizeof(*p)) {
+		RTE_LOG(ERR, EAL, "vfio received invalid message!\n");
 		return -1;
 	}
 
-	ret = listen(socket_fd, 50);
-	if (ret) {
-		RTE_LOG(ERR, EAL, "Failed to listen: %s!\n", strerror(errno));
-		close(socket_fd);
+	switch (p->req) {
+	case SOCKET_REQ_GROUP:
+		switch (p->result) {
+		case SOCKET_NO_FD:
+			return 0;
+		case SOCKET_OK:
+			if (fds_num == 1 && fds[0] > 0)
+				return fds[0];
+			/* fall-through on error */
+		default:
+			RTE_LOG(ERR, EAL, "  cannot get group fd!\n");
+			return -1;
+		}
+	case SOCKET_REQ_CONTAINER:
+		if (fds_num == 1 && fds[0] > 0)
+			return fds[0];
 		return -1;
+	default:
+		RTE_LOG(ERR, EAL, "Invalid req!\n");
 	}
-
-	/* save the socket in local configuration */
-	mp_socket_fd = socket_fd;
-
-	return 0;
+	return -1;
 }
 
-/*
- * set up a local socket and tell it to listen for incoming connections
- */
 int
 vfio_mp_sync_setup(void)
 {
-	int ret;
-	char thread_name[RTE_MAX_THREAD_NAME_LEN];
+	rte_eal_mp_t action;
+	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+		action = vfio_mp_primary;
+	else
+		action = vfio_mp_secondary;
 
-	if (vfio_mp_sync_socket_setup() < 0) {
-		RTE_LOG(ERR, EAL, "Failed to set up local socket!\n");
-		return -1;
-	}
-
-	ret = pthread_create(&socket_thread, NULL,
-			vfio_mp_sync_thread, NULL);
-	if (ret) {
-		RTE_LOG(ERR, EAL,
-			"Failed to create thread for communication with secondary processes!\n");
-		close(mp_socket_fd);
-		return -1;
-	}
-
-	/* Set thread_name for aid in debugging. */
-	snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "vfio-sync");
-	ret = rte_thread_setname(socket_thread, thread_name);
-	if (ret)
-		RTE_LOG(DEBUG, EAL,
-			"Failed to set thread name for secondary processes!\n");
-
-	return 0;
+	return rte_eal_mp_action_register("vfio", action);
 }
 
 #endif
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 88+ messages in thread

* Re: [PATCH 0/3] generic channel for multi-process communication
  2017-11-30 18:44 [PATCH 0/3] generic channel for multi-process communication Jianfeng Tan
                   ` (2 preceding siblings ...)
  2017-11-30 18:44 ` [PATCH 3/3] vfio: use the generic multi-process channel Jianfeng Tan
@ 2017-12-11  9:59 ` Burakov, Anatoly
  2017-12-12  7:34   ` Tan, Jianfeng
  2018-01-11  4:07 ` [PATCH v2 0/4] " Jianfeng Tan
                   ` (5 subsequent siblings)
  9 siblings, 1 reply; 88+ messages in thread
From: Burakov, Anatoly @ 2017-12-11  9:59 UTC (permalink / raw)
  To: Jianfeng Tan, dev; +Cc: bruce.richardson, konstantin.ananyev, thomas

On 30-Nov-17 6:44 PM, Jianfeng Tan wrote:
> This patchset adds a generic channel for multi-process (primary/secondary)
> communication.
> 
> Patch 1: address the purpose and howto;
> Patch 2: add a synchronous way for those messages which need a response immediately.
> Patch 3: Rework vfio to use this generic communication channel.
> 

Hi Jianfeng,

Just a general comment: I am assuming this has the limitation of 
"everything happens through primary process's involvement". This will 
work for VFIO, as secondary always needs to ask the primary before doing 
anything, but it doesn't address other issues that could have been 
addressed with IPC.

For example, if a primary process would've hotplugged a device, it can't 
notify all secondary processes about this; rather, it has to wait until 
secondary processes ask for this info. Neither can it do anything if 
secondary requests a primary to do something, and notify other secondary 
processes about it (i.e. if secondary wants to hotplug a device, but 
there are other secondaries also running). It would be great to have a 
standard way of doing things like this in future revisions of our IPC.

-- 
Thanks,
Anatoly

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [PATCH 1/3] eal: add channel for multi-process communication
  2017-11-30 18:44 ` [PATCH 1/3] eal: add " Jianfeng Tan
@ 2017-12-11 11:04   ` Burakov, Anatoly
  2017-12-11 16:43   ` Ananyev, Konstantin
  1 sibling, 0 replies; 88+ messages in thread
From: Burakov, Anatoly @ 2017-12-11 11:04 UTC (permalink / raw)
  To: Jianfeng Tan, dev; +Cc: bruce.richardson, konstantin.ananyev, thomas

On 30-Nov-17 6:44 PM, Jianfeng Tan wrote:
> Previously, there were three channels for multi-process
> (i.e., primary/secondary) communication.
>    1. Config-file based channel, in which, the primary process writes
>       info into a pre-defined config file, and the secondary process
>       reads info out.
>    2. vfio submodule has its own channel based on unix socket for the
>       secondary process to get container fd and group fd from the
>       primary process.
>    3. pdump submodule also has its own channel based on unix socket for
>       packet dump.
> 
> It'll be good to have a generic communication channel for multi-process
> communication to accommodate the requirements including:
>    a. Secondary wants to send info to primary, for example, secondary
>       would like to send request (about some specific vdev to primary).
>    b. Sending info at any time, instead of just initialization time.
>    c. Share FDs with the other side, for vdev like vhost, related FDs
>       (memory region, kick) should be shared.
>    d. A send message request needs the other side to respond immediately.
> 
> This patch proposes to create a communication channel, as an unix
> socket connection, for above requirements. Primary will listen on
> the unix socket; secondary will connect this socket to talk.
> 
> Three new APIs are added:
> 
>    1. rte_eal_mp_action_register is used to register an action,
>       indexed by a string, if the calling component wants to
>       respond to the messages from the corresponding component in
>       its primary process or secondary processes.
>    2. rte_eal_mp_action_unregister is used to unregister the action
>       if the calling component does not want to respond to the messages.
>    3. rte_eal_mp_sendmsg is used to send a message.
> 
> Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
> ---

<...snip...>

> +
> +int
> +rte_eal_mp_action_register(const char *action_name, rte_eal_mp_t action)
> +{
> +	struct action_entry *entry = malloc(sizeof(struct action_entry));
> +
> +	if (entry == NULL)
> +		return -ENOMEM;
> +
> +	if (find_action_entry_by_name(action_name) != NULL)
> +		return -EEXIST;

This should probably do a free(entry).

> +
> +	strncpy(entry->action_name, action_name, MAX_ACTION_NAME_LEN);
> +	entry->action = action;
> +	TAILQ_INSERT_TAIL(&action_entry_list, entry, next);
> +	return 0;
> +}
> +

<...snip...>

> +
> +static int
> +add_secondary(void)
> +{
> +	int fd;
> +	struct epoll_event ev;
> +
> +	while (1) {
> +		fd = accept(mp_fds.listen, NULL, NULL);
> +		if (fd < 0 && errno == EAGAIN)
> +			break;
> +		else if (fd < 0) {
> +			RTE_LOG(ERR, EAL, "primary failed to accept: %s\n",
> +				strerror(errno));
> +			return -1;
> +		}
> +
> +		ev.events = EPOLLIN | EPOLLRDHUP;
> +		ev.data.fd = fd;
> +		if (epoll_ctl(mp_fds.efd, EPOLL_CTL_ADD, fd, &ev) < 0) {
> +			RTE_LOG(ERR, EAL, "failed to add secondary: %s\n",
> +				strerror(errno));
> +			break;
> +		}
> +		if (add_sec_proc(fd) < 0) {
> +			RTE_LOG(ERR, EAL, "too many secondary processes\n");
> +			close(fd);
> +			break;
> +		}
> +	}
> +
> +	return 0;
> +}
> +
> +static void *
> +mp_handler(void *arg __rte_unused)
> +{
> +	int fd;
> +	int i, n;
> +	struct epoll_event ev;
> +	struct epoll_event *events;
> +	int is_primary = rte_eal_process_type() == RTE_PROC_PRIMARY;
> +
> +	ev.events = EPOLLIN | EPOLLRDHUP;
> +	ev.data.fd = (is_primary) ? mp_fds.listen : mp_fds.primary;
> +	if (epoll_ctl(mp_fds.efd, EPOLL_CTL_ADD, ev.data.fd, &ev) < 0) {
> +		RTE_LOG(ERR, EAL, "failed to epoll_ctl: %s\n",
> +			strerror(errno));
> +		exit(EXIT_FAILURE);

rte_exit?

> +	}
> +
> +	events = calloc(20, sizeof ev);
> +
> +	while (1) {
> +		n = epoll_wait(mp_fds.efd, events, 20, -1);
> +		for (i = 0; i < n; i++) {
> +			if (is_primary && events[i].data.fd == mp_fds.listen) {
> +				if (events[i].events != EPOLLIN) {
> +					RTE_LOG(ERR, EAL, "what happens?\n");

More descriptive error message would be nice :)

> +					exit(EXIT_FAILURE);

rte_exit?

> +				}
> +
> +				if (add_secondary() < 0)
> +					break;

Doing epoll_ctl in multiple different places hurts readability IMO. 
Might be a good idea to refactor add_secondary and mp_handler in a way 
that keeps all epoll handling in one place.

> +
> +				continue;
> +			}
> +
> +			fd = events[i].data.fd;
> +
> +			if ((events[i].events & EPOLLIN)) {
> +				if (process_msg(fd) < 0) {
> +					RTE_LOG(ERR, EAL,
> +						"failed to process msg\n");
> +					if (!is_primary)
> +						exit(EXIT_FAILURE);

rte_exit()?

> +				}
> +				continue;
> +			}
> +
> +			/* EPOLLERR, EPOLLHUP, etc */
> +			if (is_primary) {
> +				RTE_LOG(ERR, EAL, "secondary exit: %d\n", fd);
> +				epoll_ctl(mp_fds.efd, EPOLL_CTL_DEL, fd, NULL);
> +				del_sec_proc(fd);
> +				close(fd);
> +			} else {
> +				RTE_LOG(ERR, EAL, "primary exits, so do I\n");
> +				/* Exit secondary when primary exits? */
> +				exit(EXIT_FAILURE);

This is changing previous behavior. I don't think exiting secondary when 
primary exits is something we want to do, so i would just print an 
error, but not exit the process.

> +			}
> +		}
> +	}
> +
> +	return NULL;
> +}
> +
> +int
> +rte_eal_mp_channel_init(void)
> +{
> +	int i, fd, ret;
> +	const char *path;
> +	struct sockaddr_un un;
> +	pthread_t tid;
> +	char thread_name[RTE_MAX_THREAD_NAME_LEN];
> +
> +	mp_fds.efd = epoll_create1(0);
> +	if (mp_fds.efd < 0) {
> +		RTE_LOG(ERR, EAL, "epoll_create1 failed\n");
> +		return -1;
> +	}
> +
> +	fd = socket(AF_UNIX, SOCK_STREAM, 0);
> +	if (fd < 0) {
> +		RTE_LOG(ERR, EAL, "Failed to create unix socket\n");
> +		return -1;
> +	}
> +
> +	memset(&un, 0, sizeof(un));
> +	un.sun_family = AF_UNIX;
> +	path = eal_mp_unix_path();
> +	strncpy(un.sun_path, path, sizeof(un.sun_path));
> +	un.sun_path[sizeof(un.sun_path) - 1] = '\0';
> +
> +	if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
> +		for (i = 0; i < MAX_SECONDARY_PROCS; ++i)
> +			mp_fds.secondaries[i] = -1;
> +
> +		if (fcntl(fd, F_SETFL, O_NONBLOCK) < 0) {
> +			RTE_LOG(ERR, EAL, "cannot set nonblocking mode\n");
> +			close(fd);
> +			return -1;
> +		}
> +
> +		/* The file still exists since last run */
> +		unlink(path);
> +
> +		ret = bind(fd, (struct sockaddr *)&un, sizeof(un));
> +		if (ret < 0) {
> +			RTE_LOG(ERR, EAL, "failed to bind to %s: %s\n",
> +				path, strerror(errno));
> +			close(fd);
> +			return -1;
> +		}
> +		RTE_LOG(INFO, EAL, "primary bind to %s\n", path);
> +
> +		ret = listen(fd, 1024);
> +		if (ret < 0) {
> +			RTE_LOG(ERR, EAL, "failed to listen: %s\n",
> +				strerror(errno));
> +			close(fd);
> +			return -1;
> +		}
> +		mp_fds.listen = fd;
> +	} else {
> +		ret = connect(fd, (struct sockaddr *)&un, sizeof(un));
> +		if (ret < 0) {
> +			RTE_LOG(ERR, EAL, "failed to connect primary\n");
> +			return -1;

Do we want to prevent secondary from launching if it can't connect to 
primary? Some use cases might rely on previous behavior. Maybe instead 
add some checks in handling functions to ensure that we have a valid 
connection to the primary before doing anything?

> +		}
> +		mp_fds.primary = fd;
> +	}
> +
> +	ret = pthread_create(&tid, NULL, mp_handler, NULL);
> +	if (ret < 0) {
> +		RTE_LOG(ERR, EAL, "failed to create thead: %s\n",
> +			strerror(errno));
> +		close(fd);
> +		close(mp_fds.efd);
> +		return -1;
> +	}

<...snip...>

> +	if (fds_num > SCM_MAX_FD) {
> +		RTE_LOG(ERR, EAL,
> +			"Cannot send more than %d FDs\n", SCM_MAX_FD);
> +		return -E2BIG;
> +	}
> +
> +	len_msg = sizeof(struct msg_hdr) + len_params;
> +	if (len_msg > MAX_MESSAGE_LENGTH) {
> +		RTE_LOG(ERR, EAL, "Message is too long\n");
> +		return -ENOMEM;

Nitpicking, but is this really -ENOMEM? Shouldn't this be -EINVAL or 
-E2BIG? Also, this is external API - maybe return -1 and set rte_errno?

> +	}
> +
> +	RTE_LOG(INFO, EAL, "send msg: %s, %d\n", action_name, len_msg);

Do we want this as INFO, not DEBUG?

> +
> +	msg = malloc(len_msg);
> +	if (!msg) {
> +		RTE_LOG(ERR, EAL, "Cannot alloc memory for msg\n");
> +		return -ENOMEM;
> +	}

<...snip...>

>   
>   /**
> + * Action function typedef used by other components.
> + *
> + * As we create unix socket channel for primary/secondary communication, use
> + * this function typedef to register action for coming messages.
> + */
> +typedef int (*rte_eal_mp_t)(const void *params, int len,
> +			    int fds[], int fds_num);

Nitpicking, but probably needs newlines before comments, here and after 
next function definition.

> +/**
> + * Register an action function for primary/secondary communication.
> + *
> + * Call this function to register an action, if the calling component wants
> + * to response the messages from the corresponding component in its primary
> + * process or secondary processes.
> + *
> + * @param action_name
> + *   The action_name argument plays as the nonredundant key to find the action.
> + *
> + * @param action
> + *   The action argument is the function pointer to the action function.
> + *
> + * @return
> + *  - 0 on success.
> + *  - (<0) on failure.
> + */

<...snip...>

> diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
> index 229eec9..a84eab4 100644
> --- a/lib/librte_eal/linuxapp/eal/eal.c
> +++ b/lib/librte_eal/linuxapp/eal/eal.c
> @@ -896,6 +896,15 @@ rte_eal_init(int argc, char **argv)
>   
>   	eal_check_mem_on_local_socket();
>   
> +	if (rte_eal_mp_channel_init() < 0) {
> +		rte_eal_init_alert("failed to init mp channel\n");
> +		rte_errno = EFAULT;
> +		return -1;
> +	}

As noted above, maybe only fail if it's primary process?

> +
> +	if (eal_plugins_init() < 0)
> +		rte_eal_init_alert("Cannot init plugins\n");

This is probably a leftover of some other patch?

> +
>   	eal_thread_init_master(rte_config.master_lcore);
>   
>   	ret = eal_thread_dump_affinity(cpuset, RTE_CPU_AFFINITY_STR_LEN);
> diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
> index f4f46c1..6762397 100644
> --- a/lib/librte_eal/rte_eal_version.map
> +++ b/lib/librte_eal/rte_eal_version.map
> @@ -235,4 +235,26 @@ EXPERIMENTAL {
>   	rte_service_set_stats_enable;
>   	rte_service_start_with_defaults;
>   
> +} DPDK_17.08;
> +
> +DPDK_17.11 {
> +	global:
> +
> +	rte_bus_get_iommu_class;
> +	rte_eal_iova_mode;
> +	rte_eal_mbuf_default_mempool_ops;
> +	rte_lcore_has_role;
> +	rte_memcpy_ptr;
> +	rte_pci_get_iommu_class;
> +	rte_pci_match;
> +
> +} DPDK_17.08;
> +

Same here, this looks like leftovers of rebase.

> +DPDK_18.02 {
> +	global:
> +
> +	rte_eal_mp_action_register;
> +	rte_eal_mp_action_unregister;
> +	rte_eal_mp_sendmsg;
> +
>   } DPDK_17.11;
> 


-- 
Thanks,
Anatoly

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [PATCH 2/3] eal: add synchronous multi-process communication
  2017-11-30 18:44 ` [PATCH 2/3] eal: add synchronous " Jianfeng Tan
@ 2017-12-11 11:39   ` Burakov, Anatoly
  2017-12-11 16:49     ` Ananyev, Konstantin
  0 siblings, 1 reply; 88+ messages in thread
From: Burakov, Anatoly @ 2017-12-11 11:39 UTC (permalink / raw)
  To: Jianfeng Tan, dev; +Cc: bruce.richardson, konstantin.ananyev, thomas

On 30-Nov-17 6:44 PM, Jianfeng Tan wrote:
> We need the synchronous way for multi-process communication, that
> is to say we need an immediate response after we send a message
> to the other side.
> 
> We will stop the mp_handler thread, and after sending the message,
> the send thread will wait there for the response and process
> it.
> 
> Suggested-by: Anatoly Burakov <anatoly.burakov@intel.com>
> Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
> ---
>   lib/librte_eal/common/eal_common_proc.c | 53 +++++++++++++++++++++++++++++++--
>   lib/librte_eal/common/include/rte_eal.h |  5 +++-
>   2 files changed, 55 insertions(+), 3 deletions(-)
> 
> diff --git a/lib/librte_eal/common/eal_common_proc.c b/lib/librte_eal/common/eal_common_proc.c
> index 5d0a095..65ebaf2 100644
> --- a/lib/librte_eal/common/eal_common_proc.c
> +++ b/lib/librte_eal/common/eal_common_proc.c
> @@ -30,6 +30,8 @@
>    *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
>    */
>   
> +#define _GNU_SOURCE
> +

shouldn't this be in Makefile flags?

>   #include <stdio.h>
>   #include <fcntl.h>
>   #include <stdlib.h>
> @@ -41,6 +43,8 @@
>   #include <sys/un.h>
>   #include <errno.h>
>   #include <pthread.h>
> +#include <sys/eventfd.h>
> +#include <signal.h>
>   
>   #include <rte_log.h>
>   #include <rte_eal.h>
> @@ -134,6 +138,7 @@ rte_eal_mp_action_unregister(const char *name)
>   
>   struct mp_fds {
>   	int efd;
> +	int evfd; /* eventfd used for pausing mp_handler thread */
>   
>   	union {
>   		/* fds for primary process */
> @@ -331,6 +336,13 @@ mp_handler(void *arg __rte_unused)
>   		exit(EXIT_FAILURE);
>   	}
>   
> +	ev.data.fd = mp_fds.evfd;
> +	if (epoll_ctl(mp_fds.efd, EPOLL_CTL_ADD, ev.data.fd, &ev) < 0) {
> +		RTE_LOG(ERR, EAL, "epoll_ctl failed: %s\n",
> +			strerror(errno));
> +		exit(EXIT_FAILURE);

here and in other places - rte_exit?

> +	}
> +
>   	events = calloc(20, sizeof ev);
>   
>   	while (1) {
> @@ -348,6 +360,14 @@ mp_handler(void *arg __rte_unused)
>   				continue;
>   			}
>   
> +			if (events[i].data.fd == mp_fds.evfd) {
> +				RTE_LOG(INFO, EAL, "mp_handler thread will pause\n");
> +				pause();
> +				RTE_LOG(INFO, EAL, "mp_handler thread stops pausing\n");
> +
> +				continue;
> +			}
> +
>   			fd = events[i].data.fd;
>   
>   			if ((events[i].events & EPOLLIN)) {
> @@ -377,13 +397,14 @@ mp_handler(void *arg __rte_unused)
>   	return NULL;
>   }
>   
> +static pthread_t tid;
> +
>   int
>   rte_eal_mp_channel_init(void)
>   {
>   	int i, fd, ret;
>   	const char *path;
>   	struct sockaddr_un un;
> -	pthread_t tid;
>   	char thread_name[RTE_MAX_THREAD_NAME_LEN];
>   
>   	mp_fds.efd = epoll_create1(0);
> @@ -462,6 +483,8 @@ rte_eal_mp_channel_init(void)
>   		return -1;
>   	}
>   
> +	mp_fds.evfd = eventfd(0, 0);
> +
>   	return 0;
>   }
>   
> @@ -485,7 +508,8 @@ rte_eal_mp_sendmsg(const char *action_name,
>   		   const void *params,
>   		   int len_params,
>   		   int fds[],
> -		   int fds_num)
> +		   int fds_num,
> +		   int need_ack)

I think "need_ack" is a misnomer because what we really want is not 
"ack" but a response.

More importantly, i think for clarity's sake, this should be a separate 
function - something like rte_eal_mp_sendreq() or maybe a better name 
(reqdata? communicate?). Also, i don't think reusing send parameters is 
a good idea - a user is expecting a response, so a user allocates data 
for a response separately from requests, and passes it explicitly.

>   {
>   	int i;
>   	int ret = 0;
> @@ -511,6 +535,11 @@ rte_eal_mp_sendmsg(const char *action_name,
>   
>   	RTE_LOG(INFO, EAL, "send msg: %s, %d\n", action_name, len_msg);
>   
> +	if (need_ack) {
> +		// stop mp_handler thread.

Do we accept C++-style comments?

> +		eventfd_write(mp_fds.evfd, (eventfd_t)1);
> +	}
> +
>   	msg = malloc(len_msg);
>   	if (!msg) {
>   		RTE_LOG(ERR, EAL, "Cannot alloc memory for msg\n");
> @@ -547,12 +576,32 @@ rte_eal_mp_sendmsg(const char *action_name,
>   			ret = send_msg(mp_fds.secondaries[i], &msgh);
>   			if (ret < 0)
>   				break;
> +
> +			if (need_ack) {
> +				/* We will hang there until the other side
> +				 * responses and what if other side is sending
> +				 * msg at the same time?
> +				 */
> +				process_msg(mp_fds.secondaries[i]);
> +			}
>   		}
>   	} else {
>   		ret = send_msg(mp_fds.primary, &msgh);
> +
> +		if (ret > 0 && need_ack) {
> +			// We will hang there until the other side responses
> +			ret = process_msg(mp_fds.primary);
> +		}
>   	}
>   
>   	free(msg);
>   
> +	if (need_ack) {
> +		// start mp_handler thread.
> +		union sigval value;

it's not used, but still, maybe zero-initialize it?

> +
> +		pthread_sigqueue(tid, 0, value);
> +	}
> +
>   	return ret;
>   }
> diff --git a/lib/librte_eal/common/include/rte_eal.h b/lib/librte_eal/common/include/rte_eal.h
> index 8776bcf..9875cae 100644
> --- a/lib/librte_eal/common/include/rte_eal.h
> +++ b/lib/librte_eal/common/include/rte_eal.h
> @@ -274,13 +274,16 @@ void rte_eal_mp_action_unregister(const char *name);
>    * @param fds_num
>    *   The fds_num argument is number of fds to be sent with sendmsg.
>    *
> + * @param need_ack
> + *   The need_ack argument is non-zero if the sender must wait for a response.
> + *
>    * @return
>    *  - (>=0) on success.
>    *  - (<0) on failure.
>    */
>   int
>   rte_eal_mp_sendmsg(const char *action_name, const void *params,
> -		   int len_params, int fds[], int fds_num);
> +		   int len_params, int fds[], int fds_num, int need_ack);
>   
>   /**
>    * Usage function typedef used by the application usage function.
> 


-- 
Thanks,
Anatoly

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [PATCH 3/3] vfio: use the generic multi-process channel
  2017-11-30 18:44 ` [PATCH 3/3] vfio: use the generic multi-process channel Jianfeng Tan
@ 2017-12-11 12:01   ` Burakov, Anatoly
  0 siblings, 0 replies; 88+ messages in thread
From: Burakov, Anatoly @ 2017-12-11 12:01 UTC (permalink / raw)
  To: Jianfeng Tan, dev; +Cc: bruce.richardson, konstantin.ananyev, thomas

On 30-Nov-17 6:44 PM, Jianfeng Tan wrote:
> Previously, vfio has its own channel for the secondary process to
> get container fd and group fd from the primary process.
> 
> This patch changes to use the generic mp channel.
> 
> Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
> ---
>   lib/librte_eal/linuxapp/eal/eal.c              |  14 +-
>   lib/librte_eal/linuxapp/eal/eal_vfio.c         | 139 +++------
>   lib/librte_eal/linuxapp/eal/eal_vfio.h         |  15 +-
>   lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c | 416 ++++---------------------
>   4 files changed, 109 insertions(+), 475 deletions(-)
> 
> diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
> index a84eab4..93824bf 100644
> --- a/lib/librte_eal/linuxapp/eal/eal.c
> +++ b/lib/librte_eal/linuxapp/eal/eal.c

<...snip...>

> -		default:
> -			RTE_LOG(ERR, EAL, "  cannot get container fd!\n");
> -			close(socket_fd);
> -			return -1;
> -		}
> +	ret = rte_eal_mp_sendmsg("vfio", &p, sizeof(p), NULL, 0, 1);
> +	if (ret < 0) {
> +		RTE_LOG(ERR, EAL, "  cannot request group fd!\n");
> +		cur_grp->group_no = -1;
> +	} else {
> +		cur_grp->group_no = iommu_group_no;
> +		vfio_cfg.vfio_active_groups++;
>   	}

Either I'm missing something here, or I don't see where we actually 
store the group fd (e.g. the "cur_grp->fd = vfio_group_fd" part from the 
previous code).

Also, this is why i mentioned "receive parameters" in comments to 
previous patch - looking at this code, it is quite unclear that the 
return from rte_eal_mp_sendmsg is either error or, well, "something", 
defined as "whatever mp_action returns". It would be much clearer if we 
were explicitly getting some data in response.

> -	return -1;
> +
> +	return ret;
>   }
>   
>   

<...snip...>

> +	/* For secondary process, request container fd from primary process */
> +
> +	p.req = SOCKET_REQ_CONTAINER;
> +
> +	ret = rte_eal_mp_sendmsg("vfio", &p, sizeof(p), NULL, 0, 1);
> +	if (ret < 0)
> +		RTE_LOG(ERR, EAL, "  cannot request container fd!\n");

Again here, looks counter-intuitive to get container fd in return - it 
would've been much clearer to have a separate response parameter.

> +
> +	return ret;
>   }
>   

<...snip...>


>   
>   static int
> -vfio_mp_sync_socket_setup(void)
> +vfio_mp_secondary(const void *params, int len, int fds[],
> +		  int fds_num __rte_unused)

fds_num isn't unused here.

>   {


-- 
Thanks,
Anatoly

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [PATCH 1/3] eal: add channel for multi-process communication
  2017-11-30 18:44 ` [PATCH 1/3] eal: add " Jianfeng Tan
  2017-12-11 11:04   ` Burakov, Anatoly
@ 2017-12-11 16:43   ` Ananyev, Konstantin
  1 sibling, 0 replies; 88+ messages in thread
From: Ananyev, Konstantin @ 2017-12-11 16:43 UTC (permalink / raw)
  To: Tan, Jianfeng, dev; +Cc: Burakov, Anatoly, Richardson, Bruce, thomas

Hi Jianfeng,

> -----Original Message-----
> From: Tan, Jianfeng
> Sent: Thursday, November 30, 2017 6:44 PM
> To: dev@dpdk.org
> Cc: Burakov, Anatoly <anatoly.burakov@intel.com>; Richardson, Bruce <bruce.richardson@intel.com>; Ananyev, Konstantin
> <konstantin.ananyev@intel.com>; thomas@monjalon.net; Tan, Jianfeng <jianfeng.tan@intel.com>
> Subject: [PATCH 1/3] eal: add channel for multi-process communication
> 
> Previouly, there are three channels for multi-process
> (i.e., primary/secondary) communication.
>   1. Config-file based channel, in which, the primary process writes
>      info into a pre-defined config file, and the secondary process
>      reads info out.
>   2. vfio submodule has its own channel based on unix socket for the
>      secondary process to get container fd and group fd from the
>      primary process.
>   3. pdump submodule also has its own channel based on unix socket for
>      packet dump.
> 
> It'll be good to have a generic communication channel for multi-process
> communication to accomodate the requirements including:
>   a. Secondary wants to send info to primary, for example, secondary
>      would like to send request (about some specific vdev to primary).
>   b. Sending info at any time, instead of just initialization time.
>   c. Share FDs with the other side, for vdev like vhost, related FDs
>      (memory region, kick) should be shared.
>   d. A send message request needs the other side to response immediately.
> 
> This patch proposes to create a communication channel, as an unix
> socket connection, for above requirements. Primary will listen on
> the unix socket; secondary will connect this socket to talk.

Kind of a generic question - why do you need a connection-oriented socket here?
Why wouldn't a connection-less socket be enough?
In that case you don't need to do listen/accept, and you don't need an epoll() loop.
Instead, with a connection-less socket you can just use a blocking recvmsg()
inside mp_handler().

> 
> Three new APIs are added:
> 
>   1. rte_eal_mp_action_register is used to register an action,
>      indexed by a string; if the calling component wants to
>      response the messages from the corresponding component in
>      its primary process or secondary processes.
>   2. rte_eal_mp_action_unregister is used to unregister the action
>      if the calling component does not want to response the messages.

I think you need some sort of synchronization between action_register/unregister()
and action_process() - mutex_lock or so.
Another thing - as I understand it, you use a string as the message/action identifier?
I think you need to limit max length of it.
Konstantin

>   3. rte_eal_mp_sendmsg is used to send a message.
> 
> Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
> ---
>  lib/librte_eal/common/eal_common_proc.c | 497 ++++++++++++++++++++++++++++++++
>  lib/librte_eal/common/eal_filesystem.h  |  18 ++
>  lib/librte_eal/common/eal_private.h     |  10 +
>  lib/librte_eal/common/include/rte_eal.h |  68 +++++
>  lib/librte_eal/linuxapp/eal/eal.c       |   9 +
>  lib/librte_eal/rte_eal_version.map      |  22 ++
>  6 files changed, 624 insertions(+)
> 
> diff --git a/lib/librte_eal/common/eal_common_proc.c b/lib/librte_eal/common/eal_common_proc.c
> index 60526ca..5d0a095 100644
> --- a/lib/librte_eal/common/eal_common_proc.c
> +++ b/lib/librte_eal/common/eal_common_proc.c
> @@ -33,8 +33,21 @@
>  #include <stdio.h>
>  #include <fcntl.h>
>  #include <stdlib.h>
> +#include <sys/types.h>
> +#include <sys/socket.h>
> +#include <sys/epoll.h>
> +#include <limits.h>
> +#include <unistd.h>
> +#include <sys/un.h>
> +#include <errno.h>
> +#include <pthread.h>
> +
> +#include <rte_log.h>
>  #include <rte_eal.h>
> +#include <rte_lcore.h>
> +#include <rte_common.h>
> 
> +#include "eal_private.h"
>  #include "eal_filesystem.h"
>  #include "eal_internal_cfg.h"
> 
> @@ -59,3 +72,487 @@ rte_eal_primary_proc_alive(const char *config_file_path)
> 
>  	return !!ret;
>  }
> +
> +struct action_entry {
> +	TAILQ_ENTRY(action_entry) next;      /**< Next attached action entry */
> +
> +#define MAX_ACTION_NAME_LEN	64
> +	char action_name[MAX_ACTION_NAME_LEN];
> +	rte_eal_mp_t action;
> +};
> +
> +/** Double linked list of actions. */
> +TAILQ_HEAD(action_entry_list, action_entry);
> +
> +static struct action_entry_list action_entry_list =
> +	TAILQ_HEAD_INITIALIZER(action_entry_list);
> +
> +static struct action_entry *
> +find_action_entry_by_name(const char *name)
> +{
> +	int len = strlen(name);
> +	struct action_entry *entry;
> +
> +	TAILQ_FOREACH(entry, &action_entry_list, next) {
> +		if (strncmp(entry->action_name, name, len) == 0)
> +			break;
> +	}
> +
> +	return entry;
> +}
> +
> +int
> +rte_eal_mp_action_register(const char *action_name, rte_eal_mp_t action)
> +{
> +	struct action_entry *entry = malloc(sizeof(struct action_entry));
> +
> +	if (entry == NULL)
> +		return -ENOMEM;
> +
> +	if (find_action_entry_by_name(action_name) != NULL)
> +		return -EEXIST;
> +
> +	strncpy(entry->action_name, action_name, MAX_ACTION_NAME_LEN);
> +	entry->action = action;
> +	TAILQ_INSERT_TAIL(&action_entry_list, entry, next);
> +	return 0;
> +}
> +
> +void
> +rte_eal_mp_action_unregister(const char *name)
> +{
> +	struct action_entry *entry = find_action_entry_by_name(name);
> +
> +	TAILQ_REMOVE(&action_entry_list, entry, next);
> +	free(entry);
> +}
> +
> +/* The maximum amount of fd for one recvmsg/sendmsg */
> +#define SCM_MAX_FD		253
> +#define MAX_SECONDARY_PROCS	8
> +#define MAX_MESSAGE_LENGTH	1024
> +
> +struct mp_fds {
> +	int efd;
> +
> +	union {
> +		/* fds for primary process */
> +		struct {
> +			int listen;
> +			/* fds used to send msg to secondary process(es) */
> +			int secondaries[MAX_SECONDARY_PROCS];
> +		};
> +
> +		/* fds for secondary process */
> +		struct {
> +			/* fds used to send msg to the primary process */
> +			int primary;
> +		};
> +	};
> +};
> +
> +static struct mp_fds mp_fds;
> +
> +struct msg_hdr {
> +	char action_name[MAX_ACTION_NAME_LEN];
> +	int fds_num;
> +	int len_params;
> +	char params[0];
> +} __rte_packed;
> +
> +static int
> +add_sec_proc(int fd)
> +{
> +	int i;
> +
> +	for (i = 0; i < MAX_SECONDARY_PROCS; ++i)
> +		if (mp_fds.secondaries[i] == -1)
> +			break;
> +
> +	if (i >= MAX_SECONDARY_PROCS)
> +		return -1;
> +
> +	mp_fds.secondaries[i] = fd;
> +
> +	return i;
> +}
> +
> +static void
> +del_sec_proc(int fd)
> +{
> +	int i;
> +
> +	for (i = 0; i < MAX_SECONDARY_PROCS; ++i) {
> +		if (mp_fds.secondaries[i] == fd) {
> +			mp_fds.secondaries[i] = -1;
> +			break;
> +		}
> +	}
> +}
> +
> +static int
> +read_msg(int sockfd, char *buf, int buflen, int *fds, int fds_num)
> +{
> +	struct iovec iov;
> +	struct msghdr msgh;
> +	size_t fdsize = fds_num * sizeof(int);
> +	char control[CMSG_SPACE(fdsize)];
> +	struct cmsghdr *cmsg;
> +	struct msg_hdr *hdr = (struct msg_hdr *)buf;
> +	int ret, total;
> +
> +	/* read msg_hdr */
> +	memset(&msgh, 0, sizeof(msgh));
> +	iov.iov_base = hdr;
> +	iov.iov_len  = sizeof(*hdr);
> +
> +	msgh.msg_iov = &iov;
> +	msgh.msg_iovlen = 1;
> +	msgh.msg_control = control;
> +	msgh.msg_controllen = sizeof(control);
> +
> +	ret = recvmsg(sockfd, &msgh, 0);
> +	if (ret != sizeof(struct msg_hdr)) {
> +		RTE_LOG(ERR, EAL, "recvmsg failed\n");
> +		return ret;
> +	}
> +
> +	if (msgh.msg_flags & (MSG_TRUNC | MSG_CTRUNC)) {
> +		RTE_LOG(ERR, EAL, "truncted msg\n");
> +		return -1;
> +	}
> +	total = ret;
> +
> +	/* read auxiliary FDs if any */
> +	for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL;
> +		cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
> +		if ((cmsg->cmsg_level == SOL_SOCKET) &&
> +			(cmsg->cmsg_type == SCM_RIGHTS)) {
> +			memcpy(fds, CMSG_DATA(cmsg), fdsize);
> +			break;
> +		}
> +	}
> +
> +	/* read params */
> +	if (hdr->len_params) {
> +		if (hdr->len_params > buflen - (int)sizeof(*hdr))
> +			rte_exit(EXIT_FAILURE, "params too long\n");
> +
> +		ret = read(sockfd, &hdr->params, hdr->len_params);
> +		if (ret != hdr->len_params)
> +			rte_exit(EXIT_FAILURE, "failed to recv params\n");
> +
> +		total += ret;
> +	}
> +
> +	RTE_LOG(INFO, EAL, "read msg: %s, %d\n", hdr->action_name,
> +		(int)sizeof(*hdr) + hdr->len_params);
> +	return total;
> +}
> +
> +static int
> +process_msg(int fd)
> +{
> +	int len;
> +	int params_len;
> +	char buf[MAX_MESSAGE_LENGTH];
> +	int fds[SCM_MAX_FD];
> +	struct msg_hdr *hdr;
> +	struct action_entry *entry;
> +
> +	len = read_msg(fd, buf, MAX_MESSAGE_LENGTH, fds, SCM_MAX_FD);
> +	if (len <= 0) {
> +		RTE_LOG(ERR, EAL, "failed to read message: %s\n",
> +			strerror(errno));
> +		return -1;
> +	}
> +
> +	hdr = (struct msg_hdr *) buf;
> +
> +	entry = find_action_entry_by_name(hdr->action_name);
> +	if (entry == NULL) {
> +		RTE_LOG(ERR, EAL, "cannot find action by: %s\n",
> +			hdr->action_name);
> +		return -1;
> +	}
> +
> +	params_len = len - sizeof(struct msg_hdr);
> +
> +	return entry->action(hdr->params, params_len, fds, hdr->fds_num);
> +}
> +
> +static int
> +add_secondary(void)
> +{
> +	int fd;
> +	struct epoll_event ev;
> +
> +	while (1) {
> +		fd = accept(mp_fds.listen, NULL, NULL);
> +		if (fd < 0 && errno == EAGAIN)
> +			break;
> +		else if (fd < 0) {
> +			RTE_LOG(ERR, EAL, "primary failed to accept: %s\n",
> +				strerror(errno));
> +			return -1;
> +		}
> +
> +		ev.events = EPOLLIN | EPOLLRDHUP;
> +		ev.data.fd = fd;
> +		if (epoll_ctl(mp_fds.efd, EPOLL_CTL_ADD, fd, &ev) < 0) {
> +			RTE_LOG(ERR, EAL, "failed to add secondary: %s\n",
> +				strerror(errno));
> +			break;
> +		}
> +		if (add_sec_proc(fd) < 0) {
> +			RTE_LOG(ERR, EAL, "too many secondary processes\n");
> +			close(fd);
> +			break;
> +		}
> +	}
> +
> +	return 0;
> +}
> +
> +static void *
> +mp_handler(void *arg __rte_unused)
> +{
> +	int fd;
> +	int i, n;
> +	struct epoll_event ev;
> +	struct epoll_event *events;
> +	int is_primary = rte_eal_process_type() == RTE_PROC_PRIMARY;
> +
> +	ev.events = EPOLLIN | EPOLLRDHUP;
> +	ev.data.fd = (is_primary) ? mp_fds.listen : mp_fds.primary;
> +	if (epoll_ctl(mp_fds.efd, EPOLL_CTL_ADD, ev.data.fd, &ev) < 0) {
> +		RTE_LOG(ERR, EAL, "failed to epoll_ctl: %s\n",
> +			strerror(errno));
> +		exit(EXIT_FAILURE);
> +	}
> +
> +	events = calloc(20, sizeof ev);
> +
> +	while (1) {
> +		n = epoll_wait(mp_fds.efd, events, 20, -1);
> +		for (i = 0; i < n; i++) {
> +			if (is_primary && events[i].data.fd == mp_fds.listen) {
> +				if (events[i].events != EPOLLIN) {
> +					RTE_LOG(ERR, EAL, "what happens?\n");
> +					exit(EXIT_FAILURE);
> +				}
> +
> +				if (add_secondary() < 0)
> +					break;
> +
> +				continue;
> +			}
> +
> +			fd = events[i].data.fd;
> +
> +			if ((events[i].events & EPOLLIN)) {
> +				if (process_msg(fd) < 0) {
> +					RTE_LOG(ERR, EAL,
> +						"failed to process msg\n");
> +					if (!is_primary)
> +						exit(EXIT_FAILURE);
> +				}
> +				continue;
> +			}
> +
> +			/* EPOLLERR, EPOLLHUP, etc */
> +			if (is_primary) {
> +				RTE_LOG(ERR, EAL, "secondary exit: %d\n", fd);
> +				epoll_ctl(mp_fds.efd, EPOLL_CTL_DEL, fd, NULL);
> +				del_sec_proc(fd);
> +				close(fd);
> +			} else {
> +				RTE_LOG(ERR, EAL, "primary exits, so do I\n");
> +				/* Exit secondary when primary exits? */
> +				exit(EXIT_FAILURE);
> +			}
> +		}
> +	}
> +
> +	return NULL;
> +}
> +
> +int
> +rte_eal_mp_channel_init(void)
> +{
> +	int i, fd, ret;
> +	const char *path;
> +	struct sockaddr_un un;
> +	pthread_t tid;
> +	char thread_name[RTE_MAX_THREAD_NAME_LEN];
> +
> +	mp_fds.efd = epoll_create1(0);
> +	if (mp_fds.efd < 0) {
> +		RTE_LOG(ERR, EAL, "epoll_create1 failed\n");
> +		return -1;
> +	}
> +
> +	fd = socket(AF_UNIX, SOCK_STREAM, 0);
> +	if (fd < 0) {
> +		RTE_LOG(ERR, EAL, "Failed to create unix socket\n");
> +		return -1;
> +	}
> +
> +	memset(&un, 0, sizeof(un));
> +	un.sun_family = AF_UNIX;
> +	path = eal_mp_unix_path();
> +	strncpy(un.sun_path, path, sizeof(un.sun_path));
> +	un.sun_path[sizeof(un.sun_path) - 1] = '\0';
> +
> +	if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
> +		for (i = 0; i < MAX_SECONDARY_PROCS; ++i)
> +			mp_fds.secondaries[i] = -1;
> +
> +		if (fcntl(fd, F_SETFL, O_NONBLOCK) < 0) {
> +			RTE_LOG(ERR, EAL, "cannot set nonblocking mode\n");
> +			close(fd);
> +			return -1;
> +		}
> +
> +		/* The file still exists since last run */
> +		unlink(path);
> +
> +		ret = bind(fd, (struct sockaddr *)&un, sizeof(un));
> +		if (ret < 0) {
> +			RTE_LOG(ERR, EAL, "failed to bind to %s: %s\n",
> +				path, strerror(errno));
> +			close(fd);
> +			return -1;
> +		}
> +		RTE_LOG(INFO, EAL, "primary bind to %s\n", path);
> +
> +		ret = listen(fd, 1024);
> +		if (ret < 0) {
> +			RTE_LOG(ERR, EAL, "failed to listen: %s\n",
> +				strerror(errno));
> +			close(fd);
> +			return -1;
> +		}
> +		mp_fds.listen = fd;
> +	} else {
> +		ret = connect(fd, (struct sockaddr *)&un, sizeof(un));
> +		if (ret < 0) {
> +			RTE_LOG(ERR, EAL, "failed to connect primary\n");
> +			return -1;
> +		}
> +		mp_fds.primary = fd;
> +	}
> +
> +	ret = pthread_create(&tid, NULL, mp_handler, NULL);
> +	if (ret < 0) {
> +		RTE_LOG(ERR, EAL, "failed to create thead: %s\n",
> +			strerror(errno));
> +		close(fd);
> +		close(mp_fds.efd);
> +		return -1;
> +	}
> +
> +	snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN,
> +		 "rte_mp_handle");
> +	ret = rte_thread_setname(tid, thread_name);
> +	if (ret < 0) {
> +		RTE_LOG(ERR, EAL, "failed to set thead name\n");
> +		close(fd);
> +		close(mp_fds.efd);
> +		return -1;
> +	}
> +
> +	return 0;
> +}
> +
> +static int
> +send_msg(int fd, struct msghdr *p_msgh)
> +{
> +	int ret;
> +
> +	do {
> +		ret = sendmsg(fd, p_msgh, 0);
> +	} while (ret < 0 && errno == EINTR);
> +
> +	if (ret < 0)
> +		RTE_LOG(ERR, EAL, "failed to send msg: %s\n", strerror(errno));
> +
> +	return ret;
> +}
> +
> +int
> +rte_eal_mp_sendmsg(const char *action_name,
> +		   const void *params,
> +		   int len_params,
> +		   int fds[],
> +		   int fds_num)
> +{
> +	int i;
> +	int ret = 0;
> +	struct msghdr msgh;
> +	struct iovec iov;
> +	size_t fd_size = fds_num * sizeof(int);
> +	char control[CMSG_SPACE(fd_size)];
> +	struct cmsghdr *cmsg;
> +	struct msg_hdr *msg;
> +	int len_msg;
> +
> +	if (fds_num > SCM_MAX_FD) {
> +		RTE_LOG(ERR, EAL,
> +			"Cannot send more than %d FDs\n", SCM_MAX_FD);
> +		return -E2BIG;
> +	}
> +
> +	len_msg = sizeof(struct msg_hdr) + len_params;
> +	if (len_msg > MAX_MESSAGE_LENGTH) {
> +		RTE_LOG(ERR, EAL, "Message is too long\n");
> +		return -ENOMEM;
> +	}
> +
> +	RTE_LOG(INFO, EAL, "send msg: %s, %d\n", action_name, len_msg);
> +
> +	msg = malloc(len_msg);
> +	if (!msg) {
> +		RTE_LOG(ERR, EAL, "Cannot alloc memory for msg\n");
> +		return -ENOMEM;
> +	}
> +	memset(msg, 0, len_msg);
> +	strcpy(msg->action_name, action_name);
> +	msg->fds_num = fds_num;
> +	msg->len_params = len_params;
> +	memcpy(msg->params, params, len_params);
> +
> +	memset(&msgh, 0, sizeof(msgh));
> +	memset(control, 0, sizeof(control));
> +
> +	iov.iov_base = (uint8_t *)msg;
> +	iov.iov_len = len_msg;
> +
> +	msgh.msg_iov = &iov;
> +	msgh.msg_iovlen = 1;
> +	msgh.msg_control = control;
> +	msgh.msg_controllen = sizeof(control);
> +
> +	cmsg = CMSG_FIRSTHDR(&msgh);
> +	cmsg->cmsg_len = CMSG_LEN(fd_size);
> +	cmsg->cmsg_level = SOL_SOCKET;
> +	cmsg->cmsg_type = SCM_RIGHTS;
> +	memcpy(CMSG_DATA(cmsg), fds, fd_size);
> +
> +	if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
> +		for (i = 0; i < MAX_SECONDARY_PROCS; ++i) {
> +			if (mp_fds.secondaries[i] == -1)
> +				continue;
> +
> +			ret = send_msg(mp_fds.secondaries[i], &msgh);
> +			if (ret < 0)
> +				break;
> +		}
> +	} else {
> +		ret = send_msg(mp_fds.primary, &msgh);
> +	}
> +
> +	free(msg);
> +
> +	return ret;
> +}
> diff --git a/lib/librte_eal/common/eal_filesystem.h b/lib/librte_eal/common/eal_filesystem.h
> index 8acbd99..3d9514f 100644
> --- a/lib/librte_eal/common/eal_filesystem.h
> +++ b/lib/librte_eal/common/eal_filesystem.h
> @@ -67,6 +67,24 @@ eal_runtime_config_path(void)
>  	return buffer;
>  }
> 
> +/** Path of primary/secondary communication unix socket file. */
> +#define MP_UNIX_PATH_FMT "%s/.%s_unix"
> +static inline const char *
> +eal_mp_unix_path(void)
> +{
> +	static char buffer[PATH_MAX]; /* static so auto-zeroed */
> +	const char *directory = default_config_dir;
> +	const char *home_dir = getenv("HOME");
> +
> +	if (getuid() != 0 && home_dir != NULL)
> +		directory = home_dir;
> +	snprintf(buffer, sizeof(buffer) - 1, MP_UNIX_PATH_FMT,
> +		 directory, internal_config.hugefile_prefix);
> +
> +	return buffer;
> +
> +}
> +
>  /** Path of hugepage info file. */
>  #define HUGEPAGE_INFO_FMT "%s/.%s_hugepage_info"
> 
> diff --git a/lib/librte_eal/common/eal_private.h b/lib/librte_eal/common/eal_private.h
> index 462226f..60944f2 100644
> --- a/lib/librte_eal/common/eal_private.h
> +++ b/lib/librte_eal/common/eal_private.h
> @@ -224,4 +224,14 @@ int rte_eal_hugepage_attach(void);
>   */
>  struct rte_bus *rte_bus_find_by_device_name(const char *str);
> 
> +/**
> + * Create the unix channel for primary/secondary communication.
> + *
> + * @return
> + *   0 on success;
> + *   (<0) on failure.
> + */
> +
> +int rte_eal_mp_channel_init(void);
> +
>  #endif /* _EAL_PRIVATE_H_ */
> diff --git a/lib/librte_eal/common/include/rte_eal.h b/lib/librte_eal/common/include/rte_eal.h
> index 8e4e71c..8776bcf 100644
> --- a/lib/librte_eal/common/include/rte_eal.h
> +++ b/lib/librte_eal/common/include/rte_eal.h
> @@ -215,6 +215,74 @@ int rte_eal_init(int argc, char **argv);
>  int rte_eal_primary_proc_alive(const char *config_file_path);
> 
>  /**
> + * Action function typedef used by other components.
> + *
> + * As we create unix socket channel for primary/secondary communication, use
> + * this function typedef to register action for coming messages.
> + */
> +typedef int (*rte_eal_mp_t)(const void *params, int len,
> +			    int fds[], int fds_num);
> +/**
> + * Register an action function for primary/secondary communication.
> + *
> + * Call this function to register an action, if the calling component wants
> + * to response the messages from the corresponding component in its primary
> + * process or secondary processes.
> + *
> + * @param action_name
> + *   The action_name argument plays as the nonredundant key to find the action.
> + *
> + * @param action
> + *   The action argument is the function pointer to the action function.
> + *
> + * @return
> + *  - 0 on success.
> + *  - (<0) on failure.
> + */
> +int rte_eal_mp_action_register(const char *action_name, rte_eal_mp_t action);
> +/**
> + * Unregister an action function for primary/secondary communication.
> + *
> + * Call this function to unregister an action  if the calling component does
> + * not want to response the messages from the corresponding component in its
> + * primary process or secondary processes.
> + *
> + * @param action_name
> + *   The action_name argument plays as the nonredundant key to find the action.
> + *
> + */
> +void rte_eal_mp_action_unregister(const char *name);
> +
> +/**
> + * Send a message to the primary process or the secondary processes.
> + *
> + * This function will send a message which will be responsed by the action
> + * identified by action_name of the process on the other side.
> + *
> + * @param action_name
> + *   The action_name argument is used to identify which action will be used.
> + *
> + * @param params
> + *   The params argument contains the customized message.
> + *
> + * @param len_params
> + *   The len_params argument is the length of the customized message.
> + *
> + * @param fds
> + *   The fds argument is an array of fds sent with sendmsg.
> + *
> + * @param fds_num
> + *   The fds_num argument is number of fds to be sent with sendmsg.
> + *
> + * @return
> + *  - (>=0) on success.
> + *  - (<0) on failure.
> + */
> +int
> +rte_eal_mp_sendmsg(const char *action_name, const void *params,
> +		   int len_params, int fds[], int fds_num);
> +
> +/**
>   * Usage function typedef used by the application usage function.
>   *
>   * Use this function typedef to define and call rte_set_application_usage_hook()
> diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
> index 229eec9..a84eab4 100644
> --- a/lib/librte_eal/linuxapp/eal/eal.c
> +++ b/lib/librte_eal/linuxapp/eal/eal.c
> @@ -896,6 +896,15 @@ rte_eal_init(int argc, char **argv)
> 
>  	eal_check_mem_on_local_socket();
> 
> +	if (rte_eal_mp_channel_init() < 0) {
> +		rte_eal_init_alert("failed to init mp channel\n");
> +		rte_errno = EFAULT;
> +		return -1;
> +	}
> +
> +	if (eal_plugins_init() < 0)
> +		rte_eal_init_alert("Cannot init plugins\n");
> +
>  	eal_thread_init_master(rte_config.master_lcore);
> 
>  	ret = eal_thread_dump_affinity(cpuset, RTE_CPU_AFFINITY_STR_LEN);
> diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
> index f4f46c1..6762397 100644
> --- a/lib/librte_eal/rte_eal_version.map
> +++ b/lib/librte_eal/rte_eal_version.map
> @@ -235,4 +235,26 @@ EXPERIMENTAL {
>  	rte_service_set_stats_enable;
>  	rte_service_start_with_defaults;
> 
> +} DPDK_17.08;
> +
> +DPDK_17.11 {
> +	global:
> +
> +	rte_bus_get_iommu_class;
> +	rte_eal_iova_mode;
> +	rte_eal_mbuf_default_mempool_ops;
> +	rte_lcore_has_role;
> +	rte_memcpy_ptr;
> +	rte_pci_get_iommu_class;
> +	rte_pci_match;
> +
> +} DPDK_17.08;
> +
> +DPDK_18.02 {
> +	global:
> +
> +	rte_eal_mp_action_register;
> +	rte_eal_mp_action_unregister;
> +	rte_eal_mp_sendmsg;
> +
>  } DPDK_17.11;
> --
> 2.7.4

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [PATCH 2/3] eal: add synchronous multi-process communication
  2017-12-11 11:39   ` Burakov, Anatoly
@ 2017-12-11 16:49     ` Ananyev, Konstantin
  0 siblings, 0 replies; 88+ messages in thread
From: Ananyev, Konstantin @ 2017-12-11 16:49 UTC (permalink / raw)
  To: Burakov, Anatoly, Tan, Jianfeng, dev; +Cc: Richardson, Bruce, thomas


> >
> > @@ -485,7 +508,8 @@ rte_eal_mp_sendmsg(const char *action_name,
> >   		   const void *params,
> >   		   int len_params,
> >   		   int fds[],
> > -		   int fds_num)
> > +		   int fds_num,
> > +		   int need_ack)
> 
> I think "need_ack" is a misnomer because what we really want is not
> "ack" but a response.
> 
> More importantly, i think for clarity's sake, this should be a separate
> function - something like rte_eal_mp_sendreq() or maybe a better name
> (reqdata? communicate?). 

+1 for a separate function.
Also I don't think it should disturb/block mp_handler() - there could be messages
for other actions (from other endpoints).
I think only rte_eal_mp_sendreq() should be blocked till ack/response is received.
And probably it needs max timeout to block for.
Konstantin


^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [PATCH 0/3] generic channel for multi-process communication
  2017-12-11  9:59 ` [PATCH 0/3] generic channel for multi-process communication Burakov, Anatoly
@ 2017-12-12  7:34   ` Tan, Jianfeng
  2017-12-12 16:18     ` Burakov, Anatoly
  0 siblings, 1 reply; 88+ messages in thread
From: Tan, Jianfeng @ 2017-12-12  7:34 UTC (permalink / raw)
  To: Burakov, Anatoly, dev; +Cc: Richardson, Bruce, Ananyev, Konstantin, thomas

Hi Anatoly,

> -----Original Message-----
> From: Burakov, Anatoly
> Sent: Monday, December 11, 2017 6:00 PM
> To: Tan, Jianfeng; dev@dpdk.org
> Cc: Richardson, Bruce; Ananyev, Konstantin; thomas@monjalon.net
> Subject: Re: [PATCH 0/3] generic channel for multi-process communication
> 
> On 30-Nov-17 6:44 PM, Jianfeng Tan wrote:
> > This patchset adds a generic channel for multi-process (primary/secondary)
> > communication.
> >
> > Patch 1: addess the purpose and howto;
> > Patch 2: add a syncrhonous way for those messages which need a response
> immediately.
> > Patch 3: Rework vfio to use this generic communication channel.
> >
> 
> Hi Jianfeng,
> 
> Just a general comment: I am assuming this has the limitation of
> "everything happens through primary process's involvement". This will
> work for VFIO, as secondary always needs to ask the primary before doing
> anything, but it doesn't address other issues that could have been
> addressed with IPC.
> 
> For example, if a primary process would've hotplugged a device, it can't
> notify all secondary processes about this; rather, it has to wait until
> secondary processes ask for this info.

No need to wait the secondary to pull such info.

It can work like this:
(1) Register a hotplug callback for each process at initialization;
(2) Whenever a process wants to hotplug a device, it will broadcast the info, by broadcast, I mean:
  - if plugin happens at the primary, the primary will tell all of the secondary processes;
  - if plugin happens at one secondary, it will firstly tell the primary, and the primary will broadcast it to all the secondary process.

> Neither can it do anything if
> secondary requests a primary to do something, and notify other secondary
> processes about it (i.e. if secondary wants to hotplug a device, but
> there are other secondaries also running). It would be great to have a
> standard way of doing things like this in future revisions of our IPC.

Please review above thing; If you are OK with that, I'll include this in the next version.

Thanks,
Jianfeng

> 
> --
> Thanks,
> Anatoly

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [PATCH 0/3] generic channel for multi-process communication
  2017-12-12  7:34   ` Tan, Jianfeng
@ 2017-12-12 16:18     ` Burakov, Anatoly
  0 siblings, 0 replies; 88+ messages in thread
From: Burakov, Anatoly @ 2017-12-12 16:18 UTC (permalink / raw)
  To: Tan, Jianfeng, dev; +Cc: Richardson, Bruce, Ananyev, Konstantin, thomas

On 12-Dec-17 7:34 AM, Tan, Jianfeng wrote:
> Hi Anatoly,
> 
>> -----Original Message-----
>> From: Burakov, Anatoly
>> Sent: Monday, December 11, 2017 6:00 PM
>> To: Tan, Jianfeng; dev@dpdk.org
>> Cc: Richardson, Bruce; Ananyev, Konstantin; thomas@monjalon.net
>> Subject: Re: [PATCH 0/3] generic channel for multi-process communication
>>
>> On 30-Nov-17 6:44 PM, Jianfeng Tan wrote:
>>> This patchset adds a generic channel for multi-process (primary/secondary)
>>> communication.
>>>
>>> Patch 1: addess the purpose and howto;
>>> Patch 2: add a syncrhonous way for those messages which need a response
>> immediately.
>>> Patch 3: Rework vfio to use this generic communication channel.
>>>
>>
>> Hi Jianfeng,
>>
>> Just a general comment: I am assuming this has the limitation of
>> "everything happens through primary process's involvement". This will
>> work for VFIO, as secondary always needs to ask the primary before doing
>> anything, but it doesn't address other issues that could have been
>> addressed with IPC.
>>
>> For example, if a primary process would've hotplugged a device, it can't
>> notify all secondary processes about this; rather, it has to wait until
>> secondary processes ask for this info.
> 
> No need to wait the secondary to pull such info.
> 
> It can work like this:
> (1) Register a hotplug callback for each process at initialization;
> (2) Whenever a process wants to hotplug a device, it will broadcast the info, by broadcast, I mean:
>    - if plugin happens at the primary, the primary will tell all of the secondary processes;
>    - if plugin happens at one secondary, it will firstly tell the primary, and the primary will broadcast it to all the secondary process.
> 
>> Neither can it do anything if
>> secondary requests a primary to do something, and notify other secondary
>> processes about it (i.e. if secondary wants to hotplug a device, but
>> there are other secondaries also running). It would be great to have a
>> standard way of doing things like this in future revisions of our IPC.
> 
> Please review above thing; If you are OK with that, I'll include this in the next version.
> 
> Thanks,
> Jianfeng
> 
>>
>> --
>> Thanks,
>> Anatoly

Yes, that would work, my bad. However i don't think we necessarily need 
it right now. This can go in a separate patch. I was rather looking at 
other, future potential use cases, hotplug was just an example.

-- 
Thanks,
Anatoly

^ permalink raw reply	[flat|nested] 88+ messages in thread

* [PATCH v2 0/4] generic channel for multi-process communication
  2017-11-30 18:44 [PATCH 0/3] generic channel for multi-process communication Jianfeng Tan
                   ` (3 preceding siblings ...)
  2017-12-11  9:59 ` [PATCH 0/3] generic channel for multi-process communication Burakov, Anatoly
@ 2018-01-11  4:07 ` Jianfeng Tan
  2018-01-11  4:07   ` [PATCH v2 1/4] eal: add " Jianfeng Tan
                     ` (3 more replies)
  2018-01-25  4:16 ` [PATCH v3 0/3] generic channel for multi-process communication Jianfeng Tan
                   ` (4 subsequent siblings)
  9 siblings, 4 replies; 88+ messages in thread
From: Jianfeng Tan @ 2018-01-11  4:07 UTC (permalink / raw)
  To: dev
  Cc: anatoly.burakov, bruce.richardson, konstantin.ananyev, thomas,
	Jianfeng Tan

v1->v2: (Address comments from Anatoly and Konstantin)
  - Use datagram unix socket to supersede stream unix socket + epoll.
  - Change the secondary add/del mechanism as now we use connection-less channel.
  - Add mp_mutex_action to sync action register/unregister/reference.
  - Limit max length of action name to 64B.
  - New APIs for synchronous communication: rte_eal_mp_request/rte_eal_mp_reply.
  - Formalize the errno handling.
  - Some other small issues.

This patchset adds a generic channel for multi-process (primary/secondary)
communication.

Patch 1: address the purpose and how-to;
Patch 2: secondary process add/del;
Patch 3: add a synchronous way for the requests which need an immediate response.
Patch 4: Rework vfio to use this generic communication channel.

Jianfeng Tan (4):
  eal: add channel for multi-process communication
  eal: add and del secondary processes in the primary
  eal: add synchronous multi-process communication
  vfio: use the generic multi-process channel

 lib/librte_eal/common/eal_common_proc.c        | 594 +++++++++++++++++++++++++
 lib/librte_eal/common/eal_filesystem.h         |  17 +
 lib/librte_eal/common/eal_private.h            |  10 +
 lib/librte_eal/common/include/rte_eal.h        | 138 ++++++
 lib/librte_eal/linuxapp/eal/eal.c              |  22 +-
 lib/librte_eal/linuxapp/eal/eal_vfio.c         | 133 ++----
 lib/librte_eal/linuxapp/eal/eal_vfio.h         |  15 +-
 lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c | 409 +++--------------
 lib/librte_eal/rte_eal_version.map             |  11 +
 9 files changed, 872 insertions(+), 477 deletions(-)

-- 
2.7.4

^ permalink raw reply	[flat|nested] 88+ messages in thread

* [PATCH v2 1/4] eal: add channel for multi-process communication
  2018-01-11  4:07 ` [PATCH v2 0/4] " Jianfeng Tan
@ 2018-01-11  4:07   ` Jianfeng Tan
  2018-01-13 12:57     ` Burakov, Anatoly
  2018-01-15 19:52     ` Ananyev, Konstantin
  2018-01-11  4:07   ` [PATCH v2 2/4] eal: add and del secondary processes in the primary Jianfeng Tan
                     ` (2 subsequent siblings)
  3 siblings, 2 replies; 88+ messages in thread
From: Jianfeng Tan @ 2018-01-11  4:07 UTC (permalink / raw)
  To: dev
  Cc: anatoly.burakov, bruce.richardson, konstantin.ananyev, thomas,
	Jianfeng Tan

Previously, there were three channels for multi-process
(i.e., primary/secondary) communication.
  1. Config-file based channel, in which, the primary process writes
     info into a pre-defined config file, and the secondary process
     reads the info out.
  2. vfio submodule has its own channel based on unix socket for the
     secondary process to get container fd and group fd from the
     primary process.
  3. pdump submodule also has its own channel based on unix socket for
     packet dump.

It'd be good to have a generic communication channel for multi-process
communication to accommodate the requirements including:
  a. Secondary wants to send info to primary, for example, secondary
     would like to send request (about some specific vdev to primary).
  b. Sending info at any time, instead of just initialization time.
  c. Share FDs with the other side, for vdev like vhost, related FDs
     (memory region, kick) should be shared.
  d. A send message request needs the other side to respond immediately.

This patch proposes to create a communication channel, based on datagram
unix socket, for above requirements. Each process will block on a unix
socket waiting for messages from the peers.

Three new APIs are added:

  1. rte_eal_mp_action_register() is used to register an action,
     indexed by a string, when a component at the receiver side would
     like to respond to the messages from the peer process.
  2. rte_eal_mp_action_unregister() is used to unregister the action
     if the calling component does not want to respond to the messages.
  3. rte_eal_mp_sendmsg() is used to send a message, and returns
     immediately. If there are 1:n primary:secondary processes, the
     primary process will send n messages.

Suggested-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
---
 lib/librte_eal/common/eal_common_proc.c | 388 ++++++++++++++++++++++++++++++++
 lib/librte_eal/common/eal_filesystem.h  |  17 ++
 lib/librte_eal/common/eal_private.h     |  10 +
 lib/librte_eal/common/include/rte_eal.h |  69 ++++++
 lib/librte_eal/linuxapp/eal/eal.c       |   8 +
 lib/librte_eal/rte_eal_version.map      |   9 +
 6 files changed, 501 insertions(+)

diff --git a/lib/librte_eal/common/eal_common_proc.c b/lib/librte_eal/common/eal_common_proc.c
index 40fa982..d700e9e 100644
--- a/lib/librte_eal/common/eal_common_proc.c
+++ b/lib/librte_eal/common/eal_common_proc.c
@@ -5,11 +5,55 @@
 #include <stdio.h>
 #include <fcntl.h>
 #include <stdlib.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <limits.h>
+#include <unistd.h>
+#include <sys/un.h>
+#include <errno.h>
+#include <pthread.h>
+
+#include <rte_log.h>
 #include <rte_eal.h>
+#include <rte_errno.h>
+#include <rte_lcore.h>
+#include <rte_common.h>
 
+#include "eal_private.h"
 #include "eal_filesystem.h"
 #include "eal_internal_cfg.h"
 
+#define MAX_SECONDARY_PROCS	8
+#define MAX_ACTION_NAME_LEN	64
+#define MAX_UNIX_PATH_LEN	104
+#define MAX_MSG_LENGTH		1024
+#define SCM_MAX_FD		253 /* The max amount of fds */
+
+static int mp_fd = -1;
+static char *mp_sec_sockets[MAX_SECONDARY_PROCS];
+static pthread_mutex_t mp_mutex_action = PTHREAD_MUTEX_INITIALIZER;
+
+struct action_entry {
+	TAILQ_ENTRY(action_entry) next;      /**< Next attached action entry */
+
+#define MAX_ACTION_NAME_LEN	64
+	char action_name[MAX_ACTION_NAME_LEN];
+	rte_eal_mp_t action;
+};
+
+/** Double linked list of actions. */
+TAILQ_HEAD(action_entry_list, action_entry);
+
+static struct action_entry_list action_entry_list =
+	TAILQ_HEAD_INITIALIZER(action_entry_list);
+
+struct mp_msghdr {
+	char action_name[MAX_ACTION_NAME_LEN];
+	int fds_num;
+	int len_params;
+	char params[0];
+} __rte_packed;
+
 int
 rte_eal_primary_proc_alive(const char *config_file_path)
 {
@@ -31,3 +75,347 @@ rte_eal_primary_proc_alive(const char *config_file_path)
 
 	return !!ret;
 }
+
+static struct action_entry *
+find_action_entry_by_name(const char *name)
+{
+	int len = strlen(name);
+	struct action_entry *entry;
+
+	TAILQ_FOREACH(entry, &action_entry_list, next) {
+		if (strncmp(entry->action_name, name, len) == 0)
+			break;
+	}
+
+	return entry;
+}
+
+int
+rte_eal_mp_action_register(const char *action_name, rte_eal_mp_t action)
+{
+	struct action_entry *entry = malloc(sizeof(struct action_entry));
+
+	if (entry == NULL) {
+		rte_errno = -ENOMEM;
+		return -1;
+	}
+
+	if (strlen(action_name) > MAX_ACTION_NAME_LEN) {
+		rte_errno = -E2BIG;
+		return -1;
+	}
+
+	pthread_mutex_lock(&mp_mutex_action);
+	if (find_action_entry_by_name(action_name) != NULL) {
+		free(entry);
+		rte_errno = -EEXIST;
+		return -1;
+	}
+	strncpy(entry->action_name, action_name, MAX_ACTION_NAME_LEN);
+	entry->action = action;
+	TAILQ_INSERT_TAIL(&action_entry_list, entry, next);
+	pthread_mutex_unlock(&mp_mutex_action);
+	return 0;
+}
+
+void
+rte_eal_mp_action_unregister(const char *name)
+{
+	struct action_entry *entry;
+
+	pthread_mutex_lock(&mp_mutex_action);
+	entry = find_action_entry_by_name(name);
+	TAILQ_REMOVE(&action_entry_list, entry, next);
+	free(entry);
+	pthread_mutex_unlock(&mp_mutex_action);
+}
+
+static int
+read_msg(int fd, char *buf, int buflen, int *fds, int fds_num)
+{
+	int ret;
+	struct iovec iov;
+	struct msghdr msgh;
+	size_t fdsize = fds_num * sizeof(int);
+	char control[CMSG_SPACE(fdsize)];
+	struct cmsghdr *cmsg;
+
+	memset(&msgh, 0, sizeof(msgh));
+	iov.iov_base = buf;
+	iov.iov_len  = buflen;
+
+	msgh.msg_iov = &iov;
+	msgh.msg_iovlen = 1;
+	msgh.msg_control = control;
+	msgh.msg_controllen = sizeof(control);
+
+	ret = recvmsg(fd, &msgh, 0);
+	if (ret < 0) {
+		RTE_LOG(ERR, EAL, "recvmsg failed, %s\n", strerror(errno));
+		return -1;
+	}
+
+	if (msgh.msg_flags & (MSG_TRUNC | MSG_CTRUNC)) {
+		RTE_LOG(ERR, EAL, "truncted msg\n");
+		return -1;
+	}
+
+	/* read auxiliary FDs if any */
+	for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL;
+		cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
+		if ((cmsg->cmsg_level == SOL_SOCKET) &&
+			(cmsg->cmsg_type == SCM_RIGHTS)) {
+			memcpy(fds, CMSG_DATA(cmsg), fdsize);
+			break;
+		}
+	}
+
+	return ret;
+}
+
+static int
+process_msg(struct mp_msghdr *hdr, int len, int fds[])
+{
+	int ret;
+	int params_len;
+	struct action_entry *entry;
+
+	RTE_LOG(DEBUG, EAL, "msg: %s\n", hdr->action_name);
+
+	pthread_mutex_lock(&mp_mutex_action);
+	entry = find_action_entry_by_name(hdr->action_name);
+	if (entry == NULL) {
+		RTE_LOG(ERR, EAL, "cannot find action by: %s\n",
+			hdr->action_name);
+		pthread_mutex_unlock(&mp_mutex_action);
+		return -1;
+	}
+
+	params_len = len - sizeof(struct mp_msghdr);
+	ret = entry->action(hdr->params, params_len, fds, hdr->fds_num);
+	pthread_mutex_unlock(&mp_mutex_action);
+	return ret;
+
+}
+
+static void *
+mp_handle(void *arg __rte_unused)
+{
+	int len;
+	int fds[SCM_MAX_FD];
+	char buf[MAX_MSG_LENGTH];
+
+	while (1) {
+		len = read_msg(mp_fd, buf, MAX_MSG_LENGTH, fds, SCM_MAX_FD);
+		if (len > 0)
+			process_msg((struct mp_msghdr *)buf, len, fds);
+	}
+
+	return NULL;
+}
+
+static inline const char *
+get_unix_path(int is_server)
+{
+	static char unix_path[MAX_UNIX_PATH_LEN];
+	const char *prefix = eal_mp_unix_path();
+	const char *suffix = (is_server) ? "" : "_c";
+
+	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+		snprintf(unix_path, MAX_UNIX_PATH_LEN, "%s%s", prefix, suffix);
+	else
+		snprintf(unix_path, MAX_UNIX_PATH_LEN, "%s%s_%d",
+			 prefix, suffix, getpid());
+	return unix_path;
+}
+
+static int
+open_unix_fd(int is_server)
+{
+	int fd;
+	struct sockaddr_un un;
+
+	fd = socket(AF_UNIX, SOCK_DGRAM, 0);
+	if (fd < 0) {
+		RTE_LOG(ERR, EAL, "failed to create unix socket\n");
+		return -1;
+	}
+
+	memset(&un, 0, sizeof(un));
+	un.sun_family = AF_UNIX;
+	snprintf(un.sun_path, MAX_UNIX_PATH_LEN, "%s",
+		 get_unix_path(is_server));
+	unlink(un.sun_path); /* May still exist since last run */
+	if (bind(fd, (struct sockaddr *)&un, sizeof(un)) < 0) {
+		RTE_LOG(ERR, EAL, "failed to bind %s: %s\n",
+			un.sun_path, strerror(errno));
+		close(fd);
+		return -1;
+	}
+
+	RTE_LOG(INFO, EAL, "bind to %s\n", un.sun_path);
+	return fd;
+}
+
+int
+rte_eal_mp_channel_init(void)
+{
+	pthread_t tid;
+	char thread_name[RTE_MAX_THREAD_NAME_LEN];
+
+	mp_fd = open_unix_fd(1);
+	if (mp_fd < 0)
+		return -1;
+
+	if (pthread_create(&tid, NULL, mp_handle, NULL) < 0) {
+		RTE_LOG(ERR, EAL, "failed to create mp handle thead: %s\n",
+			strerror(errno));
+		goto error;
+	}
+
+	snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "rte_mp_handle");
+	if (rte_thread_setname(tid, thread_name) < 0) {
+		RTE_LOG(ERR, EAL, "failed to set thead name\n");
+		goto error;
+	}
+
+	return 0;
+error:
+	close(mp_fd);
+	mp_fd = -1;
+	return -1;
+}
+
+static inline struct mp_msghdr *
+format_msg(const char *act_name, const void *p, int len_params, int fds_num)
+{
+	int len_msg;
+	struct mp_msghdr *msg;
+
+	len_msg = sizeof(struct mp_msghdr) + len_params;
+	if (len_msg > MAX_MSG_LENGTH) {
+		RTE_LOG(ERR, EAL, "Message is too long\n");
+		rte_errno = -EINVAL;
+		return NULL;
+	}
+
+	msg = malloc(len_msg);
+	if (!msg) {
+		RTE_LOG(ERR, EAL, "Cannot alloc memory for msg\n");
+		rte_errno = -ENOMEM;
+		return NULL;
+	}
+	memset(msg, 0, len_msg);
+	strcpy(msg->action_name, act_name);
+	msg->fds_num = fds_num;
+	msg->len_params = len_params;
+	memcpy(msg->params, p, len_params);
+	return msg;
+}
+
+static int
+send_msg(int fd, const char *dst_path, struct mp_msghdr *msg, int fds[])
+{
+	int ret;
+	struct msghdr msgh;
+	struct iovec iov;
+	size_t fd_size = msg->fds_num * sizeof(int);
+	char control[CMSG_SPACE(fd_size)];
+	struct cmsghdr *cmsg;
+	struct sockaddr_un dst;
+
+	memset(&dst, 0, sizeof(dst));
+	dst.sun_family = AF_UNIX;
+	snprintf(dst.sun_path, MAX_UNIX_PATH_LEN, "%s", dst_path);
+
+	memset(&msgh, 0, sizeof(msgh));
+	memset(control, 0, sizeof(control));
+
+	iov.iov_base = (uint8_t *)msg;
+	iov.iov_len = sizeof(struct mp_msghdr) + msg->len_params;
+
+	msgh.msg_name = &dst;
+	msgh.msg_namelen = sizeof(dst);
+	msgh.msg_iov = &iov;
+	msgh.msg_iovlen = 1;
+	msgh.msg_control = control;
+	msgh.msg_controllen = sizeof(control);
+
+	cmsg = CMSG_FIRSTHDR(&msgh);
+	cmsg->cmsg_len = CMSG_LEN(fd_size);
+	cmsg->cmsg_level = SOL_SOCKET;
+	cmsg->cmsg_type = SCM_RIGHTS;
+	memcpy(CMSG_DATA(cmsg), fds, fd_size);
+
+	do {
+		ret = sendmsg(fd, &msgh, 0);
+	} while (ret < 0 && errno == EINTR);
+
+	if (ret < 0) {
+		RTE_LOG(ERR, EAL, "failed to send msg: %s\n", strerror(errno));
+
+		if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+			RTE_LOG(ERR, EAL, "secondary process (%s) exited\n",
+				dst_path);
+		else if (!rte_eal_primary_proc_alive(NULL))
+			RTE_LOG(ERR, EAL, "primary process exited\n");
+
+		return 0;
+	}
+
+	return 1;
+}
+
+static int
+mp_send(const char *action_name,
+	const void *params,
+	int len_params,
+	int fds[],
+	int fds_num)
+{
+	int i;
+	int n = 0;
+	int sockfd;
+	struct mp_msghdr *msg;
+
+	if (fds_num > SCM_MAX_FD) {
+		RTE_LOG(ERR, EAL, "Cannot send more than %d FDs\n", SCM_MAX_FD);
+		rte_errno = -E2BIG;
+		return 0;
+	}
+
+	msg = format_msg(action_name, params, len_params, fds_num);
+	if (msg == NULL)
+		return 0;
+
+	if ((sockfd = open_unix_fd(0)) < 0) {
+		free(msg);
+		return 0;
+	}
+
+	if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+		/* broadcast to all secondaries */
+		for (i = 0; i < MAX_SECONDARY_PROCS; ++i) {
+			if (mp_sec_sockets[i] == NULL)
+				continue;
+
+			n += send_msg(sockfd, mp_sec_sockets[i], msg, fds);
+		}
+	} else
+		n += send_msg(sockfd, eal_mp_unix_path(), msg, fds);
+
+	free(msg);
+	close(sockfd);
+	return n;
+}
+
+int
+rte_eal_mp_sendmsg(const char *action_name,
+		   const void *params,
+		   int len_params,
+		   int fds[],
+		   int fds_num)
+{
+	RTE_LOG(DEBUG, EAL, "sendmsg: %s\n", action_name);
+	return mp_send(action_name, params, len_params, fds, fds_num);
+}
diff --git a/lib/librte_eal/common/eal_filesystem.h b/lib/librte_eal/common/eal_filesystem.h
index e8959eb..e95399b 100644
--- a/lib/librte_eal/common/eal_filesystem.h
+++ b/lib/librte_eal/common/eal_filesystem.h
@@ -38,6 +38,23 @@ eal_runtime_config_path(void)
 	return buffer;
 }
 
+/** Path of primary/secondary communication unix socket file. */
+#define MP_UNIX_PATH_FMT "%s/.%s_unix"
+static inline const char *
+eal_mp_unix_path(void)
+{
+	static char buffer[PATH_MAX]; /* static so auto-zeroed */
+	const char *directory = default_config_dir;
+	const char *home_dir = getenv("HOME");
+
+	if (getuid() != 0 && home_dir != NULL)
+		directory = home_dir;
+	snprintf(buffer, sizeof(buffer) - 1, MP_UNIX_PATH_FMT,
+		 directory, internal_config.hugefile_prefix);
+
+	return buffer;
+}
+
 /** Path of hugepage info file. */
 #define HUGEPAGE_INFO_FMT "%s/.%s_hugepage_info"
 
diff --git a/lib/librte_eal/common/eal_private.h b/lib/librte_eal/common/eal_private.h
index c46dd8f..e36e3b5 100644
--- a/lib/librte_eal/common/eal_private.h
+++ b/lib/librte_eal/common/eal_private.h
@@ -195,4 +195,14 @@ int rte_eal_hugepage_attach(void);
  */
 struct rte_bus *rte_bus_find_by_device_name(const char *str);
 
+/**
+ * Create the unix channel for primary/secondary communication.
+ *
+ * @return
+ *   0 on success;
+ *   (<0) on failure.
+ */
+
+int rte_eal_mp_channel_init(void);
+
 #endif /* _EAL_PRIVATE_H_ */
diff --git a/lib/librte_eal/common/include/rte_eal.h b/lib/librte_eal/common/include/rte_eal.h
index 02fa109..9884c0b 100644
--- a/lib/librte_eal/common/include/rte_eal.h
+++ b/lib/librte_eal/common/include/rte_eal.h
@@ -186,6 +186,75 @@ int rte_eal_init(int argc, char **argv);
 int rte_eal_primary_proc_alive(const char *config_file_path);
 
 /**
+ * Action function typedef used by other components.
+ *
+ * As we create unix socket channel for primary/secondary communication, use
+ * this function typedef to register action for coming messages.
+ */
+typedef int (*rte_eal_mp_t)(const void *params, int len,
+			    int fds[], int fds_num);
+
+/**
+ * Register an action function for primary/secondary communication.
+ *
+ * Call this function to register an action, if the calling component wants
+ * to response the messages from the corresponding component in its primary
+ * process or secondary processes.
+ *
+ * @param action_name
+ *   The action_name argument plays as the nonredundant key to find the action.
+ *
+ * @param action
+ *   The action argument is the function pointer to the action function.
+ *
+ * @return
+ *  - 0 on success.
+ *  - (<0) on failure.
+ */
+int rte_eal_mp_action_register(const char *action_name, rte_eal_mp_t action);
+
+/**
+ * Unregister an action function for primary/secondary communication.
+ *
+ * Call this function to unregister an action  if the calling component does
+ * not want to response the messages from the corresponding component in its
+ * primary process or secondary processes.
+ *
+ * @param action_name
+ *   The action_name argument plays as the nonredundant key to find the action.
+ *
+ */
+void rte_eal_mp_action_unregister(const char *name);
+
+/**
+ * Send a message to the peer process.
+ *
+ * This function will send a message which will be responsed by the action
+ * identified by action_name of the process on the other side.
+ *
+ * @param action_name
+ *   The action_name argument is used to identify which action will be used.
+ *
+ * @param params
+ *   The params argument contains the customized message.
+ *
+ * @param len_params
+ *   The len_params argument is the length of the customized message.
+ *
+ * @param fds
+ *   The fds argument is an array of fds sent with sendmsg.
+ *
+ * @param fds_num
+ *   The fds_num argument is number of fds to be sent with sendmsg.
+ *
+ * @return
+ *  - Returns the number of messages being sent successfully.
+ */
+int
+rte_eal_mp_sendmsg(const char *action_name, const void *params,
+		   int len_params, int fds[], int fds_num);
+
+/**
  * Usage function typedef used by the application usage function.
  *
  * Use this function typedef to define and call rte_set_application_usage_hook()
diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index 229eec9..f231724 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -896,6 +896,14 @@ rte_eal_init(int argc, char **argv)
 
 	eal_check_mem_on_local_socket();
 
+	if (rte_eal_mp_channel_init() < 0) {
+		rte_eal_init_alert("failed to init mp channel\n");
+		if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+			rte_errno = EFAULT;
+			return -1;
+		}
+	}
+
 	eal_thread_init_master(rte_config.master_lcore);
 
 	ret = eal_thread_dump_affinity(cpuset, RTE_CPU_AFFINITY_STR_LEN);
diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
index f4f46c1..5dacde5 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -235,4 +235,13 @@ EXPERIMENTAL {
 	rte_service_set_stats_enable;
 	rte_service_start_with_defaults;
 
+} DPDK_17.08;
+
+DPDK_18.02 {
+	global:
+
+	rte_eal_mp_action_register;
+	rte_eal_mp_action_unregister;
+	rte_eal_mp_sendmsg;
+
 } DPDK_17.11;
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 88+ messages in thread

* [PATCH v2 2/4] eal: add and del secondary processes in the primary
  2018-01-11  4:07 ` [PATCH v2 0/4] " Jianfeng Tan
  2018-01-11  4:07   ` [PATCH v2 1/4] eal: add " Jianfeng Tan
@ 2018-01-11  4:07   ` Jianfeng Tan
  2018-01-13 13:11     ` Burakov, Anatoly
  2018-01-15 21:45     ` Ananyev, Konstantin
  2018-01-11  4:07   ` [PATCH v2 3/4] eal: add synchronous multi-process communication Jianfeng Tan
  2018-01-11  4:07   ` [PATCH v2 4/4] vfio: use the generic multi-process channel Jianfeng Tan
  3 siblings, 2 replies; 88+ messages in thread
From: Jianfeng Tan @ 2018-01-11  4:07 UTC (permalink / raw)
  To: dev
  Cc: anatoly.burakov, bruce.richardson, konstantin.ananyev, thomas,
	Jianfeng Tan

Using the multi-process channel, we add an mp action named "proc".

As a secondary process starts, it sends a "proc add" message to
the primary.

When the primary fails to send a message to a specific secondary
process, that secondary process is treated as having exited, and
we remove it from the secondary array by sending a "proc del"
message to the primary itself.

Test:
  1. Start the primary and the secondary process
    $ (testpmd) -c 0x3 -n 4 -- -i
    $ (helloworld) -c 0xc -n 4 --proc-type=auto --

  2. Check the log of testpmd:
    ...
    EAL: bind to /var/run/.rte_unix
    ...
    EAL: add secondary: /var/run/.testpmd_unix_(xxx)
    ...

  3. Check the log of helloworld:
    ...
    EAL: bind to /var/run/.testpmd_unix_xxx
    EAL: bind to /var/run/.testpmd_unix_c_xxx
    ...

Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
---
 lib/librte_eal/common/eal_common_proc.c | 88 ++++++++++++++++++++++++++++++++-
 1 file changed, 86 insertions(+), 2 deletions(-)

diff --git a/lib/librte_eal/common/eal_common_proc.c b/lib/librte_eal/common/eal_common_proc.c
index d700e9e..70519cc 100644
--- a/lib/librte_eal/common/eal_common_proc.c
+++ b/lib/librte_eal/common/eal_common_proc.c
@@ -54,6 +54,13 @@ struct mp_msghdr {
 	char params[0];
 } __rte_packed;
 
+struct proc_request {
+#define MP_PROC_ADD	0
+#define MP_PROC_DEL	1
+	int type;
+	char path[MAX_UNIX_PATH_LEN];
+};
+
 int
 rte_eal_primary_proc_alive(const char *config_file_path)
 {
@@ -214,6 +221,58 @@ mp_handle(void *arg __rte_unused)
 	return NULL;
 }
 
+static int
+add_sec_proc(const char *path)
+{
+	int i;
+
+	for (i = 0; i < MAX_SECONDARY_PROCS; ++i)
+		if (mp_sec_sockets[i] == NULL)
+			break;
+	if (i < MAX_SECONDARY_PROCS)
+		mp_sec_sockets[i] = strdup(path);
+
+	return i < MAX_SECONDARY_PROCS;
+}
+
+static int
+del_sec_proc(const char *path)
+{
+	int i;
+
+	for (i = 0; i < MAX_SECONDARY_PROCS; ++i) {
+		if (!strcmp(mp_sec_sockets[i], path)) {
+			free(mp_sec_sockets[i]);
+			mp_sec_sockets[i] = NULL;
+			break;
+		}
+	}
+
+	return i < MAX_SECONDARY_PROCS;
+}
+
+static int
+mp_primary_proc(const void *params,
+		int len __rte_unused,
+		int fds[] __rte_unused,
+		int fds_num __rte_unused)
+{
+	const struct proc_request *r = (const struct proc_request *)params;
+
+	switch (r->type) {
+	case MP_PROC_ADD:
+		RTE_LOG(INFO, EAL, "add secondary: %s\n", r->path);
+		return add_sec_proc(r->path);
+	case MP_PROC_DEL:
+		RTE_LOG(INFO, EAL, "del secondary: %s\n", r->path);
+		return del_sec_proc(r->path);
+	default:
+		RTE_LOG(ERR, EAL, "invalid type: %d\n", r->type);
+	}
+
+	return -1;
+}
+
 static inline const char *
 get_unix_path(int is_server)
 {
@@ -267,6 +326,22 @@ rte_eal_mp_channel_init(void)
 	if (mp_fd < 0)
 		return -1;
 
+	if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+		if (rte_eal_mp_action_register("proc", mp_primary_proc) < 0) {
+			RTE_LOG(ERR, EAL, "failed to register handler\n");
+			goto error;
+		}
+	} else {
+		struct proc_request r;
+
+		r.type = MP_PROC_ADD;
+		snprintf(r.path, MAX_UNIX_PATH_LEN, "%s", get_unix_path(1));
+		if (rte_eal_mp_sendmsg("proc", &r, sizeof(r), NULL, 0) < 0) {
+			RTE_LOG(ERR, EAL, "failed to add into primary\n");
+			goto error;
+		}
+	}
+
 	if (pthread_create(&tid, NULL, mp_handle, NULL) < 0) {
 		RTE_LOG(ERR, EAL, "failed to create mp handle thead: %s\n",
 			strerror(errno));
@@ -354,10 +429,19 @@ send_msg(int fd, const char *dst_path, struct mp_msghdr *msg, int fds[])
 	if (ret < 0) {
 		RTE_LOG(ERR, EAL, "failed to send msg: %s\n", strerror(errno));
 
-		if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+		if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+			struct proc_request r;
+
 			RTE_LOG(ERR, EAL, "secondary process (%s) exited\n",
 				dst_path);
-		else if (!rte_eal_primary_proc_alive(NULL))
+			r.type = MP_PROC_DEL;
+			snprintf(r.path, MAX_UNIX_PATH_LEN, "%s", dst_path);
+			if (rte_eal_mp_sendmsg("proc", &r,
+						sizeof(r), NULL, 0) < 0)
+				RTE_LOG(ERR, EAL,
+					"failed to del secondary %s\n",
+					dst_path);
+		} else if (!rte_eal_primary_proc_alive(NULL))
 			RTE_LOG(ERR, EAL, "primary process exited\n");
 
 		return 0;
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 88+ messages in thread

* [PATCH v2 3/4] eal: add synchronous multi-process communication
  2018-01-11  4:07 ` [PATCH v2 0/4] " Jianfeng Tan
  2018-01-11  4:07   ` [PATCH v2 1/4] eal: add " Jianfeng Tan
  2018-01-11  4:07   ` [PATCH v2 2/4] eal: add and del secondary processes in the primary Jianfeng Tan
@ 2018-01-11  4:07   ` Jianfeng Tan
  2018-01-13 13:41     ` Burakov, Anatoly
  2018-01-16  0:00     ` Ananyev, Konstantin
  2018-01-11  4:07   ` [PATCH v2 4/4] vfio: use the generic multi-process channel Jianfeng Tan
  3 siblings, 2 replies; 88+ messages in thread
From: Jianfeng Tan @ 2018-01-11  4:07 UTC (permalink / raw)
  To: dev
  Cc: anatoly.burakov, bruce.richardson, konstantin.ananyev, thomas,
	Jianfeng Tan

We need a synchronous way for multi-process communication,
i.e., blocking to wait for a reply message when we send a request
to the peer process.

We add two APIs, rte_eal_mp_request() and rte_eal_mp_reply(), for
this use case. Invoking rte_eal_mp_request() sends out a request
message and then blocks waiting for a reply message. The timeout is
hard-coded to 5 seconds. The reply message is copied into the
parameters of this API so that the caller can decide how to
interpret that information (including params and fds). Note that
if a primary process owns multiple secondary processes, this API
will fail.

The API rte_eal_mp_reply() is always called by an mp action handler.
Here we add another parameter for rte_eal_mp_t so that the action
handler knows which peer address to reply to.

We use mutex in rte_eal_mp_request() to guarantee that only one
request is in flight for one pair of processes.

Suggested-by: Anatoly Burakov <anatoly.burakov@intel.com>
Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
---
 lib/librte_eal/common/eal_common_proc.c | 144 +++++++++++++++++++++++++++++---
 lib/librte_eal/common/include/rte_eal.h |  73 +++++++++++++++-
 lib/librte_eal/rte_eal_version.map      |   2 +
 3 files changed, 206 insertions(+), 13 deletions(-)

diff --git a/lib/librte_eal/common/eal_common_proc.c b/lib/librte_eal/common/eal_common_proc.c
index 70519cc..f194a52 100644
--- a/lib/librte_eal/common/eal_common_proc.c
+++ b/lib/librte_eal/common/eal_common_proc.c
@@ -32,6 +32,7 @@
 static int mp_fd = -1;
 static char *mp_sec_sockets[MAX_SECONDARY_PROCS];
 static pthread_mutex_t mp_mutex_action = PTHREAD_MUTEX_INITIALIZER;
+static pthread_mutex_t mp_mutex_request = PTHREAD_MUTEX_INITIALIZER;
 
 struct action_entry {
 	TAILQ_ENTRY(action_entry) next;      /**< Next attached action entry */
@@ -49,6 +50,10 @@ static struct action_entry_list action_entry_list =
 
 struct mp_msghdr {
 	char action_name[MAX_ACTION_NAME_LEN];
+#define MP_MSG	0 /* Share message with peers, will not block */
+#define MP_REQ	1 /* Request for information, Will block for a reply */
+#define MP_REP	2 /* Reply to previously-received request */
+	int type;
 	int fds_num;
 	int len_params;
 	char params[0];
@@ -138,7 +143,8 @@ rte_eal_mp_action_unregister(const char *name)
 }
 
 static int
-read_msg(int fd, char *buf, int buflen, int *fds, int fds_num)
+read_msg(int fd, char *buf, int buflen,
+	 int *fds, int fds_num, struct sockaddr_un *s)
 {
 	int ret;
 	struct iovec iov;
@@ -151,6 +157,8 @@ read_msg(int fd, char *buf, int buflen, int *fds, int fds_num)
 	iov.iov_base = buf;
 	iov.iov_len  = buflen;
 
+	msgh.msg_name = s;
+	msgh.msg_namelen = sizeof(*s);
 	msgh.msg_iov = &iov;
 	msgh.msg_iovlen = 1;
 	msgh.msg_control = control;
@@ -181,7 +189,7 @@ read_msg(int fd, char *buf, int buflen, int *fds, int fds_num)
 }
 
 static int
-process_msg(struct mp_msghdr *hdr, int len, int fds[])
+process_msg(struct mp_msghdr *hdr, int len, int fds[], struct sockaddr_un *s)
 {
 	int ret;
 	int params_len;
@@ -199,10 +207,10 @@ process_msg(struct mp_msghdr *hdr, int len, int fds[])
 	}
 
 	params_len = len - sizeof(struct mp_msghdr);
-	ret = entry->action(hdr->params, params_len, fds, hdr->fds_num);
+	ret = entry->action(hdr->params, params_len,
+			    fds, hdr->fds_num, s->sun_path);
 	pthread_mutex_unlock(&mp_mutex_action);
 	return ret;
-
 }
 
 static void *
@@ -211,11 +219,12 @@ mp_handle(void *arg __rte_unused)
 	int len;
 	int fds[SCM_MAX_FD];
 	char buf[MAX_MSG_LENGTH];
+	struct sockaddr_un sa;
 
 	while (1) {
-		len = read_msg(mp_fd, buf, MAX_MSG_LENGTH, fds, SCM_MAX_FD);
+		len = read_msg(mp_fd, buf, MAX_MSG_LENGTH, fds, SCM_MAX_FD, &sa);
 		if (len > 0)
-			process_msg((struct mp_msghdr *)buf, len, fds);
+			process_msg((struct mp_msghdr *)buf, len, fds, &sa);
 	}
 
 	return NULL;
@@ -255,7 +264,8 @@ static int
 mp_primary_proc(const void *params,
 		int len __rte_unused,
 		int fds[] __rte_unused,
-		int fds_num __rte_unused)
+		int fds_num __rte_unused,
+		const void *peer __rte_unused)
 {
 	const struct proc_request *r = (const struct proc_request *)params;
 
@@ -362,7 +372,8 @@ rte_eal_mp_channel_init(void)
 }
 
 static inline struct mp_msghdr *
-format_msg(const char *act_name, const void *p, int len_params, int fds_num)
+format_msg(const char *act_name, const void *p,
+	   int len_params, int fds_num, int type)
 {
 	int len_msg;
 	struct mp_msghdr *msg;
@@ -384,6 +395,7 @@ format_msg(const char *act_name, const void *p, int len_params, int fds_num)
 	strcpy(msg->action_name, act_name);
 	msg->fds_num = fds_num;
 	msg->len_params = len_params;
+	msg->type = type;
 	memcpy(msg->params, p, len_params);
 	return msg;
 }
@@ -455,7 +467,9 @@ mp_send(const char *action_name,
 	const void *params,
 	int len_params,
 	int fds[],
-	int fds_num)
+	int fds_num,
+	int type,
+	const void *peer)
 {
 	int i;
 	int n = 0;
@@ -468,7 +482,7 @@ mp_send(const char *action_name,
 		return 0;
 	}
 
-	msg = format_msg(action_name, params, len_params, fds_num);
+	msg = format_msg(action_name, params, len_params, fds_num, type);
 	if (msg == NULL)
 		return 0;
 
@@ -477,6 +491,11 @@ mp_send(const char *action_name,
 		return 0;
 	}
 
+	if (peer) {
+		n += send_msg(sockfd, peer, msg, fds);
+		goto ret;
+	}
+
 	if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
 		/* broadcast to all secondaries */
 		for (i = 0; i < MAX_SECONDARY_PROCS; ++i) {
@@ -488,6 +507,7 @@ mp_send(const char *action_name,
 	} else
 		n += send_msg(sockfd, eal_mp_unix_path(), msg, fds);
 
+ret:
 	free(msg);
 	close(sockfd);
 	return n;
@@ -501,5 +521,107 @@ rte_eal_mp_sendmsg(const char *action_name,
 		   int fds_num)
 {
 	RTE_LOG(DEBUG, EAL, "sendmsg: %s\n", action_name);
-	return mp_send(action_name, params, len_params, fds, fds_num);
+	return mp_send(action_name, params, len_params,
+			fds, fds_num, MP_MSG, NULL);
+}
+
+int
+rte_eal_mp_request(const char *action_name,
+		   void *params,
+		   int len_p,
+		   int fds[],
+		   int fds_in,
+		   int fds_out)
+{
+	int i, j;
+	int sockfd;
+	int nprocs;
+	int ret = 0;
+	struct mp_msghdr *req;
+	struct timeval tv;
+	char buf[MAX_MSG_LENGTH];
+	struct mp_msghdr *hdr;
+
+	RTE_LOG(DEBUG, EAL, "request: %s\n", action_name);
+
+	if (fds_in > SCM_MAX_FD || fds_out > SCM_MAX_FD) {
+		RTE_LOG(ERR, EAL, "Cannot send more than %d FDs\n", SCM_MAX_FD);
+		rte_errno = -E2BIG;
+		return 0;
+	}
+
+	req = format_msg(action_name, params, len_p, fds_in, MP_REQ);
+	if (req == NULL)
+		return 0;
+
+	if ((sockfd = open_unix_fd(0)) < 0) {
+		free(req);
+		return 0;
+	}
+
+	tv.tv_sec = 5;  /* 5 Secs Timeout */
+	tv.tv_usec = 0;
+	if (setsockopt(sockfd, SOL_SOCKET, SO_RCVTIMEO,
+			(const void *)&tv, sizeof(struct timeval)) < 0)
+		RTE_LOG(INFO, EAL, "Failed to set recv timeout\n");
+
+	/* Only allow one req at a time */
+	pthread_mutex_lock(&mp_mutex_request);
+
+	if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+		nprocs = 0;
+		for (i = 0; i < MAX_SECONDARY_PROCS; ++i)
+			if (!mp_sec_sockets[i]) {
+				j = i;
+				nprocs++;
+			}
+
+		if (nprocs > 1) {
+			RTE_LOG(ERR, EAL,
+				"multi secondary processes not supported\n");
+			goto free_and_ret;
+		}
+
+		ret = send_msg(sockfd, mp_sec_sockets[j], req, fds);
+	} else
+		ret = send_msg(sockfd, eal_mp_unix_path(), req, fds);
+
+	if (ret == 0) {
+		RTE_LOG(ERR, EAL, "failed to send request: %s\n", action_name);
+		ret = -1;
+		goto free_and_ret;
+	}
+
+	ret = read_msg(sockfd, buf, MAX_MSG_LENGTH, fds, fds_out, NULL);
+	if (ret > 0) {
+		hdr = (struct mp_msghdr *)buf;
+		if (hdr->len_params == len_p)
+			memcpy(params, hdr->params, len_p);
+		else {
+			RTE_LOG(ERR, EAL, "invalid reply\n");
+			ret = 0;
+		}
+	}
+
+free_and_ret:
+	free(req);
+	close(sockfd);
+	pthread_mutex_unlock(&mp_mutex_request);
+	return ret;
+}
+
+int
+rte_eal_mp_reply(const char *action_name,
+		 const void *params,
+		 int len_p,
+		 int fds[],
+		 int fds_in,
+		 const void *peer)
+{
+	RTE_LOG(DEBUG, EAL, "reply: %s\n", action_name);
+	if (peer == NULL) {
+		RTE_LOG(ERR, EAL, "peer is not specified\n");
+		return 0;
+	}
+	return mp_send(action_name, params, len_p, fds, fds_in, MP_REP, peer);
 }
diff --git a/lib/librte_eal/common/include/rte_eal.h b/lib/librte_eal/common/include/rte_eal.h
index 9884c0b..2690a77 100644
--- a/lib/librte_eal/common/include/rte_eal.h
+++ b/lib/librte_eal/common/include/rte_eal.h
@@ -192,7 +192,7 @@ int rte_eal_primary_proc_alive(const char *config_file_path);
  * this function typedef to register action for coming messages.
  */
 typedef int (*rte_eal_mp_t)(const void *params, int len,
-			    int fds[], int fds_num);
+			    int fds[], int fds_num, const void *peer);
 
 /**
  * Register an action function for primary/secondary communication.
@@ -245,7 +245,7 @@ void rte_eal_mp_action_unregister(const char *name);
  *   The fds argument is an array of fds sent with sendmsg.
  *
  * @param fds_num
- *   The fds_num argument is number of fds to be sent with sendmsg.
+ *   The number of fds to be sent with sendmsg.
  *
  * @return
  *  - Returns the number of messages being sent successfully.
@@ -255,6 +255,75 @@ rte_eal_mp_sendmsg(const char *action_name, const void *params,
 		   int len_params, int fds[], int fds_num);
 
 /**
+ * Send a request to the peer process and expect a reply.
+ *
+ * This function sends a request message to the peer process, and will
+ * block until receiving reply message from the peer process. Note:
+ * this does not work for the primary process sending requests to its
+ * multiple (>1) secondary processes.
+ *
+ * @param action_name
+ *   The action_name argument is used to identify which action will be used.
+ *
+ * @param params
+ *   The params argument contains the customized message; as the reply is
+ *   received, the replied params will be copied to this pointer. 
+ *
+ * @param len_p
+ *   The length of the customized message.
+ *
+ * @param fds
+ *   The fds argument is an array of fds sent with sendmsg; as the reply
+ *   is received, the replied fds will be copied into this array.
+ *
+ * @param fds_in
+ *   The number of fds to be sent.
+ *
+ * @param fds_out
+ *   The number of fds to be received.
+ *
+ * @return
+ *  - (1) on success;
+ *  - (0) on sending request successfully but no valid reply received.
+ *  - (<0) on failing to send the request.
+ */
+int
+rte_eal_mp_request(const char *action_name, void *params,
+		   int len_p, int fds[], int fds_in, int fds_out);
+
+/**
+ * Send a reply to the peer process.
+ *
+ * This function will send a reply message in response to a request message
+ * received previously.
+ *
+ * @param action_name
+ *   The action_name argument is used to identify which action will be used.
+ *
+ * @param params
+ *   The params argument contains the customized message.
+ *
+ * @param len_p
+ *   The length of the customized message.
+ *
+ * @param fds
+ *   The fds argument is an array of fds sent with sendmsg.
+ *
+ * @param fds_in
+ *   The number of fds to be sent with sendmsg.
+ *
+ * @param peer
+ *   The peer argument identifies the process the reply is sent to.
+ *
+ * @return
+ *  - (1) on success;
+ *  - (0) on failure.
+ */
+int
+rte_eal_mp_reply(const char *action_name, const void *params,
+		 int len_p, int fds[], int fds_in, const void *peer);
+
+/**
  * Usage function typedef used by the application usage function.
  *
  * Use this function typedef to define and call rte_set_application_usage_hook()
diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
index 5dacde5..068ac0b 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -243,5 +243,7 @@ DPDK_18.02 {
 	rte_eal_mp_action_register;
 	rte_eal_mp_action_unregister;
 	rte_eal_mp_sendmsg;
+	rte_eal_mp_request;
+	rte_eal_mp_reply;
 
 } DPDK_17.11;
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 88+ messages in thread

* [PATCH v2 4/4] vfio: use the generic multi-process channel
  2018-01-11  4:07 ` [PATCH v2 0/4] " Jianfeng Tan
                     ` (2 preceding siblings ...)
  2018-01-11  4:07   ` [PATCH v2 3/4] eal: add synchronous multi-process communication Jianfeng Tan
@ 2018-01-11  4:07   ` Jianfeng Tan
  2018-01-13 14:03     ` Burakov, Anatoly
                       ` (3 more replies)
  3 siblings, 4 replies; 88+ messages in thread
From: Jianfeng Tan @ 2018-01-11  4:07 UTC (permalink / raw)
  To: dev
  Cc: anatoly.burakov, bruce.richardson, konstantin.ananyev, thomas,
	Jianfeng Tan

Previously, vfio used its own private channel for the secondary
process to get container fd and group fd from the primary process.

This patch changes to use the generic mp channel.

Test:
  1. Bind two NICs to vfio-pci.

  2. Start the primary and secondary process.
    $ (symmetric_mp) -c 2 -- -p 3 --num-procs=2 --proc-id=0
    $ (symmetric_mp) -c 4 --proc-type=auto -- -p 3 --num-procs=2 --proc-id=1

Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
---
 lib/librte_eal/linuxapp/eal/eal.c              |  14 +-
 lib/librte_eal/linuxapp/eal/eal_vfio.c         | 133 ++------
 lib/librte_eal/linuxapp/eal/eal_vfio.h         |  15 +-
 lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c | 409 ++++---------------------
 4 files changed, 94 insertions(+), 477 deletions(-)

diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index f231724..d4b45a2 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -713,18 +713,8 @@ static int rte_eal_vfio_setup(void)
 		return -1;
 	vfio_enabled = rte_vfio_is_enabled("vfio");
 
-	if (vfio_enabled) {
-
-		/* if we are primary process, create a thread to communicate with
-		 * secondary processes. the thread will use a socket to wait for
-		 * requests from secondary process to send open file descriptors,
-		 * because VFIO does not allow multiple open descriptors on a group or
-		 * VFIO container.
-		 */
-		if (internal_config.process_type == RTE_PROC_PRIMARY &&
-				vfio_mp_sync_setup() < 0)
-			return -1;
-	}
+	if (vfio_enabled && vfio_mp_sync_setup() < 0)
+		return -1;
 
 	return 0;
 }
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c b/lib/librte_eal/linuxapp/eal/eal_vfio.c
index 3036f60..2ff40f7 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.c
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.c
@@ -39,9 +39,11 @@ int
 vfio_get_group_fd(int iommu_group_no)
 {
 	int i;
+	int ret;
 	int vfio_group_fd;
 	char filename[PATH_MAX];
 	struct vfio_group *cur_grp;
+	struct vfio_mp_param p;
 
 	/* check if we already have the group descriptor open */
 	for (i = 0; i < VFIO_MAX_GROUPS; i++)
@@ -101,49 +103,21 @@ vfio_get_group_fd(int iommu_group_no)
 		return vfio_group_fd;
 	}
 	/* if we're in a secondary process, request group fd from the primary
-	 * process via our socket
+	 * process via mp channel
 	 */
-	else {
-		int socket_fd, ret;
-
-		socket_fd = vfio_mp_sync_connect_to_primary();
+	p.req = SOCKET_REQ_GROUP;
+	p.group_no = iommu_group_no;
 
-		if (socket_fd < 0) {
-			RTE_LOG(ERR, EAL, "  cannot connect to primary process!\n");
-			return -1;
-		}
-		if (vfio_mp_sync_send_request(socket_fd, SOCKET_REQ_GROUP) < 0) {
-			RTE_LOG(ERR, EAL, "  cannot request container fd!\n");
-			close(socket_fd);
-			return -1;
-		}
-		if (vfio_mp_sync_send_request(socket_fd, iommu_group_no) < 0) {
-			RTE_LOG(ERR, EAL, "  cannot send group number!\n");
-			close(socket_fd);
-			return -1;
-		}
-		ret = vfio_mp_sync_receive_request(socket_fd);
-		switch (ret) {
-		case SOCKET_NO_FD:
-			close(socket_fd);
-			return 0;
-		case SOCKET_OK:
-			vfio_group_fd = vfio_mp_sync_receive_fd(socket_fd);
-			/* if we got the fd, store it and return it */
-			if (vfio_group_fd > 0) {
-				close(socket_fd);
-				cur_grp->group_no = iommu_group_no;
-				cur_grp->fd = vfio_group_fd;
-				vfio_cfg.vfio_active_groups++;
-				return vfio_group_fd;
-			}
-			/* fall-through on error */
-		default:
-			RTE_LOG(ERR, EAL, "  cannot get container fd!\n");
-			close(socket_fd);
-			return -1;
-		}
+	vfio_group_fd = -1;
+	ret = rte_eal_mp_request("vfio", &p, sizeof(p), &vfio_group_fd, 0, 1);
+	if (ret > 0 && p.result == SOCKET_OK) {
+		cur_grp->group_no = iommu_group_no;
+		cur_grp->fd = vfio_group_fd;
+		vfio_cfg.vfio_active_groups++;
+		return vfio_group_fd;
 	}
+
+	RTE_LOG(ERR, EAL, "  cannot request group fd\n");
 	return -1;
 }
 
@@ -200,7 +174,8 @@ int
 clear_group(int vfio_group_fd)
 {
 	int i;
-	int socket_fd, ret;
+	int ret;
+	struct vfio_mp_param p;
 
 	if (internal_config.process_type == RTE_PROC_PRIMARY) {
 
@@ -214,43 +189,14 @@ clear_group(int vfio_group_fd)
 		return 0;
 	}
 
-	/* This is just for SECONDARY processes */
-	socket_fd = vfio_mp_sync_connect_to_primary();
-
-	if (socket_fd < 0) {
-		RTE_LOG(ERR, EAL, "  cannot connect to primary process!\n");
-		return -1;
-	}
-
-	if (vfio_mp_sync_send_request(socket_fd, SOCKET_CLR_GROUP) < 0) {
-		RTE_LOG(ERR, EAL, "  cannot request container fd!\n");
-		close(socket_fd);
-		return -1;
-	}
-
-	if (vfio_mp_sync_send_request(socket_fd, vfio_group_fd) < 0) {
-		RTE_LOG(ERR, EAL, "  cannot send group fd!\n");
-		close(socket_fd);
-		return -1;
-	}
+	p.req = SOCKET_CLR_GROUP;
+	p.group_no = vfio_group_fd;
 
-	ret = vfio_mp_sync_receive_request(socket_fd);
-	switch (ret) {
-	case SOCKET_NO_FD:
-		RTE_LOG(ERR, EAL, "  BAD VFIO group fd!\n");
-		close(socket_fd);
-		break;
-	case SOCKET_OK:
-		close(socket_fd);
+	ret = rte_eal_mp_request("vfio", &p, sizeof(p), NULL, 0, 0);
+	if (ret > 0 && p.result == SOCKET_OK)
 		return 0;
-	case SOCKET_ERR:
-		RTE_LOG(ERR, EAL, "  Socket error\n");
-		close(socket_fd);
-		break;
-	default:
-		RTE_LOG(ERR, EAL, "  UNKNOWN reply, %d\n", ret);
-		close(socket_fd);
-	}
+
+	RTE_LOG(ERR, EAL, "  BAD VFIO group fd!\n");
 	return -1;
 }
 
@@ -561,6 +507,7 @@ int
 vfio_get_container_fd(void)
 {
 	int ret, vfio_container_fd;
+	struct vfio_mp_param p;
 
 	/* if we're in a primary process, try to open the container */
 	if (internal_config.process_type == RTE_PROC_PRIMARY) {
@@ -591,33 +538,19 @@ vfio_get_container_fd(void)
 		}
 
 		return vfio_container_fd;
-	} else {
-		/*
-		 * if we're in a secondary process, request container fd from the
-		 * primary process via our socket
-		 */
-		int socket_fd;
+	}
+	/*
+	 * if we're in a secondary process, request container fd from the
+	 * primary process via mp channel
+	 */
+	p.req = SOCKET_REQ_CONTAINER;
 
-		socket_fd = vfio_mp_sync_connect_to_primary();
-		if (socket_fd < 0) {
-			RTE_LOG(ERR, EAL, "  cannot connect to primary process!\n");
-			return -1;
-		}
-		if (vfio_mp_sync_send_request(socket_fd, SOCKET_REQ_CONTAINER) < 0) {
-			RTE_LOG(ERR, EAL, "  cannot request container fd!\n");
-			close(socket_fd);
-			return -1;
-		}
-		vfio_container_fd = vfio_mp_sync_receive_fd(socket_fd);
-		if (vfio_container_fd < 0) {
-			RTE_LOG(ERR, EAL, "  cannot get container fd!\n");
-			close(socket_fd);
-			return -1;
-		}
-		close(socket_fd);
+	vfio_container_fd = -1;
+	ret = rte_eal_mp_request("vfio", &p, sizeof(p), &vfio_container_fd, 0, 1);
+	if (ret > 0 && p.result == SOCKET_OK)
 		return vfio_container_fd;
-	}
 
+	RTE_LOG(ERR, EAL, "  cannot request container fd\n");
 	return -1;
 }
 
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.h b/lib/librte_eal/linuxapp/eal/eal_vfio.h
index b34d5d0..a14b168 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.h
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.h
@@ -88,15 +88,6 @@ struct vfio_iommu_spapr_tce_info {
 #define VFIO_MAX_GROUPS 64
 
 /*
- * Function prototypes for VFIO multiprocess sync functions
- */
-int vfio_mp_sync_send_request(int socket, int req);
-int vfio_mp_sync_receive_request(int socket);
-int vfio_mp_sync_send_fd(int socket, int fd);
-int vfio_mp_sync_receive_fd(int socket);
-int vfio_mp_sync_connect_to_primary(void);
-
-/*
  * we don't need to store device fd's anywhere since they can be obtained from
  * the group fd via an ioctl() call.
  */
@@ -161,6 +152,12 @@ int vfio_mp_sync_setup(void);
 #define SOCKET_NO_FD 0x1
 #define SOCKET_ERR 0xFF
 
+struct vfio_mp_param {
+	int req;
+	int result;
+	int group_no;
+};
+
 #endif /* VFIO_PRESENT */
 
 #endif /* EAL_VFIO_H_ */
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c b/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
index 9b474dc..ea1a6a7 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
@@ -2,31 +2,13 @@
  * Copyright(c) 2010-2014 Intel Corporation
  */
 
-#include <string.h>
-#include <fcntl.h>
-#include <sys/socket.h>
-#include <pthread.h>
-
-/* sys/un.h with __USE_MISC uses strlen, which is unsafe */
-#ifdef __USE_MISC
-#define REMOVED_USE_MISC
-#undef __USE_MISC
-#endif
-#include <sys/un.h>
-/* make sure we redefine __USE_MISC only if it was previously undefined */
-#ifdef REMOVED_USE_MISC
-#define __USE_MISC
-#undef REMOVED_USE_MISC
-#endif
+#include <unistd.h>
 
 #include <rte_log.h>
-#include <rte_eal_memconfig.h>
-#include <rte_malloc.h>
 #include <rte_vfio.h>
+#include <rte_eal.h>
 
-#include "eal_filesystem.h"
 #include "eal_vfio.h"
-#include "eal_thread.h"
 
 /**
  * @file
@@ -37,360 +19,75 @@
 
 #ifdef VFIO_PRESENT
 
-#define SOCKET_PATH_FMT "%s/.%s_mp_socket"
-#define CMSGLEN (CMSG_LEN(sizeof(int)))
-#define FD_TO_CMSGHDR(fd, chdr) \
-		do {\
-			(chdr).cmsg_len = CMSGLEN;\
-			(chdr).cmsg_level = SOL_SOCKET;\
-			(chdr).cmsg_type = SCM_RIGHTS;\
-			memcpy((chdr).__cmsg_data, &(fd), sizeof(fd));\
-		} while (0)
-#define CMSGHDR_TO_FD(chdr, fd) \
-			memcpy(&(fd), (chdr).__cmsg_data, sizeof(fd))
-
-static pthread_t socket_thread;
-static int mp_socket_fd;
-
-
-/* get socket path (/var/run if root, $HOME otherwise) */
-static void
-get_socket_path(char *buffer, int bufsz)
-{
-	const char *dir = "/var/run";
-	const char *home_dir = getenv("HOME");
-
-	if (getuid() != 0 && home_dir != NULL)
-		dir = home_dir;
-
-	/* use current prefix as file path */
-	snprintf(buffer, bufsz, SOCKET_PATH_FMT, dir,
-			internal_config.hugefile_prefix);
-}
-
-
-
-/*
- * data flow for socket comm protocol:
- * 1. client sends SOCKET_REQ_CONTAINER or SOCKET_REQ_GROUP
- * 1a. in case of SOCKET_REQ_GROUP, client also then sends group number
- * 2. server receives message
- * 2a. in case of invalid group, SOCKET_ERR is sent back to client
- * 2b. in case of unbound group, SOCKET_NO_FD is sent back to client
- * 2c. in case of valid group, SOCKET_OK is sent and immediately followed by fd
- *
- * in case of any error, socket is closed.
- */
-
-/* send a request, return -1 on error */
-int
-vfio_mp_sync_send_request(int socket, int req)
-{
-	struct msghdr hdr;
-	struct iovec iov;
-	int buf;
-	int ret;
-
-	memset(&hdr, 0, sizeof(hdr));
-
-	buf = req;
-
-	hdr.msg_iov = &iov;
-	hdr.msg_iovlen = 1;
-	iov.iov_base = (char *) &buf;
-	iov.iov_len = sizeof(buf);
-
-	ret = sendmsg(socket, &hdr, 0);
-	if (ret < 0)
-		return -1;
-	return 0;
-}
-
-/* receive a request and return it */
-int
-vfio_mp_sync_receive_request(int socket)
-{
-	int buf;
-	struct msghdr hdr;
-	struct iovec iov;
-	int ret, req;
-
-	memset(&hdr, 0, sizeof(hdr));
-
-	buf = SOCKET_ERR;
-
-	hdr.msg_iov = &iov;
-	hdr.msg_iovlen = 1;
-	iov.iov_base = (char *) &buf;
-	iov.iov_len = sizeof(buf);
-
-	ret = recvmsg(socket, &hdr, 0);
-	if (ret < 0)
-		return -1;
-
-	req = buf;
-
-	return req;
-}
-
-/* send OK in message, fd in control message */
-int
-vfio_mp_sync_send_fd(int socket, int fd)
+static int
+vfio_mp_primary(const void *params,
+		int len,
+		int fds[] __rte_unused,
+		int fds_num __rte_unused,
+		const void *peer)
 {
-	int buf;
-	struct msghdr hdr;
-	struct cmsghdr *chdr;
-	char chdr_buf[CMSGLEN];
-	struct iovec iov;
+	int fd;
 	int ret;
+	int num = 0;
+	const struct vfio_mp_param *p = params;
+	struct vfio_mp_param r;
 
-	chdr = (struct cmsghdr *) chdr_buf;
-	memset(chdr, 0, sizeof(chdr_buf));
-	memset(&hdr, 0, sizeof(hdr));
-
-	hdr.msg_iov = &iov;
-	hdr.msg_iovlen = 1;
-	iov.iov_base = (char *) &buf;
-	iov.iov_len = sizeof(buf);
-	hdr.msg_control = chdr;
-	hdr.msg_controllen = CMSGLEN;
-
-	buf = SOCKET_OK;
-	FD_TO_CMSGHDR(fd, *chdr);
-
-	ret = sendmsg(socket, &hdr, 0);
-	if (ret < 0)
-		return -1;
-	return 0;
-}
-
-/* receive OK in message, fd in control message */
-int
-vfio_mp_sync_receive_fd(int socket)
-{
-	int buf;
-	struct msghdr hdr;
-	struct cmsghdr *chdr;
-	char chdr_buf[CMSGLEN];
-	struct iovec iov;
-	int ret, req, fd;
-
-	buf = SOCKET_ERR;
-
-	chdr = (struct cmsghdr *) chdr_buf;
-	memset(chdr, 0, sizeof(chdr_buf));
-	memset(&hdr, 0, sizeof(hdr));
-
-	hdr.msg_iov = &iov;
-	hdr.msg_iovlen = 1;
-	iov.iov_base = (char *) &buf;
-	iov.iov_len = sizeof(buf);
-	hdr.msg_control = chdr;
-	hdr.msg_controllen = CMSGLEN;
-
-	ret = recvmsg(socket, &hdr, 0);
-	if (ret < 0)
-		return -1;
-
-	req = buf;
-
-	if (req != SOCKET_OK)
-		return -1;
-
-	CMSGHDR_TO_FD(*chdr, fd);
-
-	return fd;
-}
-
-/* connect socket_fd in secondary process to the primary process's socket */
-int
-vfio_mp_sync_connect_to_primary(void)
-{
-	struct sockaddr_un addr;
-	socklen_t sockaddr_len;
-	int socket_fd;
-
-	/* set up a socket */
-	socket_fd = socket(AF_UNIX, SOCK_SEQPACKET, 0);
-	if (socket_fd < 0) {
-		RTE_LOG(ERR, EAL, "Failed to create socket!\n");
+	if (len != sizeof(*p)) {
+		RTE_LOG(ERR, EAL, "vfio received invalid message!\n");
 		return -1;
 	}
 
-	get_socket_path(addr.sun_path, sizeof(addr.sun_path));
-	addr.sun_family = AF_UNIX;
-
-	sockaddr_len = sizeof(struct sockaddr_un);
-
-	if (connect(socket_fd, (struct sockaddr *) &addr, sockaddr_len) == 0)
-		return socket_fd;
-
-	/* if connect failed */
-	close(socket_fd);
-	return -1;
-}
-
-
-
-/*
- * socket listening thread for primary process
- */
-static __attribute__((noreturn)) void *
-vfio_mp_sync_thread(void __rte_unused * arg)
-{
-	int ret, fd, vfio_data;
-
-	/* wait for requests on the socket */
-	for (;;) {
-		int conn_sock;
-		struct sockaddr_un addr;
-		socklen_t sockaddr_len = sizeof(addr);
-
-		/* this is a blocking call */
-		conn_sock = accept(mp_socket_fd, (struct sockaddr *) &addr,
-				&sockaddr_len);
-
-		/* just restart on error */
-		if (conn_sock == -1)
-			continue;
-
-		/* set socket to linger after close */
-		struct linger l;
-		l.l_onoff = 1;
-		l.l_linger = 60;
-
-		if (setsockopt(conn_sock, SOL_SOCKET, SO_LINGER, &l, sizeof(l)) < 0)
-			RTE_LOG(WARNING, EAL, "Cannot set SO_LINGER option "
-					"on listen socket (%s)\n", strerror(errno));
-
-		ret = vfio_mp_sync_receive_request(conn_sock);
-
-		switch (ret) {
-		case SOCKET_REQ_CONTAINER:
-			fd = vfio_get_container_fd();
-			if (fd < 0)
-				vfio_mp_sync_send_request(conn_sock, SOCKET_ERR);
-			else
-				vfio_mp_sync_send_fd(conn_sock, fd);
-			if (fd >= 0)
-				close(fd);
-			break;
-		case SOCKET_REQ_GROUP:
-			/* wait for group number */
-			vfio_data = vfio_mp_sync_receive_request(conn_sock);
-			if (vfio_data < 0) {
-				close(conn_sock);
-				continue;
-			}
-
-			fd = vfio_get_group_fd(vfio_data);
-
-			if (fd < 0)
-				vfio_mp_sync_send_request(conn_sock, SOCKET_ERR);
+	switch (p->req) {
+	case SOCKET_REQ_GROUP:
+		r.req = SOCKET_REQ_GROUP;
+		r.group_no = p->group_no;
+		fd = vfio_get_group_fd(p->group_no);
+		if (fd < 0)
+			r.result = SOCKET_ERR;
+		else if (fd == 0)
 			/* if VFIO group exists but isn't bound to VFIO driver */
-			else if (fd == 0)
-				vfio_mp_sync_send_request(conn_sock, SOCKET_NO_FD);
+			r.result = SOCKET_NO_FD;
+		else {
 			/* if group exists and is bound to VFIO driver */
-			else {
-				vfio_mp_sync_send_request(conn_sock, SOCKET_OK);
-				vfio_mp_sync_send_fd(conn_sock, fd);
-			}
-			break;
-		case SOCKET_CLR_GROUP:
-			/* wait for group fd */
-			vfio_data = vfio_mp_sync_receive_request(conn_sock);
-			if (vfio_data < 0) {
-				close(conn_sock);
-				continue;
-			}
-
-			ret = clear_group(vfio_data);
-
-			if (ret < 0)
-				vfio_mp_sync_send_request(conn_sock, SOCKET_NO_FD);
-			else
-				vfio_mp_sync_send_request(conn_sock, SOCKET_OK);
-			break;
-		default:
-			vfio_mp_sync_send_request(conn_sock, SOCKET_ERR);
-			break;
+			r.result = SOCKET_OK;
+			num = 1;
 		}
-		close(conn_sock);
-	}
-}
-
-static int
-vfio_mp_sync_socket_setup(void)
-{
-	int ret, socket_fd;
-	struct sockaddr_un addr;
-	socklen_t sockaddr_len;
-
-	/* set up a socket */
-	socket_fd = socket(AF_UNIX, SOCK_SEQPACKET, 0);
-	if (socket_fd < 0) {
-		RTE_LOG(ERR, EAL, "Failed to create socket!\n");
-		return -1;
-	}
-
-	get_socket_path(addr.sun_path, sizeof(addr.sun_path));
-	addr.sun_family = AF_UNIX;
-
-	sockaddr_len = sizeof(struct sockaddr_un);
-
-	unlink(addr.sun_path);
-
-	ret = bind(socket_fd, (struct sockaddr *) &addr, sockaddr_len);
-	if (ret) {
-		RTE_LOG(ERR, EAL, "Failed to bind socket: %s!\n", strerror(errno));
-		close(socket_fd);
-		return -1;
-	}
-
-	ret = listen(socket_fd, 50);
-	if (ret) {
-		RTE_LOG(ERR, EAL, "Failed to listen: %s!\n", strerror(errno));
-		close(socket_fd);
+		break;
+	case SOCKET_CLR_GROUP:
+		r.req = SOCKET_CLR_GROUP;
+		r.group_no = p->group_no;
+		if (clear_group(p->group_no) < 0)
+			r.result = SOCKET_NO_FD;
+		else
+			r.result = SOCKET_OK;
+		break;
+	case SOCKET_REQ_CONTAINER:
+		r.req = SOCKET_REQ_CONTAINER;
+		fd = vfio_get_container_fd();
+		if (fd < 0)
+			r.result = SOCKET_ERR;
+		else {
+			r.result = SOCKET_OK;
+			num = 1;
+		}
+		break;
+	default:
+		RTE_LOG(ERR, EAL, "vfio received invalid message!\n");
 		return -1;
 	}
 
-	/* save the socket in local configuration */
-	mp_socket_fd = socket_fd;
-
-	return 0;
+	ret = rte_eal_mp_reply("vfio", &r, sizeof(r), &fd, num, peer);
+	if (p->req == SOCKET_REQ_CONTAINER && num == 1)
+		close(fd);
+	return ret;
 }
 
-/*
- * set up a local socket and tell it to listen for incoming connections
- */
 int
 vfio_mp_sync_setup(void)
 {
-	int ret;
-	char thread_name[RTE_MAX_THREAD_NAME_LEN];
-
-	if (vfio_mp_sync_socket_setup() < 0) {
-		RTE_LOG(ERR, EAL, "Failed to set up local socket!\n");
-		return -1;
-	}
-
-	ret = pthread_create(&socket_thread, NULL,
-			vfio_mp_sync_thread, NULL);
-	if (ret) {
-		RTE_LOG(ERR, EAL,
-			"Failed to create thread for communication with secondary processes!\n");
-		close(mp_socket_fd);
-		return -1;
-	}
-
-	/* Set thread_name for aid in debugging. */
-	snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "vfio-sync");
-	ret = rte_thread_setname(socket_thread, thread_name);
-	if (ret)
-		RTE_LOG(DEBUG, EAL,
-			"Failed to set thread name for secondary processes!\n");
+	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+		return rte_eal_mp_action_register("vfio", vfio_mp_primary);
 
 	return 0;
 }
-
 #endif
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 88+ messages in thread

* Re: [PATCH v2 1/4] eal: add channel for multi-process communication
  2018-01-11  4:07   ` [PATCH v2 1/4] eal: add " Jianfeng Tan
@ 2018-01-13 12:57     ` Burakov, Anatoly
  2018-01-15 19:52     ` Ananyev, Konstantin
  1 sibling, 0 replies; 88+ messages in thread
From: Burakov, Anatoly @ 2018-01-13 12:57 UTC (permalink / raw)
  To: Jianfeng Tan, dev; +Cc: bruce.richardson, konstantin.ananyev, thomas

On 11-Jan-18 4:07 AM, Jianfeng Tan wrote:
> diff --git a/lib/librte_eal/common/eal_common_proc.c b/lib/librte_eal/common/eal_common_proc.c
> index 40fa982..d700e9e 100644

<snip>

> +
>   int
>   rte_eal_primary_proc_alive(const char *config_file_path)
>   {
> @@ -31,3 +75,347 @@ rte_eal_primary_proc_alive(const char *config_file_path)
>   
>   	return !!ret;
>   }
> +
> +static struct action_entry *
> +find_action_entry_by_name(const char *name)
> +{
> +	int len = strlen(name);

Why do strlen() here? You already have MAX_ACTION_NAME_LEN, strncmp will 
take care of the rest, no?

> +	struct action_entry *entry;
> +
> +	TAILQ_FOREACH(entry, &action_entry_list, next) {
> +		if (strncmp(entry->action_name, name, len) == 0)
> +			break;
> +	}
> +
> +	return entry;
> +}
> +
> +int
> +rte_eal_mp_action_register(const char *action_name, rte_eal_mp_t action)
> +{
> +	struct action_entry *entry = malloc(sizeof(struct action_entry));
> +
> +	if (entry == NULL) {
> +		rte_errno = -ENOMEM;
> +		return -1;
> +	}
> +
> +	if (strlen(action_name) > MAX_ACTION_NAME_LEN) {
> +		rte_errno = -E2BIG;
> +		return -1;
> +	}

strnlen perhaps? strnlen(action_name, MAX_ACTION_NAME_LEN) ==
MAX_ACTION_NAME_LEN will be an error condition, and unlike strlen you
won't have to scan the entire memory if your string was corrupted.

> +
> +	pthread_mutex_lock(&mp_mutex_action);
> +	if (find_action_entry_by_name(action_name) != NULL) {
> +		free(entry);
> +		rte_errno = -EEXIST;
> +		return -1;
> +	}
> +	strncpy(entry->action_name, action_name, MAX_ACTION_NAME_LEN);
> +	entry->action = action;
> +	TAILQ_INSERT_TAIL(&action_entry_list, entry, next);
> +	pthread_mutex_unlock(&mp_mutex_action);
> +	return 0;
> +}
> +
> +void
> +rte_eal_mp_action_unregister(const char *name)
> +{

name == NULL? You do a find_action_entry_by_name with it, which calls 
strlen, which IIRC segfaults on a NULL pointer. Also, maybe add a strlen 
(or better yet, strnlen) check here like in action_register, so that 
find_action_entry_by_name doesn't need to care about string lengths and 
can work off MAX_ACTION_NAME_LEN instead.

> +	struct action_entry *entry;
> +
> +	pthread_mutex_lock(&mp_mutex_action);
> +	entry = find_action_entry_by_name(name);

entry == NULL?

> +	TAILQ_REMOVE(&action_entry_list, entry, next);
> +	free(entry);
> +	pthread_mutex_unlock(&mp_mutex_action);
> +}
> +
> +static int
> +read_msg(int fd, char *buf, int buflen, int *fds, int fds_num)
> +{
> +	int ret;
> +	struct iovec iov;
> +	struct msghdr msgh;

<snip>

> +		return -1;
> +	}
> +
> +	params_len = len - sizeof(struct mp_msghdr);
> +	ret = entry->action(hdr->params, params_len, fds, hdr->fds_num);
> +	pthread_mutex_unlock(&mp_mutex_action);
> +	return ret;
> +

unnecessary newline.

> +}
> +
> +static void *
> +mp_handle(void *arg __rte_unused)
> +{
> +	int len;
> +	int fds[SCM_MAX_FD];
> +	char buf[MAX_MSG_LENGTH];
> +
> +	while (1) {

<snip>

> +		goto error;
> +	}
> +
> +	return 0;
> +error:
> +	close(mp_fd);
> +	mp_fd = -1;
> +	return -1;
> +}
> +
> +static inline struct mp_msghdr *
> +format_msg(const char *act_name, const void *p, int len_params, int fds_num)

The name is slightly misleading, as this function actually *creates* a 
message, not just formats it. create_msg? alloc_msg?

> +{
> +	int len_msg;
> +	struct mp_msghdr *msg;
> +
> +	len_msg = sizeof(struct mp_msghdr) + len_params;
> +	if (len_msg > MAX_MSG_LENGTH) {
> +		RTE_LOG(ERR, EAL, "Message is too long\n");
> +		rte_errno = -EINVAL;
> +		return NULL;
> +	}
> +
> +	msg = malloc(len_msg);
> +	if (!msg) {
> +		RTE_LOG(ERR, EAL, "Cannot alloc memory for msg\n");
> +		rte_errno = -ENOMEM;
> +		return NULL;
> +	}
> +	memset(msg, 0, len_msg);
> +	strcpy(msg->action_name, act_name);

strncpy?

> +	msg->fds_num = fds_num;
> +	msg->len_params = len_params;
> +	memcpy(msg->params, p, len_params);
> +	return msg;
> +}
> +
> +static int
> +send_msg(int fd, const char *dst_path, struct mp_msghdr *msg, int fds[])
> +{
> +	int ret;
> +	struct msghdr msgh;
> +	struct iovec iov;
> +	size_t fd_size = msg->fds_num * sizeof(int);

<snip>

> +	return mp_send(action_name, params, len_params, fds, fds_num);
> +}
> diff --git a/lib/librte_eal/common/eal_filesystem.h b/lib/librte_eal/common/eal_filesystem.h
> index e8959eb..e95399b 100644
> --- a/lib/librte_eal/common/eal_filesystem.h
> +++ b/lib/librte_eal/common/eal_filesystem.h
> @@ -38,6 +38,23 @@ eal_runtime_config_path(void)
>   	return buffer;
>   }
>   
> +/** Path of primary/secondary communication unix socket file. */
> +#define MP_UNIX_PATH_FMT "%s/.%s_unix"
> +static inline const char *
> +eal_mp_unix_path(void)

perhaps eal_mp_socket_path would've been more descriptive? API doesn't 
need to care what kind of socket it is.

> +{
> +	static char buffer[PATH_MAX]; /* static so auto-zeroed */
> +	const char *directory = default_config_dir;
> +	const char *home_dir = getenv("HOME");
> +
> +	if (getuid() != 0 && home_dir != NULL)
> +		directory = home_dir;

<snip>


-- 
Thanks,
Anatoly

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [PATCH v2 2/4] eal: add and del secondary processes in the primary
  2018-01-11  4:07   ` [PATCH v2 2/4] eal: add and del secondary processes in the primary Jianfeng Tan
@ 2018-01-13 13:11     ` Burakov, Anatoly
  2018-01-15 21:45     ` Ananyev, Konstantin
  1 sibling, 0 replies; 88+ messages in thread
From: Burakov, Anatoly @ 2018-01-13 13:11 UTC (permalink / raw)
  To: Jianfeng Tan, dev; +Cc: bruce.richardson, konstantin.ananyev, thomas

On 11-Jan-18 4:07 AM, Jianfeng Tan wrote:
> By the multi-process channel, we add an mp action named "proc".
> 
> As a secondary process starts, it sends a "proc add" message to
> the primary.
> 
> As the primary finds a failure in sending message to a specific
> secondary process, that secondary process is treated as exited;
> and we remove it from the secondary array by sending a "proc del"
> message to the primary itself.
> 
> Test:
>    1. Start the primary and the secondary process
>      $ (testpmd) -c 0x3 -n 4 -- -i
>      $ (helloworld) -c 0xc -n 4 --proc-type=auto --
> 
>    2. Check the log of testpmd:
>      ...
>      EAL: bind to /var/run/.rte_unix
>      ...
>      EAL: add secondary: /var/run/.testpmd_unix_(xxx)
>      ...
> 
>    3. Check the log of helloworld:
>      ...
>      EAL: bind to /var/run/.testpmd_unix_xxx
>      EAL: bind to /var/run/.testpmd_unix_c_xxx
>      ...

it says "unix" all over the place, but that's an internal implementation 
detail. "mp_socket" or similar should do, no?

> 
> Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
> ---
>   lib/librte_eal/common/eal_common_proc.c | 88 ++++++++++++++++++++++++++++++++-
>   1 file changed, 86 insertions(+), 2 deletions(-)
> 
> diff --git a/lib/librte_eal/common/eal_common_proc.c b/lib/librte_eal/common/eal_common_proc.c
> index d700e9e..70519cc 100644
> --- a/lib/librte_eal/common/eal_common_proc.c
> +++ b/lib/librte_eal/common/eal_common_proc.c
> @@ -54,6 +54,13 @@ struct mp_msghdr {
>   	char params[0];
>   } __rte_packed;
>   
> +struct proc_request {
> +#define MP_PROC_ADD	0
> +#define MP_PROC_DEL	1
> +	int type;
> +	char path[MAX_UNIX_PATH_LEN];
> +};
> +
>   int
>   rte_eal_primary_proc_alive(const char *config_file_path)
>   {
> @@ -214,6 +221,58 @@ mp_handle(void *arg __rte_unused)
>   	return NULL;
>   }
>   
> +static int
> +add_sec_proc(const char *path)
> +{
> +	int i;
> +
> +	for (i = 0; i < MAX_SECONDARY_PROCS; ++i)
> +		if (mp_sec_sockets[i] == NULL)
> +			break;
> +	if (i < MAX_SECONDARY_PROCS)
> +		mp_sec_sockets[i] = strdup(path);
> +
> +	return i < MAX_SECONDARY_PROCS;
> +}

While it's equivalent, the intent behind this isn't clear, it's 
needlessly complicating the more common idiom of

for (i = 0; i < MAX; i++) {}
if (i == MAX)
    return error;
do_something;
return success;

> +
> +static int
> +del_sec_proc(const char *path)
> +{
> +	int i;
> +
> +	for (i = 0; i < MAX_SECONDARY_PROCS; ++i) {
> +		if (!strcmp(mp_sec_sockets[i], path)) {
> +			free(mp_sec_sockets[i]);
> +			mp_sec_sockets[i] = NULL;
> +			break;
> +		}
> +	}
> +
> +	return i < MAX_SECONDARY_PROCS;
> +}

Same as above - maybe rewrite it as a more commonly used idiom. Also, 
you probably want to use strncmp(), and check for NULL pointers, IIRC 
strncmp(NULL) is undefined behavior.

> +
> +static int
> +mp_primary_proc(const void *params,
> +		int len __rte_unused,
> +		int fds[] __rte_unused,
> +		int fds_num __rte_unused)
> +{
> +	const struct proc_request *r = (const struct proc_request *)params;
> +
> +	switch (r->type) {
> +	case MP_PROC_ADD:
> +		RTE_LOG(INFO, EAL, "add secondary: %s\n", r->path);
> +		return add_sec_proc(r->path);
> +	case MP_PROC_DEL:
> +		RTE_LOG(INFO, EAL, "del secondary: %s\n", r->path);
> +		return del_sec_proc(r->path);
> +	default:
> +		RTE_LOG(ERR, EAL, "invalid type: %d\n", r->type);
> +	}
> +
> +	return -1;
> +}
> +
>   static inline const char *
>   get_unix_path(int is_server)
>   {
> @@ -267,6 +326,22 @@ rte_eal_mp_channel_init(void)
>   	if (mp_fd < 0)
>   		return -1;
>   
> +	if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
> +		if (rte_eal_mp_action_register("proc", mp_primary_proc) < 0) {
> +			RTE_LOG(ERR, EAL, "failed to register handler\n");
> +			goto error;
> +		}
> +	} else {
> +		struct proc_request r;
> +
> +		r.type = MP_PROC_ADD;
> +		snprintf(r.path, MAX_UNIX_PATH_LEN, "%s", get_unix_path(1));

Nitpicking, but maybe just send PID instead of the whole path? 
Primary/secondary share their prefix and most of their socket path 
anyway, so the real difference is the PID. This would also eliminate the 
need for using strings in many places.

> +		if (rte_eal_mp_sendmsg("proc", &r, sizeof(r), NULL, 0) < 0) {
> +			RTE_LOG(ERR, EAL, "failed to add into primary\n");
> +			goto error;
> +		}
> +	}
> +
>   	if (pthread_create(&tid, NULL, mp_handle, NULL) < 0) {
>   		RTE_LOG(ERR, EAL, "failed to create mp handle thead: %s\n",
>   			strerror(errno));
> @@ -354,10 +429,19 @@ send_msg(int fd, const char *dst_path, struct mp_msghdr *msg, int fds[])
>   	if (ret < 0) {
>   		RTE_LOG(ERR, EAL, "failed to send msg: %s\n", strerror(errno));
>   
> -		if (rte_eal_process_type() == RTE_PROC_PRIMARY)
> +		if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
> +			struct proc_request r;
> +
>   			RTE_LOG(ERR, EAL, "secondary process (%s) exited\n",
>   				dst_path);
> -		else if (!rte_eal_primary_proc_alive(NULL))
> +			r.type = MP_PROC_DEL;
> +			snprintf(r.path, MAX_UNIX_PATH_LEN, "%s", dst_path);
> +			if (rte_eal_mp_sendmsg("proc", &r,
> +						sizeof(r), NULL, 0) < 0)
> +				RTE_LOG(ERR, EAL,
> +					"failed to del secondary %s\n",
> +					dst_path);
> +		} else if (!rte_eal_primary_proc_alive(NULL))
>   			RTE_LOG(ERR, EAL, "primary process exited\n");
>   
>   		return 0;
> 


-- 
Thanks,
Anatoly

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [PATCH v2 3/4] eal: add synchronous multi-process communication
  2018-01-11  4:07   ` [PATCH v2 3/4] eal: add synchronous multi-process communication Jianfeng Tan
@ 2018-01-13 13:41     ` Burakov, Anatoly
  2018-01-16  0:00     ` Ananyev, Konstantin
  1 sibling, 0 replies; 88+ messages in thread
From: Burakov, Anatoly @ 2018-01-13 13:41 UTC (permalink / raw)
  To: Jianfeng Tan, dev; +Cc: bruce.richardson, konstantin.ananyev, thomas

On 11-Jan-18 4:07 AM, Jianfeng Tan wrote:
> ---
>   lib/librte_eal/common/eal_common_proc.c | 144 +++++++++++++++++++++++++++++---
>   lib/librte_eal/common/include/rte_eal.h |  73 +++++++++++++++-
>   lib/librte_eal/rte_eal_version.map      |   2 +
>   3 files changed, 206 insertions(+), 13 deletions(-)
> 
> diff --git a/lib/librte_eal/common/eal_common_proc.c b/lib/librte_eal/common/eal_common_proc.c
> index 70519cc..f194a52 100644
> --- a/lib/librte_eal/common/eal_common_proc.c
> +++ b/lib/librte_eal/common/eal_common_proc.c
> @@ -32,6 +32,7 @@
>   static int mp_fd = -1;
>   static char *mp_sec_sockets[MAX_SECONDARY_PROCS];
>   static pthread_mutex_t mp_mutex_action = PTHREAD_MUTEX_INITIALIZER;
> +static pthread_mutex_t mp_mutex_request = PTHREAD_MUTEX_INITIALIZER;
>   
>   struct action_entry {
>   	TAILQ_ENTRY(action_entry) next;      /**< Next attached action entry */
> @@ -49,6 +50,10 @@ static struct action_entry_list action_entry_list =
>   
>   struct mp_msghdr {
>   	char action_name[MAX_ACTION_NAME_LEN];
> +#define MP_MSG	0 /* Share message with peers, will not block */
> +#define MP_REQ	1 /* Request for information, Will block for a reply */
> +#define MP_REP	2 /* Reply to previously-received request */

nitpicking, but... response instead of reply?

> +	int type;
>   	int fds_num;
>   	int len_params;
>   	char params[0];
> @@ -138,7 +143,8 @@ rte_eal_mp_action_unregister(const char *name)
>   }
>   
>   static int
> -read_msg(int fd, char *buf, int buflen, int *fds, int fds_num)
> +read_msg(int fd, char *buf, int buflen,
> +	 int *fds, int fds_num, struct sockaddr_un *s)

<snip>

> +	return mp_send(action_name, params, len_params,
> +			fds, fds_num, MP_MSG, NULL);
> +}
> +
> +int
> +rte_eal_mp_request(const char *action_name,
> +		   void *params,
> +		   int len_p,
> +		   int fds[],
> +		   int fds_in,
> +		   int fds_out)

name == NULL? name too long?

> +{
> +	int i, j;
> +	int sockfd;
> +	int nprocs;
> +	int ret = 0;
> +	struct mp_msghdr *req;
> +	struct timeval tv;
> +	char buf[MAX_MSG_LENGTH];
> +	struct mp_msghdr *hdr;
> +
> +	RTE_LOG(DEBUG, EAL, "request: %s\n", action_name);
> +
> +	if (fds_in > SCM_MAX_FD || fds_out > SCM_MAX_FD) {
> +		RTE_LOG(ERR, EAL, "Cannot send more than %d FDs\n", SCM_MAX_FD);
> +		rte_errno = -E2BIG;

(this also applies to previous patches) you set rte_errno to -EINVAL in 
format_msg when message with parameters is too big - should that be 
setting -E2BIG as well? Also, maybe not set rte_errno in multiple 
places, and put all parameter checking (or at least errno setting) in 
rte_eal_mp_* functions?

> +		return 0;
> +	}
> +
> +	req = format_msg(action_name, params, len_p, fds_in, MP_REQ);
> +	if (req == NULL)
> +		return 0;
> +
> +	if ((sockfd = open_unix_fd(0)) < 0) {
> +		free(req);
> +		return 0;
> +	}
> +
> +	tv.tv_sec = 5;  /* 5 Secs Timeout */
> +	tv.tv_usec = 0;
> +	if (setsockopt(sockfd, SOL_SOCKET, SO_RCVTIMEO,
> +			(const void *)&tv, sizeof(struct timeval)) < 0)
> +		RTE_LOG(INFO, EAL, "Failed to set recv timeout\n");
> +
> +	/* Only allow one req at a time */
> +	pthread_mutex_lock(&mp_mutex_request);
> +
> +	if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
> +		nprocs = 0;
> +		for (i = 0; i < MAX_SECONDARY_PROCS; ++i)

What follows is a bit confusing, some comments explaining what happens 
and maybe more informative variable names would've been helpful.

> +			if (!mp_sec_sockets[i]) {
> +				j = i;
> +				nprocs++;
> +			}
> +
> +		if (nprocs > 1) {
> +			RTE_LOG(ERR, EAL,
> +				"multi secondary processes not supported\n");
> +			goto free_and_ret;
> +		}
> +

<snip>


-- 
Thanks,
Anatoly

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [PATCH v2 4/4] vfio: use the generic multi-process channel
  2018-01-11  4:07   ` [PATCH v2 4/4] vfio: use the generic multi-process channel Jianfeng Tan
@ 2018-01-13 14:03     ` Burakov, Anatoly
  2018-03-04 14:57     ` [PATCH v5] vfio: change to use " Jianfeng Tan
                       ` (2 subsequent siblings)
  3 siblings, 0 replies; 88+ messages in thread
From: Burakov, Anatoly @ 2018-01-13 14:03 UTC (permalink / raw)
  To: Jianfeng Tan, dev; +Cc: bruce.richardson, konstantin.ananyev, thomas

On 11-Jan-18 4:07 AM, Jianfeng Tan wrote:

<snip>

> -			}
> -			/* fall-through on error */
> -		default:
> -			RTE_LOG(ERR, EAL, "  cannot get container fd!\n");
> -			close(socket_fd);
> -			return -1;
> -		}
> +	vfio_group_fd = -1;
> +	ret = rte_eal_mp_request("vfio", &p, sizeof(p), &vfio_group_fd, 0, 1);
> +	if (ret > 0 && p.result == SOCKET_OK) {

Thanks, this looks much clearer than the previous revision! In an 
ideal world we would've been able to have separate request and response 
(as it's perfectly possible to imagine a situation where the request 
would be small but the response would be huge), but for now this works 
as well. Maybe put this API down under EXPERIMENTAL tag? (btw wasn't 
this official policy now?)

> +		cur_grp->group_no = iommu_group_no;
> +		cur_grp->fd = vfio_group_fd;
> +		vfio_cfg.vfio_active_groups++;
> +		return vfio_group_fd;
>   	}
> +
> +	RTE_LOG(ERR, EAL, "  cannot request group fd\n");
>   	return -1;

check for SOCKET_NO_FD? Previously, that branch returned 0, now it will 
return -1.

>   }
>   
> @@ -200,7 +174,8 @@ int
>   clear_group(int vfio_group_fd)
>   {
>   	int i;
> -	int socket_fd, ret;
> +	int ret;
> +	struct vfio_mp_param p;
>   
>   	if (internal_config.process_type == RTE_PROC_PRIMARY) {
>   
> @@ -214,43 +189,14 @@ clear_group(int vfio_group_fd)
>   		return 0;
>   	}
>   
> -	/* This is just for SECONDARY processes */
> -	socket_fd = vfio_mp_sync_connect_to_primary();
> -
> -	if (socket_fd < 0) {
> -		RTE_LOG(ERR, EAL, "  cannot connect to primary process!\n");
> -		return -1;
> -	}
> -
> -	if (vfio_mp_sync_send_request(socket_fd, SOCKET_CLR_GROUP) < 0) {
> -		RTE_LOG(ERR, EAL, "  cannot request container fd!\n");
> -		close(socket_fd);
> -		return -1;
> -	}
> -
> -	if (vfio_mp_sync_send_request(socket_fd, vfio_group_fd) < 0) {
> -		RTE_LOG(ERR, EAL, "  cannot send group fd!\n");
> -		close(socket_fd);
> -		return -1;
> -	}
> +	p.req = SOCKET_CLR_GROUP;
> +	p.group_no = vfio_group_fd;
>   
> -	ret = vfio_mp_sync_receive_request(socket_fd);
> -	switch (ret) {
> -	case SOCKET_NO_FD:
> -		RTE_LOG(ERR, EAL, "  BAD VFIO group fd!\n");
> -		close(socket_fd);
> -		break;
> -	case SOCKET_OK:
> -		close(socket_fd);
> +	ret = rte_eal_mp_request("vfio", &p, sizeof(p), NULL, 0, 0);
> +	if (ret > 0 && p.result == SOCKET_OK)
>   		return 0;
> -	case SOCKET_ERR:
> -		RTE_LOG(ERR, EAL, "  Socket error\n");
> -		close(socket_fd);
> -		break;
> -	default:
> -		RTE_LOG(ERR, EAL, "  UNKNOWN reply, %d\n", ret);
> -		close(socket_fd);
> -	}
> +
> +	RTE_LOG(ERR, EAL, "  BAD VFIO group fd!\n");

The error message lumps together two cases - bad VFIO group fd, and a 
socket error.

>   	return -1;
>   }
>   
> @@ -561,6 +507,7 @@ int
>   vfio_get_container_fd(void)
>   {
>   	int ret, vfio_container_fd;
> +	struct vfio_mp_param p;
>   
>   	/* if we're in a primary process, try to open the container */
>   	if (internal_config.process_type == RTE_PROC_PRIMARY) {
> @@ -591,33 +538,19 @@ vfio_get_container_fd(void)
>   		}

<snip>


-- 
Thanks,
Anatoly

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [PATCH v2 1/4] eal: add channel for multi-process communication
  2018-01-11  4:07   ` [PATCH v2 1/4] eal: add " Jianfeng Tan
  2018-01-13 12:57     ` Burakov, Anatoly
@ 2018-01-15 19:52     ` Ananyev, Konstantin
  1 sibling, 0 replies; 88+ messages in thread
From: Ananyev, Konstantin @ 2018-01-15 19:52 UTC (permalink / raw)
  To: Tan, Jianfeng, dev; +Cc: Burakov, Anatoly, Richardson, Bruce, thomas

Hi Jianfeng,

> 
> Previously, there are three channels for multi-process
> (i.e., primary/secondary) communication.
>   1. Config-file based channel, in which, the primary process writes
>      info into a pre-defined config file, and the secondary process
>      reads the info out.
>   2. vfio submodule has its own channel based on unix socket for the
>      secondary process to get container fd and group fd from the
>      primary process.
>   3. pdump submodule also has its own channel based on unix socket for
>      packet dump.
> 
> It'd be good to have a generic communication channel for multi-process
> communication to accommodate the requirements including:
>   a. Secondary wants to send info to primary, for example, secondary
>      would like to send request (about some specific vdev to primary).
>   b. Sending info at any time, instead of just initialization time.
>   c. Share FDs with the other side, for vdev like vhost, related FDs
>      (memory region, kick) should be shared.
>   d. A send message request needs the other side to respond immediately.
> 
> This patch proposes to create a communication channel, based on datagram
> unix socket, for above requirements. Each process will block on a unix
> socket waiting for messages from the peers.
> 
> Three new APIs are added:
> 
>   1. rte_eal_mp_action_register() is used to register an action,
>      indexed by a string, when a component at receiver side would like
>      to respond to the messages from the peer processes.
>   2. rte_eal_mp_action_unregister() is used to unregister the action
>      if the calling component does not want to respond to the messages.
>   3. rte_eal_mp_sendmsg() is used to send a message, and returns
>      immediately. If there are 1:n primary:secondary processes, the
>      primary process will send n messages.
> 
> Suggested-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
> Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
> ---
>  lib/librte_eal/common/eal_common_proc.c | 388 ++++++++++++++++++++++++++++++++
>  lib/librte_eal/common/eal_filesystem.h  |  17 ++
>  lib/librte_eal/common/eal_private.h     |  10 +
>  lib/librte_eal/common/include/rte_eal.h |  69 ++++++
>  lib/librte_eal/linuxapp/eal/eal.c       |   8 +
>  lib/librte_eal/rte_eal_version.map      |   9 +
>  6 files changed, 501 insertions(+)
> 
> diff --git a/lib/librte_eal/common/eal_common_proc.c b/lib/librte_eal/common/eal_common_proc.c
> index 40fa982..d700e9e 100644
> --- a/lib/librte_eal/common/eal_common_proc.c
> +++ b/lib/librte_eal/common/eal_common_proc.c
> @@ -5,11 +5,55 @@
>  #include <stdio.h>
>  #include <fcntl.h>
>  #include <stdlib.h>
> +#include <sys/types.h>
> +#include <sys/socket.h>
> +#include <limits.h>
> +#include <unistd.h>
> +#include <sys/un.h>
> +#include <errno.h>
> +#include <pthread.h>
> +
> +#include <rte_log.h>
>  #include <rte_eal.h>
> +#include <rte_errno.h>
> +#include <rte_lcore.h>
> +#include <rte_common.h>
> 
> +#include "eal_private.h"
>  #include "eal_filesystem.h"
>  #include "eal_internal_cfg.h"
> 
> +#define MAX_SECONDARY_PROCS	8
> +#define MAX_ACTION_NAME_LEN	64
> +#define MAX_UNIX_PATH_LEN	104

Why do you need this?
Why not just PATH_MAX?

> +#define MAX_MSG_LENGTH		1024
> +#define SCM_MAX_FD		253 /* The max amount of fds */
> +
> +static int mp_fd = -1;
> +static char *mp_sec_sockets[MAX_SECONDARY_PROCS];

Who will init it and why it could be only 8?

> +static pthread_mutex_t mp_mutex_action = PTHREAD_MUTEX_INITIALIZER;
> +
> +struct action_entry {
> +	TAILQ_ENTRY(action_entry) next;      /**< Next attached action entry */
> +
> +#define MAX_ACTION_NAME_LEN	64
> +	char action_name[MAX_ACTION_NAME_LEN];
> +	rte_eal_mp_t action;
> +};
> +
> +/** Double linked list of actions. */
> +TAILQ_HEAD(action_entry_list, action_entry);
> +
> +static struct action_entry_list action_entry_list =
> +	TAILQ_HEAD_INITIALIZER(action_entry_list);
> +
> +struct mp_msghdr {
> +	char action_name[MAX_ACTION_NAME_LEN];
> +	int fds_num;
> +	int len_params;
> +	char params[0];
> +} __rte_packed;
> +
>  int
>  rte_eal_primary_proc_alive(const char *config_file_path)
>  {
> @@ -31,3 +75,347 @@ rte_eal_primary_proc_alive(const char *config_file_path)
> 
>  	return !!ret;
>  }
> +
> +static struct action_entry *
> +find_action_entry_by_name(const char *name)
> +{
> +	int len = strlen(name);
> +	struct action_entry *entry;
> +
> +	TAILQ_FOREACH(entry, &action_entry_list, next) {
> +		if (strncmp(entry->action_name, name, len) == 0)

I think it has to be just strcmp() here.


> +			break;
> +	}
> +
> +	return entry;
> +}
> +
> +int
> +rte_eal_mp_action_register(const char *action_name, rte_eal_mp_t action)
> +{
> +	struct action_entry *entry = malloc(sizeof(struct action_entry));
> +
> +	if (entry == NULL) {
> +		rte_errno = -ENOMEM;
> +		return -1;
> +	}
> +
> +	if (strlen(action_name) > MAX_ACTION_NAME_LEN) {

No space for '\0' left.
either >= MAX_ACTION_NAME_LEN, or make entry.name[MAX_ACTION_NAME_LEN + 1];
Even better just 
 - allocate new action_entry.
if (snprintf(action->name, "%s", action_name) >= sizeof(action->name)) {
    free(action);
    return -E2BIG;  
} 


> +		rte_errno = -E2BIG;
> +		return -1;
> +	}
> +
> +	pthread_mutex_lock(&mp_mutex_action);
> +	if (find_action_entry_by_name(action_name) != NULL) {
> +		free(entry);

Forgot to do mutex_unlock().

> +		rte_errno = -EEXIST;
> +		return -1;
> +	}
> +	strncpy(entry->action_name, action_name, MAX_ACTION_NAME_LEN);
> +	entry->action = action;
> +	TAILQ_INSERT_TAIL(&action_entry_list, entry, next);
> +	pthread_mutex_unlock(&mp_mutex_action);
> +	return 0;
> +}
> +
> +void
> +rte_eal_mp_action_unregister(const char *name)
> +{
> +	struct action_entry *entry;
> +
> +	pthread_mutex_lock(&mp_mutex_action);
> +	entry = find_action_entry_by_name(name);
> +	TAILQ_REMOVE(&action_entry_list, entry, next);
> +	free(entry);

Better to do free() after releasing the mutex.

> +	pthread_mutex_unlock(&mp_mutex_action);
> +}
> +
> +static int
> +read_msg(int fd, char *buf, int buflen, int *fds, int fds_num)
> +{
> +	int ret;
> +	struct iovec iov;
> +	struct msghdr msgh;
> +	size_t fdsize = fds_num * sizeof(int);
> +	char control[CMSG_SPACE(fdsize)];
> +	struct cmsghdr *cmsg;
> +
> +	memset(&msgh, 0, sizeof(msgh));
> +	iov.iov_base = buf;
> +	iov.iov_len  = buflen;
> +
> +	msgh.msg_iov = &iov;
> +	msgh.msg_iovlen = 1;
> +	msgh.msg_control = control;
> +	msgh.msg_controllen = sizeof(control);
> +
> +	ret = recvmsg(fd, &msgh, 0);
> +	if (ret < 0) {
> +		RTE_LOG(ERR, EAL, "recvmsg failed, %s\n", strerror(errno));
> +		return -1;
> +	}
> +
> +	if (msgh.msg_flags & (MSG_TRUNC | MSG_CTRUNC)) {
> +		RTE_LOG(ERR, EAL, "truncted msg\n");
> +		return -1;
> +	}
> +
> +	/* read auxiliary FDs if any */
> +	for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL;
> +		cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
> +		if ((cmsg->cmsg_level == SOL_SOCKET) &&
> +			(cmsg->cmsg_type == SCM_RIGHTS)) {
> +			memcpy(fds, CMSG_DATA(cmsg), fdsize);
> +			break;
> +		}
> +	}
> +
> +	return ret;
> +}
> +
> +static int
> +process_msg(struct mp_msghdr *hdr, int len, int fds[])
> +{
> +	int ret;
> +	int params_len;
> +	struct action_entry *entry;
> +
> +	RTE_LOG(DEBUG, EAL, "msg: %s\n", hdr->action_name);
> +
> +	pthread_mutex_lock(&mp_mutex_action);
> +	entry = find_action_entry_by_name(hdr->action_name);
> +	if (entry == NULL) {
> +		RTE_LOG(ERR, EAL, "cannot find action by: %s\n",
> +			hdr->action_name);
> +		pthread_mutex_unlock(&mp_mutex_action);
> +		return -1;

If no action is specified for that message - who will free it?
If action() exists, is it the responsibility of action() to free msg?

> +	}
> +
> +	params_len = len - sizeof(struct mp_msghdr);
> +	ret = entry->action(hdr->params, params_len, fds, hdr->fds_num);

Do you really need to do action() with lock held?

> +	pthread_mutex_unlock(&mp_mutex_action);
> +	return ret;
> +
> +}
> +
> +static void *

Why not just 'void' here?

> +mp_handle(void *arg __rte_unused)
> +{
> +	int len;
> +	int fds[SCM_MAX_FD];
> +	char buf[MAX_MSG_LENGTH];
> +
> +	while (1) {
> +		len = read_msg(mp_fd, buf, MAX_MSG_LENGTH, fds, SCM_MAX_FD);
> +		if (len > 0)
> +			process_msg((struct mp_msghdr *)buf, len, fds);
> +	}
> +
> +	return NULL;
> +}
> +
> +static inline const char *
> +get_unix_path(int is_server)
> +{
> +	static char unix_path[MAX_UNIX_PATH_LEN];

PATH_MAX?

Why not just make that function accept char path[PATH_MAX] as a parameter?

> +	const char *prefix = eal_mp_unix_path();
> +	const char *suffix = (is_server) ? "" : "_c";
> +
> +	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
> +		snprintf(unix_path, MAX_UNIX_PATH_LEN, "%s%s", prefix, suffix);
> +	else
> +		snprintf(unix_path, MAX_UNIX_PATH_LEN, "%s%s_%d",
> +			 prefix, suffix, getpid());
> +	return unix_path;
> +}
> +
> +static int
> +open_unix_fd(int is_server)
> +{
> +	int fd;
> +	struct sockaddr_un un;
> +
> +	fd = socket(AF_UNIX, SOCK_DGRAM, 0);
> +	if (fd < 0) {
> +		RTE_LOG(ERR, EAL, "failed to create unix socket\n");
> +		return -1;
> +	}
> +
> +	memset(&un, 0, sizeof(un));
> +	un.sun_family = AF_UNIX;
> +	snprintf(un.sun_path, MAX_UNIX_PATH_LEN, "%s",
> +		 get_unix_path(is_server));
> +	unlink(un.sun_path); /* May still exist since last run */
> +	if (bind(fd, (struct sockaddr *)&un, sizeof(un)) < 0) {
> +		RTE_LOG(ERR, EAL, "failed to bind %s: %s\n",
> +			un.sun_path, strerror(errno));
> +		close(fd);
> +		return -1;
> +	}
> +
> +	RTE_LOG(INFO, EAL, "bind to %s\n", un.sun_path);
> +	return fd;
> +}
> +
> +int
> +rte_eal_mp_channel_init(void)
> +{
> +	pthread_t tid;
> +	char thread_name[RTE_MAX_THREAD_NAME_LEN];
> +
> +	mp_fd = open_unix_fd(1);
> +	if (mp_fd < 0)
> +		return -1;
> +
> +	if (pthread_create(&tid, NULL, mp_handle, NULL) < 0) {
> +		RTE_LOG(ERR, EAL, "failed to create mp handle thead: %s\n",
> +			strerror(errno));
> +		goto error;
> +	}
> +
> +	snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "rte_mp_handle");
> +	if (rte_thread_setname(tid, thread_name) < 0) {
> +		RTE_LOG(ERR, EAL, "failed to set thead name\n");

Forgot to terminate thread?

> +		goto error;

As a nit - can we reorder code a bit to avoid 'goto's?

> +	}
> +
> +	return 0;
> +error:
> +	close(mp_fd);
> +	mp_fd = -1;
> +	return -1;
> +}
> +
> +static inline struct mp_msghdr *
> +format_msg(const char *act_name, const void *p, int len_params, int fds_num)
> +{
> +	int len_msg;
> +	struct mp_msghdr *msg;
> +
> +	len_msg = sizeof(struct mp_msghdr) + len_params;
> +	if (len_msg > MAX_MSG_LENGTH) {
> +		RTE_LOG(ERR, EAL, "Message is too long\n");
> +		rte_errno = -EINVAL;
> +		return NULL;
> +	}
> +
> +	msg = malloc(len_msg);
> +	if (!msg) {
> +		RTE_LOG(ERR, EAL, "Cannot alloc memory for msg\n");
> +		rte_errno = -ENOMEM;
> +		return NULL;
> +	}
> +	memset(msg, 0, len_msg);
> +	strcpy(msg->action_name, act_name);
> +	msg->fds_num = fds_num;
> +	msg->len_params = len_params;
> +	memcpy(msg->params, p, len_params);
> +	return msg;
> +}
> +
> +static int
> +send_msg(int fd, const char *dst_path, struct mp_msghdr *msg, int fds[])
> +{
> +	int ret;
> +	struct msghdr msgh;
> +	struct iovec iov;
> +	size_t fd_size = msg->fds_num * sizeof(int);
> +	char control[CMSG_SPACE(fd_size)];
> +	struct cmsghdr *cmsg;
> +	struct sockaddr_un dst;
> +
> +	memset(&dst, 0, sizeof(dst));
> +	dst.sun_family = AF_UNIX;
> +	snprintf(dst.sun_path, MAX_UNIX_PATH_LEN, "%s", dst_path);
> +
> +	memset(&msgh, 0, sizeof(msgh));
> +	memset(control, 0, sizeof(control));
> +
> +	iov.iov_base = (uint8_t *)msg;
> +	iov.iov_len = sizeof(struct mp_msghdr) + msg->len_params;
> +
> +	msgh.msg_name = &dst;
> +	msgh.msg_namelen = sizeof(dst);
> +	msgh.msg_iov = &iov;
> +	msgh.msg_iovlen = 1;
> +	msgh.msg_control = control;
> +	msgh.msg_controllen = sizeof(control);
> +
> +	cmsg = CMSG_FIRSTHDR(&msgh);
> +	cmsg->cmsg_len = CMSG_LEN(fd_size);
> +	cmsg->cmsg_level = SOL_SOCKET;
> +	cmsg->cmsg_type = SCM_RIGHTS;
> +	memcpy(CMSG_DATA(cmsg), fds, fd_size);
> +
> +	do {
> +		ret = sendmsg(fd, &msgh, 0);
> +	} while (ret < 0 && errno == EINTR);
> +
> +	if (ret < 0) {
> +		RTE_LOG(ERR, EAL, "failed to send msg: %s\n", strerror(errno));
> +
> +		if (rte_eal_process_type() == RTE_PROC_PRIMARY)
> +			RTE_LOG(ERR, EAL, "secondary process (%s) exited\n",
> +				dst_path);
> +		else if (!rte_eal_primary_proc_alive(NULL))
> +			RTE_LOG(ERR, EAL, "primary process exited\n");

So secondary to secondary are not allowed?

> +
> +		return 0;
> +	}
> +
> +	return 1;
> +}
> +
> +static int
> +mp_send(const char *action_name,
> +	const void *params,
> +	int len_params,
> +	int fds[],
> +	int fds_num)
> +{
> +	int i;
> +	int n = 0;
> +	int sockfd;
> +	struct mp_msghdr *msg;
> +
> +	if (fds_num > SCM_MAX_FD) {
> +		RTE_LOG(ERR, EAL, "Cannot send more than %d FDs\n", SCM_MAX_FD);
> +		rte_errno = -E2BIG;
> +		return 0;
> +	}
> +
> +	msg = format_msg(action_name, params, len_params, fds_num);
> +	if (msg == NULL)
> +		return 0;
> +
> +	if ((sockfd = open_unix_fd(0)) < 0) {
> +		free(msg);
> +		return 0;
> +	}
> +
> +	if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
> +		/* broadcast to all secondaries */
> +		for (i = 0; i < MAX_SECONDARY_PROCS; ++i) {
> +			if (mp_sec_sockets[i] == NULL)
> +				continue;
> +
> +			n += send_msg(sockfd, mp_sec_sockets[i], msg, fds);
> +		}
> +	} else
> +		n += send_msg(sockfd, eal_mp_unix_path(), msg, fds);
> +
> +	free(msg);
> +	close(sockfd);
> +	return n;
> +}
> +
> +int
> +rte_eal_mp_sendmsg(const char *action_name,
> +		   const void *params,
> +		   int len_params,
> +		   int fds[],
> +		   int fds_num)
> +{
> +	RTE_LOG(DEBUG, EAL, "sendmsg: %s\n", action_name);
> +	return mp_send(action_name, params, len_params, fds, fds_num);
> +}
> diff --git a/lib/librte_eal/common/eal_filesystem.h b/lib/librte_eal/common/eal_filesystem.h
> index e8959eb..e95399b 100644
> --- a/lib/librte_eal/common/eal_filesystem.h
> +++ b/lib/librte_eal/common/eal_filesystem.h
> @@ -38,6 +38,23 @@ eal_runtime_config_path(void)
>  	return buffer;
>  }
> 
> +/** Path of primary/secondary communication unix socket file. */
> +#define MP_UNIX_PATH_FMT "%s/.%s_unix"
> +static inline const char *
> +eal_mp_unix_path(void)
> +{
> +	static char buffer[PATH_MAX]; /* static so auto-zeroed */
> +	const char *directory = default_config_dir;
> +	const char *home_dir = getenv("HOME");
> +
> +	if (getuid() != 0 && home_dir != NULL)
> +		directory = home_dir;
> +	snprintf(buffer, sizeof(buffer) - 1, MP_UNIX_PATH_FMT,
> +		 directory, internal_config.hugefile_prefix);
> +
> +	return buffer;
> +}
> +
>  /** Path of hugepage info file. */
>  #define HUGEPAGE_INFO_FMT "%s/.%s_hugepage_info"
> 
> diff --git a/lib/librte_eal/common/eal_private.h b/lib/librte_eal/common/eal_private.h
> index c46dd8f..e36e3b5 100644
> --- a/lib/librte_eal/common/eal_private.h
> +++ b/lib/librte_eal/common/eal_private.h
> @@ -195,4 +195,14 @@ int rte_eal_hugepage_attach(void);
>   */
>  struct rte_bus *rte_bus_find_by_device_name(const char *str);
> 
> +/**
> + * Create the unix channel for primary/secondary communication.
> + *
> + * @return
> + *   0 on success;
> + *   (<0) on failure.
> + */
> +
> +int rte_eal_mp_channel_init(void);
> +
>  #endif /* _EAL_PRIVATE_H_ */
> diff --git a/lib/librte_eal/common/include/rte_eal.h b/lib/librte_eal/common/include/rte_eal.h
> index 02fa109..9884c0b 100644
> --- a/lib/librte_eal/common/include/rte_eal.h
> +++ b/lib/librte_eal/common/include/rte_eal.h
> @@ -186,6 +186,75 @@ int rte_eal_init(int argc, char **argv);
>  int rte_eal_primary_proc_alive(const char *config_file_path);
> 
>  /**
> + * Action function typedef used by other components.
> + *
> + * As we create unix socket channel for primary/secondary communication, use
> + * this function typedef to register action for coming messages.
> + */
> +typedef int (*rte_eal_mp_t)(const void *params, int len,
> +			    int fds[], int fds_num);
> +
> +/**
> + * Register an action function for primary/secondary communication.
> + *
> + * Call this function to register an action, if the calling component wants
> + * to response the messages from the corresponding component in its primary
> + * process or secondary processes.
> + *
> + * @param action_name
> + *   The action_name argument plays as the nonredundant key to find the action.
> + *
> + * @param action
> + *   The action argument is the function pointer to the action function.
> + *
> + * @return
> + *  - 0 on success.
> + *  - (<0) on failure.
> + */
> +int rte_eal_mp_action_register(const char *action_name, rte_eal_mp_t action);
> +
> +/**
> + * Unregister an action function for primary/secondary communication.
> + *
> + * Call this function to unregister an action  if the calling component does
> + * not want to response the messages from the corresponding component in its
> + * primary process or secondary processes.
> + *
> + * @param action_name
> + *   The action_name argument plays as the nonredundant key to find the action.
> + *
> + */
> +void rte_eal_mp_action_unregister(const char *name);
> +
> +/**
> + * Send a message to the peer process.
> + *
> + * This function will send a message which will be responsed by the action
> + * identified by action_name of the process on the other side.
> + *
> + * @param action_name
> + *   The action_name argument is used to identify which action will be used.
> + *
> + * @param params
> + *   The params argument contains the customized message.
> + *
> + * @param len_params
> + *   The len_params argument is the length of the customized message.
> + *
> + * @param fds
> + *   The fds argument is an array of fds sent with sendmsg.
> + *
> + * @param fds_num
> + *   The fds_num argument is number of fds to be sent with sendmsg.
> + *
> + * @return
> + *  - Returns the number of messages being sent successfully.
> + */
> +int
> +rte_eal_mp_sendmsg(const char *action_name, const void *params,
> +		   int len_params, int fds[], int fds_num);
> +
> +/**
>   * Usage function typedef used by the application usage function.
>   *
>   * Use this function typedef to define and call rte_set_application_usage_hook()
> diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
> index 229eec9..f231724 100644
> --- a/lib/librte_eal/linuxapp/eal/eal.c
> +++ b/lib/librte_eal/linuxapp/eal/eal.c
> @@ -896,6 +896,14 @@ rte_eal_init(int argc, char **argv)
> 
>  	eal_check_mem_on_local_socket();
> 
> +	if (rte_eal_mp_channel_init() < 0) {
> +		rte_eal_init_alert("failed to init mp channel\n");
> +		if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
> +			rte_errno = EFAULT;
> +			return -1;
> +		}
> +	}
> +
>  	eal_thread_init_master(rte_config.master_lcore);
> 
>  	ret = eal_thread_dump_affinity(cpuset, RTE_CPU_AFFINITY_STR_LEN);
> diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
> index f4f46c1..5dacde5 100644
> --- a/lib/librte_eal/rte_eal_version.map
> +++ b/lib/librte_eal/rte_eal_version.map
> @@ -235,4 +235,13 @@ EXPERIMENTAL {
>  	rte_service_set_stats_enable;
>  	rte_service_start_with_defaults;
> 
> +} DPDK_17.08;
> +
> +DPDK_18.02 {
> +	global:
> +
> +	rte_eal_mp_action_register;
> +	rte_eal_mp_action_unregister;
> +	rte_eal_mp_sendmsg;
> +
>  } DPDK_17.11;
> --
> 2.7.4

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [PATCH v2 2/4] eal: add and del secondary processes in the primary
  2018-01-11  4:07   ` [PATCH v2 2/4] eal: add and del secondary processes in the primary Jianfeng Tan
  2018-01-13 13:11     ` Burakov, Anatoly
@ 2018-01-15 21:45     ` Ananyev, Konstantin
  1 sibling, 0 replies; 88+ messages in thread
From: Ananyev, Konstantin @ 2018-01-15 21:45 UTC (permalink / raw)
  To: Tan, Jianfeng, dev; +Cc: Burakov, Anatoly, Richardson, Bruce, thomas



> -----Original Message-----
> From: Tan, Jianfeng
> Sent: Thursday, January 11, 2018 4:08 AM
> To: dev@dpdk.org
> Cc: Burakov, Anatoly <anatoly.burakov@intel.com>; Richardson, Bruce <bruce.richardson@intel.com>; Ananyev, Konstantin
> <konstantin.ananyev@intel.com>; thomas@monjalon.net; Tan, Jianfeng <jianfeng.tan@intel.com>
> Subject: [PATCH v2 2/4] eal: add and del secondary processes in the primary
> 
> By the multi-process channel, we add an mp action named "proc".
> 
> As a secondary process starts, it sends a "proc add" message to
> the primary.
> 
> As the primary finds a failure in sending message to a specific
> secondary process, that secondary process is treated as exited;
> and we remove it from the secondary array by sending a "proc del"
> message to the primary itself.
> 
> Test:
>   1. Start the primary and the secondary process
>     $ (testpmd) -c 0x3 -n 4 -- -i
>     $ (helloworld) -c 0xc -n 4 --proc-type=auto --
> 
>   2. Check the log of testpmd:
>     ...
>     EAL: bind to /var/run/.rte_unix
>     ...
>     EAL: add secondary: /var/run/.testpmd_unix_(xxx)
>     ...
> 
>   3. Check the log of helloworld:
>     ...
>     EAL: bind to /var/run/.testpmd_unix_xxx
>     EAL: bind to /var/run/.testpmd_unix_c_xxx
>     ...
> 
> Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
> ---
>  lib/librte_eal/common/eal_common_proc.c | 88 ++++++++++++++++++++++++++++++++-
>  1 file changed, 86 insertions(+), 2 deletions(-)
> 
> diff --git a/lib/librte_eal/common/eal_common_proc.c b/lib/librte_eal/common/eal_common_proc.c
> index d700e9e..70519cc 100644
> --- a/lib/librte_eal/common/eal_common_proc.c
> +++ b/lib/librte_eal/common/eal_common_proc.c
> @@ -54,6 +54,13 @@ struct mp_msghdr {
>  	char params[0];
>  } __rte_packed;
> 
> +struct proc_request {
> +#define MP_PROC_ADD	0
> +#define MP_PROC_DEL	1
> +	int type;
> +	char path[MAX_UNIX_PATH_LEN];
> +};
> +
>  int
>  rte_eal_primary_proc_alive(const char *config_file_path)
>  {
> @@ -214,6 +221,58 @@ mp_handle(void *arg __rte_unused)
>  	return NULL;
>  }
> 
> +static int
> +add_sec_proc(const char *path)
> +{
> +	int i;
> +
> +	for (i = 0; i < MAX_SECONDARY_PROCS; ++i)
> +		if (mp_sec_sockets[i] == NULL)
> +			break;
> +	if (i < MAX_SECONDARY_PROCS)
> +		mp_sec_sockets[i] = strdup(path);
> +
> +	return i < MAX_SECONDARY_PROCS;
> +}
> +
> +static int
> +del_sec_proc(const char *path)
> +{
> +	int i;
> +
> +	for (i = 0; i < MAX_SECONDARY_PROCS; ++i) {
> +		if (!strcmp(mp_sec_sockets[i], path)) {
> +			free(mp_sec_sockets[i]);
> +			mp_sec_sockets[i] = NULL;
> +			break;
> +		}
> +	}
> +
> +	return i < MAX_SECONDARY_PROCS;
> +}


I am not sure we really need all these add/del messages and mp_sec_sockets[]...
For broadcast - why can't we just scan the contents of our home dir for all open client sockets
and send a message to each socket found?
Konstantin

> +
> +static int
> +mp_primary_proc(const void *params,
> +		int len __rte_unused,
> +		int fds[] __rte_unused,
> +		int fds_num __rte_unused)
> +{
> +	const struct proc_request *r = (const struct proc_request *)params;
> +
> +	switch (r->type) {
> +	case MP_PROC_ADD:
> +		RTE_LOG(INFO, EAL, "add secondary: %s\n", r->path);
> +		return add_sec_proc(r->path);
> +	case MP_PROC_DEL:
> +		RTE_LOG(INFO, EAL, "del secondary: %s\n", r->path);
> +		return del_sec_proc(r->path);
> +	default:
> +		RTE_LOG(ERR, EAL, "invalid type: %d\n", r->type);
> +	}
> +
> +	return -1;
> +}
> +
>  static inline const char *
>  get_unix_path(int is_server)
>  {
> @@ -267,6 +326,22 @@ rte_eal_mp_channel_init(void)
>  	if (mp_fd < 0)
>  		return -1;
> 
> +	if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
> +		if (rte_eal_mp_action_register("proc", mp_primary_proc) < 0) {
> +			RTE_LOG(ERR, EAL, "failed to register handler\n");
> +			goto error;
> +		}
> +	} else {
> +		struct proc_request r;
> +
> +		r.type = MP_PROC_ADD;
> +		snprintf(r.path, MAX_UNIX_PATH_LEN, "%s", get_unix_path(1));
> +		if (rte_eal_mp_sendmsg("proc", &r, sizeof(r), NULL, 0) < 0) {
> +			RTE_LOG(ERR, EAL, "failed to add into primary\n");
> +			goto error;
> +		}
> +	}
> +
>  	if (pthread_create(&tid, NULL, mp_handle, NULL) < 0) {
>  		RTE_LOG(ERR, EAL, "failed to create mp handle thead: %s\n",
>  			strerror(errno));
> @@ -354,10 +429,19 @@ send_msg(int fd, const char *dst_path, struct mp_msghdr *msg, int fds[])
>  	if (ret < 0) {
>  		RTE_LOG(ERR, EAL, "failed to send msg: %s\n", strerror(errno));
> 
> -		if (rte_eal_process_type() == RTE_PROC_PRIMARY)
> +		if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
> +			struct proc_request r;
> +
>  			RTE_LOG(ERR, EAL, "secondary process (%s) exited\n",
>  				dst_path);
> -		else if (!rte_eal_primary_proc_alive(NULL))
> +			r.type = MP_PROC_DEL;
> +			snprintf(r.path, MAX_UNIX_PATH_LEN, "%s", dst_path);
> +			if (rte_eal_mp_sendmsg("proc", &r,
> +						sizeof(r), NULL, 0) < 0)
> +				RTE_LOG(ERR, EAL,
> +					"failed to del secondary %s\n",
> +					dst_path);
> +		} else if (!rte_eal_primary_proc_alive(NULL))
>  			RTE_LOG(ERR, EAL, "primary process exited\n");
> 
>  		return 0;
> --
> 2.7.4

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [PATCH v2 3/4] eal: add synchronous multi-process communication
  2018-01-11  4:07   ` [PATCH v2 3/4] eal: add synchronous multi-process communication Jianfeng Tan
  2018-01-13 13:41     ` Burakov, Anatoly
@ 2018-01-16  0:00     ` Ananyev, Konstantin
  2018-01-16  8:10       ` Tan, Jianfeng
  1 sibling, 1 reply; 88+ messages in thread
From: Ananyev, Konstantin @ 2018-01-16  0:00 UTC (permalink / raw)
  To: Tan, Jianfeng, dev; +Cc: Burakov, Anatoly, Richardson, Bruce, thomas



> 
> We need the synchronous way for multi-process communication,
> i.e., blockingly waiting for reply message when we send a request
> to the peer process.
> 
> We add two APIs rte_eal_mp_request() and rte_eal_mp_reply() for
> such use case. By invoking rte_eal_mp_request(), a request message
> is sent out, and then it waits there for a reply message. The
> timeout is hard-coded 5 Sec. And the replied message will be copied
> in the parameters of this API so that the caller can decide how
> to translate those information (including params and fds). Note
> if a primary process owns multiple secondary processes, this API
> will fail.
> 
> The API rte_eal_mp_reply() is always called by an mp action handler.
> Here we add another parameter for rte_eal_mp_t so that the action
> handler knows which peer address to reply.
> 
> We use mutex in rte_eal_mp_request() to guarantee that only one
> request is on the fly for one pair of processes.

You don't need to do things in such a strange and restrictive way.
Instead you can do something like this:
1) Introduce a new struct, a list for it, and a mutex:
 struct sync_request {
      int reply_received;
      char dst[PATH_MAX];
      char reply[...];
      LIST_ENTRY(sync_request) next;
};

static struct {
    LIST_HEAD(list, sync_request);
    pthread_mutex_t lock;
    pthread_cond_t cond;
} sync_requests;

2) then at request() call:
  Grab sync_requests.lock
  Check whether we already have a pending request for that destination.
  If yes - then release the lock and return with an error.
  - allocate and init a new sync_request struct, set reply_received=0
  - do send_msg()
  - then in a cycle:
  pthread_cond_timedwait(&sync_requests.cond, &sync_requests.lock, &timespec);
  - at return from it check if sync_request.reply_received == 1; if not,
check if the timeout has expired and either return a failure or go to the start of the cycle.

3) at mp_handler() if REPLY received - grab sync_requests.lock,
    search through sync_requests.list for dst[],
   if found, then set its reply_received=1, copy the received message into reply
   and call pthread_cond_broadcast(&sync_requests.cond);
  
> 
> Suggested-by: Anatoly Burakov <anatoly.burakov@intel.com>
> Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
> ---
>  lib/librte_eal/common/eal_common_proc.c | 144 +++++++++++++++++++++++++++++---
>  lib/librte_eal/common/include/rte_eal.h |  73 +++++++++++++++-
>  lib/librte_eal/rte_eal_version.map      |   2 +
>  3 files changed, 206 insertions(+), 13 deletions(-)
> 
> diff --git a/lib/librte_eal/common/eal_common_proc.c b/lib/librte_eal/common/eal_common_proc.c
> index 70519cc..f194a52 100644
> --- a/lib/librte_eal/common/eal_common_proc.c
> +++ b/lib/librte_eal/common/eal_common_proc.c
> @@ -32,6 +32,7 @@
>  static int mp_fd = -1;
>  static char *mp_sec_sockets[MAX_SECONDARY_PROCS];
>  static pthread_mutex_t mp_mutex_action = PTHREAD_MUTEX_INITIALIZER;
> +static pthread_mutex_t mp_mutex_request = PTHREAD_MUTEX_INITIALIZER;
> 
>  struct action_entry {
>  	TAILQ_ENTRY(action_entry) next;      /**< Next attached action entry */
> @@ -49,6 +50,10 @@ static struct action_entry_list action_entry_list =
> 
>  struct mp_msghdr {
>  	char action_name[MAX_ACTION_NAME_LEN];
> +#define MP_MSG	0 /* Share message with peers, will not block */
> +#define MP_REQ	1 /* Request for information, Will block for a reply */
> +#define MP_REP	2 /* Reply to previously-received request */

As a nit - please use an enum {} instead of the above macros.


> +	int type;
>  	int fds_num;
>  	int len_params;
>  	char params[0];
> @@ -138,7 +143,8 @@ rte_eal_mp_action_unregister(const char *name)
>  }
> 
>  static int
> -read_msg(int fd, char *buf, int buflen, int *fds, int fds_num)
> +read_msg(int fd, char *buf, int buflen,
> +	 int *fds, int fds_num, struct sockaddr_un *s)
>  {
>  	int ret;
>  	struct iovec iov;
> @@ -151,6 +157,8 @@ read_msg(int fd, char *buf, int buflen, int *fds, int fds_num)
>  	iov.iov_base = buf;
>  	iov.iov_len  = buflen;
> 
> +	msgh.msg_name = s;
> +	msgh.msg_namelen = sizeof(*s);
>  	msgh.msg_iov = &iov;
>  	msgh.msg_iovlen = 1;
>  	msgh.msg_control = control;
> @@ -181,7 +189,7 @@ read_msg(int fd, char *buf, int buflen, int *fds, int fds_num)
>  }
> 
>  static int
> -process_msg(struct mp_msghdr *hdr, int len, int fds[])
> +process_msg(struct mp_msghdr *hdr, int len, int fds[], struct sockaddr_un *s)
>  {
>  	int ret;
>  	int params_len;
> @@ -199,10 +207,10 @@ process_msg(struct mp_msghdr *hdr, int len, int fds[])
>  	}
> 
>  	params_len = len - sizeof(struct mp_msghdr);
> -	ret = entry->action(hdr->params, params_len, fds, hdr->fds_num);
> +	ret = entry->action(hdr->params, params_len,
> +			    fds, hdr->fds_num, s->sun_path);
>  	pthread_mutex_unlock(&mp_mutex_action);
>  	return ret;
> -
>  }
> 
>  static void *
> @@ -211,11 +219,12 @@ mp_handle(void *arg __rte_unused)
>  	int len;
>  	int fds[SCM_MAX_FD];
>  	char buf[MAX_MSG_LENGTH];
> +	struct sockaddr_un sa;
> 
>  	while (1) {
> -		len = read_msg(mp_fd, buf, MAX_MSG_LENGTH, fds, SCM_MAX_FD);
> +		len = read_msg(mp_fd, buf, MAX_MSG_LENGTH, fds, SCM_MAX_FD, &sa);
>  		if (len > 0)
> -			process_msg((struct mp_msghdr *)buf, len, fds);
> +			process_msg((struct mp_msghdr *)buf, len, fds, &sa);
>  	}
> 
>  	return NULL;
> @@ -255,7 +264,8 @@ static int
>  mp_primary_proc(const void *params,
>  		int len __rte_unused,
>  		int fds[] __rte_unused,
> -		int fds_num __rte_unused)
> +		int fds_num __rte_unused,
> +		const void *peer __rte_unused)
>  {
>  	const struct proc_request *r = (const struct proc_request *)params;
> 
> @@ -362,7 +372,8 @@ rte_eal_mp_channel_init(void)
>  }
> 
>  static inline struct mp_msghdr *
> -format_msg(const char *act_name, const void *p, int len_params, int fds_num)
> +format_msg(const char *act_name, const void *p,
> +	   int len_params, int fds_num, int type)
>  {
>  	int len_msg;
>  	struct mp_msghdr *msg;
> @@ -384,6 +395,7 @@ format_msg(const char *act_name, const void *p, int len_params, int fds_num)
>  	strcpy(msg->action_name, act_name);
>  	msg->fds_num = fds_num;
>  	msg->len_params = len_params;
> +	msg->type = type;
>  	memcpy(msg->params, p, len_params);
>  	return msg;
>  }
> @@ -455,7 +467,9 @@ mp_send(const char *action_name,
>  	const void *params,
>  	int len_params,
>  	int fds[],
> -	int fds_num)
> +	int fds_num,
> +	int type,
> +	const void *peer)
>  {
>  	int i;
>  	int n = 0;
> @@ -468,7 +482,7 @@ mp_send(const char *action_name,
>  		return 0;
>  	}
> 
> -	msg = format_msg(action_name, params, len_params, fds_num);
> +	msg = format_msg(action_name, params, len_params, fds_num, type);
>  	if (msg == NULL)
>  		return 0;
> 
> @@ -477,6 +491,11 @@ mp_send(const char *action_name,
>  		return 0;
>  	}
> 
> +	if (peer) {
> +		n += send_msg(sockfd, peer, msg, fds);
> +		goto ret;
> +	}
> +
>  	if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
>  		/* broadcast to all secondaries */
>  		for (i = 0; i < MAX_SECONDARY_PROCS; ++i) {
> @@ -488,6 +507,7 @@ mp_send(const char *action_name,
>  	} else
>  		n += send_msg(sockfd, eal_mp_unix_path(), msg, fds);
> 
> +ret:
>  	free(msg);
>  	close(sockfd);
>  	return n;
> @@ -501,5 +521,107 @@ rte_eal_mp_sendmsg(const char *action_name,
>  		   int fds_num)
>  {
>  	RTE_LOG(DEBUG, EAL, "sendmsg: %s\n", action_name);
> -	return mp_send(action_name, params, len_params, fds, fds_num);
> +	return mp_send(action_name, params, len_params,
> +			fds, fds_num, MP_MSG, NULL);
> +}
> +
> +int
> +rte_eal_mp_request(const char *action_name,
> +		   void *params,
> +		   int len_p,
> +		   int fds[],
> +		   int fds_in,
> +		   int fds_out)
> +{
> +	int i, j;
> +	int sockfd;
> +	int nprocs;
> +	int ret = 0;
> +	struct mp_msghdr *req;
> +	struct timeval tv;
> +	char buf[MAX_MSG_LENGTH];
> +	struct mp_msghdr *hdr;
> +
> +	RTE_LOG(DEBUG, EAL, "request: %s\n", action_name);
> +
> +	if (fds_in > SCM_MAX_FD || fds_out > SCM_MAX_FD) {
> +		RTE_LOG(ERR, EAL, "Cannot send more than %d FDs\n", SCM_MAX_FD);
> +		rte_errno = -E2BIG;
> +		return 0;
> +	}
> +
> +	req = format_msg(action_name, params, len_p, fds_in, MP_REQ);
> +	if (req == NULL)
> +		return 0;
> +
> +	if ((sockfd = open_unix_fd(0)) < 0) {
> +		free(req);
> +		return 0;
> +	}
> +
> +	tv.tv_sec = 5;  /* 5 Secs Timeout */
> +	tv.tv_usec = 0;
> +	if (setsockopt(sockfd, SOL_SOCKET, SO_RCVTIMEO,
> +			(const void *)&tv, sizeof(struct timeval)) < 0)
> +		RTE_LOG(INFO, EAL, "Failed to set recv timeout\n");
> +
> +	/* Only allow one req at a time */
> +	pthread_mutex_lock(&mp_mutex_request);
> +
> +	if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
> +		nprocs = 0;
> +		for (i = 0; i < MAX_SECONDARY_PROCS; ++i)
> +			if (!mp_sec_sockets[i]) {
> +				j = i;
> +				nprocs++;
> +			}
> +
> +		if (nprocs > 1) {
> +			RTE_LOG(ERR, EAL,
> +				"multi secondary processes not supported\n");
> +			goto free_and_ret;
> +		}
> +
> +		ret = send_msg(sockfd, mp_sec_sockets[j], req, fds);
> +	} else
> +		ret = send_msg(sockfd, eal_mp_unix_path(), req, fds);
> +
> +	if (ret == 0) {
> +		RTE_LOG(ERR, EAL, "failed to send request: %s\n", action_name);
> +		ret = -1;
> +		goto free_and_ret;
> +	}
> +
> +	ret = read_msg(sockfd, buf, MAX_MSG_LENGTH, fds, fds_out, NULL);
> +	if (ret > 0) {
> +		hdr = (struct mp_msghdr *)buf;
> +		if (hdr->len_params == len_p)
> +			memcpy(params, hdr->params, len_p);
> +		else {
> +			RTE_LOG(ERR, EAL, "invalid reply\n");
> +			ret = 0;
> +		}
> +	}
> +
> +free_and_ret:
> +	free(req);
> +	close(sockfd);
> +	pthread_mutex_unlock(&mp_mutex_request);
> +	return ret;
> +}
> +
> +int
> +rte_eal_mp_reply(const char *action_name,
> +		 const void *params,
> +		 int len_p,
> +		 int fds[],
> +		 int fds_in,
> +		 const void *peer)
> +{
> +	RTE_LOG(DEBUG, EAL, "reply: %s\n", action_name);
> +	if (peer == NULL) {
> +		RTE_LOG(ERR, EAL, "peer is not specified\n");
> +		return 0;
> +	}
> +	return mp_send(action_name, params, len_p, fds, fds_in, MP_REP, peer);
>  }
> diff --git a/lib/librte_eal/common/include/rte_eal.h b/lib/librte_eal/common/include/rte_eal.h
> index 9884c0b..2690a77 100644
> --- a/lib/librte_eal/common/include/rte_eal.h
> +++ b/lib/librte_eal/common/include/rte_eal.h
> @@ -192,7 +192,7 @@ int rte_eal_primary_proc_alive(const char *config_file_path);
>   * this function typedef to register action for coming messages.
>   */
>  typedef int (*rte_eal_mp_t)(const void *params, int len,
> -			    int fds[], int fds_num);
> +			    int fds[], int fds_num, const void *peer);
> 
>  /**
>   * Register an action function for primary/secondary communication.
> @@ -245,7 +245,7 @@ void rte_eal_mp_action_unregister(const char *name);
>   *   The fds argument is an array of fds sent with sendmsg.
>   *
>   * @param fds_num
> - *   The fds_num argument is number of fds to be sent with sendmsg.
> + *   The number of fds to be sent with sendmsg.
>   *
>   * @return
>   *  - Returns the number of messages being sent successfully.
> @@ -255,6 +255,75 @@ rte_eal_mp_sendmsg(const char *action_name, const void *params,
>  		   int len_params, int fds[], int fds_num);
> 
>  /**
> + * Send a request to the peer process and expect a reply.
> + *
> + * This function sends a request message to the peer process, and will
> + * block until receiving reply message from the peer process. Note:
> + * this does not work for the primary process sending requests to its
> + * multiple (>1) secondary processes.
> + *
> + * @param action_name
> + *   The action_name argument is used to identify which action will be used.
> + *
> + * @param params
> + *   The params argument contains the customized message; as the reply is
> + *   received, the replied params will be copied to this pointer.
> + *
> + * @param len_p
> + *   The length of the customized message.
> + *
> + * @param fds
> + *   The fds argument is an array of fds sent with sendmsg; as the reply
> + *   is received, the replied fds will be copied into this array.
> + *
> + * @param fds_in
> + *   The number of fds to be sent.
> + *
> + * @param fds_out
> + *   The number of fds to be received.
> + *
> + * @return
> + *  - (1) on success;
> + *  - (0) on sending request successfully but no valid reply received.
> + *  - (<0) on failing to sending request.
> + */
> +int
> +rte_eal_mp_request(const char *action_name, void *params,
> +		   int len_p, int fds[], int fds_in, int fds_out);
> +
> +/**
> + * Send a reply to the peer process.
> + *
> + * This function will send a reply message in response to a request message
> + * received previously.
> + *
> + * @param action_name
> + *   The action_name argument is used to identify which action will be used.
> + *
> + * @param params
> + *   The params argument contains the customized message.
> + *
> + * @param len_p
> + *   The length of the customized message.
> + *
> + * @param fds
> + *   The fds argument is an array of fds sent with sendmsg.
> + *
> + * @param fds_in
> + *   The number of fds to be sent with sendmsg.
> + *
> + * @param peer
> + *   The peer argument is the address of the peer process to reply to, as passed to the action handler.
> + *
> + * @return
> + *  - (1) on success;
> + *  - (0) on failure.
> + */
> +int
> +rte_eal_mp_reply(const char *action_name, const void *params,
> +		 int len_p, int fds[], int fds_in, const void *peer);
> +
> +/**
>   * Usage function typedef used by the application usage function.
>   *
>   * Use this function typedef to define and call rte_set_application_usage_hook()
> diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
> index 5dacde5..068ac0b 100644
> --- a/lib/librte_eal/rte_eal_version.map
> +++ b/lib/librte_eal/rte_eal_version.map
> @@ -243,5 +243,7 @@ DPDK_18.02 {
>  	rte_eal_mp_action_register;
>  	rte_eal_mp_action_unregister;
>  	rte_eal_mp_sendmsg;
> +	rte_eal_mp_request;
> +	rte_eal_mp_reply;
> 
>  } DPDK_17.11;
> --
> 2.7.4

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [PATCH v2 3/4] eal: add synchronous multi-process communication
  2018-01-16  0:00     ` Ananyev, Konstantin
@ 2018-01-16  8:10       ` Tan, Jianfeng
  2018-01-16 11:12         ` Ananyev, Konstantin
  0 siblings, 1 reply; 88+ messages in thread
From: Tan, Jianfeng @ 2018-01-16  8:10 UTC (permalink / raw)
  To: Ananyev, Konstantin, dev, Burakov, Anatoly; +Cc: Richardson, Bruce, thomas

First of all, thank you, Konstantin and Anatoly. The other comments are well
received and I'll send out a new version.


On 1/16/2018 8:00 AM, Ananyev, Konstantin wrote:
>
>> We need the synchronous way for multi-process communication,
>> i.e., blockingly waiting for reply message when we send a request
>> to the peer process.
>>
>> We add two APIs rte_eal_mp_request() and rte_eal_mp_reply() for
>> such use case. By invoking rte_eal_mp_request(), a request message
>> is sent out, and then it waits there for a reply message. The
>> timeout is hard-coded 5 Sec. And the replied message will be copied
>> in the parameters of this API so that the caller can decide how
>> to translate those information (including params and fds). Note
>> if a primary process owns multiple secondary processes, this API
>> will fail.
>>
>> The API rte_eal_mp_reply() is always called by an mp action handler.
>> Here we add another parameter for rte_eal_mp_t so that the action
>> handler knows which peer address to reply.
>>
>> We use mutex in rte_eal_mp_request() to guarantee that only one
>> request is on the fly for one pair of processes.
> You don't need to do things in such strange and restrictive way.
> Instead you can do something like that:
> 1) Introduce new struct, list for it and mutex
>   struct sync_request {
>        int reply_received;
>        char dst[PATH_MAX];
>        char reply[...];
>        LIST_ENTRY(sync_request) next;
> };
>
> static struct
>      LIST_HEAD(list, sync_request);
>      pthread_mutex_t lock;
>     pthead_cond_t cond;
> } sync_requests;
>
> 2) then at request() call:
>    Grab sync_requests.lock
>    Check do we already have a pending request for that destination,
>    If yes - the release the lock and returns with error.
>    - allocate and init new sync_request struct, set reply_received=0
>    - do send_msg()
>    -then in a cycle:
>    pthread_cond_timed_wait(&sync_requests.cond, &sync_request.lock, &timespec);
>    - at return from it check if sync_request.reply_received == 1, if not
> check if timeout expired and either return a failure or go to the start of the cycle.
>
> 3) at mp_handler() if REPLY received - grab sync_request.lock,
>      search through sync_requests.list for dst[] ,
>     if found, then set it's reply_received=1, copy the received message into reply
>     and call pthread_cond_braodcast((&sync_requests.cond);

The only benefit I can see is that the sender can now send requests to
multiple receivers at the same time. But it makes things more
complicated. Do we really need this?

Thanks,
Jianfeng

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [PATCH v2 3/4] eal: add synchronous multi-process communication
  2018-01-16  8:10       ` Tan, Jianfeng
@ 2018-01-16 11:12         ` Ananyev, Konstantin
  2018-01-16 16:47           ` Tan, Jianfeng
  0 siblings, 1 reply; 88+ messages in thread
From: Ananyev, Konstantin @ 2018-01-16 11:12 UTC (permalink / raw)
  To: Tan, Jianfeng, dev, Burakov, Anatoly; +Cc: Richardson, Bruce, thomas

Hi Jianfeng,

> -----Original Message-----
> From: Tan, Jianfeng
> Sent: Tuesday, January 16, 2018 8:11 AM
> To: Ananyev, Konstantin <konstantin.ananyev@intel.com>; dev@dpdk.org; Burakov, Anatoly <anatoly.burakov@intel.com>
> Cc: Richardson, Bruce <bruce.richardson@intel.com>; thomas@monjalon.net
> Subject: Re: [PATCH v2 3/4] eal: add synchronous multi-process communication
> 
> Thank you, Konstantin and Anatoly firstly. Other comments are well
> received and I'll send out a new version.
> 
> 
> On 1/16/2018 8:00 AM, Ananyev, Konstantin wrote:
> >
> >> We need the synchronous way for multi-process communication,
> >> i.e., blockingly waiting for reply message when we send a request
> >> to the peer process.
> >>
> >> We add two APIs rte_eal_mp_request() and rte_eal_mp_reply() for
> >> such use case. By invoking rte_eal_mp_request(), a request message
> >> is sent out, and then it waits there for a reply message. The
> >> timeout is hard-coded 5 Sec. And the replied message will be copied
> >> in the parameters of this API so that the caller can decide how
> >> to translate those information (including params and fds). Note
> >> if a primary process owns multiple secondary processes, this API
> >> will fail.
> >>
> >> The API rte_eal_mp_reply() is always called by an mp action handler.
> >> Here we add another parameter for rte_eal_mp_t so that the action
> >> handler knows which peer address to reply.
> >>
> >> We use mutex in rte_eal_mp_request() to guarantee that only one
> >> request is on the fly for one pair of processes.
> > You don't need to do things in such strange and restrictive way.
> > Instead you can do something like that:
> > 1) Introduce new struct, list for it and mutex
> >   struct sync_request {
> >        int reply_received;
> >        char dst[PATH_MAX];
> >        char reply[...];
> >        LIST_ENTRY(sync_request) next;
> > };
> >
> > static struct
> >      LIST_HEAD(list, sync_request);
> >      pthread_mutex_t lock;
> >     pthead_cond_t cond;
> > } sync_requests;
> >
> > 2) then at request() call:
> >    Grab sync_requests.lock
> >    Check do we already have a pending request for that destination,
> >    If yes - the release the lock and returns with error.
> >    - allocate and init new sync_request struct, set reply_received=0
> >    - do send_msg()
> >    -then in a cycle:
> >    pthread_cond_timed_wait(&sync_requests.cond, &sync_request.lock, &timespec);
> >    - at return from it check if sync_request.reply_received == 1, if not
> > check if timeout expired and either return a failure or go to the start of the cycle.
> >
> > 3) at mp_handler() if REPLY received - grab sync_request.lock,
> >      search through sync_requests.list for dst[] ,
> >     if found, then set it's reply_received=1, copy the received message into reply
> >     and call pthread_cond_braodcast((&sync_requests.cond);
> 
> The only benefit I can see is that now the sender can request to
> multiple receivers at the same time. And it makes things more
> complicated. Do we really need this?

The benefit is that while one thread is blocked waiting for a response,
your mp_handler can still receive and handle other messages.
Plus, as you said, other threads can keep sending messages.
Konstantin 

> 
> Thanks,
> Jianfeng

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [PATCH v2 3/4] eal: add synchronous multi-process communication
  2018-01-16 11:12         ` Ananyev, Konstantin
@ 2018-01-16 16:47           ` Tan, Jianfeng
  2018-01-17 10:50             ` Ananyev, Konstantin
  0 siblings, 1 reply; 88+ messages in thread
From: Tan, Jianfeng @ 2018-01-16 16:47 UTC (permalink / raw)
  To: Ananyev, Konstantin, dev, Burakov, Anatoly; +Cc: Richardson, Bruce, thomas



On 1/16/2018 7:12 PM, Ananyev, Konstantin wrote:
> Hi Jianfeng,
>
>> -----Original Message-----
>> From: Tan, Jianfeng
>> Sent: Tuesday, January 16, 2018 8:11 AM
>> To: Ananyev, Konstantin <konstantin.ananyev@intel.com>; dev@dpdk.org; Burakov, Anatoly <anatoly.burakov@intel.com>
>> Cc: Richardson, Bruce <bruce.richardson@intel.com>; thomas@monjalon.net
>> Subject: Re: [PATCH v2 3/4] eal: add synchronous multi-process communication
>>
>> Thank you, Konstantin and Anatoly firstly. Other comments are well
>> received and I'll send out a new version.
>>
>>
>> On 1/16/2018 8:00 AM, Ananyev, Konstantin wrote:
>>>> We need the synchronous way for multi-process communication,
>>>> i.e., blockingly waiting for reply message when we send a request
>>>> to the peer process.
>>>>
>>>> We add two APIs rte_eal_mp_request() and rte_eal_mp_reply() for
>>>> such use case. By invoking rte_eal_mp_request(), a request message
>>>> is sent out, and then it waits there for a reply message. The
>>>> timeout is hard-coded 5 Sec. And the replied message will be copied
>>>> in the parameters of this API so that the caller can decide how
>>>> to translate those information (including params and fds). Note
>>>> if a primary process owns multiple secondary processes, this API
>>>> will fail.
>>>>
>>>> The API rte_eal_mp_reply() is always called by an mp action handler.
>>>> Here we add another parameter for rte_eal_mp_t so that the action
>>>> handler knows which peer address to reply.
>>>>
>>>> We use mutex in rte_eal_mp_request() to guarantee that only one
>>>> request is on the fly for one pair of processes.
>>> You don't need to do things in such strange and restrictive way.
>>> Instead you can do something like that:
>>> 1) Introduce new struct, list for it and mutex
>>>    struct sync_request {
>>>         int reply_received;
>>>         char dst[PATH_MAX];
>>>         char reply[...];
>>>         LIST_ENTRY(sync_request) next;
>>> };
>>>
>>> static struct
>>>       LIST_HEAD(list, sync_request);
>>>       pthread_mutex_t lock;
>>>      pthead_cond_t cond;
>>> } sync_requests;
>>>
>>> 2) then at request() call:
>>>     Grab sync_requests.lock
>>>     Check do we already have a pending request for that destination,
>>>     If yes - the release the lock and returns with error.
>>>     - allocate and init new sync_request struct, set reply_received=0
>>>     - do send_msg()
>>>     -then in a cycle:
>>>     pthread_cond_timed_wait(&sync_requests.cond, &sync_request.lock, &timespec);
>>>     - at return from it check if sync_request.reply_received == 1, if not
>>> check if timeout expired and either return a failure or go to the start of the cycle.
>>>
>>> 3) at mp_handler() if REPLY received - grab sync_request.lock,
>>>       search through sync_requests.list for dst[] ,
>>>      if found, then set it's reply_received=1, copy the received message into reply
>>>      and call pthread_cond_braodcast((&sync_requests.cond);
>> The only benefit I can see is that now the sender can request to
>> multiple receivers at the same time. And it makes things more
>> complicated. Do we really need this?
> The benefit is that one thread is blocked waiting for response,
> your mp_handler can still receive and handle other messages.

This can already be done in the original implementation. mp_handler
listens for messages and requests from the other peer(s) and replies to
the requests; it is not affected.

> Plus as you said - other threads can keep sending messages.

For this one, in the original implementation, other threads can still
send messages, but not requests. I suppose the request is not in a fast path,
so why do we care about making it fast?

Thanks,
Jianfeng

> Konstantin
>
>> Thanks,
>> Jianfeng

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [PATCH v2 3/4] eal: add synchronous multi-process communication
  2018-01-16 16:47           ` Tan, Jianfeng
@ 2018-01-17 10:50             ` Ananyev, Konstantin
  2018-01-17 13:09               ` Tan, Jianfeng
  0 siblings, 1 reply; 88+ messages in thread
From: Ananyev, Konstantin @ 2018-01-17 10:50 UTC (permalink / raw)
  To: Tan, Jianfeng, dev, Burakov, Anatoly; +Cc: Richardson, Bruce, thomas



> > Hi Jianfeng,
> >
> >> -----Original Message-----
> >> From: Tan, Jianfeng
> >> Sent: Tuesday, January 16, 2018 8:11 AM
> >> To: Ananyev, Konstantin <konstantin.ananyev@intel.com>; dev@dpdk.org; Burakov, Anatoly <anatoly.burakov@intel.com>
> >> Cc: Richardson, Bruce <bruce.richardson@intel.com>; thomas@monjalon.net
> >> Subject: Re: [PATCH v2 3/4] eal: add synchronous multi-process communication
> >>
> >> Thank you, Konstantin and Anatoly firstly. Other comments are well
> >> received and I'll send out a new version.
> >>
> >>
> >> On 1/16/2018 8:00 AM, Ananyev, Konstantin wrote:
> >>>> We need the synchronous way for multi-process communication,
> >>>> i.e., blockingly waiting for reply message when we send a request
> >>>> to the peer process.
> >>>>
> >>>> We add two APIs rte_eal_mp_request() and rte_eal_mp_reply() for
> >>>> such use case. By invoking rte_eal_mp_request(), a request message
> >>>> is sent out, and then it waits there for a reply message. The
> >>>> timeout is hard-coded 5 Sec. And the replied message will be copied
> >>>> in the parameters of this API so that the caller can decide how
> >>>> to translate those information (including params and fds). Note
> >>>> if a primary process owns multiple secondary processes, this API
> >>>> will fail.
> >>>>
> >>>> The API rte_eal_mp_reply() is always called by an mp action handler.
> >>>> Here we add another parameter for rte_eal_mp_t so that the action
> >>>> handler knows which peer address to reply.
> >>>>
> >>>> We use mutex in rte_eal_mp_request() to guarantee that only one
> >>>> request is on the fly for one pair of processes.
> >>> You don't need to do things in such strange and restrictive way.
> >>> Instead you can do something like that:
> >>> 1) Introduce new struct, list for it and mutex
> >>>    struct sync_request {
> >>>         int reply_received;
> >>>         char dst[PATH_MAX];
> >>>         char reply[...];
> >>>         LIST_ENTRY(sync_request) next;
> >>> };
> >>>
> >>> static struct
> >>>       LIST_HEAD(list, sync_request);
> >>>       pthread_mutex_t lock;
> >>>      pthead_cond_t cond;
> >>> } sync_requests;
> >>>
> >>> 2) then at request() call:
> >>>     Grab sync_requests.lock
> >>>     Check do we already have a pending request for that destination,
> >>>     If yes - the release the lock and returns with error.
> >>>     - allocate and init new sync_request struct, set reply_received=0
> >>>     - do send_msg()
> >>>     -then in a cycle:
> >>>     pthread_cond_timed_wait(&sync_requests.cond, &sync_request.lock, &timespec);
> >>>     - at return from it check if sync_request.reply_received == 1, if not
> >>> check if timeout expired and either return a failure or go to the start of the cycle.
> >>>
> >>> 3) at mp_handler() if REPLY received - grab sync_request.lock,
> >>>       search through sync_requests.list for dst[] ,
> >>>      if found, then set it's reply_received=1, copy the received message into reply
> >>>      and call pthread_cond_braodcast((&sync_requests.cond);
> >> The only benefit I can see is that now the sender can request to
> >> multiple receivers at the same time. And it makes things more
> >> complicated. Do we really need this?
> > The benefit is that one thread is blocked waiting for response,
> > your mp_handler can still receive and handle other messages.
> 
> This can already be done in the original implementation. mp_handler
> listens for msg, request from the other peer(s), and replies the
> requests, which is not affected.
> 
> > Plus as you said - other threads can keep sending messages.
> 
> For this one, in the original implementation, other threads can still
> send msg, but not request. I suppose the request is not in a fast path,
> why we care to make it fast?
> 

+int
+rte_eal_mp_request(const char *action_name,
+		   void *params,
+		   int len_p,
+		   int fds[],
+		   int fds_in,
+		   int fds_out)
+{
+	int i, j;
+	int sockfd;
+	int nprocs;
+	int ret = 0;
+	struct mp_msghdr *req;
+	struct timeval tv;
+	char buf[MAX_MSG_LENGTH];
+	struct mp_msghdr *hdr;
+
+	RTE_LOG(DEBUG, EAL, "request: %s\n", action_name);
+
+	if (fds_in > SCM_MAX_FD || fds_out > SCM_MAX_FD) {
+		RTE_LOG(ERR, EAL, "Cannot send more than %d FDs\n", SCM_MAX_FD);
+		rte_errno = -E2BIG;
+		return 0;
+	}
+
+	req = format_msg(action_name, params, len_p, fds_in, MP_REQ);
+	if (req == NULL)
+		return 0;
+
+	if ((sockfd = open_unix_fd(0)) < 0) {
+		free(req);
+		return 0;
+	}
+
+	tv.tv_sec = 5;  /* 5 Secs Timeout */
+	tv.tv_usec = 0;
+	if (setsockopt(sockfd, SOL_SOCKET, SO_RCVTIMEO,
+			(const void *)&tv, sizeof(struct timeval)) < 0)
+		RTE_LOG(INFO, EAL, "Failed to set recv timeout\n");

If you set it just for one call, why do you not restore it?
Also I don't think it is a good idea to change it here - 
if you'll make timeout a parameter value - then it could be overwritten
by different threads. 

+
+	/* Only allow one req at a time */
+	pthread_mutex_lock(&mp_mutex_request);
+
+	if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+		nprocs = 0;
+		for (i = 0; i < MAX_SECONDARY_PROCS; ++i)
+			if (!mp_sec_sockets[i]) {
+				j = i;
+				nprocs++;
+			}
+
+		if (nprocs > 1) {
+			RTE_LOG(ERR, EAL,
+				"multi secondary processes not supported\n");
+			goto free_and_ret;
+		}
+
+		ret = send_msg(sockfd, mp_sec_sockets[j], req, fds);

As I remember - sendmsg() is also a blocking call, so under some conditions you can stall
there forever.
As mp_mutex_request is still held - the next rte_eal_mp_request() will also block forever here.

+	} else
+		ret = send_msg(sockfd, eal_mp_unix_path(), req, fds);
+
+	if (ret == 0) {
+		RTE_LOG(ERR, EAL, "failed to send request: %s\n", action_name);
+		ret = -1;
+		goto free_and_ret;
+	}
+
+	ret = read_msg(sockfd, buf, MAX_MSG_LENGTH, fds, fds_out, NULL);

if the message you receive is not a reply you are expecting -
it will be simply dropped - mp_handler() would never process it.

+	if (ret > 0) {
+		hdr = (struct mp_msghdr *)buf;
+		if (hdr->len_params == len_p)
+			memcpy(params, hdr->params, len_p);
+		else {
+			RTE_LOG(ERR, EAL, "invalid reply\n");
+			ret = 0;
+		}
+	}
+
+free_and_ret:
+	free(req);
+	close(sockfd);
+	pthread_mutex_unlock(&mp_mutex_request);
+	return ret;
+}

All of the above makes me think that current implementation is erroneous
and needs to be reworked.
Konstantin

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [PATCH v2 3/4] eal: add synchronous multi-process communication
  2018-01-17 10:50             ` Ananyev, Konstantin
@ 2018-01-17 13:09               ` Tan, Jianfeng
  2018-01-17 13:15                 ` Tan, Jianfeng
  2018-01-17 17:20                 ` Ananyev, Konstantin
  0 siblings, 2 replies; 88+ messages in thread
From: Tan, Jianfeng @ 2018-01-17 13:09 UTC (permalink / raw)
  To: Ananyev, Konstantin, dev, Burakov, Anatoly; +Cc: Richardson, Bruce, thomas



On 1/17/2018 6:50 PM, Ananyev, Konstantin wrote:
>
>>> Hi Jianfeng,
>>>
>>>> -----Original Message-----
>>>> From: Tan, Jianfeng
>>>> Sent: Tuesday, January 16, 2018 8:11 AM
>>>> To: Ananyev, Konstantin <konstantin.ananyev@intel.com>; dev@dpdk.org; Burakov, Anatoly <anatoly.burakov@intel.com>
>>>> Cc: Richardson, Bruce <bruce.richardson@intel.com>; thomas@monjalon.net
>>>> Subject: Re: [PATCH v2 3/4] eal: add synchronous multi-process communication
>>>>
>>>> Thank you, Konstantin and Anatoly firstly. Other comments are well
>>>> received and I'll send out a new version.
>>>>
>>>>
>>>> On 1/16/2018 8:00 AM, Ananyev, Konstantin wrote:
>>>>>> We need the synchronous way for multi-process communication,
>>>>>> i.e., blockingly waiting for reply message when we send a request
>>>>>> to the peer process.
>>>>>>
>>>>>> We add two APIs rte_eal_mp_request() and rte_eal_mp_reply() for
>>>>>> such use case. By invoking rte_eal_mp_request(), a request message
>>>>>> is sent out, and then it waits there for a reply message. The
>>>>>> timeout is hard-coded 5 Sec. And the replied message will be copied
>>>>>> in the parameters of this API so that the caller can decide how
>>>>>> to translate those information (including params and fds). Note
>>>>>> if a primary process owns multiple secondary processes, this API
>>>>>> will fail.
>>>>>>
>>>>>> The API rte_eal_mp_reply() is always called by an mp action handler.
>>>>>> Here we add another parameter for rte_eal_mp_t so that the action
>>>>>> handler knows which peer address to reply.
>>>>>>
>>>>>> We use mutex in rte_eal_mp_request() to guarantee that only one
>>>>>> request is on the fly for one pair of processes.
>>>>> You don't need to do things in such strange and restrictive way.
>>>>> Instead you can do something like that:
>>>>> 1) Introduce new struct, list for it and mutex
>>>>>     struct sync_request {
>>>>>          int reply_received;
>>>>>          char dst[PATH_MAX];
>>>>>          char reply[...];
>>>>>          LIST_ENTRY(sync_request) next;
>>>>> };
>>>>>
>>>>> static struct
>>>>>        LIST_HEAD(list, sync_request);
>>>>>        pthread_mutex_t lock;
>>>>>       pthead_cond_t cond;
>>>>> } sync_requests;
>>>>>
>>>>> 2) then at request() call:
>>>>>      Grab sync_requests.lock
>>>>>      Check do we already have a pending request for that destination,
>>>>>      If yes - the release the lock and returns with error.
>>>>>      - allocate and init new sync_request struct, set reply_received=0
>>>>>      - do send_msg()
>>>>>      -then in a cycle:
>>>>>      pthread_cond_timed_wait(&sync_requests.cond, &sync_request.lock, &timespec);
>>>>>      - at return from it check if sync_request.reply_received == 1, if not
>>>>> check if timeout expired and either return a failure or go to the start of the cycle.
>>>>>
>>>>> 3) at mp_handler() if REPLY received - grab sync_request.lock,
>>>>>        search through sync_requests.list for dst[] ,
>>>>>       if found, then set it's reply_received=1, copy the received message into reply
>>>>>       and call pthread_cond_braodcast((&sync_requests.cond);
>>>> The only benefit I can see is that now the sender can request to
>>>> multiple receivers at the same time. And it makes things more
>>>> complicated. Do we really need this?
>>> The benefit is that one thread is blocked waiting for response,
>>> your mp_handler can still receive and handle other messages.
>> This can already be done in the original implementation. mp_handler
>> listens for msg, request from the other peer(s), and replies the
>> requests, which is not affected.
>>
>>> Plus as you said - other threads can keep sending messages.
>> For this one, in the original implementation, other threads can still
>> send msg, but not request. I suppose the request is not in a fast path,
>> why we care to make it fast?
>>
> +int
> +rte_eal_mp_request(const char *action_name,
> +		   void *params,
> +		   int len_p,
> +		   int fds[],
> +		   int fds_in,
> +		   int fds_out)
> +{
> +	int i, j;
> +	int sockfd;
> +	int nprocs;
> +	int ret = 0;
> +	struct mp_msghdr *req;
> +	struct timeval tv;
> +	char buf[MAX_MSG_LENGTH];
> +	struct mp_msghdr *hdr;
> +
> +	RTE_LOG(DEBUG, EAL, "request: %s\n", action_name);
> +
> +	if (fds_in > SCM_MAX_FD || fds_out > SCM_MAX_FD) {
> +		RTE_LOG(ERR, EAL, "Cannot send more than %d FDs\n", SCM_MAX_FD);
> +		rte_errno = -E2BIG;
> +		return 0;
> +	}
> +
> +	req = format_msg(action_name, params, len_p, fds_in, MP_REQ);
> +	if (req == NULL)
> +		return 0;
> +
> +	if ((sockfd = open_unix_fd(0)) < 0) {
> +		free(req);
> +		return 0;
> +	}
> +
> +	tv.tv_sec = 5;  /* 5 Secs Timeout */
> +	tv.tv_usec = 0;
> +	if (setsockopt(sockfd, SOL_SOCKET, SO_RCVTIMEO,
> +			(const void *)&tv, sizeof(struct timeval)) < 0)
> +		RTE_LOG(INFO, EAL, "Failed to set recv timeout\n");
>
> I f you set it just for one call, why do you not restore it?

Yes, original code is buggy, I should have put it into the critical section.

Do you mean we just create once and use for ever? if yes, we could put 
the open and setting into mp_init().

> Also I don't think it is a good idea to change it here -
> if you'll make timeout a parameter value - then it could be overwritten
> by different threads.

For simplicity, I'm not inclined to expose the timeout as a parameter
to the caller. So if you agree, I'll put it into mp_init() with the
open.

>
> +
> +	/* Only allow one req at a time */
> +	pthread_mutex_lock(&mp_mutex_request);
> +
> +	if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
> +		nprocs = 0;
> +		for (i = 0; i < MAX_SECONDARY_PROCS; ++i)
> +			if (!mp_sec_sockets[i]) {
> +				j = i;
> +				nprocs++;
> +			}
> +
> +		if (nprocs > 1) {
> +			RTE_LOG(ERR, EAL,
> +				"multi secondary processes not supported\n");
> +			goto free_and_ret;
> +		}
> +
> +		ret = send_msg(sockfd, mp_sec_sockets[j], req, fds);
>
> As I remember - sndmsg() is also blocking call, so under some conditions you can stall
> there forever.

 From Linux's unix_dgram_sendmsg(), we see:
     timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);

I assume it will not block for a datagram unix socket in Linux. But I'm
not sure how it behaves in FreeBSD.

Anyway, better to add an explicit setsockopt() to make it non-blocking.

> As mp_mutex_requestis still held - next rte_eal_mp_request(0 will also block forever here.
>
> +	} else
> +		ret = send_msg(sockfd, eal_mp_unix_path(), req, fds);
> +
> +	if (ret == 0) {
> +		RTE_LOG(ERR, EAL, "failed to send request: %s\n", action_name);
> +		ret = -1;
> +		goto free_and_ret;
> +	}
> +
> +	ret = read_msg(sockfd, buf, MAX_MSG_LENGTH, fds, fds_out, NULL);
>
> if the message you receive is not a reply you are expecting -
> it will be simply dropped - mp_handler() would never process it.

We cannot reliably detect whether it is the right reply; we can only
check the action_name, which means it may still get a wrong reply
if multiple requests share the same action_name.

Is just comparing the action_name acceptable?

>
> +	if (ret > 0) {
> +		hdr = (struct mp_msghdr *)buf;
> +		if (hdr->len_params == len_p)
> +			memcpy(params, hdr->params, len_p);
> +		else {
> +			RTE_LOG(ERR, EAL, "invalid reply\n");
> +			ret = 0;
> +		}
> +	}
> +
> +free_and_ret:
> +	free(req);
> +	close(sockfd);
> +	pthread_mutex_unlock(&mp_mutex_request);
> +	return ret;
> +}
>
> All of the above makes me think that current implementation is erroneous
> and needs to be reworked.

Thank you for your review. I'll work on a new version.

Thanks,
Jianfeng

> Konstantin
>
>

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [PATCH v2 3/4] eal: add synchronous multi-process communication
  2018-01-17 13:09               ` Tan, Jianfeng
@ 2018-01-17 13:15                 ` Tan, Jianfeng
  2018-01-17 17:20                 ` Ananyev, Konstantin
  1 sibling, 0 replies; 88+ messages in thread
From: Tan, Jianfeng @ 2018-01-17 13:15 UTC (permalink / raw)
  To: Ananyev, Konstantin, dev, Burakov, Anatoly; +Cc: Richardson, Bruce, thomas



On 1/17/2018 9:09 PM, Tan, Jianfeng wrote:
>
>
> On 1/17/2018 6:50 PM, Ananyev, Konstantin wrote:
>
[...]
>> +int
>> +rte_eal_mp_request(const char *action_name,
>> +           void *params,
>> +           int len_p,
>> +           int fds[],
>> +           int fds_in,
>> +           int fds_out)
>> +{
>> +    int i, j;
>> +    int sockfd;
>> +    int nprocs;
>> +    int ret = 0;
>> +    struct mp_msghdr *req;
>> +    struct timeval tv;
>> +    char buf[MAX_MSG_LENGTH];
>> +    struct mp_msghdr *hdr;
>> +
>> +    RTE_LOG(DEBUG, EAL, "request: %s\n", action_name);
>> +
>> +    if (fds_in > SCM_MAX_FD || fds_out > SCM_MAX_FD) {
>> +        RTE_LOG(ERR, EAL, "Cannot send more than %d FDs\n", 
>> SCM_MAX_FD);
>> +        rte_errno = -E2BIG;
>> +        return 0;
>> +    }
>> +
>> +    req = format_msg(action_name, params, len_p, fds_in, MP_REQ);
>> +    if (req == NULL)
>> +        return 0;
>> +
>> +    if ((sockfd = open_unix_fd(0)) < 0) {
>> +        free(req);
>> +        return 0;
>> +    }
>> +
>> +    tv.tv_sec = 5;  /* 5 Secs Timeout */
>> +    tv.tv_usec = 0;
>> +    if (setsockopt(sockfd, SOL_SOCKET, SO_RCVTIMEO,
>> +            (const void *)&tv, sizeof(struct timeval)) < 0)
>> +        RTE_LOG(INFO, EAL, "Failed to set recv timeout\n");
>>
>> I f you set it just for one call, why do you not restore it?
>
> Yes, original code is buggy, I should have put it into the critical 
> section.
>
> Do you mean we just create once and use for ever? if yes, we could put 
> the open and setting into mp_init().

On second thought, we should not put the setting into mp_init(). The socket
will be set to non-blocking for sending msgs, but blocking for receiving msgs.

Thanks,
Jianfeng

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [PATCH v2 3/4] eal: add synchronous multi-process communication
  2018-01-17 13:09               ` Tan, Jianfeng
  2018-01-17 13:15                 ` Tan, Jianfeng
@ 2018-01-17 17:20                 ` Ananyev, Konstantin
  1 sibling, 0 replies; 88+ messages in thread
From: Ananyev, Konstantin @ 2018-01-17 17:20 UTC (permalink / raw)
  To: Tan, Jianfeng, dev, Burakov, Anatoly; +Cc: Richardson, Bruce, thomas


> 
> 
> On 1/17/2018 6:50 PM, Ananyev, Konstantin wrote:
> >
> >>> Hi Jianfeng,
> >>>
> >>>> -----Original Message-----
> >>>> From: Tan, Jianfeng
> >>>> Sent: Tuesday, January 16, 2018 8:11 AM
> >>>> To: Ananyev, Konstantin <konstantin.ananyev@intel.com>; dev@dpdk.org; Burakov, Anatoly <anatoly.burakov@intel.com>
> >>>> Cc: Richardson, Bruce <bruce.richardson@intel.com>; thomas@monjalon.net
> >>>> Subject: Re: [PATCH v2 3/4] eal: add synchronous multi-process communication
> >>>>
> >>>> Thank you, Konstantin and Anatoly firstly. Other comments are well
> >>>> received and I'll send out a new version.
> >>>>
> >>>>
> >>>> On 1/16/2018 8:00 AM, Ananyev, Konstantin wrote:
> >>>>>> We need the synchronous way for multi-process communication,
> >>>>>> i.e., blockingly waiting for reply message when we send a request
> >>>>>> to the peer process.
> >>>>>>
> >>>>>> We add two APIs rte_eal_mp_request() and rte_eal_mp_reply() for
> >>>>>> such use case. By invoking rte_eal_mp_request(), a request message
> >>>>>> is sent out, and then it waits there for a reply message. The
> >>>>>> timeout is hard-coded 5 Sec. And the replied message will be copied
> >>>>>> in the parameters of this API so that the caller can decide how
> >>>>>> to translate those information (including params and fds). Note
> >>>>>> if a primary process owns multiple secondary processes, this API
> >>>>>> will fail.
> >>>>>>
> >>>>>> The API rte_eal_mp_reply() is always called by an mp action handler.
> >>>>>> Here we add another parameter for rte_eal_mp_t so that the action
> >>>>>> handler knows which peer address to reply.
> >>>>>>
> >>>>>> We use mutex in rte_eal_mp_request() to guarantee that only one
> >>>>>> request is on the fly for one pair of processes.
> >>>>> You don't need to do things in such strange and restrictive way.
> >>>>> Instead you can do something like that:
> >>>>> 1) Introduce new struct, list for it and mutex
> >>>>>     struct sync_request {
> >>>>>          int reply_received;
> >>>>>          char dst[PATH_MAX];
> >>>>>          char reply[...];
> >>>>>          LIST_ENTRY(sync_request) next;
> >>>>> };
> >>>>>
> >>>>> static struct
> >>>>>        LIST_HEAD(list, sync_request);
> >>>>>        pthread_mutex_t lock;
> >>>>>       pthead_cond_t cond;
> >>>>> } sync_requests;
> >>>>>
> >>>>> 2) then at request() call:
> >>>>>      Grab sync_requests.lock
> >>>>>      Check do we already have a pending request for that destination,
> >>>>>      If yes - the release the lock and returns with error.
> >>>>>      - allocate and init new sync_request struct, set reply_received=0
> >>>>>      - do send_msg()
> >>>>>      -then in a cycle:
> >>>>>      pthread_cond_timed_wait(&sync_requests.cond, &sync_request.lock, &timespec);
> >>>>>      - at return from it check if sync_request.reply_received == 1, if not
> >>>>> check if timeout expired and either return a failure or go to the start of the cycle.
> >>>>>
> >>>>> 3) at mp_handler() if REPLY received - grab sync_request.lock,
> >>>>>        search through sync_requests.list for dst[] ,
> >>>>>       if found, then set it's reply_received=1, copy the received message into reply
> >>>>>       and call pthread_cond_braodcast((&sync_requests.cond);
> >>>> The only benefit I can see is that now the sender can request to
> >>>> multiple receivers at the same time. And it makes things more
> >>>> complicated. Do we really need this?
> >>> The benefit is that one thread is blocked waiting for response,
> >>> your mp_handler can still receive and handle other messages.
> >> This can already be done in the original implementation. mp_handler
> >> listens for msg, request from the other peer(s), and replies the
> >> requests, which is not affected.
> >>
> >>> Plus as you said - other threads can keep sending messages.
> >> For this one, in the original implementation, other threads can still
> >> send msg, but not request. I suppose the request is not in a fast path,
> >> why we care to make it fast?
> >>
> > +int
> > +rte_eal_mp_request(const char *action_name,
> > +		   void *params,
> > +		   int len_p,
> > +		   int fds[],
> > +		   int fds_in,
> > +		   int fds_out)
> > +{
> > +	int i, j;
> > +	int sockfd;
> > +	int nprocs;
> > +	int ret = 0;
> > +	struct mp_msghdr *req;
> > +	struct timeval tv;
> > +	char buf[MAX_MSG_LENGTH];
> > +	struct mp_msghdr *hdr;
> > +
> > +	RTE_LOG(DEBUG, EAL, "request: %s\n", action_name);
> > +
> > +	if (fds_in > SCM_MAX_FD || fds_out > SCM_MAX_FD) {
> > +		RTE_LOG(ERR, EAL, "Cannot send more than %d FDs\n", SCM_MAX_FD);
> > +		rte_errno = -E2BIG;
> > +		return 0;
> > +	}
> > +
> > +	req = format_msg(action_name, params, len_p, fds_in, MP_REQ);
> > +	if (req == NULL)
> > +		return 0;
> > +
> > +	if ((sockfd = open_unix_fd(0)) < 0) {
> > +		free(req);
> > +		return 0;
> > +	}
> > +
> > +	tv.tv_sec = 5;  /* 5 Secs Timeout */
> > +	tv.tv_usec = 0;
> > +	if (setsockopt(sockfd, SOL_SOCKET, SO_RCVTIMEO,
> > +			(const void *)&tv, sizeof(struct timeval)) < 0)
> > +		RTE_LOG(INFO, EAL, "Failed to set recv timeout\n");
> >
> > I f you set it just for one call, why do you not restore it?
> 
> Yes, original code is buggy, I should have put it into the critical section.
> 
> Do you mean we just create once and use for ever? if yes, we could put
> the open and setting into mp_init().
> 
> > Also I don't think it is a good idea to change it here -
> > if you'll make timeout a parameter value - then it could be overwritten
> > by different threads.
> 
> For simplicity, I'm not inclined to put the timeout as an parameter
> exposing to caller. So if you agree, I'll put it into the mp_init() with
> open.

My preference would be to have timeout value on a per call basis.
For one request user would like to wait no more than 5sec,
for another one user would probably be ok to wait forever.

> 
> >
> > +
> > +	/* Only allow one req at a time */
> > +	pthread_mutex_lock(&mp_mutex_request);
> > +
> > +	if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
> > +		nprocs = 0;
> > +		for (i = 0; i < MAX_SECONDARY_PROCS; ++i)
> > +			if (!mp_sec_sockets[i]) {
> > +				j = i;
> > +				nprocs++;
> > +			}
> > +
> > +		if (nprocs > 1) {
> > +			RTE_LOG(ERR, EAL,
> > +				"multi secondary processes not supported\n");
> > +			goto free_and_ret;
> > +		}
> > +
> > +		ret = send_msg(sockfd, mp_sec_sockets[j], req, fds);
> >
> > As I remember - sndmsg() is also blocking call, so under some conditions you can stall
> > there forever.
> 
>  From linux's unix_diagram_sendmsg(), we see:
>      timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);

Ok, but it would have effect only if (msg->msg_flags & MSG_DONTWAIT) != 0.
And for that, as I remember you need your socket in non-blocking mode, no?

> 
> I assume it will not block for datagram unix socket in Linux. But I'm
> not sure what it behaves in freebsd.
> 
> Anyway, better to add an explicit setsockopt() to make it not blocking.

You can't do that - at the same moment another thread might call your sendmsg()
and it might expect it to be blocking call.

> 
> > As mp_mutex_requestis still held - next rte_eal_mp_request(0 will also block forever here.
> >
> > +	} else
> > +		ret = send_msg(sockfd, eal_mp_unix_path(), req, fds);
> > +
> > +	if (ret == 0) {
> > +		RTE_LOG(ERR, EAL, "failed to send request: %s\n", action_name);
> > +		ret = -1;
> > +		goto free_and_ret;
> > +	}
> > +
> > +	ret = read_msg(sockfd, buf, MAX_MSG_LENGTH, fds, fds_out, NULL);
> >
> > if the message you receive is not a reply you are expecting -
> > it will be simply dropped - mp_handler() would never process it.
> 
> We cannot detect if it's the right reply absolutely correctly, but just
> check the action_name, which means, it still possibly gets a wrong reply
> if an action_name contains multiple requests.
> 
> Is just comparing the action_name acceptable?

As I see it, the main issue here is that you can call recvmsg() from 2 different
points and they are not synchronised:
1. your mp_handler() isn't aware of the reply you are waiting for and does not
have any handler associated with it.
So if mp_handler() receives a reply it will just drop it.
2. your reply() is not aware of any other messages and associated actions -
so again it can't handle them properly (and probably shouldn't).

The simplest (and most common) way - always call recvmsg() from one place -
mp_handler() - and have a special action for the reply msg.
As I wrote before, that action will just find the appropriate buffer provided
by reply() - copy the message into it and signal the thread waiting in reply() that
it can proceed.
  
Konstantin

> 
> >
> > +	if (ret > 0) {
> > +		hdr = (struct mp_msghdr *)buf;
> > +		if (hdr->len_params == len_p)
> > +			memcpy(params, hdr->params, len_p);
> > +		else {
> > +			RTE_LOG(ERR, EAL, "invalid reply\n");
> > +			ret = 0;
> > +		}
> > +	}
> > +
> > +free_and_ret:
> > +	free(req);
> > +	close(sockfd);
> > +	pthread_mutex_unlock(&mp_mutex_request);
> > +	return ret;
> > +}
> >
> > All of the above makes me think that current implementation is erroneous
> > and needs to be reworked.
> 
> Thank you for your review. I'll work on a new version.
> 
> Thanks,
> Jianfeng
> 
> > Konstantin
> >
> >

^ permalink raw reply	[flat|nested] 88+ messages in thread

* [PATCH v3 0/3] generic channel for multi-process communication
  2017-11-30 18:44 [PATCH 0/3] generic channel for multi-process communication Jianfeng Tan
                   ` (4 preceding siblings ...)
  2018-01-11  4:07 ` [PATCH v2 0/4] " Jianfeng Tan
@ 2018-01-25  4:16 ` Jianfeng Tan
  2018-01-25  4:16   ` [PATCH v3 1/3] eal: add " Jianfeng Tan
                     ` (2 more replies)
  2018-01-25 19:14 ` [PATCH v4 0/2] generic channel for multi-process communication Jianfeng Tan
                   ` (3 subsequent siblings)
  9 siblings, 3 replies; 88+ messages in thread
From: Jianfeng Tan @ 2018-01-25  4:16 UTC (permalink / raw)
  To: dev
  Cc: anatoly.burakov, bruce.richardson, konstantin.ananyev, thomas,
	Jianfeng Tan

v2->v3:
  - Add pre-check for each APIs.
  - Remove the limitation of 8 secondary processes by discarding the original
    register/unregister mechanism of secondary processes; instead, the primary
    discovers secondary processes by looking up the folder for a regex match.
  - The previous implementation used two sockets for msg and request; this version
    just uses one socket, and receives all kinds of messages in the mp thread.

v1->v2: (Address comments from Anatoly and Konstantin)
  - Use datagram unix socket to supersede stream unix socket + epoll.
  - Change the secondary add/del mechanism as now we use connection-less channel.
  - Add mp_mutex_action to sync action register/unregister/reference.
  - Limit max length of action name to 64B.
  - New APIs for synchronous communication: rte_eal_mp_request/rte_eal_mp_reply.
  - Formalize the errno handle.
  - Some other small issues.

This patchset adds a generic channel for multi-process (primary/secondary)
communication.

Patch 1: address the purpose and howto;
Patch 2: add a synchronous way for the requests which need an immediate response.
Patch 3: Rework vfio to use this generic communication channel.


Jianfeng Tan (3):
  eal: add channel for multi-process communication
  eal: add synchronous multi-process communication
  vfio: use the generic multi-process channel

 doc/guides/rel_notes/release_18_02.rst         |  15 +
 lib/librte_eal/common/eal_common_proc.c        | 593 ++++++++++++++++++++++++-
 lib/librte_eal/common/eal_filesystem.h         |  17 +
 lib/librte_eal/common/eal_private.h            |  10 +
 lib/librte_eal/common/include/rte_eal.h        | 131 ++++++
 lib/librte_eal/linuxapp/eal/eal.c              |  22 +-
 lib/librte_eal/linuxapp/eal/eal_vfio.c         | 172 +++----
 lib/librte_eal/linuxapp/eal/eal_vfio.h         |  15 +-
 lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c | 408 +++--------------
 lib/librte_eal/rte_eal_version.map             |   6 +
 10 files changed, 915 insertions(+), 474 deletions(-)

-- 
2.7.4

^ permalink raw reply	[flat|nested] 88+ messages in thread

* [PATCH v3 1/3] eal: add channel for multi-process communication
  2018-01-25  4:16 ` [PATCH v3 0/3] generic channel for multi-process communication Jianfeng Tan
@ 2018-01-25  4:16   ` Jianfeng Tan
  2018-01-25 10:41     ` Thomas Monjalon
                       ` (2 more replies)
  2018-01-25  4:16   ` [PATCH v3 2/3] eal: add synchronous " Jianfeng Tan
  2018-01-25  4:16   ` [PATCH v3 3/3] vfio: use the generic multi-process channel Jianfeng Tan
  2 siblings, 3 replies; 88+ messages in thread
From: Jianfeng Tan @ 2018-01-25  4:16 UTC (permalink / raw)
  To: dev
  Cc: anatoly.burakov, bruce.richardson, konstantin.ananyev, thomas,
	Jianfeng Tan

Previously, there were three channels for multi-process
(i.e., primary/secondary) communication.
  1. Config-file based channel, in which, the primary process writes
     info into a pre-defined config file, and the secondary process
     reads the info out.
  2. vfio submodule has its own channel based on unix socket for the
     secondary process to get container fd and group fd from the
     primary process.
  3. pdump submodule also has its own channel based on unix socket for
     packet dump.

It'd be good to have a generic communication channel for multi-process
communication to accommodate the requirements including:
  a. Secondary wants to send info to primary, for example, secondary
     would like to send request (about some specific vdev to primary).
  b. Sending info at any time, instead of just initialization time.
  c. Share FDs with the other side, for vdev like vhost, related FDs
     (memory region, kick) should be shared.
  d. A send message request needs the other side to respond immediately.

This patch proposes to create a communication channel, based on datagram
unix socket, for above requirements. Each process will block on a unix
socket waiting for messages from the peers.

Three new APIs are added:

  1. rte_eal_mp_action_register() is used to register an action,
     indexed by a string, when a component at the receiver side would like
     to respond to the messages from the peer process.
  2. rte_eal_mp_action_unregister() is used to unregister the action
     if the calling component does not want to respond to the messages.
  3. rte_eal_mp_sendmsg() is used to send a message, and returns
     immediately. If there are n secondary processes, the primary
     process will send n messages.

Suggested-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
---
 lib/librte_eal/common/eal_common_proc.c | 390 +++++++++++++++++++++++++++++++-
 lib/librte_eal/common/eal_filesystem.h  |  17 ++
 lib/librte_eal/common/eal_private.h     |  10 +
 lib/librte_eal/common/include/rte_eal.h |  75 ++++++
 lib/librte_eal/linuxapp/eal/eal.c       |   8 +
 lib/librte_eal/rte_eal_version.map      |   3 +
 6 files changed, 502 insertions(+), 1 deletion(-)

diff --git a/lib/librte_eal/common/eal_common_proc.c b/lib/librte_eal/common/eal_common_proc.c
index 40fa982..baeb7d1 100644
--- a/lib/librte_eal/common/eal_common_proc.c
+++ b/lib/librte_eal/common/eal_common_proc.c
@@ -2,14 +2,48 @@
  * Copyright(c) 2016 Intel Corporation
  */
 
-#include <stdio.h>
+#include <dirent.h>
+#include <errno.h>
 #include <fcntl.h>
+#include <fnmatch.h>
+#include <libgen.h>
+#include <limits.h>
+#include <pthread.h>
+#include <stdio.h>
 #include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <unistd.h>
+
+#include <rte_log.h>
 #include <rte_eal.h>
+#include <rte_errno.h>
+#include <rte_lcore.h>
+#include <rte_common.h>
 
+#include "eal_private.h"
 #include "eal_filesystem.h"
 #include "eal_internal_cfg.h"
 
+static int mp_fd = -1;
+static char mp_filter[PATH_MAX];   /* Filter for secondary process sockets */
+static char mp_dir_path[PATH_MAX]; /* The directory path for all mp sockets */
+static pthread_mutex_t mp_mutex_action = PTHREAD_MUTEX_INITIALIZER;
+
+struct action_entry {
+	TAILQ_ENTRY(action_entry) next;
+	char action_name[RTE_MP_MAX_NAME_LEN];
+	rte_eal_mp_t action;
+};
+
+/** Double linked list of actions. */
+TAILQ_HEAD(action_entry_list, action_entry);
+
+static struct action_entry_list action_entry_list =
+	TAILQ_HEAD_INITIALIZER(action_entry_list);
+
 int
 rte_eal_primary_proc_alive(const char *config_file_path)
 {
@@ -31,3 +65,357 @@ rte_eal_primary_proc_alive(const char *config_file_path)
 
 	return !!ret;
 }
+
+static struct action_entry *
+find_action_entry_by_name(const char *name)
+{
+	struct action_entry *entry;
+
+	TAILQ_FOREACH(entry, &action_entry_list, next) {
+		if (strncmp(entry->action_name, name, RTE_MP_MAX_NAME_LEN) == 0)
+			break;
+	}
+
+	return entry;
+}
+
+static bool
+validate_action_name(const char *name)
+{
+	if (name == NULL) {
+		RTE_LOG(ERR, EAL, "Action name cannot be NULL\n");
+		rte_errno = -EINVAL;
+		return false;
+	}
+	if (strnlen(name, RTE_MP_MAX_NAME_LEN) == 0) {
+		RTE_LOG(ERR, EAL, "Length of action name is zero\n");
+		rte_errno = -EINVAL;
+		return false;
+	}
+	if (strnlen(name, RTE_MP_MAX_NAME_LEN) == RTE_MP_MAX_NAME_LEN) {
+		rte_errno = -E2BIG;
+		return false;
+	}
+	return true;
+}
+
+int
+rte_eal_mp_action_register(const char *name, rte_eal_mp_t action)
+{
+	struct action_entry *entry;
+
+	if(!validate_action_name(name))
+		return -1;
+
+	entry = malloc(sizeof(struct action_entry));
+	if (entry == NULL) {
+		rte_errno = -ENOMEM;
+		return -1;
+	}
+	strcpy(entry->action_name, name);
+	entry->action = action;
+
+	pthread_mutex_lock(&mp_mutex_action);
+	if (find_action_entry_by_name(name) != NULL) {
+		pthread_mutex_unlock(&mp_mutex_action);
+		rte_errno = -EEXIST;
+		free(entry);
+		return -1;
+	}
+	TAILQ_INSERT_TAIL(&action_entry_list, entry, next);
+	pthread_mutex_unlock(&mp_mutex_action);
+	return 0;
+}
+
+void
+rte_eal_mp_action_unregister(const char *name)
+{
+	struct action_entry *entry;
+
+	if(!validate_action_name(name))
+		return;
+
+	pthread_mutex_lock(&mp_mutex_action);
+	entry = find_action_entry_by_name(name);
+	if (entry == NULL) {
+		pthread_mutex_unlock(&mp_mutex_action);
+		return;
+	}
+	TAILQ_REMOVE(&action_entry_list, entry, next);
+	pthread_mutex_unlock(&mp_mutex_action);
+	free(entry);
+}
+
+static int
+read_msg(struct rte_mp_msg *msg)
+{
+	int msglen;
+	struct iovec iov;
+	struct msghdr msgh;
+	char control[CMSG_SPACE(sizeof(msg->fds))];
+	struct cmsghdr *cmsg;
+	int buflen = sizeof(*msg) - sizeof(msg->fds);
+
+	memset(&msgh, 0, sizeof(msgh));
+	iov.iov_base = msg;
+	iov.iov_len  = buflen;
+
+	msgh.msg_iov = &iov;
+	msgh.msg_iovlen = 1;
+	msgh.msg_control = control;
+	msgh.msg_controllen = sizeof(control);
+
+	msglen = recvmsg(mp_fd, &msgh, 0);
+	if (msglen < 0) {
+		RTE_LOG(ERR, EAL, "recvmsg failed, %s\n", strerror(errno));
+		return -1;
+	}
+
+	if (msglen != buflen || (msgh.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
+		RTE_LOG(ERR, EAL, "truncted msg\n");
+		return -1;
+	}
+
+	/* read auxiliary FDs if any */
+	for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL;
+		cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
+		if ((cmsg->cmsg_level == SOL_SOCKET) &&
+			(cmsg->cmsg_type == SCM_RIGHTS)) {
+			memcpy(msg->fds, CMSG_DATA(cmsg), sizeof(msg->fds));
+			break;
+		}
+	}
+
+	return 0;
+}
+
+static void
+process_msg(struct rte_mp_msg *msg)
+{
+	struct action_entry *entry;
+	rte_eal_mp_t action = NULL;
+
+	RTE_LOG(DEBUG, EAL, "msg: %s\n", msg->name);
+	pthread_mutex_lock(&mp_mutex_action);
+	entry = find_action_entry_by_name(msg->name);
+	if (entry != NULL)
+		action = entry->action;
+	pthread_mutex_unlock(&mp_mutex_action);
+
+	if (!action)
+		RTE_LOG(ERR, EAL, "Cannot find action: %s\n", msg->name);
+	else if (action(msg) < 0)
+		RTE_LOG(ERR, EAL, "Fail to handle message: %s\n", msg->name);
+}
+
+static void *
+mp_handle(void *arg __rte_unused)
+{
+	struct rte_mp_msg msg;
+
+	while (1) {
+		if (read_msg(&msg) == 0)
+			process_msg(&msg);
+	}
+
+	return NULL;
+}
+
+static int
+open_socket_fd(void)
+{
+	struct sockaddr_un un;
+	const char *prefix = eal_mp_socket_path();
+
+	mp_fd = socket(AF_UNIX, SOCK_DGRAM, 0);
+	if (mp_fd < 0) {
+		RTE_LOG(ERR, EAL, "failed to create unix socket\n");
+		return -1;
+	}
+
+	memset(&un, 0, sizeof(un));
+	un.sun_family = AF_UNIX;
+	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+		snprintf(un.sun_path, sizeof(un.sun_path), "%s", prefix);
+	else
+		snprintf(un.sun_path, sizeof(un.sun_path), "%s_%d",
+			 prefix, getpid());
+	unlink(un.sun_path); /* May still exist since last run */
+	if (bind(mp_fd, (struct sockaddr *)&un, sizeof(un)) < 0) {
+		RTE_LOG(ERR, EAL, "failed to bind %s: %s\n",
+			un.sun_path, strerror(errno));
+		close(mp_fd);
+		return -1;
+	}
+
+	RTE_LOG(INFO, EAL, "Multi-process socket %s\n", un.sun_path);
+	return mp_fd;
+}
+
+static void
+unlink_sockets(void)
+{
+	int dir_fd;
+	DIR *mp_dir;
+	struct dirent *ent;
+
+	mp_dir = opendir(mp_dir_path);
+	if (!mp_dir) {
+		RTE_LOG(ERR, EAL, "Unable to open directory %s\n", mp_dir_path);
+		return;
+	}
+	dir_fd = dirfd(mp_dir);
+
+	while ((ent = readdir(mp_dir))) {
+		if (fnmatch(mp_filter, ent->d_name, 0) == 0)
+			unlinkat(dir_fd, ent->d_name, 0);
+	}
+
+	closedir(mp_dir);
+}
+
+int
+rte_eal_mp_channel_init(void)
+{
+	char thread_name[RTE_MAX_THREAD_NAME_LEN];
+	char *path;
+	pthread_t tid;
+
+	snprintf(mp_filter, PATH_MAX, ".%s_unix_*",
+		 internal_config.hugefile_prefix);
+
+	path = strdup(eal_mp_socket_path());
+	snprintf(mp_dir_path, PATH_MAX, "%s", dirname(path));
+	free(path);
+
+	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+		unlink_sockets();
+
+	if (open_socket_fd() < 0)
+		return -1;
+
+	snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "rte_mp_handle");
+
+	if (pthread_create(&tid, NULL, mp_handle, NULL) == 0) {
+		/* try best to set thread name */
+		rte_thread_setname(tid, thread_name);
+		return 0;
+	}
+
+	RTE_LOG(ERR, EAL, "failed to create mp thead: %s\n", strerror(errno));
+	close(mp_fd);
+	mp_fd = -1;
+	return -1;
+}
+
+static int
+send_msg(const char *dst_path, struct rte_mp_msg *msg)
+{
+	int snd;
+	struct iovec iov;
+	struct msghdr msgh;
+	struct cmsghdr *cmsg;
+	struct sockaddr_un dst;
+	int fd_size = msg->num_fds * sizeof(int);
+	char control[CMSG_SPACE(fd_size)];
+
+	memset(&dst, 0, sizeof(dst));
+	dst.sun_family = AF_UNIX;
+	snprintf(dst.sun_path, sizeof(dst.sun_path), "%s", dst_path);
+
+	memset(&msgh, 0, sizeof(msgh));
+	memset(control, 0, sizeof(control));
+
+	iov.iov_base = msg;
+	iov.iov_len = sizeof(*msg) - sizeof(msg->fds);
+
+	msgh.msg_name = &dst;
+	msgh.msg_namelen = sizeof(dst);
+	msgh.msg_iov = &iov;
+	msgh.msg_iovlen = 1;
+	msgh.msg_control = control;
+	msgh.msg_controllen = sizeof(control);
+
+	cmsg = CMSG_FIRSTHDR(&msgh);
+	cmsg->cmsg_len = CMSG_LEN(fd_size);
+	cmsg->cmsg_level = SOL_SOCKET;
+	cmsg->cmsg_type = SCM_RIGHTS;
+	memcpy(CMSG_DATA(cmsg), msg->fds, fd_size);
+
+	do {
+		snd = sendmsg(mp_fd, &msgh, 0);
+	} while (snd < 0 && errno == EINTR);
+
+	if (snd > 0)
+		return 1;
+
+	RTE_LOG(ERR, EAL, "failed to send to (%s) due to %s\n",
+		dst_path, strerror(errno));
+	return 0;
+}
+
+static int
+mp_send(struct rte_mp_msg *msg)
+{
+	int n = 0;
+	DIR *mp_dir;
+	struct dirent *ent;
+
+	if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+		/* broadcast to all secondary processes */
+		mp_dir = opendir(mp_dir_path);
+		if (!mp_dir) {
+			RTE_LOG(ERR, EAL, "Unable to open directory %s\n",
+				mp_dir_path);
+			return 0;
+		}
+		while ((ent = readdir(mp_dir))) {
+			if (fnmatch(mp_filter, ent->d_name, 0) != 0)
+				continue;
+
+			n += send_msg(ent->d_name, msg);
+		}
+		closedir(mp_dir);
+	} else
+		n += send_msg(eal_mp_socket_path(), msg);
+
+	return n;
+}
+
+static bool
+check_input(const struct rte_mp_msg *msg)
+{
+	if (msg == NULL) {
+		RTE_LOG(ERR, EAL, "Msg cannot be NULL\n");
+		rte_errno = -EINVAL;
+		return false;
+	}
+
+	if (!validate_action_name(msg->name))
+		return false;
+
+	if (msg->len_param > RTE_MP_MAX_PARAM_LEN) {
+		RTE_LOG(ERR, EAL, "Message data is too long\n");
+		rte_errno = -E2BIG;
+		return false;
+	}
+
+	if (msg->num_fds > RTE_MP_MAX_FD_NUM) {
+		RTE_LOG(ERR, EAL, "Cannot send more than %d FDs\n",
+			RTE_MP_MAX_FD_NUM);
+		rte_errno = -E2BIG;
+		return false;
+	}
+
+	return true;
+}
+
+int
+rte_eal_mp_sendmsg(struct rte_mp_msg *msg)
+{
+	if (!check_input(msg))
+		return -1;
+
+	RTE_LOG(DEBUG, EAL, "sendmsg: %s\n", msg->name);
+	return mp_send(msg);
+}
diff --git a/lib/librte_eal/common/eal_filesystem.h b/lib/librte_eal/common/eal_filesystem.h
index e8959eb..3b2929d 100644
--- a/lib/librte_eal/common/eal_filesystem.h
+++ b/lib/librte_eal/common/eal_filesystem.h
@@ -38,6 +38,23 @@ eal_runtime_config_path(void)
 	return buffer;
 }
 
+/** Path of primary/secondary communication unix socket file. */
+#define MP_SOCKET_PATH_FMT "%s/.%s_unix"
+static inline const char *
+eal_mp_socket_path(void)
+{
+	static char buffer[PATH_MAX]; /* static so auto-zeroed */
+	const char *directory = default_config_dir;
+	const char *home_dir = getenv("HOME");
+
+	if (getuid() != 0 && home_dir != NULL)
+		directory = home_dir;
+	snprintf(buffer, sizeof(buffer) - 1, MP_SOCKET_PATH_FMT,
+		 directory, internal_config.hugefile_prefix);
+
+	return buffer;
+}
+
 /** Path of hugepage info file. */
 #define HUGEPAGE_INFO_FMT "%s/.%s_hugepage_info"
 
diff --git a/lib/librte_eal/common/eal_private.h b/lib/librte_eal/common/eal_private.h
index c46dd8f..e36e3b5 100644
--- a/lib/librte_eal/common/eal_private.h
+++ b/lib/librte_eal/common/eal_private.h
@@ -195,4 +195,14 @@ int rte_eal_hugepage_attach(void);
  */
 struct rte_bus *rte_bus_find_by_device_name(const char *str);
 
+/**
+ * Create the unix channel for primary/secondary communication.
+ *
+ * @return
+ *   0 on success;
+ *   (<0) on failure.
+ */
+
+int rte_eal_mp_channel_init(void);
+
 #endif /* _EAL_PRIVATE_H_ */
diff --git a/lib/librte_eal/common/include/rte_eal.h b/lib/librte_eal/common/include/rte_eal.h
index 2aba2c8..9a1aac2 100644
--- a/lib/librte_eal/common/include/rte_eal.h
+++ b/lib/librte_eal/common/include/rte_eal.h
@@ -186,6 +186,81 @@ int rte_eal_init(int argc, char **argv);
  */
 int rte_eal_primary_proc_alive(const char *config_file_path);
 
+#define RTE_MP_MAX_FD_NUM	8    /* The max amount of fds */
+#define RTE_MP_MAX_NAME_LEN	64   /* The max length of action name */
+#define RTE_MP_MAX_PARAM_LEN	256  /* The max length of param */
+struct rte_mp_msg {
+	char name[RTE_MP_MAX_NAME_LEN];
+	int len_param;
+	int num_fds;
+	uint8_t param[RTE_MP_MAX_PARAM_LEN];
+	int fds[RTE_MP_MAX_FD_NUM];
+};
+
+/**
+ * Action function typedef used by other components.
+ *
+ * As we create  socket channel for primary/secondary communication, use
+ * this function typedef to register action for coming messages.
+ */
+typedef int (*rte_eal_mp_t)(const struct rte_mp_msg *msg);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Register an action function for primary/secondary communication.
+ *
+ * Call this function to register an action, if the calling component wants
+ * to response the messages from the corresponding component in its primary
+ * process or secondary processes.
+ *
+ * @param name
+ *   The name argument plays as the nonredundant key to find the action.
+ *
+ * @param action
+ *   The action argument is the function pointer to the action function.
+ *
+ * @return
+ *  - 0 on success.
+ *  - (<0) on failure.
+ */
+int rte_eal_mp_action_register(const char *name, rte_eal_mp_t action);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Unregister an action function for primary/secondary communication.
+ *
+ * Call this function to unregister an action  if the calling component does
+ * not want to response the messages from the corresponding component in its
+ * primary process or secondary processes.
+ *
+ * @param name
+ *   The name argument plays as the nonredundant key to find the action.
+ *
+ */
+void rte_eal_mp_action_unregister(const char *name);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Send a message to the peer process.
+ *
+ * This function will send a message which will be responsed by the action
+ * identified by name in the peer process.
+ *
+ * @param msg
+ *   The msg argument contains the customized message.
+ *
+ * @return
+ *  - (<0) on invalid parameters;
+ *  - (>=0) as the number of messages being sent successfully.
+ */
+int rte_eal_mp_sendmsg(struct rte_mp_msg *msg);
+
 /**
  * Usage function typedef used by the application usage function.
  *
diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index 229eec9..ad44ab5 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -852,6 +852,14 @@ rte_eal_init(int argc, char **argv)
 		return -1;
 	}
 
+	if (rte_eal_mp_channel_init() < 0) {
+		rte_eal_init_alert("failed to init mp channel\n");
+		if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+			rte_errno = EFAULT;
+			return -1;
+		}
+	}
+
 #ifdef VFIO_PRESENT
 	if (rte_eal_vfio_setup() < 0) {
 		rte_eal_init_alert("Cannot init VFIO\n");
diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
index 7088b72..adeadfb 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -217,6 +217,9 @@ EXPERIMENTAL {
 	rte_eal_devargs_remove;
 	rte_eal_hotplug_add;
 	rte_eal_hotplug_remove;
+	rte_eal_mp_action_register;
+	rte_eal_mp_action_unregister;
+	rte_eal_mp_sendmsg;
 	rte_service_attr_get;
 	rte_service_attr_reset_all;
 	rte_service_component_register;
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 88+ messages in thread

* [PATCH v3 2/3] eal: add synchronous multi-process communication
  2018-01-25  4:16 ` [PATCH v3 0/3] generic channel for multi-process communication Jianfeng Tan
  2018-01-25  4:16   ` [PATCH v3 1/3] eal: add " Jianfeng Tan
@ 2018-01-25  4:16   ` Jianfeng Tan
  2018-01-25 12:00     ` Burakov, Anatoly
  2018-01-25 12:22     ` Ananyev, Konstantin
  2018-01-25  4:16   ` [PATCH v3 3/3] vfio: use the generic multi-process channel Jianfeng Tan
  2 siblings, 2 replies; 88+ messages in thread
From: Jianfeng Tan @ 2018-01-25  4:16 UTC (permalink / raw)
  To: dev
  Cc: anatoly.burakov, bruce.richardson, konstantin.ananyev, thomas,
	Jianfeng Tan

We need a synchronous way for multi-process communication,
i.e., blocking while waiting for a reply message after we send a request
to the peer process.

We add two APIs rte_eal_mp_request() and rte_eal_mp_reply() for
such use case. By invoking rte_eal_mp_request(), a request message
is sent out, and then it waits there for a reply message. The caller
can specify the timeout. And the response messages will be collected
and returned so that the caller can decide how to translate them.

The API rte_eal_mp_reply() is always called by an mp action handler.
Here we add another parameter for rte_eal_mp_t so that the action
handler knows which peer address to reply to.

       sender-process                receiver-process
   ----------------------            ----------------

    thread-n
     |_rte_eal_mp_request() ----------> mp-thread
        |_timedwait()                    |_process_msg()
                                           |_action()
                                               |_rte_eal_mp_reply()
	        mp_thread  <---------------------|
                  |_process_msg()
                     |_signal(send_thread)
    thread-m <----------|
     |_collect-reply

 * A secondary process is only allowed to talk to the primary process.
 * If there are multiple secondary processes for the primary process,
   it will send request to peer1, collect response from peer1; then
   send request to peer2, collect response from peer2, and so on.
 * When thread-n is sending request, thread-m of that process can send
   request at the same time.
 * For pair <action_name, peer>, we guarantee that only one such request
   is on the fly.

Suggested-by: Anatoly Burakov <anatoly.burakov@intel.com>
Suggested-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
---
 doc/guides/rel_notes/release_18_02.rst  |  15 ++
 lib/librte_eal/common/eal_common_proc.c | 237 +++++++++++++++++++++++++++++---
 lib/librte_eal/common/include/rte_eal.h |  58 +++++++-
 lib/librte_eal/rte_eal_version.map      |   3 +
 4 files changed, 295 insertions(+), 18 deletions(-)

diff --git a/doc/guides/rel_notes/release_18_02.rst b/doc/guides/rel_notes/release_18_02.rst
index 00b3224..f6ed666 100644
--- a/doc/guides/rel_notes/release_18_02.rst
+++ b/doc/guides/rel_notes/release_18_02.rst
@@ -151,6 +151,21 @@ New Features
   renamed the application from SW PMD specific ``eventdev_pipeline_sw_pmd``
   to PMD agnostic ``eventdev_pipeline``.
 
+* **Added new multi-process communication channel**
+
+  Added a generic channel in EAL for multi-process (primary/secondary) synchronous
+  and asynchronous communication. Each component that wants to respond to a message
+  shall register the action; and each process has a thread to receive the message
+  and invoke the registered action. The list of new APIs:
+
+  * ``rte_eal_mp_action_register``
+  * ``rte_eal_mp_action_unregister``
+  * ``rte_eal_mp_sendmsg``
+  * ``rte_eal_mp_request``
+  * ``rte_eal_mp_reply``
+
+  Note that, as communication now uses the new channel, applications cannot
+  talk with older versions through the old (private) communication channel.
 
 API Changes
 -----------
diff --git a/lib/librte_eal/common/eal_common_proc.c b/lib/librte_eal/common/eal_common_proc.c
index baeb7d1..69df943 100644
--- a/lib/librte_eal/common/eal_common_proc.c
+++ b/lib/librte_eal/common/eal_common_proc.c
@@ -12,6 +12,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <sys/time.h>
 #include <sys/types.h>
 #include <sys/socket.h>
 #include <sys/un.h>
@@ -44,6 +45,50 @@ TAILQ_HEAD(action_entry_list, action_entry);
 static struct action_entry_list action_entry_list =
 	TAILQ_HEAD_INITIALIZER(action_entry_list);
 
+enum mp_type {
+	MP_MSG, /* Share message with peers, will not block */
+	MP_REQ, /* Request for information, Will block for a reply */
+	MP_REP, /* Response to previously-received request */
+};
+
+struct mp_msg_internal {
+	int type;
+	struct rte_mp_msg msg;
+};
+
+struct sync_request {
+	int reply_received;
+	char dst[PATH_MAX];
+	struct rte_mp_msg *request;
+	struct rte_mp_msg *reply;
+	pthread_cond_t cond;
+	TAILQ_ENTRY(sync_request) next;
+};
+
+TAILQ_HEAD(sync_request_list, sync_request);
+
+static struct {
+	struct sync_request_list requests;
+	pthread_mutex_t lock;
+} sync_requests = {
+	.requests = TAILQ_HEAD_INITIALIZER(sync_requests.requests),
+	.lock = PTHREAD_MUTEX_INITIALIZER
+};
+
+static struct sync_request *
+find_sync_request(const char *dst, const char *act_name)
+{
+	struct sync_request *r;
+
+	TAILQ_FOREACH(r, &sync_requests.requests, next) {
+		if (!strcmp(r->dst, dst) &&
+		    !strcmp(r->request->name, act_name))
+			break;
+	}
+
+	return r;
+}
+
 int
 rte_eal_primary_proc_alive(const char *config_file_path)
 {
@@ -147,19 +192,21 @@ rte_eal_mp_action_unregister(const char *name)
 }
 
 static int
-read_msg(struct rte_mp_msg *msg)
+read_msg(struct mp_msg_internal *m, struct sockaddr_un *s)
 {
 	int msglen;
 	struct iovec iov;
 	struct msghdr msgh;
-	char control[CMSG_SPACE(sizeof(msg->fds))];
+	char control[CMSG_SPACE(sizeof(m->msg.fds))];
 	struct cmsghdr *cmsg;
-	int buflen = sizeof(*msg) - sizeof(msg->fds);
+	int buflen = sizeof(*m) - sizeof(m->msg.fds);
 
 	memset(&msgh, 0, sizeof(msgh));
-	iov.iov_base = msg;
+	iov.iov_base = m;
 	iov.iov_len  = buflen;
 
+	msgh.msg_name = s;
+	msgh.msg_namelen = sizeof(*s);
 	msgh.msg_iov = &iov;
 	msgh.msg_iovlen = 1;
 	msgh.msg_control = control;
@@ -181,7 +228,7 @@ read_msg(struct rte_mp_msg *msg)
 		cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
 		if ((cmsg->cmsg_level == SOL_SOCKET) &&
 			(cmsg->cmsg_type == SCM_RIGHTS)) {
-			memcpy(msg->fds, CMSG_DATA(cmsg), sizeof(msg->fds));
+			memcpy(m->msg.fds, CMSG_DATA(cmsg), sizeof(m->msg.fds));
 			break;
 		}
 	}
@@ -190,12 +237,28 @@ read_msg(struct rte_mp_msg *msg)
 }
 
 static void
-process_msg(struct rte_mp_msg *msg)
+process_msg(struct mp_msg_internal *m, struct sockaddr_un *s)
 {
+	struct sync_request *sync_req;
 	struct action_entry *entry;
+	struct rte_mp_msg *msg = &m->msg;
 	rte_eal_mp_t action = NULL;
 
 	RTE_LOG(DEBUG, EAL, "msg: %s\n", msg->name);
+
+	if (m->type == MP_REP) {
+		pthread_mutex_lock(&sync_requests.lock);
+		sync_req = find_sync_request(s->sun_path, msg->name);
+		if (sync_req) {
+			memcpy(sync_req->reply, msg, sizeof(*msg));
+			sync_req->reply_received = 1;
+			pthread_cond_signal(&sync_req->cond);
+		} else
+			RTE_LOG(ERR, EAL, "Drop mp reply: %s\n", msg->name);
+		pthread_mutex_unlock(&sync_requests.lock);
+		return;
+	}
+
 	pthread_mutex_lock(&mp_mutex_action);
 	entry = find_action_entry_by_name(msg->name);
 	if (entry != NULL)
@@ -204,18 +267,19 @@ process_msg(struct rte_mp_msg *msg)
 
 	if (!action)
 		RTE_LOG(ERR, EAL, "Cannot find action: %s\n", msg->name);
-	else if (action(msg) < 0)
+	else if (action(msg, s->sun_path) < 0)
 		RTE_LOG(ERR, EAL, "Fail to handle message: %s\n", msg->name);
 }
 
 static void *
 mp_handle(void *arg __rte_unused)
 {
-	struct rte_mp_msg msg;
+	struct mp_msg_internal msg;
+	struct sockaddr_un sa;
 
 	while (1) {
-		if (read_msg(&msg) == 0)
-			process_msg(&msg);
+		if (read_msg(&msg, &sa) == 0)
+			process_msg(&msg, &sa);
 	}
 
 	return NULL;
@@ -309,16 +373,20 @@ rte_eal_mp_channel_init(void)
 }
 
 static int
-send_msg(const char *dst_path, struct rte_mp_msg *msg)
+send_msg(const char *dst_path, struct rte_mp_msg *msg, int type)
 {
 	int snd;
 	struct iovec iov;
 	struct msghdr msgh;
 	struct cmsghdr *cmsg;
 	struct sockaddr_un dst;
+	struct mp_msg_internal m;
 	int fd_size = msg->num_fds * sizeof(int);
 	char control[CMSG_SPACE(fd_size)];
 
+	m.type = type;
+	memcpy(&m.msg, msg, sizeof(*msg));
+
 	memset(&dst, 0, sizeof(dst));
 	dst.sun_family = AF_UNIX;
 	snprintf(dst.sun_path, sizeof(dst.sun_path), "%s", dst_path);
@@ -326,8 +394,8 @@ send_msg(const char *dst_path, struct rte_mp_msg *msg)
 	memset(&msgh, 0, sizeof(msgh));
 	memset(control, 0, sizeof(control));
 
-	iov.iov_base = msg;
-	iov.iov_len = sizeof(*msg) - sizeof(msg->fds);
+	iov.iov_base = &m;
+	iov.iov_len = sizeof(m) - sizeof(msg->fds);
 
 	msgh.msg_name = &dst;
 	msgh.msg_namelen = sizeof(dst);
@@ -355,12 +423,16 @@ send_msg(const char *dst_path, struct rte_mp_msg *msg)
 }
 
 static int
-mp_send(struct rte_mp_msg *msg)
+mp_send(struct rte_mp_msg *msg, const char *peer, int type)
 {
 	int n = 0;
 	DIR *mp_dir;
 	struct dirent *ent;
 
+
+	if (peer)
+		return send_msg(peer, msg, type);
+
 	if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
 		/* broadcast to all secondary processes */
 		mp_dir = opendir(mp_dir_path);
@@ -373,11 +445,11 @@ mp_send(struct rte_mp_msg *msg)
 			if (fnmatch(mp_filter, ent->d_name, 0) != 0)
 				continue;
 
-			n += send_msg(ent->d_name, msg);
+			n += send_msg(ent->d_name, msg, type);
 		}
 		closedir(mp_dir);
 	} else
-		n += send_msg(eal_mp_socket_path(), msg);
+		n += send_msg(eal_mp_socket_path(), msg, type);
 
 	return n;
 }
@@ -417,5 +489,136 @@ rte_eal_mp_sendmsg(struct rte_mp_msg *msg)
 		return -1;
 
 	RTE_LOG(DEBUG, EAL, "sendmsg: %s\n", msg->name);
-	return mp_send(msg);
+	return mp_send(msg, NULL, MP_MSG);
+}
+
+static int
+mp_request_one(const char *dst, struct rte_mp_msg *req,
+	       struct rte_mp_reply *reply, const struct timespec *ts)
+{
+	struct timeval now;
+	struct rte_mp_msg msg, *tmp;
+	struct sync_request sync_req, *exist;
+
+	sync_req.reply_received = 0;
+	strcpy(sync_req.dst, dst);
+	sync_req.request = req;
+	sync_req.reply = &msg;
+	pthread_cond_init(&sync_req.cond, NULL);
+
+	pthread_mutex_lock(&sync_requests.lock);
+	exist = find_sync_request(dst, req->name);
+	if (!exist)
+		TAILQ_INSERT_TAIL(&sync_requests.requests, &sync_req, next);
+	pthread_mutex_unlock(&sync_requests.lock);
+	if (exist) {
+		RTE_LOG(ERR, EAL, "A pending request %s:%s\n", dst, req->name);
+		return 0;
+	}
+
+	if (send_msg(dst, req, MP_REQ) != 1) {
+		RTE_LOG(ERR, EAL, "Fail to send request %s:%s\n",
+			dst, req->name);
+		return 0;
+	}
+
+	pthread_mutex_lock(&sync_requests.lock);
+	do {
+		pthread_cond_timedwait(&sync_req.cond, &sync_requests.lock, ts);
+		/* Check spurious wakeups */
+		if (sync_req.reply_received == 1)
+			break;
+		/* Check if time is out */
+		if (gettimeofday(&now, NULL) < 0)
+			break;
+		if (now.tv_sec < ts->tv_sec)
+			break;
+		else if (now.tv_sec == ts->tv_sec &&
+			 now.tv_usec * 1000 < ts->tv_nsec)
+			break;
+	} while (1);
+	/* We got the lock now */
+	TAILQ_REMOVE(&sync_requests.requests, &sync_req, next);
+	pthread_mutex_unlock(&sync_requests.lock);
+
+	if (sync_req.reply_received == 0) {
+		RTE_LOG(ERR, EAL, "Fail to recv reply for request %s:%s\n",
+			dst, req->name);
+		return 1;
+	}
+
+	tmp = realloc(reply->msgs, sizeof(msg) * (reply->nb_msgs + 1));
+	if (!tmp) {
+		RTE_LOG(ERR, EAL, "Fail to alloc reply for request %s:%s\n",
+			dst, req->name);
+		return 1;
+	}
+	memcpy(&tmp[reply->nb_msgs], &msg, sizeof(msg));
+	reply->msgs = tmp;
+	reply->nb_msgs++;
+	return 1;
+}
+
+int
+rte_eal_mp_request(struct rte_mp_msg *req,
+		   struct rte_mp_reply *reply,
+		   const struct timespec *ts)
+{
+	DIR *mp_dir;
+	struct dirent *ent;
+	int nb_snds = 0;
+	struct timeval now;
+	struct timespec end;
+
+	RTE_LOG(DEBUG, EAL, "request: %s\n", req->name);
+
+	if (check_input(req) == false)
+		return -1;
+	if (gettimeofday(&now, NULL) < 0) {
+		RTE_LOG(ERR, EAL, "Faile to get current time\n");
+		return -1;
+	}
+	end.tv_nsec = (now.tv_usec * 1000 + ts->tv_nsec) % 1000000000;
+	end.tv_sec = now.tv_sec + ts->tv_sec +
+			(now.tv_usec * 1000 + ts->tv_nsec) / 1000000000;
+
+	reply->nb_msgs = 0;
+	reply->msgs = NULL;
+
+	/* for secondary process, send request to the primary process only */
+	if (rte_eal_process_type() == RTE_PROC_SECONDARY)
+		return mp_request_one(eal_mp_socket_path(), req, reply, &end);
+
+	/* for primary process, broadcast request, and collect reply 1 by 1 */
+	mp_dir = opendir(mp_dir_path);
+	if (!mp_dir) {
+		RTE_LOG(ERR, EAL, "Unable to open directory %s\n", mp_dir_path);
+		return -1;
+	}
+	while ((ent = readdir(mp_dir))) {
+		if (fnmatch(mp_filter, ent->d_name, 0) != 0)
+			continue;
+
+		nb_snds += mp_request_one(ent->d_name, req, reply, &end);
+	}
+	closedir(mp_dir);
+
+	return nb_snds;
+}
+
+int
+rte_eal_mp_reply(struct rte_mp_msg *msg, const char *peer)
+{
+
+	RTE_LOG(DEBUG, EAL, "reply: %s\n", msg->name);
+
+	if (check_input(msg) == false)
+		return -1;
+
+	if (peer == NULL) {
+		RTE_LOG(ERR, EAL, "peer is not specified\n");
+		return -1;
+	}
+
+	return mp_send(msg, peer, MP_REP);
 }
diff --git a/lib/librte_eal/common/include/rte_eal.h b/lib/librte_eal/common/include/rte_eal.h
index 9a1aac2..8e234e0 100644
--- a/lib/librte_eal/common/include/rte_eal.h
+++ b/lib/librte_eal/common/include/rte_eal.h
@@ -13,6 +13,7 @@
 
 #include <stdint.h>
 #include <sched.h>
+#include <time.h>
 
 #include <rte_config.h>
 #include <rte_per_lcore.h>
@@ -197,13 +198,18 @@ struct rte_mp_msg {
 	int fds[RTE_MP_MAX_FD_NUM];
 };
 
+struct rte_mp_reply {
+	int nb_msgs;
+	struct rte_mp_msg *msgs;
+};
+
 /**
  * Action function typedef used by other components.
  *
  * As we create  socket channel for primary/secondary communication, use
  * this function typedef to register action for coming messages.
  */
-typedef int (*rte_eal_mp_t)(const struct rte_mp_msg *msg);
+typedef int (*rte_eal_mp_t)(const struct rte_mp_msg *msg, const void *peer);
 
 /**
  * @warning
@@ -262,6 +268,56 @@ void rte_eal_mp_action_unregister(const char *name);
 int rte_eal_mp_sendmsg(struct rte_mp_msg *msg);
 
 /**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Send a request to the peer process and expect a reply.
+ *
+ * This function sends a request message to the peer process, and will
+ * block until receiving reply message from the peer process.
+ *
+ * @note The caller is responsible to free reply->replies.
+ *
+ * @param req
+ *   The req argument contains the customized request message.
+ *
+ * @param reply
+ *   The reply argument will be for storing all the replied messages;
+ *   the caller is responsible for free reply->replies.
+ *
+ * @param ts
+ *   The ts argument specifies how long we can wait for the peer(s) to reply.
+ *
+ * @return
+ *  - (<0) on invalid parameters;
+ *  - (>=0) as the number of messages being sent successfully.
+ */
+int rte_eal_mp_request(struct rte_mp_msg *req,
+			struct rte_mp_reply *reply, const struct timespec *ts);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Send a reply to the peer process.
+ *
+ * This function will send a reply message in response to a request message
+ * received previously.
+ *
+ * @param msg
+ *   The msg argument contains the customized message.
+ *
+ * @param peer
+ *   The peer argument is the pointer to the peer socket path.
+ *
+ * @return
+ *  - (1) on success;
+ *  - (0) on failure;
+ *  - (<0) on invalid parameters.
+ */
+int rte_eal_mp_reply(struct rte_mp_msg *msg, const char *peer);
+
+/**
  * Usage function typedef used by the application usage function.
  *
  * Use this function typedef to define and call rte_set_application_usage_hook()
diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
index adeadfb..3015bc6 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -220,6 +220,9 @@ EXPERIMENTAL {
 	rte_eal_mp_action_register;
 	rte_eal_mp_action_unregister;
 	rte_eal_mp_sendmsg;
+	rte_eal_mp_request;
+	rte_eal_mp_reply;
+	rte_eal_mp_sendmsg;
 	rte_service_attr_get;
 	rte_service_attr_reset_all;
 	rte_service_component_register;
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 88+ messages in thread

* [PATCH v3 3/3] vfio: use the generic multi-process channel
  2018-01-25  4:16 ` [PATCH v3 0/3] generic channel for multi-process communication Jianfeng Tan
  2018-01-25  4:16   ` [PATCH v3 1/3] eal: add " Jianfeng Tan
  2018-01-25  4:16   ` [PATCH v3 2/3] eal: add synchronous " Jianfeng Tan
@ 2018-01-25  4:16   ` Jianfeng Tan
  2018-01-25 10:47     ` Thomas Monjalon
  2 siblings, 1 reply; 88+ messages in thread
From: Jianfeng Tan @ 2018-01-25  4:16 UTC (permalink / raw)
  To: dev
  Cc: anatoly.burakov, bruce.richardson, konstantin.ananyev, thomas,
	Jianfeng Tan

Previously, vfio used its own private channel for the secondary
process to get container fd and group fd from the primary process.

This patch changes to use the generic mp channel.

Test:
  1. Bind two NICs to vfio-pci.

  2. Start the primary and secondary process.
    $ (symmetric_mp) -c 2 -- -p 3 --num-procs=2 --proc-id=0
    $ (symmetric_mp) -c 4 --proc-type=auto -- -p 3 --num-procs=2 --proc-id=1

Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
---
 lib/librte_eal/linuxapp/eal/eal.c              |  14 +-
 lib/librte_eal/linuxapp/eal/eal_vfio.c         | 172 +++++------
 lib/librte_eal/linuxapp/eal/eal_vfio.h         |  15 +-
 lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c | 408 ++++---------------------
 4 files changed, 136 insertions(+), 473 deletions(-)

diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index ad44ab5..66a79a1 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -713,18 +713,8 @@ static int rte_eal_vfio_setup(void)
 		return -1;
 	vfio_enabled = rte_vfio_is_enabled("vfio");
 
-	if (vfio_enabled) {
-
-		/* if we are primary process, create a thread to communicate with
-		 * secondary processes. the thread will use a socket to wait for
-		 * requests from secondary process to send open file descriptors,
-		 * because VFIO does not allow multiple open descriptors on a group or
-		 * VFIO container.
-		 */
-		if (internal_config.process_type == RTE_PROC_PRIMARY &&
-				vfio_mp_sync_setup() < 0)
-			return -1;
-	}
+	if (vfio_enabled && vfio_mp_sync_setup() < 0)
+		return -1;
 
 	return 0;
 }
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c b/lib/librte_eal/linuxapp/eal/eal_vfio.c
index e44ae4d..c2f8486 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.c
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.c
@@ -39,9 +39,14 @@ int
 vfio_get_group_fd(int iommu_group_no)
 {
 	int i;
+	int ret;
 	int vfio_group_fd;
 	char filename[PATH_MAX];
 	struct vfio_group *cur_grp;
+	struct rte_mp_msg req, *rep;
+	struct rte_mp_reply reply;
+	struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
+	struct vfio_mp_param *p = (struct vfio_mp_param *)req.param;
 
 	/* check if we already have the group descriptor open */
 	for (i = 0; i < VFIO_MAX_GROUPS; i++)
@@ -101,50 +106,31 @@ vfio_get_group_fd(int iommu_group_no)
 		return vfio_group_fd;
 	}
 	/* if we're in a secondary process, request group fd from the primary
-	 * process via our socket
+	 * process via mp channel
 	 */
-	else {
-		int socket_fd, ret;
-
-		socket_fd = vfio_mp_sync_connect_to_primary();
-
-		if (socket_fd < 0) {
-			RTE_LOG(ERR, EAL, "  cannot connect to primary process!\n");
-			return -1;
-		}
-		if (vfio_mp_sync_send_request(socket_fd, SOCKET_REQ_GROUP) < 0) {
-			RTE_LOG(ERR, EAL, "  cannot request container fd!\n");
-			close(socket_fd);
-			return -1;
-		}
-		if (vfio_mp_sync_send_request(socket_fd, iommu_group_no) < 0) {
-			RTE_LOG(ERR, EAL, "  cannot send group number!\n");
-			close(socket_fd);
-			return -1;
-		}
-		ret = vfio_mp_sync_receive_request(socket_fd);
-		switch (ret) {
-		case SOCKET_NO_FD:
-			close(socket_fd);
-			return 0;
-		case SOCKET_OK:
-			vfio_group_fd = vfio_mp_sync_receive_fd(socket_fd);
-			/* if we got the fd, store it and return it */
-			if (vfio_group_fd > 0) {
-				close(socket_fd);
-				cur_grp->group_no = iommu_group_no;
-				cur_grp->fd = vfio_group_fd;
-				vfio_cfg.vfio_active_groups++;
-				return vfio_group_fd;
-			}
-			/* fall-through on error */
-		default:
-			RTE_LOG(ERR, EAL, "  cannot get container fd!\n");
-			close(socket_fd);
-			return -1;
+	p->req = SOCKET_REQ_GROUP;
+	p->group_no = iommu_group_no;
+	strcpy(req.name, "vfio");
+	req.len_param = sizeof(*p);
+	req.num_fds = 0;
+
+	vfio_group_fd = -1;
+	ret = rte_eal_mp_request(&req, &reply, &ts);
+	if (ret > 0 && reply.nb_msgs > 0) {
+		rep = &reply.msgs[0];
+		p = (struct vfio_mp_param *)rep->param;
+		if (p->result == SOCKET_OK && rep->num_fds == 1) {
+			cur_grp->group_no = iommu_group_no;
+			vfio_group_fd = rep->fds[0];
+			cur_grp->fd = vfio_group_fd;
+			vfio_cfg.vfio_active_groups++;
 		}
+		free(reply.msgs);
 	}
-	return -1;
+
+	if (vfio_group_fd < 0)
+		RTE_LOG(ERR, EAL, "  cannot request group fd\n");
+	return vfio_group_fd;
 }
 
 
@@ -200,7 +186,11 @@ int
 rte_vfio_clear_group(int vfio_group_fd)
 {
 	int i;
-	int socket_fd, ret;
+	int ret;
+	struct rte_mp_msg req, *rep;
+	struct rte_mp_reply reply;
+	struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
+	struct vfio_mp_param *p = (struct vfio_mp_param *)req.param;
 
 	if (internal_config.process_type == RTE_PROC_PRIMARY) {
 
@@ -214,43 +204,24 @@ rte_vfio_clear_group(int vfio_group_fd)
 		return 0;
 	}
 
-	/* This is just for SECONDARY processes */
-	socket_fd = vfio_mp_sync_connect_to_primary();
-
-	if (socket_fd < 0) {
-		RTE_LOG(ERR, EAL, "  cannot connect to primary process!\n");
-		return -1;
-	}
-
-	if (vfio_mp_sync_send_request(socket_fd, SOCKET_CLR_GROUP) < 0) {
-		RTE_LOG(ERR, EAL, "  cannot request container fd!\n");
-		close(socket_fd);
-		return -1;
-	}
+	p->req = SOCKET_CLR_GROUP;
+	p->group_no = vfio_group_fd;
+	strcpy(req.name, "vfio");
+	req.len_param = sizeof(*p);
+	req.num_fds = 0;
 
-	if (vfio_mp_sync_send_request(socket_fd, vfio_group_fd) < 0) {
-		RTE_LOG(ERR, EAL, "  cannot send group fd!\n");
-		close(socket_fd);
-		return -1;
+	ret = rte_eal_mp_request(&req, &reply, &ts);
+	if (ret > 0 && reply.nb_msgs > 0) {
+		rep = &reply.msgs[0];
+		p = (struct vfio_mp_param *)rep->param;
+		if (p->result == SOCKET_OK) {
+			free(reply.msgs);
+			return 0;
+		}
+		free(reply.msgs);
 	}
 
-	ret = vfio_mp_sync_receive_request(socket_fd);
-	switch (ret) {
-	case SOCKET_NO_FD:
-		RTE_LOG(ERR, EAL, "  BAD VFIO group fd!\n");
-		close(socket_fd);
-		break;
-	case SOCKET_OK:
-		close(socket_fd);
-		return 0;
-	case SOCKET_ERR:
-		RTE_LOG(ERR, EAL, "  Socket error\n");
-		close(socket_fd);
-		break;
-	default:
-		RTE_LOG(ERR, EAL, "  UNKNOWN reply, %d\n", ret);
-		close(socket_fd);
-	}
+	RTE_LOG(ERR, EAL, "  BAD VFIO group fd!\n");
 	return -1;
 }
 
@@ -561,6 +532,11 @@ int
 vfio_get_container_fd(void)
 {
 	int ret, vfio_container_fd;
+	struct rte_mp_msg req, *rep;
+	struct rte_mp_reply reply;
+	struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
+	struct vfio_mp_param *p = (struct vfio_mp_param *)req.param;
+
 
 	/* if we're in a primary process, try to open the container */
 	if (internal_config.process_type == RTE_PROC_PRIMARY) {
@@ -591,33 +567,29 @@ vfio_get_container_fd(void)
 		}
 
 		return vfio_container_fd;
-	} else {
-		/*
-		 * if we're in a secondary process, request container fd from the
-		 * primary process via our socket
-		 */
-		int socket_fd;
-
-		socket_fd = vfio_mp_sync_connect_to_primary();
-		if (socket_fd < 0) {
-			RTE_LOG(ERR, EAL, "  cannot connect to primary process!\n");
-			return -1;
-		}
-		if (vfio_mp_sync_send_request(socket_fd, SOCKET_REQ_CONTAINER) < 0) {
-			RTE_LOG(ERR, EAL, "  cannot request container fd!\n");
-			close(socket_fd);
-			return -1;
-		}
-		vfio_container_fd = vfio_mp_sync_receive_fd(socket_fd);
-		if (vfio_container_fd < 0) {
-			RTE_LOG(ERR, EAL, "  cannot get container fd!\n");
-			close(socket_fd);
-			return -1;
+	}
+	/*
+	 * if we're in a secondary process, request container fd from the
+	 * primary process via mp channel
+	 */
+	p->req = SOCKET_REQ_CONTAINER;
+	strcpy(req.name, "vfio");
+	req.len_param = sizeof(*p);
+	req.num_fds = 0;
+
+	vfio_container_fd = -1;
+	ret = rte_eal_mp_request(&req, &reply, &ts);
+	if (ret > 0 && reply.nb_msgs > 0) {
+		rep = &reply.msgs[0];
+		p = (struct vfio_mp_param *)rep->param;
+		if (p->result == SOCKET_OK && rep->num_fds == 1) {
+			free(reply.msgs);
+			return rep->fds[0];
 		}
-		close(socket_fd);
-		return vfio_container_fd;
+		free(reply.msgs);
 	}
 
+	RTE_LOG(ERR, EAL, "  cannot request container fd\n");
 	return -1;
 }
 
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.h b/lib/librte_eal/linuxapp/eal/eal_vfio.h
index 8059577..6b48969 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.h
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.h
@@ -88,15 +88,6 @@ struct vfio_iommu_spapr_tce_info {
 #define VFIO_MAX_GROUPS RTE_MAX_VFIO_GROUPS
 
 /*
- * Function prototypes for VFIO multiprocess sync functions
- */
-int vfio_mp_sync_send_request(int socket, int req);
-int vfio_mp_sync_receive_request(int socket);
-int vfio_mp_sync_send_fd(int socket, int fd);
-int vfio_mp_sync_receive_fd(int socket);
-int vfio_mp_sync_connect_to_primary(void);
-
-/*
  * we don't need to store device fd's anywhere since they can be obtained from
  * the group fd via an ioctl() call.
  */
@@ -157,6 +148,12 @@ int vfio_mp_sync_setup(void);
 #define SOCKET_NO_FD 0x1
 #define SOCKET_ERR 0xFF
 
+struct vfio_mp_param {
+	int req;
+	int result;
+	int group_no;
+};
+
 #endif /* VFIO_PRESENT */
 
 #endif /* EAL_VFIO_H_ */
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c b/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
index 7cc3c15..8c2f409 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
@@ -2,31 +2,14 @@
  * Copyright(c) 2010-2014 Intel Corporation
  */
 
+#include <unistd.h>
 #include <string.h>
-#include <fcntl.h>
-#include <sys/socket.h>
-#include <pthread.h>
-
-/* sys/un.h with __USE_MISC uses strlen, which is unsafe */
-#ifdef __USE_MISC
-#define REMOVED_USE_MISC
-#undef __USE_MISC
-#endif
-#include <sys/un.h>
-/* make sure we redefine __USE_MISC only if it was previously undefined */
-#ifdef REMOVED_USE_MISC
-#define __USE_MISC
-#undef REMOVED_USE_MISC
-#endif
 
 #include <rte_log.h>
-#include <rte_eal_memconfig.h>
-#include <rte_malloc.h>
 #include <rte_vfio.h>
+#include <rte_eal.h>
 
-#include "eal_filesystem.h"
 #include "eal_vfio.h"
-#include "eal_thread.h"
 
 /**
  * @file
@@ -37,360 +20,81 @@
 
 #ifdef VFIO_PRESENT
 
-#define SOCKET_PATH_FMT "%s/.%s_mp_socket"
-#define CMSGLEN (CMSG_LEN(sizeof(int)))
-#define FD_TO_CMSGHDR(fd, chdr) \
-		do {\
-			(chdr).cmsg_len = CMSGLEN;\
-			(chdr).cmsg_level = SOL_SOCKET;\
-			(chdr).cmsg_type = SCM_RIGHTS;\
-			memcpy((chdr).__cmsg_data, &(fd), sizeof(fd));\
-		} while (0)
-#define CMSGHDR_TO_FD(chdr, fd) \
-			memcpy(&(fd), (chdr).__cmsg_data, sizeof(fd))
-
-static pthread_t socket_thread;
-static int mp_socket_fd;
-
-
-/* get socket path (/var/run if root, $HOME otherwise) */
-static void
-get_socket_path(char *buffer, int bufsz)
-{
-	const char *dir = "/var/run";
-	const char *home_dir = getenv("HOME");
-
-	if (getuid() != 0 && home_dir != NULL)
-		dir = home_dir;
-
-	/* use current prefix as file path */
-	snprintf(buffer, bufsz, SOCKET_PATH_FMT, dir,
-			internal_config.hugefile_prefix);
-}
-
-
-
-/*
- * data flow for socket comm protocol:
- * 1. client sends SOCKET_REQ_CONTAINER or SOCKET_REQ_GROUP
- * 1a. in case of SOCKET_REQ_GROUP, client also then sends group number
- * 2. server receives message
- * 2a. in case of invalid group, SOCKET_ERR is sent back to client
- * 2b. in case of unbound group, SOCKET_NO_FD is sent back to client
- * 2c. in case of valid group, SOCKET_OK is sent and immediately followed by fd
- *
- * in case of any error, socket is closed.
- */
-
-/* send a request, return -1 on error */
-int
-vfio_mp_sync_send_request(int socket, int req)
-{
-	struct msghdr hdr;
-	struct iovec iov;
-	int buf;
-	int ret;
-
-	memset(&hdr, 0, sizeof(hdr));
-
-	buf = req;
-
-	hdr.msg_iov = &iov;
-	hdr.msg_iovlen = 1;
-	iov.iov_base = (char *) &buf;
-	iov.iov_len = sizeof(buf);
-
-	ret = sendmsg(socket, &hdr, 0);
-	if (ret < 0)
-		return -1;
-	return 0;
-}
-
-/* receive a request and return it */
-int
-vfio_mp_sync_receive_request(int socket)
-{
-	int buf;
-	struct msghdr hdr;
-	struct iovec iov;
-	int ret, req;
-
-	memset(&hdr, 0, sizeof(hdr));
-
-	buf = SOCKET_ERR;
-
-	hdr.msg_iov = &iov;
-	hdr.msg_iovlen = 1;
-	iov.iov_base = (char *) &buf;
-	iov.iov_len = sizeof(buf);
-
-	ret = recvmsg(socket, &hdr, 0);
-	if (ret < 0)
-		return -1;
-
-	req = buf;
-
-	return req;
-}
-
-/* send OK in message, fd in control message */
-int
-vfio_mp_sync_send_fd(int socket, int fd)
+static int
+vfio_mp_primary(const struct rte_mp_msg *msg, const void *peer)
 {
-	int buf;
-	struct msghdr hdr;
-	struct cmsghdr *chdr;
-	char chdr_buf[CMSGLEN];
-	struct iovec iov;
+	int fd;
+	int num;
 	int ret;
+	struct rte_mp_msg reply;
+	struct vfio_mp_param *r = (struct vfio_mp_param *)reply.param;
+	const struct vfio_mp_param *m = (const struct vfio_mp_param *)msg->param;
 
-	chdr = (struct cmsghdr *) chdr_buf;
-	memset(chdr, 0, sizeof(chdr_buf));
-	memset(&hdr, 0, sizeof(hdr));
-
-	hdr.msg_iov = &iov;
-	hdr.msg_iovlen = 1;
-	iov.iov_base = (char *) &buf;
-	iov.iov_len = sizeof(buf);
-	hdr.msg_control = chdr;
-	hdr.msg_controllen = CMSGLEN;
-
-	buf = SOCKET_OK;
-	FD_TO_CMSGHDR(fd, *chdr);
-
-	ret = sendmsg(socket, &hdr, 0);
-	if (ret < 0)
-		return -1;
-	return 0;
-}
-
-/* receive OK in message, fd in control message */
-int
-vfio_mp_sync_receive_fd(int socket)
-{
-	int buf;
-	struct msghdr hdr;
-	struct cmsghdr *chdr;
-	char chdr_buf[CMSGLEN];
-	struct iovec iov;
-	int ret, req, fd;
-
-	buf = SOCKET_ERR;
-
-	chdr = (struct cmsghdr *) chdr_buf;
-	memset(chdr, 0, sizeof(chdr_buf));
-	memset(&hdr, 0, sizeof(hdr));
-
-	hdr.msg_iov = &iov;
-	hdr.msg_iovlen = 1;
-	iov.iov_base = (char *) &buf;
-	iov.iov_len = sizeof(buf);
-	hdr.msg_control = chdr;
-	hdr.msg_controllen = CMSGLEN;
-
-	ret = recvmsg(socket, &hdr, 0);
-	if (ret < 0)
-		return -1;
-
-	req = buf;
-
-	if (req != SOCKET_OK)
-		return -1;
-
-	CMSGHDR_TO_FD(*chdr, fd);
-
-	return fd;
-}
-
-/* connect socket_fd in secondary process to the primary process's socket */
-int
-vfio_mp_sync_connect_to_primary(void)
-{
-	struct sockaddr_un addr;
-	socklen_t sockaddr_len;
-	int socket_fd;
-
-	/* set up a socket */
-	socket_fd = socket(AF_UNIX, SOCK_SEQPACKET, 0);
-	if (socket_fd < 0) {
-		RTE_LOG(ERR, EAL, "Failed to create socket!\n");
+	if (msg->len_param != sizeof(*m)) {
+		RTE_LOG(ERR, EAL, "vfio received invalid message!\n");
 		return -1;
 	}
 
-	get_socket_path(addr.sun_path, sizeof(addr.sun_path));
-	addr.sun_family = AF_UNIX;
-
-	sockaddr_len = sizeof(struct sockaddr_un);
-
-	if (connect(socket_fd, (struct sockaddr *) &addr, sockaddr_len) == 0)
-		return socket_fd;
-
-	/* if connect failed */
-	close(socket_fd);
-	return -1;
-}
-
+	memset(&reply, 0, sizeof(reply));
 
-
-/*
- * socket listening thread for primary process
- */
-static __attribute__((noreturn)) void *
-vfio_mp_sync_thread(void __rte_unused * arg)
-{
-	int ret, fd, vfio_data;
-
-	/* wait for requests on the socket */
-	for (;;) {
-		int conn_sock;
-		struct sockaddr_un addr;
-		socklen_t sockaddr_len = sizeof(addr);
-
-		/* this is a blocking call */
-		conn_sock = accept(mp_socket_fd, (struct sockaddr *) &addr,
-				&sockaddr_len);
-
-		/* just restart on error */
-		if (conn_sock == -1)
-			continue;
-
-		/* set socket to linger after close */
-		struct linger l;
-		l.l_onoff = 1;
-		l.l_linger = 60;
-
-		if (setsockopt(conn_sock, SOL_SOCKET, SO_LINGER, &l, sizeof(l)) < 0)
-			RTE_LOG(WARNING, EAL, "Cannot set SO_LINGER option "
-					"on listen socket (%s)\n", strerror(errno));
-
-		ret = vfio_mp_sync_receive_request(conn_sock);
-
-		switch (ret) {
-		case SOCKET_REQ_CONTAINER:
-			fd = vfio_get_container_fd();
-			if (fd < 0)
-				vfio_mp_sync_send_request(conn_sock, SOCKET_ERR);
-			else
-				vfio_mp_sync_send_fd(conn_sock, fd);
-			if (fd >= 0)
-				close(fd);
-			break;
-		case SOCKET_REQ_GROUP:
-			/* wait for group number */
-			vfio_data = vfio_mp_sync_receive_request(conn_sock);
-			if (vfio_data < 0) {
-				close(conn_sock);
-				continue;
-			}
-
-			fd = vfio_get_group_fd(vfio_data);
-
-			if (fd < 0)
-				vfio_mp_sync_send_request(conn_sock, SOCKET_ERR);
+	switch (m->req) {
+	case SOCKET_REQ_GROUP:
+		r->req = SOCKET_REQ_GROUP;
+		r->group_no = m->group_no;
+		fd = vfio_get_group_fd(m->group_no);
+		if (fd < 0)
+			r->result = SOCKET_ERR;
+		else if (fd == 0)
 			/* if VFIO group exists but isn't bound to VFIO driver */
-			else if (fd == 0)
-				vfio_mp_sync_send_request(conn_sock, SOCKET_NO_FD);
+			r->result = SOCKET_NO_FD;
+		else {
 			/* if group exists and is bound to VFIO driver */
-			else {
-				vfio_mp_sync_send_request(conn_sock, SOCKET_OK);
-				vfio_mp_sync_send_fd(conn_sock, fd);
-			}
-			break;
-		case SOCKET_CLR_GROUP:
-			/* wait for group fd */
-			vfio_data = vfio_mp_sync_receive_request(conn_sock);
-			if (vfio_data < 0) {
-				close(conn_sock);
-				continue;
-			}
-
-			ret = rte_vfio_clear_group(vfio_data);
-
-			if (ret < 0)
-				vfio_mp_sync_send_request(conn_sock, SOCKET_NO_FD);
-			else
-				vfio_mp_sync_send_request(conn_sock, SOCKET_OK);
-			break;
-		default:
-			vfio_mp_sync_send_request(conn_sock, SOCKET_ERR);
-			break;
+			r->result = SOCKET_OK;
+			num = 1;
 		}
-		close(conn_sock);
-	}
-}
-
-static int
-vfio_mp_sync_socket_setup(void)
-{
-	int ret, socket_fd;
-	struct sockaddr_un addr;
-	socklen_t sockaddr_len;
-
-	/* set up a socket */
-	socket_fd = socket(AF_UNIX, SOCK_SEQPACKET, 0);
-	if (socket_fd < 0) {
-		RTE_LOG(ERR, EAL, "Failed to create socket!\n");
+		break;
+	case SOCKET_CLR_GROUP:
+		r->req = SOCKET_CLR_GROUP;
+		r->group_no = m->group_no;
+		if (rte_vfio_clear_group(m->group_no) < 0)
+			r->result = SOCKET_NO_FD;
+		else
+			r->result = SOCKET_OK;
+		break;
+	case SOCKET_REQ_CONTAINER:
+		r->req = SOCKET_REQ_CONTAINER;
+		fd = vfio_get_container_fd();
+		if (fd < 0)
+			r->result = SOCKET_ERR;
+		else {
+			r->result = SOCKET_OK;
+			num = 1;
+		}
+		break;
+	default:
+		RTE_LOG(ERR, EAL, "vfio received invalid message!\n");
 		return -1;
 	}
 
-	get_socket_path(addr.sun_path, sizeof(addr.sun_path));
-	addr.sun_family = AF_UNIX;
-
-	sockaddr_len = sizeof(struct sockaddr_un);
-
-	unlink(addr.sun_path);
-
-	ret = bind(socket_fd, (struct sockaddr *) &addr, sockaddr_len);
-	if (ret) {
-		RTE_LOG(ERR, EAL, "Failed to bind socket: %s!\n", strerror(errno));
-		close(socket_fd);
-		return -1;
+	if (num == 1) {
+		reply.num_fds = 1;
+		reply.fds[0] = fd;
 	}
+	strcpy(reply.name, "vfio");
+	reply.len_param = sizeof(*r);
 
-	ret = listen(socket_fd, 50);
-	if (ret) {
-		RTE_LOG(ERR, EAL, "Failed to listen: %s!\n", strerror(errno));
-		close(socket_fd);
-		return -1;
-	}
-
-	/* save the socket in local configuration */
-	mp_socket_fd = socket_fd;
-
-	return 0;
+	ret = rte_eal_mp_reply(&reply, peer);
+	if (m->req == SOCKET_REQ_CONTAINER && num == 1)
+		close(fd);
+	return ret;
 }
 
-/*
- * set up a local socket and tell it to listen for incoming connections
- */
 int
 vfio_mp_sync_setup(void)
 {
-	int ret;
-	char thread_name[RTE_MAX_THREAD_NAME_LEN];
-
-	if (vfio_mp_sync_socket_setup() < 0) {
-		RTE_LOG(ERR, EAL, "Failed to set up local socket!\n");
-		return -1;
-	}
-
-	ret = pthread_create(&socket_thread, NULL,
-			vfio_mp_sync_thread, NULL);
-	if (ret) {
-		RTE_LOG(ERR, EAL,
-			"Failed to create thread for communication with secondary processes!\n");
-		close(mp_socket_fd);
-		return -1;
-	}
-
-	/* Set thread_name for aid in debugging. */
-	snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "vfio-sync");
-	ret = rte_thread_setname(socket_thread, thread_name);
-	if (ret)
-		RTE_LOG(DEBUG, EAL,
-			"Failed to set thread name for secondary processes!\n");
+	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+		return rte_eal_mp_action_register("vfio", vfio_mp_primary);
 
 	return 0;
 }
-
 #endif
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 88+ messages in thread

* Re: [PATCH v3 1/3] eal: add channel for multi-process communication
  2018-01-25  4:16   ` [PATCH v3 1/3] eal: add " Jianfeng Tan
@ 2018-01-25 10:41     ` Thomas Monjalon
  2018-01-25 11:27     ` Burakov, Anatoly
  2018-01-25 12:21     ` Ananyev, Konstantin
  2 siblings, 0 replies; 88+ messages in thread
From: Thomas Monjalon @ 2018-01-25 10:41 UTC (permalink / raw)
  To: Jianfeng Tan; +Cc: dev, anatoly.burakov, bruce.richardson, konstantin.ananyev

25/01/2018 05:16, Jianfeng Tan:
> --- a/lib/librte_eal/rte_eal_version.map
> +++ b/lib/librte_eal/rte_eal_version.map
> +	rte_eal_mp_action_register;
> +	rte_eal_mp_action_unregister;
> +	rte_eal_mp_sendmsg;

Just a naming comment:
I think you can drop "eal" in function names.
"rte_mp_" is a good prefix for multi-process management.

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [PATCH v3 3/3] vfio: use the generic multi-process channel
  2018-01-25  4:16   ` [PATCH v3 3/3] vfio: use the generic multi-process channel Jianfeng Tan
@ 2018-01-25 10:47     ` Thomas Monjalon
  2018-01-25 10:52       ` Burakov, Anatoly
  0 siblings, 1 reply; 88+ messages in thread
From: Thomas Monjalon @ 2018-01-25 10:47 UTC (permalink / raw)
  To: Jianfeng Tan; +Cc: dev, anatoly.burakov, bruce.richardson, konstantin.ananyev

25/01/2018 05:16, Jianfeng Tan:
> Previously, vfio uses its own private channel for the secondary
> process to get container fd and group fd from the primary process.
> 
> This patch changes to use the generic mp channel.

There was a private request to get it in 18.02-rc2.

I have 3 concerns:
1/ It is late
2/ It is not yet reviewed by Anatoly and Konstantin
3/ We try to not rework the existing code in RC2,
because it would totally invalidate the validation work
done for RC1.

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [PATCH v3 3/3] vfio: use the generic multi-process channel
  2018-01-25 10:47     ` Thomas Monjalon
@ 2018-01-25 10:52       ` Burakov, Anatoly
  2018-01-25 10:57         ` Thomas Monjalon
  0 siblings, 1 reply; 88+ messages in thread
From: Burakov, Anatoly @ 2018-01-25 10:52 UTC (permalink / raw)
  To: Thomas Monjalon, Jianfeng Tan; +Cc: dev, bruce.richardson, konstantin.ananyev

On 25-Jan-18 10:47 AM, Thomas Monjalon wrote:
> 25/01/2018 05:16, Jianfeng Tan:
>> Previously, vfio uses its own private channel for the secondary
>> process to get container fd and group fd from the primary process.
>>
>> This patch changes to use the generic mp channel.
> 
> There was a private request to get it in 18.02-rc2.
> 
> I have 3 concerns:
> 1/ It is late
> 2/ It is not yet reviewed by Anatoly and Konstantin
> 3/ We try to not rework the existing code in RC2,
> because it would totally invalidate the validation work
> done for RC1.
> 
> 

Hi Thomas,

We can postpone the VFIO patch until 18.05, and integrate only the first 
two patches. First two patches do not change anything in DPDK, so 
validation impact should be non-existent.

-- 
Thanks,
Anatoly

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [PATCH v3 3/3] vfio: use the generic multi-process channel
  2018-01-25 10:52       ` Burakov, Anatoly
@ 2018-01-25 10:57         ` Thomas Monjalon
  2018-01-25 12:15           ` Burakov, Anatoly
  0 siblings, 1 reply; 88+ messages in thread
From: Thomas Monjalon @ 2018-01-25 10:57 UTC (permalink / raw)
  To: Burakov, Anatoly; +Cc: Jianfeng Tan, dev, bruce.richardson, konstantin.ananyev

25/01/2018 11:52, Burakov, Anatoly:
> On 25-Jan-18 10:47 AM, Thomas Monjalon wrote:
> > 25/01/2018 05:16, Jianfeng Tan:
> >> Previously, vfio uses its own private channel for the secondary
> >> process to get container fd and group fd from the primary process.
> >>
> >> This patch changes to use the generic mp channel.
> > 
> > There was a private request to get it in 18.02-rc2.
> > 
> > I have 3 concerns:
> > 1/ It is late
> > 2/ It is not yet reviewed by Anatoly and Konstantin
> > 3/ We try to not rework the existing code in RC2,
> > because it would totally invalidate the validation work
> > done for RC1.
> > 
> > 
> 
> Hi Thomas,
> 
> We can postpone the VFIO patch until 18.05, and integrate only the first 
> two patches. First two patches do not change anything in DPDK, so 
> validation impact should be non-existent.

Yes, possible if it is well reviewed and all comments addressed.

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [PATCH v3 1/3] eal: add channel for multi-process communication
  2018-01-25  4:16   ` [PATCH v3 1/3] eal: add " Jianfeng Tan
  2018-01-25 10:41     ` Thomas Monjalon
@ 2018-01-25 11:27     ` Burakov, Anatoly
  2018-01-25 11:34       ` Thomas Monjalon
  2018-01-25 12:21     ` Ananyev, Konstantin
  2 siblings, 1 reply; 88+ messages in thread
From: Burakov, Anatoly @ 2018-01-25 11:27 UTC (permalink / raw)
  To: Jianfeng Tan, dev; +Cc: bruce.richardson, konstantin.ananyev, thomas

Overall on this patch:

Reviewed-by: Anatoly Burakov <anatoly.burakov@intel.com>

There are a few nitpicks below in comments.

Also, as a general note, i would prefer for sendmsg API's to return 0 on 
success and -1 on failure, as number of sent messages is not only 
meaningless to the user (since there's no way to tell if the value 
returned is the value we expected), but also makes the API unintuitive 
and prone to usage errors when using common "if (sendmsg()) {// error}" 
idiom. However, i'm fine with leaving it as is, if everyone else is. 
It's an experimental API, so we can change it later if need be.

On 25-Jan-18 4:16 AM, Jianfeng Tan wrote:
> Previously, there were three channels for multi-process
> (i.e., primary/secondary) communication.
>    1. Config-file based channel, in which, the primary process writes
>       info into a pre-defined config file, and the secondary process
>       reads the info out.
>    2. vfio submodule has its own channel based on unix socket for the
>       secondary process to get container fd and group fd from the
>       primary process.
>    3. pdump submodule also has its own channel based on unix socket for
>       packet dump.
> 
> It'd be good to have a generic communication channel for multi-process
> communication to accommodate the requirements including:
>    a. Secondary wants to send info to primary, for example, secondary
>       would like to send request (about some specific vdev to primary).
>    b. Sending info at any time, instead of just initialization time.
>    c. Share FDs with the other side, for vdev like vhost, related FDs
>       (memory region, kick) should be shared.
>    d. A send message request needs the other side to respond immediately.
> 
> This patch proposes to create a communication channel, based on datagram
> unix socket, for above requirements. Each process will block on a unix
> socket waiting for messages from the peers.
> 
> Three new APIs are added:
> 
>    1. rte_eal_mp_action_register() is used to register an action,
>       indexed by a string, when a component at the receiver side would
>       like to respond to messages from the peer process.
>    2. rte_eal_mp_action_unregister() is used to unregister the action
>       if the calling component does not want to respond to the messages.
>    3. rte_eal_mp_sendmsg() is used to send a message, and returns
>       immediately. If there are n secondary processes, the primary
>       process will send n messages.
> 
> Suggested-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
> Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
> ---
>   lib/librte_eal/common/eal_common_proc.c | 390 +++++++++++++++++++++++++++++++-
>   lib/librte_eal/common/eal_filesystem.h  |  17 ++
>   lib/librte_eal/common/eal_private.h     |  10 +
>   lib/librte_eal/common/include/rte_eal.h |  75 ++++++
>   lib/librte_eal/linuxapp/eal/eal.c       |   8 +
>   lib/librte_eal/rte_eal_version.map      |   3 +
>   6 files changed, 502 insertions(+), 1 deletion(-)
> 
> diff --git a/lib/librte_eal/common/eal_common_proc.c b/lib/librte_eal/common/eal_common_proc.c
> index 40fa982..baeb7d1 100644
> --- a/lib/librte_eal/common/eal_common_proc.c
> +++ b/lib/librte_eal/common/eal_common_proc.c
> @@ -2,14 +2,48 @@
>    * Copyright(c) 2016 Intel Corporation

Nitpicking - substantial changes to this file should probably come with 
an updated copyright year (2016-2018?).

>    */
>   
> -#include <stdio.h>
> +#include <dirent.h>
> +#include <errno.h>
>   #include <fcntl.h>
> +#include <fnmatch.h>
> +#include <libgen.h>
> +#include <limits.h>
> +#include <pthread.h>
> +#include <stdio.h>
>   #include <stdlib.h>
> +#include <string.h>
> +#include <sys/types.h>
> +#include <sys/socket.h>
> +#include <sys/un.h>
> +#include <unistd.h>
> +

<snip>

> +int
> +rte_eal_mp_channel_init(void)
> +{
> +	char thread_name[RTE_MAX_THREAD_NAME_LEN];
> +	char *path;
> +	pthread_t tid;
> +
> +	snprintf(mp_filter, PATH_MAX, ".%s_unix_*",
> +		 internal_config.hugefile_prefix);
> +
> +	path = strdup(eal_mp_socket_path());
> +	snprintf(mp_dir_path, PATH_MAX, "%s", dirname(path));
> +	free(path);
> +
> +	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
> +		unlink_sockets();
> +
> +	if (open_socket_fd() < 0)
> +		return -1;
> +
> +	snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "rte_mp_handle");
> +
> +	if (pthread_create(&tid, NULL, mp_handle, NULL) == 0) {
> +		/* try best to set thread name */
> +		rte_thread_setname(tid, thread_name);
> +		return 0;
> +	}
> +
> +	RTE_LOG(ERR, EAL, "failed to create mp thead: %s\n", strerror(errno));
> +	close(mp_fd);
> +	mp_fd = -1;
> +	return -1;

Nitpicking: looks weird, usually early exit is for failures, not 
success. Maybe move the error part under (pthread_create() != 0).

> +}
> +
> +static int
> +send_msg(const char *dst_path, struct rte_mp_msg *msg)
> +{
> +	int snd;
> +	struct iovec iov;
> +	struct msghdr msgh;
> +	struct cmsghdr *cmsg;
> +	struct sockaddr_un dst;
> +	int fd_size = msg->num_fds * sizeof(int);
> +	char control[CMSG_SPACE(fd_size)];
> +
> +	memset(&dst, 0, sizeof(dst));

-- 
Thanks,
Anatoly

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [PATCH v3 1/3] eal: add channel for multi-process communication
  2018-01-25 11:27     ` Burakov, Anatoly
@ 2018-01-25 11:34       ` Thomas Monjalon
  0 siblings, 0 replies; 88+ messages in thread
From: Thomas Monjalon @ 2018-01-25 11:34 UTC (permalink / raw)
  To: Burakov, Anatoly; +Cc: Jianfeng Tan, dev, bruce.richardson, konstantin.ananyev

25/01/2018 12:27, Burakov, Anatoly:
> Also, as a general note, i would prefer for sendmsg API's to return 0 on 
> success and -1 on failure, as number of sent messages is not only 
> meaningless to the user (since there's no way to tell if the value 
> returned is the value we expected), but also makes the API unintuitive 
> and prone to usage errors when using common "if (sendmsg()) {// error}" 
> idiom. However, i'm fine with leaving it as is, if everyone else is. 

I have not reviewed it, but I feel you are right.

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [PATCH v3 2/3] eal: add synchronous multi-process communication
  2018-01-25  4:16   ` [PATCH v3 2/3] eal: add synchronous " Jianfeng Tan
@ 2018-01-25 12:00     ` Burakov, Anatoly
  2018-01-25 12:19       ` Burakov, Anatoly
  2018-01-25 12:19       ` Ananyev, Konstantin
  2018-01-25 12:22     ` Ananyev, Konstantin
  1 sibling, 2 replies; 88+ messages in thread
From: Burakov, Anatoly @ 2018-01-25 12:00 UTC (permalink / raw)
  To: Jianfeng Tan, dev; +Cc: bruce.richardson, konstantin.ananyev, thomas

On the overall patch,

Reviewed-by: Anatoly Burakov <anatoly.burakov@intel.com>

For request(), returning the number of replies received actually makes 
sense, because we can then use the value to read our replies, if we were 
a primary process sending messages to secondary processes.

Few comments below.

On 25-Jan-18 4:16 AM, Jianfeng Tan wrote:
> We need the synchronous way for multi-process communication,
> i.e., blockingly waiting for reply message when we send a request
> to the peer process.
> 
> We add two APIs rte_eal_mp_request() and rte_eal_mp_reply() for
> such use case. By invoking rte_eal_mp_request(), a request message
> is sent out, and then it waits there for a reply message. The caller
> can specify the timeout. And the response messages will be collected
> and returned so that the caller can decide how to translate them.
> 
> The API rte_eal_mp_reply() is always called by an mp action handler.
> Here we add another parameter for rte_eal_mp_t so that the action
> handler knows which peer address to reply.
> 
>         sender-process                receiver-process
>     ----------------------            ----------------
> 
>      thread-n
>       |_rte_eal_mp_request() ----------> mp-thread
>          |_timedwait()                    |_process_msg()
>                                             |_action()
>                                                 |_rte_eal_mp_reply()
> 	        mp_thread  <---------------------|
>                    |_process_msg()
>                       |_signal(send_thread)
>      thread-m <----------|
>       |_collect-reply
> 
>   * A secondary process is only allowed to talk to the primary process.
>   * If there are multiple secondary processes for the primary proces,
>     it will send request to peer1, collect response from peer1; then
>     send request to peer2, collect reponse from peer2, and so on.
>   * When thread-n is sending request, thread-m of that process can send
>     request at the same time.
>   * For pair <action_name, peer>, we guarantee that only one such request
>     is on the fly.
> 
> Suggested-by: Anatoly Burakov <anatoly.burakov@intel.com>
> Suggested-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
> Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
> ---
>   doc/guides/rel_notes/release_18_02.rst  |  15 ++
>   lib/librte_eal/common/eal_common_proc.c | 237 +++++++++++++++++++++++++++++---
>   lib/librte_eal/common/include/rte_eal.h |  58 +++++++-
>   lib/librte_eal/rte_eal_version.map      |   3 +
>   4 files changed, 295 insertions(+), 18 deletions(-)
> 
> diff --git a/doc/guides/rel_notes/release_18_02.rst b/doc/guides/rel_notes/release_18_02.rst
> index 00b3224..f6ed666 100644
> --- a/doc/guides/rel_notes/release_18_02.rst
> +++ b/doc/guides/rel_notes/release_18_02.rst
> @@ -151,6 +151,21 @@ New Features
>     renamed the application from SW PMD specific ``eventdev_pipeline_sw_pmd``
>     to PMD agnostic ``eventdev_pipeline``.
>   
> +* **Added new multi-process communication channel**
> +
> +  Added a generic channel in EAL for multi-process (primary/secondary) synchronous
> +  and asynchronous communication. Each component who wants to reponse a message
> +  shall register the action; and each process has a thread to receive the message
> +  and invokes the registered action. The list of new APIs:
> +
> +  * ``rte_eal_mp_register``
> +  * ``rte_eal_mp_unregister``
> +  * ``rte_eal_mp_sendmsg``
> +  * ``rte_eal_mp_request``
> +  * ``rte_eal_mp_reply``
> +
> +  Note as we changed to use the new channel for communication, applications cannot
> +  talk with old version through the old (private) communication channel.

Some of this should've probably been added into previous patch.

>   
>   API Changes
>   -----------
> diff --git a/lib/librte_eal/common/eal_common_proc.c b/lib/librte_eal/common/eal_common_proc.c
> index baeb7d1..69df943 100644
> --- a/lib/librte_eal/common/eal_common_proc.c
> +++ b/lib/librte_eal/common/eal_common_proc.c
> @@ -12,6 +12,7 @@
>   #include <stdio.h>
>   #include <stdlib.h>
>   #include <string.h>
> +#include <sys/time.h>
>   #include <sys/types.h>
>   #include <sys/socket.h>
>   #include <sys/un.h>
> @@ -44,6 +45,50 @@ TAILQ_HEAD(action_entry_list, action_entry);
>   static struct action_entry_list action_entry_list =
>   	TAILQ_HEAD_INITIALIZER(action_entry_list);
>   

<snip>

> +		return 0;
> +	}
> +
> +	if (send_msg(dst, req, MP_REQ) != 1) {
> +		RTE_LOG(ERR, EAL, "Fail to send request %s:%s\n",
> +			dst, req->name);
> +		return 0;
> +	}
> +
> +	pthread_mutex_lock(&sync_requests.lock);
> +	do {
> +		pthread_cond_timedwait(&sync_req.cond, &sync_requests.lock, ts);
> +		/* Check spurious wakeups */
> +		if (sync_req.reply_received == 1)
> +			break;
> +		/* Check if time is out */
> +		if (gettimeofday(&now, NULL) < 0)
> +			break;
> +		if (now.tv_sec < ts->tv_sec)
> +			break;
> +		else if (now.tv_sec == ts->tv_sec &&
> +			 now.tv_usec * 1000 < ts->tv_nsec)
> +			break;
> +	} while (1);
> +	/* We got the lock now */
> +	TAILQ_REMOVE(&sync_requests.requests, &sync_req, next);
> +	pthread_mutex_unlock(&sync_requests.lock);
> +
> +	if (sync_req.reply_received == 0) {
> +		RTE_LOG(ERR, EAL, "Fail to recv reply for request %s:%s\n",
> +			dst, req->name);
> +		return 1;

Why are we returning 1 here? There was no reply, so no reply structure 
was allocated. This looks like a potential buffer overflow on trying to 
read replies if one of them wasn't delivered.

> +	}
> +
> +	tmp = realloc(reply->msgs, sizeof(msg) * (reply->nb_msgs + 1));
> +	if (!tmp) {
> +		RTE_LOG(ERR, EAL, "Fail to alloc reply for request %s:%s\n",
> +			dst, req->name);
> +		return 1;
> +	}

Same here - we couldn't allocate a reply, so it won't get to the user. 
Why return 1 here?

> +	memcpy(&tmp[reply->nb_msgs], &msg, sizeof(msg));
> +	reply->msgs = tmp;
> +	reply->nb_msgs++;
> +	return 1;
> +}
> +
> +int
> +rte_eal_mp_request(struct rte_mp_msg *req,
> +		   struct rte_mp_reply *reply,
> +		   const struct timespec *ts)
> +{
> +	DIR *mp_dir;
> +	struct dirent *ent;
> +	int nb_snds = 0;
> +	struct timeval now;
> +	struct timespec end;
> +

<snip>

>   /**
>    * @warning
> @@ -262,6 +268,56 @@ void rte_eal_mp_action_unregister(const char *name);
>   int rte_eal_mp_sendmsg(struct rte_mp_msg *msg);
>   
>   /**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * Send a request to the peer process and expect a reply.
> + *
> + * This function sends a request message to the peer process, and will
> + * block until receiving reply message from the peer process.
> + *
> + * @note The caller is responsible to free reply->replies.
> + *
> + * @param req
> + *   The req argument contains the customized request message.
> + *
> + * @param reply
> + *   The reply argument will be for storing all the replied messages;
> + *   the caller is responsible for free reply->replies.
> + *
> + * @param ts
> + *   The ts argument specifies how long we can wait for the peer(s) to reply.
> + *
> + * @return
> + *  - (<0) on invalid parameters;
> + *  - (>=0) as the number of messages being sent successfully.
> + */
> +int rte_eal_mp_request(struct rte_mp_msg *req,
> +			struct rte_mp_reply *reply, const struct timespec *ts);

See above: it would be much more useful to return the number of replies 
received, rather than the number of messages sent, as that's the number we 
are most interested in. Otherwise, if we e.g. sent 5 messages but 
received 1 reply, you're essentially not telling the user how far they can 
index the reply pointer.

> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * Send a reply to the peer process.
> + *
> + * This function will send a reply message in response to a request message
> + * received previously.
> + *
> + * @param msg
> + *   The msg argument contains the customized message.
> + *
> + * @param peer
> + *   The peer argument is the pointer to the peer socket path.
> + *
> + * @return
> + *  - (1) on success;
> + *  - (0) on failure;
> + *  - (<0) on invalid parameters.
> + */
> +int rte_eal_mp_reply(struct rte_mp_msg *msg, const char *peer);

I don't think there's much point in making distinction between invalid 
parameters and failure.

> +
> +/**
>    * Usage function typedef used by the application usage function.
>    *
>    * Use this function typedef to define and call rte_set_application_usage_hook()
> diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
> index adeadfb..3015bc6 100644
> --- a/lib/librte_eal/rte_eal_version.map
> +++ b/lib/librte_eal/rte_eal_version.map
> @@ -220,6 +220,9 @@ EXPERIMENTAL {
>   	rte_eal_mp_action_register;
>   	rte_eal_mp_action_unregister;
>   	rte_eal_mp_sendmsg;
> +	rte_eal_mp_request;
> +	rte_eal_mp_reply;
> +	rte_eal_mp_sendmsg;

You're adding rte_eal_mp_sendmsg twice.

>   	rte_service_attr_get;
>   	rte_service_attr_reset_all;
>   	rte_service_component_register;
> 


-- 
Thanks,
Anatoly

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [PATCH v3 3/3] vfio: use the generic multi-process channel
  2018-01-25 10:57         ` Thomas Monjalon
@ 2018-01-25 12:15           ` Burakov, Anatoly
  0 siblings, 0 replies; 88+ messages in thread
From: Burakov, Anatoly @ 2018-01-25 12:15 UTC (permalink / raw)
  To: Thomas Monjalon; +Cc: Jianfeng Tan, dev, bruce.richardson, konstantin.ananyev

On 25-Jan-18 10:57 AM, Thomas Monjalon wrote:
> 25/01/2018 11:52, Burakov, Anatoly:
>> On 25-Jan-18 10:47 AM, Thomas Monjalon wrote:
>>> 25/01/2018 05:16, Jianfeng Tan:
>>>> Previously, vfio uses its own private channel for the secondary
>>>> process to get container fd and group fd from the primary process.
>>>>
>>>> This patch changes to use the generic mp channel.
>>>
>>> There was a private request to get it in 18.02-rc2.
>>>
>>> I have 3 concerns:
>>> 1/ It is late
>>> 2/ It is not yet reviewed by Anatoly and Konstantin
>>> 3/ We try to not rework the existing code in RC2,
>>> because it would totally invalidate the validation work
>>> done for RC1.
>>>
>>>
>>
>> Hi Thomas,
>>
>> We can postpone the VFIO patch until 18.05, and integrate only the first
>> two patches. First two patches do not change anything in DPDK, so
>> validation impact should be non-existent.
> 
> Yes, possible if it is well reviewed and all comments addressed.
> 

OK then. Jianfeng, let's drop the VFIO patch for now, and postpone it 
for 18.05?

-- 
Thanks,
Anatoly

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [PATCH v3 2/3] eal: add synchronous multi-process communication
  2018-01-25 12:00     ` Burakov, Anatoly
@ 2018-01-25 12:19       ` Burakov, Anatoly
  2018-01-25 12:19       ` Ananyev, Konstantin
  1 sibling, 0 replies; 88+ messages in thread
From: Burakov, Anatoly @ 2018-01-25 12:19 UTC (permalink / raw)
  To: dev

> 
> See above: it would be much more useful to return number of replies 
> received, rather than number of messages sent, as that's the number we 
> are most interested in. Otherwise, if we e.g. sent 5 messages but 
> received 1 reply, you're essentially not telling the user how far can he 
> index the reply pointer.

Apologies, just noticed that rte_mp_reply has nb_msgs in it. So if 
we are getting the number of replies along with the reply, this API should 
also switch to 0/-1 on success/failure respectively, as the number of sent 
messages also becomes meaningless to the user.



-- 
Thanks,
Anatoly

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [PATCH v3 2/3] eal: add synchronous multi-process communication
  2018-01-25 12:00     ` Burakov, Anatoly
  2018-01-25 12:19       ` Burakov, Anatoly
@ 2018-01-25 12:19       ` Ananyev, Konstantin
  2018-01-25 12:25         ` Burakov, Anatoly
  1 sibling, 1 reply; 88+ messages in thread
From: Ananyev, Konstantin @ 2018-01-25 12:19 UTC (permalink / raw)
  To: Burakov, Anatoly, Tan, Jianfeng, dev; +Cc: Richardson, Bruce, thomas



> -----Original Message-----
> From: Burakov, Anatoly
> Sent: Thursday, January 25, 2018 12:00 PM
> To: Tan, Jianfeng <jianfeng.tan@intel.com>; dev@dpdk.org
> Cc: Richardson, Bruce <bruce.richardson@intel.com>; Ananyev, Konstantin <konstantin.ananyev@intel.com>; thomas@monjalon.net
> Subject: Re: [PATCH v3 2/3] eal: add synchronous multi-process communication
> 
> On the overall patch,
> 
> Reviewed-by: Anatoly Burakov <anatoly.burakov@intel.com>
> 
> For request(), returning number of replies received actually makes
> sense, because now we get use the value to read our replies, if we were
> a primary process sending messages to secondary processes.

Yes, I also think it is good to return the number of sends.
Then the caller can compare the number of sent requests with the number of
received replies and decide whether it should be considered a failure or not.

> > +		return 0;
> > +	}
> > +
> > +	if (send_msg(dst, req, MP_REQ) != 1) {
> > +		RTE_LOG(ERR, EAL, "Fail to send request %s:%s\n",
> > +			dst, req->name);
> > +		return 0;
> > +	}
> > +
> > +	pthread_mutex_lock(&sync_requests.lock);
> > +	do {
> > +		pthread_cond_timedwait(&sync_req.cond, &sync_requests.lock, ts);
> > +		/* Check spurious wakeups */
> > +		if (sync_req.reply_received == 1)
> > +			break;
> > +		/* Check if time is out */
> > +		if (gettimeofday(&now, NULL) < 0)
> > +			break;
> > +		if (now.tv_sec < ts->tv_sec)
> > +			break;
> > +		else if (now.tv_sec == ts->tv_sec &&
> > +			 now.tv_usec * 1000 < ts->tv_nsec)
> > +			break;
> > +	} while (1);
> > +	/* We got the lock now */
> > +	TAILQ_REMOVE(&sync_requests.requests, &sync_req, next);
> > +	pthread_mutex_unlock(&sync_requests.lock);
> > +
> > +	if (sync_req.reply_received == 0) {
> > +		RTE_LOG(ERR, EAL, "Fail to recv reply for request %s:%s\n",
> > +			dst, req->name);
> > +		return 1;
> 
> Why are we returning 1 here? There was no reply, so no reply structure
> was allocated. This looks like a potential buffer overflow on trying to
> read replies if one of them wasn't delivered.


As I understand it - because we receive the number of sent requests.
The number of received replies will be available in reply->nb_msgs.
Same below.
Konstantin

> 
> > +	}
> > +
> > +	tmp = realloc(reply->msgs, sizeof(msg) * (reply->nb_msgs + 1));
> > +	if (!tmp) {
> > +		RTE_LOG(ERR, EAL, "Fail to alloc reply for request %s:%s\n",
> > +			dst, req->name);
> > +		return 1;
> > +	}
> 
> Same here - we couldn't allocate a reply, so it won't get to the user.
> Why return 1 here?
> 
> > +	memcpy(&tmp[reply->nb_msgs], &msg, sizeof(msg));
> > +	reply->msgs = tmp;
> > +	reply->nb_msgs++;
> > +	return 1;
> > +}
> > +
> > +int
> > +rte_eal_mp_request(struct rte_mp_msg *req,
> > +		   struct rte_mp_reply *reply,
> > +		   const struct timespec *ts)
> > +{
> > +	DIR *mp_dir;
> > +	struct dirent *ent;
> > +	int nb_snds = 0;
> > +	struct timeval now;
> > +	struct timespec end;
> > +
> 
> <snip>
> 
> >   /**
> >    * @warning
> > @@ -262,6 +268,56 @@ void rte_eal_mp_action_unregister(const char *name);
> >   int rte_eal_mp_sendmsg(struct rte_mp_msg *msg);
> >
> >   /**
> > + * @warning
> > + * @b EXPERIMENTAL: this API may change without prior notice
> > + *
> > + * Send a request to the peer process and expect a reply.
> > + *
> > + * This function sends a request message to the peer process, and will
> > + * block until receiving reply message from the peer process.
> > + *
> > + * @note The caller is responsible to free reply->replies.
> > + *
> > + * @param req
> > + *   The req argument contains the customized request message.
> > + *
> > + * @param reply
> > + *   The reply argument will be for storing all the replied messages;
> > + *   the caller is responsible for free reply->replies.
> > + *
> > + * @param ts
> > + *   The ts argument specifies how long we can wait for the peer(s) to reply.
> > + *
> > + * @return
> > + *  - (<0) on invalid parameters;
> > + *  - (>=0) as the number of messages being sent successfully.
> > + */
> > +int rte_eal_mp_request(struct rte_mp_msg *req,
> > +			struct rte_mp_reply *reply, const struct timespec *ts);
> 
> See above: it would be much more useful to return number of replies
> received, rather than number of messages sent, as that's the number we
> are most interested in. Otherwise, if we e.g. sent 5 messages but
> received 1 reply, you're essentially not telling the user how far can he
> index the reply pointer.
> 
> > +
> > +/**
> > + * @warning
> > + * @b EXPERIMENTAL: this API may change without prior notice
> > + *
> > + * Send a reply to the peer process.
> > + *
> > + * This function will send a reply message in response to a request message
> > + * received previously.
> > + *
> > + * @param msg
> > + *   The msg argument contains the customized message.
> > + *
> > + * @param peer
> > + *   The peer argument is the pointer to the peer socket path.
> > + *
> > + * @return
> > + *  - (1) on success;
> > + *  - (0) on failure;
> > + *  - (<0) on invalid parameters.
> > + */
> > +int rte_eal_mp_reply(struct rte_mp_msg *msg, const char *peer);
> 
> I don't think there's much point in making distinction between invalid
> parameters and failure.
> 
> > +
> > +/**
> >    * Usage function typedef used by the application usage function.
> >    *
> >    * Use this function typedef to define and call rte_set_application_usage_hook()
> > diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
> > index adeadfb..3015bc6 100644
> > --- a/lib/librte_eal/rte_eal_version.map
> > +++ b/lib/librte_eal/rte_eal_version.map
> > @@ -220,6 +220,9 @@ EXPERIMENTAL {
> >   	rte_eal_mp_action_register;
> >   	rte_eal_mp_action_unregister;
> >   	rte_eal_mp_sendmsg;
> > +	rte_eal_mp_request;
> > +	rte_eal_mp_reply;
> > +	rte_eal_mp_sendmsg;
> 
> You're adding rte_eal_mp_sendmsg twice.
> 
> >   	rte_service_attr_get;
> >   	rte_service_attr_reset_all;
> >   	rte_service_component_register;
> >
> 
> 
> --
> Thanks,
> Anatoly

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [PATCH v3 1/3] eal: add channel for multi-process communication
  2018-01-25  4:16   ` [PATCH v3 1/3] eal: add " Jianfeng Tan
  2018-01-25 10:41     ` Thomas Monjalon
  2018-01-25 11:27     ` Burakov, Anatoly
@ 2018-01-25 12:21     ` Ananyev, Konstantin
  2 siblings, 0 replies; 88+ messages in thread
From: Ananyev, Konstantin @ 2018-01-25 12:21 UTC (permalink / raw)
  To: Tan, Jianfeng, dev; +Cc: Burakov, Anatoly, Richardson, Bruce, thomas


> 
> Previouly, there are three channels for multi-process
> (i.e., primary/secondary) communication.
>   1. Config-file based channel, in which, the primary process writes
>      info into a pre-defined config file, and the secondary process
>      reads the info out.
>   2. vfio submodule has its own channel based on unix socket for the
>      secondary process to get container fd and group fd from the
>      primary process.
>   3. pdump submodule also has its own channel based on unix socket for
>      packet dump.
> 
> It'd be good to have a generic communication channel for multi-process
> communication to accomodate the requirements including:
>   a. Secondary wants to send info to primary, for example, secondary
>      would like to send request (about some specific vdev to primary).
>   b. Sending info at any time, instead of just initialization time.
>   c. Share FDs with the other side, for vdev like vhost, related FDs
>      (memory region, kick) should be shared.
>   d. A send message request needs the other side to response immediately.
> 
> This patch proposes to create a communication channel, based on datagram
> unix socket, for above requirements. Each process will block on a unix
> socket waiting for messages from the peers.
> 
> Three new APIs are added:
> 
>   1. rte_eal_mp_action_register() is used to register an action,
>      indexed by a string, when a component at receiver side would like
>      to response the messages from the peer processe.
>   2. rte_eal_mp_action_unregister() is used to unregister the action
>      if the calling component does not want to response the messages.
>   3. rte_eal_mp_sendmsg() is used to send a message, and returns
>      immediately. If there are n secondary processes, the primary
>      process will send n messages.
> 
> Suggested-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
> Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
> ---

Acked-by: Konstantin Ananyev <konstantin.ananyev@intel.com>

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [PATCH v3 2/3] eal: add synchronous multi-process communication
  2018-01-25  4:16   ` [PATCH v3 2/3] eal: add synchronous " Jianfeng Tan
  2018-01-25 12:00     ` Burakov, Anatoly
@ 2018-01-25 12:22     ` Ananyev, Konstantin
  1 sibling, 0 replies; 88+ messages in thread
From: Ananyev, Konstantin @ 2018-01-25 12:22 UTC (permalink / raw)
  To: Tan, Jianfeng, dev; +Cc: Burakov, Anatoly, Richardson, Bruce, thomas


> We need the synchronous way for multi-process communication,
> i.e., blockingly waiting for reply message when we send a request
> to the peer process.
> 
> We add two APIs rte_eal_mp_request() and rte_eal_mp_reply() for
> such use case. By invoking rte_eal_mp_request(), a request message
> is sent out, and then it waits there for a reply message. The caller
> can specify the timeout. And the response messages will be collected
> and returned so that the caller can decide how to translate them.
> 
> The API rte_eal_mp_reply() is always called by an mp action handler.
> Here we add another parameter for rte_eal_mp_t so that the action
> handler knows which peer address to reply.
> 
>        sender-process                receiver-process
>    ----------------------            ----------------
> 
>     thread-n
>      |_rte_eal_mp_request() ----------> mp-thread
>         |_timedwait()                    |_process_msg()
>                                            |_action()
>                                                |_rte_eal_mp_reply()
> 	        mp_thread  <---------------------|
>                   |_process_msg()
>                      |_signal(send_thread)
>     thread-m <----------|
>      |_collect-reply
> 
>  * A secondary process is only allowed to talk to the primary process.
>  * If there are multiple secondary processes for the primary proces,
>    it will send request to peer1, collect response from peer1; then
>    send request to peer2, collect reponse from peer2, and so on.
>  * When thread-n is sending request, thread-m of that process can send
>    request at the same time.
>  * For pair <action_name, peer>, we guarantee that only one such request
>    is on the fly.
> 
> Suggested-by: Anatoly Burakov <anatoly.burakov@intel.com>
> Suggested-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
> Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
> ---

Acked-by: Konstantin Ananyev <konstantin.ananyev@intel.com>

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [PATCH v3 2/3] eal: add synchronous multi-process communication
  2018-01-25 12:19       ` Ananyev, Konstantin
@ 2018-01-25 12:25         ` Burakov, Anatoly
  2018-01-25 13:00           ` Ananyev, Konstantin
  0 siblings, 1 reply; 88+ messages in thread
From: Burakov, Anatoly @ 2018-01-25 12:25 UTC (permalink / raw)
  To: Ananyev, Konstantin, Tan, Jianfeng, dev; +Cc: Richardson, Bruce, thomas

On 25-Jan-18 12:19 PM, Ananyev, Konstantin wrote:
> 
> 
>> -----Original Message-----
>> From: Burakov, Anatoly
>> Sent: Thursday, January 25, 2018 12:00 PM
>> To: Tan, Jianfeng <jianfeng.tan@intel.com>; dev@dpdk.org
>> Cc: Richardson, Bruce <bruce.richardson@intel.com>; Ananyev, Konstantin <konstantin.ananyev@intel.com>; thomas@monjalon.net
>> Subject: Re: [PATCH v3 2/3] eal: add synchronous multi-process communication
>>
>> On the overall patch,
>>
>> Reviewed-by: Anatoly Burakov <anatoly.burakov@intel.com>
>>
>> For request(), returning number of replies received actually makes
>> sense, because now we get use the value to read our replies, if we were
>> a primary process sending messages to secondary processes.
> 
> Yes, I also think it is good to return number of sends.
> Then caller can compare number of sended requests with number of
> received replies and decide should it be considered a failure or no.
> 

Well, OK, that might make sense. However, I think it would be of more 
value to make the API consistent (0/-1 on success/failure) and put the 
number of sent messages into the reply, like the number of received ones. I.e. 
something like

struct reply {
    int nb_sent;
    int nb_received;
};

We do it for the latter already, so why not the former?

-- 
Thanks,
Anatoly

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [PATCH v3 2/3] eal: add synchronous multi-process communication
  2018-01-25 12:25         ` Burakov, Anatoly
@ 2018-01-25 13:00           ` Ananyev, Konstantin
  2018-01-25 13:05             ` Burakov, Anatoly
  0 siblings, 1 reply; 88+ messages in thread
From: Ananyev, Konstantin @ 2018-01-25 13:00 UTC (permalink / raw)
  To: Burakov, Anatoly, Tan, Jianfeng, dev; +Cc: Richardson, Bruce, thomas



> -----Original Message-----
> From: Burakov, Anatoly
> Sent: Thursday, January 25, 2018 12:26 PM
> To: Ananyev, Konstantin <konstantin.ananyev@intel.com>; Tan, Jianfeng <jianfeng.tan@intel.com>; dev@dpdk.org
> Cc: Richardson, Bruce <bruce.richardson@intel.com>; thomas@monjalon.net
> Subject: Re: [PATCH v3 2/3] eal: add synchronous multi-process communication
> 
> On 25-Jan-18 12:19 PM, Ananyev, Konstantin wrote:
> >
> >
> >> -----Original Message-----
> >> From: Burakov, Anatoly
> >> Sent: Thursday, January 25, 2018 12:00 PM
> >> To: Tan, Jianfeng <jianfeng.tan@intel.com>; dev@dpdk.org
> >> Cc: Richardson, Bruce <bruce.richardson@intel.com>; Ananyev, Konstantin <konstantin.ananyev@intel.com>; thomas@monjalon.net
> >> Subject: Re: [PATCH v3 2/3] eal: add synchronous multi-process communication
> >>
> >> On the overall patch,
> >>
> >> Reviewed-by: Anatoly Burakov <anatoly.burakov@intel.com>
> >>
> >> For request(), returning number of replies received actually makes
> >> sense, because now we get use the value to read our replies, if we were
> >> a primary process sending messages to secondary processes.
> >
> > Yes, I also think it is good to return number of sends.
> > Then caller can compare number of sended requests with number of
> > received replies and decide should it be considered a failure or no.
> >
> 
> Well, OK, that might make sense. However, i think it would've be of more
> value to make the API consistent (0/-1 on success/failure) and put
> number of sent messages into the reply, like number of received. I.e.
> something like
> 
> struct reply {
>     int nb_sent;
>     int nb_received;
> };
> 
> We do it for the latter already, so why not the former?

The question is what to treat as success/failure?
Let's say we sent 2 requests (of 3 possible) and got back 1 response...
Should we consider it a success or a failure?


^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [PATCH v3 2/3] eal: add synchronous multi-process communication
  2018-01-25 13:00           ` Ananyev, Konstantin
@ 2018-01-25 13:05             ` Burakov, Anatoly
  2018-01-25 13:10               ` Burakov, Anatoly
  0 siblings, 1 reply; 88+ messages in thread
From: Burakov, Anatoly @ 2018-01-25 13:05 UTC (permalink / raw)
  To: Ananyev, Konstantin, Tan, Jianfeng, dev; +Cc: Richardson, Bruce, thomas

On 25-Jan-18 1:00 PM, Ananyev, Konstantin wrote:
> 
> 
>> -----Original Message-----
>> From: Burakov, Anatoly
>> Sent: Thursday, January 25, 2018 12:26 PM
>> To: Ananyev, Konstantin <konstantin.ananyev@intel.com>; Tan, Jianfeng <jianfeng.tan@intel.com>; dev@dpdk.org
>> Cc: Richardson, Bruce <bruce.richardson@intel.com>; thomas@monjalon.net
>> Subject: Re: [PATCH v3 2/3] eal: add synchronous multi-process communication
>>
>> On 25-Jan-18 12:19 PM, Ananyev, Konstantin wrote:
>>>
>>>
>>>> -----Original Message-----
>>>> From: Burakov, Anatoly
>>>> Sent: Thursday, January 25, 2018 12:00 PM
>>>> To: Tan, Jianfeng <jianfeng.tan@intel.com>; dev@dpdk.org
>>>> Cc: Richardson, Bruce <bruce.richardson@intel.com>; Ananyev, Konstantin <konstantin.ananyev@intel.com>; thomas@monjalon.net
>>>> Subject: Re: [PATCH v3 2/3] eal: add synchronous multi-process communication
>>>>
>>>> On the overall patch,
>>>>
>>>> Reviewed-by: Anatoly Burakov <anatoly.burakov@intel.com>
>>>>
>>>> For request(), returning number of replies received actually makes
>>>> sense, because now we get use the value to read our replies, if we were
>>>> a primary process sending messages to secondary processes.
>>>
>>> Yes, I also think it is good to return number of sends.
>>> Then caller can compare number of sended requests with number of
>>> received replies and decide should it be considered a failure or no.
>>>
>>
>> Well, OK, that might make sense. However, i think it would've be of more
>> value to make the API consistent (0/-1 on success/failure) and put
>> number of sent messages into the reply, like number of received. I.e.
>> something like
>>
>> struct reply {
>>      int nb_sent;
>>      int nb_received;
>> };
>>
>> We do it for the latter already, so why not the former?
> 
> The question is what treat as success/failure?
> Let say we sent 2 requests (of 3 possible), got back 1 response...
> Should we consider it as success or failure?
> 

I think "failure" is "something went wrong", not "secondary processes 
didn't respond". For example, invalid parameters, or our socket suddenly 
being closed, or some other error that prevents us from sending requests 
to secondaries.

As far as i can tell from the code, there's no way to know if the 
secondary process is running other than by attempting to connect to it, 
and get a response. So, failed connection should not be a failure 
condition, because we can't know if we *can* connect to the process 
until we do. Process may have ended, but socket files will still be 
around, and there's nothing we can do about that. So i wouldn't consider 
inability to send a message a failure condition.

-- 
Thanks,
Anatoly

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [PATCH v3 2/3] eal: add synchronous multi-process communication
  2018-01-25 13:05             ` Burakov, Anatoly
@ 2018-01-25 13:10               ` Burakov, Anatoly
  2018-01-25 15:03                 ` Ananyev, Konstantin
  0 siblings, 1 reply; 88+ messages in thread
From: Burakov, Anatoly @ 2018-01-25 13:10 UTC (permalink / raw)
  To: Ananyev, Konstantin, Tan, Jianfeng, dev; +Cc: Richardson, Bruce, thomas

On 25-Jan-18 1:05 PM, Burakov, Anatoly wrote:
> On 25-Jan-18 1:00 PM, Ananyev, Konstantin wrote:
>>
>>
>>> -----Original Message-----
>>> From: Burakov, Anatoly
>>> Sent: Thursday, January 25, 2018 12:26 PM
>>> To: Ananyev, Konstantin <konstantin.ananyev@intel.com>; Tan, Jianfeng 
>>> <jianfeng.tan@intel.com>; dev@dpdk.org
>>> Cc: Richardson, Bruce <bruce.richardson@intel.com>; thomas@monjalon.net
>>> Subject: Re: [PATCH v3 2/3] eal: add synchronous multi-process 
>>> communication
>>>
>>> On 25-Jan-18 12:19 PM, Ananyev, Konstantin wrote:
>>>>
>>>>
>>>>> -----Original Message-----
>>>>> From: Burakov, Anatoly
>>>>> Sent: Thursday, January 25, 2018 12:00 PM
>>>>> To: Tan, Jianfeng <jianfeng.tan@intel.com>; dev@dpdk.org
>>>>> Cc: Richardson, Bruce <bruce.richardson@intel.com>; Ananyev, 
>>>>> Konstantin <konstantin.ananyev@intel.com>; thomas@monjalon.net
>>>>> Subject: Re: [PATCH v3 2/3] eal: add synchronous multi-process 
>>>>> communication
>>>>>
>>>>> On the overall patch,
>>>>>
>>>>> Reviewed-by: Anatoly Burakov <anatoly.burakov@intel.com>
>>>>>
>>>>> For request(), returning number of replies received actually makes
>>>>> sense, because now we get use the value to read our replies, if we 
>>>>> were
>>>>> a primary process sending messages to secondary processes.
>>>>
>>>> Yes, I also think it is good to return number of sends.
>>>> Then caller can compare number of sended requests with number of
>>>> received replies and decide should it be considered a failure or no.
>>>>
>>>
>>> Well, OK, that might make sense. However, i think it would've be of more
>>> value to make the API consistent (0/-1 on success/failure) and put
>>> number of sent messages into the reply, like number of received. I.e.
>>> something like
>>>
>>> struct reply {
>>>      int nb_sent;
>>>      int nb_received;
>>> };
>>>
>>> We do it for the latter already, so why not the former?
>>
>> The question is what treat as success/failure?
>> Let say we sent 2 requests (of 3 possible), got back 1 response...
>> Should we consider it as success or failure?
>>
> 
> I think "failure" is "something went wrong", not "secondary processes 
> didn't respond". For example, invalid parameters, or our socket suddenly 
> being closed, or some other error that prevents us from sending requests 
> to secondaries.
> 
> As far as i can tell from the code, there's no way to know if the 
> secondary process is running other than by attempting to connect to it, 
> and get a response. So, failed connection should not be a failure 
> condition, because we can't know if we *can* connect to the process 
> until we do. Process may have ended, but socket files will still be 
> around, and there's nothing we can do about that. So i wouldn't consider 
> inability to send a message a failure condition.
> 

Just to clarify - i'm suggesting leaving this decision up to the user. 
If a user expects there to be "n" processes running, but only "m" 
responses were received, he could treat it as error. Another user might 
simply send periodical updates/polls to secondaries, for whatever reason 
(say, stats display), and won't really care if one of them just died, so 
there's no error for that user.

However, all of this has nothing to do with API. If we're able to send 
messages - it's not a failure. If we can't - it is. That's the part API 
should be concerned about, and that's what the return value should 
indicate, IMO.

-- 
Thanks,
Anatoly

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [PATCH v3 2/3] eal: add synchronous multi-process communication
  2018-01-25 13:10               ` Burakov, Anatoly
@ 2018-01-25 15:03                 ` Ananyev, Konstantin
  2018-01-25 16:22                   ` Burakov, Anatoly
  0 siblings, 1 reply; 88+ messages in thread
From: Ananyev, Konstantin @ 2018-01-25 15:03 UTC (permalink / raw)
  To: Burakov, Anatoly, Tan, Jianfeng, dev; +Cc: Richardson, Bruce, thomas



> -----Original Message-----
> From: Burakov, Anatoly
> Sent: Thursday, January 25, 2018 1:10 PM
> To: Ananyev, Konstantin <konstantin.ananyev@intel.com>; Tan, Jianfeng <jianfeng.tan@intel.com>; dev@dpdk.org
> Cc: Richardson, Bruce <bruce.richardson@intel.com>; thomas@monjalon.net
> Subject: Re: [dpdk-dev] [PATCH v3 2/3] eal: add synchronous multi-process communication
> 
> On 25-Jan-18 1:05 PM, Burakov, Anatoly wrote:
> > On 25-Jan-18 1:00 PM, Ananyev, Konstantin wrote:
> >>
> >>
> >>> -----Original Message-----
> >>> From: Burakov, Anatoly
> >>> Sent: Thursday, January 25, 2018 12:26 PM
> >>> To: Ananyev, Konstantin <konstantin.ananyev@intel.com>; Tan, Jianfeng
> >>> <jianfeng.tan@intel.com>; dev@dpdk.org
> >>> Cc: Richardson, Bruce <bruce.richardson@intel.com>; thomas@monjalon.net
> >>> Subject: Re: [PATCH v3 2/3] eal: add synchronous multi-process
> >>> communication
> >>>
> >>> On 25-Jan-18 12:19 PM, Ananyev, Konstantin wrote:
> >>>>
> >>>>
> >>>>> -----Original Message-----
> >>>>> From: Burakov, Anatoly
> >>>>> Sent: Thursday, January 25, 2018 12:00 PM
> >>>>> To: Tan, Jianfeng <jianfeng.tan@intel.com>; dev@dpdk.org
> >>>>> Cc: Richardson, Bruce <bruce.richardson@intel.com>; Ananyev,
> >>>>> Konstantin <konstantin.ananyev@intel.com>; thomas@monjalon.net
> >>>>> Subject: Re: [PATCH v3 2/3] eal: add synchronous multi-process
> >>>>> communication
> >>>>>
> >>>>> On the overall patch,
> >>>>>
> >>>>> Reviewed-by: Anatoly Burakov <anatoly.burakov@intel.com>
> >>>>>
> >>>>> For request(), returning number of replies received actually makes
> >>>>> sense, because now we get use the value to read our replies, if we
> >>>>> were
> >>>>> a primary process sending messages to secondary processes.
> >>>>
> >>>> Yes, I also think it is good to return number of sends.
> >>>> Then caller can compare number of sended requests with number of
> >>>> received replies and decide should it be considered a failure or no.
> >>>>
> >>>
> >>> Well, OK, that might make sense. However, i think it would've be of more
> >>> value to make the API consistent (0/-1 on success/failure) and put
> >>> number of sent messages into the reply, like number of received. I.e.
> >>> something like
> >>>
> >>> struct reply {
> >>>      int nb_sent;
> >>>      int nb_received;
> >>> };
> >>>
> >>> We do it for the latter already, so why not the former?
> >>
> >> The question is what treat as success/failure?
> >> Let say we sent 2 requests (of 3 possible), got back 1 response...
> >> Should we consider it as success or failure?
> >>
> >
> > I think "failure" is "something went wrong", not "secondary processes
> > didn't respond". For example, invalid parameters, or our socket suddenly
> > being closed, or some other error that prevents us from sending requests
> > to secondaries.
> >
> > As far as i can tell from the code, there's no way to know if the
> > secondary process is running other than by attempting to connect to it,
> > and get a response. So, failed connection should not be a failure
> > condition, because we can't know if we *can* connect to the process
> > until we do. Process may have ended, but socket files will still be
> > around, and there's nothing we can do about that. So i wouldn't consider
> > inability to send a message a failure condition.
> >
> 
> Just to clarify - i'm suggesting leaving this decision up to the user.
> If a user expects there to be "n" processes running, but only "m"
> responses were received, he could treat it as error. Another user might
> simply send periodical updates/polls to secondaries, for whatever reason
> (say, stats display), and won't really care if one of them just died, so
> there's no error for that user.
> 
> However, all of this has nothing to do with API. If we're able to send
> messages - it's not a failure. If we can't - it is. That's the part API
> should be concerned about, and that's what the return value should
> indicate, IMO.

Ok so to clarify, you are suggesting: 
we have N peers - if send_msg() returns success for all N - return success
(no matter did we get a reply or not)
Otherwise return a failure.
?
Konstantin


> 
> --
> Thanks,
> Anatoly

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [PATCH v3 2/3] eal: add synchronous multi-process communication
  2018-01-25 15:03                 ` Ananyev, Konstantin
@ 2018-01-25 16:22                   ` Burakov, Anatoly
  2018-01-25 17:10                     ` Tan, Jianfeng
  0 siblings, 1 reply; 88+ messages in thread
From: Burakov, Anatoly @ 2018-01-25 16:22 UTC (permalink / raw)
  To: Ananyev, Konstantin, Tan, Jianfeng, dev; +Cc: Richardson, Bruce, thomas

On 25-Jan-18 3:03 PM, Ananyev, Konstantin wrote:
> 
> 
>> -----Original Message-----
>> From: Burakov, Anatoly
>> Sent: Thursday, January 25, 2018 1:10 PM
>> To: Ananyev, Konstantin <konstantin.ananyev@intel.com>; Tan, Jianfeng <jianfeng.tan@intel.com>; dev@dpdk.org
>> Cc: Richardson, Bruce <bruce.richardson@intel.com>; thomas@monjalon.net
>> Subject: Re: [dpdk-dev] [PATCH v3 2/3] eal: add synchronous multi-process communication
>>
>> On 25-Jan-18 1:05 PM, Burakov, Anatoly wrote:
>>> On 25-Jan-18 1:00 PM, Ananyev, Konstantin wrote:
>>>>
>>>>
>>>>> -----Original Message-----
>>>>> From: Burakov, Anatoly
>>>>> Sent: Thursday, January 25, 2018 12:26 PM
>>>>> To: Ananyev, Konstantin <konstantin.ananyev@intel.com>; Tan, Jianfeng
>>>>> <jianfeng.tan@intel.com>; dev@dpdk.org
>>>>> Cc: Richardson, Bruce <bruce.richardson@intel.com>; thomas@monjalon.net
>>>>> Subject: Re: [PATCH v3 2/3] eal: add synchronous multi-process
>>>>> communication
>>>>>
>>>>> On 25-Jan-18 12:19 PM, Ananyev, Konstantin wrote:
>>>>>>
>>>>>>
>>>>>>> -----Original Message-----
>>>>>>> From: Burakov, Anatoly
>>>>>>> Sent: Thursday, January 25, 2018 12:00 PM
>>>>>>> To: Tan, Jianfeng <jianfeng.tan@intel.com>; dev@dpdk.org
>>>>>>> Cc: Richardson, Bruce <bruce.richardson@intel.com>; Ananyev,
>>>>>>> Konstantin <konstantin.ananyev@intel.com>; thomas@monjalon.net
>>>>>>> Subject: Re: [PATCH v3 2/3] eal: add synchronous multi-process
>>>>>>> communication
>>>>>>>
>>>>>>> On the overall patch,
>>>>>>>
>>>>>>> Reviewed-by: Anatoly Burakov <anatoly.burakov@intel.com>
>>>>>>>
>>>>>>> For request(), returning number of replies received actually makes
>>>>>>> sense, because now we get use the value to read our replies, if we
>>>>>>> were
>>>>>>> a primary process sending messages to secondary processes.
>>>>>>
>>>>>> Yes, I also think it is good to return number of sends.
>>>>>> Then caller can compare number of sended requests with number of
>>>>>> received replies and decide should it be considered a failure or no.
>>>>>>
>>>>>
>>>>> Well, OK, that might make sense. However, i think it would've be of more
>>>>> value to make the API consistent (0/-1 on success/failure) and put
>>>>> number of sent messages into the reply, like number of received. I.e.
>>>>> something like
>>>>>
>>>>> struct reply {
>>>>>       int nb_sent;
>>>>>       int nb_received;
>>>>> };
>>>>>
>>>>> We do it for the latter already, so why not the former?
>>>>
>>>> The question is what treat as success/failure?
>>>> Let say we sent 2 requests (of 3 possible), got back 1 response...
>>>> Should we consider it as success or failure?
>>>>
>>>
>>> I think "failure" is "something went wrong", not "secondary processes
>>> didn't respond". For example, invalid parameters, or our socket suddenly
>>> being closed, or some other error that prevents us from sending requests
>>> to secondaries.
>>>
>>> As far as i can tell from the code, there's no way to know if the
>>> secondary process is running other than by attempting to connect to it,
>>> and get a response. So, failed connection should not be a failure
>>> condition, because we can't know if we *can* connect to the process
>>> until we do. Process may have ended, but socket files will still be
>>> around, and there's nothing we can do about that. So i wouldn't consider
>>> inability to send a message a failure condition.
>>>
>>
>> Just to clarify - i'm suggesting leaving this decision up to the user.
>> If a user expects there to be "n" processes running, but only "m"
>> responses were received, he could treat it as error. Another user might
>> simply send periodical updates/polls to secondaries, for whatever reason
>> (say, stats display), and won't really care if one of them just died, so
>> there's no error for that user.
>>
>> However, all of this has nothing to do with API. If we're able to send
>> messages - it's not a failure. If we can't - it is. That's the part API
>> should be concerned about, and that's what the return value should
>> indicate, IMO.
> 
> Ok so to clarify, you are suggesting:
> we have N peers - if send_msg() returns success for all N - return success
> (no matter did we get a reply or not)
> Otherwise return a failure.
> ?
> Konstantin

More along the lines of, return -1 if and only if something went wrong. 
That might be invalid parameters, or that might be an error with our own 
socket, or something else to that effect. In all other cases, return 0 
(that includes cases where we sent N messages but M replies where N != 
M). So, in other words, return 0 if we *could have succeeded* if nothing 
went wrong on the other side, and only return -1 if something went wrong 
on our side.

> 
> 
>>
>> --
>> Thanks,
>> Anatoly


-- 
Thanks,
Anatoly

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [PATCH v3 2/3] eal: add synchronous multi-process communication
  2018-01-25 16:22                   ` Burakov, Anatoly
@ 2018-01-25 17:10                     ` Tan, Jianfeng
  2018-01-25 18:02                       ` Burakov, Anatoly
  0 siblings, 1 reply; 88+ messages in thread
From: Tan, Jianfeng @ 2018-01-25 17:10 UTC (permalink / raw)
  To: Burakov, Anatoly, Ananyev, Konstantin, dev; +Cc: Richardson, Bruce, thomas



On 1/26/2018 12:22 AM, Burakov, Anatoly wrote:
> On 25-Jan-18 3:03 PM, Ananyev, Konstantin wrote:
>>
>>
>>> -----Original Message-----
>>> From: Burakov, Anatoly
>>> Sent: Thursday, January 25, 2018 1:10 PM
>>> To: Ananyev, Konstantin <konstantin.ananyev@intel.com>; Tan, 
>>> Jianfeng <jianfeng.tan@intel.com>; dev@dpdk.org
>>> Cc: Richardson, Bruce <bruce.richardson@intel.com>; thomas@monjalon.net
>>> Subject: Re: [dpdk-dev] [PATCH v3 2/3] eal: add synchronous 
>>> multi-process communication
>>>
>>> On 25-Jan-18 1:05 PM, Burakov, Anatoly wrote:
>>>> On 25-Jan-18 1:00 PM, Ananyev, Konstantin wrote:
>>>>>
>>>>>
>>>>>> -----Original Message-----
>>>>>> From: Burakov, Anatoly
>>>>>> Sent: Thursday, January 25, 2018 12:26 PM
>>>>>> To: Ananyev, Konstantin <konstantin.ananyev@intel.com>; Tan, 
>>>>>> Jianfeng
>>>>>> <jianfeng.tan@intel.com>; dev@dpdk.org
>>>>>> Cc: Richardson, Bruce <bruce.richardson@intel.com>; 
>>>>>> thomas@monjalon.net
>>>>>> Subject: Re: [PATCH v3 2/3] eal: add synchronous multi-process
>>>>>> communication
>>>>>>
>>>>>> On 25-Jan-18 12:19 PM, Ananyev, Konstantin wrote:
>>>>>>>
>>>>>>>
>>>>>>>> -----Original Message-----
>>>>>>>> From: Burakov, Anatoly
>>>>>>>> Sent: Thursday, January 25, 2018 12:00 PM
>>>>>>>> To: Tan, Jianfeng <jianfeng.tan@intel.com>; dev@dpdk.org
>>>>>>>> Cc: Richardson, Bruce <bruce.richardson@intel.com>; Ananyev,
>>>>>>>> Konstantin <konstantin.ananyev@intel.com>; thomas@monjalon.net
>>>>>>>> Subject: Re: [PATCH v3 2/3] eal: add synchronous multi-process
>>>>>>>> communication
>>>>>>>>
>>>>>>>> On the overall patch,
>>>>>>>>
>>>>>>>> Reviewed-by: Anatoly Burakov <anatoly.burakov@intel.com>
>>>>>>>>
>>>>>>>> For request(), returning number of replies received actually makes
>>>>>>>> sense, because now we get use the value to read our replies, if we
>>>>>>>> were
>>>>>>>> a primary process sending messages to secondary processes.
>>>>>>>
>>>>>>> Yes, I also think it is good to return number of sends.
>>>>>>> Then caller can compare number of sended requests with number of
>>>>>>> received replies and decide should it be considered a failure or 
>>>>>>> no.
>>>>>>>
>>>>>>
>>>>>> Well, OK, that might make sense. However, i think it would've be 
>>>>>> of more
>>>>>> value to make the API consistent (0/-1 on success/failure) and put
>>>>>> number of sent messages into the reply, like number of received. 
>>>>>> I.e.
>>>>>> something like
>>>>>>
>>>>>> struct reply {
>>>>>>       int nb_sent;
>>>>>>       int nb_received;
>>>>>> };
>>>>>>
>>>>>> We do it for the latter already, so why not the former?
>>>>>
>>>>> The question is what treat as success/failure?
>>>>> Let say we sent 2 requests (of 3 possible), got back 1 response...
>>>>> Should we consider it as success or failure?
>>>>>
>>>>
>>>> I think "failure" is "something went wrong", not "secondary processes
>>>> didn't respond". For example, invalid parameters, or our socket 
>>>> suddenly
>>>> being closed, or some other error that prevents us from sending 
>>>> requests
>>>> to secondaries.
>>>>
>>>> As far as i can tell from the code, there's no way to know if the
>>>> secondary process is running other than by attempting to connect to 
>>>> it,
>>>> and get a response. So, failed connection should not be a failure
>>>> condition, because we can't know if we *can* connect to the process
>>>> until we do. Process may have ended, but socket files will still be
>>>> around, and there's nothing we can do about that. So i wouldn't 
>>>> consider
>>>> inability to send a message a failure condition.
>>>>
>>>
>>> Just to clarify - i'm suggesting leaving this decision up to the user.
>>> If a user expects there to be "n" processes running, but only "m"
>>> responses were received, he could treat it as error. Another user might
>>> simply send periodical updates/polls to secondaries, for whatever 
>>> reason
>>> (say, stats display), and won't really care if one of them just 
>>> died, so
>>> there's no error for that user.
>>>
>>> However, all of this has nothing to do with API. If we're able to send
>>> messages - it's not a failure. If we can't - it is. That's the part API
>>> should be concerned about, and that's what the return value should
>>> indicate, IMO.
>>
>> Ok so to clarify, you are suggesting:
>> we have N peers - if send_msg() returns success for all N - return 
>> success
>> (no matter did we get a reply or not)
>> Otherwise return a failure.
>> ?
>> Konstantin
>
> More along the lines of, return -1 if and only if something went 
> wrong. That might be invalid parameters, or that might be an error 
> with our own socket,

To check if the error is caused by our own socket, we check the errno 
after sendmsg?

Like for remote socket errors, we check:
- ECONNRESET
- ECONNREFUSED
- ENOBUFS

Right?

Thanks,
Jianfeng


> or something else to that effect. In all other cases, return 0 (that 
> includes cases where we sent N messages but M replies where N != M). 
> So, in other words, return 0 if we *could have succeeded* if nothing 
> went wrong on the other side, and only return -1 if something went 
> wrong on our side.
>
>>
>>
>>>
>>> -- 
>>> Thanks,
>>> Anatoly
>
>

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [PATCH v3 2/3] eal: add synchronous multi-process communication
  2018-01-25 17:10                     ` Tan, Jianfeng
@ 2018-01-25 18:02                       ` Burakov, Anatoly
  0 siblings, 0 replies; 88+ messages in thread
From: Burakov, Anatoly @ 2018-01-25 18:02 UTC (permalink / raw)
  To: Tan, Jianfeng, Ananyev, Konstantin, dev; +Cc: Richardson, Bruce, thomas

On 25-Jan-18 5:10 PM, Tan, Jianfeng wrote:
> 
> 
> On 1/26/2018 12:22 AM, Burakov, Anatoly wrote:
>> On 25-Jan-18 3:03 PM, Ananyev, Konstantin wrote:
>>>
>>>
>>>> -----Original Message-----
>>>> From: Burakov, Anatoly
>>>> Sent: Thursday, January 25, 2018 1:10 PM
>>>> To: Ananyev, Konstantin <konstantin.ananyev@intel.com>; Tan, 
>>>> Jianfeng <jianfeng.tan@intel.com>; dev@dpdk.org
>>>> Cc: Richardson, Bruce <bruce.richardson@intel.com>; thomas@monjalon.net
>>>> Subject: Re: [dpdk-dev] [PATCH v3 2/3] eal: add synchronous 
>>>> multi-process communication
>>>>
>>>> On 25-Jan-18 1:05 PM, Burakov, Anatoly wrote:
>>>>> On 25-Jan-18 1:00 PM, Ananyev, Konstantin wrote:
>>>>>>
>>>>>>
>>>>>>> -----Original Message-----
>>>>>>> From: Burakov, Anatoly
>>>>>>> Sent: Thursday, January 25, 2018 12:26 PM
>>>>>>> To: Ananyev, Konstantin <konstantin.ananyev@intel.com>; Tan, 
>>>>>>> Jianfeng
>>>>>>> <jianfeng.tan@intel.com>; dev@dpdk.org
>>>>>>> Cc: Richardson, Bruce <bruce.richardson@intel.com>; 
>>>>>>> thomas@monjalon.net
>>>>>>> Subject: Re: [PATCH v3 2/3] eal: add synchronous multi-process
>>>>>>> communication
>>>>>>>
>>>>>>> On 25-Jan-18 12:19 PM, Ananyev, Konstantin wrote:
>>>>>>>>
>>>>>>>>
>>>>>>>>> -----Original Message-----
>>>>>>>>> From: Burakov, Anatoly
>>>>>>>>> Sent: Thursday, January 25, 2018 12:00 PM
>>>>>>>>> To: Tan, Jianfeng <jianfeng.tan@intel.com>; dev@dpdk.org
>>>>>>>>> Cc: Richardson, Bruce <bruce.richardson@intel.com>; Ananyev,
>>>>>>>>> Konstantin <konstantin.ananyev@intel.com>; thomas@monjalon.net
>>>>>>>>> Subject: Re: [PATCH v3 2/3] eal: add synchronous multi-process
>>>>>>>>> communication
>>>>>>>>>
>>>>>>>>> On the overall patch,
>>>>>>>>>
>>>>>>>>> Reviewed-by: Anatoly Burakov <anatoly.burakov@intel.com>
>>>>>>>>>
>>>>>>>>> For request(), returning number of replies received actually makes
>>>>>>>>> sense, because now we get use the value to read our replies, if we
>>>>>>>>> were
>>>>>>>>> a primary process sending messages to secondary processes.
>>>>>>>>
>>>>>>>> Yes, I also think it is good to return number of sends.
>>>>>>>> Then caller can compare number of sended requests with number of
>>>>>>>> received replies and decide should it be considered a failure or 
>>>>>>>> no.
>>>>>>>>
>>>>>>>
>>>>>>> Well, OK, that might make sense. However, i think it would've be 
>>>>>>> of more
>>>>>>> value to make the API consistent (0/-1 on success/failure) and put
>>>>>>> number of sent messages into the reply, like number of received. 
>>>>>>> I.e.
>>>>>>> something like
>>>>>>>
>>>>>>> struct reply {
>>>>>>>       int nb_sent;
>>>>>>>       int nb_received;
>>>>>>> };
>>>>>>>
>>>>>>> We do it for the latter already, so why not the former?
>>>>>>
>>>>>> The question is what treat as success/failure?
>>>>>> Let say we sent 2 requests (of 3 possible), got back 1 response...
>>>>>> Should we consider it as success or failure?
>>>>>>
>>>>>
>>>>> I think "failure" is "something went wrong", not "secondary processes
>>>>> didn't respond". For example, invalid parameters, or our socket 
>>>>> suddenly
>>>>> being closed, or some other error that prevents us from sending 
>>>>> requests
>>>>> to secondaries.
>>>>>
>>>>> As far as i can tell from the code, there's no way to know if the
>>>>> secondary process is running other than by attempting to connect to 
>>>>> it,
>>>>> and get a response. So, failed connection should not be a failure
>>>>> condition, because we can't know if we *can* connect to the process
>>>>> until we do. Process may have ended, but socket files will still be
>>>>> around, and there's nothing we can do about that. So i wouldn't 
>>>>> consider
>>>>> inability to send a message a failure condition.
>>>>>
>>>>
>>>> Just to clarify - i'm suggesting leaving this decision up to the user.
>>>> If a user expects there to be "n" processes running, but only "m"
>>>> responses were received, he could treat it as error. Another user might
>>>> simply send periodical updates/polls to secondaries, for whatever 
>>>> reason
>>>> (say, stats display), and won't really care if one of them just 
>>>> died, so
>>>> there's no error for that user.
>>>>
>>>> However, all of this has nothing to do with API. If we're able to send
>>>> messages - it's not a failure. If we can't - it is. That's the part API
>>>> should be concerned about, and that's what the return value should
>>>> indicate, IMO.
>>>
>>> Ok so to clarify, you are suggesting:
>>> we have N peers - if send_msg() returns success for all N - return 
>>> success
>>> (no matter did we get a reply or not)
>>> Otherwise return a failure.
>>> ?
>>> Konstantin
>>
>> More along the lines of, return -1 if and only if something went 
>> wrong. That might be invalid parameters, or that might be an error 
>> with our own socket,
> 
> To check if the error is caused by our own socket, we check the errno 
> after sendmsg?
> 
> Like for remote socket errors, we check:
> - ECONNRESET
> - ECONNREFUSED
> - ENOBUFS
> 
> Right?
> 
> Thanks,
> Jianfeng

Well, that was only an example. If it doesn't make much sense to do so 
in this case, then don't, and only return -1 on invalid parameters. 
AFAIU we're using connectionless sockets so a bunch of these errors 
won't be applicable to us. Maybe -ENOBUFS, but i'm not sure it's worth 
it to check for that.

> 
> 
>> or something else to that effect. In all other cases, return 0 (that 
>> includes cases where we sent N messages but M replies where N != M). 
>> So, in other words, return 0 if we *could have succeeded* if nothing 
>> went wrong on the other side, and only return -1 if something went 
>> wrong on our side.
>>
>>>
>>>
>>>>
>>>> -- 
>>>> Thanks,
>>>> Anatoly
>>
>>
> 
> 

-- 
Thanks,
Anatoly

^ permalink raw reply	[flat|nested] 88+ messages in thread

* [PATCH v4 0/2] generic channel for multi-process communication
  2017-11-30 18:44 [PATCH 0/3] generic channel for multi-process communication Jianfeng Tan
                   ` (5 preceding siblings ...)
  2018-01-25  4:16 ` [PATCH v3 0/3] generic channel for multi-process communication Jianfeng Tan
@ 2018-01-25 19:14 ` Jianfeng Tan
  2018-01-25 19:14   ` [PATCH v4 1/2] eal: add synchronous " Jianfeng Tan
                     ` (2 more replies)
  2018-01-25 19:21 ` [PATCH v5 " Jianfeng Tan
                   ` (2 subsequent siblings)
  9 siblings, 3 replies; 88+ messages in thread
From: Jianfeng Tan @ 2018-01-25 19:14 UTC (permalink / raw)
  To: dev
  Cc: anatoly.burakov, bruce.richardson, konstantin.ananyev, thomas,
	Jianfeng Tan

v3->v4:
  - Drop the patch 3 on vfio communication (postponed).
  - Change names from rte_eal_mp_* -> rte_mp_* as suggested by Thomas.
  - Add nb_sent and nb_received in struct rte_mp_reply.
  - Standardize the return val of sendmsg, request, reply: 0 on success,
    (-1) on failure.
  - If we find a peer error when sending a msg in the primary, we try to
    remove the secondary socket; as there is no sync mechanism there
    (a socket file cannot be flock'ed like a regular file), we use a more
    complex socket name (with tsc in it).
  - Some other small changes.

v2->v3:
  - Add pre-check for each APIs.
  - Remove the limitation of 8 secondary processes by discarding the original
    register/unregister mechanism of secondary processes; instead, the primary
    discovers secondary processes by scanning the folder for a regex match.
  - The previous implementation used two sockets for msg and request; this
    version uses just one socket and receives all kinds of messages in the
    mp thread.

v1->v2: (Address comments from Anatoly and Konstantin)
  - Use datagram unix socket to supersede stream unix socket + epoll.
  - Change the secondary add/del mechanism as now we use connection-less channel.
  - Add mp_mutex_action to sync action register/unregister/reference.
  - Limit max length of action name to 64B.
  - New APIs for synchronous communication: rte_eal_mp_request/rte_eal_mp_reply.
  - Formalize the errno handle.
  - Some other small issues.

This patchset adds a generic channel for multi-process (primary/secondary)
communication.

Patch 1: address the purpose and howto;
Patch 2: add a synchronous way for the requests which need an immediate response.

Jianfeng Tan (2):
  eal: add synchronous multi-process communication
  vfio: use the generic multi-process channel

 doc/guides/rel_notes/release_18_02.rst         |   2 +
 lib/librte_eal/common/eal_common_proc.c        | 254 +++++++++++++--
 lib/librte_eal/common/include/rte_eal.h        |  58 +++-
 lib/librte_eal/linuxapp/eal/eal.c              |  14 +-
 lib/librte_eal/linuxapp/eal/eal_vfio.c         | 169 ++++------
 lib/librte_eal/linuxapp/eal/eal_vfio.h         |  15 +-
 lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c | 410 ++++---------------------
 lib/librte_eal/rte_eal_version.map             |   2 +
 8 files changed, 429 insertions(+), 495 deletions(-)

-- 
2.7.4

^ permalink raw reply	[flat|nested] 88+ messages in thread

* [PATCH v4 1/2] eal: add synchronous multi-process communication
  2018-01-25 19:14 ` [PATCH v4 0/2] generic channel for multi-process communication Jianfeng Tan
@ 2018-01-25 19:14   ` Jianfeng Tan
  2018-01-25 19:14   ` [PATCH v4 2/2] vfio: use the generic multi-process channel Jianfeng Tan
  2018-01-25 19:15   ` [PATCH v4 0/2] generic channel for multi-process communication Tan, Jianfeng
  2 siblings, 0 replies; 88+ messages in thread
From: Jianfeng Tan @ 2018-01-25 19:14 UTC (permalink / raw)
  To: dev
  Cc: anatoly.burakov, bruce.richardson, konstantin.ananyev, thomas,
	Jianfeng Tan

We need a synchronous way for multi-process communication,
i.e., blocking to wait for a reply message after we send a request
to the peer process.

We add two APIs rte_mp_request() and rte_mp_reply() for
such use case. By invoking rte_mp_request(), a request message
is sent out, and then it waits there for a reply message. The caller
can specify the timeout. And the response messages will be collected
and returned so that the caller can decide how to translate them.

The API rte_mp_reply() is always called by an mp action handler.
Here we add another parameter for rte_mp_t so that the action
handler knows which peer address to reply to.

       sender-process                receiver-process
   ----------------------            ----------------

    thread-n
     |_rte_eal_mp_request() ----------> mp-thread
        |_timedwait()                    |_process_msg()
                                           |_action()
                                               |_rte_eal_mp_reply()
	        mp_thread  <---------------------|
                  |_process_msg()
                     |_signal(send_thread)
    thread-m <----------|
     |_collect-reply

 * A secondary process is only allowed to talk to the primary process.
 * If there are multiple secondary processes for the primary process,
   it will send request to peer1, collect response from peer1; then
   send request to peer2, collect response from peer2, and so on.
 * When thread-n is sending request, thread-m of that process can send
   request at the same time.
 * For each pair <action_name, peer>, we guarantee that at most one such
   request is in flight at a time.

Suggested-by: Anatoly Burakov <anatoly.burakov@intel.com>
Suggested-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
Reviewed-by: Anatoly Burakov <anatoly.burakov@intel.com>
Acked-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
---
 doc/guides/rel_notes/release_18_02.rst  |   2 +
 lib/librte_eal/common/eal_common_proc.c | 254 +++++++++++++++++++++++++++++---
 lib/librte_eal/common/include/rte_eal.h |  58 +++++++-
 lib/librte_eal/rte_eal_version.map      |   2 +
 4 files changed, 296 insertions(+), 20 deletions(-)

diff --git a/doc/guides/rel_notes/release_18_02.rst b/doc/guides/rel_notes/release_18_02.rst
index be6ac99..39425a4 100644
--- a/doc/guides/rel_notes/release_18_02.rst
+++ b/doc/guides/rel_notes/release_18_02.rst
@@ -160,6 +160,8 @@ New Features
 
   * ``rte_mp_register`` and ``rte_mp_unregister`` are for action (un)registration.
   * ``rte_mp_sendmsg`` is for sending a message without blocking for a response.
+  * ``rte_mp_request`` is for sending a request message and will block until
+    it gets a reply message which is sent from the peer by ``rte_mp_reply``.
 
 API Changes
 -----------
diff --git a/lib/librte_eal/common/eal_common_proc.c b/lib/librte_eal/common/eal_common_proc.c
index aea0829..6ad73f5 100644
--- a/lib/librte_eal/common/eal_common_proc.c
+++ b/lib/librte_eal/common/eal_common_proc.c
@@ -13,6 +13,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <sys/time.h>
 #include <sys/types.h>
 #include <sys/socket.h>
 #include <sys/un.h>
@@ -46,6 +47,50 @@ TAILQ_HEAD(action_entry_list, action_entry);
 static struct action_entry_list action_entry_list =
 	TAILQ_HEAD_INITIALIZER(action_entry_list);
 
+enum mp_type {
+	MP_MSG, /* Share message with peers, will not block */
+	MP_REQ, /* Request for information, Will block for a reply */
+	MP_REP, /* Response to previously-received request */
+};
+
+struct mp_msg_internal {
+	int type;
+	struct rte_mp_msg msg;
+};
+
+struct sync_request {
+	TAILQ_ENTRY(sync_request) next;
+	int reply_received;
+	char dst[PATH_MAX];
+	struct rte_mp_msg *request;
+	struct rte_mp_msg *reply;
+	pthread_cond_t cond;
+};
+
+TAILQ_HEAD(sync_request_list, sync_request);
+
+static struct {
+	struct sync_request_list requests;
+	pthread_mutex_t lock;
+} sync_requests = {
+	.requests = TAILQ_HEAD_INITIALIZER(sync_requests.requests),
+	.lock = PTHREAD_MUTEX_INITIALIZER
+};
+
+static struct sync_request *
+find_sync_request(const char *dst, const char *act_name)
+{
+	struct sync_request *r;
+
+	TAILQ_FOREACH(r, &sync_requests.requests, next) {
+		if (!strcmp(r->dst, dst) &&
+		    !strcmp(r->request->name, act_name))
+			break;
+	}
+
+	return r;
+}
+
 int
 rte_eal_primary_proc_alive(const char *config_file_path)
 {
@@ -149,19 +194,21 @@ rte_mp_action_unregister(const char *name)
 }
 
 static int
-read_msg(struct rte_mp_msg *msg)
+read_msg(struct mp_msg_internal *m, struct sockaddr_un *s)
 {
 	int msglen;
 	struct iovec iov;
 	struct msghdr msgh;
-	char control[CMSG_SPACE(sizeof(msg->fds))];
+	char control[CMSG_SPACE(sizeof(m->msg.fds))];
 	struct cmsghdr *cmsg;
-	int buflen = sizeof(*msg) - sizeof(msg->fds);
+	int buflen = sizeof(*m) - sizeof(m->msg.fds);
 
 	memset(&msgh, 0, sizeof(msgh));
-	iov.iov_base = msg;
+	iov.iov_base = m;
 	iov.iov_len  = buflen;
 
+	msgh.msg_name = s;
+	msgh.msg_namelen = sizeof(*s);
 	msgh.msg_iov = &iov;
 	msgh.msg_iovlen = 1;
 	msgh.msg_control = control;
@@ -183,7 +230,7 @@ read_msg(struct rte_mp_msg *msg)
 		cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
 		if ((cmsg->cmsg_level == SOL_SOCKET) &&
 			(cmsg->cmsg_type == SCM_RIGHTS)) {
-			memcpy(msg->fds, CMSG_DATA(cmsg), sizeof(msg->fds));
+			memcpy(m->msg.fds, CMSG_DATA(cmsg), sizeof(m->msg.fds));
 			break;
 		}
 	}
@@ -192,12 +239,28 @@ read_msg(struct rte_mp_msg *msg)
 }
 
 static void
-process_msg(struct rte_mp_msg *msg)
+process_msg(struct mp_msg_internal *m, struct sockaddr_un *s)
 {
+	struct sync_request *sync_req;
 	struct action_entry *entry;
+	struct rte_mp_msg *msg = &m->msg;
 	rte_mp_t action = NULL;
 
 	RTE_LOG(DEBUG, EAL, "msg: %s\n", msg->name);
+
+	if (m->type == MP_REP) {
+		pthread_mutex_lock(&sync_requests.lock);
+		sync_req = find_sync_request(s->sun_path, msg->name);
+		if (sync_req) {
+			memcpy(sync_req->reply, msg, sizeof(*msg));
+			sync_req->reply_received = 1;
+			pthread_cond_signal(&sync_req->cond);
+		} else
+			RTE_LOG(ERR, EAL, "Drop mp reply: %s\n", msg->name);
+		pthread_mutex_unlock(&sync_requests.lock);
+		return;
+	}
+
 	pthread_mutex_lock(&mp_mutex_action);
 	entry = find_action_entry_by_name(msg->name);
 	if (entry != NULL)
@@ -206,18 +269,19 @@ process_msg(struct rte_mp_msg *msg)
 
 	if (!action)
 		RTE_LOG(ERR, EAL, "Cannot find action: %s\n", msg->name);
-	else if (action(msg) < 0)
+	else if (action(msg, s->sun_path) < 0)
 		RTE_LOG(ERR, EAL, "Fail to handle message: %s\n", msg->name);
 }
 
 static void *
 mp_handle(void *arg __rte_unused)
 {
-	struct rte_mp_msg msg;
+	struct mp_msg_internal msg;
+	struct sockaddr_un sa;
 
 	while (1) {
-		if (read_msg(&msg) == 0)
-			process_msg(&msg);
+		if (read_msg(&msg, &sa) == 0)
+			process_msg(&msg, &sa);
 	}
 
 	return NULL;
@@ -336,16 +400,20 @@ rte_mp_channel_init(void)
  *
  */
 static int
-send_msg(const char *dst_path, struct rte_mp_msg *msg)
+send_msg(const char *dst_path, struct rte_mp_msg *msg, int type)
 {
 	int snd;
 	struct iovec iov;
 	struct msghdr msgh;
 	struct cmsghdr *cmsg;
 	struct sockaddr_un dst;
+	struct mp_msg_internal m;
 	int fd_size = msg->num_fds * sizeof(int);
 	char control[CMSG_SPACE(fd_size)];
 
+	m.type = type;
+	memcpy(&m.msg, msg, sizeof(*msg));
+
 	memset(&dst, 0, sizeof(dst));
 	dst.sun_family = AF_UNIX;
 	snprintf(dst.sun_path, sizeof(dst.sun_path), "%s", dst_path);
@@ -353,8 +421,8 @@ send_msg(const char *dst_path, struct rte_mp_msg *msg)
 	memset(&msgh, 0, sizeof(msgh));
 	memset(control, 0, sizeof(control));
 
-	iov.iov_base = msg;
-	iov.iov_len = sizeof(*msg) - sizeof(msg->fds);
+	iov.iov_base = &m;
+	iov.iov_len = sizeof(m) - sizeof(msg->fds);
 
 	msgh.msg_name = &dst;
 	msgh.msg_namelen = sizeof(dst);
@@ -396,14 +464,17 @@ send_msg(const char *dst_path, struct rte_mp_msg *msg)
 }
 
 static int
-mp_send(struct rte_mp_msg *msg)
+mp_send(struct rte_mp_msg *msg, const char *peer, int type)
 {
 	int ret = 0;
 	DIR *mp_dir;
 	struct dirent *ent;
 
-	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
-		if (send_msg(eal_mp_socket_path(), msg) < 0)
+	if (!peer && (rte_eal_process_type() == RTE_PROC_SECONDARY))
+		peer = eal_mp_socket_path();
+
+	if (peer) {
+		if (send_msg(peer, msg, type) < 0)
 			return -1;
 		else
 			return 0;
@@ -421,11 +492,11 @@ mp_send(struct rte_mp_msg *msg)
 		if (fnmatch(mp_filter, ent->d_name, 0) != 0)
 			continue;
 
-		if (send_msg(ent->d_name, msg) < 0)
+		if (send_msg(ent->d_name, msg, type) < 0)
 			ret = -1;
 	}
-	closedir(mp_dir);
 
+	closedir(mp_dir);
 	return ret;
 }
 
@@ -464,5 +535,150 @@ rte_mp_sendmsg(struct rte_mp_msg *msg)
 		return -1;
 
 	RTE_LOG(DEBUG, EAL, "sendmsg: %s\n", msg->name);
-	return mp_send(msg);
+	return mp_send(msg, NULL, MP_MSG);
+}
+
+static int
+mp_request_one(const char *dst, struct rte_mp_msg *req,
+	       struct rte_mp_reply *reply, const struct timespec *ts)
+{
+	int ret;
+	struct timeval now;
+	struct rte_mp_msg msg, *tmp;
+	struct sync_request sync_req, *exist;
+
+	sync_req.reply_received = 0;
+	strcpy(sync_req.dst, dst);
+	sync_req.request = req;
+	sync_req.reply = &msg;
+	pthread_cond_init(&sync_req.cond, NULL);
+
+	pthread_mutex_lock(&sync_requests.lock);
+	exist = find_sync_request(dst, req->name);
+	if (!exist)
+		TAILQ_INSERT_TAIL(&sync_requests.requests, &sync_req, next);
+	pthread_mutex_unlock(&sync_requests.lock);
+	if (exist) {
+		RTE_LOG(ERR, EAL, "A pending request %s:%s\n", dst, req->name);
+		rte_errno = -EEXIST;
+		return -1;
+	}
+
+	ret = send_msg(dst, req, MP_REQ);
+	if (ret < 0) {
+		RTE_LOG(ERR, EAL, "Fail to send request %s:%s\n",
+			dst, req->name);
+		return -1;
+	} else if (ret == 0)
+		return 0;
+
+	reply->nb_sent++;
+
+	pthread_mutex_lock(&sync_requests.lock);
+	do {
+		pthread_cond_timedwait(&sync_req.cond, &sync_requests.lock, ts);
+		/* Check spurious wakeups */
+		if (sync_req.reply_received == 1)
+			break;
+		/* Check if time is out */
+		if (gettimeofday(&now, NULL) < 0)
+			break;
+		if (now.tv_sec < ts->tv_sec)
+			break;
+		else if (now.tv_sec == ts->tv_sec &&
+			 now.tv_usec * 1000 < ts->tv_nsec)
+			break;
+	} while (1);
+	/* We got the lock now */
+	TAILQ_REMOVE(&sync_requests.requests, &sync_req, next);
+	pthread_mutex_unlock(&sync_requests.lock);
+
+	if (sync_req.reply_received == 0) {
+		RTE_LOG(ERR, EAL, "Fail to recv reply for request %s:%s\n",
+			dst, req->name);
+		rte_errno = -ETIMEDOUT;
+		return -1;
+	}
+
+	tmp = realloc(reply->msgs, sizeof(msg) * (reply->nb_received + 1));
+	if (!tmp) {
+		RTE_LOG(ERR, EAL, "Fail to alloc reply for request %s:%s\n",
+			dst, req->name);
+		rte_errno = -ENOMEM;
+		return -1;
+	}
+	memcpy(&tmp[reply->nb_received], &msg, sizeof(msg));
+	reply->msgs = tmp;
+	reply->nb_received++;
+	return 0;
+}
+
+int
+rte_mp_request(struct rte_mp_msg *req, struct rte_mp_reply *reply,
+		const struct timespec *ts)
+{
+	int ret = 0;
+	DIR *mp_dir;
+	struct dirent *ent;
+	struct timeval now;
+	struct timespec end;
+
+	RTE_LOG(DEBUG, EAL, "request: %s\n", req->name);
+
+	if (check_input(req) == false)
+		return -1;
+	if (gettimeofday(&now, NULL) < 0) {
+		RTE_LOG(ERR, EAL, "Faile to get current time\n");
+		rte_errno = errno;
+		return -1;
+	}
+
+	end.tv_nsec = (now.tv_usec * 1000 + ts->tv_nsec) % 1000000000;
+	end.tv_sec = now.tv_sec + ts->tv_sec +
+			(now.tv_usec * 1000 + ts->tv_nsec) / 1000000000;
+
+	reply->nb_sent = 0;
+	reply->nb_received = 0;
+	reply->msgs = NULL;
+
+	/* for secondary process, send request to the primary process only */
+	if (rte_eal_process_type() == RTE_PROC_SECONDARY)
+		return mp_request_one(eal_mp_socket_path(), req, reply, &end);
+
+	/* for primary process, broadcast request, and collect reply 1 by 1 */
+	mp_dir = opendir(mp_dir_path);
+	if (!mp_dir) {
+		RTE_LOG(ERR, EAL, "Unable to open directory %s\n", mp_dir_path);
+		rte_errno = errno;
+		return -1;
+	}
+
+	while ((ent = readdir(mp_dir))) {
+		if (fnmatch(mp_filter, ent->d_name, 0) != 0)
+			continue;
+
+		if (mp_request_one(ent->d_name, req, reply, &end))
+			ret = -1;
+	}
+
+	closedir(mp_dir);
+	return ret;
+}
+
+int
+rte_mp_reply(struct rte_mp_msg *msg, const char *peer)
+{
+
+	RTE_LOG(DEBUG, EAL, "reply: %s\n", msg->name);
+
+	if (check_input(msg) == false)
+		return -1;
+
+	if (peer == NULL) {
+		RTE_LOG(ERR, EAL, "peer is not specified\n");
+		rte_errno = -EINVAL;
+		return -1;
+	}
+
+	return mp_send(msg, peer, MP_REP);
 }
diff --git a/lib/librte_eal/common/include/rte_eal.h b/lib/librte_eal/common/include/rte_eal.h
index 1d42e9c..9207ad9 100644
--- a/lib/librte_eal/common/include/rte_eal.h
+++ b/lib/librte_eal/common/include/rte_eal.h
@@ -13,6 +13,7 @@
 
 #include <stdint.h>
 #include <sched.h>
+#include <time.h>
 
 #include <rte_config.h>
 #include <rte_per_lcore.h>
@@ -197,13 +198,19 @@ struct rte_mp_msg {
 	int fds[RTE_MP_MAX_FD_NUM];
 };
 
+struct rte_mp_reply {
+	int nb_sent;
+	int nb_received;
+	struct rte_mp_msg *msgs; /* caller to free */
+};
+
 /**
  * Action function typedef used by other components.
  *
  * As we create  socket channel for primary/secondary communication, use
  * this function typedef to register action for coming messages.
  */
-typedef int (*rte_mp_t)(const struct rte_mp_msg *msg);
+typedef int (*rte_mp_t)(const struct rte_mp_msg *msg, const void *peer);
 
 /**
  * @warning
@@ -262,6 +269,55 @@ void rte_mp_action_unregister(const char *name);
 int rte_mp_sendmsg(struct rte_mp_msg *msg);
 
 /**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Send a request to the peer process and expect a reply.
+ *
+ * This function sends a request message to the peer process, and will
+ * block until receiving reply message from the peer process.
+ *
+ * @note The caller is responsible for freeing reply->msgs.
+ *
+ * @param req
+ *   The req argument contains the customized request message.
+ *
+ * @param reply
+ *   The reply argument is used to store all the reply messages;
+ *   the caller is responsible for freeing reply->msgs.
+ *
+ * @param ts
+ *   The ts argument specifies how long we can wait for the peer(s) to reply.
+ *
+ * @return
+ *  - On success, return 0.
+ *  - On failure, return -1, and the reason will be stored in rte_errno.
+ */
+int rte_mp_request(struct rte_mp_msg *req, struct rte_mp_reply *reply,
+		   const struct timespec *ts);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Send a reply to the peer process.
+ *
+ * This function will send a reply message in response to a request message
+ * received previously.
+ *
+ * @param msg
+ *   The msg argument contains the customized message.
+ *
+ * @param peer
+ *   The peer argument is the pointer to the peer socket path.
+ *
+ * @return
+ *  - On success, return 0.
+ *  - On failure, return -1, and the reason will be stored in rte_errno.
+ */
+int rte_mp_reply(struct rte_mp_msg *msg, const char *peer);
+
+/**
  * Usage function typedef used by the application usage function.
  *
  * Use this function typedef to define and call rte_set_application_usage_hook()
diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
index adeadfb..2cb6b07 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -220,6 +220,8 @@ EXPERIMENTAL {
 	rte_eal_mp_action_register;
 	rte_eal_mp_action_unregister;
 	rte_eal_mp_sendmsg;
+	rte_eal_mp_request;
+	rte_eal_mp_reply;
 	rte_service_attr_get;
 	rte_service_attr_reset_all;
 	rte_service_component_register;
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 88+ messages in thread

* [PATCH v4 2/2] vfio: use the generic multi-process channel
  2018-01-25 19:14 ` [PATCH v4 0/2] generic channel for multi-process communication Jianfeng Tan
  2018-01-25 19:14   ` [PATCH v4 1/2] eal: add synchronous " Jianfeng Tan
@ 2018-01-25 19:14   ` Jianfeng Tan
  2018-01-25 19:15   ` [PATCH v4 0/2] generic channel for multi-process communication Tan, Jianfeng
  2 siblings, 0 replies; 88+ messages in thread
From: Jianfeng Tan @ 2018-01-25 19:14 UTC (permalink / raw)
  To: dev
  Cc: anatoly.burakov, bruce.richardson, konstantin.ananyev, thomas,
	Jianfeng Tan

Previously, vfio used its own private channel for the secondary
process to get the container fd and group fd from the primary process.

This patch changes to use the generic mp channel.

Test:
  1. Bind two NICs to vfio-pci.

  2. Start the primary and secondary process.
    $ (symmetric_mp) -c 2 -- -p 3 --num-procs=2 --proc-id=0
    $ (symmetric_mp) -c 4 --proc-type=auto -- -p 3 \
				--num-procs=2 --proc-id=1

Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
---
 lib/librte_eal/linuxapp/eal/eal.c              |  14 +-
 lib/librte_eal/linuxapp/eal/eal_vfio.c         | 169 ++++------
 lib/librte_eal/linuxapp/eal/eal_vfio.h         |  15 +-
 lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c | 410 ++++---------------------
 4 files changed, 133 insertions(+), 475 deletions(-)

diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index 53e29e4..07b2a06 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -713,18 +713,8 @@ static int rte_eal_vfio_setup(void)
 		return -1;
 	vfio_enabled = rte_vfio_is_enabled("vfio");
 
-	if (vfio_enabled) {
-
-		/* if we are primary process, create a thread to communicate with
-		 * secondary processes. the thread will use a socket to wait for
-		 * requests from secondary process to send open file descriptors,
-		 * because VFIO does not allow multiple open descriptors on a group or
-		 * VFIO container.
-		 */
-		if (internal_config.process_type == RTE_PROC_PRIMARY &&
-				vfio_mp_sync_setup() < 0)
-			return -1;
-	}
+	if (vfio_enabled && vfio_mp_sync_setup() < 0)
+		return -1;
 
 	return 0;
 }
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c b/lib/librte_eal/linuxapp/eal/eal_vfio.c
index e44ae4d..2dbb37e 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.c
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.c
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2014 Intel Corporation
+ * Copyright(c) 2010-2018 Intel Corporation
  */
 
 #include <string.h>
@@ -42,6 +42,10 @@ vfio_get_group_fd(int iommu_group_no)
 	int vfio_group_fd;
 	char filename[PATH_MAX];
 	struct vfio_group *cur_grp;
+	struct rte_mp_msg req, *rep;
+	struct rte_mp_reply reply;
+	struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
+	struct vfio_mp_param *p = (struct vfio_mp_param *)req.param;
 
 	/* check if we already have the group descriptor open */
 	for (i = 0; i < VFIO_MAX_GROUPS; i++)
@@ -101,50 +105,30 @@ vfio_get_group_fd(int iommu_group_no)
 		return vfio_group_fd;
 	}
 	/* if we're in a secondary process, request group fd from the primary
-	 * process via our socket
+	 * process via mp channel
 	 */
-	else {
-		int socket_fd, ret;
-
-		socket_fd = vfio_mp_sync_connect_to_primary();
-
-		if (socket_fd < 0) {
-			RTE_LOG(ERR, EAL, "  cannot connect to primary process!\n");
-			return -1;
-		}
-		if (vfio_mp_sync_send_request(socket_fd, SOCKET_REQ_GROUP) < 0) {
-			RTE_LOG(ERR, EAL, "  cannot request container fd!\n");
-			close(socket_fd);
-			return -1;
-		}
-		if (vfio_mp_sync_send_request(socket_fd, iommu_group_no) < 0) {
-			RTE_LOG(ERR, EAL, "  cannot send group number!\n");
-			close(socket_fd);
-			return -1;
-		}
-		ret = vfio_mp_sync_receive_request(socket_fd);
-		switch (ret) {
-		case SOCKET_NO_FD:
-			close(socket_fd);
-			return 0;
-		case SOCKET_OK:
-			vfio_group_fd = vfio_mp_sync_receive_fd(socket_fd);
-			/* if we got the fd, store it and return it */
-			if (vfio_group_fd > 0) {
-				close(socket_fd);
-				cur_grp->group_no = iommu_group_no;
-				cur_grp->fd = vfio_group_fd;
-				vfio_cfg.vfio_active_groups++;
-				return vfio_group_fd;
-			}
-			/* fall-through on error */
-		default:
-			RTE_LOG(ERR, EAL, "  cannot get container fd!\n");
-			close(socket_fd);
-			return -1;
+	p->req = SOCKET_REQ_GROUP;
+	p->group_no = iommu_group_no;
+	strcpy(req.name, "vfio");
+	req.len_param = sizeof(*p);
+	req.num_fds = 0;
+
+	vfio_group_fd = -1;
+	if (rte_mp_request(&req, &reply, &ts) == 0 && reply.nb_received > 0) {
+		rep = &reply.msgs[0];
+		p = (struct vfio_mp_param *)rep->param;
+		if (p->result == SOCKET_OK && rep->num_fds == 1) {
+			cur_grp->group_no = iommu_group_no;
+			vfio_group_fd = rep->fds[0];
+			cur_grp->fd = vfio_group_fd;
+			vfio_cfg.vfio_active_groups++;
 		}
+		free(reply.msgs);
 	}
-	return -1;
+
+	if (vfio_group_fd < 0)
+		RTE_LOG(ERR, EAL, "  cannot request group fd\n");
+	return vfio_group_fd;
 }
 
 
@@ -200,7 +184,10 @@ int
 rte_vfio_clear_group(int vfio_group_fd)
 {
 	int i;
-	int socket_fd, ret;
+	struct rte_mp_msg req, *rep;
+	struct rte_mp_reply reply;
+	struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
+	struct vfio_mp_param *p = (struct vfio_mp_param *)req.param;
 
 	if (internal_config.process_type == RTE_PROC_PRIMARY) {
 
@@ -214,43 +201,23 @@ rte_vfio_clear_group(int vfio_group_fd)
 		return 0;
 	}
 
-	/* This is just for SECONDARY processes */
-	socket_fd = vfio_mp_sync_connect_to_primary();
-
-	if (socket_fd < 0) {
-		RTE_LOG(ERR, EAL, "  cannot connect to primary process!\n");
-		return -1;
-	}
-
-	if (vfio_mp_sync_send_request(socket_fd, SOCKET_CLR_GROUP) < 0) {
-		RTE_LOG(ERR, EAL, "  cannot request container fd!\n");
-		close(socket_fd);
-		return -1;
-	}
+	p->req = SOCKET_CLR_GROUP;
+	p->group_no = vfio_group_fd;
+	strcpy(req.name, "vfio");
+	req.len_param = sizeof(*p);
+	req.num_fds = 0;
 
-	if (vfio_mp_sync_send_request(socket_fd, vfio_group_fd) < 0) {
-		RTE_LOG(ERR, EAL, "  cannot send group fd!\n");
-		close(socket_fd);
-		return -1;
+	if (rte_mp_request(&req, &reply, &ts) == 0 && reply.nb_received > 0) {
+		rep = &reply.msgs[0];
+		p = (struct vfio_mp_param *)rep->param;
+		if (p->result == SOCKET_OK) {
+			free(reply.msgs);
+			return 0;
+		}
+		free(reply.msgs);
 	}
 
-	ret = vfio_mp_sync_receive_request(socket_fd);
-	switch (ret) {
-	case SOCKET_NO_FD:
-		RTE_LOG(ERR, EAL, "  BAD VFIO group fd!\n");
-		close(socket_fd);
-		break;
-	case SOCKET_OK:
-		close(socket_fd);
-		return 0;
-	case SOCKET_ERR:
-		RTE_LOG(ERR, EAL, "  Socket error\n");
-		close(socket_fd);
-		break;
-	default:
-		RTE_LOG(ERR, EAL, "  UNKNOWN reply, %d\n", ret);
-		close(socket_fd);
-	}
+	RTE_LOG(ERR, EAL, "  BAD VFIO group fd!\n");
 	return -1;
 }
 
@@ -561,6 +528,11 @@ int
 vfio_get_container_fd(void)
 {
 	int ret, vfio_container_fd;
+	struct rte_mp_msg req, *rep;
+	struct rte_mp_reply reply;
+	struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
+	struct vfio_mp_param *p = (struct vfio_mp_param *)req.param;
+
 
 	/* if we're in a primary process, try to open the container */
 	if (internal_config.process_type == RTE_PROC_PRIMARY) {
@@ -591,33 +563,28 @@ vfio_get_container_fd(void)
 		}
 
 		return vfio_container_fd;
-	} else {
-		/*
-		 * if we're in a secondary process, request container fd from the
-		 * primary process via our socket
-		 */
-		int socket_fd;
-
-		socket_fd = vfio_mp_sync_connect_to_primary();
-		if (socket_fd < 0) {
-			RTE_LOG(ERR, EAL, "  cannot connect to primary process!\n");
-			return -1;
-		}
-		if (vfio_mp_sync_send_request(socket_fd, SOCKET_REQ_CONTAINER) < 0) {
-			RTE_LOG(ERR, EAL, "  cannot request container fd!\n");
-			close(socket_fd);
-			return -1;
-		}
-		vfio_container_fd = vfio_mp_sync_receive_fd(socket_fd);
-		if (vfio_container_fd < 0) {
-			RTE_LOG(ERR, EAL, "  cannot get container fd!\n");
-			close(socket_fd);
-			return -1;
+	}
+	/*
+	 * if we're in a secondary process, request container fd from the
+	 * primary process via mp channel
+	 */
+	p->req = SOCKET_REQ_CONTAINER;
+	strcpy(req.name, "vfio");
+	req.len_param = sizeof(*p);
+	req.num_fds = 0;
+
+	vfio_container_fd = -1;
+	if (rte_mp_request(&req, &reply, &ts) == 0 && reply.nb_received > 0) {
+		rep = &reply.msgs[0];
+		p = (struct vfio_mp_param *)rep->param;
+		if (p->result == SOCKET_OK && rep->num_fds == 1) {
+			free(reply.msgs);
+			return rep->fds[0];
 		}
-		close(socket_fd);
-		return vfio_container_fd;
+		free(reply.msgs);
 	}
 
+	RTE_LOG(ERR, EAL, "  cannot request container fd\n");
 	return -1;
 }
 
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.h b/lib/librte_eal/linuxapp/eal/eal_vfio.h
index 8059577..6b48969 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.h
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.h
@@ -88,15 +88,6 @@ struct vfio_iommu_spapr_tce_info {
 #define VFIO_MAX_GROUPS RTE_MAX_VFIO_GROUPS
 
 /*
- * Function prototypes for VFIO multiprocess sync functions
- */
-int vfio_mp_sync_send_request(int socket, int req);
-int vfio_mp_sync_receive_request(int socket);
-int vfio_mp_sync_send_fd(int socket, int fd);
-int vfio_mp_sync_receive_fd(int socket);
-int vfio_mp_sync_connect_to_primary(void);
-
-/*
  * we don't need to store device fd's anywhere since they can be obtained from
  * the group fd via an ioctl() call.
  */
@@ -157,6 +148,12 @@ int vfio_mp_sync_setup(void);
 #define SOCKET_NO_FD 0x1
 #define SOCKET_ERR 0xFF
 
+struct vfio_mp_param {
+	int req;
+	int result;
+	int group_no;
+};
+
 #endif /* VFIO_PRESENT */
 
 #endif /* EAL_VFIO_H_ */
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c b/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
index 7cc3c15..126e3c2 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
@@ -1,32 +1,15 @@
 /* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2014 Intel Corporation
+ * Copyright(c) 2010-2018 Intel Corporation
  */
 
+#include <unistd.h>
 #include <string.h>
-#include <fcntl.h>
-#include <sys/socket.h>
-#include <pthread.h>
-
-/* sys/un.h with __USE_MISC uses strlen, which is unsafe */
-#ifdef __USE_MISC
-#define REMOVED_USE_MISC
-#undef __USE_MISC
-#endif
-#include <sys/un.h>
-/* make sure we redefine __USE_MISC only if it was previously undefined */
-#ifdef REMOVED_USE_MISC
-#define __USE_MISC
-#undef REMOVED_USE_MISC
-#endif
 
 #include <rte_log.h>
-#include <rte_eal_memconfig.h>
-#include <rte_malloc.h>
 #include <rte_vfio.h>
+#include <rte_eal.h>
 
-#include "eal_filesystem.h"
 #include "eal_vfio.h"
-#include "eal_thread.h"
 
 /**
  * @file
@@ -37,360 +20,81 @@
 
 #ifdef VFIO_PRESENT
 
-#define SOCKET_PATH_FMT "%s/.%s_mp_socket"
-#define CMSGLEN (CMSG_LEN(sizeof(int)))
-#define FD_TO_CMSGHDR(fd, chdr) \
-		do {\
-			(chdr).cmsg_len = CMSGLEN;\
-			(chdr).cmsg_level = SOL_SOCKET;\
-			(chdr).cmsg_type = SCM_RIGHTS;\
-			memcpy((chdr).__cmsg_data, &(fd), sizeof(fd));\
-		} while (0)
-#define CMSGHDR_TO_FD(chdr, fd) \
-			memcpy(&(fd), (chdr).__cmsg_data, sizeof(fd))
-
-static pthread_t socket_thread;
-static int mp_socket_fd;
-
-
-/* get socket path (/var/run if root, $HOME otherwise) */
-static void
-get_socket_path(char *buffer, int bufsz)
-{
-	const char *dir = "/var/run";
-	const char *home_dir = getenv("HOME");
-
-	if (getuid() != 0 && home_dir != NULL)
-		dir = home_dir;
-
-	/* use current prefix as file path */
-	snprintf(buffer, bufsz, SOCKET_PATH_FMT, dir,
-			internal_config.hugefile_prefix);
-}
-
-
-
-/*
- * data flow for socket comm protocol:
- * 1. client sends SOCKET_REQ_CONTAINER or SOCKET_REQ_GROUP
- * 1a. in case of SOCKET_REQ_GROUP, client also then sends group number
- * 2. server receives message
- * 2a. in case of invalid group, SOCKET_ERR is sent back to client
- * 2b. in case of unbound group, SOCKET_NO_FD is sent back to client
- * 2c. in case of valid group, SOCKET_OK is sent and immediately followed by fd
- *
- * in case of any error, socket is closed.
- */
-
-/* send a request, return -1 on error */
-int
-vfio_mp_sync_send_request(int socket, int req)
-{
-	struct msghdr hdr;
-	struct iovec iov;
-	int buf;
-	int ret;
-
-	memset(&hdr, 0, sizeof(hdr));
-
-	buf = req;
-
-	hdr.msg_iov = &iov;
-	hdr.msg_iovlen = 1;
-	iov.iov_base = (char *) &buf;
-	iov.iov_len = sizeof(buf);
-
-	ret = sendmsg(socket, &hdr, 0);
-	if (ret < 0)
-		return -1;
-	return 0;
-}
-
-/* receive a request and return it */
-int
-vfio_mp_sync_receive_request(int socket)
-{
-	int buf;
-	struct msghdr hdr;
-	struct iovec iov;
-	int ret, req;
-
-	memset(&hdr, 0, sizeof(hdr));
-
-	buf = SOCKET_ERR;
-
-	hdr.msg_iov = &iov;
-	hdr.msg_iovlen = 1;
-	iov.iov_base = (char *) &buf;
-	iov.iov_len = sizeof(buf);
-
-	ret = recvmsg(socket, &hdr, 0);
-	if (ret < 0)
-		return -1;
-
-	req = buf;
-
-	return req;
-}
-
-/* send OK in message, fd in control message */
-int
-vfio_mp_sync_send_fd(int socket, int fd)
+static int
+vfio_mp_primary(const struct rte_mp_msg *msg, const void *peer)
 {
-	int buf;
-	struct msghdr hdr;
-	struct cmsghdr *chdr;
-	char chdr_buf[CMSGLEN];
-	struct iovec iov;
+	int fd;
+	int num;
 	int ret;
+	struct rte_mp_msg reply;
+	struct vfio_mp_param *r = (struct vfio_mp_param *)reply.param;
+	const struct vfio_mp_param *m = (const struct vfio_mp_param *)msg->param;
 
-	chdr = (struct cmsghdr *) chdr_buf;
-	memset(chdr, 0, sizeof(chdr_buf));
-	memset(&hdr, 0, sizeof(hdr));
-
-	hdr.msg_iov = &iov;
-	hdr.msg_iovlen = 1;
-	iov.iov_base = (char *) &buf;
-	iov.iov_len = sizeof(buf);
-	hdr.msg_control = chdr;
-	hdr.msg_controllen = CMSGLEN;
-
-	buf = SOCKET_OK;
-	FD_TO_CMSGHDR(fd, *chdr);
-
-	ret = sendmsg(socket, &hdr, 0);
-	if (ret < 0)
-		return -1;
-	return 0;
-}
-
-/* receive OK in message, fd in control message */
-int
-vfio_mp_sync_receive_fd(int socket)
-{
-	int buf;
-	struct msghdr hdr;
-	struct cmsghdr *chdr;
-	char chdr_buf[CMSGLEN];
-	struct iovec iov;
-	int ret, req, fd;
-
-	buf = SOCKET_ERR;
-
-	chdr = (struct cmsghdr *) chdr_buf;
-	memset(chdr, 0, sizeof(chdr_buf));
-	memset(&hdr, 0, sizeof(hdr));
-
-	hdr.msg_iov = &iov;
-	hdr.msg_iovlen = 1;
-	iov.iov_base = (char *) &buf;
-	iov.iov_len = sizeof(buf);
-	hdr.msg_control = chdr;
-	hdr.msg_controllen = CMSGLEN;
-
-	ret = recvmsg(socket, &hdr, 0);
-	if (ret < 0)
-		return -1;
-
-	req = buf;
-
-	if (req != SOCKET_OK)
-		return -1;
-
-	CMSGHDR_TO_FD(*chdr, fd);
-
-	return fd;
-}
-
-/* connect socket_fd in secondary process to the primary process's socket */
-int
-vfio_mp_sync_connect_to_primary(void)
-{
-	struct sockaddr_un addr;
-	socklen_t sockaddr_len;
-	int socket_fd;
-
-	/* set up a socket */
-	socket_fd = socket(AF_UNIX, SOCK_SEQPACKET, 0);
-	if (socket_fd < 0) {
-		RTE_LOG(ERR, EAL, "Failed to create socket!\n");
+	if (msg->len_param != sizeof(*m)) {
+		RTE_LOG(ERR, EAL, "vfio received invalid message!\n");
 		return -1;
 	}
 
-	get_socket_path(addr.sun_path, sizeof(addr.sun_path));
-	addr.sun_family = AF_UNIX;
-
-	sockaddr_len = sizeof(struct sockaddr_un);
-
-	if (connect(socket_fd, (struct sockaddr *) &addr, sockaddr_len) == 0)
-		return socket_fd;
-
-	/* if connect failed */
-	close(socket_fd);
-	return -1;
-}
-
+	memset(&reply, 0, sizeof(reply));
 
-
-/*
- * socket listening thread for primary process
- */
-static __attribute__((noreturn)) void *
-vfio_mp_sync_thread(void __rte_unused * arg)
-{
-	int ret, fd, vfio_data;
-
-	/* wait for requests on the socket */
-	for (;;) {
-		int conn_sock;
-		struct sockaddr_un addr;
-		socklen_t sockaddr_len = sizeof(addr);
-
-		/* this is a blocking call */
-		conn_sock = accept(mp_socket_fd, (struct sockaddr *) &addr,
-				&sockaddr_len);
-
-		/* just restart on error */
-		if (conn_sock == -1)
-			continue;
-
-		/* set socket to linger after close */
-		struct linger l;
-		l.l_onoff = 1;
-		l.l_linger = 60;
-
-		if (setsockopt(conn_sock, SOL_SOCKET, SO_LINGER, &l, sizeof(l)) < 0)
-			RTE_LOG(WARNING, EAL, "Cannot set SO_LINGER option "
-					"on listen socket (%s)\n", strerror(errno));
-
-		ret = vfio_mp_sync_receive_request(conn_sock);
-
-		switch (ret) {
-		case SOCKET_REQ_CONTAINER:
-			fd = vfio_get_container_fd();
-			if (fd < 0)
-				vfio_mp_sync_send_request(conn_sock, SOCKET_ERR);
-			else
-				vfio_mp_sync_send_fd(conn_sock, fd);
-			if (fd >= 0)
-				close(fd);
-			break;
-		case SOCKET_REQ_GROUP:
-			/* wait for group number */
-			vfio_data = vfio_mp_sync_receive_request(conn_sock);
-			if (vfio_data < 0) {
-				close(conn_sock);
-				continue;
-			}
-
-			fd = vfio_get_group_fd(vfio_data);
-
-			if (fd < 0)
-				vfio_mp_sync_send_request(conn_sock, SOCKET_ERR);
+	switch (m->req) {
+	case SOCKET_REQ_GROUP:
+		r->req = SOCKET_REQ_GROUP;
+		r->group_no = m->group_no;
+		fd = vfio_get_group_fd(m->group_no);
+		if (fd < 0)
+			r->result = SOCKET_ERR;
+		else if (fd == 0)
 			/* if VFIO group exists but isn't bound to VFIO driver */
-			else if (fd == 0)
-				vfio_mp_sync_send_request(conn_sock, SOCKET_NO_FD);
+			r->result = SOCKET_NO_FD;
+		else {
 			/* if group exists and is bound to VFIO driver */
-			else {
-				vfio_mp_sync_send_request(conn_sock, SOCKET_OK);
-				vfio_mp_sync_send_fd(conn_sock, fd);
-			}
-			break;
-		case SOCKET_CLR_GROUP:
-			/* wait for group fd */
-			vfio_data = vfio_mp_sync_receive_request(conn_sock);
-			if (vfio_data < 0) {
-				close(conn_sock);
-				continue;
-			}
-
-			ret = rte_vfio_clear_group(vfio_data);
-
-			if (ret < 0)
-				vfio_mp_sync_send_request(conn_sock, SOCKET_NO_FD);
-			else
-				vfio_mp_sync_send_request(conn_sock, SOCKET_OK);
-			break;
-		default:
-			vfio_mp_sync_send_request(conn_sock, SOCKET_ERR);
-			break;
+			r->result = SOCKET_OK;
+			num = 1;
 		}
-		close(conn_sock);
-	}
-}
-
-static int
-vfio_mp_sync_socket_setup(void)
-{
-	int ret, socket_fd;
-	struct sockaddr_un addr;
-	socklen_t sockaddr_len;
-
-	/* set up a socket */
-	socket_fd = socket(AF_UNIX, SOCK_SEQPACKET, 0);
-	if (socket_fd < 0) {
-		RTE_LOG(ERR, EAL, "Failed to create socket!\n");
+		break;
+	case SOCKET_CLR_GROUP:
+		r->req = SOCKET_CLR_GROUP;
+		r->group_no = m->group_no;
+		if (rte_vfio_clear_group(m->group_no) < 0)
+			r->result = SOCKET_NO_FD;
+		else
+			r->result = SOCKET_OK;
+		break;
+	case SOCKET_REQ_CONTAINER:
+		r->req = SOCKET_REQ_CONTAINER;
+		fd = vfio_get_container_fd();
+		if (fd < 0)
+			r->result = SOCKET_ERR;
+		else {
+			r->result = SOCKET_OK;
+			num = 1;
+		}
+		break;
+	default:
+		RTE_LOG(ERR, EAL, "vfio received invalid message!\n");
 		return -1;
 	}
 
-	get_socket_path(addr.sun_path, sizeof(addr.sun_path));
-	addr.sun_family = AF_UNIX;
-
-	sockaddr_len = sizeof(struct sockaddr_un);
-
-	unlink(addr.sun_path);
-
-	ret = bind(socket_fd, (struct sockaddr *) &addr, sockaddr_len);
-	if (ret) {
-		RTE_LOG(ERR, EAL, "Failed to bind socket: %s!\n", strerror(errno));
-		close(socket_fd);
-		return -1;
+	if (num == 1) {
+		reply.num_fds = 1;
+		reply.fds[0] = fd;
 	}
+	strcpy(reply.name, "vfio");
+	reply.len_param = sizeof(*r);
 
-	ret = listen(socket_fd, 50);
-	if (ret) {
-		RTE_LOG(ERR, EAL, "Failed to listen: %s!\n", strerror(errno));
-		close(socket_fd);
-		return -1;
-	}
-
-	/* save the socket in local configuration */
-	mp_socket_fd = socket_fd;
-
-	return 0;
+	ret = rte_mp_reply(&reply, peer);
+	if (m->req == SOCKET_REQ_CONTAINER && num == 1)
+		close(fd);
+	return ret;
 }
 
-/*
- * set up a local socket and tell it to listen for incoming connections
- */
 int
 vfio_mp_sync_setup(void)
 {
-	int ret;
-	char thread_name[RTE_MAX_THREAD_NAME_LEN];
-
-	if (vfio_mp_sync_socket_setup() < 0) {
-		RTE_LOG(ERR, EAL, "Failed to set up local socket!\n");
-		return -1;
-	}
-
-	ret = pthread_create(&socket_thread, NULL,
-			vfio_mp_sync_thread, NULL);
-	if (ret) {
-		RTE_LOG(ERR, EAL,
-			"Failed to create thread for communication with secondary processes!\n");
-		close(mp_socket_fd);
-		return -1;
-	}
-
-	/* Set thread_name for aid in debugging. */
-	snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "vfio-sync");
-	ret = rte_thread_setname(socket_thread, thread_name);
-	if (ret)
-		RTE_LOG(DEBUG, EAL,
-			"Failed to set thread name for secondary processes!\n");
+	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+		return rte_mp_action_register("vfio", vfio_mp_primary);
 
 	return 0;
 }
-
 #endif
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 88+ messages in thread

* Re: [PATCH v4 0/2] generic channel for multi-process communication
  2018-01-25 19:14 ` [PATCH v4 0/2] generic channel for multi-process communication Jianfeng Tan
  2018-01-25 19:14   ` [PATCH v4 1/2] eal: add synchronous " Jianfeng Tan
  2018-01-25 19:14   ` [PATCH v4 2/2] vfio: use the generic multi-process channel Jianfeng Tan
@ 2018-01-25 19:15   ` Tan, Jianfeng
  2 siblings, 0 replies; 88+ messages in thread
From: Tan, Jianfeng @ 2018-01-25 19:15 UTC (permalink / raw)
  To: dev; +Cc: anatoly.burakov, bruce.richardson, konstantin.ananyev, thomas

Apologies, please ignore this version, which is not correct. I will send out
a new version.


On 1/26/2018 3:14 AM, Jianfeng Tan wrote:
> v3->v4:
>    - Drop the patch 3 on vfio communication (postponed).
>    - Change names from rte_eal_mp_* -> rte_mp_* as suggested by Thomas.
>    - Add nb_sent and nb_received in struct rte_mp_reply.
>    - Standardize the return value of sendmsg, request, reply: 0 on success,
>      (-1) on failure.
>    - If we find a peer error when sending a msg in the primary, we try to
>      remove the secondary socket; as there is no sync mechanism there
>      (we cannot flock a socket file like a regular file), we use a more
>      complex socket name (with the TSC in it).
>    - Some other small changes.
>
> v2->v3:
>    - Add pre-checks for each API.
>    - Remove the limitation of 8 secondary processes: discard the original
>      register/unregister mechanism of secondary processes; instead, the primary
>      discovers secondary processes by scanning the folder for pattern matches.
>    - The previous implementation used two sockets for msg and request; this version
>      uses just one socket and receives all kinds of messages in the mp thread.
>
> v1->v2: (Address comments from Anatoly and Konstantin)
>    - Use datagram unix socket to supersede stream unix socket + epoll.
>    - Change the secondary add/del mechanism as now we use connection-less channel.
>    - Add mp_mutex_action to sync action register/unregister/reference.
>    - Limit max length of action name to 64B.
>    - New APIs for synchronous communication: rte_eal_mp_request/rte_eal_mp_reply.
>    - Formalize the errno handle.
>    - Some other small issues.
>
> This patchset adds a generic channel for multi-process (primary/secondary)
> communication.
>
> Patch 1: address the purpose and how-to;
> Patch 2: add a synchronous way for the requests which need an immediate response.
>
> Jianfeng Tan (2):
>    eal: add synchronous multi-process communication
>    vfio: use the generic multi-process channel
>
>   doc/guides/rel_notes/release_18_02.rst         |   2 +
>   lib/librte_eal/common/eal_common_proc.c        | 254 +++++++++++++--
>   lib/librte_eal/common/include/rte_eal.h        |  58 +++-
>   lib/librte_eal/linuxapp/eal/eal.c              |  14 +-
>   lib/librte_eal/linuxapp/eal/eal_vfio.c         | 169 ++++------
>   lib/librte_eal/linuxapp/eal/eal_vfio.h         |  15 +-
>   lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c | 410 ++++---------------------
>   lib/librte_eal/rte_eal_version.map             |   2 +
>   8 files changed, 429 insertions(+), 495 deletions(-)
>

^ permalink raw reply	[flat|nested] 88+ messages in thread

* [PATCH v5 0/2] generic channel for multi-process communication
  2017-11-30 18:44 [PATCH 0/3] generic channel for multi-process communication Jianfeng Tan
                   ` (6 preceding siblings ...)
  2018-01-25 19:14 ` [PATCH v4 0/2] generic channel for multi-process communication Jianfeng Tan
@ 2018-01-25 19:21 ` Jianfeng Tan
  2018-01-25 19:21   ` [PATCH v5 1/2] eal: add " Jianfeng Tan
                     ` (2 more replies)
  2018-01-26  3:41 ` [PATCH v6 " Jianfeng Tan
  2018-01-30  6:58 ` [PATCH v7 " Jianfeng Tan
  9 siblings, 3 replies; 88+ messages in thread
From: Jianfeng Tan @ 2018-01-25 19:21 UTC (permalink / raw)
  To: dev
  Cc: anatoly.burakov, bruce.richardson, konstantin.ananyev, thomas,
	Jianfeng Tan

v3->v5:
  - Drop the patch 3 on vfio communication (postponed).
  - Change names from rte_eal_mp_* -> rte_mp_* as suggested by Thomas.
  - Add nb_sent and nb_received in struct rte_mp_reply.
  - Standardize the return value of sendmsg, request, reply: 0 on success,
    (-1) on failure.
  - If we find a peer error when sending a msg in the primary, we try to
    remove the secondary socket; as there is no sync mechanism there
    (we cannot flock a socket file like a regular file), we use a more
    complex socket name (with the TSC in it).
  - Some other small changes.

v3->v4:
  - The wrong patches were sent out.

v2->v3:
  - Add pre-checks for each API.
  - Remove the limitation of 8 secondary processes: discard the original
    register/unregister mechanism of secondary processes; instead, the primary
    discovers secondary processes by scanning the folder for pattern matches.
  - The previous implementation used two sockets for msg and request; this version
    uses just one socket and receives all kinds of messages in the mp thread.

v1->v2: (Address comments from Anatoly and Konstantin)
  - Use datagram unix socket to supersede stream unix socket + epoll.
  - Change the secondary add/del mechanism as now we use connection-less channel.
  - Add mp_mutex_action to sync action register/unregister/reference.
  - Limit max length of action name to 64B.
  - New APIs for synchronous communication: rte_eal_mp_request/rte_eal_mp_reply.
  - Formalize the errno handle.
  - Some other small issues.

This patchset adds a generic channel for multi-process (primary/secondary)
communication.

Patch 1: address the purpose and how-to;
Patch 2: add a synchronous way for the requests which need an immediate response.


Jianfeng Tan (2):
  eal: add channel for multi-process communication
  eal: add synchronous multi-process communication

 doc/guides/rel_notes/release_18_02.rst  |  11 +
 lib/librte_eal/bsdapp/eal/eal.c         |  10 +-
 lib/librte_eal/common/eal_common_proc.c | 655 +++++++++++++++++++++++++++++++-
 lib/librte_eal/common/eal_filesystem.h  |  19 +-
 lib/librte_eal/common/eal_private.h     |  12 +-
 lib/librte_eal/common/include/rte_eal.h | 133 ++++++-
 lib/librte_eal/linuxapp/eal/eal.c       |  10 +-
 lib/librte_eal/rte_eal_version.map      |   5 +
 8 files changed, 848 insertions(+), 7 deletions(-)

-- 
2.7.4

^ permalink raw reply	[flat|nested] 88+ messages in thread

* [PATCH v5 1/2] eal: add channel for multi-process communication
  2018-01-25 19:21 ` [PATCH v5 " Jianfeng Tan
@ 2018-01-25 19:21   ` Jianfeng Tan
  2018-01-25 19:21   ` [PATCH v5 2/2] eal: add synchronous " Jianfeng Tan
  2018-01-25 21:23   ` [PATCH v5 0/2] generic channel for " Thomas Monjalon
  2 siblings, 0 replies; 88+ messages in thread
From: Jianfeng Tan @ 2018-01-25 19:21 UTC (permalink / raw)
  To: dev
  Cc: anatoly.burakov, bruce.richardson, konstantin.ananyev, thomas,
	Jianfeng Tan

Previously, there were three channels for multi-process
(i.e., primary/secondary) communication.
  1. Config-file based channel, in which, the primary process writes
     info into a pre-defined config file, and the secondary process
     reads the info out.
  2. vfio submodule has its own channel based on unix socket for the
     secondary process to get container fd and group fd from the
     primary process.
  3. pdump submodule also has its own channel based on unix socket for
     packet dump.

It'd be good to have a generic communication channel for multi-process
communication to accommodate the requirements including:
  a. Secondary wants to send info to primary, for example, secondary
     would like to send request (about some specific vdev to primary).
  b. Sending info at any time, instead of just initialization time.
  c. Share FDs with the other side, for vdev like vhost, related FDs
     (memory region, kick) should be shared.
  d. A sent message may need the other side to respond immediately.

This patch proposes to create a communication channel, based on datagram
unix socket, for above requirements. Each process will block on a unix
socket waiting for messages from the peers.

Three new APIs are added:

  1. rte_mp_action_register() is used to register an action,
     indexed by a string, when a component at the receiver side would like
     to respond to the messages from the peer process.
  2. rte_mp_action_unregister() is used to unregister the action
     if the calling component no longer wants to respond to the messages.
  3. rte_mp_sendmsg() is used to send a message, and returns
     immediately. If there are n secondary processes, the primary
     process will send n messages.

Suggested-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
Reviewed-by: Anatoly Burakov <anatoly.burakov@intel.com>
Acked-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
---
 doc/guides/rel_notes/release_18_02.rst  |   9 +
 lib/librte_eal/bsdapp/eal/eal.c         |  10 +-
 lib/librte_eal/common/eal_common_proc.c | 439 +++++++++++++++++++++++++++++++-
 lib/librte_eal/common/eal_filesystem.h  |  19 +-
 lib/librte_eal/common/eal_private.h     |  12 +-
 lib/librte_eal/common/include/rte_eal.h |  77 +++++-
 lib/librte_eal/linuxapp/eal/eal.c       |  10 +-
 lib/librte_eal/rte_eal_version.map      |   3 +
 8 files changed, 572 insertions(+), 7 deletions(-)

diff --git a/doc/guides/rel_notes/release_18_02.rst b/doc/guides/rel_notes/release_18_02.rst
index 00b3224..be6ac99 100644
--- a/doc/guides/rel_notes/release_18_02.rst
+++ b/doc/guides/rel_notes/release_18_02.rst
@@ -151,6 +151,15 @@ New Features
   renamed the application from SW PMD specific ``eventdev_pipeline_sw_pmd``
   to PMD agnostic ``eventdev_pipeline``.
 
+* **Added new multi-process communication channel**
+
+  Added a generic channel in EAL for multi-process (primary/secondary) communication.
+  Consumers of this channel need to register an action with an action name to response
+  a message received; the actions will be identified by the action name and executed
+  in the context of a new dedicated thread for this channel. The list of new APIs:
+
+  * ``rte_mp_register`` and ``rte_mp_unregister`` are for action (un)registration.
+  * ``rte_mp_sendmsg`` is for sending a message without blocking for a response.
 
 API Changes
 -----------
diff --git a/lib/librte_eal/bsdapp/eal/eal.c b/lib/librte_eal/bsdapp/eal/eal.c
index 04cbd81..fcc9828 100644
--- a/lib/librte_eal/bsdapp/eal/eal.c
+++ b/lib/librte_eal/bsdapp/eal/eal.c
@@ -1,7 +1,7 @@
 /*-
  *   BSD LICENSE
  *
- *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+ *   Copyright(c) 2010-2018 Intel Corporation. All rights reserved.
  *   Copyright(c) 2014 6WIND S.A.
  *   All rights reserved.
  *
@@ -603,6 +603,14 @@ rte_eal_init(int argc, char **argv)
 
 	rte_config_init();
 
+	if (rte_mp_channel_init() < 0) {
+		rte_eal_init_alert("failed to init mp channel\n");
+		if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+			rte_errno = EFAULT;
+			return -1;
+		}
+	}
+
 	if (rte_eal_memory_init() < 0) {
 		rte_eal_init_alert("Cannot init memory\n");
 		rte_errno = ENOMEM;
diff --git a/lib/librte_eal/common/eal_common_proc.c b/lib/librte_eal/common/eal_common_proc.c
index 40fa982..aea0829 100644
--- a/lib/librte_eal/common/eal_common_proc.c
+++ b/lib/librte_eal/common/eal_common_proc.c
@@ -1,15 +1,51 @@
 /* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2016 Intel Corporation
+ * Copyright(c) 2016-2018 Intel Corporation
  */
 
-#include <stdio.h>
+#include <dirent.h>
+#include <errno.h>
 #include <fcntl.h>
+#include <fnmatch.h>
+#include <inttypes.h>
+#include <libgen.h>
+#include <limits.h>
+#include <pthread.h>
+#include <stdio.h>
 #include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <unistd.h>
+
+#include <rte_common.h>
+#include <rte_cycles.h>
 #include <rte_eal.h>
+#include <rte_errno.h>
+#include <rte_lcore.h>
+#include <rte_log.h>
 
+#include "eal_private.h"
 #include "eal_filesystem.h"
 #include "eal_internal_cfg.h"
 
+static int mp_fd = -1;
+static char mp_filter[PATH_MAX];   /* Filter for secondary process sockets */
+static char mp_dir_path[PATH_MAX]; /* The directory path for all mp sockets */
+static pthread_mutex_t mp_mutex_action = PTHREAD_MUTEX_INITIALIZER;
+
+struct action_entry {
+	TAILQ_ENTRY(action_entry) next;
+	char action_name[RTE_MP_MAX_NAME_LEN];
+	rte_mp_t action;
+};
+
+/** Double linked list of actions. */
+TAILQ_HEAD(action_entry_list, action_entry);
+
+static struct action_entry_list action_entry_list =
+	TAILQ_HEAD_INITIALIZER(action_entry_list);
+
 int
 rte_eal_primary_proc_alive(const char *config_file_path)
 {
@@ -31,3 +67,402 @@ rte_eal_primary_proc_alive(const char *config_file_path)
 
 	return !!ret;
 }
+
+static struct action_entry *
+find_action_entry_by_name(const char *name)
+{
+	struct action_entry *entry;
+
+	TAILQ_FOREACH(entry, &action_entry_list, next) {
+		if (strncmp(entry->action_name, name, RTE_MP_MAX_NAME_LEN) == 0)
+			break;
+	}
+
+	return entry;
+}
+
+static int
+validate_action_name(const char *name)
+{
+	if (name == NULL) {
+		RTE_LOG(ERR, EAL, "Action name cannot be NULL\n");
+		rte_errno = -EINVAL;
+		return -1;
+	}
+	if (strnlen(name, RTE_MP_MAX_NAME_LEN) == 0) {
+		RTE_LOG(ERR, EAL, "Length of action name is zero\n");
+		rte_errno = -EINVAL;
+		return -1;
+	}
+	if (strnlen(name, RTE_MP_MAX_NAME_LEN) == RTE_MP_MAX_NAME_LEN) {
+		rte_errno = -E2BIG;
+		return -1;
+	}
+	return 0;
+}
+
+int
+rte_mp_action_register(const char *name, rte_mp_t action)
+{
+	struct action_entry *entry;
+
+	if (validate_action_name(name))
+		return -1;
+
+	entry = malloc(sizeof(struct action_entry));
+	if (entry == NULL) {
+		rte_errno = -ENOMEM;
+		return -1;
+	}
+	strcpy(entry->action_name, name);
+	entry->action = action;
+
+	pthread_mutex_lock(&mp_mutex_action);
+	if (find_action_entry_by_name(name) != NULL) {
+		pthread_mutex_unlock(&mp_mutex_action);
+		rte_errno = -EEXIST;
+		free(entry);
+		return -1;
+	}
+	TAILQ_INSERT_TAIL(&action_entry_list, entry, next);
+	pthread_mutex_unlock(&mp_mutex_action);
+	return 0;
+}
+
+void
+rte_mp_action_unregister(const char *name)
+{
+	struct action_entry *entry;
+
+	if (validate_action_name(name))
+		return;
+
+	pthread_mutex_lock(&mp_mutex_action);
+	entry = find_action_entry_by_name(name);
+	if (entry == NULL) {
+		pthread_mutex_unlock(&mp_mutex_action);
+		return;
+	}
+	TAILQ_REMOVE(&action_entry_list, entry, next);
+	pthread_mutex_unlock(&mp_mutex_action);
+	free(entry);
+}
+
+static int
+read_msg(struct rte_mp_msg *msg)
+{
+	int msglen;
+	struct iovec iov;
+	struct msghdr msgh;
+	char control[CMSG_SPACE(sizeof(msg->fds))];
+	struct cmsghdr *cmsg;
+	int buflen = sizeof(*msg) - sizeof(msg->fds);
+
+	memset(&msgh, 0, sizeof(msgh));
+	iov.iov_base = msg;
+	iov.iov_len  = buflen;
+
+	msgh.msg_iov = &iov;
+	msgh.msg_iovlen = 1;
+	msgh.msg_control = control;
+	msgh.msg_controllen = sizeof(control);
+
+	msglen = recvmsg(mp_fd, &msgh, 0);
+	if (msglen < 0) {
+		RTE_LOG(ERR, EAL, "recvmsg failed, %s\n", strerror(errno));
+		return -1;
+	}
+
+	if (msglen != buflen || (msgh.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
+		RTE_LOG(ERR, EAL, "truncted msg\n");
+		return -1;
+	}
+
+	/* read auxiliary FDs if any */
+	for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL;
+		cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
+		if ((cmsg->cmsg_level == SOL_SOCKET) &&
+			(cmsg->cmsg_type == SCM_RIGHTS)) {
+			memcpy(msg->fds, CMSG_DATA(cmsg), sizeof(msg->fds));
+			break;
+		}
+	}
+
+	return 0;
+}
+
+static void
+process_msg(struct rte_mp_msg *msg)
+{
+	struct action_entry *entry;
+	rte_mp_t action = NULL;
+
+	RTE_LOG(DEBUG, EAL, "msg: %s\n", msg->name);
+	pthread_mutex_lock(&mp_mutex_action);
+	entry = find_action_entry_by_name(msg->name);
+	if (entry != NULL)
+		action = entry->action;
+	pthread_mutex_unlock(&mp_mutex_action);
+
+	if (!action)
+		RTE_LOG(ERR, EAL, "Cannot find action: %s\n", msg->name);
+	else if (action(msg) < 0)
+		RTE_LOG(ERR, EAL, "Fail to handle message: %s\n", msg->name);
+}
+
+static void *
+mp_handle(void *arg __rte_unused)
+{
+	struct rte_mp_msg msg;
+
+	while (1) {
+		if (read_msg(&msg) == 0)
+			process_msg(&msg);
+	}
+
+	return NULL;
+}
+
+static int
+open_socket_fd(void)
+{
+	struct sockaddr_un un;
+	const char *prefix = eal_mp_socket_path();
+
+	mp_fd = socket(AF_UNIX, SOCK_DGRAM, 0);
+	if (mp_fd < 0) {
+		RTE_LOG(ERR, EAL, "failed to create unix socket\n");
+		return -1;
+	}
+
+	memset(&un, 0, sizeof(un));
+	un.sun_family = AF_UNIX;
+	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+		snprintf(un.sun_path, sizeof(un.sun_path), "%s", prefix);
+	else {
+		snprintf(un.sun_path, sizeof(un.sun_path), "%s_%d_%"PRIx64,
+			 prefix, getpid(), rte_rdtsc());
+	}
+	unlink(un.sun_path); /* May still exist since last run */
+	if (bind(mp_fd, (struct sockaddr *)&un, sizeof(un)) < 0) {
+		RTE_LOG(ERR, EAL, "failed to bind %s: %s\n",
+			un.sun_path, strerror(errno));
+		close(mp_fd);
+		return -1;
+	}
+
+	RTE_LOG(INFO, EAL, "Multi-process socket %s\n", un.sun_path);
+	return mp_fd;
+}
+
+static int
+unlink_sockets(const char *filter)
+{
+	int dir_fd;
+	DIR *mp_dir;
+	struct dirent *ent;
+
+	mp_dir = opendir(mp_dir_path);
+	if (!mp_dir) {
+		RTE_LOG(ERR, EAL, "Unable to open directory %s\n", mp_dir_path);
+		return -1;
+	}
+	dir_fd = dirfd(mp_dir);
+
+	while ((ent = readdir(mp_dir))) {
+		if (fnmatch(filter, ent->d_name, 0) == 0)
+			unlinkat(dir_fd, ent->d_name, 0);
+	}
+
+	closedir(mp_dir);
+	return 0;
+}
+
+static void
+unlink_socket_by_path(const char *path)
+{
+	char *filename;
+	char *fullpath = strdup(path);
+
+	if (!fullpath)
+		return;
+	filename = basename(fullpath);
+	unlink_sockets(filename);
+	free(fullpath);
+	RTE_LOG(INFO, EAL, "Remove socket %s\n", path);
+}
+
+int
+rte_mp_channel_init(void)
+{
+	char thread_name[RTE_MAX_THREAD_NAME_LEN];
+	char *path;
+	pthread_t tid;
+
+	snprintf(mp_filter, PATH_MAX, ".%s_unix_*",
+		 internal_config.hugefile_prefix);
+
+	path = strdup(eal_mp_socket_path());
+	snprintf(mp_dir_path, PATH_MAX, "%s", dirname(path));
+	free(path);
+
+	if (rte_eal_process_type() == RTE_PROC_PRIMARY &&
+	    unlink_sockets(mp_filter)) {
+		RTE_LOG(ERR, EAL, "failed to unlink mp sockets\n");
+		return -1;
+	}
+
+	if (open_socket_fd() < 0)
+		return -1;
+
+	if (pthread_create(&tid, NULL, mp_handle, NULL) < 0) {
+		RTE_LOG(ERR, EAL, "failed to create mp thead: %s\n",
+			strerror(errno));
+		close(mp_fd);
+		mp_fd = -1;
+		return -1;
+	}
+
+	/* try best to set thread name */
+	snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "rte_mp_handle");
+	rte_thread_setname(tid, thread_name);
+	return 0;
+}
+
+/**
+ * Return -1, as fail to send message and it's caused by the local side.
+ * Return 0, as fail to send message and it's caused by the remote side.
+ * Return 1, as succeed to send message.
+ *
+ */
+static int
+send_msg(const char *dst_path, struct rte_mp_msg *msg)
+{
+	int snd;
+	struct iovec iov;
+	struct msghdr msgh;
+	struct cmsghdr *cmsg;
+	struct sockaddr_un dst;
+	int fd_size = msg->num_fds * sizeof(int);
+	char control[CMSG_SPACE(fd_size)];
+
+	memset(&dst, 0, sizeof(dst));
+	dst.sun_family = AF_UNIX;
+	snprintf(dst.sun_path, sizeof(dst.sun_path), "%s", dst_path);
+
+	memset(&msgh, 0, sizeof(msgh));
+	memset(control, 0, sizeof(control));
+
+	iov.iov_base = msg;
+	iov.iov_len = sizeof(*msg) - sizeof(msg->fds);
+
+	msgh.msg_name = &dst;
+	msgh.msg_namelen = sizeof(dst);
+	msgh.msg_iov = &iov;
+	msgh.msg_iovlen = 1;
+	msgh.msg_control = control;
+	msgh.msg_controllen = sizeof(control);
+
+	cmsg = CMSG_FIRSTHDR(&msgh);
+	cmsg->cmsg_len = CMSG_LEN(fd_size);
+	cmsg->cmsg_level = SOL_SOCKET;
+	cmsg->cmsg_type = SCM_RIGHTS;
+	memcpy(CMSG_DATA(cmsg), msg->fds, fd_size);
+
+	do {
+		snd = sendmsg(mp_fd, &msgh, 0);
+	} while (snd < 0 && errno == EINTR);
+
+	if (snd < 0) {
+		rte_errno = errno;
+		/* Check if it caused by peer process exits */
+		if (errno == -ECONNREFUSED) {
+			/* We don't unlink the primary's socket here */
+			if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+				unlink_socket_by_path(dst_path);
+			return 0;
+		}
+		if (errno == -ENOBUFS) {
+			RTE_LOG(ERR, EAL, "Peer cannot receive message %s\n",
+				dst_path);
+			return 0;
+		}
+		RTE_LOG(ERR, EAL, "failed to send to (%s) due to %s\n",
+			dst_path, strerror(errno));
+		return -1;
+	}
+
+	return 1;
+}
+
+static int
+mp_send(struct rte_mp_msg *msg)
+{
+	int ret = 0;
+	DIR *mp_dir;
+	struct dirent *ent;
+
+	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
+		if (send_msg(eal_mp_socket_path(), msg) < 0)
+			return -1;
+		else
+			return 0;
+	}
+
+	/* broadcast to all secondary processes */
+	mp_dir = opendir(mp_dir_path);
+	if (!mp_dir) {
+		RTE_LOG(ERR, EAL, "Unable to open directory %s\n",
+				mp_dir_path);
+		rte_errno = errno;
+		return -1;
+	}
+	while ((ent = readdir(mp_dir))) {
+		if (fnmatch(mp_filter, ent->d_name, 0) != 0)
+			continue;
+
+		if (send_msg(ent->d_name, msg) < 0)
+			ret = -1;
+	}
+	closedir(mp_dir);
+
+	return ret;
+}
+
+static bool
+check_input(const struct rte_mp_msg *msg)
+{
+	if (msg == NULL) {
+		RTE_LOG(ERR, EAL, "Msg cannot be NULL\n");
+		rte_errno = -EINVAL;
+		return false;
+	}
+
+	if (validate_action_name(msg->name))
+		return false;
+
+	if (msg->len_param > RTE_MP_MAX_PARAM_LEN) {
+		RTE_LOG(ERR, EAL, "Message data is too long\n");
+		rte_errno = -E2BIG;
+		return false;
+	}
+
+	if (msg->num_fds > RTE_MP_MAX_FD_NUM) {
+		RTE_LOG(ERR, EAL, "Cannot send more than %d FDs\n",
+			RTE_MP_MAX_FD_NUM);
+		rte_errno = -E2BIG;
+		return false;
+	}
+
+	return true;
+}
+
+int
+rte_mp_sendmsg(struct rte_mp_msg *msg)
+{
+	if (!check_input(msg))
+		return -1;
+
+	RTE_LOG(DEBUG, EAL, "sendmsg: %s\n", msg->name);
+	return mp_send(msg);
+}
diff --git a/lib/librte_eal/common/eal_filesystem.h b/lib/librte_eal/common/eal_filesystem.h
index e8959eb..4708dd5 100644
--- a/lib/librte_eal/common/eal_filesystem.h
+++ b/lib/librte_eal/common/eal_filesystem.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2014 Intel Corporation
+ * Copyright(c) 2010-2018 Intel Corporation
  */
 
 /**
@@ -38,6 +38,23 @@ eal_runtime_config_path(void)
 	return buffer;
 }
 
+/** Path of primary/secondary communication unix socket file. */
+#define MP_SOCKET_PATH_FMT "%s/.%s_unix"
+static inline const char *
+eal_mp_socket_path(void)
+{
+	static char buffer[PATH_MAX]; /* static so auto-zeroed */
+	const char *directory = default_config_dir;
+	const char *home_dir = getenv("HOME");
+
+	if (getuid() != 0 && home_dir != NULL)
+		directory = home_dir;
+	snprintf(buffer, sizeof(buffer) - 1, MP_SOCKET_PATH_FMT,
+		 directory, internal_config.hugefile_prefix);
+
+	return buffer;
+}
+
 /** Path of hugepage info file. */
 #define HUGEPAGE_INFO_FMT "%s/.%s_hugepage_info"
 
diff --git a/lib/librte_eal/common/eal_private.h b/lib/librte_eal/common/eal_private.h
index c46dd8f..0b28770 100644
--- a/lib/librte_eal/common/eal_private.h
+++ b/lib/librte_eal/common/eal_private.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2014 Intel Corporation
+ * Copyright(c) 2010-2018 Intel Corporation
  */
 
 #ifndef _EAL_PRIVATE_H_
@@ -195,4 +195,14 @@ int rte_eal_hugepage_attach(void);
  */
 struct rte_bus *rte_bus_find_by_device_name(const char *str);
 
+/**
+ * Create the unix channel for primary/secondary communication.
+ *
+ * @return
+ *   0 on success;
+ *   (<0) on failure.
+ */
+
+int rte_mp_channel_init(void);
+
 #endif /* _EAL_PRIVATE_H_ */
diff --git a/lib/librte_eal/common/include/rte_eal.h b/lib/librte_eal/common/include/rte_eal.h
index 2aba2c8..1d42e9c 100644
--- a/lib/librte_eal/common/include/rte_eal.h
+++ b/lib/librte_eal/common/include/rte_eal.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2016 Intel Corporation
+ * Copyright(c) 2010-2018 Intel Corporation
  */
 
 #ifndef _RTE_EAL_H_
@@ -186,6 +186,81 @@ int rte_eal_init(int argc, char **argv);
  */
 int rte_eal_primary_proc_alive(const char *config_file_path);
 
+#define RTE_MP_MAX_FD_NUM	8    /* The max amount of fds */
+#define RTE_MP_MAX_NAME_LEN	64   /* The max length of action name */
+#define RTE_MP_MAX_PARAM_LEN	256  /* The max length of param */
+struct rte_mp_msg {
+	char name[RTE_MP_MAX_NAME_LEN];
+	int len_param;
+	int num_fds;
+	uint8_t param[RTE_MP_MAX_PARAM_LEN];
+	int fds[RTE_MP_MAX_FD_NUM];
+};
+
+/**
+ * Action function typedef used by other components.
+ *
+ * As we create  socket channel for primary/secondary communication, use
+ * this function typedef to register action for coming messages.
+ */
+typedef int (*rte_mp_t)(const struct rte_mp_msg *msg);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Register an action function for primary/secondary communication.
+ *
+ * Call this function to register an action, if the calling component wants
+ * to response the messages from the corresponding component in its primary
+ * process or secondary processes.
+ *
+ * @param name
+ *   The name argument plays as the nonredundant key to find the action.
+ *
+ * @param action
+ *   The action argument is the function pointer to the action function.
+ *
+ * @return
+ *  - 0 on success.
+ *  - (<0) on failure.
+ */
+int rte_mp_action_register(const char *name, rte_mp_t action);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Unregister an action function for primary/secondary communication.
+ *
+ * Call this function to unregister an action  if the calling component does
+ * not want to response the messages from the corresponding component in its
+ * primary process or secondary processes.
+ *
+ * @param name
+ *   The name argument plays as the nonredundant key to find the action.
+ *
+ */
+void rte_mp_action_unregister(const char *name);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Send a message to the peer process.
+ *
+ * This function will send a message which will be responsed by the action
+ * identified by name in the peer process.
+ *
+ * @param msg
+ *   The msg argument contains the customized message.
+ *
+ * @return
+ *  - On success, return 0.
+ *  - On failure, return -1, and the reason will be stored in rte_errno.
+ */
+int rte_mp_sendmsg(struct rte_mp_msg *msg);
+
 /**
  * Usage function typedef used by the application usage function.
  *
diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index 229eec9..53e29e4 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -1,7 +1,7 @@
 /*-
  *   BSD LICENSE
  *
- *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+ *   Copyright(c) 2010-2018 Intel Corporation. All rights reserved.
  *   Copyright(c) 2012-2014 6WIND S.A.
  *   All rights reserved.
  *
@@ -852,6 +852,14 @@ rte_eal_init(int argc, char **argv)
 		return -1;
 	}
 
+	if (rte_mp_channel_init() < 0) {
+		rte_eal_init_alert("failed to init mp channel\n");
+		if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+			rte_errno = EFAULT;
+			return -1;
+		}
+	}
+
 #ifdef VFIO_PRESENT
 	if (rte_eal_vfio_setup() < 0) {
 		rte_eal_init_alert("Cannot init VFIO\n");
diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
index 7088b72..adeadfb 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -217,6 +217,9 @@ EXPERIMENTAL {
 	rte_eal_devargs_remove;
 	rte_eal_hotplug_add;
 	rte_eal_hotplug_remove;
+	rte_eal_mp_action_register;
+	rte_eal_mp_action_unregister;
+	rte_eal_mp_sendmsg;
 	rte_service_attr_get;
 	rte_service_attr_reset_all;
 	rte_service_component_register;
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 88+ messages in thread

* [PATCH v5 2/2] eal: add synchronous multi-process communication
  2018-01-25 19:21 ` [PATCH v5 " Jianfeng Tan
  2018-01-25 19:21   ` [PATCH v5 1/2] eal: add " Jianfeng Tan
@ 2018-01-25 19:21   ` Jianfeng Tan
  2018-01-25 21:23   ` [PATCH v5 0/2] generic channel for " Thomas Monjalon
  2 siblings, 0 replies; 88+ messages in thread
From: Jianfeng Tan @ 2018-01-25 19:21 UTC (permalink / raw)
  To: dev
  Cc: anatoly.burakov, bruce.richardson, konstantin.ananyev, thomas,
	Jianfeng Tan

We need a synchronous way for multi-process communication,
i.e., blocking while waiting for a reply message after we send a request
to the peer process.

We add two APIs, rte_mp_request() and rte_mp_reply(), for
this use case. By invoking rte_mp_request(), a request message
is sent out, and the caller then waits for a reply message. The caller
can specify the timeout. The response messages will be collected
and returned so that the caller can decide how to interpret them.

The API rte_mp_reply() is always called by an mp action handler.
Here we add another parameter for rte_mp_t so that the action
handler knows which peer address to reply to.

       sender-process                receiver-process
   ----------------------            ----------------

    thread-n
     |_rte_mp_request() --------------> mp-thread
        |_timedwait()                    |_process_msg()
                                           |_action()
                                               |_rte_mp_reply()
                mp-thread  <---------------------|
                  |_process_msg()
                     |_signal(send_thread)
    thread-m <----------|
     |_collect-reply

 * A secondary process is only allowed to talk to the primary process.
 * If there are multiple secondary processes for the primary process,
   it will send request to peer1, collect response from peer1; then
   send request to peer2, collect response from peer2, and so on.
 * When thread-n is sending request, thread-m of that process can send
   request at the same time.
 * For each pair <action_name, peer>, we guarantee that only one such request
   is in flight.

Suggested-by: Anatoly Burakov <anatoly.burakov@intel.com>
Suggested-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
Reviewed-by: Anatoly Burakov <anatoly.burakov@intel.com>
Acked-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
---
 doc/guides/rel_notes/release_18_02.rst  |   2 +
 lib/librte_eal/common/eal_common_proc.c | 254 +++++++++++++++++++++++++++++---
 lib/librte_eal/common/include/rte_eal.h |  58 +++++++-
 lib/librte_eal/rte_eal_version.map      |   2 +
 4 files changed, 296 insertions(+), 20 deletions(-)

diff --git a/doc/guides/rel_notes/release_18_02.rst b/doc/guides/rel_notes/release_18_02.rst
index be6ac99..39425a4 100644
--- a/doc/guides/rel_notes/release_18_02.rst
+++ b/doc/guides/rel_notes/release_18_02.rst
@@ -160,6 +160,8 @@ New Features
 
   * ``rte_mp_register`` and ``rte_mp_unregister`` are for action (un)registration.
   * ``rte_mp_sendmsg`` is for sending a message without blocking for a response.
+  * ``rte_mp_request`` is for sending a request message and will block until
+    it gets a reply message which is sent from the peer by ``rte_mp_reply``.
 
 API Changes
 -----------
diff --git a/lib/librte_eal/common/eal_common_proc.c b/lib/librte_eal/common/eal_common_proc.c
index aea0829..6ad73f5 100644
--- a/lib/librte_eal/common/eal_common_proc.c
+++ b/lib/librte_eal/common/eal_common_proc.c
@@ -13,6 +13,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <sys/time.h>
 #include <sys/types.h>
 #include <sys/socket.h>
 #include <sys/un.h>
@@ -46,6 +47,50 @@ TAILQ_HEAD(action_entry_list, action_entry);
 static struct action_entry_list action_entry_list =
 	TAILQ_HEAD_INITIALIZER(action_entry_list);
 
+enum mp_type {
+	MP_MSG, /* Share message with peers, will not block */
+	MP_REQ, /* Request for information, Will block for a reply */
+	MP_REP, /* Response to previously-received request */
+};
+
+struct mp_msg_internal {
+	int type;
+	struct rte_mp_msg msg;
+};
+
+struct sync_request {
+	TAILQ_ENTRY(sync_request) next;
+	int reply_received;
+	char dst[PATH_MAX];
+	struct rte_mp_msg *request;
+	struct rte_mp_msg *reply;
+	pthread_cond_t cond;
+};
+
+TAILQ_HEAD(sync_request_list, sync_request);
+
+static struct {
+	struct sync_request_list requests;
+	pthread_mutex_t lock;
+} sync_requests = {
+	.requests = TAILQ_HEAD_INITIALIZER(sync_requests.requests),
+	.lock = PTHREAD_MUTEX_INITIALIZER
+};
+
+static struct sync_request *
+find_sync_request(const char *dst, const char *act_name)
+{
+	struct sync_request *r;
+
+	TAILQ_FOREACH(r, &sync_requests.requests, next) {
+		if (!strcmp(r->dst, dst) &&
+		    !strcmp(r->request->name, act_name))
+			break;
+	}
+
+	return r;
+}
+
 int
 rte_eal_primary_proc_alive(const char *config_file_path)
 {
@@ -149,19 +194,21 @@ rte_mp_action_unregister(const char *name)
 }
 
 static int
-read_msg(struct rte_mp_msg *msg)
+read_msg(struct mp_msg_internal *m, struct sockaddr_un *s)
 {
 	int msglen;
 	struct iovec iov;
 	struct msghdr msgh;
-	char control[CMSG_SPACE(sizeof(msg->fds))];
+	char control[CMSG_SPACE(sizeof(m->msg.fds))];
 	struct cmsghdr *cmsg;
-	int buflen = sizeof(*msg) - sizeof(msg->fds);
+	int buflen = sizeof(*m) - sizeof(m->msg.fds);
 
 	memset(&msgh, 0, sizeof(msgh));
-	iov.iov_base = msg;
+	iov.iov_base = m;
 	iov.iov_len  = buflen;
 
+	msgh.msg_name = s;
+	msgh.msg_namelen = sizeof(*s);
 	msgh.msg_iov = &iov;
 	msgh.msg_iovlen = 1;
 	msgh.msg_control = control;
@@ -183,7 +230,7 @@ read_msg(struct rte_mp_msg *msg)
 		cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
 		if ((cmsg->cmsg_level == SOL_SOCKET) &&
 			(cmsg->cmsg_type == SCM_RIGHTS)) {
-			memcpy(msg->fds, CMSG_DATA(cmsg), sizeof(msg->fds));
+			memcpy(m->msg.fds, CMSG_DATA(cmsg), sizeof(m->msg.fds));
 			break;
 		}
 	}
@@ -192,12 +239,28 @@ read_msg(struct rte_mp_msg *msg)
 }
 
 static void
-process_msg(struct rte_mp_msg *msg)
+process_msg(struct mp_msg_internal *m, struct sockaddr_un *s)
 {
+	struct sync_request *sync_req;
 	struct action_entry *entry;
+	struct rte_mp_msg *msg = &m->msg;
 	rte_mp_t action = NULL;
 
 	RTE_LOG(DEBUG, EAL, "msg: %s\n", msg->name);
+
+	if (m->type == MP_REP) {
+		pthread_mutex_lock(&sync_requests.lock);
+		sync_req = find_sync_request(s->sun_path, msg->name);
+		if (sync_req) {
+			memcpy(sync_req->reply, msg, sizeof(*msg));
+			sync_req->reply_received = 1;
+			pthread_cond_signal(&sync_req->cond);
+		} else
+			RTE_LOG(ERR, EAL, "Drop mp reply: %s\n", msg->name);
+		pthread_mutex_unlock(&sync_requests.lock);
+		return;
+	}
+
 	pthread_mutex_lock(&mp_mutex_action);
 	entry = find_action_entry_by_name(msg->name);
 	if (entry != NULL)
@@ -206,18 +269,19 @@ process_msg(struct rte_mp_msg *msg)
 
 	if (!action)
 		RTE_LOG(ERR, EAL, "Cannot find action: %s\n", msg->name);
-	else if (action(msg) < 0)
+	else if (action(msg, s->sun_path) < 0)
 		RTE_LOG(ERR, EAL, "Fail to handle message: %s\n", msg->name);
 }
 
 static void *
 mp_handle(void *arg __rte_unused)
 {
-	struct rte_mp_msg msg;
+	struct mp_msg_internal msg;
+	struct sockaddr_un sa;
 
 	while (1) {
-		if (read_msg(&msg) == 0)
-			process_msg(&msg);
+		if (read_msg(&msg, &sa) == 0)
+			process_msg(&msg, &sa);
 	}
 
 	return NULL;
@@ -336,16 +400,20 @@ rte_mp_channel_init(void)
  *
  */
 static int
-send_msg(const char *dst_path, struct rte_mp_msg *msg)
+send_msg(const char *dst_path, struct rte_mp_msg *msg, int type)
 {
 	int snd;
 	struct iovec iov;
 	struct msghdr msgh;
 	struct cmsghdr *cmsg;
 	struct sockaddr_un dst;
+	struct mp_msg_internal m;
 	int fd_size = msg->num_fds * sizeof(int);
 	char control[CMSG_SPACE(fd_size)];
 
+	m.type = type;
+	memcpy(&m.msg, msg, sizeof(*msg));
+
 	memset(&dst, 0, sizeof(dst));
 	dst.sun_family = AF_UNIX;
 	snprintf(dst.sun_path, sizeof(dst.sun_path), "%s", dst_path);
@@ -353,8 +421,8 @@ send_msg(const char *dst_path, struct rte_mp_msg *msg)
 	memset(&msgh, 0, sizeof(msgh));
 	memset(control, 0, sizeof(control));
 
-	iov.iov_base = msg;
-	iov.iov_len = sizeof(*msg) - sizeof(msg->fds);
+	iov.iov_base = &m;
+	iov.iov_len = sizeof(m) - sizeof(msg->fds);
 
 	msgh.msg_name = &dst;
 	msgh.msg_namelen = sizeof(dst);
@@ -396,14 +464,17 @@ send_msg(const char *dst_path, struct rte_mp_msg *msg)
 }
 
 static int
-mp_send(struct rte_mp_msg *msg)
+mp_send(struct rte_mp_msg *msg, const char *peer, int type)
 {
 	int ret = 0;
 	DIR *mp_dir;
 	struct dirent *ent;
 
-	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
-		if (send_msg(eal_mp_socket_path(), msg) < 0)
+	if (!peer && (rte_eal_process_type() == RTE_PROC_SECONDARY))
+		peer = eal_mp_socket_path();
+
+	if (peer) {
+		if (send_msg(peer, msg, type) < 0)
 			return -1;
 		else
 			return 0;
@@ -421,11 +492,11 @@ mp_send(struct rte_mp_msg *msg)
 		if (fnmatch(mp_filter, ent->d_name, 0) != 0)
 			continue;
 
-		if (send_msg(ent->d_name, msg) < 0)
+		if (send_msg(ent->d_name, msg, type) < 0)
 			ret = -1;
 	}
-	closedir(mp_dir);
 
+	closedir(mp_dir);
 	return ret;
 }
 
@@ -464,5 +535,150 @@ rte_mp_sendmsg(struct rte_mp_msg *msg)
 		return -1;
 
 	RTE_LOG(DEBUG, EAL, "sendmsg: %s\n", msg->name);
-	return mp_send(msg);
+	return mp_send(msg, NULL, MP_MSG);
+}
+
+static int
+mp_request_one(const char *dst, struct rte_mp_msg *req,
+	       struct rte_mp_reply *reply, const struct timespec *ts)
+{
+	int ret;
+	struct timeval now;
+	struct rte_mp_msg msg, *tmp;
+	struct sync_request sync_req, *exist;
+
+	sync_req.reply_received = 0;
+	strcpy(sync_req.dst, dst);
+	sync_req.request = req;
+	sync_req.reply = &msg;
+	pthread_cond_init(&sync_req.cond, NULL);
+
+	pthread_mutex_lock(&sync_requests.lock);
+	exist = find_sync_request(dst, req->name);
+	if (!exist)
+		TAILQ_INSERT_TAIL(&sync_requests.requests, &sync_req, next);
+	pthread_mutex_unlock(&sync_requests.lock);
+	if (exist) {
+		RTE_LOG(ERR, EAL, "A pending request %s:%s\n", dst, req->name);
+		rte_errno = -EEXIST;
+		return -1;
+	}
+
+	ret = send_msg(dst, req, MP_REQ);
+	if (ret < 0) {
+		RTE_LOG(ERR, EAL, "Fail to send request %s:%s\n",
+			dst, req->name);
+		return -1;
+	} else if (ret == 0)
+		return 0;
+
+	reply->nb_sent++;
+
+	pthread_mutex_lock(&sync_requests.lock);
+	do {
+		pthread_cond_timedwait(&sync_req.cond, &sync_requests.lock, ts);
+		/* Check spurious wakeups */
+		if (sync_req.reply_received == 1)
+			break;
+		/* Check if time is out */
+		if (gettimeofday(&now, NULL) < 0)
+			break;
+		if (now.tv_sec < ts->tv_sec)
+			break;
+		else if (now.tv_sec == ts->tv_sec &&
+			 now.tv_usec * 1000 < ts->tv_nsec)
+			break;
+	} while (1);
+	/* We got the lock now */
+	TAILQ_REMOVE(&sync_requests.requests, &sync_req, next);
+	pthread_mutex_unlock(&sync_requests.lock);
+
+	if (sync_req.reply_received == 0) {
+		RTE_LOG(ERR, EAL, "Fail to recv reply for request %s:%s\n",
+			dst, req->name);
+		rte_errno = -ETIMEDOUT;
+		return -1;
+	}
+
+	tmp = realloc(reply->msgs, sizeof(msg) * (reply->nb_received + 1));
+	if (!tmp) {
+		RTE_LOG(ERR, EAL, "Fail to alloc reply for request %s:%s\n",
+			dst, req->name);
+		rte_errno = -ENOMEM;
+		return -1;
+	}
+	memcpy(&tmp[reply->nb_received], &msg, sizeof(msg));
+	reply->msgs = tmp;
+	reply->nb_received++;
+	return 0;
+}
+
+int
+rte_mp_request(struct rte_mp_msg *req, struct rte_mp_reply *reply,
+		const struct timespec *ts)
+{
+	int ret = 0;
+	DIR *mp_dir;
+	struct dirent *ent;
+	struct timeval now;
+	struct timespec end;
+
+	RTE_LOG(DEBUG, EAL, "request: %s\n", req->name);
+
+	if (check_input(req) == false)
+		return -1;
+	if (gettimeofday(&now, NULL) < 0) {
+		RTE_LOG(ERR, EAL, "Faile to get current time\n");
+		rte_errno = errno;
+		return -1;
+	}
+
+	end.tv_nsec = (now.tv_usec * 1000 + ts->tv_nsec) % 1000000000;
+	end.tv_sec = now.tv_sec + ts->tv_sec +
+			(now.tv_usec * 1000 + ts->tv_nsec) / 1000000000;
+
+	reply->nb_sent = 0;
+	reply->nb_received = 0;
+	reply->msgs = NULL;
+
+	/* for secondary process, send request to the primary process only */
+	if (rte_eal_process_type() == RTE_PROC_SECONDARY)
+		return mp_request_one(eal_mp_socket_path(), req, reply, &end);
+
+	/* for primary process, broadcast request, and collect reply 1 by 1 */
+	mp_dir = opendir(mp_dir_path);
+	if (!mp_dir) {
+		RTE_LOG(ERR, EAL, "Unable to open directory %s\n", mp_dir_path);
+		rte_errno = errno;
+		return -1;
+	}
+
+	while ((ent = readdir(mp_dir))) {
+		if (fnmatch(mp_filter, ent->d_name, 0) != 0)
+			continue;
+
+		if (mp_request_one(ent->d_name, req, reply, &end))
+			ret = -1;
+	}
+
+	closedir(mp_dir);
+	return ret;
+}
+
+int
+rte_mp_reply(struct rte_mp_msg *msg, const char *peer)
+{
+
+	RTE_LOG(DEBUG, EAL, "reply: %s\n", msg->name);
+
+	if (check_input(msg) == false)
+		return -1;
+
+	if (peer == NULL) {
+		RTE_LOG(ERR, EAL, "peer is not specified\n");
+		rte_errno = -EINVAL;
+		return -1;
+	}
+
+	return mp_send(msg, peer, MP_REP);
 }
diff --git a/lib/librte_eal/common/include/rte_eal.h b/lib/librte_eal/common/include/rte_eal.h
index 1d42e9c..9207ad9 100644
--- a/lib/librte_eal/common/include/rte_eal.h
+++ b/lib/librte_eal/common/include/rte_eal.h
@@ -13,6 +13,7 @@
 
 #include <stdint.h>
 #include <sched.h>
+#include <time.h>
 
 #include <rte_config.h>
 #include <rte_per_lcore.h>
@@ -197,13 +198,19 @@ struct rte_mp_msg {
 	int fds[RTE_MP_MAX_FD_NUM];
 };
 
+struct rte_mp_reply {
+	int nb_sent;
+	int nb_received;
+	struct rte_mp_msg *msgs; /* caller to free */
+};
+
 /**
  * Action function typedef used by other components.
  *
  * As we create  socket channel for primary/secondary communication, use
  * this function typedef to register action for coming messages.
  */
-typedef int (*rte_mp_t)(const struct rte_mp_msg *msg);
+typedef int (*rte_mp_t)(const struct rte_mp_msg *msg, const void *peer);
 
 /**
  * @warning
@@ -262,6 +269,55 @@ void rte_mp_action_unregister(const char *name);
 int rte_mp_sendmsg(struct rte_mp_msg *msg);
 
 /**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Send a request to the peer process and expect a reply.
+ *
+ * This function sends a request message to the peer process, and will
+ * block until receiving reply message from the peer process.
+ *
+ * @note The caller is responsible for freeing reply->msgs.
+ *
+ * @param req
+ *   The req argument contains the customized request message.
+ *
+ * @param reply
+ *   The reply argument will be for storing all the replied messages;
+ *   the caller is responsible for freeing reply->msgs.
+ *
+ * @param ts
+ *   The ts argument specifies how long we can wait for the peer(s) to reply.
+ *
+ * @return
+ *  - On success, return 0.
+ *  - On failure, return -1, and the reason will be stored in rte_errno.
+ */
+int rte_mp_request(struct rte_mp_msg *req, struct rte_mp_reply *reply,
+		   const struct timespec *ts);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Send a reply to the peer process.
+ *
+ * This function will send a reply message in response to a request message
+ * received previously.
+ *
+ * @param msg
+ *   The msg argument contains the customized message.
+ *
+ * @param peer
+ *   The peer argument is the pointer to the peer socket path.
+ *
+ * @return
+ *  - On success, return 0.
+ *  - On failure, return -1, and the reason will be stored in rte_errno.
+ */
+int rte_mp_reply(struct rte_mp_msg *msg, const char *peer);
+
+/**
  * Usage function typedef used by the application usage function.
  *
  * Use this function typedef to define and call rte_set_application_usage_hook()
diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
index adeadfb..2cb6b07 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -220,6 +220,8 @@ EXPERIMENTAL {
 	rte_eal_mp_action_register;
 	rte_eal_mp_action_unregister;
 	rte_eal_mp_sendmsg;
+	rte_eal_mp_request;
+	rte_eal_mp_reply;
 	rte_service_attr_get;
 	rte_service_attr_reset_all;
 	rte_service_component_register;
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 88+ messages in thread

* Re: [PATCH v5 0/2] generic channel for multi-process communication
  2018-01-25 19:21 ` [PATCH v5 " Jianfeng Tan
  2018-01-25 19:21   ` [PATCH v5 1/2] eal: add " Jianfeng Tan
  2018-01-25 19:21   ` [PATCH v5 2/2] eal: add synchronous " Jianfeng Tan
@ 2018-01-25 21:23   ` Thomas Monjalon
  2 siblings, 0 replies; 88+ messages in thread
From: Thomas Monjalon @ 2018-01-25 21:23 UTC (permalink / raw)
  To: Jianfeng Tan; +Cc: dev, anatoly.burakov, bruce.richardson, konstantin.ananyev

25/01/2018 20:21, Jianfeng Tan:
> v3->v5:
>   - Drop the patch 3 on vfio communication (postponed).
>   - Change names from rte_eal_mp_* -> rte_mp_* as suggested by Thomas.

You did not update the .map file for this change.

^ permalink raw reply	[flat|nested] 88+ messages in thread

* [PATCH v6 0/2] generic channel for multi-process communication
  2017-11-30 18:44 [PATCH 0/3] generic channel for multi-process communication Jianfeng Tan
                   ` (7 preceding siblings ...)
  2018-01-25 19:21 ` [PATCH v5 " Jianfeng Tan
@ 2018-01-26  3:41 ` Jianfeng Tan
  2018-01-26  3:41   ` [PATCH v6 1/2] eal: add " Jianfeng Tan
                     ` (2 more replies)
  2018-01-30  6:58 ` [PATCH v7 " Jianfeng Tan
  9 siblings, 3 replies; 88+ messages in thread
From: Jianfeng Tan @ 2018-01-26  3:41 UTC (permalink / raw)
  To: dev
  Cc: anatoly.burakov, bruce.richardson, konstantin.ananyev, thomas,
	Jianfeng Tan

v5->v6:
  - Correct the API name issue in rte_eal_version.map.

v3->v5:
  - Drop the patch 3 on vfio communication (postponed).
  - Change names from rte_eal_mp_* -> rte_mp_* as suggested by Thomas.
  - Add nb_sent and nb_received in struct rte_mp_reply.
  - Standardize the return val of sendmsg, request, reply: 0 on success,
    (-1) on failure.
  - If we find a peer error when we send msg in primary, we try to
    remove the secondary socket; as there is no sync mechanism there
    (cannot do flock like regular file for socket file), we use a more
    complex socket name (with tsc in it).
  - Some other small changes.

v3->v4:
  - Wrong patches are sent out.

v2->v3:
  - Add pre-check for each APIs.
  - Remove the limitation of 8 secondary processes by discarding the original
    register/unregister mechanism of secondary processes; instead, the primary
    discovers secondary processes by scanning the folder for matching names.
  - The previous implementation used two sockets for msg and request; this
    version uses just one socket and receives all kinds of messages in mp thread.

v1->v2: (Address comments from Anatoly and Konstantin)
  - Use datagram unix socket to supersede stream unix socket + epoll.
  - Change the secondary add/del mechanism as now we use connection-less channel.
  - Add mp_mutex_action to sync action register/unregister/reference.
  - Limit max length of action name to 64B.
  - New APIs for synchronous communication: rte_eal_mp_request/rte_eal_mp_reply.
  - Formalize the errno handling.
  - Some other small issues.

This patchset adds a generic channel for multi-process (primary/secondary)
communication.

Patch 1: address the purpose and howto;
Patch 2: add a synchronous way for the requests which need an immediate response.



Jianfeng Tan (2):
  eal: add channel for multi-process communication
  eal: add synchronous multi-process communication

 doc/guides/rel_notes/release_18_02.rst  |  11 +
 lib/librte_eal/bsdapp/eal/eal.c         |  10 +-
 lib/librte_eal/common/eal_common_proc.c | 655 +++++++++++++++++++++++++++++++-
 lib/librte_eal/common/eal_filesystem.h  |  19 +-
 lib/librte_eal/common/eal_private.h     |  12 +-
 lib/librte_eal/common/include/rte_eal.h | 133 ++++++-
 lib/librte_eal/linuxapp/eal/eal.c       |  10 +-
 lib/librte_eal/rte_eal_version.map      |   5 +
 8 files changed, 848 insertions(+), 7 deletions(-)

-- 
2.7.4

^ permalink raw reply	[flat|nested] 88+ messages in thread

* [PATCH v6 1/2] eal: add channel for multi-process communication
  2018-01-26  3:41 ` [PATCH v6 " Jianfeng Tan
@ 2018-01-26  3:41   ` Jianfeng Tan
  2018-01-26 10:25     ` Burakov, Anatoly
  2018-01-26  3:41   ` [PATCH v6 2/2] eal: add synchronous " Jianfeng Tan
  2018-01-29 23:52   ` [PATCH v6 0/2] generic channel for " Thomas Monjalon
  2 siblings, 1 reply; 88+ messages in thread
From: Jianfeng Tan @ 2018-01-26  3:41 UTC (permalink / raw)
  To: dev
  Cc: anatoly.burakov, bruce.richardson, konstantin.ananyev, thomas,
	Jianfeng Tan

Previously, there were three channels for multi-process
(i.e., primary/secondary) communication.
  1. Config-file based channel, in which, the primary process writes
     info into a pre-defined config file, and the secondary process
     reads the info out.
  2. vfio submodule has its own channel based on unix socket for the
     secondary process to get container fd and group fd from the
     primary process.
  3. pdump submodule also has its own channel based on unix socket for
     packet dump.

It'd be good to have a generic communication channel for multi-process
communication to accommodate the requirements including:
  a. Secondary wants to send info to primary, for example, secondary
     would like to send request (about some specific vdev to primary).
  b. Sending info at any time, instead of just initialization time.
  c. Share FDs with the other side, for vdev like vhost, related FDs
     (memory region, kick) should be shared.
  d. A sent request may need the other side to respond immediately.

This patch proposes to create a communication channel, based on datagram
unix socket, for above requirements. Each process will block on a unix
socket waiting for messages from the peers.

Three new APIs are added:

  1. rte_mp_action_register() is used to register an action,
     indexed by a string, when a component at receiver side would like
     to respond to the messages from the peer process.
  2. rte_mp_action_unregister() is used to unregister the action
     if the calling component does not want to respond to the messages.
  3. rte_mp_sendmsg() is used to send a message, and returns
     immediately. If there are n secondary processes, the primary
     process will send n messages.

Suggested-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
Reviewed-by: Anatoly Burakov <anatoly.burakov@intel.com>
Acked-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
---
 doc/guides/rel_notes/release_18_02.rst  |   9 +
 lib/librte_eal/bsdapp/eal/eal.c         |  10 +-
 lib/librte_eal/common/eal_common_proc.c | 439 +++++++++++++++++++++++++++++++-
 lib/librte_eal/common/eal_filesystem.h  |  19 +-
 lib/librte_eal/common/eal_private.h     |  12 +-
 lib/librte_eal/common/include/rte_eal.h |  77 +++++-
 lib/librte_eal/linuxapp/eal/eal.c       |  10 +-
 lib/librte_eal/rte_eal_version.map      |   3 +
 8 files changed, 572 insertions(+), 7 deletions(-)

diff --git a/doc/guides/rel_notes/release_18_02.rst b/doc/guides/rel_notes/release_18_02.rst
index 00b3224..be6ac99 100644
--- a/doc/guides/rel_notes/release_18_02.rst
+++ b/doc/guides/rel_notes/release_18_02.rst
@@ -151,6 +151,15 @@ New Features
   renamed the application from SW PMD specific ``eventdev_pipeline_sw_pmd``
   to PMD agnostic ``eventdev_pipeline``.
 
+* **Added new multi-process communication channel**
+
+  Added a generic channel in EAL for multi-process (primary/secondary) communication.
+  Consumers of this channel need to register an action with an action name to response
+  a message received; the actions will be identified by the action name and executed
+  in the context of a new dedicated thread for this channel. The list of new APIs:
+
+  * ``rte_mp_register`` and ``rte_mp_unregister`` are for action (un)registration.
+  * ``rte_mp_sendmsg`` is for sending a message without blocking for a response.
 
 API Changes
 -----------
diff --git a/lib/librte_eal/bsdapp/eal/eal.c b/lib/librte_eal/bsdapp/eal/eal.c
index 04cbd81..fcc9828 100644
--- a/lib/librte_eal/bsdapp/eal/eal.c
+++ b/lib/librte_eal/bsdapp/eal/eal.c
@@ -1,7 +1,7 @@
 /*-
  *   BSD LICENSE
  *
- *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+ *   Copyright(c) 2010-2018 Intel Corporation. All rights reserved.
  *   Copyright(c) 2014 6WIND S.A.
  *   All rights reserved.
  *
@@ -603,6 +603,14 @@ rte_eal_init(int argc, char **argv)
 
 	rte_config_init();
 
+	if (rte_mp_channel_init() < 0) {
+		rte_eal_init_alert("failed to init mp channel\n");
+		if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+			rte_errno = EFAULT;
+			return -1;
+		}
+	}
+
 	if (rte_eal_memory_init() < 0) {
 		rte_eal_init_alert("Cannot init memory\n");
 		rte_errno = ENOMEM;
diff --git a/lib/librte_eal/common/eal_common_proc.c b/lib/librte_eal/common/eal_common_proc.c
index 40fa982..aea0829 100644
--- a/lib/librte_eal/common/eal_common_proc.c
+++ b/lib/librte_eal/common/eal_common_proc.c
@@ -1,15 +1,51 @@
 /* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2016 Intel Corporation
+ * Copyright(c) 2016-2018 Intel Corporation
  */
 
-#include <stdio.h>
+#include <dirent.h>
+#include <errno.h>
 #include <fcntl.h>
+#include <fnmatch.h>
+#include <inttypes.h>
+#include <libgen.h>
+#include <limits.h>
+#include <pthread.h>
+#include <stdio.h>
 #include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <unistd.h>
+
+#include <rte_common.h>
+#include <rte_cycles.h>
 #include <rte_eal.h>
+#include <rte_errno.h>
+#include <rte_lcore.h>
+#include <rte_log.h>
 
+#include "eal_private.h"
 #include "eal_filesystem.h"
 #include "eal_internal_cfg.h"
 
+static int mp_fd = -1;
+static char mp_filter[PATH_MAX];   /* Filter for secondary process sockets */
+static char mp_dir_path[PATH_MAX]; /* The directory path for all mp sockets */
+static pthread_mutex_t mp_mutex_action = PTHREAD_MUTEX_INITIALIZER;
+
+struct action_entry {
+	TAILQ_ENTRY(action_entry) next;
+	char action_name[RTE_MP_MAX_NAME_LEN];
+	rte_mp_t action;
+};
+
+/** Double linked list of actions. */
+TAILQ_HEAD(action_entry_list, action_entry);
+
+static struct action_entry_list action_entry_list =
+	TAILQ_HEAD_INITIALIZER(action_entry_list);
+
 int
 rte_eal_primary_proc_alive(const char *config_file_path)
 {
@@ -31,3 +67,402 @@ rte_eal_primary_proc_alive(const char *config_file_path)
 
 	return !!ret;
 }
+
+static struct action_entry *
+find_action_entry_by_name(const char *name)
+{
+	struct action_entry *entry;
+
+	TAILQ_FOREACH(entry, &action_entry_list, next) {
+		if (strncmp(entry->action_name, name, RTE_MP_MAX_NAME_LEN) == 0)
+			break;
+	}
+
+	return entry;
+}
+
+static int
+validate_action_name(const char *name)
+{
+	if (name == NULL) {
+		RTE_LOG(ERR, EAL, "Action name cannot be NULL\n");
+		rte_errno = -EINVAL;
+		return -1;
+	}
+	if (strnlen(name, RTE_MP_MAX_NAME_LEN) == 0) {
+		RTE_LOG(ERR, EAL, "Length of action name is zero\n");
+		rte_errno = -EINVAL;
+		return -1;
+	}
+	if (strnlen(name, RTE_MP_MAX_NAME_LEN) == RTE_MP_MAX_NAME_LEN) {
+		rte_errno = -E2BIG;
+		return -1;
+	}
+	return 0;
+}
+
+int
+rte_mp_action_register(const char *name, rte_mp_t action)
+{
+	struct action_entry *entry;
+
+	if (validate_action_name(name))
+		return -1;
+
+	entry = malloc(sizeof(struct action_entry));
+	if (entry == NULL) {
+		rte_errno = -ENOMEM;
+		return -1;
+	}
+	strcpy(entry->action_name, name);
+	entry->action = action;
+
+	pthread_mutex_lock(&mp_mutex_action);
+	if (find_action_entry_by_name(name) != NULL) {
+		pthread_mutex_unlock(&mp_mutex_action);
+		rte_errno = -EEXIST;
+		free(entry);
+		return -1;
+	}
+	TAILQ_INSERT_TAIL(&action_entry_list, entry, next);
+	pthread_mutex_unlock(&mp_mutex_action);
+	return 0;
+}
+
+void
+rte_mp_action_unregister(const char *name)
+{
+	struct action_entry *entry;
+
+	if (validate_action_name(name))
+		return;
+
+	pthread_mutex_lock(&mp_mutex_action);
+	entry = find_action_entry_by_name(name);
+	if (entry == NULL) {
+		pthread_mutex_unlock(&mp_mutex_action);
+		return;
+	}
+	TAILQ_REMOVE(&action_entry_list, entry, next);
+	pthread_mutex_unlock(&mp_mutex_action);
+	free(entry);
+}
+
+static int
+read_msg(struct rte_mp_msg *msg)
+{
+	int msglen;
+	struct iovec iov;
+	struct msghdr msgh;
+	char control[CMSG_SPACE(sizeof(msg->fds))];
+	struct cmsghdr *cmsg;
+	int buflen = sizeof(*msg) - sizeof(msg->fds);
+
+	memset(&msgh, 0, sizeof(msgh));
+	iov.iov_base = msg;
+	iov.iov_len  = buflen;
+
+	msgh.msg_iov = &iov;
+	msgh.msg_iovlen = 1;
+	msgh.msg_control = control;
+	msgh.msg_controllen = sizeof(control);
+
+	msglen = recvmsg(mp_fd, &msgh, 0);
+	if (msglen < 0) {
+		RTE_LOG(ERR, EAL, "recvmsg failed, %s\n", strerror(errno));
+		return -1;
+	}
+
+	if (msglen != buflen || (msgh.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
+		RTE_LOG(ERR, EAL, "truncted msg\n");
+		return -1;
+	}
+
+	/* read auxiliary FDs if any */
+	for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL;
+		cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
+		if ((cmsg->cmsg_level == SOL_SOCKET) &&
+			(cmsg->cmsg_type == SCM_RIGHTS)) {
+			memcpy(msg->fds, CMSG_DATA(cmsg), sizeof(msg->fds));
+			break;
+		}
+	}
+
+	return 0;
+}
+
+static void
+process_msg(struct rte_mp_msg *msg)
+{
+	struct action_entry *entry;
+	rte_mp_t action = NULL;
+
+	RTE_LOG(DEBUG, EAL, "msg: %s\n", msg->name);
+	pthread_mutex_lock(&mp_mutex_action);
+	entry = find_action_entry_by_name(msg->name);
+	if (entry != NULL)
+		action = entry->action;
+	pthread_mutex_unlock(&mp_mutex_action);
+
+	if (!action)
+		RTE_LOG(ERR, EAL, "Cannot find action: %s\n", msg->name);
+	else if (action(msg) < 0)
+		RTE_LOG(ERR, EAL, "Fail to handle message: %s\n", msg->name);
+}
+
+static void *
+mp_handle(void *arg __rte_unused)
+{
+	struct rte_mp_msg msg;
+
+	while (1) {
+		if (read_msg(&msg) == 0)
+			process_msg(&msg);
+	}
+
+	return NULL;
+}
+
+static int
+open_socket_fd(void)
+{
+	struct sockaddr_un un;
+	const char *prefix = eal_mp_socket_path();
+
+	mp_fd = socket(AF_UNIX, SOCK_DGRAM, 0);
+	if (mp_fd < 0) {
+		RTE_LOG(ERR, EAL, "failed to create unix socket\n");
+		return -1;
+	}
+
+	memset(&un, 0, sizeof(un));
+	un.sun_family = AF_UNIX;
+	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+		snprintf(un.sun_path, sizeof(un.sun_path), "%s", prefix);
+	else {
+		snprintf(un.sun_path, sizeof(un.sun_path), "%s_%d_%"PRIx64,
+			 prefix, getpid(), rte_rdtsc());
+	}
+	unlink(un.sun_path); /* May still exist since last run */
+	if (bind(mp_fd, (struct sockaddr *)&un, sizeof(un)) < 0) {
+		RTE_LOG(ERR, EAL, "failed to bind %s: %s\n",
+			un.sun_path, strerror(errno));
+		close(mp_fd);
+		return -1;
+	}
+
+	RTE_LOG(INFO, EAL, "Multi-process socket %s\n", un.sun_path);
+	return mp_fd;
+}
+
+static int
+unlink_sockets(const char *filter)
+{
+	int dir_fd;
+	DIR *mp_dir;
+	struct dirent *ent;
+
+	mp_dir = opendir(mp_dir_path);
+	if (!mp_dir) {
+		RTE_LOG(ERR, EAL, "Unable to open directory %s\n", mp_dir_path);
+		return -1;
+	}
+	dir_fd = dirfd(mp_dir);
+
+	while ((ent = readdir(mp_dir))) {
+		if (fnmatch(filter, ent->d_name, 0) == 0)
+			unlinkat(dir_fd, ent->d_name, 0);
+	}
+
+	closedir(mp_dir);
+	return 0;
+}
+
+static void
+unlink_socket_by_path(const char *path)
+{
+	char *filename;
+	char *fullpath = strdup(path);
+
+	if (!fullpath)
+		return;
+	filename = basename(fullpath);
+	unlink_sockets(filename);
+	free(fullpath);
+	RTE_LOG(INFO, EAL, "Remove socket %s\n", path);
+}
+
+int
+rte_mp_channel_init(void)
+{
+	char thread_name[RTE_MAX_THREAD_NAME_LEN];
+	char *path;
+	pthread_t tid;
+
+	snprintf(mp_filter, PATH_MAX, ".%s_unix_*",
+		 internal_config.hugefile_prefix);
+
+	path = strdup(eal_mp_socket_path());
+	snprintf(mp_dir_path, PATH_MAX, "%s", dirname(path));
+	free(path);
+
+	if (rte_eal_process_type() == RTE_PROC_PRIMARY &&
+	    unlink_sockets(mp_filter)) {
+		RTE_LOG(ERR, EAL, "failed to unlink mp sockets\n");
+		return -1;
+	}
+
+	if (open_socket_fd() < 0)
+		return -1;
+
+	if (pthread_create(&tid, NULL, mp_handle, NULL) < 0) {
+		RTE_LOG(ERR, EAL, "failed to create mp thead: %s\n",
+			strerror(errno));
+		close(mp_fd);
+		mp_fd = -1;
+		return -1;
+	}
+
+	/* try best to set thread name */
+	snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "rte_mp_handle");
+	rte_thread_setname(tid, thread_name);
+	return 0;
+}
+
+/**
+ * Return -1, as fail to send message and it's caused by the local side.
+ * Return 0, as fail to send message and it's caused by the remote side.
+ * Return 1, as succeed to send message.
+ *
+ */
+static int
+send_msg(const char *dst_path, struct rte_mp_msg *msg)
+{
+	int snd;
+	struct iovec iov;
+	struct msghdr msgh;
+	struct cmsghdr *cmsg;
+	struct sockaddr_un dst;
+	int fd_size = msg->num_fds * sizeof(int);
+	char control[CMSG_SPACE(fd_size)];
+
+	memset(&dst, 0, sizeof(dst));
+	dst.sun_family = AF_UNIX;
+	snprintf(dst.sun_path, sizeof(dst.sun_path), "%s", dst_path);
+
+	memset(&msgh, 0, sizeof(msgh));
+	memset(control, 0, sizeof(control));
+
+	iov.iov_base = msg;
+	iov.iov_len = sizeof(*msg) - sizeof(msg->fds);
+
+	msgh.msg_name = &dst;
+	msgh.msg_namelen = sizeof(dst);
+	msgh.msg_iov = &iov;
+	msgh.msg_iovlen = 1;
+	msgh.msg_control = control;
+	msgh.msg_controllen = sizeof(control);
+
+	cmsg = CMSG_FIRSTHDR(&msgh);
+	cmsg->cmsg_len = CMSG_LEN(fd_size);
+	cmsg->cmsg_level = SOL_SOCKET;
+	cmsg->cmsg_type = SCM_RIGHTS;
+	memcpy(CMSG_DATA(cmsg), msg->fds, fd_size);
+
+	do {
+		snd = sendmsg(mp_fd, &msgh, 0);
+	} while (snd < 0 && errno == EINTR);
+
+	if (snd < 0) {
+		rte_errno = errno;
+		/* Check if it caused by peer process exits */
+		if (errno == -ECONNREFUSED) {
+			/* We don't unlink the primary's socket here */
+			if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+				unlink_socket_by_path(dst_path);
+			return 0;
+		}
+		if (errno == -ENOBUFS) {
+			RTE_LOG(ERR, EAL, "Peer cannot receive message %s\n",
+				dst_path);
+			return 0;
+		}
+		RTE_LOG(ERR, EAL, "failed to send to (%s) due to %s\n",
+			dst_path, strerror(errno));
+		return -1;
+	}
+
+	return 1;
+}
+
+static int
+mp_send(struct rte_mp_msg *msg)
+{
+	int ret = 0;
+	DIR *mp_dir;
+	struct dirent *ent;
+
+	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
+		if (send_msg(eal_mp_socket_path(), msg) < 0)
+			return -1;
+		else
+			return 0;
+	}
+
+	/* broadcast to all secondary processes */
+	mp_dir = opendir(mp_dir_path);
+	if (!mp_dir) {
+		RTE_LOG(ERR, EAL, "Unable to open directory %s\n",
+				mp_dir_path);
+		rte_errno = errno;
+		return -1;
+	}
+	while ((ent = readdir(mp_dir))) {
+		if (fnmatch(mp_filter, ent->d_name, 0) != 0)
+			continue;
+
+		if (send_msg(ent->d_name, msg) < 0)
+			ret = -1;
+	}
+	closedir(mp_dir);
+
+	return ret;
+}
+
+static bool
+check_input(const struct rte_mp_msg *msg)
+{
+	if (msg == NULL) {
+		RTE_LOG(ERR, EAL, "Msg cannot be NULL\n");
+		rte_errno = -EINVAL;
+		return false;
+	}
+
+	if (validate_action_name(msg->name))
+		return false;
+
+	if (msg->len_param > RTE_MP_MAX_PARAM_LEN) {
+		RTE_LOG(ERR, EAL, "Message data is too long\n");
+		rte_errno = -E2BIG;
+		return false;
+	}
+
+	if (msg->num_fds > RTE_MP_MAX_FD_NUM) {
+		RTE_LOG(ERR, EAL, "Cannot send more than %d FDs\n",
+			RTE_MP_MAX_FD_NUM);
+		rte_errno = -E2BIG;
+		return false;
+	}
+
+	return true;
+}
+
+int
+rte_mp_sendmsg(struct rte_mp_msg *msg)
+{
+	if (!check_input(msg))
+		return -1;
+
+	RTE_LOG(DEBUG, EAL, "sendmsg: %s\n", msg->name);
+	return mp_send(msg);
+}
diff --git a/lib/librte_eal/common/eal_filesystem.h b/lib/librte_eal/common/eal_filesystem.h
index e8959eb..4708dd5 100644
--- a/lib/librte_eal/common/eal_filesystem.h
+++ b/lib/librte_eal/common/eal_filesystem.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2014 Intel Corporation
+ * Copyright(c) 2010-2018 Intel Corporation
  */
 
 /**
@@ -38,6 +38,23 @@ eal_runtime_config_path(void)
 	return buffer;
 }
 
+/** Path of primary/secondary communication unix socket file. */
+#define MP_SOCKET_PATH_FMT "%s/.%s_unix"
+static inline const char *
+eal_mp_socket_path(void)
+{
+	static char buffer[PATH_MAX]; /* static so auto-zeroed */
+	const char *directory = default_config_dir;
+	const char *home_dir = getenv("HOME");
+
+	if (getuid() != 0 && home_dir != NULL)
+		directory = home_dir;
+	snprintf(buffer, sizeof(buffer) - 1, MP_SOCKET_PATH_FMT,
+		 directory, internal_config.hugefile_prefix);
+
+	return buffer;
+}
+
 /** Path of hugepage info file. */
 #define HUGEPAGE_INFO_FMT "%s/.%s_hugepage_info"
 
diff --git a/lib/librte_eal/common/eal_private.h b/lib/librte_eal/common/eal_private.h
index c46dd8f..0b28770 100644
--- a/lib/librte_eal/common/eal_private.h
+++ b/lib/librte_eal/common/eal_private.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2014 Intel Corporation
+ * Copyright(c) 2010-2018 Intel Corporation
  */
 
 #ifndef _EAL_PRIVATE_H_
@@ -195,4 +195,14 @@ int rte_eal_hugepage_attach(void);
  */
 struct rte_bus *rte_bus_find_by_device_name(const char *str);
 
+/**
+ * Create the unix channel for primary/secondary communication.
+ *
+ * @return
+ *   0 on success;
+ *   (<0) on failure.
+ */
+
+int rte_mp_channel_init(void);
+
 #endif /* _EAL_PRIVATE_H_ */
diff --git a/lib/librte_eal/common/include/rte_eal.h b/lib/librte_eal/common/include/rte_eal.h
index 2aba2c8..1d42e9c 100644
--- a/lib/librte_eal/common/include/rte_eal.h
+++ b/lib/librte_eal/common/include/rte_eal.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2016 Intel Corporation
+ * Copyright(c) 2010-2018 Intel Corporation
  */
 
 #ifndef _RTE_EAL_H_
@@ -186,6 +186,81 @@ int rte_eal_init(int argc, char **argv);
  */
 int rte_eal_primary_proc_alive(const char *config_file_path);
 
+#define RTE_MP_MAX_FD_NUM	8    /* The max amount of fds */
+#define RTE_MP_MAX_NAME_LEN	64   /* The max length of action name */
+#define RTE_MP_MAX_PARAM_LEN	256  /* The max length of param */
+struct rte_mp_msg {
+	char name[RTE_MP_MAX_NAME_LEN];
+	int len_param;
+	int num_fds;
+	uint8_t param[RTE_MP_MAX_PARAM_LEN];
+	int fds[RTE_MP_MAX_FD_NUM];
+};
+
+/**
+ * Action function typedef used by other components.
+ *
+ * As we create  socket channel for primary/secondary communication, use
+ * this function typedef to register action for coming messages.
+ */
+typedef int (*rte_mp_t)(const struct rte_mp_msg *msg);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Register an action function for primary/secondary communication.
+ *
+ * Call this function to register an action if the calling component wants
+ * to respond to messages from the corresponding component in its primary
+ * process or secondary processes.
+ *
+ * @param name
+ *   The name argument plays as the nonredundant key to find the action.
+ *
+ * @param action
+ *   The action argument is the function pointer to the action function.
+ *
+ * @return
+ *  - 0 on success.
+ *  - (<0) on failure.
+ */
+int rte_mp_action_register(const char *name, rte_mp_t action);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Unregister an action function for primary/secondary communication.
+ *
+ * Call this function to unregister an action if the calling component does
+ * not want to respond to messages from the corresponding component in its
+ * primary process or secondary processes.
+ *
+ * @param name
+ *   The name argument plays as the nonredundant key to find the action.
+ *
+ */
+void rte_mp_action_unregister(const char *name);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Send a message to the peer process.
+ *
+ * This function will send a message which will be handled by the action
+ * identified by name in the peer process.
+ *
+ * @param msg
+ *   The msg argument contains the customized message.
+ *
+ * @return
+ *  - On success, return 0.
+ *  - On failure, return -1, and the reason will be stored in rte_errno.
+ */
+int rte_mp_sendmsg(struct rte_mp_msg *msg);
+
 /**
  * Usage function typedef used by the application usage function.
  *
diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index 229eec9..53e29e4 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -1,7 +1,7 @@
 /*-
  *   BSD LICENSE
  *
- *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+ *   Copyright(c) 2010-2018 Intel Corporation. All rights reserved.
  *   Copyright(c) 2012-2014 6WIND S.A.
  *   All rights reserved.
  *
@@ -852,6 +852,14 @@ rte_eal_init(int argc, char **argv)
 		return -1;
 	}
 
+	if (rte_mp_channel_init() < 0) {
+		rte_eal_init_alert("failed to init mp channel\n");
+		if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+			rte_errno = EFAULT;
+			return -1;
+		}
+	}
+
 #ifdef VFIO_PRESENT
 	if (rte_eal_vfio_setup() < 0) {
 		rte_eal_init_alert("Cannot init VFIO\n");
diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
index 7088b72..8fd60de 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -217,6 +217,9 @@ EXPERIMENTAL {
 	rte_eal_devargs_remove;
 	rte_eal_hotplug_add;
 	rte_eal_hotplug_remove;
+	rte_mp_action_register;
+	rte_mp_action_unregister;
+	rte_mp_sendmsg;
 	rte_service_attr_get;
 	rte_service_attr_reset_all;
 	rte_service_component_register;
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 88+ messages in thread

* [PATCH v6 2/2] eal: add synchronous multi-process communication
  2018-01-26  3:41 ` [PATCH v6 " Jianfeng Tan
  2018-01-26  3:41   ` [PATCH v6 1/2] eal: add " Jianfeng Tan
@ 2018-01-26  3:41   ` Jianfeng Tan
  2018-01-26 10:31     ` Burakov, Anatoly
  2018-01-29 23:52   ` [PATCH v6 0/2] generic channel for " Thomas Monjalon
  2 siblings, 1 reply; 88+ messages in thread
From: Jianfeng Tan @ 2018-01-26  3:41 UTC (permalink / raw)
  To: dev
  Cc: anatoly.burakov, bruce.richardson, konstantin.ananyev, thomas,
	Jianfeng Tan

We need a synchronous way of doing multi-process communication,
i.e., blocking while waiting for a reply message after we send a
request to the peer process.

We add two APIs, rte_mp_request() and rte_mp_reply(), for this use
case. Invoking rte_mp_request() sends out a request message and then
blocks waiting for a reply message. The caller can specify the
timeout. The reply messages are collected and returned so that the
caller can decide how to interpret them.

The API rte_mp_reply() is always called by an mp action handler.
Here we add another parameter to rte_mp_t so that the action
handler knows which peer address to reply to.

       sender-process                receiver-process
   ----------------------            ----------------

    thread-n
     |_rte_mp_request() --------------> mp-thread
        |_timedwait()                    |_process_msg()
                                           |_action()
                                               |_rte_mp_reply()
	        mp_thread  <---------------------|
                  |_process_msg()
                     |_signal(send_thread)
    thread-m <----------|
     |_collect-reply

 * A secondary process is only allowed to talk to the primary process.
 * If there are multiple secondary processes for the primary process,
   it will send request to peer1, collect response from peer1; then
   send request to peer2, collect response from peer2, and so on.
 * When thread-n is sending request, thread-m of that process can send
   request at the same time.
 * For each <action_name, peer> pair, we guarantee that only one such
   request is in flight at a time.

Suggested-by: Anatoly Burakov <anatoly.burakov@intel.com>
Suggested-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
Reviewed-by: Anatoly Burakov <anatoly.burakov@intel.com>
Acked-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
---
 doc/guides/rel_notes/release_18_02.rst  |   2 +
 lib/librte_eal/common/eal_common_proc.c | 254 +++++++++++++++++++++++++++++---
 lib/librte_eal/common/include/rte_eal.h |  58 +++++++-
 lib/librte_eal/rte_eal_version.map      |   2 +
 4 files changed, 296 insertions(+), 20 deletions(-)

diff --git a/doc/guides/rel_notes/release_18_02.rst b/doc/guides/rel_notes/release_18_02.rst
index be6ac99..39425a4 100644
--- a/doc/guides/rel_notes/release_18_02.rst
+++ b/doc/guides/rel_notes/release_18_02.rst
@@ -160,6 +160,8 @@ New Features
 
   * ``rte_mp_register`` and ``rte_mp_unregister`` are for action (un)registration.
   * ``rte_mp_sendmsg`` is for sending a message without blocking for a response.
+  * ``rte_mp_request`` is for sending a request message and will block until
+    it gets a reply message which is sent from the peer by ``rte_mp_reply``.
 
 API Changes
 -----------
diff --git a/lib/librte_eal/common/eal_common_proc.c b/lib/librte_eal/common/eal_common_proc.c
index aea0829..6ad73f5 100644
--- a/lib/librte_eal/common/eal_common_proc.c
+++ b/lib/librte_eal/common/eal_common_proc.c
@@ -13,6 +13,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <sys/time.h>
 #include <sys/types.h>
 #include <sys/socket.h>
 #include <sys/un.h>
@@ -46,6 +47,50 @@ TAILQ_HEAD(action_entry_list, action_entry);
 static struct action_entry_list action_entry_list =
 	TAILQ_HEAD_INITIALIZER(action_entry_list);
 
+enum mp_type {
+	MP_MSG, /* Share message with peers, will not block */
+	MP_REQ, /* Request for information, Will block for a reply */
+	MP_REP, /* Response to previously-received request */
+};
+
+struct mp_msg_internal {
+	int type;
+	struct rte_mp_msg msg;
+};
+
+struct sync_request {
+	TAILQ_ENTRY(sync_request) next;
+	int reply_received;
+	char dst[PATH_MAX];
+	struct rte_mp_msg *request;
+	struct rte_mp_msg *reply;
+	pthread_cond_t cond;
+};
+
+TAILQ_HEAD(sync_request_list, sync_request);
+
+static struct {
+	struct sync_request_list requests;
+	pthread_mutex_t lock;
+} sync_requests = {
+	.requests = TAILQ_HEAD_INITIALIZER(sync_requests.requests),
+	.lock = PTHREAD_MUTEX_INITIALIZER
+};
+
+static struct sync_request *
+find_sync_request(const char *dst, const char *act_name)
+{
+	struct sync_request *r;
+
+	TAILQ_FOREACH(r, &sync_requests.requests, next) {
+		if (!strcmp(r->dst, dst) &&
+		    !strcmp(r->request->name, act_name))
+			break;
+	}
+
+	return r;
+}
+
 int
 rte_eal_primary_proc_alive(const char *config_file_path)
 {
@@ -149,19 +194,21 @@ rte_mp_action_unregister(const char *name)
 }
 
 static int
-read_msg(struct rte_mp_msg *msg)
+read_msg(struct mp_msg_internal *m, struct sockaddr_un *s)
 {
 	int msglen;
 	struct iovec iov;
 	struct msghdr msgh;
-	char control[CMSG_SPACE(sizeof(msg->fds))];
+	char control[CMSG_SPACE(sizeof(m->msg.fds))];
 	struct cmsghdr *cmsg;
-	int buflen = sizeof(*msg) - sizeof(msg->fds);
+	int buflen = sizeof(*m) - sizeof(m->msg.fds);
 
 	memset(&msgh, 0, sizeof(msgh));
-	iov.iov_base = msg;
+	iov.iov_base = m;
 	iov.iov_len  = buflen;
 
+	msgh.msg_name = s;
+	msgh.msg_namelen = sizeof(*s);
 	msgh.msg_iov = &iov;
 	msgh.msg_iovlen = 1;
 	msgh.msg_control = control;
@@ -183,7 +230,7 @@ read_msg(struct rte_mp_msg *msg)
 		cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
 		if ((cmsg->cmsg_level == SOL_SOCKET) &&
 			(cmsg->cmsg_type == SCM_RIGHTS)) {
-			memcpy(msg->fds, CMSG_DATA(cmsg), sizeof(msg->fds));
+			memcpy(m->msg.fds, CMSG_DATA(cmsg), sizeof(m->msg.fds));
 			break;
 		}
 	}
@@ -192,12 +239,28 @@ read_msg(struct rte_mp_msg *msg)
 }
 
 static void
-process_msg(struct rte_mp_msg *msg)
+process_msg(struct mp_msg_internal *m, struct sockaddr_un *s)
 {
+	struct sync_request *sync_req;
 	struct action_entry *entry;
+	struct rte_mp_msg *msg = &m->msg;
 	rte_mp_t action = NULL;
 
 	RTE_LOG(DEBUG, EAL, "msg: %s\n", msg->name);
+
+	if (m->type == MP_REP) {
+		pthread_mutex_lock(&sync_requests.lock);
+		sync_req = find_sync_request(s->sun_path, msg->name);
+		if (sync_req) {
+			memcpy(sync_req->reply, msg, sizeof(*msg));
+			sync_req->reply_received = 1;
+			pthread_cond_signal(&sync_req->cond);
+		} else
+			RTE_LOG(ERR, EAL, "Drop mp reply: %s\n", msg->name);
+		pthread_mutex_unlock(&sync_requests.lock);
+		return;
+	}
+
 	pthread_mutex_lock(&mp_mutex_action);
 	entry = find_action_entry_by_name(msg->name);
 	if (entry != NULL)
@@ -206,18 +269,19 @@ process_msg(struct rte_mp_msg *msg)
 
 	if (!action)
 		RTE_LOG(ERR, EAL, "Cannot find action: %s\n", msg->name);
-	else if (action(msg) < 0)
+	else if (action(msg, s->sun_path) < 0)
 		RTE_LOG(ERR, EAL, "Fail to handle message: %s\n", msg->name);
 }
 
 static void *
 mp_handle(void *arg __rte_unused)
 {
-	struct rte_mp_msg msg;
+	struct mp_msg_internal msg;
+	struct sockaddr_un sa;
 
 	while (1) {
-		if (read_msg(&msg) == 0)
-			process_msg(&msg);
+		if (read_msg(&msg, &sa) == 0)
+			process_msg(&msg, &sa);
 	}
 
 	return NULL;
@@ -336,16 +400,20 @@ rte_mp_channel_init(void)
  *
  */
 static int
-send_msg(const char *dst_path, struct rte_mp_msg *msg)
+send_msg(const char *dst_path, struct rte_mp_msg *msg, int type)
 {
 	int snd;
 	struct iovec iov;
 	struct msghdr msgh;
 	struct cmsghdr *cmsg;
 	struct sockaddr_un dst;
+	struct mp_msg_internal m;
 	int fd_size = msg->num_fds * sizeof(int);
 	char control[CMSG_SPACE(fd_size)];
 
+	m.type = type;
+	memcpy(&m.msg, msg, sizeof(*msg));
+
 	memset(&dst, 0, sizeof(dst));
 	dst.sun_family = AF_UNIX;
 	snprintf(dst.sun_path, sizeof(dst.sun_path), "%s", dst_path);
@@ -353,8 +421,8 @@ send_msg(const char *dst_path, struct rte_mp_msg *msg)
 	memset(&msgh, 0, sizeof(msgh));
 	memset(control, 0, sizeof(control));
 
-	iov.iov_base = msg;
-	iov.iov_len = sizeof(*msg) - sizeof(msg->fds);
+	iov.iov_base = &m;
+	iov.iov_len = sizeof(m) - sizeof(msg->fds);
 
 	msgh.msg_name = &dst;
 	msgh.msg_namelen = sizeof(dst);
@@ -396,14 +464,17 @@ send_msg(const char *dst_path, struct rte_mp_msg *msg)
 }
 
 static int
-mp_send(struct rte_mp_msg *msg)
+mp_send(struct rte_mp_msg *msg, const char *peer, int type)
 {
 	int ret = 0;
 	DIR *mp_dir;
 	struct dirent *ent;
 
-	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
-		if (send_msg(eal_mp_socket_path(), msg) < 0)
+	if (!peer && (rte_eal_process_type() == RTE_PROC_SECONDARY))
+		peer = eal_mp_socket_path();
+
+	if (peer) {
+		if (send_msg(peer, msg, type) < 0)
 			return -1;
 		else
 			return 0;
@@ -421,11 +492,11 @@ mp_send(struct rte_mp_msg *msg)
 		if (fnmatch(mp_filter, ent->d_name, 0) != 0)
 			continue;
 
-		if (send_msg(ent->d_name, msg) < 0)
+		if (send_msg(ent->d_name, msg, type) < 0)
 			ret = -1;
 	}
-	closedir(mp_dir);
 
+	closedir(mp_dir);
 	return ret;
 }
 
@@ -464,5 +535,150 @@ rte_mp_sendmsg(struct rte_mp_msg *msg)
 		return -1;
 
 	RTE_LOG(DEBUG, EAL, "sendmsg: %s\n", msg->name);
-	return mp_send(msg);
+	return mp_send(msg, NULL, MP_MSG);
+}
+
+static int
+mp_request_one(const char *dst, struct rte_mp_msg *req,
+	       struct rte_mp_reply *reply, const struct timespec *ts)
+{
+	int ret;
+	struct timeval now;
+	struct rte_mp_msg msg, *tmp;
+	struct sync_request sync_req, *exist;
+
+	sync_req.reply_received = 0;
+	strcpy(sync_req.dst, dst);
+	sync_req.request = req;
+	sync_req.reply = &msg;
+	pthread_cond_init(&sync_req.cond, NULL);
+
+	pthread_mutex_lock(&sync_requests.lock);
+	exist = find_sync_request(dst, req->name);
+	if (!exist)
+		TAILQ_INSERT_TAIL(&sync_requests.requests, &sync_req, next);
+	pthread_mutex_unlock(&sync_requests.lock);
+	if (exist) {
+		RTE_LOG(ERR, EAL, "A pending request %s:%s\n", dst, req->name);
+		rte_errno = -EEXIST;
+		return -1;
+	}
+
+	ret = send_msg(dst, req, MP_REQ);
+	if (ret < 0) {
+		RTE_LOG(ERR, EAL, "Fail to send request %s:%s\n",
+			dst, req->name);
+		return -1;
+	} else if (ret == 0)
+		return 0;
+
+	reply->nb_sent++;
+
+	pthread_mutex_lock(&sync_requests.lock);
+	do {
+		pthread_cond_timedwait(&sync_req.cond, &sync_requests.lock, ts);
+		/* Check spurious wakeups */
+		if (sync_req.reply_received == 1)
+			break;
+		/* Check if time is out */
+		if (gettimeofday(&now, NULL) < 0)
+			break;
+		if (now.tv_sec < ts->tv_sec)
+			break;
+		else if (now.tv_sec == ts->tv_sec &&
+			 now.tv_usec * 1000 < ts->tv_nsec)
+			break;
+	} while (1);
+	/* We got the lock now */
+	TAILQ_REMOVE(&sync_requests.requests, &sync_req, next);
+	pthread_mutex_unlock(&sync_requests.lock);
+
+	if (sync_req.reply_received == 0) {
+		RTE_LOG(ERR, EAL, "Fail to recv reply for request %s:%s\n",
+			dst, req->name);
+		rte_errno = -ETIMEDOUT;
+		return -1;
+	}
+
+	tmp = realloc(reply->msgs, sizeof(msg) * (reply->nb_received + 1));
+	if (!tmp) {
+		RTE_LOG(ERR, EAL, "Fail to alloc reply for request %s:%s\n",
+			dst, req->name);
+		rte_errno = -ENOMEM;
+		return -1;
+	}
+	memcpy(&tmp[reply->nb_received], &msg, sizeof(msg));
+	reply->msgs = tmp;
+	reply->nb_received++;
+	return 0;
+}
+
+int
+rte_mp_request(struct rte_mp_msg *req, struct rte_mp_reply *reply,
+		const struct timespec *ts)
+{
+	int ret = 0;
+	DIR *mp_dir;
+	struct dirent *ent;
+	struct timeval now;
+	struct timespec end;
+
+	RTE_LOG(DEBUG, EAL, "request: %s\n", req->name);
+
+	if (check_input(req) == false)
+		return -1;
+	if (gettimeofday(&now, NULL) < 0) {
+		RTE_LOG(ERR, EAL, "Faile to get current time\n");
+		rte_errno = errno;
+		return -1;
+	}
+
+	end.tv_nsec = (now.tv_usec * 1000 + ts->tv_nsec) % 1000000000;
+	end.tv_sec = now.tv_sec + ts->tv_sec +
+			(now.tv_usec * 1000 + ts->tv_nsec) / 1000000000;
+
+	reply->nb_sent = 0;
+	reply->nb_received = 0;
+	reply->msgs = NULL;
+
+	/* for secondary process, send request to the primary process only */
+	if (rte_eal_process_type() == RTE_PROC_SECONDARY)
+		return mp_request_one(eal_mp_socket_path(), req, reply, &end);
+
+	/* for primary process, broadcast request, and collect reply 1 by 1 */
+	mp_dir = opendir(mp_dir_path);
+	if (!mp_dir) {
+		RTE_LOG(ERR, EAL, "Unable to open directory %s\n", mp_dir_path);
+		rte_errno = errno;
+		return -1;
+	}
+
+	while ((ent = readdir(mp_dir))) {
+		if (fnmatch(mp_filter, ent->d_name, 0) != 0)
+			continue;
+
+		if (mp_request_one(ent->d_name, req, reply, &end))
+			ret = -1;
+	}
+
+	closedir(mp_dir);
+	return ret;
+}
+
+int
+rte_mp_reply(struct rte_mp_msg *msg, const char *peer)
+{
+
+	RTE_LOG(DEBUG, EAL, "reply: %s\n", msg->name);
+
+	if (check_input(msg) == false)
+		return -1;
+
+	if (peer == NULL) {
+		RTE_LOG(ERR, EAL, "peer is not specified\n");
+		rte_errno = -EINVAL;
+		return -1;
+	}
+
+	return mp_send(msg, peer, MP_REP);
 }
diff --git a/lib/librte_eal/common/include/rte_eal.h b/lib/librte_eal/common/include/rte_eal.h
index 1d42e9c..9207ad9 100644
--- a/lib/librte_eal/common/include/rte_eal.h
+++ b/lib/librte_eal/common/include/rte_eal.h
@@ -13,6 +13,7 @@
 
 #include <stdint.h>
 #include <sched.h>
+#include <time.h>
 
 #include <rte_config.h>
 #include <rte_per_lcore.h>
@@ -197,13 +198,19 @@ struct rte_mp_msg {
 	int fds[RTE_MP_MAX_FD_NUM];
 };
 
+struct rte_mp_reply {
+	int nb_sent;
+	int nb_received;
+	struct rte_mp_msg *msgs; /* caller to free */
+};
+
 /**
  * Action function typedef used by other components.
  *
  * As we create  socket channel for primary/secondary communication, use
  * this function typedef to register action for coming messages.
  */
-typedef int (*rte_mp_t)(const struct rte_mp_msg *msg);
+typedef int (*rte_mp_t)(const struct rte_mp_msg *msg, const void *peer);
 
 /**
  * @warning
@@ -262,6 +269,55 @@ void rte_mp_action_unregister(const char *name);
 int rte_mp_sendmsg(struct rte_mp_msg *msg);
 
 /**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Send a request to the peer process and expect a reply.
+ *
+ * This function sends a request message to the peer process, and will
+ * block until receiving reply message from the peer process.
+ *
+ * @note The caller is responsible for freeing reply->msgs.
+ *
+ * @param req
+ *   The req argument contains the customized request message.
+ *
+ * @param reply
+ *   The reply argument is used to store all the reply messages;
+ *   the caller is responsible for freeing reply->msgs.
+ *
+ * @param ts
+ *   The ts argument specifies how long we can wait for the peer(s) to reply.
+ *
+ * @return
+ *  - On success, return 0.
+ *  - On failure, return -1, and the reason will be stored in rte_errno.
+ */
+int rte_mp_request(struct rte_mp_msg *req, struct rte_mp_reply *reply,
+		   const struct timespec *ts);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Send a reply to the peer process.
+ *
+ * This function will send a reply message in response to a request message
+ * received previously.
+ *
+ * @param msg
+ *   The msg argument contains the customized message.
+ *
+ * @param peer
+ *   The peer argument is the pointer to the peer socket path.
+ *
+ * @return
+ *  - On success, return 0.
+ *  - On failure, return -1, and the reason will be stored in rte_errno.
+ */
+int rte_mp_reply(struct rte_mp_msg *msg, const char *peer);
+
+/**
  * Usage function typedef used by the application usage function.
  *
  * Use this function typedef to define and call rte_set_application_usage_hook()
diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
index 8fd60de..673e5e5 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -220,6 +220,8 @@ EXPERIMENTAL {
 	rte_mp_action_register;
 	rte_mp_action_unregister;
 	rte_mp_sendmsg;
+	rte_mp_request;
+	rte_mp_reply;
 	rte_service_attr_get;
 	rte_service_attr_reset_all;
 	rte_service_component_register;
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 88+ messages in thread

* Re: [PATCH v6 1/2] eal: add channel for multi-process communication
  2018-01-26  3:41   ` [PATCH v6 1/2] eal: add " Jianfeng Tan
@ 2018-01-26 10:25     ` Burakov, Anatoly
  2018-01-29  6:37       ` Tan, Jianfeng
  0 siblings, 1 reply; 88+ messages in thread
From: Burakov, Anatoly @ 2018-01-26 10:25 UTC (permalink / raw)
  To: Jianfeng Tan, dev; +Cc: bruce.richardson, konstantin.ananyev, thomas

On 26-Jan-18 3:41 AM, Jianfeng Tan wrote:
> Previouly, there are three channels for multi-process
> (i.e., primary/secondary) communication.
>    1. Config-file based channel, in which, the primary process writes
>       info into a pre-defined config file, and the secondary process
>       reads the info out.
>    2. vfio submodule has its own channel based on unix socket for the
>       secondary process to get container fd and group fd from the
>       primary process.
>    3. pdump submodule also has its own channel based on unix socket for
>       packet dump.
> 
> It'd be good to have a generic communication channel for multi-process
> communication to accommodate the requirements including:
>    a. Secondary wants to send info to primary, for example, secondary
>       would like to send request (about some specific vdev to primary).
>    b. Sending info at any time, instead of just initialization time.
>    c. Share FDs with the other side, for vdev like vhost, related FDs
>       (memory region, kick) should be shared.
>    d. A send message request needs the other side to response immediately.
> 
> This patch proposes to create a communication channel, based on datagram
> unix socket, for above requirements. Each process will block on a unix
> socket waiting for messages from the peers.
> 
> Three new APIs are added:
> 
>    1. rte_eal_mp_action_register() is used to register an action,
>       indexed by a string, when a component at receiver side would like
>       to response the messages from the peer processe.
>    2. rte_eal_mp_action_unregister() is used to unregister the action
>       if the calling component does not want to response the messages.
>    3. rte_eal_mp_sendmsg() is used to send a message, and returns
>       immediately. If there are n secondary processes, the primary
>       process will send n messages.
> 
> Suggested-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
> Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
> Reviewed-by: Anatoly Burakov <anatoly.burakov@intel.com>
> Acked-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
> ---

<snip>

> +
> +static int
> +mp_send(struct rte_mp_msg *msg)
> +{
> +	int ret = 0;
> +	DIR *mp_dir;
> +	struct dirent *ent;
> +
> +	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
> +		if (send_msg(eal_mp_socket_path(), msg) < 0)
> +			return -1;
> +		else
> +			return 0;
> +	}
> +
> +	/* broadcast to all secondary processes */
> +	mp_dir = opendir(mp_dir_path);
> +	if (!mp_dir) {
> +		RTE_LOG(ERR, EAL, "Unable to open directory %s\n",
> +				mp_dir_path);
> +		rte_errno = errno;
> +		return -1;
> +	}
> +	while ((ent = readdir(mp_dir))) {
> +		if (fnmatch(mp_filter, ent->d_name, 0) != 0)
> +			continue;
> +
> +		if (send_msg(ent->d_name, msg) < 0)
> +			ret = -1;
> +	}
> +	closedir(mp_dir);
> +
> +	return ret;

Nitpick: you probably don't need ret here, just return 0 as in other places.

> +}
> +
> +static bool
> +check_input(const struct rte_mp_msg *msg)
> +{
> +	if (msg == NULL) {
> +		RTE_LOG(ERR, EAL, "Msg cannot be NULL\n");
> +		rte_errno = -EINVAL;
> +		return false;
> +	}
> +
> +	if (validate_action_name(msg->name))
> +		return false;
> +
> +	if (msg->len_param > RTE_MP_MAX_PARAM_LEN) {
> +		RTE_LOG(ERR, EAL, "Message data is too long\n");
> +		rte_errno = -E2BIG;
> +		return false;
> +	}
> +
> +	if (msg->num_fds > RTE_MP_MAX_FD_NUM) {
> +		RTE_LOG(ERR, EAL, "Cannot send more than %d FDs\n",
> +			RTE_MP_MAX_FD_NUM);
> +		rte_errno = -E2BIG;
> +		return false;

Otherwise, i'm happy with this patch.

-- 
Thanks,
Anatoly

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [PATCH v6 2/2] eal: add synchronous multi-process communication
  2018-01-26  3:41   ` [PATCH v6 2/2] eal: add synchronous " Jianfeng Tan
@ 2018-01-26 10:31     ` Burakov, Anatoly
  0 siblings, 0 replies; 88+ messages in thread
From: Burakov, Anatoly @ 2018-01-26 10:31 UTC (permalink / raw)
  To: Jianfeng Tan, dev; +Cc: bruce.richardson, konstantin.ananyev, thomas

On 26-Jan-18 3:41 AM, Jianfeng Tan wrote:
> We need the synchronous way for multi-process communication,
> i.e., blockingly waiting for reply message when we send a request
> to the peer process.
> 
> We add two APIs rte_eal_mp_request() and rte_eal_mp_reply() for
> such use case. By invoking rte_eal_mp_request(), a request message
> is sent out, and then it waits there for a reply message. The caller
> can specify the timeout. And the response messages will be collected
> and returned so that the caller can decide how to translate them.
> 
> The API rte_eal_mp_reply() is always called by an mp action handler.
> Here we add another parameter for rte_eal_mp_t so that the action
> handler knows which peer address to reply.
> 
>         sender-process                receiver-process
>     ----------------------            ----------------
> 
>      thread-n
>       |_rte_eal_mp_request() ----------> mp-thread
>          |_timedwait()                    |_process_msg()
>                                             |_action()
>                                                 |_rte_eal_mp_reply()
> 	        mp_thread  <---------------------|
>                    |_process_msg()
>                       |_signal(send_thread)
>      thread-m <----------|
>       |_collect-reply
> 
>   * A secondary process is only allowed to talk to the primary process.
>   * If there are multiple secondary processes for the primary process,
>     it will send request to peer1, collect response from peer1; then
>     send request to peer2, collect response from peer2, and so on.
>   * When thread-n is sending request, thread-m of that process can send
>     request at the same time.
>   * For pair <action_name, peer>, we guarantee that only one such request
>     is on the fly.
> 
> Suggested-by: Anatoly Burakov <anatoly.burakov@intel.com>
> Suggested-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
> Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
> Reviewed-by: Anatoly Burakov <anatoly.burakov@intel.com>
> Acked-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
> ---

No further comments from me :)

-- 
Thanks,
Anatoly

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [PATCH v6 1/2] eal: add channel for multi-process communication
  2018-01-26 10:25     ` Burakov, Anatoly
@ 2018-01-29  6:37       ` Tan, Jianfeng
  2018-01-29  9:37         ` Burakov, Anatoly
  0 siblings, 1 reply; 88+ messages in thread
From: Tan, Jianfeng @ 2018-01-29  6:37 UTC (permalink / raw)
  To: Burakov, Anatoly, dev; +Cc: Richardson, Bruce, Ananyev, Konstantin, thomas

Hi Anatoly,

> -----Original Message-----
> From: Burakov, Anatoly
> Sent: Friday, January 26, 2018 6:26 PM
> To: Tan, Jianfeng; dev@dpdk.org
> Cc: Richardson, Bruce; Ananyev, Konstantin; thomas@monjalon.net
> Subject: Re: [PATCH v6 1/2] eal: add channel for multi-process
> communication
> 
> On 26-Jan-18 3:41 AM, Jianfeng Tan wrote:
> > Previouly, there are three channels for multi-process
> > (i.e., primary/secondary) communication.
> >    1. Config-file based channel, in which, the primary process writes
> >       info into a pre-defined config file, and the secondary process
> >       reads the info out.
> >    2. vfio submodule has its own channel based on unix socket for the
> >       secondary process to get container fd and group fd from the
> >       primary process.
> >    3. pdump submodule also has its own channel based on unix socket for
> >       packet dump.
> >
> > It'd be good to have a generic communication channel for multi-process
> > communication to accommodate the requirements including:
> >    a. Secondary wants to send info to primary, for example, secondary
> >       would like to send request (about some specific vdev to primary).
> >    b. Sending info at any time, instead of just initialization time.
> >    c. Share FDs with the other side, for vdev like vhost, related FDs
> >       (memory region, kick) should be shared.
> >    d. A send message request needs the other side to response immediately.
> >
> > This patch proposes to create a communication channel, based on
> datagram
> > unix socket, for above requirements. Each process will block on a unix
> > socket waiting for messages from the peers.
> >
> > Three new APIs are added:
> >
> >    1. rte_eal_mp_action_register() is used to register an action,
> >       indexed by a string, when a component at receiver side would like
> >       to response the messages from the peer processe.
> >    2. rte_eal_mp_action_unregister() is used to unregister the action
> >       if the calling component does not want to response the messages.
> >    3. rte_eal_mp_sendmsg() is used to send a message, and returns
> >       immediately. If there are n secondary processes, the primary
> >       process will send n messages.
> >
> > Suggested-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
> > Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
> > Reviewed-by: Anatoly Burakov <anatoly.burakov@intel.com>
> > Acked-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
> > ---
> 
> <snip>
> 
> > +
> > +static int
> > +mp_send(struct rte_mp_msg *msg)
> > +{
> > +	int ret = 0;
> > +	DIR *mp_dir;
> > +	struct dirent *ent;
> > +
> > +	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
> > +		if (send_msg(eal_mp_socket_path(), msg) < 0)
> > +			return -1;
> > +		else
> > +			return 0;
> > +	}
> > +
> > +	/* broadcast to all secondary processes */
> > +	mp_dir = opendir(mp_dir_path);
> > +	if (!mp_dir) {
> > +		RTE_LOG(ERR, EAL, "Unable to open directory %s\n",
> > +				mp_dir_path);
> > +		rte_errno = errno;
> > +		return -1;
> > +	}
> > +	while ((ent = readdir(mp_dir))) {
> > +		if (fnmatch(mp_filter, ent->d_name, 0) != 0)
> > +			continue;
> > +
> > +		if (send_msg(ent->d_name, msg) < 0)
> > +			ret = -1;

Here ret is assigned to -1.

> > +	}
> > +	closedir(mp_dir);
> > +
> > +	return ret;
> 
> Nitpick: you probably don't need ret here, just return 0 as in other places.

We cannot just return 0, because ret may have been set to -1, as the comment above shows.
The ret variable was introduced to avoid calling closedir() in two places.

Thanks,
Jianfeng

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [PATCH v6 1/2] eal: add channel for multi-process communication
  2018-01-29  6:37       ` Tan, Jianfeng
@ 2018-01-29  9:37         ` Burakov, Anatoly
  0 siblings, 0 replies; 88+ messages in thread
From: Burakov, Anatoly @ 2018-01-29  9:37 UTC (permalink / raw)
  To: Tan, Jianfeng, dev; +Cc: Richardson, Bruce, Ananyev, Konstantin, thomas

On 29-Jan-18 6:37 AM, Tan, Jianfeng wrote:
> Hi Anatoly,
> 
>> -----Original Message-----
>> From: Burakov, Anatoly
>> Sent: Friday, January 26, 2018 6:26 PM
>> To: Tan, Jianfeng; dev@dpdk.org
>> Cc: Richardson, Bruce; Ananyev, Konstantin; thomas@monjalon.net
>> Subject: Re: [PATCH v6 1/2] eal: add channel for multi-process
>> communication
>>
>> On 26-Jan-18 3:41 AM, Jianfeng Tan wrote:
>>> Previouly, there are three channels for multi-process
>>> (i.e., primary/secondary) communication.
>>>     1. Config-file based channel, in which, the primary process writes
>>>        info into a pre-defined config file, and the secondary process
>>>        reads the info out.
>>>     2. vfio submodule has its own channel based on unix socket for the
>>>        secondary process to get container fd and group fd from the
>>>        primary process.
>>>     3. pdump submodule also has its own channel based on unix socket for
>>>        packet dump.
>>>
>>> It'd be good to have a generic communication channel for multi-process
>>> communication to accommodate the requirements including:
>>>     a. Secondary wants to send info to primary, for example, secondary
>>>        would like to send request (about some specific vdev to primary).
>>>     b. Sending info at any time, instead of just initialization time.
>>>     c. Share FDs with the other side, for vdev like vhost, related FDs
>>>        (memory region, kick) should be shared.
>>>     d. A send message request needs the other side to response immediately.
>>>
>>> This patch proposes to create a communication channel, based on
>> datagram
>>> unix socket, for above requirements. Each process will block on a unix
>>> socket waiting for messages from the peers.
>>>
>>> Three new APIs are added:
>>>
>>>     1. rte_eal_mp_action_register() is used to register an action,
>>>        indexed by a string, when a component at receiver side would like
>>>        to response the messages from the peer processe.
>>>     2. rte_eal_mp_action_unregister() is used to unregister the action
>>>        if the calling component does not want to response the messages.
>>>     3. rte_eal_mp_sendmsg() is used to send a message, and returns
>>>        immediately. If there are n secondary processes, the primary
>>>        process will send n messages.
>>>
>>> Suggested-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
>>> Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
>>> Reviewed-by: Anatoly Burakov <anatoly.burakov@intel.com>
>>> Acked-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
>>> ---
>>> +	}
>>> +	closedir(mp_dir);
>>> +
>>> +	return ret;
>>
>> Nitpick: you probably don't need ret here, just return 0 as in other places.
> 
> We cannot just return 0 as it could be -1 as above comment shows.
> The ret variable was introduced to avoid two "closedir()".
> 
> Thanks,
> Jianfeng
> 

Yep you're right, apologies.

-- 
Thanks,
Anatoly

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [PATCH v6 0/2] generic channel for multi-process communication
  2018-01-26  3:41 ` [PATCH v6 " Jianfeng Tan
  2018-01-26  3:41   ` [PATCH v6 1/2] eal: add " Jianfeng Tan
  2018-01-26  3:41   ` [PATCH v6 2/2] eal: add synchronous " Jianfeng Tan
@ 2018-01-29 23:52   ` Thomas Monjalon
  2 siblings, 0 replies; 88+ messages in thread
From: Thomas Monjalon @ 2018-01-29 23:52 UTC (permalink / raw)
  To: Jianfeng Tan
  Cc: dev, anatoly.burakov, bruce.richardson, konstantin.ananyev,
	Neil Horman, ferruh.yigit

26/01/2018 04:41, Jianfeng Tan:
> v5->v6:
>   - Correct the API name issue in rte_eal_version.map.
> 
> v3->v5:
>   - Drop the patch 3 on vfio communication (postponed).
>   - Change names from rte_eal_mp_* -> rte_mp_* as suggested by Thomas.
>   - Add nb_sent and nb_received in struct rte_mp_reply.
>   - Standardize the return val of sendmsg, request, reply: 0 on sucess,
>     (-1) on failure.
>   - If we found an peer error when we send msg in primary, we try to
>     remove the secondary socket; as there is no sync mechanism there
>     (cannot do flock like regular file for socket file), we use a more
>     complex socket name (with tsc in it).
>   - Some other small changes.

Please, may I ask a last rebase?
The __rte_experimental tag is now required to be added as in this commit:
	http://dpdk.org/commit/77b7b81e32e
Thanks

^ permalink raw reply	[flat|nested] 88+ messages in thread

* [PATCH v7 0/2] generic channel for multi-process communication
  2017-11-30 18:44 [PATCH 0/3] generic channel for multi-process communication Jianfeng Tan
                   ` (8 preceding siblings ...)
  2018-01-26  3:41 ` [PATCH v6 " Jianfeng Tan
@ 2018-01-30  6:58 ` Jianfeng Tan
  2018-01-30  6:58   ` [PATCH v7 1/2] eal: add " Jianfeng Tan
                     ` (2 more replies)
  9 siblings, 3 replies; 88+ messages in thread
From: Jianfeng Tan @ 2018-01-30  6:58 UTC (permalink / raw)
  To: dev
  Cc: anatoly.burakov, bruce.richardson, konstantin.ananyev, thomas,
	Jianfeng Tan

v6->v7:
  - Add __rte_experimental tag for new APIs.
  - Rebased on master.

v5->v6:
  - Correct the API name issue in rte_eal_version.map.

v3->v5:
  - Drop the patch 3 on vfio communication (postponed).
  - Change names from rte_eal_mp_* -> rte_mp_* as suggested by Thomas.
  - Add nb_sent and nb_received in struct rte_mp_reply.
  - Standardize the return val of sendmsg, request, reply: 0 on success,
    (-1) on failure.
  - If we find a peer error when sending a msg in primary, we try to
    remove the secondary socket; as there is no sync mechanism there
    (cannot do flock like regular file for socket file), we use a more
    complex socket name (with tsc in it).
  - Some other small changes.

v3->v4:
  - Wrong patches are sent out.

v2->v3:
  - Add pre-check for each APIs.
  - Remove the limitation of 8 secondary processes by: discard original
    register/unregister mechanism of secondary process, instead, primary
    discovers secondary processes by looking up the folder for regex match.
  - Previous implementation use two sockets for msg and request, this version
    just uses one socket. And receive all kinds of messages in mp thread.

v1->v2: (Address comments from Anatoly and Konstantin)
  - Use datagram unix socket to supersede stream unix socket + epoll.
  - Change the secondary add/del mechanism as now we use connection-less channel.
  - Add mp_mutex_action to sync action register/unregister/reference.
  - Limit max length of action name to 64B.
  - New APIs for synchronous communication: rte_eal_mp_request/rte_eal_mp_reply.
  - Formalize the errno handle.
  - Some other small issues.

This patchset adds a generic channel for multi-process (primary/secondary)
communication.

Patch 1: addresses the purpose and how-to;
Patch 2: adds a synchronous way for the requests which need an immediate response.


Jianfeng Tan (2):
  eal: add channel for multi-process communication
  eal: add synchronous multi-process communication

 doc/guides/rel_notes/release_18_02.rst  |  11 +
 lib/librte_eal/bsdapp/eal/eal.c         |  10 +-
 lib/librte_eal/common/eal_common_proc.c | 655 +++++++++++++++++++++++++++++++-
 lib/librte_eal/common/eal_filesystem.h  |  19 +-
 lib/librte_eal/common/eal_private.h     |  12 +-
 lib/librte_eal/common/include/rte_eal.h | 138 ++++++-
 lib/librte_eal/linuxapp/eal/eal.c       |  10 +-
 lib/librte_eal/rte_eal_version.map      |   5 +
 8 files changed, 853 insertions(+), 7 deletions(-)

-- 
2.7.4

^ permalink raw reply	[flat|nested] 88+ messages in thread

* [PATCH v7 1/2] eal: add channel for multi-process communication
  2018-01-30  6:58 ` [PATCH v7 " Jianfeng Tan
@ 2018-01-30  6:58   ` Jianfeng Tan
  2018-01-30  6:58   ` [PATCH v7 2/2] eal: add synchronous " Jianfeng Tan
  2018-01-30 14:46   ` [PATCH v7 0/2] generic channel for " Thomas Monjalon
  2 siblings, 0 replies; 88+ messages in thread
From: Jianfeng Tan @ 2018-01-30  6:58 UTC (permalink / raw)
  To: dev
  Cc: anatoly.burakov, bruce.richardson, konstantin.ananyev, thomas,
	Jianfeng Tan

Previously, there were three channels for multi-process
(i.e., primary/secondary) communication.
  1. Config-file based channel, in which, the primary process writes
     info into a pre-defined config file, and the secondary process
     reads the info out.
  2. vfio submodule has its own channel based on unix socket for the
     secondary process to get container fd and group fd from the
     primary process.
  3. pdump submodule also has its own channel based on unix socket for
     packet dump.

It'd be good to have a generic communication channel for multi-process
communication to accommodate the requirements including:
  a. Secondary wants to send info to primary, for example, secondary
     would like to send request (about some specific vdev to primary).
  b. Sending info at any time, instead of just initialization time.
  c. Share FDs with the other side, for vdev like vhost, related FDs
     (memory region, kick) should be shared.
  d. A send message request needs the other side to respond immediately.

This patch proposes to create a communication channel, based on datagram
unix socket, for above requirements. Each process will block on a unix
socket waiting for messages from the peers.

Three new APIs are added:

  1. rte_mp_action_register() is used to register an action,
     indexed by a string, when a component at receiver side would like
     to respond to the messages from the peer process.
  2. rte_mp_action_unregister() is used to unregister the action
     if the calling component does not want to respond to the messages.
  3. rte_mp_sendmsg() is used to send a message, and returns
     immediately. If there are n secondary processes, the primary
     process will send n messages.

Suggested-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
Reviewed-by: Anatoly Burakov <anatoly.burakov@intel.com>
Acked-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
---
 doc/guides/rel_notes/release_18_02.rst  |   9 +
 lib/librte_eal/bsdapp/eal/eal.c         |  10 +-
 lib/librte_eal/common/eal_common_proc.c | 439 +++++++++++++++++++++++++++++++-
 lib/librte_eal/common/eal_filesystem.h  |  19 +-
 lib/librte_eal/common/eal_private.h     |  12 +-
 lib/librte_eal/common/include/rte_eal.h |  80 +++++-
 lib/librte_eal/linuxapp/eal/eal.c       |  10 +-
 lib/librte_eal/rte_eal_version.map      |   3 +
 8 files changed, 575 insertions(+), 7 deletions(-)

diff --git a/doc/guides/rel_notes/release_18_02.rst b/doc/guides/rel_notes/release_18_02.rst
index 8c3968e..0531f59 100644
--- a/doc/guides/rel_notes/release_18_02.rst
+++ b/doc/guides/rel_notes/release_18_02.rst
@@ -160,6 +160,15 @@ New Features
   renamed the application from SW PMD specific ``eventdev_pipeline_sw_pmd``
   to PMD agnostic ``eventdev_pipeline``.
 
+* **Added new multi-process communication channel**
+
+  Added a generic channel in EAL for multi-process (primary/secondary) communication.
+  Consumers of this channel need to register an action with an action name to response
+  a message received; the actions will be identified by the action name and executed
+  in the context of a new dedicated thread for this channel. The list of new APIs:
+
+  * ``rte_mp_register`` and ``rte_mp_unregister`` are for action (un)registration.
+  * ``rte_mp_sendmsg`` is for sending a message without blocking for a response.
 
 API Changes
 -----------
diff --git a/lib/librte_eal/bsdapp/eal/eal.c b/lib/librte_eal/bsdapp/eal/eal.c
index 0bac6cf..ba1811a 100644
--- a/lib/librte_eal/bsdapp/eal/eal.c
+++ b/lib/librte_eal/bsdapp/eal/eal.c
@@ -1,7 +1,7 @@
 /*-
  *   BSD LICENSE
  *
- *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+ *   Copyright(c) 2010-2018 Intel Corporation. All rights reserved.
  *   Copyright(c) 2014 6WIND S.A.
  *   All rights reserved.
  *
@@ -604,6 +604,14 @@ rte_eal_init(int argc, char **argv)
 
 	rte_config_init();
 
+	if (rte_mp_channel_init() < 0) {
+		rte_eal_init_alert("failed to init mp channel\n");
+		if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+			rte_errno = EFAULT;
+			return -1;
+		}
+	}
+
 	if (rte_eal_memory_init() < 0) {
 		rte_eal_init_alert("Cannot init memory\n");
 		rte_errno = ENOMEM;
diff --git a/lib/librte_eal/common/eal_common_proc.c b/lib/librte_eal/common/eal_common_proc.c
index 40fa982..f63c9c2 100644
--- a/lib/librte_eal/common/eal_common_proc.c
+++ b/lib/librte_eal/common/eal_common_proc.c
@@ -1,15 +1,51 @@
 /* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2016 Intel Corporation
+ * Copyright(c) 2016-2018 Intel Corporation
  */
 
-#include <stdio.h>
+#include <dirent.h>
+#include <errno.h>
 #include <fcntl.h>
+#include <fnmatch.h>
+#include <inttypes.h>
+#include <libgen.h>
+#include <limits.h>
+#include <pthread.h>
+#include <stdio.h>
 #include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <unistd.h>
+
+#include <rte_common.h>
+#include <rte_cycles.h>
 #include <rte_eal.h>
+#include <rte_errno.h>
+#include <rte_lcore.h>
+#include <rte_log.h>
 
+#include "eal_private.h"
 #include "eal_filesystem.h"
 #include "eal_internal_cfg.h"
 
+static int mp_fd = -1;
+static char mp_filter[PATH_MAX];   /* Filter for secondary process sockets */
+static char mp_dir_path[PATH_MAX]; /* The directory path for all mp sockets */
+static pthread_mutex_t mp_mutex_action = PTHREAD_MUTEX_INITIALIZER;
+
+struct action_entry {
+	TAILQ_ENTRY(action_entry) next;
+	char action_name[RTE_MP_MAX_NAME_LEN];
+	rte_mp_t action;
+};
+
+/** Double linked list of actions. */
+TAILQ_HEAD(action_entry_list, action_entry);
+
+static struct action_entry_list action_entry_list =
+	TAILQ_HEAD_INITIALIZER(action_entry_list);
+
 int
 rte_eal_primary_proc_alive(const char *config_file_path)
 {
@@ -31,3 +67,402 @@ rte_eal_primary_proc_alive(const char *config_file_path)
 
 	return !!ret;
 }
+
+static struct action_entry *
+find_action_entry_by_name(const char *name)
+{
+	struct action_entry *entry;
+
+	TAILQ_FOREACH(entry, &action_entry_list, next) {
+		if (strncmp(entry->action_name, name, RTE_MP_MAX_NAME_LEN) == 0)
+			break;
+	}
+
+	return entry;
+}
+
+static int
+validate_action_name(const char *name)
+{
+	if (name == NULL) {
+		RTE_LOG(ERR, EAL, "Action name cannot be NULL\n");
+		rte_errno = -EINVAL;
+		return -1;
+	}
+	if (strnlen(name, RTE_MP_MAX_NAME_LEN) == 0) {
+		RTE_LOG(ERR, EAL, "Length of action name is zero\n");
+		rte_errno = -EINVAL;
+		return -1;
+	}
+	if (strnlen(name, RTE_MP_MAX_NAME_LEN) == RTE_MP_MAX_NAME_LEN) {
+		rte_errno = -E2BIG;
+		return -1;
+	}
+	return 0;
+}
+
+int __rte_experimental
+rte_mp_action_register(const char *name, rte_mp_t action)
+{
+	struct action_entry *entry;
+
+	if (validate_action_name(name))
+		return -1;
+
+	entry = malloc(sizeof(struct action_entry));
+	if (entry == NULL) {
+		rte_errno = -ENOMEM;
+		return -1;
+	}
+	strcpy(entry->action_name, name);
+	entry->action = action;
+
+	pthread_mutex_lock(&mp_mutex_action);
+	if (find_action_entry_by_name(name) != NULL) {
+		pthread_mutex_unlock(&mp_mutex_action);
+		rte_errno = -EEXIST;
+		free(entry);
+		return -1;
+	}
+	TAILQ_INSERT_TAIL(&action_entry_list, entry, next);
+	pthread_mutex_unlock(&mp_mutex_action);
+	return 0;
+}
+
+void __rte_experimental
+rte_mp_action_unregister(const char *name)
+{
+	struct action_entry *entry;
+
+	if (validate_action_name(name))
+		return;
+
+	pthread_mutex_lock(&mp_mutex_action);
+	entry = find_action_entry_by_name(name);
+	if (entry == NULL) {
+		pthread_mutex_unlock(&mp_mutex_action);
+		return;
+	}
+	TAILQ_REMOVE(&action_entry_list, entry, next);
+	pthread_mutex_unlock(&mp_mutex_action);
+	free(entry);
+}
+
+static int
+read_msg(struct rte_mp_msg *msg)
+{
+	int msglen;
+	struct iovec iov;
+	struct msghdr msgh;
+	char control[CMSG_SPACE(sizeof(msg->fds))];
+	struct cmsghdr *cmsg;
+	int buflen = sizeof(*msg) - sizeof(msg->fds);
+
+	memset(&msgh, 0, sizeof(msgh));
+	iov.iov_base = msg;
+	iov.iov_len  = buflen;
+
+	msgh.msg_iov = &iov;
+	msgh.msg_iovlen = 1;
+	msgh.msg_control = control;
+	msgh.msg_controllen = sizeof(control);
+
+	msglen = recvmsg(mp_fd, &msgh, 0);
+	if (msglen < 0) {
+		RTE_LOG(ERR, EAL, "recvmsg failed, %s\n", strerror(errno));
+		return -1;
+	}
+
+	if (msglen != buflen || (msgh.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
+		RTE_LOG(ERR, EAL, "truncted msg\n");
+		return -1;
+	}
+
+	/* read auxiliary FDs if any */
+	for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL;
+		cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
+		if ((cmsg->cmsg_level == SOL_SOCKET) &&
+			(cmsg->cmsg_type == SCM_RIGHTS)) {
+			memcpy(msg->fds, CMSG_DATA(cmsg), sizeof(msg->fds));
+			break;
+		}
+	}
+
+	return 0;
+}
+
+static void
+process_msg(struct rte_mp_msg *msg)
+{
+	struct action_entry *entry;
+	rte_mp_t action = NULL;
+
+	RTE_LOG(DEBUG, EAL, "msg: %s\n", msg->name);
+	pthread_mutex_lock(&mp_mutex_action);
+	entry = find_action_entry_by_name(msg->name);
+	if (entry != NULL)
+		action = entry->action;
+	pthread_mutex_unlock(&mp_mutex_action);
+
+	if (!action)
+		RTE_LOG(ERR, EAL, "Cannot find action: %s\n", msg->name);
+	else if (action(msg) < 0)
+		RTE_LOG(ERR, EAL, "Fail to handle message: %s\n", msg->name);
+}
+
+static void *
+mp_handle(void *arg __rte_unused)
+{
+	struct rte_mp_msg msg;
+
+	while (1) {
+		if (read_msg(&msg) == 0)
+			process_msg(&msg);
+	}
+
+	return NULL;
+}
+
+static int
+open_socket_fd(void)
+{
+	struct sockaddr_un un;
+	const char *prefix = eal_mp_socket_path();
+
+	mp_fd = socket(AF_UNIX, SOCK_DGRAM, 0);
+	if (mp_fd < 0) {
+		RTE_LOG(ERR, EAL, "failed to create unix socket\n");
+		return -1;
+	}
+
+	memset(&un, 0, sizeof(un));
+	un.sun_family = AF_UNIX;
+	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+		snprintf(un.sun_path, sizeof(un.sun_path), "%s", prefix);
+	else {
+		snprintf(un.sun_path, sizeof(un.sun_path), "%s_%d_%"PRIx64,
+			 prefix, getpid(), rte_rdtsc());
+	}
+	unlink(un.sun_path); /* May still exist since last run */
+	if (bind(mp_fd, (struct sockaddr *)&un, sizeof(un)) < 0) {
+		RTE_LOG(ERR, EAL, "failed to bind %s: %s\n",
+			un.sun_path, strerror(errno));
+		close(mp_fd);
+		return -1;
+	}
+
+	RTE_LOG(INFO, EAL, "Multi-process socket %s\n", un.sun_path);
+	return mp_fd;
+}
+
+static int
+unlink_sockets(const char *filter)
+{
+	int dir_fd;
+	DIR *mp_dir;
+	struct dirent *ent;
+
+	mp_dir = opendir(mp_dir_path);
+	if (!mp_dir) {
+		RTE_LOG(ERR, EAL, "Unable to open directory %s\n", mp_dir_path);
+		return -1;
+	}
+	dir_fd = dirfd(mp_dir);
+
+	while ((ent = readdir(mp_dir))) {
+		if (fnmatch(filter, ent->d_name, 0) == 0)
+			unlinkat(dir_fd, ent->d_name, 0);
+	}
+
+	closedir(mp_dir);
+	return 0;
+}
+
+static void
+unlink_socket_by_path(const char *path)
+{
+	char *filename;
+	char *fullpath = strdup(path);
+
+	if (!fullpath)
+		return;
+	filename = basename(fullpath);
+	unlink_sockets(filename);
+	free(fullpath);
+	RTE_LOG(INFO, EAL, "Remove socket %s\n", path);
+}
+
+int
+rte_mp_channel_init(void)
+{
+	char thread_name[RTE_MAX_THREAD_NAME_LEN];
+	char *path;
+	pthread_t tid;
+
+	snprintf(mp_filter, PATH_MAX, ".%s_unix_*",
+		 internal_config.hugefile_prefix);
+
+	path = strdup(eal_mp_socket_path());
+	snprintf(mp_dir_path, PATH_MAX, "%s", dirname(path));
+	free(path);
+
+	if (rte_eal_process_type() == RTE_PROC_PRIMARY &&
+	    unlink_sockets(mp_filter)) {
+		RTE_LOG(ERR, EAL, "failed to unlink mp sockets\n");
+		return -1;
+	}
+
+	if (open_socket_fd() < 0)
+		return -1;
+
+	if (pthread_create(&tid, NULL, mp_handle, NULL) < 0) {
+		RTE_LOG(ERR, EAL, "failed to create mp thead: %s\n",
+			strerror(errno));
+		close(mp_fd);
+		mp_fd = -1;
+		return -1;
+	}
+
+	/* try best to set thread name */
+	snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "rte_mp_handle");
+	rte_thread_setname(tid, thread_name);
+	return 0;
+}
+
+/**
+ * Return -1, as fail to send message and it's caused by the local side.
+ * Return 0, as fail to send message and it's caused by the remote side.
+ * Return 1, as succeed to send message.
+ *
+ */
+static int
+send_msg(const char *dst_path, struct rte_mp_msg *msg)
+{
+	int snd;
+	struct iovec iov;
+	struct msghdr msgh;
+	struct cmsghdr *cmsg;
+	struct sockaddr_un dst;
+	int fd_size = msg->num_fds * sizeof(int);
+	char control[CMSG_SPACE(fd_size)];
+
+	memset(&dst, 0, sizeof(dst));
+	dst.sun_family = AF_UNIX;
+	snprintf(dst.sun_path, sizeof(dst.sun_path), "%s", dst_path);
+
+	memset(&msgh, 0, sizeof(msgh));
+	memset(control, 0, sizeof(control));
+
+	iov.iov_base = msg;
+	iov.iov_len = sizeof(*msg) - sizeof(msg->fds);
+
+	msgh.msg_name = &dst;
+	msgh.msg_namelen = sizeof(dst);
+	msgh.msg_iov = &iov;
+	msgh.msg_iovlen = 1;
+	msgh.msg_control = control;
+	msgh.msg_controllen = sizeof(control);
+
+	cmsg = CMSG_FIRSTHDR(&msgh);
+	cmsg->cmsg_len = CMSG_LEN(fd_size);
+	cmsg->cmsg_level = SOL_SOCKET;
+	cmsg->cmsg_type = SCM_RIGHTS;
+	memcpy(CMSG_DATA(cmsg), msg->fds, fd_size);
+
+	do {
+		snd = sendmsg(mp_fd, &msgh, 0);
+	} while (snd < 0 && errno == EINTR);
+
+	if (snd < 0) {
+		rte_errno = errno;
+		/* Check if it caused by peer process exits */
+		if (errno == -ECONNREFUSED) {
+			/* We don't unlink the primary's socket here */
+			if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+				unlink_socket_by_path(dst_path);
+			return 0;
+		}
+		if (errno == -ENOBUFS) {
+			RTE_LOG(ERR, EAL, "Peer cannot receive message %s\n",
+				dst_path);
+			return 0;
+		}
+		RTE_LOG(ERR, EAL, "failed to send to (%s) due to %s\n",
+			dst_path, strerror(errno));
+		return -1;
+	}
+
+	return 1;
+}
+
+static int
+mp_send(struct rte_mp_msg *msg)
+{
+	int ret = 0;
+	DIR *mp_dir;
+	struct dirent *ent;
+
+	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
+		if (send_msg(eal_mp_socket_path(), msg) < 0)
+			return -1;
+		else
+			return 0;
+	}
+
+	/* broadcast to all secondary processes */
+	mp_dir = opendir(mp_dir_path);
+	if (!mp_dir) {
+		RTE_LOG(ERR, EAL, "Unable to open directory %s\n",
+				mp_dir_path);
+		rte_errno = errno;
+		return -1;
+	}
+	while ((ent = readdir(mp_dir))) {
+		if (fnmatch(mp_filter, ent->d_name, 0) != 0)
+			continue;
+
+		if (send_msg(ent->d_name, msg) < 0)
+			ret = -1;
+	}
+	closedir(mp_dir);
+
+	return ret;
+}
+
+static bool
+check_input(const struct rte_mp_msg *msg)
+{
+	if (msg == NULL) {
+		RTE_LOG(ERR, EAL, "Msg cannot be NULL\n");
+		rte_errno = -EINVAL;
+		return false;
+	}
+
+	if (validate_action_name(msg->name))
+		return false;
+
+	if (msg->len_param > RTE_MP_MAX_PARAM_LEN) {
+		RTE_LOG(ERR, EAL, "Message data is too long\n");
+		rte_errno = -E2BIG;
+		return false;
+	}
+
+	if (msg->num_fds > RTE_MP_MAX_FD_NUM) {
+		RTE_LOG(ERR, EAL, "Cannot send more than %d FDs\n",
+			RTE_MP_MAX_FD_NUM);
+		rte_errno = -E2BIG;
+		return false;
+	}
+
+	return true;
+}
+
+int __rte_experimental
+rte_mp_sendmsg(struct rte_mp_msg *msg)
+{
+	if (!check_input(msg))
+		return -1;
+
+	RTE_LOG(DEBUG, EAL, "sendmsg: %s\n", msg->name);
+	return mp_send(msg);
+}
diff --git a/lib/librte_eal/common/eal_filesystem.h b/lib/librte_eal/common/eal_filesystem.h
index e8959eb..4708dd5 100644
--- a/lib/librte_eal/common/eal_filesystem.h
+++ b/lib/librte_eal/common/eal_filesystem.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2014 Intel Corporation
+ * Copyright(c) 2010-2018 Intel Corporation
  */
 
 /**
@@ -38,6 +38,23 @@ eal_runtime_config_path(void)
 	return buffer;
 }
 
+/** Path of primary/secondary communication unix socket file. */
+#define MP_SOCKET_PATH_FMT "%s/.%s_unix"
+static inline const char *
+eal_mp_socket_path(void)
+{
+	static char buffer[PATH_MAX]; /* static so auto-zeroed */
+	const char *directory = default_config_dir;
+	const char *home_dir = getenv("HOME");
+
+	if (getuid() != 0 && home_dir != NULL)
+		directory = home_dir;
+	snprintf(buffer, sizeof(buffer) - 1, MP_SOCKET_PATH_FMT,
+		 directory, internal_config.hugefile_prefix);
+
+	return buffer;
+}
+
 /** Path of hugepage info file. */
 #define HUGEPAGE_INFO_FMT "%s/.%s_hugepage_info"
 
diff --git a/lib/librte_eal/common/eal_private.h b/lib/librte_eal/common/eal_private.h
index c46dd8f..0b28770 100644
--- a/lib/librte_eal/common/eal_private.h
+++ b/lib/librte_eal/common/eal_private.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2014 Intel Corporation
+ * Copyright(c) 2010-2018 Intel Corporation
  */
 
 #ifndef _EAL_PRIVATE_H_
@@ -195,4 +195,14 @@ int rte_eal_hugepage_attach(void);
  */
 struct rte_bus *rte_bus_find_by_device_name(const char *str);
 
+/**
+ * Create the unix channel for primary/secondary communication.
+ *
+ * @return
+ *   0 on success;
+ *   (<0) on failure.
+ */
+
+int rte_mp_channel_init(void);
+
 #endif /* _EAL_PRIVATE_H_ */
diff --git a/lib/librte_eal/common/include/rte_eal.h b/lib/librte_eal/common/include/rte_eal.h
index 1f37c7a..2d022c0 100644
--- a/lib/librte_eal/common/include/rte_eal.h
+++ b/lib/librte_eal/common/include/rte_eal.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2016 Intel Corporation
+ * Copyright(c) 2010-2018 Intel Corporation
  */
 
 #ifndef _RTE_EAL_H_
@@ -203,6 +203,84 @@ int __rte_experimental rte_eal_cleanup(void);
  */
 int rte_eal_primary_proc_alive(const char *config_file_path);
 
+#define RTE_MP_MAX_FD_NUM	8    /* The max amount of fds */
+#define RTE_MP_MAX_NAME_LEN	64   /* The max length of action name */
+#define RTE_MP_MAX_PARAM_LEN	256  /* The max length of param */
+struct rte_mp_msg {
+	char name[RTE_MP_MAX_NAME_LEN];
+	int len_param;
+	int num_fds;
+	uint8_t param[RTE_MP_MAX_PARAM_LEN];
+	int fds[RTE_MP_MAX_FD_NUM];
+};
+
+/**
+ * Action function typedef used by other components.
+ *
+ * As we create  socket channel for primary/secondary communication, use
+ * this function typedef to register action for coming messages.
+ */
+typedef int (*rte_mp_t)(const struct rte_mp_msg *msg);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Register an action function for primary/secondary communication.
+ *
+ * Call this function to register an action, if the calling component wants
+ * to response the messages from the corresponding component in its primary
+ * process or secondary processes.
+ *
+ * @param name
+ *   The name argument plays as the nonredundant key to find the action.
+ *
+ * @param action
+ *   The action argument is the function pointer to the action function.
+ *
+ * @return
+ *  - 0 on success.
+ *  - (<0) on failure.
+ */
+int __rte_experimental
+rte_mp_action_register(const char *name, rte_mp_t action);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Unregister an action function for primary/secondary communication.
+ *
+ * Call this function to unregister an action  if the calling component does
+ * not want to response the messages from the corresponding component in its
+ * primary process or secondary processes.
+ *
+ * @param name
+ *   The name argument plays as the nonredundant key to find the action.
+ *
+ */
+void __rte_experimental
+rte_mp_action_unregister(const char *name);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Send a message to the peer process.
+ *
+ * This function will send a message which will be responsed by the action
+ * identified by name in the peer process.
+ *
+ * @param msg
+ *   The msg argument contains the customized message.
+ *
+ * @return
+ *  - On success, return 0.
+ *  - On failure, return -1, and the reason will be stored in rte_errno.
+ */
+int __rte_experimental
+rte_mp_sendmsg(struct rte_mp_msg *msg);
+
 /**
  * Usage function typedef used by the application usage function.
  *
diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index 828baac..66f7585 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -1,7 +1,7 @@
 /*-
  *   BSD LICENSE
  *
- *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+ *   Copyright(c) 2010-2018 Intel Corporation. All rights reserved.
  *   Copyright(c) 2012-2014 6WIND S.A.
  *   All rights reserved.
  *
@@ -853,6 +853,14 @@ rte_eal_init(int argc, char **argv)
 		return -1;
 	}
 
+	if (rte_mp_channel_init() < 0) {
+		rte_eal_init_alert("failed to init mp channel\n");
+		if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+			rte_errno = EFAULT;
+			return -1;
+		}
+	}
+
 #ifdef VFIO_PRESENT
 	if (rte_eal_vfio_setup() < 0) {
 		rte_eal_init_alert("Cannot init VFIO\n");
diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
index 93f6c13..24deaef 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -220,6 +220,9 @@ EXPERIMENTAL {
 	rte_eal_devargs_remove;
 	rte_eal_hotplug_add;
 	rte_eal_hotplug_remove;
+	rte_mp_action_register;
+	rte_mp_action_unregister;
+	rte_mp_sendmsg;
 	rte_service_attr_get;
 	rte_service_attr_reset_all;
 	rte_service_component_register;
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 88+ messages in thread

* [PATCH v7 2/2] eal: add synchronous multi-process communication
  2018-01-30  6:58 ` [PATCH v7 " Jianfeng Tan
  2018-01-30  6:58   ` [PATCH v7 1/2] eal: add " Jianfeng Tan
@ 2018-01-30  6:58   ` Jianfeng Tan
  2018-01-30 14:46   ` [PATCH v7 0/2] generic channel for " Thomas Monjalon
  2 siblings, 0 replies; 88+ messages in thread
From: Jianfeng Tan @ 2018-01-30  6:58 UTC (permalink / raw)
  To: dev
  Cc: anatoly.burakov, bruce.richardson, konstantin.ananyev, thomas,
	Jianfeng Tan

We need a synchronous way for multi-process communication,
i.e., blocking while waiting for a reply message when we send a request
to the peer process.

We add two APIs, rte_mp_request() and rte_mp_reply(), for
such use case. By invoking rte_mp_request(), a request message
is sent out, and then it waits there for a reply message. The caller
can specify the timeout. And the response messages will be collected
and returned so that the caller can decide how to translate them.

The API rte_mp_reply() is always called by an mp action handler.
Here we add another parameter for rte_mp_t so that the action
handler knows which peer address to reply to.

       sender-process                receiver-process
   ----------------------            ----------------

    thread-n
     |_rte_mp_request() --------------> mp-thread
        |_timedwait()                    |_process_msg()
                                           |_action()
                                               |_rte_mp_reply()
	        mp_thread  <---------------------|
                  |_process_msg()
                     |_signal(send_thread)
    thread-m <----------|
     |_collect-reply

 * A secondary process is only allowed to talk to the primary process.
 * If there are multiple secondary processes for the primary process,
   it will send request to peer1, collect response from peer1; then
   send request to peer2, collect response from peer2, and so on.
 * When thread-n is sending request, thread-m of that process can send
   request at the same time.
 * For pair <action_name, peer>, we guarantee that only one such request
   is on the fly.

Suggested-by: Anatoly Burakov <anatoly.burakov@intel.com>
Suggested-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
Reviewed-by: Anatoly Burakov <anatoly.burakov@intel.com>
Acked-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
---
 doc/guides/rel_notes/release_18_02.rst  |   2 +
 lib/librte_eal/common/eal_common_proc.c | 254 +++++++++++++++++++++++++++++---
 lib/librte_eal/common/include/rte_eal.h |  60 +++++++-
 lib/librte_eal/rte_eal_version.map      |   2 +
 4 files changed, 298 insertions(+), 20 deletions(-)

diff --git a/doc/guides/rel_notes/release_18_02.rst b/doc/guides/rel_notes/release_18_02.rst
index 0531f59..bb8559b 100644
--- a/doc/guides/rel_notes/release_18_02.rst
+++ b/doc/guides/rel_notes/release_18_02.rst
@@ -169,6 +169,8 @@ New Features
 
   * ``rte_mp_register`` and ``rte_mp_unregister`` are for action (un)registration.
   * ``rte_mp_sendmsg`` is for sending a message without blocking for a response.
+  * ``rte_mp_request`` is for sending a request message and will block until
+    it gets a reply message which is sent from the peer by ``rte_mp_reply``.
 
 API Changes
 -----------
diff --git a/lib/librte_eal/common/eal_common_proc.c b/lib/librte_eal/common/eal_common_proc.c
index f63c9c2..b974837 100644
--- a/lib/librte_eal/common/eal_common_proc.c
+++ b/lib/librte_eal/common/eal_common_proc.c
@@ -13,6 +13,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <sys/time.h>
 #include <sys/types.h>
 #include <sys/socket.h>
 #include <sys/un.h>
@@ -46,6 +47,50 @@ TAILQ_HEAD(action_entry_list, action_entry);
 static struct action_entry_list action_entry_list =
 	TAILQ_HEAD_INITIALIZER(action_entry_list);
 
+enum mp_type {
+	MP_MSG, /* Share message with peers, will not block */
+	MP_REQ, /* Request for information, Will block for a reply */
+	MP_REP, /* Response to previously-received request */
+};
+
+struct mp_msg_internal {
+	int type;
+	struct rte_mp_msg msg;
+};
+
+struct sync_request {
+	TAILQ_ENTRY(sync_request) next;
+	int reply_received;
+	char dst[PATH_MAX];
+	struct rte_mp_msg *request;
+	struct rte_mp_msg *reply;
+	pthread_cond_t cond;
+};
+
+TAILQ_HEAD(sync_request_list, sync_request);
+
+static struct {
+	struct sync_request_list requests;
+	pthread_mutex_t lock;
+} sync_requests = {
+	.requests = TAILQ_HEAD_INITIALIZER(sync_requests.requests),
+	.lock = PTHREAD_MUTEX_INITIALIZER
+};
+
+static struct sync_request *
+find_sync_request(const char *dst, const char *act_name)
+{
+	struct sync_request *r;
+
+	TAILQ_FOREACH(r, &sync_requests.requests, next) {
+		if (!strcmp(r->dst, dst) &&
+		    !strcmp(r->request->name, act_name))
+			break;
+	}
+
+	return r;
+}
+
 int
 rte_eal_primary_proc_alive(const char *config_file_path)
 {
@@ -149,19 +194,21 @@ rte_mp_action_unregister(const char *name)
 }
 
 static int
-read_msg(struct rte_mp_msg *msg)
+read_msg(struct mp_msg_internal *m, struct sockaddr_un *s)
 {
 	int msglen;
 	struct iovec iov;
 	struct msghdr msgh;
-	char control[CMSG_SPACE(sizeof(msg->fds))];
+	char control[CMSG_SPACE(sizeof(m->msg.fds))];
 	struct cmsghdr *cmsg;
-	int buflen = sizeof(*msg) - sizeof(msg->fds);
+	int buflen = sizeof(*m) - sizeof(m->msg.fds);
 
 	memset(&msgh, 0, sizeof(msgh));
-	iov.iov_base = msg;
+	iov.iov_base = m;
 	iov.iov_len  = buflen;
 
+	msgh.msg_name = s;
+	msgh.msg_namelen = sizeof(*s);
 	msgh.msg_iov = &iov;
 	msgh.msg_iovlen = 1;
 	msgh.msg_control = control;
@@ -183,7 +230,7 @@ read_msg(struct rte_mp_msg *msg)
 		cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
 		if ((cmsg->cmsg_level == SOL_SOCKET) &&
 			(cmsg->cmsg_type == SCM_RIGHTS)) {
-			memcpy(msg->fds, CMSG_DATA(cmsg), sizeof(msg->fds));
+			memcpy(m->msg.fds, CMSG_DATA(cmsg), sizeof(m->msg.fds));
 			break;
 		}
 	}
@@ -192,12 +239,28 @@ read_msg(struct rte_mp_msg *msg)
 }
 
 static void
-process_msg(struct rte_mp_msg *msg)
+process_msg(struct mp_msg_internal *m, struct sockaddr_un *s)
 {
+	struct sync_request *sync_req;
 	struct action_entry *entry;
+	struct rte_mp_msg *msg = &m->msg;
 	rte_mp_t action = NULL;
 
 	RTE_LOG(DEBUG, EAL, "msg: %s\n", msg->name);
+
+	if (m->type == MP_REP) {
+		pthread_mutex_lock(&sync_requests.lock);
+		sync_req = find_sync_request(s->sun_path, msg->name);
+		if (sync_req) {
+			memcpy(sync_req->reply, msg, sizeof(*msg));
+			sync_req->reply_received = 1;
+			pthread_cond_signal(&sync_req->cond);
+		} else
+			RTE_LOG(ERR, EAL, "Drop mp reply: %s\n", msg->name);
+		pthread_mutex_unlock(&sync_requests.lock);
+		return;
+	}
+
 	pthread_mutex_lock(&mp_mutex_action);
 	entry = find_action_entry_by_name(msg->name);
 	if (entry != NULL)
@@ -206,18 +269,19 @@ process_msg(struct rte_mp_msg *msg)
 
 	if (!action)
 		RTE_LOG(ERR, EAL, "Cannot find action: %s\n", msg->name);
-	else if (action(msg) < 0)
+	else if (action(msg, s->sun_path) < 0)
 		RTE_LOG(ERR, EAL, "Fail to handle message: %s\n", msg->name);
 }
 
 static void *
 mp_handle(void *arg __rte_unused)
 {
-	struct rte_mp_msg msg;
+	struct mp_msg_internal msg;
+	struct sockaddr_un sa;
 
 	while (1) {
-		if (read_msg(&msg) == 0)
-			process_msg(&msg);
+		if (read_msg(&msg, &sa) == 0)
+			process_msg(&msg, &sa);
 	}
 
 	return NULL;
@@ -336,16 +400,20 @@ rte_mp_channel_init(void)
  *
  */
 static int
-send_msg(const char *dst_path, struct rte_mp_msg *msg)
+send_msg(const char *dst_path, struct rte_mp_msg *msg, int type)
 {
 	int snd;
 	struct iovec iov;
 	struct msghdr msgh;
 	struct cmsghdr *cmsg;
 	struct sockaddr_un dst;
+	struct mp_msg_internal m;
 	int fd_size = msg->num_fds * sizeof(int);
 	char control[CMSG_SPACE(fd_size)];
 
+	m.type = type;
+	memcpy(&m.msg, msg, sizeof(*msg));
+
 	memset(&dst, 0, sizeof(dst));
 	dst.sun_family = AF_UNIX;
 	snprintf(dst.sun_path, sizeof(dst.sun_path), "%s", dst_path);
@@ -353,8 +421,8 @@ send_msg(const char *dst_path, struct rte_mp_msg *msg)
 	memset(&msgh, 0, sizeof(msgh));
 	memset(control, 0, sizeof(control));
 
-	iov.iov_base = msg;
-	iov.iov_len = sizeof(*msg) - sizeof(msg->fds);
+	iov.iov_base = &m;
+	iov.iov_len = sizeof(m) - sizeof(msg->fds);
 
 	msgh.msg_name = &dst;
 	msgh.msg_namelen = sizeof(dst);
@@ -396,14 +464,17 @@ send_msg(const char *dst_path, struct rte_mp_msg *msg)
 }
 
 static int
-mp_send(struct rte_mp_msg *msg)
+mp_send(struct rte_mp_msg *msg, const char *peer, int type)
 {
 	int ret = 0;
 	DIR *mp_dir;
 	struct dirent *ent;
 
-	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
-		if (send_msg(eal_mp_socket_path(), msg) < 0)
+	if (!peer && (rte_eal_process_type() == RTE_PROC_SECONDARY))
+		peer = eal_mp_socket_path();
+
+	if (peer) {
+		if (send_msg(peer, msg, type) < 0)
 			return -1;
 		else
 			return 0;
@@ -421,11 +492,11 @@ mp_send(struct rte_mp_msg *msg)
 		if (fnmatch(mp_filter, ent->d_name, 0) != 0)
 			continue;
 
-		if (send_msg(ent->d_name, msg) < 0)
+		if (send_msg(ent->d_name, msg, type) < 0)
 			ret = -1;
 	}
-	closedir(mp_dir);
 
+	closedir(mp_dir);
 	return ret;
 }
 
@@ -464,5 +535,150 @@ rte_mp_sendmsg(struct rte_mp_msg *msg)
 		return -1;
 
 	RTE_LOG(DEBUG, EAL, "sendmsg: %s\n", msg->name);
-	return mp_send(msg);
+	return mp_send(msg, NULL, MP_MSG);
+}
+
+static int
+mp_request_one(const char *dst, struct rte_mp_msg *req,
+	       struct rte_mp_reply *reply, const struct timespec *ts)
+{
+	int ret;
+	struct timeval now;
+	struct rte_mp_msg msg, *tmp;
+	struct sync_request sync_req, *exist;
+
+	sync_req.reply_received = 0;
+	strcpy(sync_req.dst, dst);
+	sync_req.request = req;
+	sync_req.reply = &msg;
+	pthread_cond_init(&sync_req.cond, NULL);
+
+	pthread_mutex_lock(&sync_requests.lock);
+	exist = find_sync_request(dst, req->name);
+	if (!exist)
+		TAILQ_INSERT_TAIL(&sync_requests.requests, &sync_req, next);
+	pthread_mutex_unlock(&sync_requests.lock);
+	if (exist) {
+		RTE_LOG(ERR, EAL, "A pending request %s:%s\n", dst, req->name);
+		rte_errno = -EEXIST;
+		return -1;
+	}
+
+	ret = send_msg(dst, req, MP_REQ);
+	if (ret < 0) {
+		RTE_LOG(ERR, EAL, "Fail to send request %s:%s\n",
+			dst, req->name);
+		return -1;
+	} else if (ret == 0)
+		return 0;
+
+	reply->nb_sent++;
+
+	pthread_mutex_lock(&sync_requests.lock);
+	do {
+		pthread_cond_timedwait(&sync_req.cond, &sync_requests.lock, ts);
+		/* Check spurious wakeups */
+		if (sync_req.reply_received == 1)
+			break;
+		/* Check if time is out */
+		if (gettimeofday(&now, NULL) < 0)
+			break;
+		if (now.tv_sec < ts->tv_sec)
+			break;
+		else if (now.tv_sec == ts->tv_sec &&
+			 now.tv_usec * 1000 < ts->tv_nsec)
+			break;
+	} while (1);
+	/* We got the lock now */
+	TAILQ_REMOVE(&sync_requests.requests, &sync_req, next);
+	pthread_mutex_unlock(&sync_requests.lock);
+
+	if (sync_req.reply_received == 0) {
+		RTE_LOG(ERR, EAL, "Fail to recv reply for request %s:%s\n",
+			dst, req->name);
+		rte_errno = -ETIMEDOUT;
+		return -1;
+	}
+
+	tmp = realloc(reply->msgs, sizeof(msg) * (reply->nb_received + 1));
+	if (!tmp) {
+		RTE_LOG(ERR, EAL, "Fail to alloc reply for request %s:%s\n",
+			dst, req->name);
+		rte_errno = -ENOMEM;
+		return -1;
+	}
+	memcpy(&tmp[reply->nb_received], &msg, sizeof(msg));
+	reply->msgs = tmp;
+	reply->nb_received++;
+	return 0;
+}
+
+int __rte_experimental
+rte_mp_request(struct rte_mp_msg *req, struct rte_mp_reply *reply,
+		const struct timespec *ts)
+{
+	int ret = 0;
+	DIR *mp_dir;
+	struct dirent *ent;
+	struct timeval now;
+	struct timespec end;
+
+	RTE_LOG(DEBUG, EAL, "request: %s\n", req->name);
+
+	if (check_input(req) == false)
+		return -1;
+	if (gettimeofday(&now, NULL) < 0) {
+		RTE_LOG(ERR, EAL, "Faile to get current time\n");
+		rte_errno = errno;
+		return -1;
+	}
+
+	end.tv_nsec = (now.tv_usec * 1000 + ts->tv_nsec) % 1000000000;
+	end.tv_sec = now.tv_sec + ts->tv_sec +
+			(now.tv_usec * 1000 + ts->tv_nsec) / 1000000000;
+
+	reply->nb_sent = 0;
+	reply->nb_received = 0;
+	reply->msgs = NULL;
+
+	/* for secondary process, send request to the primary process only */
+	if (rte_eal_process_type() == RTE_PROC_SECONDARY)
+		return mp_request_one(eal_mp_socket_path(), req, reply, &end);
+
+	/* for primary process, broadcast request, and collect reply 1 by 1 */
+	mp_dir = opendir(mp_dir_path);
+	if (!mp_dir) {
+		RTE_LOG(ERR, EAL, "Unable to open directory %s\n", mp_dir_path);
+		rte_errno = errno;
+		return -1;
+	}
+
+	while ((ent = readdir(mp_dir))) {
+		if (fnmatch(mp_filter, ent->d_name, 0) != 0)
+			continue;
+
+		if (mp_request_one(ent->d_name, req, reply, &end))
+			ret = -1;
+	}
+
+	closedir(mp_dir);
+	return ret;
+}
+
+int __rte_experimental
+rte_mp_reply(struct rte_mp_msg *msg, const char *peer)
+{
+
+	RTE_LOG(DEBUG, EAL, "reply: %s\n", msg->name);
+
+	if (check_input(msg) == false)
+		return -1;
+
+	if (peer == NULL) {
+		RTE_LOG(ERR, EAL, "peer is not specified\n");
+		rte_errno = -EINVAL;
+		return -1;
+	}
+
+	return mp_send(msg, peer, MP_REP);
 }
diff --git a/lib/librte_eal/common/include/rte_eal.h b/lib/librte_eal/common/include/rte_eal.h
index 2d022c0..08c6637 100644
--- a/lib/librte_eal/common/include/rte_eal.h
+++ b/lib/librte_eal/common/include/rte_eal.h
@@ -13,6 +13,7 @@
 
 #include <stdint.h>
 #include <sched.h>
+#include <time.h>
 
 #include <rte_config.h>
 #include <rte_compat.h>
@@ -214,13 +215,19 @@ struct rte_mp_msg {
 	int fds[RTE_MP_MAX_FD_NUM];
 };
 
+struct rte_mp_reply {
+	int nb_sent;
+	int nb_received;
+	struct rte_mp_msg *msgs; /* caller to free */
+};
+
 /**
  * Action function typedef used by other components.
  *
  * As we create  socket channel for primary/secondary communication, use
  * this function typedef to register action for coming messages.
  */
-typedef int (*rte_mp_t)(const struct rte_mp_msg *msg);
+typedef int (*rte_mp_t)(const struct rte_mp_msg *msg, const void *peer);
 
 /**
  * @warning
@@ -282,6 +289,57 @@ int __rte_experimental
 rte_mp_sendmsg(struct rte_mp_msg *msg);
 
 /**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Send a request to the peer process and expect a reply.
+ *
+ * This function sends a request message to the peer process, and will
+ * block until receiving reply message from the peer process.
+ *
+ * @note The caller is responsible for freeing reply->msgs.
+ *
+ * @param req
+ *   The req argument contains the customized request message.
+ *
+ * @param reply
+ *   The reply argument is used to store all the reply messages;
+ *   the caller is responsible for freeing reply->msgs.
+ *
+ * @param ts
+ *   The ts argument specifies how long we can wait for the peer(s) to reply.
+ *
+ * @return
+ *  - On success, return 0.
+ *  - On failure, return -1, and the reason will be stored in rte_errno.
+ */
+int __rte_experimental
+rte_mp_request(struct rte_mp_msg *req, struct rte_mp_reply *reply,
+	       const struct timespec *ts);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Send a reply to the peer process.
+ *
+ * This function will send a reply message in response to a request message
+ * received previously.
+ *
+ * @param msg
+ *   The msg argument contains the customized message.
+ *
+ * @param peer
+ *   The peer argument is the pointer to the peer socket path.
+ *
+ * @return
+ *  - On success, return 0.
+ *  - On failure, return -1, and the reason will be stored in rte_errno.
+ */
+int __rte_experimental
+rte_mp_reply(struct rte_mp_msg *msg, const char *peer);
+
+/**
  * Usage function typedef used by the application usage function.
  *
  * Use this function typedef to define and call rte_set_application_usage_hook()
diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
index 24deaef..4146907 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -223,6 +223,8 @@ EXPERIMENTAL {
 	rte_mp_action_register;
 	rte_mp_action_unregister;
 	rte_mp_sendmsg;
+	rte_mp_request;
+	rte_mp_reply;
 	rte_service_attr_get;
 	rte_service_attr_reset_all;
 	rte_service_component_register;
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 88+ messages in thread

* Re: [PATCH v7 0/2] generic channel for multi-process communication
  2018-01-30  6:58 ` [PATCH v7 " Jianfeng Tan
  2018-01-30  6:58   ` [PATCH v7 1/2] eal: add " Jianfeng Tan
  2018-01-30  6:58   ` [PATCH v7 2/2] eal: add synchronous " Jianfeng Tan
@ 2018-01-30 14:46   ` Thomas Monjalon
  2 siblings, 0 replies; 88+ messages in thread
From: Thomas Monjalon @ 2018-01-30 14:46 UTC (permalink / raw)
  To: Jianfeng Tan; +Cc: dev, anatoly.burakov, bruce.richardson, konstantin.ananyev

30/01/2018 07:58, Jianfeng Tan:
> This patchset adds a generic channel for multi-process (primary/secondary)
> communication.

Applied, thanks

^ permalink raw reply	[flat|nested] 88+ messages in thread

* [PATCH v5] vfio: change to use generic multi-process channel
  2018-01-11  4:07   ` [PATCH v2 4/4] vfio: use the generic multi-process channel Jianfeng Tan
  2018-01-13 14:03     ` Burakov, Anatoly
@ 2018-03-04 14:57     ` Jianfeng Tan
  2018-03-14 13:27       ` Burakov, Anatoly
  2018-03-20  8:50     ` [PATCH v6] " Jianfeng Tan
  2018-04-15 15:06     ` [PATCH v7] " Jianfeng Tan
  3 siblings, 1 reply; 88+ messages in thread
From: Jianfeng Tan @ 2018-03-04 14:57 UTC (permalink / raw)
  To: dev
  Cc: anatoly.burakov, bruce.richardson, konstantin.ananyev, thomas,
	Jianfeng Tan

Previously, vfio used its own private channel for the secondary
process to get container fd and group fd from the primary process.

This patch changes vfio to use the generic mp channel.

Test:
  1. Bind two NICs to vfio-pci.

  2. Start the primary and secondary process.
    $ (symmetric_mp) -c 2 -- -p 3 --num-procs=2 --proc-id=0
    $ (symmetric_mp) -c 4 --proc-type=auto -- -p 3 \
				--num-procs=2 --proc-id=1

Cc: anatoly.burakov@intel.com

Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
---
 lib/librte_eal/linuxapp/eal/eal.c              |  14 +-
 lib/librte_eal/linuxapp/eal/eal_vfio.c         | 172 +++++------
 lib/librte_eal/linuxapp/eal/eal_vfio.h         |  15 +-
 lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c | 409 ++++---------------------
 4 files changed, 136 insertions(+), 474 deletions(-)

diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index 38306bf..4ca06f4 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -695,18 +695,8 @@ static int rte_eal_vfio_setup(void)
 		return -1;
 	vfio_enabled = rte_vfio_is_enabled("vfio");
 
-	if (vfio_enabled) {
-
-		/* if we are primary process, create a thread to communicate with
-		 * secondary processes. the thread will use a socket to wait for
-		 * requests from secondary process to send open file descriptors,
-		 * because VFIO does not allow multiple open descriptors on a group or
-		 * VFIO container.
-		 */
-		if (internal_config.process_type == RTE_PROC_PRIMARY &&
-				vfio_mp_sync_setup() < 0)
-			return -1;
-	}
+	if (vfio_enabled && vfio_mp_sync_setup() < 0)
+		return -1;
 
 	return 0;
 }
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c b/lib/librte_eal/linuxapp/eal/eal_vfio.c
index e44ae4d..d905e8e 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.c
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.c
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2014 Intel Corporation
+ * Copyright(c) 2010-2018 Intel Corporation
  */
 
 #include <string.h>
@@ -42,6 +42,10 @@ vfio_get_group_fd(int iommu_group_no)
 	int vfio_group_fd;
 	char filename[PATH_MAX];
 	struct vfio_group *cur_grp;
+	struct rte_mp_msg mp_req, *mp_rep;
+	struct rte_mp_reply mp_reply;
+	struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
+	struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
 
 	/* check if we already have the group descriptor open */
 	for (i = 0; i < VFIO_MAX_GROUPS; i++)
@@ -101,50 +105,31 @@ vfio_get_group_fd(int iommu_group_no)
 		return vfio_group_fd;
 	}
 	/* if we're in a secondary process, request group fd from the primary
-	 * process via our socket
+	 * process via mp channel
 	 */
-	else {
-		int socket_fd, ret;
-
-		socket_fd = vfio_mp_sync_connect_to_primary();
-
-		if (socket_fd < 0) {
-			RTE_LOG(ERR, EAL, "  cannot connect to primary process!\n");
-			return -1;
-		}
-		if (vfio_mp_sync_send_request(socket_fd, SOCKET_REQ_GROUP) < 0) {
-			RTE_LOG(ERR, EAL, "  cannot request container fd!\n");
-			close(socket_fd);
-			return -1;
-		}
-		if (vfio_mp_sync_send_request(socket_fd, iommu_group_no) < 0) {
-			RTE_LOG(ERR, EAL, "  cannot send group number!\n");
-			close(socket_fd);
-			return -1;
-		}
-		ret = vfio_mp_sync_receive_request(socket_fd);
-		switch (ret) {
-		case SOCKET_NO_FD:
-			close(socket_fd);
-			return 0;
-		case SOCKET_OK:
-			vfio_group_fd = vfio_mp_sync_receive_fd(socket_fd);
-			/* if we got the fd, store it and return it */
-			if (vfio_group_fd > 0) {
-				close(socket_fd);
-				cur_grp->group_no = iommu_group_no;
-				cur_grp->fd = vfio_group_fd;
-				vfio_cfg.vfio_active_groups++;
-				return vfio_group_fd;
-			}
-			/* fall-through on error */
-		default:
-			RTE_LOG(ERR, EAL, "  cannot get container fd!\n");
-			close(socket_fd);
-			return -1;
+	p->req = SOCKET_REQ_GROUP;
+	p->group_no = iommu_group_no;
+	strcpy(mp_req.name, "vfio");
+	mp_req.len_param = sizeof(*p);
+	mp_req.num_fds = 0;
+
+	vfio_group_fd = -1;
+	if (rte_mp_request(&mp_req, &mp_reply, &ts) == 0 &&
+	    mp_reply.nb_received == 1) {
+		mp_rep = &mp_reply.msgs[0];
+		p = (struct vfio_mp_param *)mp_rep->param;
+		if (p->result == SOCKET_OK && mp_rep->num_fds == 1) {
+			cur_grp->group_no = iommu_group_no;
+			vfio_group_fd = mp_rep->fds[0];
+			cur_grp->fd = vfio_group_fd;
+			vfio_cfg.vfio_active_groups++;
 		}
+		free(mp_reply.msgs);
 	}
-	return -1;
+
+	if (vfio_group_fd < 0)
+		RTE_LOG(ERR, EAL, "  cannot request group fd\n");
+	return vfio_group_fd;
 }
 
 
@@ -200,7 +185,10 @@ int
 rte_vfio_clear_group(int vfio_group_fd)
 {
 	int i;
-	int socket_fd, ret;
+	struct rte_mp_msg mp_req, *mp_rep;
+	struct rte_mp_reply mp_reply;
+	struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
+	struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
 
 	if (internal_config.process_type == RTE_PROC_PRIMARY) {
 
@@ -214,43 +202,24 @@ rte_vfio_clear_group(int vfio_group_fd)
 		return 0;
 	}
 
-	/* This is just for SECONDARY processes */
-	socket_fd = vfio_mp_sync_connect_to_primary();
-
-	if (socket_fd < 0) {
-		RTE_LOG(ERR, EAL, "  cannot connect to primary process!\n");
-		return -1;
-	}
-
-	if (vfio_mp_sync_send_request(socket_fd, SOCKET_CLR_GROUP) < 0) {
-		RTE_LOG(ERR, EAL, "  cannot request container fd!\n");
-		close(socket_fd);
-		return -1;
-	}
+	p->req = SOCKET_CLR_GROUP;
+	p->group_no = vfio_group_fd;
+	strcpy(mp_req.name, "vfio");
+	mp_req.len_param = sizeof(*p);
+	mp_req.num_fds = 0;
 
-	if (vfio_mp_sync_send_request(socket_fd, vfio_group_fd) < 0) {
-		RTE_LOG(ERR, EAL, "  cannot send group fd!\n");
-		close(socket_fd);
-		return -1;
+	if (rte_mp_request(&mp_req, &mp_reply, &ts) == 0 &&
+	    mp_reply.nb_received == 1) {
+		mp_rep = &mp_reply.msgs[0];
+		p = (struct vfio_mp_param *)mp_rep->param;
+		if (p->result == SOCKET_OK) {
+			free(mp_reply.msgs);
+			return 0;
+		}
+		free(mp_reply.msgs);
 	}
 
-	ret = vfio_mp_sync_receive_request(socket_fd);
-	switch (ret) {
-	case SOCKET_NO_FD:
-		RTE_LOG(ERR, EAL, "  BAD VFIO group fd!\n");
-		close(socket_fd);
-		break;
-	case SOCKET_OK:
-		close(socket_fd);
-		return 0;
-	case SOCKET_ERR:
-		RTE_LOG(ERR, EAL, "  Socket error\n");
-		close(socket_fd);
-		break;
-	default:
-		RTE_LOG(ERR, EAL, "  UNKNOWN reply, %d\n", ret);
-		close(socket_fd);
-	}
+	RTE_LOG(ERR, EAL, "  BAD VFIO group fd!\n");
 	return -1;
 }
 
@@ -561,6 +530,11 @@ int
 vfio_get_container_fd(void)
 {
 	int ret, vfio_container_fd;
+	struct rte_mp_msg mp_req, *mp_rep;
+	struct rte_mp_reply mp_reply;
+	struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
+	struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
+
 
 	/* if we're in a primary process, try to open the container */
 	if (internal_config.process_type == RTE_PROC_PRIMARY) {
@@ -591,33 +565,29 @@ vfio_get_container_fd(void)
 		}
 
 		return vfio_container_fd;
-	} else {
-		/*
-		 * if we're in a secondary process, request container fd from the
-		 * primary process via our socket
-		 */
-		int socket_fd;
-
-		socket_fd = vfio_mp_sync_connect_to_primary();
-		if (socket_fd < 0) {
-			RTE_LOG(ERR, EAL, "  cannot connect to primary process!\n");
-			return -1;
-		}
-		if (vfio_mp_sync_send_request(socket_fd, SOCKET_REQ_CONTAINER) < 0) {
-			RTE_LOG(ERR, EAL, "  cannot request container fd!\n");
-			close(socket_fd);
-			return -1;
-		}
-		vfio_container_fd = vfio_mp_sync_receive_fd(socket_fd);
-		if (vfio_container_fd < 0) {
-			RTE_LOG(ERR, EAL, "  cannot get container fd!\n");
-			close(socket_fd);
-			return -1;
+	}
+	/*
+	 * if we're in a secondary process, request container fd from the
+	 * primary process via mp channel
+	 */
+	p->req = SOCKET_REQ_CONTAINER;
+	strcpy(mp_req.name, "vfio");
+	mp_req.len_param = sizeof(*p);
+	mp_req.num_fds = 0;
+
+	vfio_container_fd = -1;
+	if (rte_mp_request(&mp_req, &mp_reply, &ts) == 0 &&
+	    mp_reply.nb_received == 1) {
+		mp_rep = &mp_reply.msgs[0];
+		p = (struct vfio_mp_param *)mp_rep->param;
+		if (p->result == SOCKET_OK && mp_rep->num_fds == 1) {
+			free(mp_reply.msgs);
+			return mp_rep->fds[0];
 		}
-		close(socket_fd);
-		return vfio_container_fd;
+		free(mp_reply.msgs);
 	}
 
+	RTE_LOG(ERR, EAL, "  cannot request container fd\n");
 	return -1;
 }
 
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.h b/lib/librte_eal/linuxapp/eal/eal_vfio.h
index 8059577..6b48969 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.h
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.h
@@ -88,15 +88,6 @@ struct vfio_iommu_spapr_tce_info {
 #define VFIO_MAX_GROUPS RTE_MAX_VFIO_GROUPS
 
 /*
- * Function prototypes for VFIO multiprocess sync functions
- */
-int vfio_mp_sync_send_request(int socket, int req);
-int vfio_mp_sync_receive_request(int socket);
-int vfio_mp_sync_send_fd(int socket, int fd);
-int vfio_mp_sync_receive_fd(int socket);
-int vfio_mp_sync_connect_to_primary(void);
-
-/*
  * we don't need to store device fd's anywhere since they can be obtained from
  * the group fd via an ioctl() call.
  */
@@ -157,6 +148,12 @@ int vfio_mp_sync_setup(void);
 #define SOCKET_NO_FD 0x1
 #define SOCKET_ERR 0xFF
 
+struct vfio_mp_param {
+	int req;
+	int result;
+	int group_no;
+};
+
 #endif /* VFIO_PRESENT */
 
 #endif /* EAL_VFIO_H_ */
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c b/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
index 7cc3c15..c61cdb9 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
@@ -1,32 +1,15 @@
 /* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2014 Intel Corporation
+ * Copyright(c) 2010-2018 Intel Corporation
  */
 
+#include <unistd.h>
 #include <string.h>
-#include <fcntl.h>
-#include <sys/socket.h>
-#include <pthread.h>
-
-/* sys/un.h with __USE_MISC uses strlen, which is unsafe */
-#ifdef __USE_MISC
-#define REMOVED_USE_MISC
-#undef __USE_MISC
-#endif
-#include <sys/un.h>
-/* make sure we redefine __USE_MISC only if it was previously undefined */
-#ifdef REMOVED_USE_MISC
-#define __USE_MISC
-#undef REMOVED_USE_MISC
-#endif
 
 #include <rte_log.h>
-#include <rte_eal_memconfig.h>
-#include <rte_malloc.h>
 #include <rte_vfio.h>
+#include <rte_eal.h>
 
-#include "eal_filesystem.h"
 #include "eal_vfio.h"
-#include "eal_thread.h"
 
 /**
  * @file
@@ -37,358 +20,80 @@
 
 #ifdef VFIO_PRESENT
 
-#define SOCKET_PATH_FMT "%s/.%s_mp_socket"
-#define CMSGLEN (CMSG_LEN(sizeof(int)))
-#define FD_TO_CMSGHDR(fd, chdr) \
-		do {\
-			(chdr).cmsg_len = CMSGLEN;\
-			(chdr).cmsg_level = SOL_SOCKET;\
-			(chdr).cmsg_type = SCM_RIGHTS;\
-			memcpy((chdr).__cmsg_data, &(fd), sizeof(fd));\
-		} while (0)
-#define CMSGHDR_TO_FD(chdr, fd) \
-			memcpy(&(fd), (chdr).__cmsg_data, sizeof(fd))
-
-static pthread_t socket_thread;
-static int mp_socket_fd;
-
-
-/* get socket path (/var/run if root, $HOME otherwise) */
-static void
-get_socket_path(char *buffer, int bufsz)
-{
-	const char *dir = "/var/run";
-	const char *home_dir = getenv("HOME");
-
-	if (getuid() != 0 && home_dir != NULL)
-		dir = home_dir;
-
-	/* use current prefix as file path */
-	snprintf(buffer, bufsz, SOCKET_PATH_FMT, dir,
-			internal_config.hugefile_prefix);
-}
-
-
-
-/*
- * data flow for socket comm protocol:
- * 1. client sends SOCKET_REQ_CONTAINER or SOCKET_REQ_GROUP
- * 1a. in case of SOCKET_REQ_GROUP, client also then sends group number
- * 2. server receives message
- * 2a. in case of invalid group, SOCKET_ERR is sent back to client
- * 2b. in case of unbound group, SOCKET_NO_FD is sent back to client
- * 2c. in case of valid group, SOCKET_OK is sent and immediately followed by fd
- *
- * in case of any error, socket is closed.
- */
-
-/* send a request, return -1 on error */
-int
-vfio_mp_sync_send_request(int socket, int req)
-{
-	struct msghdr hdr;
-	struct iovec iov;
-	int buf;
-	int ret;
-
-	memset(&hdr, 0, sizeof(hdr));
-
-	buf = req;
-
-	hdr.msg_iov = &iov;
-	hdr.msg_iovlen = 1;
-	iov.iov_base = (char *) &buf;
-	iov.iov_len = sizeof(buf);
-
-	ret = sendmsg(socket, &hdr, 0);
-	if (ret < 0)
-		return -1;
-	return 0;
-}
-
-/* receive a request and return it */
-int
-vfio_mp_sync_receive_request(int socket)
-{
-	int buf;
-	struct msghdr hdr;
-	struct iovec iov;
-	int ret, req;
-
-	memset(&hdr, 0, sizeof(hdr));
-
-	buf = SOCKET_ERR;
-
-	hdr.msg_iov = &iov;
-	hdr.msg_iovlen = 1;
-	iov.iov_base = (char *) &buf;
-	iov.iov_len = sizeof(buf);
-
-	ret = recvmsg(socket, &hdr, 0);
-	if (ret < 0)
-		return -1;
-
-	req = buf;
-
-	return req;
-}
-
-/* send OK in message, fd in control message */
-int
-vfio_mp_sync_send_fd(int socket, int fd)
+static int
+vfio_mp_primary(const struct rte_mp_msg *msg, const void *peer)
 {
-	int buf;
-	struct msghdr hdr;
-	struct cmsghdr *chdr;
-	char chdr_buf[CMSGLEN];
-	struct iovec iov;
+	int fd;
+	int num;
 	int ret;
+	struct rte_mp_msg reply;
+	struct vfio_mp_param *r = (struct vfio_mp_param *)reply.param;
+	const struct vfio_mp_param *m = (const struct vfio_mp_param *)msg->param;
 
-	chdr = (struct cmsghdr *) chdr_buf;
-	memset(chdr, 0, sizeof(chdr_buf));
-	memset(&hdr, 0, sizeof(hdr));
-
-	hdr.msg_iov = &iov;
-	hdr.msg_iovlen = 1;
-	iov.iov_base = (char *) &buf;
-	iov.iov_len = sizeof(buf);
-	hdr.msg_control = chdr;
-	hdr.msg_controllen = CMSGLEN;
-
-	buf = SOCKET_OK;
-	FD_TO_CMSGHDR(fd, *chdr);
-
-	ret = sendmsg(socket, &hdr, 0);
-	if (ret < 0)
-		return -1;
-	return 0;
-}
-
-/* receive OK in message, fd in control message */
-int
-vfio_mp_sync_receive_fd(int socket)
-{
-	int buf;
-	struct msghdr hdr;
-	struct cmsghdr *chdr;
-	char chdr_buf[CMSGLEN];
-	struct iovec iov;
-	int ret, req, fd;
-
-	buf = SOCKET_ERR;
-
-	chdr = (struct cmsghdr *) chdr_buf;
-	memset(chdr, 0, sizeof(chdr_buf));
-	memset(&hdr, 0, sizeof(hdr));
-
-	hdr.msg_iov = &iov;
-	hdr.msg_iovlen = 1;
-	iov.iov_base = (char *) &buf;
-	iov.iov_len = sizeof(buf);
-	hdr.msg_control = chdr;
-	hdr.msg_controllen = CMSGLEN;
-
-	ret = recvmsg(socket, &hdr, 0);
-	if (ret < 0)
-		return -1;
-
-	req = buf;
-
-	if (req != SOCKET_OK)
-		return -1;
-
-	CMSGHDR_TO_FD(*chdr, fd);
-
-	return fd;
-}
-
-/* connect socket_fd in secondary process to the primary process's socket */
-int
-vfio_mp_sync_connect_to_primary(void)
-{
-	struct sockaddr_un addr;
-	socklen_t sockaddr_len;
-	int socket_fd;
-
-	/* set up a socket */
-	socket_fd = socket(AF_UNIX, SOCK_SEQPACKET, 0);
-	if (socket_fd < 0) {
-		RTE_LOG(ERR, EAL, "Failed to create socket!\n");
+	if (msg->len_param != sizeof(*m)) {
+		RTE_LOG(ERR, EAL, "vfio received invalid message!\n");
 		return -1;
 	}
 
-	get_socket_path(addr.sun_path, sizeof(addr.sun_path));
-	addr.sun_family = AF_UNIX;
-
-	sockaddr_len = sizeof(struct sockaddr_un);
-
-	if (connect(socket_fd, (struct sockaddr *) &addr, sockaddr_len) == 0)
-		return socket_fd;
-
-	/* if connect failed */
-	close(socket_fd);
-	return -1;
-}
-
-
-
-/*
- * socket listening thread for primary process
- */
-static __attribute__((noreturn)) void *
-vfio_mp_sync_thread(void __rte_unused * arg)
-{
-	int ret, fd, vfio_data;
-
-	/* wait for requests on the socket */
-	for (;;) {
-		int conn_sock;
-		struct sockaddr_un addr;
-		socklen_t sockaddr_len = sizeof(addr);
-
-		/* this is a blocking call */
-		conn_sock = accept(mp_socket_fd, (struct sockaddr *) &addr,
-				&sockaddr_len);
-
-		/* just restart on error */
-		if (conn_sock == -1)
-			continue;
-
-		/* set socket to linger after close */
-		struct linger l;
-		l.l_onoff = 1;
-		l.l_linger = 60;
-
-		if (setsockopt(conn_sock, SOL_SOCKET, SO_LINGER, &l, sizeof(l)) < 0)
-			RTE_LOG(WARNING, EAL, "Cannot set SO_LINGER option "
-					"on listen socket (%s)\n", strerror(errno));
-
-		ret = vfio_mp_sync_receive_request(conn_sock);
-
-		switch (ret) {
-		case SOCKET_REQ_CONTAINER:
-			fd = vfio_get_container_fd();
-			if (fd < 0)
-				vfio_mp_sync_send_request(conn_sock, SOCKET_ERR);
-			else
-				vfio_mp_sync_send_fd(conn_sock, fd);
-			if (fd >= 0)
-				close(fd);
-			break;
-		case SOCKET_REQ_GROUP:
-			/* wait for group number */
-			vfio_data = vfio_mp_sync_receive_request(conn_sock);
-			if (vfio_data < 0) {
-				close(conn_sock);
-				continue;
-			}
-
-			fd = vfio_get_group_fd(vfio_data);
+	memset(&reply, 0, sizeof(reply));
 
-			if (fd < 0)
-				vfio_mp_sync_send_request(conn_sock, SOCKET_ERR);
+	switch (m->req) {
+	case SOCKET_REQ_GROUP:
+		r->req = SOCKET_REQ_GROUP;
+		r->group_no = m->group_no;
+		fd = vfio_get_group_fd(m->group_no);
+		if (fd < 0)
+			r->result = SOCKET_ERR;
+		else if (fd == 0)
 			/* if VFIO group exists but isn't bound to VFIO driver */
-			else if (fd == 0)
-				vfio_mp_sync_send_request(conn_sock, SOCKET_NO_FD);
+			r->result = SOCKET_NO_FD;
+		else {
 			/* if group exists and is bound to VFIO driver */
-			else {
-				vfio_mp_sync_send_request(conn_sock, SOCKET_OK);
-				vfio_mp_sync_send_fd(conn_sock, fd);
-			}
-			break;
-		case SOCKET_CLR_GROUP:
-			/* wait for group fd */
-			vfio_data = vfio_mp_sync_receive_request(conn_sock);
-			if (vfio_data < 0) {
-				close(conn_sock);
-				continue;
-			}
-
-			ret = rte_vfio_clear_group(vfio_data);
-
-			if (ret < 0)
-				vfio_mp_sync_send_request(conn_sock, SOCKET_NO_FD);
-			else
-				vfio_mp_sync_send_request(conn_sock, SOCKET_OK);
-			break;
-		default:
-			vfio_mp_sync_send_request(conn_sock, SOCKET_ERR);
-			break;
+			r->result = SOCKET_OK;
+			num = 1;
 		}
-		close(conn_sock);
-	}
-}
-
-static int
-vfio_mp_sync_socket_setup(void)
-{
-	int ret, socket_fd;
-	struct sockaddr_un addr;
-	socklen_t sockaddr_len;
-
-	/* set up a socket */
-	socket_fd = socket(AF_UNIX, SOCK_SEQPACKET, 0);
-	if (socket_fd < 0) {
-		RTE_LOG(ERR, EAL, "Failed to create socket!\n");
-		return -1;
-	}
-
-	get_socket_path(addr.sun_path, sizeof(addr.sun_path));
-	addr.sun_family = AF_UNIX;
-
-	sockaddr_len = sizeof(struct sockaddr_un);
-
-	unlink(addr.sun_path);
-
-	ret = bind(socket_fd, (struct sockaddr *) &addr, sockaddr_len);
-	if (ret) {
-		RTE_LOG(ERR, EAL, "Failed to bind socket: %s!\n", strerror(errno));
-		close(socket_fd);
+		break;
+	case SOCKET_CLR_GROUP:
+		r->req = SOCKET_CLR_GROUP;
+		r->group_no = m->group_no;
+		if (rte_vfio_clear_group(m->group_no) < 0)
+			r->result = SOCKET_NO_FD;
+		else
+			r->result = SOCKET_OK;
+		break;
+	case SOCKET_REQ_CONTAINER:
+		r->req = SOCKET_REQ_CONTAINER;
+		fd = vfio_get_container_fd();
+		if (fd < 0)
+			r->result = SOCKET_ERR;
+		else {
+			r->result = SOCKET_OK;
+			num = 1;
+		}
+		break;
+	default:
+		RTE_LOG(ERR, EAL, "vfio received invalid message!\n");
 		return -1;
 	}
 
-	ret = listen(socket_fd, 50);
-	if (ret) {
-		RTE_LOG(ERR, EAL, "Failed to listen: %s!\n", strerror(errno));
-		close(socket_fd);
-		return -1;
+	if (num == 1) {
+		reply.num_fds = 1;
+		reply.fds[0] = fd;
 	}
+	strcpy(reply.name, "vfio");
+	reply.len_param = sizeof(*r);
 
-	/* save the socket in local configuration */
-	mp_socket_fd = socket_fd;
-
-	return 0;
+	ret = rte_mp_reply(&reply, peer);
+	if (m->req == SOCKET_REQ_CONTAINER && num == 1)
+		close(fd);
+	return ret;
 }
 
-/*
- * set up a local socket and tell it to listen for incoming connections
- */
 int
 vfio_mp_sync_setup(void)
 {
-	int ret;
-	char thread_name[RTE_MAX_THREAD_NAME_LEN];
-
-	if (vfio_mp_sync_socket_setup() < 0) {
-		RTE_LOG(ERR, EAL, "Failed to set up local socket!\n");
-		return -1;
-	}
-
-	ret = pthread_create(&socket_thread, NULL,
-			vfio_mp_sync_thread, NULL);
-	if (ret) {
-		RTE_LOG(ERR, EAL,
-			"Failed to create thread for communication with secondary processes!\n");
-		close(mp_socket_fd);
-		return -1;
-	}
-
-	/* Set thread_name for aid in debugging. */
-	snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "vfio-sync");
-	ret = rte_thread_setname(socket_thread, thread_name);
-	if (ret)
-		RTE_LOG(DEBUG, EAL,
-			"Failed to set thread name for secondary processes!\n");
+	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+		return rte_mp_action_register("vfio", vfio_mp_primary);
 
 	return 0;
 }
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 88+ messages in thread

* Re: [PATCH v5] vfio: change to use generic multi-process channel
  2018-03-04 14:57     ` [PATCH v5] vfio: change to use " Jianfeng Tan
@ 2018-03-14 13:27       ` Burakov, Anatoly
  2018-03-19  6:53         ` Tan, Jianfeng
  0 siblings, 1 reply; 88+ messages in thread
From: Burakov, Anatoly @ 2018-03-14 13:27 UTC (permalink / raw)
  To: Jianfeng Tan, dev; +Cc: bruce.richardson, konstantin.ananyev, thomas

On 04-Mar-18 2:57 PM, Jianfeng Tan wrote:
> Previously, vfio uses its own private channel for the secondary
> process to get container fd and group fd from the primary process.
> 
> This patch changes to use the generic mp channel.
> 
> Test:
>    1. Bind two NICs to vfio-pci.
> 
>    2. Start the primary and secondary process.
>      $ (symmetric_mp) -c 2 -- -p 3 --num-procs=2 --proc-id=0
>      $ (symmetric_mp) -c 4 --proc-type=auto -- -p 3 \
> 				--num-procs=2 --proc-id=1
> 
> Cc: anatoly.burakov@intel.com
> 
> Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
> ---

<...>

> -		ret = vfio_mp_sync_receive_request(socket_fd);
> -		switch (ret) {
> -		case SOCKET_NO_FD:
> -			close(socket_fd);
> -			return 0;
> -		case SOCKET_OK:
> -			vfio_group_fd = vfio_mp_sync_receive_fd(socket_fd);
> -			/* if we got the fd, store it and return it */
> -			if (vfio_group_fd > 0) {
> -				close(socket_fd);
> -				cur_grp->group_no = iommu_group_no;
> -				cur_grp->fd = vfio_group_fd;
> -				vfio_cfg.vfio_active_groups++;
> -				return vfio_group_fd;
> -			}
> -			/* fall-through on error */
> -		default:
> -			RTE_LOG(ERR, EAL, "  cannot get container fd!\n");
> -			close(socket_fd);
> -			return -1;
> +	p->req = SOCKET_REQ_GROUP;
> +	p->group_no = iommu_group_no;
> +	strcpy(mp_req.name, "vfio");

"vfio" should probably be a #define. Also, i think the identifier is too 
short. Maybe "eal_vfio_mp_sync" or at least "eal_vfio" would be better?

> +	mp_req.len_param = sizeof(*p);
> +	mp_req.num_fds = 0;
> +
> +	vfio_group_fd = -1;
> +	if (rte_mp_request(&mp_req, &mp_reply, &ts) == 0 &&
> +	    mp_reply.nb_received == 1) {
> +		mp_rep = &mp_reply.msgs[0];
> +		p = (struct vfio_mp_param *)mp_rep->param;
> +		if (p->result == SOCKET_OK && mp_rep->num_fds == 1) {
> +			cur_grp->group_no = iommu_group_no;
> +			vfio_group_fd = mp_rep->fds[0];
> +			cur_grp->fd = vfio_group_fd;
> +			vfio_cfg.vfio_active_groups++;
>   		}
> +		free(mp_reply.msgs);
>   	}
> -	return -1;
> +
> +	if (vfio_group_fd < 0)
> +		RTE_LOG(ERR, EAL, "  cannot request group fd\n");
> +	return vfio_group_fd;

p->result can be SOCKET_NO_FD, in which case returned value should be 
zero. I think this is missing from this code. There probably should be 
an "else if (p->result == SOCKET_NO_FD)" clause that sets return value to 0.

You should be able to test this by trying to set up a device for VFIO 
that isn't bound to VFIO driver, in a secondary process.

>   }
>   
>   
> @@ -200,7 +185,10 @@ int
>   rte_vfio_clear_group(int vfio_group_fd)
>   {
>   	int i;
> -	int socket_fd, ret;
> +	struct rte_mp_msg mp_req, *mp_rep;
> +	struct rte_mp_reply mp_reply;
> +	struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
> +	struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
>   
>   	if (internal_config.process_type == RTE_PROC_PRIMARY) {
>   
> @@ -214,43 +202,24 @@ rte_vfio_clear_group(int vfio_group_fd)
>   		return 0;
>   	}
>   
> -	/* This is just for SECONDARY processes */
> -	socket_fd = vfio_mp_sync_connect_to_primary();
> -
> -	if (socket_fd < 0) {
> -		RTE_LOG(ERR, EAL, "  cannot connect to primary process!\n");
> -		return -1;
> -	}
> -
> -	if (vfio_mp_sync_send_request(socket_fd, SOCKET_CLR_GROUP) < 0) {
> -		RTE_LOG(ERR, EAL, "  cannot request container fd!\n");
> -		close(socket_fd);
> -		return -1;
> -	}
> +	p->req = SOCKET_CLR_GROUP;
> +	p->group_no = vfio_group_fd;
> +	strcpy(mp_req.name, "vfio");

Same here, please use a #define here.

> +	mp_req.len_param = sizeof(*p);
> +	mp_req.num_fds = 0;
>   
> -	if (vfio_mp_sync_send_request(socket_fd, vfio_group_fd) < 0) {
> -		RTE_LOG(ERR, EAL, "  cannot send group fd!\n");
> -		close(socket_fd);
> -		return -1;
> +	if (rte_mp_request(&mp_req, &mp_reply, &ts) == 0 &&
> +	    mp_reply.nb_received == 1) {
> +		mp_rep = &mp_reply.msgs[0];
> +		p = (struct vfio_mp_param *)mp_rep->param;
> +		if (p->result == SOCKET_OK) {
> +			free(mp_reply.msgs);
> +			return 0;
> +		}
> +		free(mp_reply.msgs);
>   	}
>   
> -	ret = vfio_mp_sync_receive_request(socket_fd);
> -	switch (ret) {
> -	case SOCKET_NO_FD:
> -		RTE_LOG(ERR, EAL, "  BAD VFIO group fd!\n");
> -		close(socket_fd);
> -		break;
> -	case SOCKET_OK:
> -		close(socket_fd);
> -		return 0;
> -	case SOCKET_ERR:
> -		RTE_LOG(ERR, EAL, "  Socket error\n");
> -		close(socket_fd);
> -		break;
> -	default:
> -		RTE_LOG(ERR, EAL, "  UNKNOWN reply, %d\n", ret);
> -		close(socket_fd);
> -	}
> +	RTE_LOG(ERR, EAL, "  BAD VFIO group fd!\n");

Old error messages distinguished between "bad VFIO group fd" and other 
errors. You should probably only output this message if the response was 
SOCKET_NO_FD, and output another message in case of other errors.

>   	return -1;
>   }
>   
> @@ -561,6 +530,11 @@ int
>   vfio_get_container_fd(void)
>   {

<...>

> -		vfio_container_fd = vfio_mp_sync_receive_fd(socket_fd);
> -		if (vfio_container_fd < 0) {
> -			RTE_LOG(ERR, EAL, "  cannot get container fd!\n");
> -			close(socket_fd);
> -			return -1;
> +	}
> +	/*
> +	 * if we're in a secondary process, request container fd from the
> +	 * primary process via mp channel
> +	 */
> +	p->req = SOCKET_REQ_CONTAINER;
> +	strcpy(mp_req.name, "vfio");

Same here, please use #define here.

> +	mp_req.len_param = sizeof(*p);
> +	mp_req.num_fds = 0;
> +
> +	vfio_container_fd = -1;
> +	if (rte_mp_request(&mp_req, &mp_reply, &ts) == 0 &&
> +	    mp_reply.nb_received == 1) {
> +		mp_rep = &mp_reply.msgs[0];
> +		p = (struct vfio_mp_param *)mp_rep->param;
> +		if (p->result == SOCKET_OK && mp_rep->num_fds == 1) {
> +			free(mp_reply.msgs);
> +			return mp_rep->fds[0];
>   		}
> -		close(socket_fd);
> -		return vfio_container_fd;
> +		free(mp_reply.msgs);

<...>

> -	/* set up a socket */
> -	socket_fd = socket(AF_UNIX, SOCK_SEQPACKET, 0);
> -	if (socket_fd < 0) {
> -		RTE_LOG(ERR, EAL, "Failed to create socket!\n");
> -		return -1;
> -	}
> -
> -	get_socket_path(addr.sun_path, sizeof(addr.sun_path));
> -	addr.sun_family = AF_UNIX;
> -
> -	sockaddr_len = sizeof(struct sockaddr_un);
> -
> -	unlink(addr.sun_path);
> -
> -	ret = bind(socket_fd, (struct sockaddr *) &addr, sockaddr_len);
> -	if (ret) {
> -		RTE_LOG(ERR, EAL, "Failed to bind socket: %s!\n", strerror(errno));
> -		close(socket_fd);
> +		break;
> +	case SOCKET_CLR_GROUP:
> +		r->req = SOCKET_CLR_GROUP;
> +		r->group_no = m->group_no;
> +		if (rte_vfio_clear_group(m->group_no) < 0)
> +			r->result = SOCKET_NO_FD;
> +		else
> +			r->result = SOCKET_OK;
> +		break;
> +	case SOCKET_REQ_CONTAINER:
> +		r->req = SOCKET_REQ_CONTAINER;
> +		fd = vfio_get_container_fd();
> +		if (fd < 0)
> +			r->result = SOCKET_ERR;
> +		else {
> +			r->result = SOCKET_OK;
> +			num = 1;
> +		}
> +		break;
> +	default:
> +		RTE_LOG(ERR, EAL, "vfio received invalid message!\n");
>   		return -1;
>   	}
>   
> -	ret = listen(socket_fd, 50);
> -	if (ret) {
> -		RTE_LOG(ERR, EAL, "Failed to listen: %s!\n", strerror(errno));
> -		close(socket_fd);
> -		return -1;
> +	if (num == 1) {
> +		reply.num_fds = 1;
> +		reply.fds[0] = fd;
>   	}

You're not saving any lines of code with this, but you are sacrificing 
code clarity :) I think this should be done inline, e.g. in "else" 
clause of SOCKET_REQ_CONTAINER and SOCKET_REQ_GROUP.

> +	strcpy(reply.name, "vfio");

Same here, please use #define.

> +	reply.len_param = sizeof(*r);
>   
> -	/* save the socket in local configuration */
> -	mp_socket_fd = socket_fd;
> -
> -	return 0;
> +	ret = rte_mp_reply(&reply, peer);
> +	if (m->req == SOCKET_REQ_CONTAINER && num == 1)

Why not just "fd >= 0"? No need for num variable then.

> +		close(fd);
> +	return ret;
>   }
>   
> -/*
> - * set up a local socket and tell it to listen for incoming connections
> - */
>   int
>   vfio_mp_sync_setup(void)
>   {
> -	int ret;
> -	char thread_name[RTE_MAX_THREAD_NAME_LEN];
> -
> -	if (vfio_mp_sync_socket_setup() < 0) {
> -		RTE_LOG(ERR, EAL, "Failed to set up local socket!\n");
> -		return -1;
> -	}
> -
> -	ret = pthread_create(&socket_thread, NULL,
> -			vfio_mp_sync_thread, NULL);
> -	if (ret) {
> -		RTE_LOG(ERR, EAL,
> -			"Failed to create thread for communication with secondary processes!\n");
> -		close(mp_socket_fd);
> -		return -1;
> -	}
> -
> -	/* Set thread_name for aid in debugging. */
> -	snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "vfio-sync");
> -	ret = rte_thread_setname(socket_thread, thread_name);
> -	if (ret)
> -		RTE_LOG(DEBUG, EAL,
> -			"Failed to set thread name for secondary processes!\n");
> +	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
> +		return rte_mp_action_register("vfio", vfio_mp_primary);

Same here, please use #define.

>   
>   	return 0;
>   }
> 

Thanks for doing this patch!

-- 
Thanks,
Anatoly

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [PATCH v5] vfio: change to use generic multi-process channel
  2018-03-14 13:27       ` Burakov, Anatoly
@ 2018-03-19  6:53         ` Tan, Jianfeng
  2018-03-20 10:33           ` Burakov, Anatoly
  0 siblings, 1 reply; 88+ messages in thread
From: Tan, Jianfeng @ 2018-03-19  6:53 UTC (permalink / raw)
  To: Burakov, Anatoly, dev; +Cc: Richardson, Bruce, Ananyev, Konstantin, thomas

Hi Anatoly,

Thank you for the review. All your comments will be addressed in the next version, except for the concern below, which might be taken care of in another patch if it also concerns you.

> -----Original Message-----
> From: Burakov, Anatoly
> Sent: Wednesday, March 14, 2018 9:27 PM
> To: Tan, Jianfeng; dev@dpdk.org
> Cc: Richardson, Bruce; Ananyev, Konstantin; thomas@monjalon.net
> Subject: Re: [PATCH v5] vfio: change to use generic multi-process channel
[...]
> 
> > +	mp_req.len_param = sizeof(*p);
> > +	mp_req.num_fds = 0;
> > +
> > +	vfio_group_fd = -1;
> > +	if (rte_mp_request(&mp_req, &mp_reply, &ts) == 0 &&
> > +	    mp_reply.nb_received == 1) {
> > +		mp_rep = &mp_reply.msgs[0];
> > +		p = (struct vfio_mp_param *)mp_rep->param;
> > +		if (p->result == SOCKET_OK && mp_rep->num_fds == 1) {
> > +			cur_grp->group_no = iommu_group_no;
> > +			vfio_group_fd = mp_rep->fds[0];
> > +			cur_grp->fd = vfio_group_fd;
> > +			vfio_cfg.vfio_active_groups++;
> >   		}
> > +		free(mp_reply.msgs);
> >   	}
> > -	return -1;
> > +
> > +	if (vfio_group_fd < 0)
> > +		RTE_LOG(ERR, EAL, "  cannot request group fd\n");
> > +	return vfio_group_fd;
> 
> p->result can be SOCKET_NO_FD, in which case returned value should be
> zero. I think this is missing from this code. There probably should be
> an "else if (p->result == SOCKET_NO_FD)" clause that sets return value to 0.
> 
> You should be able to test this by trying to set up a device for VFIO
> that isn't bound to VFIO driver, in a secondary process.

OK, I will fix this.

But really, "zero" could be ambiguous as a fd could, theoretically, be zero too.

Thanks,
Jianfeng

^ permalink raw reply	[flat|nested] 88+ messages in thread

* [PATCH v6] vfio: change to use generic multi-process channel
  2018-01-11  4:07   ` [PATCH v2 4/4] vfio: use the generic multi-process channel Jianfeng Tan
  2018-01-13 14:03     ` Burakov, Anatoly
  2018-03-04 14:57     ` [PATCH v5] vfio: change to use " Jianfeng Tan
@ 2018-03-20  8:50     ` Jianfeng Tan
  2018-04-05 14:26       ` Tan, Jianfeng
  2018-04-12 15:26       ` Burakov, Anatoly
  2018-04-15 15:06     ` [PATCH v7] " Jianfeng Tan
  3 siblings, 2 replies; 88+ messages in thread
From: Jianfeng Tan @ 2018-03-20  8:50 UTC (permalink / raw)
  To: dev
  Cc: anatoly.burakov, bruce.richardson, konstantin.ananyev, thomas,
	Jianfeng Tan

Previously, vfio used its own private channel for the secondary
process to get the container fd and group fd from the primary process.

This patch changes vfio to use the generic mp channel.

Test:
  1. Bind two NICs to vfio-pci.

  2. Start the primary and secondary process.
    $ (symmetric_mp) -c 2 -- -p 3 --num-procs=2 --proc-id=0
    $ (symmetric_mp) -c 4 --proc-type=auto -- -p 3 \
				--num-procs=2 --proc-id=1

Cc: anatoly.burakov@intel.com

Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
---
v5->v6: (Address comments from Anatoly)
  - Naming, return checking, logging.
  - Move vfio action register after rte_bus_probe().
 lib/librte_eal/linuxapp/eal/eal.c              |  22 +-
 lib/librte_eal/linuxapp/eal/eal_vfio.c         | 176 +++++------
 lib/librte_eal/linuxapp/eal/eal_vfio.h         |  17 +-
 lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c | 408 ++++---------------------
 4 files changed, 145 insertions(+), 478 deletions(-)

diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index 38306bf..fb41e97 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -689,24 +689,8 @@ rte_eal_iopl_init(void)
 #ifdef VFIO_PRESENT
 static int rte_eal_vfio_setup(void)
 {
-	int vfio_enabled = 0;
-
 	if (rte_vfio_enable("vfio"))
 		return -1;
-	vfio_enabled = rte_vfio_is_enabled("vfio");
-
-	if (vfio_enabled) {
-
-		/* if we are primary process, create a thread to communicate with
-		 * secondary processes. the thread will use a socket to wait for
-		 * requests from secondary process to send open file descriptors,
-		 * because VFIO does not allow multiple open descriptors on a group or
-		 * VFIO container.
-		 */
-		if (internal_config.process_type == RTE_PROC_PRIMARY &&
-				vfio_mp_sync_setup() < 0)
-			return -1;
-	}
 
 	return 0;
 }
@@ -950,6 +934,12 @@ rte_eal_init(int argc, char **argv)
 		return -1;
 	}
 
+#ifdef VFIO_PRESENT
+	/* Register mp action after probe() so that we got enough info */
+	if (rte_vfio_is_enabled("vfio") && vfio_mp_sync_setup() < 0)
+		return -1;
+#endif
+
 	/* initialize default service/lcore mappings and start running. Ignore
 	 * -ENOTSUP, as it indicates no service coremask passed to EAL.
 	 */
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c b/lib/librte_eal/linuxapp/eal/eal_vfio.c
index e44ae4d..9b97e5b 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.c
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.c
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2014 Intel Corporation
+ * Copyright(c) 2010-2018 Intel Corporation
  */
 
 #include <string.h>
@@ -42,6 +42,10 @@ vfio_get_group_fd(int iommu_group_no)
 	int vfio_group_fd;
 	char filename[PATH_MAX];
 	struct vfio_group *cur_grp;
+	struct rte_mp_msg mp_req, *mp_rep;
+	struct rte_mp_reply mp_reply;
+	struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
+	struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
 
 	/* check if we already have the group descriptor open */
 	for (i = 0; i < VFIO_MAX_GROUPS; i++)
@@ -101,50 +105,34 @@ vfio_get_group_fd(int iommu_group_no)
 		return vfio_group_fd;
 	}
 	/* if we're in a secondary process, request group fd from the primary
-	 * process via our socket
+	 * process via mp channel
 	 */
-	else {
-		int socket_fd, ret;
-
-		socket_fd = vfio_mp_sync_connect_to_primary();
-
-		if (socket_fd < 0) {
-			RTE_LOG(ERR, EAL, "  cannot connect to primary process!\n");
-			return -1;
-		}
-		if (vfio_mp_sync_send_request(socket_fd, SOCKET_REQ_GROUP) < 0) {
-			RTE_LOG(ERR, EAL, "  cannot request container fd!\n");
-			close(socket_fd);
-			return -1;
-		}
-		if (vfio_mp_sync_send_request(socket_fd, iommu_group_no) < 0) {
-			RTE_LOG(ERR, EAL, "  cannot send group number!\n");
-			close(socket_fd);
-			return -1;
-		}
-		ret = vfio_mp_sync_receive_request(socket_fd);
-		switch (ret) {
-		case SOCKET_NO_FD:
-			close(socket_fd);
-			return 0;
-		case SOCKET_OK:
-			vfio_group_fd = vfio_mp_sync_receive_fd(socket_fd);
-			/* if we got the fd, store it and return it */
-			if (vfio_group_fd > 0) {
-				close(socket_fd);
-				cur_grp->group_no = iommu_group_no;
-				cur_grp->fd = vfio_group_fd;
-				vfio_cfg.vfio_active_groups++;
-				return vfio_group_fd;
-			}
-			/* fall-through on error */
-		default:
-			RTE_LOG(ERR, EAL, "  cannot get container fd!\n");
-			close(socket_fd);
-			return -1;
+	p->req = SOCKET_REQ_GROUP;
+	p->group_no = iommu_group_no;
+	strcpy(mp_req.name, EAL_VFIO_MP);
+	mp_req.len_param = sizeof(*p);
+	mp_req.num_fds = 0;
+
+	vfio_group_fd = -1;
+	if (rte_mp_request(&mp_req, &mp_reply, &ts) == 0 &&
+	    mp_reply.nb_received == 1) {
+		mp_rep = &mp_reply.msgs[0];
+		p = (struct vfio_mp_param *)mp_rep->param;
+		if (p->result == SOCKET_OK && mp_rep->num_fds == 1) {
+			cur_grp->group_no = iommu_group_no;
+			vfio_group_fd = mp_rep->fds[0];
+			cur_grp->fd = vfio_group_fd;
+			vfio_cfg.vfio_active_groups++;
+		} else if (p->result == SOCKET_NO_FD) {
+			RTE_LOG(ERR, EAL, "  bad VFIO group fd\n");
+			vfio_group_fd = 0;
 		}
+		free(mp_reply.msgs);
 	}
-	return -1;
+
+	if (vfio_group_fd < 0)
+		RTE_LOG(ERR, EAL, "  cannot request group fd\n");
+	return vfio_group_fd;
 }
 
 
@@ -200,7 +188,10 @@ int
 rte_vfio_clear_group(int vfio_group_fd)
 {
 	int i;
-	int socket_fd, ret;
+	struct rte_mp_msg mp_req, *mp_rep;
+	struct rte_mp_reply mp_reply;
+	struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
+	struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
 
 	if (internal_config.process_type == RTE_PROC_PRIMARY) {
 
@@ -214,43 +205,27 @@ rte_vfio_clear_group(int vfio_group_fd)
 		return 0;
 	}
 
-	/* This is just for SECONDARY processes */
-	socket_fd = vfio_mp_sync_connect_to_primary();
-
-	if (socket_fd < 0) {
-		RTE_LOG(ERR, EAL, "  cannot connect to primary process!\n");
-		return -1;
-	}
+	p->req = SOCKET_CLR_GROUP;
+	p->group_no = vfio_group_fd;
+	strcpy(mp_req.name, EAL_VFIO_MP);
+	mp_req.len_param = sizeof(*p);
+	mp_req.num_fds = 0;
 
-	if (vfio_mp_sync_send_request(socket_fd, SOCKET_CLR_GROUP) < 0) {
-		RTE_LOG(ERR, EAL, "  cannot request container fd!\n");
-		close(socket_fd);
-		return -1;
-	}
+	if (rte_mp_request(&mp_req, &mp_reply, &ts) == 0 &&
+	    mp_reply.nb_received == 1) {
+		mp_rep = &mp_reply.msgs[0];
+		p = (struct vfio_mp_param *)mp_rep->param;
+		if (p->result == SOCKET_OK) {
+			free(mp_reply.msgs);
+			return 0;
+		} else if (p->result == SOCKET_NO_FD)
+			RTE_LOG(ERR, EAL, "  BAD VFIO group fd!\n");
+		else
+			RTE_LOG(ERR, EAL, "  no such VFIO group fd!\n");
 
-	if (vfio_mp_sync_send_request(socket_fd, vfio_group_fd) < 0) {
-		RTE_LOG(ERR, EAL, "  cannot send group fd!\n");
-		close(socket_fd);
-		return -1;
+		free(mp_reply.msgs);
 	}
 
-	ret = vfio_mp_sync_receive_request(socket_fd);
-	switch (ret) {
-	case SOCKET_NO_FD:
-		RTE_LOG(ERR, EAL, "  BAD VFIO group fd!\n");
-		close(socket_fd);
-		break;
-	case SOCKET_OK:
-		close(socket_fd);
-		return 0;
-	case SOCKET_ERR:
-		RTE_LOG(ERR, EAL, "  Socket error\n");
-		close(socket_fd);
-		break;
-	default:
-		RTE_LOG(ERR, EAL, "  UNKNOWN reply, %d\n", ret);
-		close(socket_fd);
-	}
 	return -1;
 }
 
@@ -561,6 +536,11 @@ int
 vfio_get_container_fd(void)
 {
 	int ret, vfio_container_fd;
+	struct rte_mp_msg mp_req, *mp_rep;
+	struct rte_mp_reply mp_reply;
+	struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
+	struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
+
 
 	/* if we're in a primary process, try to open the container */
 	if (internal_config.process_type == RTE_PROC_PRIMARY) {
@@ -591,33 +571,29 @@ vfio_get_container_fd(void)
 		}
 
 		return vfio_container_fd;
-	} else {
-		/*
-		 * if we're in a secondary process, request container fd from the
-		 * primary process via our socket
-		 */
-		int socket_fd;
-
-		socket_fd = vfio_mp_sync_connect_to_primary();
-		if (socket_fd < 0) {
-			RTE_LOG(ERR, EAL, "  cannot connect to primary process!\n");
-			return -1;
-		}
-		if (vfio_mp_sync_send_request(socket_fd, SOCKET_REQ_CONTAINER) < 0) {
-			RTE_LOG(ERR, EAL, "  cannot request container fd!\n");
-			close(socket_fd);
-			return -1;
-		}
-		vfio_container_fd = vfio_mp_sync_receive_fd(socket_fd);
-		if (vfio_container_fd < 0) {
-			RTE_LOG(ERR, EAL, "  cannot get container fd!\n");
-			close(socket_fd);
-			return -1;
+	}
+	/*
+	 * if we're in a secondary process, request container fd from the
+	 * primary process via mp channel
+	 */
+	p->req = SOCKET_REQ_CONTAINER;
+	strcpy(mp_req.name, EAL_VFIO_MP);
+	mp_req.len_param = sizeof(*p);
+	mp_req.num_fds = 0;
+
+	vfio_container_fd = -1;
+	if (rte_mp_request(&mp_req, &mp_reply, &ts) == 0 &&
+	    mp_reply.nb_received == 1) {
+		mp_rep = &mp_reply.msgs[0];
+		p = (struct vfio_mp_param *)mp_rep->param;
+		if (p->result == SOCKET_OK && mp_rep->num_fds == 1) {
+			free(mp_reply.msgs);
+			return mp_rep->fds[0];
 		}
-		close(socket_fd);
-		return vfio_container_fd;
+		free(mp_reply.msgs);
 	}
 
+	RTE_LOG(ERR, EAL, "  cannot request container fd\n");
 	return -1;
 }
 
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.h b/lib/librte_eal/linuxapp/eal/eal_vfio.h
index 8059577..be2a79b 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.h
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.h
@@ -88,15 +88,6 @@ struct vfio_iommu_spapr_tce_info {
 #define VFIO_MAX_GROUPS RTE_MAX_VFIO_GROUPS
 
 /*
- * Function prototypes for VFIO multiprocess sync functions
- */
-int vfio_mp_sync_send_request(int socket, int req);
-int vfio_mp_sync_receive_request(int socket);
-int vfio_mp_sync_send_fd(int socket, int fd);
-int vfio_mp_sync_receive_fd(int socket);
-int vfio_mp_sync_connect_to_primary(void);
-
-/*
  * we don't need to store device fd's anywhere since they can be obtained from
  * the group fd via an ioctl() call.
  */
@@ -150,6 +141,8 @@ vfio_get_group_fd(int iommu_group_no);
 
 int vfio_mp_sync_setup(void);
 
+#define EAL_VFIO_MP "eal_vfio_mp_sync"
+
 #define SOCKET_REQ_CONTAINER 0x100
 #define SOCKET_REQ_GROUP 0x200
 #define SOCKET_CLR_GROUP 0x300
@@ -157,6 +150,12 @@ int vfio_mp_sync_setup(void);
 #define SOCKET_NO_FD 0x1
 #define SOCKET_ERR 0xFF
 
+struct vfio_mp_param {
+	int req;
+	int result;
+	int group_no;
+};
+
 #endif /* VFIO_PRESENT */
 
 #endif /* EAL_VFIO_H_ */
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c b/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
index 7cc3c15..afa556f 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
@@ -1,32 +1,15 @@
 /* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2014 Intel Corporation
+ * Copyright(c) 2010-2018 Intel Corporation
  */
 
+#include <unistd.h>
 #include <string.h>
-#include <fcntl.h>
-#include <sys/socket.h>
-#include <pthread.h>
-
-/* sys/un.h with __USE_MISC uses strlen, which is unsafe */
-#ifdef __USE_MISC
-#define REMOVED_USE_MISC
-#undef __USE_MISC
-#endif
-#include <sys/un.h>
-/* make sure we redefine __USE_MISC only if it was previously undefined */
-#ifdef REMOVED_USE_MISC
-#define __USE_MISC
-#undef REMOVED_USE_MISC
-#endif
 
 #include <rte_log.h>
-#include <rte_eal_memconfig.h>
-#include <rte_malloc.h>
 #include <rte_vfio.h>
+#include <rte_eal.h>
 
-#include "eal_filesystem.h"
 #include "eal_vfio.h"
-#include "eal_thread.h"
 
 /**
  * @file
@@ -37,358 +20,77 @@
 
 #ifdef VFIO_PRESENT
 
-#define SOCKET_PATH_FMT "%s/.%s_mp_socket"
-#define CMSGLEN (CMSG_LEN(sizeof(int)))
-#define FD_TO_CMSGHDR(fd, chdr) \
-		do {\
-			(chdr).cmsg_len = CMSGLEN;\
-			(chdr).cmsg_level = SOL_SOCKET;\
-			(chdr).cmsg_type = SCM_RIGHTS;\
-			memcpy((chdr).__cmsg_data, &(fd), sizeof(fd));\
-		} while (0)
-#define CMSGHDR_TO_FD(chdr, fd) \
-			memcpy(&(fd), (chdr).__cmsg_data, sizeof(fd))
-
-static pthread_t socket_thread;
-static int mp_socket_fd;
-
-
-/* get socket path (/var/run if root, $HOME otherwise) */
-static void
-get_socket_path(char *buffer, int bufsz)
-{
-	const char *dir = "/var/run";
-	const char *home_dir = getenv("HOME");
-
-	if (getuid() != 0 && home_dir != NULL)
-		dir = home_dir;
-
-	/* use current prefix as file path */
-	snprintf(buffer, bufsz, SOCKET_PATH_FMT, dir,
-			internal_config.hugefile_prefix);
-}
-
-
-
-/*
- * data flow for socket comm protocol:
- * 1. client sends SOCKET_REQ_CONTAINER or SOCKET_REQ_GROUP
- * 1a. in case of SOCKET_REQ_GROUP, client also then sends group number
- * 2. server receives message
- * 2a. in case of invalid group, SOCKET_ERR is sent back to client
- * 2b. in case of unbound group, SOCKET_NO_FD is sent back to client
- * 2c. in case of valid group, SOCKET_OK is sent and immediately followed by fd
- *
- * in case of any error, socket is closed.
- */
-
-/* send a request, return -1 on error */
-int
-vfio_mp_sync_send_request(int socket, int req)
-{
-	struct msghdr hdr;
-	struct iovec iov;
-	int buf;
-	int ret;
-
-	memset(&hdr, 0, sizeof(hdr));
-
-	buf = req;
-
-	hdr.msg_iov = &iov;
-	hdr.msg_iovlen = 1;
-	iov.iov_base = (char *) &buf;
-	iov.iov_len = sizeof(buf);
-
-	ret = sendmsg(socket, &hdr, 0);
-	if (ret < 0)
-		return -1;
-	return 0;
-}
-
-/* receive a request and return it */
-int
-vfio_mp_sync_receive_request(int socket)
-{
-	int buf;
-	struct msghdr hdr;
-	struct iovec iov;
-	int ret, req;
-
-	memset(&hdr, 0, sizeof(hdr));
-
-	buf = SOCKET_ERR;
-
-	hdr.msg_iov = &iov;
-	hdr.msg_iovlen = 1;
-	iov.iov_base = (char *) &buf;
-	iov.iov_len = sizeof(buf);
-
-	ret = recvmsg(socket, &hdr, 0);
-	if (ret < 0)
-		return -1;
-
-	req = buf;
-
-	return req;
-}
-
-/* send OK in message, fd in control message */
-int
-vfio_mp_sync_send_fd(int socket, int fd)
+static int
+vfio_mp_primary(const struct rte_mp_msg *msg, const void *peer)
 {
-	int buf;
-	struct msghdr hdr;
-	struct cmsghdr *chdr;
-	char chdr_buf[CMSGLEN];
-	struct iovec iov;
+	int fd = -1;
 	int ret;
+	struct rte_mp_msg reply;
+	struct vfio_mp_param *r = (struct vfio_mp_param *)reply.param;
+	const struct vfio_mp_param *m = (const struct vfio_mp_param *)msg->param;
 
-	chdr = (struct cmsghdr *) chdr_buf;
-	memset(chdr, 0, sizeof(chdr_buf));
-	memset(&hdr, 0, sizeof(hdr));
-
-	hdr.msg_iov = &iov;
-	hdr.msg_iovlen = 1;
-	iov.iov_base = (char *) &buf;
-	iov.iov_len = sizeof(buf);
-	hdr.msg_control = chdr;
-	hdr.msg_controllen = CMSGLEN;
-
-	buf = SOCKET_OK;
-	FD_TO_CMSGHDR(fd, *chdr);
-
-	ret = sendmsg(socket, &hdr, 0);
-	if (ret < 0)
-		return -1;
-	return 0;
-}
-
-/* receive OK in message, fd in control message */
-int
-vfio_mp_sync_receive_fd(int socket)
-{
-	int buf;
-	struct msghdr hdr;
-	struct cmsghdr *chdr;
-	char chdr_buf[CMSGLEN];
-	struct iovec iov;
-	int ret, req, fd;
-
-	buf = SOCKET_ERR;
-
-	chdr = (struct cmsghdr *) chdr_buf;
-	memset(chdr, 0, sizeof(chdr_buf));
-	memset(&hdr, 0, sizeof(hdr));
-
-	hdr.msg_iov = &iov;
-	hdr.msg_iovlen = 1;
-	iov.iov_base = (char *) &buf;
-	iov.iov_len = sizeof(buf);
-	hdr.msg_control = chdr;
-	hdr.msg_controllen = CMSGLEN;
-
-	ret = recvmsg(socket, &hdr, 0);
-	if (ret < 0)
-		return -1;
-
-	req = buf;
-
-	if (req != SOCKET_OK)
-		return -1;
-
-	CMSGHDR_TO_FD(*chdr, fd);
-
-	return fd;
-}
-
-/* connect socket_fd in secondary process to the primary process's socket */
-int
-vfio_mp_sync_connect_to_primary(void)
-{
-	struct sockaddr_un addr;
-	socklen_t sockaddr_len;
-	int socket_fd;
-
-	/* set up a socket */
-	socket_fd = socket(AF_UNIX, SOCK_SEQPACKET, 0);
-	if (socket_fd < 0) {
-		RTE_LOG(ERR, EAL, "Failed to create socket!\n");
+	if (msg->len_param != sizeof(*m)) {
+		RTE_LOG(ERR, EAL, "vfio received invalid message!\n");
 		return -1;
 	}
 
-	get_socket_path(addr.sun_path, sizeof(addr.sun_path));
-	addr.sun_family = AF_UNIX;
-
-	sockaddr_len = sizeof(struct sockaddr_un);
-
-	if (connect(socket_fd, (struct sockaddr *) &addr, sockaddr_len) == 0)
-		return socket_fd;
-
-	/* if connect failed */
-	close(socket_fd);
-	return -1;
-}
-
+	memset(&reply, 0, sizeof(reply));
 
-
-/*
- * socket listening thread for primary process
- */
-static __attribute__((noreturn)) void *
-vfio_mp_sync_thread(void __rte_unused * arg)
-{
-	int ret, fd, vfio_data;
-
-	/* wait for requests on the socket */
-	for (;;) {
-		int conn_sock;
-		struct sockaddr_un addr;
-		socklen_t sockaddr_len = sizeof(addr);
-
-		/* this is a blocking call */
-		conn_sock = accept(mp_socket_fd, (struct sockaddr *) &addr,
-				&sockaddr_len);
-
-		/* just restart on error */
-		if (conn_sock == -1)
-			continue;
-
-		/* set socket to linger after close */
-		struct linger l;
-		l.l_onoff = 1;
-		l.l_linger = 60;
-
-		if (setsockopt(conn_sock, SOL_SOCKET, SO_LINGER, &l, sizeof(l)) < 0)
-			RTE_LOG(WARNING, EAL, "Cannot set SO_LINGER option "
-					"on listen socket (%s)\n", strerror(errno));
-
-		ret = vfio_mp_sync_receive_request(conn_sock);
-
-		switch (ret) {
-		case SOCKET_REQ_CONTAINER:
-			fd = vfio_get_container_fd();
-			if (fd < 0)
-				vfio_mp_sync_send_request(conn_sock, SOCKET_ERR);
-			else
-				vfio_mp_sync_send_fd(conn_sock, fd);
-			if (fd >= 0)
-				close(fd);
-			break;
-		case SOCKET_REQ_GROUP:
-			/* wait for group number */
-			vfio_data = vfio_mp_sync_receive_request(conn_sock);
-			if (vfio_data < 0) {
-				close(conn_sock);
-				continue;
-			}
-
-			fd = vfio_get_group_fd(vfio_data);
-
-			if (fd < 0)
-				vfio_mp_sync_send_request(conn_sock, SOCKET_ERR);
+	switch (m->req) {
+	case SOCKET_REQ_GROUP:
+		r->req = SOCKET_REQ_GROUP;
+		r->group_no = m->group_no;
+		fd = vfio_get_group_fd(m->group_no);
+		if (fd < 0)
+			r->result = SOCKET_ERR;
+		else if (fd == 0)
 			/* if VFIO group exists but isn't bound to VFIO driver */
-			else if (fd == 0)
-				vfio_mp_sync_send_request(conn_sock, SOCKET_NO_FD);
+			r->result = SOCKET_NO_FD;
+		else {
 			/* if group exists and is bound to VFIO driver */
-			else {
-				vfio_mp_sync_send_request(conn_sock, SOCKET_OK);
-				vfio_mp_sync_send_fd(conn_sock, fd);
-			}
-			break;
-		case SOCKET_CLR_GROUP:
-			/* wait for group fd */
-			vfio_data = vfio_mp_sync_receive_request(conn_sock);
-			if (vfio_data < 0) {
-				close(conn_sock);
-				continue;
-			}
-
-			ret = rte_vfio_clear_group(vfio_data);
-
-			if (ret < 0)
-				vfio_mp_sync_send_request(conn_sock, SOCKET_NO_FD);
-			else
-				vfio_mp_sync_send_request(conn_sock, SOCKET_OK);
-			break;
-		default:
-			vfio_mp_sync_send_request(conn_sock, SOCKET_ERR);
-			break;
+			r->result = SOCKET_OK;
+			reply.num_fds = 1;
+			reply.fds[0] = fd;
 		}
-		close(conn_sock);
-	}
-}
-
-static int
-vfio_mp_sync_socket_setup(void)
-{
-	int ret, socket_fd;
-	struct sockaddr_un addr;
-	socklen_t sockaddr_len;
-
-	/* set up a socket */
-	socket_fd = socket(AF_UNIX, SOCK_SEQPACKET, 0);
-	if (socket_fd < 0) {
-		RTE_LOG(ERR, EAL, "Failed to create socket!\n");
-		return -1;
-	}
-
-	get_socket_path(addr.sun_path, sizeof(addr.sun_path));
-	addr.sun_family = AF_UNIX;
-
-	sockaddr_len = sizeof(struct sockaddr_un);
-
-	unlink(addr.sun_path);
-
-	ret = bind(socket_fd, (struct sockaddr *) &addr, sockaddr_len);
-	if (ret) {
-		RTE_LOG(ERR, EAL, "Failed to bind socket: %s!\n", strerror(errno));
-		close(socket_fd);
-		return -1;
-	}
-
-	ret = listen(socket_fd, 50);
-	if (ret) {
-		RTE_LOG(ERR, EAL, "Failed to listen: %s!\n", strerror(errno));
-		close(socket_fd);
+		break;
+	case SOCKET_CLR_GROUP:
+		r->req = SOCKET_CLR_GROUP;
+		r->group_no = m->group_no;
+		if (rte_vfio_clear_group(m->group_no) < 0)
+			r->result = SOCKET_NO_FD;
+		else
+			r->result = SOCKET_OK;
+		break;
+	case SOCKET_REQ_CONTAINER:
+		r->req = SOCKET_REQ_CONTAINER;
+		fd = vfio_get_container_fd();
+		if (fd < 0)
+			r->result = SOCKET_ERR;
+		else {
+			r->result = SOCKET_OK;
+			reply.num_fds = 1;
+			reply.fds[0] = fd;
+		}
+		break;
+	default:
+		RTE_LOG(ERR, EAL, "vfio received invalid message!\n");
 		return -1;
 	}
 
-	/* save the socket in local configuration */
-	mp_socket_fd = socket_fd;
+	strcpy(reply.name, EAL_VFIO_MP);
+	reply.len_param = sizeof(*r);
 
-	return 0;
+	ret = rte_mp_reply(&reply, peer);
+	if (m->req == SOCKET_REQ_CONTAINER && fd >= 0)
+		close(fd);
+	return ret;
 }
 
-/*
- * set up a local socket and tell it to listen for incoming connections
- */
 int
 vfio_mp_sync_setup(void)
 {
-	int ret;
-	char thread_name[RTE_MAX_THREAD_NAME_LEN];
-
-	if (vfio_mp_sync_socket_setup() < 0) {
-		RTE_LOG(ERR, EAL, "Failed to set up local socket!\n");
-		return -1;
-	}
-
-	ret = pthread_create(&socket_thread, NULL,
-			vfio_mp_sync_thread, NULL);
-	if (ret) {
-		RTE_LOG(ERR, EAL,
-			"Failed to create thread for communication with secondary processes!\n");
-		close(mp_socket_fd);
-		return -1;
-	}
-
-	/* Set thread_name for aid in debugging. */
-	snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "vfio-sync");
-	ret = rte_thread_setname(socket_thread, thread_name);
-	if (ret)
-		RTE_LOG(DEBUG, EAL,
-			"Failed to set thread name for secondary processes!\n");
+	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+		return rte_mp_action_register(EAL_VFIO_MP, vfio_mp_primary);
 
 	return 0;
 }
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 88+ messages in thread

* Re: [PATCH v5] vfio: change to use generic multi-process channel
  2018-03-19  6:53         ` Tan, Jianfeng
@ 2018-03-20 10:33           ` Burakov, Anatoly
  2018-03-20 10:56             ` Burakov, Anatoly
  0 siblings, 1 reply; 88+ messages in thread
From: Burakov, Anatoly @ 2018-03-20 10:33 UTC (permalink / raw)
  To: Tan, Jianfeng, dev; +Cc: Richardson, Bruce, Ananyev, Konstantin, thomas

On 19-Mar-18 6:53 AM, Tan, Jianfeng wrote:
> Hi Anatoly,
> 
> Thank you for the review. All your comments will be addressed in next version, except for below concern which might be taken care of in another patch if it also concerns you.
> 
>> -----Original Message-----
>> From: Burakov, Anatoly
>> Sent: Wednesday, March 14, 2018 9:27 PM
>> To: Tan, Jianfeng; dev@dpdk.org
>> Cc: Richardson, Bruce; Ananyev, Konstantin; thomas@monjalon.net
>> Subject: Re: [PATCH v5] vfio: change to use generic multi-process channel
> [...]
>>
>>> +	mp_req.len_param = sizeof(*p);
>>> +	mp_req.num_fds = 0;
>>> +
>>> +	vfio_group_fd = -1;
>>> +	if (rte_mp_request(&mp_req, &mp_reply, &ts) == 0 &&
>>> +	    mp_reply.nb_received == 1) {
>>> +		mp_rep = &mp_reply.msgs[0];
>>> +		p = (struct vfio_mp_param *)mp_rep->param;
>>> +		if (p->result == SOCKET_OK && mp_rep->num_fds == 1) {
>>> +			cur_grp->group_no = iommu_group_no;
>>> +			vfio_group_fd = mp_rep->fds[0];
>>> +			cur_grp->fd = vfio_group_fd;
>>> +			vfio_cfg.vfio_active_groups++;
>>>    		}
>>> +		free(mp_reply.msgs);
>>>    	}
>>> -	return -1;
>>> +
>>> +	if (vfio_group_fd < 0)
>>> +		RTE_LOG(ERR, EAL, "  cannot request group fd\n");
>>> +	return vfio_group_fd;
>>
>> p->result can be SOCKET_NO_FD, in which case returned value should be
>> zero. I think this is missing from this code. There probably should be
>> an "else if (p->result == SOCKET_NO_FD)" clause that sets return value to 0.
>>
>> You should be able to test this by trying to set up a device for VFIO
>> that isn't bound to VFIO driver, in a secondary process.
> 
> OK, I will fix this.
> 
> But really, "zero" could be ambiguous as a fd could, theoretically, be zero too.

You're correct. Maybe return 0/-1 in case of success/failure and put fd 
into a pointer? i.e.

int func(int *vfio_group_fd) {
<...>
*vfio_group_fd = fd;
return 0;
}

> 
> Thanks,
> Jianfeng
> 


-- 
Thanks,
Anatoly

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [PATCH v5] vfio: change to use generic multi-process channel
  2018-03-20 10:33           ` Burakov, Anatoly
@ 2018-03-20 10:56             ` Burakov, Anatoly
  0 siblings, 0 replies; 88+ messages in thread
From: Burakov, Anatoly @ 2018-03-20 10:56 UTC (permalink / raw)
  To: Tan, Jianfeng, dev; +Cc: Richardson, Bruce, Ananyev, Konstantin, thomas

On 20-Mar-18 10:33 AM, Burakov, Anatoly wrote:
> On 19-Mar-18 6:53 AM, Tan, Jianfeng wrote:
>> Hi Anatoly,
>>
>> Thank you for the review. All your comments will be addressed in next 
>> version, except for below concern which might be taken care of in 
>> another patch if it also concerns you.
>>
>>> -----Original Message-----
>>> From: Burakov, Anatoly
>>> Sent: Wednesday, March 14, 2018 9:27 PM
>>> To: Tan, Jianfeng; dev@dpdk.org
>>> Cc: Richardson, Bruce; Ananyev, Konstantin; thomas@monjalon.net
>>> Subject: Re: [PATCH v5] vfio: change to use generic multi-process 
>>> channel
>> [...]
>>>
>>>> +    mp_req.len_param = sizeof(*p);
>>>> +    mp_req.num_fds = 0;
>>>> +
>>>> +    vfio_group_fd = -1;
>>>> +    if (rte_mp_request(&mp_req, &mp_reply, &ts) == 0 &&
>>>> +        mp_reply.nb_received == 1) {
>>>> +        mp_rep = &mp_reply.msgs[0];
>>>> +        p = (struct vfio_mp_param *)mp_rep->param;
>>>> +        if (p->result == SOCKET_OK && mp_rep->num_fds == 1) {
>>>> +            cur_grp->group_no = iommu_group_no;
>>>> +            vfio_group_fd = mp_rep->fds[0];
>>>> +            cur_grp->fd = vfio_group_fd;
>>>> +            vfio_cfg.vfio_active_groups++;
>>>>            }
>>>> +        free(mp_reply.msgs);
>>>>        }
>>>> -    return -1;
>>>> +
>>>> +    if (vfio_group_fd < 0)
>>>> +        RTE_LOG(ERR, EAL, "  cannot request group fd\n");
>>>> +    return vfio_group_fd;
>>>
>>> p->result can be SOCKET_NO_FD, in which case returned value should be
>>> zero. I think this is missing from this code. There probably should be
>>> an "else if (p->result == SOCKET_NO_FD)" clause that sets return 
>>> value to 0.
>>>
>>> You should be able to test this by trying to set up a device for VFIO
>>> that isn't bound to VFIO driver, in a secondary process.
>>
>> OK, I will fix this.
>>
>> But really, "zero" could be ambiguous as a fd could, theoretically, be 
>> zero too.
> 
> You're correct. Maybe return 0/-1 in case of success/failure and put fd 
> into a pointer? i.e.
> 
> int func(int *vfio_group_fd) {
> <...>
> *vfio_group_fd = fd;
> return 0;
> }

Or rather return 1/0/-1 depending on whether we got SOCKET_OK, 
SOCKET_NO_FD or SOCKET_ERR.
> 
>>
>> Thanks,
>> Jianfeng
>>
> 
> 


-- 
Thanks,
Anatoly

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [PATCH v6] vfio: change to use generic multi-process channel
  2018-03-20  8:50     ` [PATCH v6] " Jianfeng Tan
@ 2018-04-05 14:26       ` Tan, Jianfeng
  2018-04-05 14:39         ` Burakov, Anatoly
  2018-04-12 23:27         ` Thomas Monjalon
  2018-04-12 15:26       ` Burakov, Anatoly
  1 sibling, 2 replies; 88+ messages in thread
From: Tan, Jianfeng @ 2018-04-05 14:26 UTC (permalink / raw)
  To: anatoly.burakov; +Cc: Thomas Monjalon, dev

Hi Anatoly,

An obvious action would be to change rte_mp_request to 
rte_mp_request_sync(). Before sending out the new patch, do you have any 
other comments for this patch?

Hi Thomas,

Several patches will change vfio; may I know your preferred apply 
sequence? (I'm trying to find out which patch I should rebase on; of 
course, I can wait until the other patches are applied.)

- http://dpdk.org/dev/patchwork/patch/37258/
- http://dpdk.org/dev/patchwork/patch/37152/
- http://dpdk.org/dev/patchwork/patch/37082/
- http://dpdk.org/dev/patchwork/patch/37047/

Thanks,
Jianfeng

On 3/20/2018 4:50 PM, Jianfeng Tan wrote:
> Previously, vfio uses its own private channel for the secondary
> process to get container fd and group fd from the primary process.
>
> This patch changes to use the generic mp channel.
>
> Test:
>    1. Bind two NICs to vfio-pci.
>
>    2. Start the primary and secondary process.
>      $ (symmetric_mp) -c 2 -- -p 3 --num-procs=2 --proc-id=0
>      $ (symmetric_mp) -c 4 --proc-type=auto -- -p 3 \
> 				--num-procs=2 --proc-id=1
>
> Cc: anatoly.burakov@intel.com
>
> Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
> ---
> v5->v6: (Address comments from Anatoly)
>    - Naming, return checking, logging.
>    - Move vfio action register after rte_bus_probe().
>   lib/librte_eal/linuxapp/eal/eal.c              |  22 +-
>   lib/librte_eal/linuxapp/eal/eal_vfio.c         | 176 +++++------
>   lib/librte_eal/linuxapp/eal/eal_vfio.h         |  17 +-
>   lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c | 408 ++++---------------------
>   4 files changed, 145 insertions(+), 478 deletions(-)
>
> diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
> index 38306bf..fb41e97 100644
> --- a/lib/librte_eal/linuxapp/eal/eal.c
> +++ b/lib/librte_eal/linuxapp/eal/eal.c
> @@ -689,24 +689,8 @@ rte_eal_iopl_init(void)
>   #ifdef VFIO_PRESENT
>   static int rte_eal_vfio_setup(void)
>   {
> -	int vfio_enabled = 0;
> -
>   	if (rte_vfio_enable("vfio"))
>   		return -1;
> -	vfio_enabled = rte_vfio_is_enabled("vfio");
> -
> -	if (vfio_enabled) {
> -
> -		/* if we are primary process, create a thread to communicate with
> -		 * secondary processes. the thread will use a socket to wait for
> -		 * requests from secondary process to send open file descriptors,
> -		 * because VFIO does not allow multiple open descriptors on a group or
> -		 * VFIO container.
> -		 */
> -		if (internal_config.process_type == RTE_PROC_PRIMARY &&
> -				vfio_mp_sync_setup() < 0)
> -			return -1;
> -	}
>   
>   	return 0;
>   }
> @@ -950,6 +934,12 @@ rte_eal_init(int argc, char **argv)
>   		return -1;
>   	}
>   
> +#ifdef VFIO_PRESENT
> +	/* Register mp action after probe() so that we got enough info */
> +	if (rte_vfio_is_enabled("vfio") && vfio_mp_sync_setup() < 0)
> +		return -1;
> +#endif
> +
>   	/* initialize default service/lcore mappings and start running. Ignore
>   	 * -ENOTSUP, as it indicates no service coremask passed to EAL.
>   	 */
> diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c b/lib/librte_eal/linuxapp/eal/eal_vfio.c
> index e44ae4d..9b97e5b 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_vfio.c
> +++ b/lib/librte_eal/linuxapp/eal/eal_vfio.c
> @@ -1,5 +1,5 @@
>   /* SPDX-License-Identifier: BSD-3-Clause
> - * Copyright(c) 2010-2014 Intel Corporation
> + * Copyright(c) 2010-2018 Intel Corporation
>    */
>   
>   #include <string.h>
> @@ -42,6 +42,10 @@ vfio_get_group_fd(int iommu_group_no)
>   	int vfio_group_fd;
>   	char filename[PATH_MAX];
>   	struct vfio_group *cur_grp;
> +	struct rte_mp_msg mp_req, *mp_rep;
> +	struct rte_mp_reply mp_reply;
> +	struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
> +	struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
>   
>   	/* check if we already have the group descriptor open */
>   	for (i = 0; i < VFIO_MAX_GROUPS; i++)
> @@ -101,50 +105,34 @@ vfio_get_group_fd(int iommu_group_no)
>   		return vfio_group_fd;
>   	}
>   	/* if we're in a secondary process, request group fd from the primary
> -	 * process via our socket
> +	 * process via mp channel
>   	 */
> -	else {
> -		int socket_fd, ret;
> -
> -		socket_fd = vfio_mp_sync_connect_to_primary();
> -
> -		if (socket_fd < 0) {
> -			RTE_LOG(ERR, EAL, "  cannot connect to primary process!\n");
> -			return -1;
> -		}
> -		if (vfio_mp_sync_send_request(socket_fd, SOCKET_REQ_GROUP) < 0) {
> -			RTE_LOG(ERR, EAL, "  cannot request container fd!\n");
> -			close(socket_fd);
> -			return -1;
> -		}
> -		if (vfio_mp_sync_send_request(socket_fd, iommu_group_no) < 0) {
> -			RTE_LOG(ERR, EAL, "  cannot send group number!\n");
> -			close(socket_fd);
> -			return -1;
> -		}
> -		ret = vfio_mp_sync_receive_request(socket_fd);
> -		switch (ret) {
> -		case SOCKET_NO_FD:
> -			close(socket_fd);
> -			return 0;
> -		case SOCKET_OK:
> -			vfio_group_fd = vfio_mp_sync_receive_fd(socket_fd);
> -			/* if we got the fd, store it and return it */
> -			if (vfio_group_fd > 0) {
> -				close(socket_fd);
> -				cur_grp->group_no = iommu_group_no;
> -				cur_grp->fd = vfio_group_fd;
> -				vfio_cfg.vfio_active_groups++;
> -				return vfio_group_fd;
> -			}
> -			/* fall-through on error */
> -		default:
> -			RTE_LOG(ERR, EAL, "  cannot get container fd!\n");
> -			close(socket_fd);
> -			return -1;
> +	p->req = SOCKET_REQ_GROUP;
> +	p->group_no = iommu_group_no;
> +	strcpy(mp_req.name, EAL_VFIO_MP);
> +	mp_req.len_param = sizeof(*p);
> +	mp_req.num_fds = 0;
> +
> +	vfio_group_fd = -1;
> +	if (rte_mp_request(&mp_req, &mp_reply, &ts) == 0 &&
> +	    mp_reply.nb_received == 1) {
> +		mp_rep = &mp_reply.msgs[0];
> +		p = (struct vfio_mp_param *)mp_rep->param;
> +		if (p->result == SOCKET_OK && mp_rep->num_fds == 1) {
> +			cur_grp->group_no = iommu_group_no;
> +			vfio_group_fd = mp_rep->fds[0];
> +			cur_grp->fd = vfio_group_fd;
> +			vfio_cfg.vfio_active_groups++;
> +		} else if (p->result == SOCKET_NO_FD) {
> +			RTE_LOG(ERR, EAL, "  bad VFIO group fd\n");
> +			vfio_group_fd = 0;
>   		}
> +		free(mp_reply.msgs);
>   	}
> -	return -1;
> +
> +	if (vfio_group_fd < 0)
> +		RTE_LOG(ERR, EAL, "  cannot request group fd\n");
> +	return vfio_group_fd;
>   }
>   
>   
> @@ -200,7 +188,10 @@ int
>   rte_vfio_clear_group(int vfio_group_fd)
>   {
>   	int i;
> -	int socket_fd, ret;
> +	struct rte_mp_msg mp_req, *mp_rep;
> +	struct rte_mp_reply mp_reply;
> +	struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
> +	struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
>   
>   	if (internal_config.process_type == RTE_PROC_PRIMARY) {
>   
> @@ -214,43 +205,27 @@ rte_vfio_clear_group(int vfio_group_fd)
>   		return 0;
>   	}
>   
> -	/* This is just for SECONDARY processes */
> -	socket_fd = vfio_mp_sync_connect_to_primary();
> -
> -	if (socket_fd < 0) {
> -		RTE_LOG(ERR, EAL, "  cannot connect to primary process!\n");
> -		return -1;
> -	}
> +	p->req = SOCKET_CLR_GROUP;
> +	p->group_no = vfio_group_fd;
> +	strcpy(mp_req.name, EAL_VFIO_MP);
> +	mp_req.len_param = sizeof(*p);
> +	mp_req.num_fds = 0;
>   
> -	if (vfio_mp_sync_send_request(socket_fd, SOCKET_CLR_GROUP) < 0) {
> -		RTE_LOG(ERR, EAL, "  cannot request container fd!\n");
> -		close(socket_fd);
> -		return -1;
> -	}
> +	if (rte_mp_request(&mp_req, &mp_reply, &ts) == 0 &&
> +	    mp_reply.nb_received == 1) {
> +		mp_rep = &mp_reply.msgs[0];
> +		p = (struct vfio_mp_param *)mp_rep->param;
> +		if (p->result == SOCKET_OK) {
> +			free(mp_reply.msgs);
> +			return 0;
> +		} else if (p->result == SOCKET_NO_FD)
> +			RTE_LOG(ERR, EAL, "  BAD VFIO group fd!\n");
> +		else
> +			RTE_LOG(ERR, EAL, "  no such VFIO group fd!\n");
>   
> -	if (vfio_mp_sync_send_request(socket_fd, vfio_group_fd) < 0) {
> -		RTE_LOG(ERR, EAL, "  cannot send group fd!\n");
> -		close(socket_fd);
> -		return -1;
> +		free(mp_reply.msgs);
>   	}
>   
> -	ret = vfio_mp_sync_receive_request(socket_fd);
> -	switch (ret) {
> -	case SOCKET_NO_FD:
> -		RTE_LOG(ERR, EAL, "  BAD VFIO group fd!\n");
> -		close(socket_fd);
> -		break;
> -	case SOCKET_OK:
> -		close(socket_fd);
> -		return 0;
> -	case SOCKET_ERR:
> -		RTE_LOG(ERR, EAL, "  Socket error\n");
> -		close(socket_fd);
> -		break;
> -	default:
> -		RTE_LOG(ERR, EAL, "  UNKNOWN reply, %d\n", ret);
> -		close(socket_fd);
> -	}
>   	return -1;
>   }
>   
> @@ -561,6 +536,11 @@ int
>   vfio_get_container_fd(void)
>   {
>   	int ret, vfio_container_fd;
> +	struct rte_mp_msg mp_req, *mp_rep;
> +	struct rte_mp_reply mp_reply;
> +	struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
> +	struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
> +
>   
>   	/* if we're in a primary process, try to open the container */
>   	if (internal_config.process_type == RTE_PROC_PRIMARY) {
> @@ -591,33 +571,29 @@ vfio_get_container_fd(void)
>   		}
>   
>   		return vfio_container_fd;
> -	} else {
> -		/*
> -		 * if we're in a secondary process, request container fd from the
> -		 * primary process via our socket
> -		 */
> -		int socket_fd;
> -
> -		socket_fd = vfio_mp_sync_connect_to_primary();
> -		if (socket_fd < 0) {
> -			RTE_LOG(ERR, EAL, "  cannot connect to primary process!\n");
> -			return -1;
> -		}
> -		if (vfio_mp_sync_send_request(socket_fd, SOCKET_REQ_CONTAINER) < 0) {
> -			RTE_LOG(ERR, EAL, "  cannot request container fd!\n");
> -			close(socket_fd);
> -			return -1;
> -		}
> -		vfio_container_fd = vfio_mp_sync_receive_fd(socket_fd);
> -		if (vfio_container_fd < 0) {
> -			RTE_LOG(ERR, EAL, "  cannot get container fd!\n");
> -			close(socket_fd);
> -			return -1;
> +	}
> +	/*
> +	 * if we're in a secondary process, request container fd from the
> +	 * primary process via mp channel
> +	 */
> +	p->req = SOCKET_REQ_CONTAINER;
> +	strcpy(mp_req.name, EAL_VFIO_MP);
> +	mp_req.len_param = sizeof(*p);
> +	mp_req.num_fds = 0;
> +
> +	vfio_container_fd = -1;
> +	if (rte_mp_request(&mp_req, &mp_reply, &ts) == 0 &&
> +	    mp_reply.nb_received == 1) {
> +		mp_rep = &mp_reply.msgs[0];
> +		p = (struct vfio_mp_param *)mp_rep->param;
> +		if (p->result == SOCKET_OK && mp_rep->num_fds == 1) {
> +			free(mp_reply.msgs);
> +			return mp_rep->fds[0];
>   		}
> -		close(socket_fd);
> -		return vfio_container_fd;
> +		free(mp_reply.msgs);
>   	}
>   
> +	RTE_LOG(ERR, EAL, "  cannot request container fd\n");
>   	return -1;
>   }
>   
> diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.h b/lib/librte_eal/linuxapp/eal/eal_vfio.h
> index 8059577..be2a79b 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_vfio.h
> +++ b/lib/librte_eal/linuxapp/eal/eal_vfio.h
> @@ -88,15 +88,6 @@ struct vfio_iommu_spapr_tce_info {
>   #define VFIO_MAX_GROUPS RTE_MAX_VFIO_GROUPS
>   
>   /*
> - * Function prototypes for VFIO multiprocess sync functions
> - */
> -int vfio_mp_sync_send_request(int socket, int req);
> -int vfio_mp_sync_receive_request(int socket);
> -int vfio_mp_sync_send_fd(int socket, int fd);
> -int vfio_mp_sync_receive_fd(int socket);
> -int vfio_mp_sync_connect_to_primary(void);
> -
> -/*
>    * we don't need to store device fd's anywhere since they can be obtained from
>    * the group fd via an ioctl() call.
>    */
> @@ -150,6 +141,8 @@ vfio_get_group_fd(int iommu_group_no);
>   
>   int vfio_mp_sync_setup(void);
>   
> +#define EAL_VFIO_MP "eal_vfio_mp_sync"
> +
>   #define SOCKET_REQ_CONTAINER 0x100
>   #define SOCKET_REQ_GROUP 0x200
>   #define SOCKET_CLR_GROUP 0x300
> @@ -157,6 +150,12 @@ int vfio_mp_sync_setup(void);
>   #define SOCKET_NO_FD 0x1
>   #define SOCKET_ERR 0xFF
>   
> +struct vfio_mp_param {
> +	int req;
> +	int result;
> +	int group_no;
> +};
> +
>   #endif /* VFIO_PRESENT */
>   
>   #endif /* EAL_VFIO_H_ */
> diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c b/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
> index 7cc3c15..afa556f 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
> +++ b/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
> @@ -1,32 +1,15 @@
>   /* SPDX-License-Identifier: BSD-3-Clause
> - * Copyright(c) 2010-2014 Intel Corporation
> + * Copyright(c) 2010-2018 Intel Corporation
>    */
>   
> +#include <unistd.h>
>   #include <string.h>
> -#include <fcntl.h>
> -#include <sys/socket.h>
> -#include <pthread.h>
> -
> -/* sys/un.h with __USE_MISC uses strlen, which is unsafe */
> -#ifdef __USE_MISC
> -#define REMOVED_USE_MISC
> -#undef __USE_MISC
> -#endif
> -#include <sys/un.h>
> -/* make sure we redefine __USE_MISC only if it was previously undefined */
> -#ifdef REMOVED_USE_MISC
> -#define __USE_MISC
> -#undef REMOVED_USE_MISC
> -#endif
>   
>   #include <rte_log.h>
> -#include <rte_eal_memconfig.h>
> -#include <rte_malloc.h>
>   #include <rte_vfio.h>
> +#include <rte_eal.h>
>   
> -#include "eal_filesystem.h"
>   #include "eal_vfio.h"
> -#include "eal_thread.h"
>   
>   /**
>    * @file
> @@ -37,358 +20,77 @@
>   
>   #ifdef VFIO_PRESENT
>   
> -#define SOCKET_PATH_FMT "%s/.%s_mp_socket"
> -#define CMSGLEN (CMSG_LEN(sizeof(int)))
> -#define FD_TO_CMSGHDR(fd, chdr) \
> -		do {\
> -			(chdr).cmsg_len = CMSGLEN;\
> -			(chdr).cmsg_level = SOL_SOCKET;\
> -			(chdr).cmsg_type = SCM_RIGHTS;\
> -			memcpy((chdr).__cmsg_data, &(fd), sizeof(fd));\
> -		} while (0)
> -#define CMSGHDR_TO_FD(chdr, fd) \
> -			memcpy(&(fd), (chdr).__cmsg_data, sizeof(fd))
> -
> -static pthread_t socket_thread;
> -static int mp_socket_fd;
> -
> -
> -/* get socket path (/var/run if root, $HOME otherwise) */
> -static void
> -get_socket_path(char *buffer, int bufsz)
> -{
> -	const char *dir = "/var/run";
> -	const char *home_dir = getenv("HOME");
> -
> -	if (getuid() != 0 && home_dir != NULL)
> -		dir = home_dir;
> -
> -	/* use current prefix as file path */
> -	snprintf(buffer, bufsz, SOCKET_PATH_FMT, dir,
> -			internal_config.hugefile_prefix);
> -}
> -
> -
> -
> -/*
> - * data flow for socket comm protocol:
> - * 1. client sends SOCKET_REQ_CONTAINER or SOCKET_REQ_GROUP
> - * 1a. in case of SOCKET_REQ_GROUP, client also then sends group number
> - * 2. server receives message
> - * 2a. in case of invalid group, SOCKET_ERR is sent back to client
> - * 2b. in case of unbound group, SOCKET_NO_FD is sent back to client
> - * 2c. in case of valid group, SOCKET_OK is sent and immediately followed by fd
> - *
> - * in case of any error, socket is closed.
> - */
> -
> -/* send a request, return -1 on error */
> -int
> -vfio_mp_sync_send_request(int socket, int req)
> -{
> -	struct msghdr hdr;
> -	struct iovec iov;
> -	int buf;
> -	int ret;
> -
> -	memset(&hdr, 0, sizeof(hdr));
> -
> -	buf = req;
> -
> -	hdr.msg_iov = &iov;
> -	hdr.msg_iovlen = 1;
> -	iov.iov_base = (char *) &buf;
> -	iov.iov_len = sizeof(buf);
> -
> -	ret = sendmsg(socket, &hdr, 0);
> -	if (ret < 0)
> -		return -1;
> -	return 0;
> -}
> -
> -/* receive a request and return it */
> -int
> -vfio_mp_sync_receive_request(int socket)
> -{
> -	int buf;
> -	struct msghdr hdr;
> -	struct iovec iov;
> -	int ret, req;
> -
> -	memset(&hdr, 0, sizeof(hdr));
> -
> -	buf = SOCKET_ERR;
> -
> -	hdr.msg_iov = &iov;
> -	hdr.msg_iovlen = 1;
> -	iov.iov_base = (char *) &buf;
> -	iov.iov_len = sizeof(buf);
> -
> -	ret = recvmsg(socket, &hdr, 0);
> -	if (ret < 0)
> -		return -1;
> -
> -	req = buf;
> -
> -	return req;
> -}
> -
> -/* send OK in message, fd in control message */
> -int
> -vfio_mp_sync_send_fd(int socket, int fd)
> +static int
> +vfio_mp_primary(const struct rte_mp_msg *msg, const void *peer)
>   {
> -	int buf;
> -	struct msghdr hdr;
> -	struct cmsghdr *chdr;
> -	char chdr_buf[CMSGLEN];
> -	struct iovec iov;
> +	int fd = -1;
>   	int ret;
> +	struct rte_mp_msg reply;
> +	struct vfio_mp_param *r = (struct vfio_mp_param *)reply.param;
> +	const struct vfio_mp_param *m = (const struct vfio_mp_param *)msg->param;
>   
> -	chdr = (struct cmsghdr *) chdr_buf;
> -	memset(chdr, 0, sizeof(chdr_buf));
> -	memset(&hdr, 0, sizeof(hdr));
> -
> -	hdr.msg_iov = &iov;
> -	hdr.msg_iovlen = 1;
> -	iov.iov_base = (char *) &buf;
> -	iov.iov_len = sizeof(buf);
> -	hdr.msg_control = chdr;
> -	hdr.msg_controllen = CMSGLEN;
> -
> -	buf = SOCKET_OK;
> -	FD_TO_CMSGHDR(fd, *chdr);
> -
> -	ret = sendmsg(socket, &hdr, 0);
> -	if (ret < 0)
> -		return -1;
> -	return 0;
> -}
> -
> -/* receive OK in message, fd in control message */
> -int
> -vfio_mp_sync_receive_fd(int socket)
> -{
> -	int buf;
> -	struct msghdr hdr;
> -	struct cmsghdr *chdr;
> -	char chdr_buf[CMSGLEN];
> -	struct iovec iov;
> -	int ret, req, fd;
> -
> -	buf = SOCKET_ERR;
> -
> -	chdr = (struct cmsghdr *) chdr_buf;
> -	memset(chdr, 0, sizeof(chdr_buf));
> -	memset(&hdr, 0, sizeof(hdr));
> -
> -	hdr.msg_iov = &iov;
> -	hdr.msg_iovlen = 1;
> -	iov.iov_base = (char *) &buf;
> -	iov.iov_len = sizeof(buf);
> -	hdr.msg_control = chdr;
> -	hdr.msg_controllen = CMSGLEN;
> -
> -	ret = recvmsg(socket, &hdr, 0);
> -	if (ret < 0)
> -		return -1;
> -
> -	req = buf;
> -
> -	if (req != SOCKET_OK)
> -		return -1;
> -
> -	CMSGHDR_TO_FD(*chdr, fd);
> -
> -	return fd;
> -}
> -
> -/* connect socket_fd in secondary process to the primary process's socket */
> -int
> -vfio_mp_sync_connect_to_primary(void)
> -{
> -	struct sockaddr_un addr;
> -	socklen_t sockaddr_len;
> -	int socket_fd;
> -
> -	/* set up a socket */
> -	socket_fd = socket(AF_UNIX, SOCK_SEQPACKET, 0);
> -	if (socket_fd < 0) {
> -		RTE_LOG(ERR, EAL, "Failed to create socket!\n");
> +	if (msg->len_param != sizeof(*m)) {
> +		RTE_LOG(ERR, EAL, "vfio received invalid message!\n");
>   		return -1;
>   	}
>   
> -	get_socket_path(addr.sun_path, sizeof(addr.sun_path));
> -	addr.sun_family = AF_UNIX;
> -
> -	sockaddr_len = sizeof(struct sockaddr_un);
> -
> -	if (connect(socket_fd, (struct sockaddr *) &addr, sockaddr_len) == 0)
> -		return socket_fd;
> -
> -	/* if connect failed */
> -	close(socket_fd);
> -	return -1;
> -}
> -
> +	memset(&reply, 0, sizeof(reply));
>   
> -
> -/*
> - * socket listening thread for primary process
> - */
> -static __attribute__((noreturn)) void *
> -vfio_mp_sync_thread(void __rte_unused * arg)
> -{
> -	int ret, fd, vfio_data;
> -
> -	/* wait for requests on the socket */
> -	for (;;) {
> -		int conn_sock;
> -		struct sockaddr_un addr;
> -		socklen_t sockaddr_len = sizeof(addr);
> -
> -		/* this is a blocking call */
> -		conn_sock = accept(mp_socket_fd, (struct sockaddr *) &addr,
> -				&sockaddr_len);
> -
> -		/* just restart on error */
> -		if (conn_sock == -1)
> -			continue;
> -
> -		/* set socket to linger after close */
> -		struct linger l;
> -		l.l_onoff = 1;
> -		l.l_linger = 60;
> -
> -		if (setsockopt(conn_sock, SOL_SOCKET, SO_LINGER, &l, sizeof(l)) < 0)
> -			RTE_LOG(WARNING, EAL, "Cannot set SO_LINGER option "
> -					"on listen socket (%s)\n", strerror(errno));
> -
> -		ret = vfio_mp_sync_receive_request(conn_sock);
> -
> -		switch (ret) {
> -		case SOCKET_REQ_CONTAINER:
> -			fd = vfio_get_container_fd();
> -			if (fd < 0)
> -				vfio_mp_sync_send_request(conn_sock, SOCKET_ERR);
> -			else
> -				vfio_mp_sync_send_fd(conn_sock, fd);
> -			if (fd >= 0)
> -				close(fd);
> -			break;
> -		case SOCKET_REQ_GROUP:
> -			/* wait for group number */
> -			vfio_data = vfio_mp_sync_receive_request(conn_sock);
> -			if (vfio_data < 0) {
> -				close(conn_sock);
> -				continue;
> -			}
> -
> -			fd = vfio_get_group_fd(vfio_data);
> -
> -			if (fd < 0)
> -				vfio_mp_sync_send_request(conn_sock, SOCKET_ERR);
> +	switch (m->req) {
> +	case SOCKET_REQ_GROUP:
> +		r->req = SOCKET_REQ_GROUP;
> +		r->group_no = m->group_no;
> +		fd = vfio_get_group_fd(m->group_no);
> +		if (fd < 0)
> +			r->result = SOCKET_ERR;
> +		else if (fd == 0)
>   			/* if VFIO group exists but isn't bound to VFIO driver */
> -			else if (fd == 0)
> -				vfio_mp_sync_send_request(conn_sock, SOCKET_NO_FD);
> +			r->result = SOCKET_NO_FD;
> +		else {
>   			/* if group exists and is bound to VFIO driver */
> -			else {
> -				vfio_mp_sync_send_request(conn_sock, SOCKET_OK);
> -				vfio_mp_sync_send_fd(conn_sock, fd);
> -			}
> -			break;
> -		case SOCKET_CLR_GROUP:
> -			/* wait for group fd */
> -			vfio_data = vfio_mp_sync_receive_request(conn_sock);
> -			if (vfio_data < 0) {
> -				close(conn_sock);
> -				continue;
> -			}
> -
> -			ret = rte_vfio_clear_group(vfio_data);
> -
> -			if (ret < 0)
> -				vfio_mp_sync_send_request(conn_sock, SOCKET_NO_FD);
> -			else
> -				vfio_mp_sync_send_request(conn_sock, SOCKET_OK);
> -			break;
> -		default:
> -			vfio_mp_sync_send_request(conn_sock, SOCKET_ERR);
> -			break;
> +			r->result = SOCKET_OK;
> +			reply.num_fds = 1;
> +			reply.fds[0] = fd;
>   		}
> -		close(conn_sock);
> -	}
> -}
> -
> -static int
> -vfio_mp_sync_socket_setup(void)
> -{
> -	int ret, socket_fd;
> -	struct sockaddr_un addr;
> -	socklen_t sockaddr_len;
> -
> -	/* set up a socket */
> -	socket_fd = socket(AF_UNIX, SOCK_SEQPACKET, 0);
> -	if (socket_fd < 0) {
> -		RTE_LOG(ERR, EAL, "Failed to create socket!\n");
> -		return -1;
> -	}
> -
> -	get_socket_path(addr.sun_path, sizeof(addr.sun_path));
> -	addr.sun_family = AF_UNIX;
> -
> -	sockaddr_len = sizeof(struct sockaddr_un);
> -
> -	unlink(addr.sun_path);
> -
> -	ret = bind(socket_fd, (struct sockaddr *) &addr, sockaddr_len);
> -	if (ret) {
> -		RTE_LOG(ERR, EAL, "Failed to bind socket: %s!\n", strerror(errno));
> -		close(socket_fd);
> -		return -1;
> -	}
> -
> -	ret = listen(socket_fd, 50);
> -	if (ret) {
> -		RTE_LOG(ERR, EAL, "Failed to listen: %s!\n", strerror(errno));
> -		close(socket_fd);
> +		break;
> +	case SOCKET_CLR_GROUP:
> +		r->req = SOCKET_CLR_GROUP;
> +		r->group_no = m->group_no;
> +		if (rte_vfio_clear_group(m->group_no) < 0)
> +			r->result = SOCKET_NO_FD;
> +		else
> +			r->result = SOCKET_OK;
> +		break;
> +	case SOCKET_REQ_CONTAINER:
> +		r->req = SOCKET_REQ_CONTAINER;
> +		fd = vfio_get_container_fd();
> +		if (fd < 0)
> +			r->result = SOCKET_ERR;
> +		else {
> +			r->result = SOCKET_OK;
> +			reply.num_fds = 1;
> +			reply.fds[0] = fd;
> +		}
> +		break;
> +	default:
> +		RTE_LOG(ERR, EAL, "vfio received invalid message!\n");
>   		return -1;
>   	}
>   
> -	/* save the socket in local configuration */
> -	mp_socket_fd = socket_fd;
> +	strcpy(reply.name, EAL_VFIO_MP);
> +	reply.len_param = sizeof(*r);
>   
> -	return 0;
> +	ret = rte_mp_reply(&reply, peer);
> +	if (m->req == SOCKET_REQ_CONTAINER && fd >= 0)
> +		close(fd);
> +	return ret;
>   }
>   
> -/*
> - * set up a local socket and tell it to listen for incoming connections
> - */
>   int
>   vfio_mp_sync_setup(void)
>   {
> -	int ret;
> -	char thread_name[RTE_MAX_THREAD_NAME_LEN];
> -
> -	if (vfio_mp_sync_socket_setup() < 0) {
> -		RTE_LOG(ERR, EAL, "Failed to set up local socket!\n");
> -		return -1;
> -	}
> -
> -	ret = pthread_create(&socket_thread, NULL,
> -			vfio_mp_sync_thread, NULL);
> -	if (ret) {
> -		RTE_LOG(ERR, EAL,
> -			"Failed to create thread for communication with secondary processes!\n");
> -		close(mp_socket_fd);
> -		return -1;
> -	}
> -
> -	/* Set thread_name for aid in debugging. */
> -	snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "vfio-sync");
> -	ret = rte_thread_setname(socket_thread, thread_name);
> -	if (ret)
> -		RTE_LOG(DEBUG, EAL,
> -			"Failed to set thread name for secondary processes!\n");
> +	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
> +		return rte_mp_action_register(EAL_VFIO_MP, vfio_mp_primary);
>   
>   	return 0;
>   }

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [PATCH v6] vfio: change to use generic multi-process channel
  2018-04-05 14:26       ` Tan, Jianfeng
@ 2018-04-05 14:39         ` Burakov, Anatoly
  2018-04-12 23:27         ` Thomas Monjalon
  1 sibling, 0 replies; 88+ messages in thread
From: Burakov, Anatoly @ 2018-04-05 14:39 UTC (permalink / raw)
  To: Tan, Jianfeng; +Cc: Thomas Monjalon, dev

On 05-Apr-18 3:26 PM, Tan, Jianfeng wrote:
> Hi Anatoly,
> 
> An obvious action would be change rte_mp_request to 
> rte_mp_request_sync(). Before sending out the new patch, do you have any 
> other comments for this patch?

Hi Jianfeng,

I don't think I do, but I'll have another look at it just in case, when 
I get time.

> 
> Hi Thomas,
> 
> Several patches will change vfio; may I know the your preferred apply 
> sequence? (I'm trying to find out which patch shall rebase on; of 
> course, I can wait until other patches are applied)
> 
> - http://dpdk.org/dev/patchwork/patch/37258/
> - http://dpdk.org/dev/patchwork/patch/37152/
> - http://dpdk.org/dev/patchwork/patch/37082/
> - http://dpdk.org/dev/patchwork/patch/37047/
> 
> Thanks,
> Jianfeng
> 

-- 
Thanks,
Anatoly

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [PATCH v6] vfio: change to use generic multi-process channel
  2018-03-20  8:50     ` [PATCH v6] " Jianfeng Tan
  2018-04-05 14:26       ` Tan, Jianfeng
@ 2018-04-12 15:26       ` Burakov, Anatoly
  1 sibling, 0 replies; 88+ messages in thread
From: Burakov, Anatoly @ 2018-04-12 15:26 UTC (permalink / raw)
  To: Jianfeng Tan, dev; +Cc: bruce.richardson, konstantin.ananyev, thomas

On 20-Mar-18 8:50 AM, Jianfeng Tan wrote:
> Previously, vfio uses its own private channel for the secondary
> process to get container fd and group fd from the primary process.
> 
> This patch changes to use the generic mp channel.
> 
> Test:
>    1. Bind two NICs to vfio-pci.
> 
>    2. Start the primary and secondary process.
>      $ (symmetric_mp) -c 2 -- -p 3 --num-procs=2 --proc-id=0
>      $ (symmetric_mp) -c 4 --proc-type=auto -- -p 3 \
> 				--num-procs=2 --proc-id=1
> 
> Cc: anatoly.burakov@intel.com
> 
> Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
> ---
> v5->v6: (Address comments from Anatoly)
>    - Naming, return checking, logging.
>    - Move vfio action register after rte_bus_probe().

Acked-by: Anatoly Burakov <anatoly.burakov@intel.com>

-- 
Thanks,
Anatoly

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [PATCH v6] vfio: change to use generic multi-process channel
  2018-04-05 14:26       ` Tan, Jianfeng
  2018-04-05 14:39         ` Burakov, Anatoly
@ 2018-04-12 23:27         ` Thomas Monjalon
  1 sibling, 0 replies; 88+ messages in thread
From: Thomas Monjalon @ 2018-04-12 23:27 UTC (permalink / raw)
  To: Tan, Jianfeng; +Cc: dev, anatoly.burakov

05/04/2018 16:26, Tan, Jianfeng:
> Hi Anatoly,
> 
> An obvious action would be change rte_mp_request to 
> rte_mp_request_sync(). Before sending out the new patch, do you have any 
> other comments for this patch?
> 
> Hi Thomas,
> 
> Several patches will change vfio; may I know the your preferred apply 
> sequence? (I'm trying to find out which patch shall rebase on; of 
> course, I can wait until other patches are applied)
> 
> - http://dpdk.org/dev/patchwork/patch/37258/
> - http://dpdk.org/dev/patchwork/patch/37152/
> - http://dpdk.org/dev/patchwork/patch/37082/
> - http://dpdk.org/dev/patchwork/patch/37047/

All but the first one are applied now.
I guess you can rebase on master.

^ permalink raw reply	[flat|nested] 88+ messages in thread

* [PATCH v7] vfio: change to use generic multi-process channel
  2018-01-11  4:07   ` [PATCH v2 4/4] vfio: use the generic multi-process channel Jianfeng Tan
                       ` (2 preceding siblings ...)
  2018-03-20  8:50     ` [PATCH v6] " Jianfeng Tan
@ 2018-04-15 15:06     ` Jianfeng Tan
  2018-04-15 15:10       ` Tan, Jianfeng
  2018-04-17 23:04       ` Thomas Monjalon
  3 siblings, 2 replies; 88+ messages in thread
From: Jianfeng Tan @ 2018-04-15 15:06 UTC (permalink / raw)
  To: dev; +Cc: thomas, anatoly.burakov, Jianfeng Tan

Previously, vfio used its own private channel for the secondary
process to get container fd and group fd from the primary process.

This patch changes vfio to use the generic mp channel.

Test:
  1. Bind two NICs to vfio-pci.

  2. Start the primary and secondary process.
    $ (symmetric_mp) -c 2 -- -p 3 --num-procs=2 --proc-id=0
    $ (symmetric_mp) -c 4 --proc-type=auto -- -p 3 \
				--num-procs=2 --proc-id=1

Cc: anatoly.burakov@intel.com

Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
Acked-by: Anatoly Burakov <anatoly.burakov@intel.com>
---
 lib/librte_eal/linuxapp/eal/eal.c              |  22 +-
 lib/librte_eal/linuxapp/eal/eal_vfio.c         | 178 +++++------
 lib/librte_eal/linuxapp/eal/eal_vfio.h         |  17 +-
 lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c | 410 ++++---------------------
 4 files changed, 148 insertions(+), 479 deletions(-)

diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index 99c2242..21afa73 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -696,24 +696,8 @@ rte_eal_iopl_init(void)
 #ifdef VFIO_PRESENT
 static int rte_eal_vfio_setup(void)
 {
-	int vfio_enabled = 0;
-
 	if (rte_vfio_enable("vfio"))
 		return -1;
-	vfio_enabled = rte_vfio_is_enabled("vfio");
-
-	if (vfio_enabled) {
-
-		/* if we are primary process, create a thread to communicate with
-		 * secondary processes. the thread will use a socket to wait for
-		 * requests from secondary process to send open file descriptors,
-		 * because VFIO does not allow multiple open descriptors on a group or
-		 * VFIO container.
-		 */
-		if (internal_config.process_type == RTE_PROC_PRIMARY &&
-				vfio_mp_sync_setup() < 0)
-			return -1;
-	}
 
 	return 0;
 }
@@ -970,6 +954,12 @@ rte_eal_init(int argc, char **argv)
 		return -1;
 	}
 
+#ifdef VFIO_PRESENT
+	/* Register mp action after probe() so that we got enough info */
+	if (rte_vfio_is_enabled("vfio") && vfio_mp_sync_setup() < 0)
+		return -1;
+#endif
+
 	/* initialize default service/lcore mappings and start running. Ignore
 	 * -ENOTSUP, as it indicates no service coremask passed to EAL.
 	 */
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c b/lib/librte_eal/linuxapp/eal/eal_vfio.c
index 16ee730..957a537 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.c
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.c
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2014 Intel Corporation
+ * Copyright(c) 2010-2018 Intel Corporation
  */
 
 #include <inttypes.h>
@@ -290,6 +290,10 @@ rte_vfio_get_group_fd(int iommu_group_num)
 	int vfio_group_fd;
 	char filename[PATH_MAX];
 	struct vfio_group *cur_grp;
+	struct rte_mp_msg mp_req, *mp_rep;
+	struct rte_mp_reply mp_reply;
+	struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
+	struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
 
 	/* check if we already have the group descriptor open */
 	for (i = 0; i < VFIO_MAX_GROUPS; i++)
@@ -350,50 +354,34 @@ rte_vfio_get_group_fd(int iommu_group_num)
 		return vfio_group_fd;
 	}
 	/* if we're in a secondary process, request group fd from the primary
-	 * process via our socket
+	 * process via mp channel.
 	 */
-	else {
-		int socket_fd, ret;
-
-		socket_fd = vfio_mp_sync_connect_to_primary();
-
-		if (socket_fd < 0) {
-			RTE_LOG(ERR, EAL, "  cannot connect to primary process!\n");
-			return -1;
-		}
-		if (vfio_mp_sync_send_request(socket_fd, SOCKET_REQ_GROUP) < 0) {
-			RTE_LOG(ERR, EAL, "  cannot request container fd!\n");
-			close(socket_fd);
-			return -1;
-		}
-		if (vfio_mp_sync_send_request(socket_fd, iommu_group_num) < 0) {
-			RTE_LOG(ERR, EAL, "  cannot send group number!\n");
-			close(socket_fd);
-			return -1;
-		}
-		ret = vfio_mp_sync_receive_request(socket_fd);
-		switch (ret) {
-		case SOCKET_NO_FD:
-			close(socket_fd);
-			return 0;
-		case SOCKET_OK:
-			vfio_group_fd = vfio_mp_sync_receive_fd(socket_fd);
-			/* if we got the fd, store it and return it */
-			if (vfio_group_fd > 0) {
-				close(socket_fd);
-				cur_grp->group_num = iommu_group_num;
-				cur_grp->fd = vfio_group_fd;
-				vfio_cfg.vfio_active_groups++;
-				return vfio_group_fd;
-			}
-			/* fall-through on error */
-		default:
-			RTE_LOG(ERR, EAL, "  cannot get container fd!\n");
-			close(socket_fd);
-			return -1;
+	p->req = SOCKET_REQ_GROUP;
+	p->group_num = iommu_group_num;
+	strcpy(mp_req.name, EAL_VFIO_MP);
+	mp_req.len_param = sizeof(*p);
+	mp_req.num_fds = 0;
+
+	vfio_group_fd = -1;
+	if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 &&
+	    mp_reply.nb_received == 1) {
+		mp_rep = &mp_reply.msgs[0];
+		p = (struct vfio_mp_param *)mp_rep->param;
+		if (p->result == SOCKET_OK && mp_rep->num_fds == 1) {
+			cur_grp->group_num = iommu_group_num;
+			vfio_group_fd = mp_rep->fds[0];
+			cur_grp->fd = vfio_group_fd;
+			vfio_cfg.vfio_active_groups++;
+		} else if (p->result == SOCKET_NO_FD) {
+			RTE_LOG(ERR, EAL, "  bad VFIO group fd\n");
+			vfio_group_fd = 0;
 		}
+		free(mp_reply.msgs);
 	}
-	return -1;
+
+	if (vfio_group_fd < 0)
+		RTE_LOG(ERR, EAL, "  cannot request group fd\n");
+	return vfio_group_fd;
 }
 
 
@@ -481,7 +469,10 @@ int
 rte_vfio_clear_group(int vfio_group_fd)
 {
 	int i;
-	int socket_fd, ret;
+	struct rte_mp_msg mp_req, *mp_rep;
+	struct rte_mp_reply mp_reply;
+	struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
+	struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
 
 	if (internal_config.process_type == RTE_PROC_PRIMARY) {
 
@@ -495,43 +486,27 @@ rte_vfio_clear_group(int vfio_group_fd)
 		return 0;
 	}
 
-	/* This is just for SECONDARY processes */
-	socket_fd = vfio_mp_sync_connect_to_primary();
-
-	if (socket_fd < 0) {
-		RTE_LOG(ERR, EAL, "  cannot connect to primary process!\n");
-		return -1;
-	}
-
-	if (vfio_mp_sync_send_request(socket_fd, SOCKET_CLR_GROUP) < 0) {
-		RTE_LOG(ERR, EAL, "  cannot request container fd!\n");
-		close(socket_fd);
-		return -1;
-	}
+	p->req = SOCKET_CLR_GROUP;
+	p->group_num = vfio_group_fd;
+	strcpy(mp_req.name, EAL_VFIO_MP);
+	mp_req.len_param = sizeof(*p);
+	mp_req.num_fds = 0;
+
+	if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 &&
+	    mp_reply.nb_received == 1) {
+		mp_rep = &mp_reply.msgs[0];
+		p = (struct vfio_mp_param *)mp_rep->param;
+		if (p->result == SOCKET_OK) {
+			free(mp_reply.msgs);
+			return 0;
+		} else if (p->result == SOCKET_NO_FD)
+			RTE_LOG(ERR, EAL, "  BAD VFIO group fd!\n");
+		else
+			RTE_LOG(ERR, EAL, "  no such VFIO group fd!\n");
 
-	if (vfio_mp_sync_send_request(socket_fd, vfio_group_fd) < 0) {
-		RTE_LOG(ERR, EAL, "  cannot send group fd!\n");
-		close(socket_fd);
-		return -1;
+		free(mp_reply.msgs);
 	}
 
-	ret = vfio_mp_sync_receive_request(socket_fd);
-	switch (ret) {
-	case SOCKET_NO_FD:
-		RTE_LOG(ERR, EAL, "  BAD VFIO group fd!\n");
-		close(socket_fd);
-		break;
-	case SOCKET_OK:
-		close(socket_fd);
-		return 0;
-	case SOCKET_ERR:
-		RTE_LOG(ERR, EAL, "  Socket error\n");
-		close(socket_fd);
-		break;
-	default:
-		RTE_LOG(ERR, EAL, "  UNKNOWN reply, %d\n", ret);
-		close(socket_fd);
-	}
 	return -1;
 }
 
@@ -924,6 +899,11 @@ int
 rte_vfio_get_container_fd(void)
 {
 	int ret, vfio_container_fd;
+	struct rte_mp_msg mp_req, *mp_rep;
+	struct rte_mp_reply mp_reply;
+	struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
+	struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
+
 
 	/* if we're in a primary process, try to open the container */
 	if (internal_config.process_type == RTE_PROC_PRIMARY) {
@@ -954,33 +934,29 @@ rte_vfio_get_container_fd(void)
 		}
 
 		return vfio_container_fd;
-	} else {
-		/*
-		 * if we're in a secondary process, request container fd from the
-		 * primary process via our socket
-		 */
-		int socket_fd;
-
-		socket_fd = vfio_mp_sync_connect_to_primary();
-		if (socket_fd < 0) {
-			RTE_LOG(ERR, EAL, "  cannot connect to primary process!\n");
-			return -1;
-		}
-		if (vfio_mp_sync_send_request(socket_fd, SOCKET_REQ_CONTAINER) < 0) {
-			RTE_LOG(ERR, EAL, "  cannot request container fd!\n");
-			close(socket_fd);
-			return -1;
-		}
-		vfio_container_fd = vfio_mp_sync_receive_fd(socket_fd);
-		if (vfio_container_fd < 0) {
-			RTE_LOG(ERR, EAL, "  cannot get container fd!\n");
-			close(socket_fd);
-			return -1;
+	}
+	/*
+	 * if we're in a secondary process, request container fd from the
+	 * primary process via mp channel
+	 */
+	p->req = SOCKET_REQ_CONTAINER;
+	strcpy(mp_req.name, EAL_VFIO_MP);
+	mp_req.len_param = sizeof(*p);
+	mp_req.num_fds = 0;
+
+	vfio_container_fd = -1;
+	if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 &&
+	    mp_reply.nb_received == 1) {
+		mp_rep = &mp_reply.msgs[0];
+		p = (struct vfio_mp_param *)mp_rep->param;
+		if (p->result == SOCKET_OK && mp_rep->num_fds == 1) {
+			free(mp_reply.msgs);
+			return mp_rep->fds[0];
 		}
-		close(socket_fd);
-		return vfio_container_fd;
+		free(mp_reply.msgs);
 	}
 
+	RTE_LOG(ERR, EAL, "  cannot request container fd\n");
 	return -1;
 }
 
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.h b/lib/librte_eal/linuxapp/eal/eal_vfio.h
index c788bba..c8c6ee4 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.h
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.h
@@ -84,15 +84,6 @@ struct vfio_iommu_spapr_tce_info {
 #define VFIO_MAX_GROUPS RTE_MAX_VFIO_GROUPS
 
 /*
- * Function prototypes for VFIO multiprocess sync functions
- */
-int vfio_mp_sync_send_request(int socket, int req);
-int vfio_mp_sync_receive_request(int socket);
-int vfio_mp_sync_send_fd(int socket, int fd);
-int vfio_mp_sync_receive_fd(int socket);
-int vfio_mp_sync_connect_to_primary(void);
-
-/*
  * we don't need to store device fd's anywhere since they can be obtained from
  * the group fd via an ioctl() call.
  */
@@ -141,6 +132,8 @@ vfio_has_supported_extensions(int vfio_container_fd);
 
 int vfio_mp_sync_setup(void);
 
+#define EAL_VFIO_MP "eal_vfio_mp_sync"
+
 #define SOCKET_REQ_CONTAINER 0x100
 #define SOCKET_REQ_GROUP 0x200
 #define SOCKET_CLR_GROUP 0x300
@@ -148,6 +141,12 @@ int vfio_mp_sync_setup(void);
 #define SOCKET_NO_FD 0x1
 #define SOCKET_ERR 0xFF
 
+struct vfio_mp_param {
+	int req;
+	int result;
+	int group_num;
+};
+
 #endif /* VFIO_PRESENT */
 
 #endif /* EAL_VFIO_H_ */
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c b/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
index e19b571..9c202bb 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
@@ -1,32 +1,16 @@
 /* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2014 Intel Corporation
+ * Copyright(c) 2010-2018 Intel Corporation
  */
 
+#include <unistd.h>
 #include <string.h>
-#include <fcntl.h>
-#include <sys/socket.h>
-#include <pthread.h>
-
-/* sys/un.h with __USE_MISC uses strlen, which is unsafe */
-#ifdef __USE_MISC
-#define REMOVED_USE_MISC
-#undef __USE_MISC
-#endif
-#include <sys/un.h>
-/* make sure we redefine __USE_MISC only if it was previously undefined */
-#ifdef REMOVED_USE_MISC
-#define __USE_MISC
-#undef REMOVED_USE_MISC
-#endif
 
+#include <rte_compat.h>
 #include <rte_log.h>
-#include <rte_eal_memconfig.h>
-#include <rte_malloc.h>
 #include <rte_vfio.h>
+#include <rte_eal.h>
 
-#include "eal_filesystem.h"
 #include "eal_vfio.h"
-#include "eal_thread.h"
 
 /**
  * @file
@@ -37,358 +21,78 @@
 
 #ifdef VFIO_PRESENT
 
-#define SOCKET_PATH_FMT "%s/.%s_mp_socket"
-#define CMSGLEN (CMSG_LEN(sizeof(int)))
-#define FD_TO_CMSGHDR(fd, chdr) \
-		do {\
-			(chdr).cmsg_len = CMSGLEN;\
-			(chdr).cmsg_level = SOL_SOCKET;\
-			(chdr).cmsg_type = SCM_RIGHTS;\
-			memcpy((chdr).__cmsg_data, &(fd), sizeof(fd));\
-		} while (0)
-#define CMSGHDR_TO_FD(chdr, fd) \
-			memcpy(&(fd), (chdr).__cmsg_data, sizeof(fd))
-
-static pthread_t socket_thread;
-static int mp_socket_fd;
-
-
-/* get socket path (/var/run if root, $HOME otherwise) */
-static void
-get_socket_path(char *buffer, int bufsz)
-{
-	const char *dir = "/var/run";
-	const char *home_dir = getenv("HOME");
-
-	if (getuid() != 0 && home_dir != NULL)
-		dir = home_dir;
-
-	/* use current prefix as file path */
-	snprintf(buffer, bufsz, SOCKET_PATH_FMT, dir,
-			internal_config.hugefile_prefix);
-}
-
-
-
-/*
- * data flow for socket comm protocol:
- * 1. client sends SOCKET_REQ_CONTAINER or SOCKET_REQ_GROUP
- * 1a. in case of SOCKET_REQ_GROUP, client also then sends group number
- * 2. server receives message
- * 2a. in case of invalid group, SOCKET_ERR is sent back to client
- * 2b. in case of unbound group, SOCKET_NO_FD is sent back to client
- * 2c. in case of valid group, SOCKET_OK is sent and immediately followed by fd
- *
- * in case of any error, socket is closed.
- */
-
-/* send a request, return -1 on error */
-int
-vfio_mp_sync_send_request(int socket, int req)
-{
-	struct msghdr hdr;
-	struct iovec iov;
-	int buf;
-	int ret;
-
-	memset(&hdr, 0, sizeof(hdr));
-
-	buf = req;
-
-	hdr.msg_iov = &iov;
-	hdr.msg_iovlen = 1;
-	iov.iov_base = (char *) &buf;
-	iov.iov_len = sizeof(buf);
-
-	ret = sendmsg(socket, &hdr, 0);
-	if (ret < 0)
-		return -1;
-	return 0;
-}
-
-/* receive a request and return it */
-int
-vfio_mp_sync_receive_request(int socket)
-{
-	int buf;
-	struct msghdr hdr;
-	struct iovec iov;
-	int ret, req;
-
-	memset(&hdr, 0, sizeof(hdr));
-
-	buf = SOCKET_ERR;
-
-	hdr.msg_iov = &iov;
-	hdr.msg_iovlen = 1;
-	iov.iov_base = (char *) &buf;
-	iov.iov_len = sizeof(buf);
-
-	ret = recvmsg(socket, &hdr, 0);
-	if (ret < 0)
-		return -1;
-
-	req = buf;
-
-	return req;
-}
-
-/* send OK in message, fd in control message */
-int
-vfio_mp_sync_send_fd(int socket, int fd)
+static int
+vfio_mp_primary(const struct rte_mp_msg *msg, const void *peer)
 {
-	int buf;
-	struct msghdr hdr;
-	struct cmsghdr *chdr;
-	char chdr_buf[CMSGLEN];
-	struct iovec iov;
+	int fd = -1;
 	int ret;
+	struct rte_mp_msg reply;
+	struct vfio_mp_param *r = (struct vfio_mp_param *)reply.param;
+	const struct vfio_mp_param *m =
+		(const struct vfio_mp_param *)msg->param;
 
-	chdr = (struct cmsghdr *) chdr_buf;
-	memset(chdr, 0, sizeof(chdr_buf));
-	memset(&hdr, 0, sizeof(hdr));
-
-	hdr.msg_iov = &iov;
-	hdr.msg_iovlen = 1;
-	iov.iov_base = (char *) &buf;
-	iov.iov_len = sizeof(buf);
-	hdr.msg_control = chdr;
-	hdr.msg_controllen = CMSGLEN;
-
-	buf = SOCKET_OK;
-	FD_TO_CMSGHDR(fd, *chdr);
-
-	ret = sendmsg(socket, &hdr, 0);
-	if (ret < 0)
-		return -1;
-	return 0;
-}
-
-/* receive OK in message, fd in control message */
-int
-vfio_mp_sync_receive_fd(int socket)
-{
-	int buf;
-	struct msghdr hdr;
-	struct cmsghdr *chdr;
-	char chdr_buf[CMSGLEN];
-	struct iovec iov;
-	int ret, req, fd;
-
-	buf = SOCKET_ERR;
-
-	chdr = (struct cmsghdr *) chdr_buf;
-	memset(chdr, 0, sizeof(chdr_buf));
-	memset(&hdr, 0, sizeof(hdr));
-
-	hdr.msg_iov = &iov;
-	hdr.msg_iovlen = 1;
-	iov.iov_base = (char *) &buf;
-	iov.iov_len = sizeof(buf);
-	hdr.msg_control = chdr;
-	hdr.msg_controllen = CMSGLEN;
-
-	ret = recvmsg(socket, &hdr, 0);
-	if (ret < 0)
-		return -1;
-
-	req = buf;
-
-	if (req != SOCKET_OK)
-		return -1;
-
-	CMSGHDR_TO_FD(*chdr, fd);
-
-	return fd;
-}
-
-/* connect socket_fd in secondary process to the primary process's socket */
-int
-vfio_mp_sync_connect_to_primary(void)
-{
-	struct sockaddr_un addr;
-	socklen_t sockaddr_len;
-	int socket_fd;
-
-	/* set up a socket */
-	socket_fd = socket(AF_UNIX, SOCK_SEQPACKET, 0);
-	if (socket_fd < 0) {
-		RTE_LOG(ERR, EAL, "Failed to create socket!\n");
+	if (msg->len_param != sizeof(*m)) {
+		RTE_LOG(ERR, EAL, "vfio received invalid message!\n");
 		return -1;
 	}
 
-	get_socket_path(addr.sun_path, sizeof(addr.sun_path));
-	addr.sun_family = AF_UNIX;
-
-	sockaddr_len = sizeof(struct sockaddr_un);
-
-	if (connect(socket_fd, (struct sockaddr *) &addr, sockaddr_len) == 0)
-		return socket_fd;
-
-	/* if connect failed */
-	close(socket_fd);
-	return -1;
-}
-
+	memset(&reply, 0, sizeof(reply));
 
-
-/*
- * socket listening thread for primary process
- */
-static __attribute__((noreturn)) void *
-vfio_mp_sync_thread(void __rte_unused * arg)
-{
-	int ret, fd, vfio_data;
-
-	/* wait for requests on the socket */
-	for (;;) {
-		int conn_sock;
-		struct sockaddr_un addr;
-		socklen_t sockaddr_len = sizeof(addr);
-
-		/* this is a blocking call */
-		conn_sock = accept(mp_socket_fd, (struct sockaddr *) &addr,
-				&sockaddr_len);
-
-		/* just restart on error */
-		if (conn_sock == -1)
-			continue;
-
-		/* set socket to linger after close */
-		struct linger l;
-		l.l_onoff = 1;
-		l.l_linger = 60;
-
-		if (setsockopt(conn_sock, SOL_SOCKET, SO_LINGER, &l, sizeof(l)) < 0)
-			RTE_LOG(WARNING, EAL, "Cannot set SO_LINGER option "
-					"on listen socket (%s)\n", strerror(errno));
-
-		ret = vfio_mp_sync_receive_request(conn_sock);
-
-		switch (ret) {
-		case SOCKET_REQ_CONTAINER:
-			fd = rte_vfio_get_container_fd();
-			if (fd < 0)
-				vfio_mp_sync_send_request(conn_sock, SOCKET_ERR);
-			else
-				vfio_mp_sync_send_fd(conn_sock, fd);
-			if (fd >= 0)
-				close(fd);
-			break;
-		case SOCKET_REQ_GROUP:
-			/* wait for group number */
-			vfio_data = vfio_mp_sync_receive_request(conn_sock);
-			if (vfio_data < 0) {
-				close(conn_sock);
-				continue;
-			}
-
-			fd = rte_vfio_get_group_fd(vfio_data);
-
-			if (fd < 0)
-				vfio_mp_sync_send_request(conn_sock, SOCKET_ERR);
+	switch (m->req) {
+	case SOCKET_REQ_GROUP:
+		r->req = SOCKET_REQ_GROUP;
+		r->group_num = m->group_num;
+		fd = rte_vfio_get_group_fd(m->group_num);
+		if (fd < 0)
+			r->result = SOCKET_ERR;
+		else if (fd == 0)
 			/* if VFIO group exists but isn't bound to VFIO driver */
-			else if (fd == 0)
-				vfio_mp_sync_send_request(conn_sock, SOCKET_NO_FD);
+			r->result = SOCKET_NO_FD;
+		else {
 			/* if group exists and is bound to VFIO driver */
-			else {
-				vfio_mp_sync_send_request(conn_sock, SOCKET_OK);
-				vfio_mp_sync_send_fd(conn_sock, fd);
-			}
-			break;
-		case SOCKET_CLR_GROUP:
-			/* wait for group fd */
-			vfio_data = vfio_mp_sync_receive_request(conn_sock);
-			if (vfio_data < 0) {
-				close(conn_sock);
-				continue;
-			}
-
-			ret = rte_vfio_clear_group(vfio_data);
-
-			if (ret < 0)
-				vfio_mp_sync_send_request(conn_sock, SOCKET_NO_FD);
-			else
-				vfio_mp_sync_send_request(conn_sock, SOCKET_OK);
-			break;
-		default:
-			vfio_mp_sync_send_request(conn_sock, SOCKET_ERR);
-			break;
+			r->result = SOCKET_OK;
+			reply.num_fds = 1;
+			reply.fds[0] = fd;
 		}
-		close(conn_sock);
-	}
-}
-
-static int
-vfio_mp_sync_socket_setup(void)
-{
-	int ret, socket_fd;
-	struct sockaddr_un addr;
-	socklen_t sockaddr_len;
-
-	/* set up a socket */
-	socket_fd = socket(AF_UNIX, SOCK_SEQPACKET, 0);
-	if (socket_fd < 0) {
-		RTE_LOG(ERR, EAL, "Failed to create socket!\n");
-		return -1;
-	}
-
-	get_socket_path(addr.sun_path, sizeof(addr.sun_path));
-	addr.sun_family = AF_UNIX;
-
-	sockaddr_len = sizeof(struct sockaddr_un);
-
-	unlink(addr.sun_path);
-
-	ret = bind(socket_fd, (struct sockaddr *) &addr, sockaddr_len);
-	if (ret) {
-		RTE_LOG(ERR, EAL, "Failed to bind socket: %s!\n", strerror(errno));
-		close(socket_fd);
-		return -1;
-	}
-
-	ret = listen(socket_fd, 50);
-	if (ret) {
-		RTE_LOG(ERR, EAL, "Failed to listen: %s!\n", strerror(errno));
-		close(socket_fd);
+		break;
+	case SOCKET_CLR_GROUP:
+		r->req = SOCKET_CLR_GROUP;
+		r->group_num = m->group_num;
+		if (rte_vfio_clear_group(m->group_num) < 0)
+			r->result = SOCKET_NO_FD;
+		else
+			r->result = SOCKET_OK;
+		break;
+	case SOCKET_REQ_CONTAINER:
+		r->req = SOCKET_REQ_CONTAINER;
+		fd = rte_vfio_get_container_fd();
+		if (fd < 0)
+			r->result = SOCKET_ERR;
+		else {
+			r->result = SOCKET_OK;
+			reply.num_fds = 1;
+			reply.fds[0] = fd;
+		}
+		break;
+	default:
+		RTE_LOG(ERR, EAL, "vfio received invalid message!\n");
 		return -1;
 	}
 
-	/* save the socket in local configuration */
-	mp_socket_fd = socket_fd;
+	strcpy(reply.name, EAL_VFIO_MP);
+	reply.len_param = sizeof(*r);
 
-	return 0;
+	ret = rte_mp_reply(&reply, peer);
+	if (m->req == SOCKET_REQ_CONTAINER && fd >= 0)
+		close(fd);
+	return ret;
 }
 
-/*
- * set up a local socket and tell it to listen for incoming connections
- */
 int
 vfio_mp_sync_setup(void)
 {
-	int ret;
-	char thread_name[RTE_MAX_THREAD_NAME_LEN];
-
-	if (vfio_mp_sync_socket_setup() < 0) {
-		RTE_LOG(ERR, EAL, "Failed to set up local socket!\n");
-		return -1;
-	}
-
-	ret = pthread_create(&socket_thread, NULL,
-			vfio_mp_sync_thread, NULL);
-	if (ret) {
-		RTE_LOG(ERR, EAL,
-			"Failed to create thread for communication with secondary processes!\n");
-		close(mp_socket_fd);
-		return -1;
-	}
-
-	/* Set thread_name for aid in debugging. */
-	snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "vfio-sync");
-	ret = rte_thread_setname(socket_thread, thread_name);
-	if (ret)
-		RTE_LOG(DEBUG, EAL,
-			"Failed to set thread name for secondary processes!\n");
+	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+		return rte_mp_action_register(EAL_VFIO_MP, vfio_mp_primary);
 
 	return 0;
 }
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 88+ messages in thread

* Re: [PATCH v7] vfio: change to use generic multi-process channel
  2018-04-15 15:06     ` [PATCH v7] " Jianfeng Tan
@ 2018-04-15 15:10       ` Tan, Jianfeng
  2018-04-17 23:04       ` Thomas Monjalon
  1 sibling, 0 replies; 88+ messages in thread
From: Tan, Jianfeng @ 2018-04-15 15:10 UTC (permalink / raw)
  To: dev; +Cc: thomas, anatoly.burakov

Sorry, I forgot the version change log. FYI:

v6->v7:
   - Rebase on master.
v5->v6: (Address comments from Anatoly)
   - Naming, return checking, logging.
   - Move vfio action register after rte_bus_probe().


On 4/15/2018 11:06 PM, Jianfeng Tan wrote:
> Previously, vfio used its own private channel for the secondary
> process to get container fd and group fd from the primary process.
>
> This patch changes vfio to use the generic mp channel.
>
> Test:
>    1. Bind two NICs to vfio-pci.
>
>    2. Start the primary and secondary process.
>      $ (symmetric_mp) -c 2 -- -p 3 --num-procs=2 --proc-id=0
>      $ (symmetric_mp) -c 4 --proc-type=auto -- -p 3 \
> 				--num-procs=2 --proc-id=1
>
> Cc: anatoly.burakov@intel.com
>
> Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
> Acked-by: Anatoly Burakov <anatoly.burakov@intel.com>
> ---
>   lib/librte_eal/linuxapp/eal/eal.c              |  22 +-
>   lib/librte_eal/linuxapp/eal/eal_vfio.c         | 178 +++++------
>   lib/librte_eal/linuxapp/eal/eal_vfio.h         |  17 +-
>   lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c | 410 ++++---------------------
>   4 files changed, 148 insertions(+), 479 deletions(-)
>
> diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
> index 99c2242..21afa73 100644
> --- a/lib/librte_eal/linuxapp/eal/eal.c
> +++ b/lib/librte_eal/linuxapp/eal/eal.c
> @@ -696,24 +696,8 @@ rte_eal_iopl_init(void)
>   #ifdef VFIO_PRESENT
>   static int rte_eal_vfio_setup(void)
>   {
> -	int vfio_enabled = 0;
> -
>   	if (rte_vfio_enable("vfio"))
>   		return -1;
> -	vfio_enabled = rte_vfio_is_enabled("vfio");
> -
> -	if (vfio_enabled) {
> -
> -		/* if we are primary process, create a thread to communicate with
> -		 * secondary processes. the thread will use a socket to wait for
> -		 * requests from secondary process to send open file descriptors,
> -		 * because VFIO does not allow multiple open descriptors on a group or
> -		 * VFIO container.
> -		 */
> -		if (internal_config.process_type == RTE_PROC_PRIMARY &&
> -				vfio_mp_sync_setup() < 0)
> -			return -1;
> -	}
>   
>   	return 0;
>   }
> @@ -970,6 +954,12 @@ rte_eal_init(int argc, char **argv)
>   		return -1;
>   	}
>   
> +#ifdef VFIO_PRESENT
> +	/* Register mp action after probe() so that we got enough info */
> +	if (rte_vfio_is_enabled("vfio") && vfio_mp_sync_setup() < 0)
> +		return -1;
> +#endif
> +
>   	/* initialize default service/lcore mappings and start running. Ignore
>   	 * -ENOTSUP, as it indicates no service coremask passed to EAL.
>   	 */
> diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c b/lib/librte_eal/linuxapp/eal/eal_vfio.c
> index 16ee730..957a537 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_vfio.c
> +++ b/lib/librte_eal/linuxapp/eal/eal_vfio.c
> @@ -1,5 +1,5 @@
>   /* SPDX-License-Identifier: BSD-3-Clause
> - * Copyright(c) 2010-2014 Intel Corporation
> + * Copyright(c) 2010-2018 Intel Corporation
>    */
>   
>   #include <inttypes.h>
> @@ -290,6 +290,10 @@ rte_vfio_get_group_fd(int iommu_group_num)
>   	int vfio_group_fd;
>   	char filename[PATH_MAX];
>   	struct vfio_group *cur_grp;
> +	struct rte_mp_msg mp_req, *mp_rep;
> +	struct rte_mp_reply mp_reply;
> +	struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
> +	struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
>   
>   	/* check if we already have the group descriptor open */
>   	for (i = 0; i < VFIO_MAX_GROUPS; i++)
> @@ -350,50 +354,34 @@ rte_vfio_get_group_fd(int iommu_group_num)
>   		return vfio_group_fd;
>   	}
>   	/* if we're in a secondary process, request group fd from the primary
> -	 * process via our socket
> +	 * process via mp channel.
>   	 */
> -	else {
> -		int socket_fd, ret;
> -
> -		socket_fd = vfio_mp_sync_connect_to_primary();
> -
> -		if (socket_fd < 0) {
> -			RTE_LOG(ERR, EAL, "  cannot connect to primary process!\n");
> -			return -1;
> -		}
> -		if (vfio_mp_sync_send_request(socket_fd, SOCKET_REQ_GROUP) < 0) {
> -			RTE_LOG(ERR, EAL, "  cannot request container fd!\n");
> -			close(socket_fd);
> -			return -1;
> -		}
> -		if (vfio_mp_sync_send_request(socket_fd, iommu_group_num) < 0) {
> -			RTE_LOG(ERR, EAL, "  cannot send group number!\n");
> -			close(socket_fd);
> -			return -1;
> -		}
> -		ret = vfio_mp_sync_receive_request(socket_fd);
> -		switch (ret) {
> -		case SOCKET_NO_FD:
> -			close(socket_fd);
> -			return 0;
> -		case SOCKET_OK:
> -			vfio_group_fd = vfio_mp_sync_receive_fd(socket_fd);
> -			/* if we got the fd, store it and return it */
> -			if (vfio_group_fd > 0) {
> -				close(socket_fd);
> -				cur_grp->group_num = iommu_group_num;
> -				cur_grp->fd = vfio_group_fd;
> -				vfio_cfg.vfio_active_groups++;
> -				return vfio_group_fd;
> -			}
> -			/* fall-through on error */
> -		default:
> -			RTE_LOG(ERR, EAL, "  cannot get container fd!\n");
> -			close(socket_fd);
> -			return -1;
> +	p->req = SOCKET_REQ_GROUP;
> +	p->group_num = iommu_group_num;
> +	strcpy(mp_req.name, EAL_VFIO_MP);
> +	mp_req.len_param = sizeof(*p);
> +	mp_req.num_fds = 0;
> +
> +	vfio_group_fd = -1;
> +	if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 &&
> +	    mp_reply.nb_received == 1) {
> +		mp_rep = &mp_reply.msgs[0];
> +		p = (struct vfio_mp_param *)mp_rep->param;
> +		if (p->result == SOCKET_OK && mp_rep->num_fds == 1) {
> +			cur_grp->group_num = iommu_group_num;
> +			vfio_group_fd = mp_rep->fds[0];
> +			cur_grp->fd = vfio_group_fd;
> +			vfio_cfg.vfio_active_groups++;
> +		} else if (p->result == SOCKET_NO_FD) {
> +			RTE_LOG(ERR, EAL, "  bad VFIO group fd\n");
> +			vfio_group_fd = 0;
>   		}
> +		free(mp_reply.msgs);
>   	}
> -	return -1;
> +
> +	if (vfio_group_fd < 0)
> +		RTE_LOG(ERR, EAL, "  cannot request group fd\n");
> +	return vfio_group_fd;
>   }
>   
>   
> @@ -481,7 +469,10 @@ int
>   rte_vfio_clear_group(int vfio_group_fd)
>   {
>   	int i;
> -	int socket_fd, ret;
> +	struct rte_mp_msg mp_req, *mp_rep;
> +	struct rte_mp_reply mp_reply;
> +	struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
> +	struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
>   
>   	if (internal_config.process_type == RTE_PROC_PRIMARY) {
>   
> @@ -495,43 +486,27 @@ rte_vfio_clear_group(int vfio_group_fd)
>   		return 0;
>   	}
>   
> -	/* This is just for SECONDARY processes */
> -	socket_fd = vfio_mp_sync_connect_to_primary();
> -
> -	if (socket_fd < 0) {
> -		RTE_LOG(ERR, EAL, "  cannot connect to primary process!\n");
> -		return -1;
> -	}
> -
> -	if (vfio_mp_sync_send_request(socket_fd, SOCKET_CLR_GROUP) < 0) {
> -		RTE_LOG(ERR, EAL, "  cannot request container fd!\n");
> -		close(socket_fd);
> -		return -1;
> -	}
> +	p->req = SOCKET_CLR_GROUP;
> +	p->group_num = vfio_group_fd;
> +	strcpy(mp_req.name, EAL_VFIO_MP);
> +	mp_req.len_param = sizeof(*p);
> +	mp_req.num_fds = 0;
> +
> +	if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 &&
> +	    mp_reply.nb_received == 1) {
> +		mp_rep = &mp_reply.msgs[0];
> +		p = (struct vfio_mp_param *)mp_rep->param;
> +		if (p->result == SOCKET_OK) {
> +			free(mp_reply.msgs);
> +			return 0;
> +		} else if (p->result == SOCKET_NO_FD)
> +			RTE_LOG(ERR, EAL, "  BAD VFIO group fd!\n");
> +		else
> +			RTE_LOG(ERR, EAL, "  no such VFIO group fd!\n");
>   
> -	if (vfio_mp_sync_send_request(socket_fd, vfio_group_fd) < 0) {
> -		RTE_LOG(ERR, EAL, "  cannot send group fd!\n");
> -		close(socket_fd);
> -		return -1;
> +		free(mp_reply.msgs);
>   	}
>   
> -	ret = vfio_mp_sync_receive_request(socket_fd);
> -	switch (ret) {
> -	case SOCKET_NO_FD:
> -		RTE_LOG(ERR, EAL, "  BAD VFIO group fd!\n");
> -		close(socket_fd);
> -		break;
> -	case SOCKET_OK:
> -		close(socket_fd);
> -		return 0;
> -	case SOCKET_ERR:
> -		RTE_LOG(ERR, EAL, "  Socket error\n");
> -		close(socket_fd);
> -		break;
> -	default:
> -		RTE_LOG(ERR, EAL, "  UNKNOWN reply, %d\n", ret);
> -		close(socket_fd);
> -	}
>   	return -1;
>   }
>   
> @@ -924,6 +899,11 @@ int
>   rte_vfio_get_container_fd(void)
>   {
>   	int ret, vfio_container_fd;
> +	struct rte_mp_msg mp_req, *mp_rep;
> +	struct rte_mp_reply mp_reply;
> +	struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
> +	struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
> +
>   
>   	/* if we're in a primary process, try to open the container */
>   	if (internal_config.process_type == RTE_PROC_PRIMARY) {
> @@ -954,33 +934,29 @@ rte_vfio_get_container_fd(void)
>   		}
>   
>   		return vfio_container_fd;
> -	} else {
> -		/*
> -		 * if we're in a secondary process, request container fd from the
> -		 * primary process via our socket
> -		 */
> -		int socket_fd;
> -
> -		socket_fd = vfio_mp_sync_connect_to_primary();
> -		if (socket_fd < 0) {
> -			RTE_LOG(ERR, EAL, "  cannot connect to primary process!\n");
> -			return -1;
> -		}
> -		if (vfio_mp_sync_send_request(socket_fd, SOCKET_REQ_CONTAINER) < 0) {
> -			RTE_LOG(ERR, EAL, "  cannot request container fd!\n");
> -			close(socket_fd);
> -			return -1;
> -		}
> -		vfio_container_fd = vfio_mp_sync_receive_fd(socket_fd);
> -		if (vfio_container_fd < 0) {
> -			RTE_LOG(ERR, EAL, "  cannot get container fd!\n");
> -			close(socket_fd);
> -			return -1;
> +	}
> +	/*
> +	 * if we're in a secondary process, request container fd from the
> +	 * primary process via mp channel
> +	 */
> +	p->req = SOCKET_REQ_CONTAINER;
> +	strcpy(mp_req.name, EAL_VFIO_MP);
> +	mp_req.len_param = sizeof(*p);
> +	mp_req.num_fds = 0;
> +
> +	vfio_container_fd = -1;
> +	if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 &&
> +	    mp_reply.nb_received == 1) {
> +		mp_rep = &mp_reply.msgs[0];
> +		p = (struct vfio_mp_param *)mp_rep->param;
> +		if (p->result == SOCKET_OK && mp_rep->num_fds == 1) {
> +			free(mp_reply.msgs);
> +			return mp_rep->fds[0];
>   		}
> -		close(socket_fd);
> -		return vfio_container_fd;
> +		free(mp_reply.msgs);
>   	}
>   
> +	RTE_LOG(ERR, EAL, "  cannot request container fd\n");
>   	return -1;
>   }
>   
> diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.h b/lib/librte_eal/linuxapp/eal/eal_vfio.h
> index c788bba..c8c6ee4 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_vfio.h
> +++ b/lib/librte_eal/linuxapp/eal/eal_vfio.h
> @@ -84,15 +84,6 @@ struct vfio_iommu_spapr_tce_info {
>   #define VFIO_MAX_GROUPS RTE_MAX_VFIO_GROUPS
>   
>   /*
> - * Function prototypes for VFIO multiprocess sync functions
> - */
> -int vfio_mp_sync_send_request(int socket, int req);
> -int vfio_mp_sync_receive_request(int socket);
> -int vfio_mp_sync_send_fd(int socket, int fd);
> -int vfio_mp_sync_receive_fd(int socket);
> -int vfio_mp_sync_connect_to_primary(void);
> -
> -/*
>    * we don't need to store device fd's anywhere since they can be obtained from
>    * the group fd via an ioctl() call.
>    */
> @@ -141,6 +132,8 @@ vfio_has_supported_extensions(int vfio_container_fd);
>   
>   int vfio_mp_sync_setup(void);
>   
> +#define EAL_VFIO_MP "eal_vfio_mp_sync"
> +
>   #define SOCKET_REQ_CONTAINER 0x100
>   #define SOCKET_REQ_GROUP 0x200
>   #define SOCKET_CLR_GROUP 0x300
> @@ -148,6 +141,12 @@ int vfio_mp_sync_setup(void);
>   #define SOCKET_NO_FD 0x1
>   #define SOCKET_ERR 0xFF
>   
> +struct vfio_mp_param {
> +	int req;
> +	int result;
> +	int group_num;
> +};
> +
>   #endif /* VFIO_PRESENT */
>   
>   #endif /* EAL_VFIO_H_ */
> diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c b/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
> index e19b571..9c202bb 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
> +++ b/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
> @@ -1,32 +1,16 @@
>   /* SPDX-License-Identifier: BSD-3-Clause
> - * Copyright(c) 2010-2014 Intel Corporation
> + * Copyright(c) 2010-2018 Intel Corporation
>    */
>   
> +#include <unistd.h>
>   #include <string.h>
> -#include <fcntl.h>
> -#include <sys/socket.h>
> -#include <pthread.h>
> -
> -/* sys/un.h with __USE_MISC uses strlen, which is unsafe */
> -#ifdef __USE_MISC
> -#define REMOVED_USE_MISC
> -#undef __USE_MISC
> -#endif
> -#include <sys/un.h>
> -/* make sure we redefine __USE_MISC only if it was previously undefined */
> -#ifdef REMOVED_USE_MISC
> -#define __USE_MISC
> -#undef REMOVED_USE_MISC
> -#endif
>   
> +#include <rte_compat.h>
>   #include <rte_log.h>
> -#include <rte_eal_memconfig.h>
> -#include <rte_malloc.h>
>   #include <rte_vfio.h>
> +#include <rte_eal.h>
>   
> -#include "eal_filesystem.h"
>   #include "eal_vfio.h"
> -#include "eal_thread.h"
>   
>   /**
>    * @file
> @@ -37,358 +21,78 @@
>   
>   #ifdef VFIO_PRESENT
>   
> -#define SOCKET_PATH_FMT "%s/.%s_mp_socket"
> -#define CMSGLEN (CMSG_LEN(sizeof(int)))
> -#define FD_TO_CMSGHDR(fd, chdr) \
> -		do {\
> -			(chdr).cmsg_len = CMSGLEN;\
> -			(chdr).cmsg_level = SOL_SOCKET;\
> -			(chdr).cmsg_type = SCM_RIGHTS;\
> -			memcpy((chdr).__cmsg_data, &(fd), sizeof(fd));\
> -		} while (0)
> -#define CMSGHDR_TO_FD(chdr, fd) \
> -			memcpy(&(fd), (chdr).__cmsg_data, sizeof(fd))
> -
> -static pthread_t socket_thread;
> -static int mp_socket_fd;
> -
> -
> -/* get socket path (/var/run if root, $HOME otherwise) */
> -static void
> -get_socket_path(char *buffer, int bufsz)
> -{
> -	const char *dir = "/var/run";
> -	const char *home_dir = getenv("HOME");
> -
> -	if (getuid() != 0 && home_dir != NULL)
> -		dir = home_dir;
> -
> -	/* use current prefix as file path */
> -	snprintf(buffer, bufsz, SOCKET_PATH_FMT, dir,
> -			internal_config.hugefile_prefix);
> -}
> -
> -
> -
> -/*
> - * data flow for socket comm protocol:
> - * 1. client sends SOCKET_REQ_CONTAINER or SOCKET_REQ_GROUP
> - * 1a. in case of SOCKET_REQ_GROUP, client also then sends group number
> - * 2. server receives message
> - * 2a. in case of invalid group, SOCKET_ERR is sent back to client
> - * 2b. in case of unbound group, SOCKET_NO_FD is sent back to client
> - * 2c. in case of valid group, SOCKET_OK is sent and immediately followed by fd
> - *
> - * in case of any error, socket is closed.
> - */
> -
> -/* send a request, return -1 on error */
> -int
> -vfio_mp_sync_send_request(int socket, int req)
> -{
> -	struct msghdr hdr;
> -	struct iovec iov;
> -	int buf;
> -	int ret;
> -
> -	memset(&hdr, 0, sizeof(hdr));
> -
> -	buf = req;
> -
> -	hdr.msg_iov = &iov;
> -	hdr.msg_iovlen = 1;
> -	iov.iov_base = (char *) &buf;
> -	iov.iov_len = sizeof(buf);
> -
> -	ret = sendmsg(socket, &hdr, 0);
> -	if (ret < 0)
> -		return -1;
> -	return 0;
> -}
> -
> -/* receive a request and return it */
> -int
> -vfio_mp_sync_receive_request(int socket)
> -{
> -	int buf;
> -	struct msghdr hdr;
> -	struct iovec iov;
> -	int ret, req;
> -
> -	memset(&hdr, 0, sizeof(hdr));
> -
> -	buf = SOCKET_ERR;
> -
> -	hdr.msg_iov = &iov;
> -	hdr.msg_iovlen = 1;
> -	iov.iov_base = (char *) &buf;
> -	iov.iov_len = sizeof(buf);
> -
> -	ret = recvmsg(socket, &hdr, 0);
> -	if (ret < 0)
> -		return -1;
> -
> -	req = buf;
> -
> -	return req;
> -}
> -
> -/* send OK in message, fd in control message */
> -int
> -vfio_mp_sync_send_fd(int socket, int fd)
> +static int
> +vfio_mp_primary(const struct rte_mp_msg *msg, const void *peer)
>   {
> -	int buf;
> -	struct msghdr hdr;
> -	struct cmsghdr *chdr;
> -	char chdr_buf[CMSGLEN];
> -	struct iovec iov;
> +	int fd = -1;
>   	int ret;
> +	struct rte_mp_msg reply;
> +	struct vfio_mp_param *r = (struct vfio_mp_param *)reply.param;
> +	const struct vfio_mp_param *m =
> +		(const struct vfio_mp_param *)msg->param;
>   
> -	chdr = (struct cmsghdr *) chdr_buf;
> -	memset(chdr, 0, sizeof(chdr_buf));
> -	memset(&hdr, 0, sizeof(hdr));
> -
> -	hdr.msg_iov = &iov;
> -	hdr.msg_iovlen = 1;
> -	iov.iov_base = (char *) &buf;
> -	iov.iov_len = sizeof(buf);
> -	hdr.msg_control = chdr;
> -	hdr.msg_controllen = CMSGLEN;
> -
> -	buf = SOCKET_OK;
> -	FD_TO_CMSGHDR(fd, *chdr);
> -
> -	ret = sendmsg(socket, &hdr, 0);
> -	if (ret < 0)
> -		return -1;
> -	return 0;
> -}
> -
> -/* receive OK in message, fd in control message */
> -int
> -vfio_mp_sync_receive_fd(int socket)
> -{
> -	int buf;
> -	struct msghdr hdr;
> -	struct cmsghdr *chdr;
> -	char chdr_buf[CMSGLEN];
> -	struct iovec iov;
> -	int ret, req, fd;
> -
> -	buf = SOCKET_ERR;
> -
> -	chdr = (struct cmsghdr *) chdr_buf;
> -	memset(chdr, 0, sizeof(chdr_buf));
> -	memset(&hdr, 0, sizeof(hdr));
> -
> -	hdr.msg_iov = &iov;
> -	hdr.msg_iovlen = 1;
> -	iov.iov_base = (char *) &buf;
> -	iov.iov_len = sizeof(buf);
> -	hdr.msg_control = chdr;
> -	hdr.msg_controllen = CMSGLEN;
> -
> -	ret = recvmsg(socket, &hdr, 0);
> -	if (ret < 0)
> -		return -1;
> -
> -	req = buf;
> -
> -	if (req != SOCKET_OK)
> -		return -1;
> -
> -	CMSGHDR_TO_FD(*chdr, fd);
> -
> -	return fd;
> -}
> -
> -/* connect socket_fd in secondary process to the primary process's socket */
> -int
> -vfio_mp_sync_connect_to_primary(void)
> -{
> -	struct sockaddr_un addr;
> -	socklen_t sockaddr_len;
> -	int socket_fd;
> -
> -	/* set up a socket */
> -	socket_fd = socket(AF_UNIX, SOCK_SEQPACKET, 0);
> -	if (socket_fd < 0) {
> -		RTE_LOG(ERR, EAL, "Failed to create socket!\n");
> +	if (msg->len_param != sizeof(*m)) {
> +		RTE_LOG(ERR, EAL, "vfio received invalid message!\n");
>   		return -1;
>   	}
>   
> -	get_socket_path(addr.sun_path, sizeof(addr.sun_path));
> -	addr.sun_family = AF_UNIX;
> -
> -	sockaddr_len = sizeof(struct sockaddr_un);
> -
> -	if (connect(socket_fd, (struct sockaddr *) &addr, sockaddr_len) == 0)
> -		return socket_fd;
> -
> -	/* if connect failed */
> -	close(socket_fd);
> -	return -1;
> -}
> -
> +	memset(&reply, 0, sizeof(reply));
>   
> -
> -/*
> - * socket listening thread for primary process
> - */
> -static __attribute__((noreturn)) void *
> -vfio_mp_sync_thread(void __rte_unused * arg)
> -{
> -	int ret, fd, vfio_data;
> -
> -	/* wait for requests on the socket */
> -	for (;;) {
> -		int conn_sock;
> -		struct sockaddr_un addr;
> -		socklen_t sockaddr_len = sizeof(addr);
> -
> -		/* this is a blocking call */
> -		conn_sock = accept(mp_socket_fd, (struct sockaddr *) &addr,
> -				&sockaddr_len);
> -
> -		/* just restart on error */
> -		if (conn_sock == -1)
> -			continue;
> -
> -		/* set socket to linger after close */
> -		struct linger l;
> -		l.l_onoff = 1;
> -		l.l_linger = 60;
> -
> -		if (setsockopt(conn_sock, SOL_SOCKET, SO_LINGER, &l, sizeof(l)) < 0)
> -			RTE_LOG(WARNING, EAL, "Cannot set SO_LINGER option "
> -					"on listen socket (%s)\n", strerror(errno));
> -
> -		ret = vfio_mp_sync_receive_request(conn_sock);
> -
> -		switch (ret) {
> -		case SOCKET_REQ_CONTAINER:
> -			fd = rte_vfio_get_container_fd();
> -			if (fd < 0)
> -				vfio_mp_sync_send_request(conn_sock, SOCKET_ERR);
> -			else
> -				vfio_mp_sync_send_fd(conn_sock, fd);
> -			if (fd >= 0)
> -				close(fd);
> -			break;
> -		case SOCKET_REQ_GROUP:
> -			/* wait for group number */
> -			vfio_data = vfio_mp_sync_receive_request(conn_sock);
> -			if (vfio_data < 0) {
> -				close(conn_sock);
> -				continue;
> -			}
> -
> -			fd = rte_vfio_get_group_fd(vfio_data);
> -
> -			if (fd < 0)
> -				vfio_mp_sync_send_request(conn_sock, SOCKET_ERR);
> +	switch (m->req) {
> +	case SOCKET_REQ_GROUP:
> +		r->req = SOCKET_REQ_GROUP;
> +		r->group_num = m->group_num;
> +		fd = rte_vfio_get_group_fd(m->group_num);
> +		if (fd < 0)
> +			r->result = SOCKET_ERR;
> +		else if (fd == 0)
>   			/* if VFIO group exists but isn't bound to VFIO driver */
> -			else if (fd == 0)
> -				vfio_mp_sync_send_request(conn_sock, SOCKET_NO_FD);
> +			r->result = SOCKET_NO_FD;
> +		else {
>   			/* if group exists and is bound to VFIO driver */
> -			else {
> -				vfio_mp_sync_send_request(conn_sock, SOCKET_OK);
> -				vfio_mp_sync_send_fd(conn_sock, fd);
> -			}
> -			break;
> -		case SOCKET_CLR_GROUP:
> -			/* wait for group fd */
> -			vfio_data = vfio_mp_sync_receive_request(conn_sock);
> -			if (vfio_data < 0) {
> -				close(conn_sock);
> -				continue;
> -			}
> -
> -			ret = rte_vfio_clear_group(vfio_data);
> -
> -			if (ret < 0)
> -				vfio_mp_sync_send_request(conn_sock, SOCKET_NO_FD);
> -			else
> -				vfio_mp_sync_send_request(conn_sock, SOCKET_OK);
> -			break;
> -		default:
> -			vfio_mp_sync_send_request(conn_sock, SOCKET_ERR);
> -			break;
> +			r->result = SOCKET_OK;
> +			reply.num_fds = 1;
> +			reply.fds[0] = fd;
>   		}
> -		close(conn_sock);
> -	}
> -}
> -
> -static int
> -vfio_mp_sync_socket_setup(void)
> -{
> -	int ret, socket_fd;
> -	struct sockaddr_un addr;
> -	socklen_t sockaddr_len;
> -
> -	/* set up a socket */
> -	socket_fd = socket(AF_UNIX, SOCK_SEQPACKET, 0);
> -	if (socket_fd < 0) {
> -		RTE_LOG(ERR, EAL, "Failed to create socket!\n");
> -		return -1;
> -	}
> -
> -	get_socket_path(addr.sun_path, sizeof(addr.sun_path));
> -	addr.sun_family = AF_UNIX;
> -
> -	sockaddr_len = sizeof(struct sockaddr_un);
> -
> -	unlink(addr.sun_path);
> -
> -	ret = bind(socket_fd, (struct sockaddr *) &addr, sockaddr_len);
> -	if (ret) {
> -		RTE_LOG(ERR, EAL, "Failed to bind socket: %s!\n", strerror(errno));
> -		close(socket_fd);
> -		return -1;
> -	}
> -
> -	ret = listen(socket_fd, 50);
> -	if (ret) {
> -		RTE_LOG(ERR, EAL, "Failed to listen: %s!\n", strerror(errno));
> -		close(socket_fd);
> +		break;
> +	case SOCKET_CLR_GROUP:
> +		r->req = SOCKET_CLR_GROUP;
> +		r->group_num = m->group_num;
> +		if (rte_vfio_clear_group(m->group_num) < 0)
> +			r->result = SOCKET_NO_FD;
> +		else
> +			r->result = SOCKET_OK;
> +		break;
> +	case SOCKET_REQ_CONTAINER:
> +		r->req = SOCKET_REQ_CONTAINER;
> +		fd = rte_vfio_get_container_fd();
> +		if (fd < 0)
> +			r->result = SOCKET_ERR;
> +		else {
> +			r->result = SOCKET_OK;
> +			reply.num_fds = 1;
> +			reply.fds[0] = fd;
> +		}
> +		break;
> +	default:
> +		RTE_LOG(ERR, EAL, "vfio received invalid message!\n");
>   		return -1;
>   	}
>   
> -	/* save the socket in local configuration */
> -	mp_socket_fd = socket_fd;
> +	strcpy(reply.name, EAL_VFIO_MP);
> +	reply.len_param = sizeof(*r);
>   
> -	return 0;
> +	ret = rte_mp_reply(&reply, peer);
> +	if (m->req == SOCKET_REQ_CONTAINER && fd >= 0)
> +		close(fd);
> +	return ret;
>   }
>   
> -/*
> - * set up a local socket and tell it to listen for incoming connections
> - */
>   int
>   vfio_mp_sync_setup(void)
>   {
> -	int ret;
> -	char thread_name[RTE_MAX_THREAD_NAME_LEN];
> -
> -	if (vfio_mp_sync_socket_setup() < 0) {
> -		RTE_LOG(ERR, EAL, "Failed to set up local socket!\n");
> -		return -1;
> -	}
> -
> -	ret = pthread_create(&socket_thread, NULL,
> -			vfio_mp_sync_thread, NULL);
> -	if (ret) {
> -		RTE_LOG(ERR, EAL,
> -			"Failed to create thread for communication with secondary processes!\n");
> -		close(mp_socket_fd);
> -		return -1;
> -	}
> -
> -	/* Set thread_name for aid in debugging. */
> -	snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "vfio-sync");
> -	ret = rte_thread_setname(socket_thread, thread_name);
> -	if (ret)
> -		RTE_LOG(DEBUG, EAL,
> -			"Failed to set thread name for secondary processes!\n");
> +	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
> +		return rte_mp_action_register(EAL_VFIO_MP, vfio_mp_primary);
>   
>   	return 0;
>   }

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [PATCH v7] vfio: change to use generic multi-process channel
  2018-04-15 15:06     ` [PATCH v7] " Jianfeng Tan
  2018-04-15 15:10       ` Tan, Jianfeng
@ 2018-04-17 23:04       ` Thomas Monjalon
  1 sibling, 0 replies; 88+ messages in thread
From: Thomas Monjalon @ 2018-04-17 23:04 UTC (permalink / raw)
  To: Jianfeng Tan; +Cc: dev, anatoly.burakov

15/04/2018 17:06, Jianfeng Tan:
> Previously, vfio used its own private channel for the secondary
> process to get container fd and group fd from the primary process.
> 
> This patch changes vfio to use the generic mp channel.
> 
> Test:
>   1. Bind two NICs to vfio-pci.
> 
>   2. Start the primary and secondary process.
>     $ (symmetric_mp) -c 2 -- -p 3 --num-procs=2 --proc-id=0
>     $ (symmetric_mp) -c 4 --proc-type=auto -- -p 3 \
> 				--num-procs=2 --proc-id=1
> 
> Cc: anatoly.burakov@intel.com
> 
> Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
> Acked-by: Anatoly Burakov <anatoly.burakov@intel.com>

Applied, thanks

^ permalink raw reply	[flat|nested] 88+ messages in thread

end of thread, other threads:[~2018-04-17 23:04 UTC | newest]

Thread overview: 88+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-11-30 18:44 [PATCH 0/3] generic channel for multi-process communication Jianfeng Tan
2017-11-30 18:44 ` [PATCH 1/3] eal: add " Jianfeng Tan
2017-12-11 11:04   ` Burakov, Anatoly
2017-12-11 16:43   ` Ananyev, Konstantin
2017-11-30 18:44 ` [PATCH 2/3] eal: add synchronous " Jianfeng Tan
2017-12-11 11:39   ` Burakov, Anatoly
2017-12-11 16:49     ` Ananyev, Konstantin
2017-11-30 18:44 ` [PATCH 3/3] vfio: use the generic multi-process channel Jianfeng Tan
2017-12-11 12:01   ` Burakov, Anatoly
2017-12-11  9:59 ` [PATCH 0/3] generic channel for multi-process communication Burakov, Anatoly
2017-12-12  7:34   ` Tan, Jianfeng
2017-12-12 16:18     ` Burakov, Anatoly
2018-01-11  4:07 ` [PATCH v2 0/4] " Jianfeng Tan
2018-01-11  4:07   ` [PATCH v2 1/4] eal: add " Jianfeng Tan
2018-01-13 12:57     ` Burakov, Anatoly
2018-01-15 19:52     ` Ananyev, Konstantin
2018-01-11  4:07   ` [PATCH v2 2/4] eal: add and del secondary processes in the primary Jianfeng Tan
2018-01-13 13:11     ` Burakov, Anatoly
2018-01-15 21:45     ` Ananyev, Konstantin
2018-01-11  4:07   ` [PATCH v2 3/4] eal: add synchronous multi-process communication Jianfeng Tan
2018-01-13 13:41     ` Burakov, Anatoly
2018-01-16  0:00     ` Ananyev, Konstantin
2018-01-16  8:10       ` Tan, Jianfeng
2018-01-16 11:12         ` Ananyev, Konstantin
2018-01-16 16:47           ` Tan, Jianfeng
2018-01-17 10:50             ` Ananyev, Konstantin
2018-01-17 13:09               ` Tan, Jianfeng
2018-01-17 13:15                 ` Tan, Jianfeng
2018-01-17 17:20                 ` Ananyev, Konstantin
2018-01-11  4:07   ` [PATCH v2 4/4] vfio: use the generic multi-process channel Jianfeng Tan
2018-01-13 14:03     ` Burakov, Anatoly
2018-03-04 14:57     ` [PATCH v5] vfio: change to use " Jianfeng Tan
2018-03-14 13:27       ` Burakov, Anatoly
2018-03-19  6:53         ` Tan, Jianfeng
2018-03-20 10:33           ` Burakov, Anatoly
2018-03-20 10:56             ` Burakov, Anatoly
2018-03-20  8:50     ` [PATCH v6] " Jianfeng Tan
2018-04-05 14:26       ` Tan, Jianfeng
2018-04-05 14:39         ` Burakov, Anatoly
2018-04-12 23:27         ` Thomas Monjalon
2018-04-12 15:26       ` Burakov, Anatoly
2018-04-15 15:06     ` [PATCH v7] " Jianfeng Tan
2018-04-15 15:10       ` Tan, Jianfeng
2018-04-17 23:04       ` Thomas Monjalon
2018-01-25  4:16 ` [PATCH v3 0/3] generic channel for multi-process communication Jianfeng Tan
2018-01-25  4:16   ` [PATCH v3 1/3] eal: add " Jianfeng Tan
2018-01-25 10:41     ` Thomas Monjalon
2018-01-25 11:27     ` Burakov, Anatoly
2018-01-25 11:34       ` Thomas Monjalon
2018-01-25 12:21     ` Ananyev, Konstantin
2018-01-25  4:16   ` [PATCH v3 2/3] eal: add synchronous " Jianfeng Tan
2018-01-25 12:00     ` Burakov, Anatoly
2018-01-25 12:19       ` Burakov, Anatoly
2018-01-25 12:19       ` Ananyev, Konstantin
2018-01-25 12:25         ` Burakov, Anatoly
2018-01-25 13:00           ` Ananyev, Konstantin
2018-01-25 13:05             ` Burakov, Anatoly
2018-01-25 13:10               ` Burakov, Anatoly
2018-01-25 15:03                 ` Ananyev, Konstantin
2018-01-25 16:22                   ` Burakov, Anatoly
2018-01-25 17:10                     ` Tan, Jianfeng
2018-01-25 18:02                       ` Burakov, Anatoly
2018-01-25 12:22     ` Ananyev, Konstantin
2018-01-25  4:16   ` [PATCH v3 3/3] vfio: use the generic multi-process channel Jianfeng Tan
2018-01-25 10:47     ` Thomas Monjalon
2018-01-25 10:52       ` Burakov, Anatoly
2018-01-25 10:57         ` Thomas Monjalon
2018-01-25 12:15           ` Burakov, Anatoly
2018-01-25 19:14 ` [PATCH v4 0/2] generic channel for multi-process communication Jianfeng Tan
2018-01-25 19:14   ` [PATCH v4 1/2] eal: add synchronous " Jianfeng Tan
2018-01-25 19:14   ` [PATCH v4 2/2] vfio: use the generic multi-process channel Jianfeng Tan
2018-01-25 19:15   ` [PATCH v4 0/2] generic channel for multi-process communication Tan, Jianfeng
2018-01-25 19:21 ` [PATCH v5 " Jianfeng Tan
2018-01-25 19:21   ` [PATCH v5 1/2] eal: add " Jianfeng Tan
2018-01-25 19:21   ` [PATCH v5 2/2] eal: add synchronous " Jianfeng Tan
2018-01-25 21:23   ` [PATCH v5 0/2] generic channel for " Thomas Monjalon
2018-01-26  3:41 ` [PATCH v6 " Jianfeng Tan
2018-01-26  3:41   ` [PATCH v6 1/2] eal: add " Jianfeng Tan
2018-01-26 10:25     ` Burakov, Anatoly
2018-01-29  6:37       ` Tan, Jianfeng
2018-01-29  9:37         ` Burakov, Anatoly
2018-01-26  3:41   ` [PATCH v6 2/2] eal: add synchronous " Jianfeng Tan
2018-01-26 10:31     ` Burakov, Anatoly
2018-01-29 23:52   ` [PATCH v6 0/2] generic channel for " Thomas Monjalon
2018-01-30  6:58 ` [PATCH v7 " Jianfeng Tan
2018-01-30  6:58   ` [PATCH v7 1/2] eal: add " Jianfeng Tan
2018-01-30  6:58   ` [PATCH v7 2/2] eal: add synchronous " Jianfeng Tan
2018-01-30 14:46   ` [PATCH v7 0/2] generic channel for " Thomas Monjalon

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.