linux-kernel.vger.kernel.org archive mirror
* [PATCH 0/8] Intel I/O Acceleration Technology (I/OAT)
@ 2006-03-03 21:40 Chris Leech
  2006-03-03 21:42 ` [PATCH 1/8] [I/OAT] DMA memcpy subsystem Chris Leech
                   ` (9 more replies)
  0 siblings, 10 replies; 60+ messages in thread
From: Chris Leech @ 2006-03-03 21:40 UTC (permalink / raw)
  To: linux-kernel, netdev

This patch series is the first full release of the Intel(R) I/O
Acceleration Technology (I/OAT) for Linux.  It includes an in-kernel API
for offloading memory copies to hardware, a driver for the I/OAT DMA memcpy
engine, and changes to the TCP stack to offload copies of received
networking data to application space.
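
As a rough illustration, a kernel client is expected to use the new API
along the following lines (a minimal sketch based on the interfaces added
in patch 1; the my_* names are hypothetical and error handling is omitted):

	#include <linux/dmaengine.h>

	static struct dma_chan *my_chan;	/* channel handed to us, if any */

	static void my_dma_event(struct dma_client *client,
			struct dma_chan *chan, enum dma_event event)
	{
		if (event == DMA_RESOURCE_ADDED)
			my_chan = chan;
		else if (event == DMA_RESOURCE_REMOVED)
			my_chan = NULL;
	}

	static int my_init(void)
	{
		struct dma_client *client;

		client = dma_async_client_register(my_dma_event);
		if (!client)
			return -ENOMEM;
		/* ask for one channel; delivery is signaled via my_dma_event */
		dma_async_client_chan_request(client, 1);
		return 0;
	}

	static void my_copy(void *dst, void *src, size_t len)
	{
		dma_cookie_t cookie;

		cookie = dma_async_memcpy_buf_to_buf(my_chan, dst, src, len);
		dma_async_memcpy_issue_pending(my_chan);
		while (dma_async_memcpy_complete(my_chan, cookie, NULL, NULL)
				== DMA_IN_PROGRESS)
			cpu_relax();
	}

A real client would normally batch up many copies and only poll for
completion when the data is actually needed, rather than spinning as in
my_copy() above; that is the model the TCP changes in this series follow.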

These changes apply to DaveM's net-2.6.17 tree as of commit
2bd84a93d8bb7192ad8c23ef41008502be1cb603 ([IRDA]: TOIM3232 dongle support)

They are available to pull from
	git://198.78.49.142/~cleech/linux-2.6 ioat-2.6.17

There are 8 patches in the series:
	1) The memcpy offload APIs and class code
	2) The Intel I/OAT DMA driver (ioatdma)
	3) Core networking code to setup networking as a DMA memcpy client
	4) Utility functions for sk_buff to iovec offloaded copy
	5) Structure changes needed for TCP receive offload
	6) Rename cleanup_rbuf to tcp_cleanup_rbuf
	7) Add a sysctl to tune the minimum offloaded I/O size for TCP
	8) The main TCP receive offload changes

--
Chris Leech <christopher.leech@intel.com>
I/O Acceleration Technology Software Development
LAN Access Division / Digital Enterprise Group 

* [PATCH 1/8] [I/OAT] DMA memcpy subsystem
  2006-03-03 21:40 [PATCH 0/8] Intel I/O Acceleration Technology (I/OAT) Chris Leech
@ 2006-03-03 21:42 ` Chris Leech
  2006-03-04  1:40   ` David S. Miller
  2006-03-04 19:20   ` Benjamin LaHaise
  2006-03-03 21:42 ` [PATCH 3/8] [I/OAT] Setup the networking subsystem as a DMA client Chris Leech
                   ` (8 subsequent siblings)
  9 siblings, 2 replies; 60+ messages in thread
From: Chris Leech @ 2006-03-03 21:42 UTC (permalink / raw)
  To: linux-kernel, netdev

Provides an API for offloading memory copies to DMA devices

Signed-off-by: Chris Leech <christopher.leech@intel.com>
---

 drivers/Kconfig           |    2 
 drivers/Makefile          |    1 
 drivers/dma/Kconfig       |   13 ++
 drivers/dma/Makefile      |    1 
 drivers/dma/dmaengine.c   |  361 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/dmaengine.h |  322 ++++++++++++++++++++++++++++++++++++++++
 6 files changed, 700 insertions(+), 0 deletions(-)
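
For reference, a DMA device driver plugs into this subsystem roughly as
follows (an illustrative sketch only; the foo_* names are hypothetical
stand-ins for real hardware code -- the actual I/OAT driver is patch 2):

	#include <linux/dmaengine.h>

	static struct dma_device foo_dma;	/* hypothetical device */
	static struct dma_chan foo_chan;	/* a single channel for brevity */

	static int foo_probe(void)
	{
		INIT_LIST_HEAD(&foo_dma.channels);

		/* export the hardware's capabilities as function pointers */
		foo_dma.device_alloc_chan_resources = foo_alloc_chan_resources;
		foo_dma.device_free_chan_resources  = foo_free_chan_resources;
		foo_dma.device_memcpy_buf_to_buf    = foo_memcpy_buf_to_buf;
		foo_dma.device_memcpy_buf_to_pg     = foo_memcpy_buf_to_pg;
		foo_dma.device_memcpy_pg_to_pg      = foo_memcpy_pg_to_pg;
		foo_dma.device_memcpy_complete      = foo_memcpy_complete;
		foo_dma.device_memcpy_issue_pending = foo_memcpy_issue_pending;

		/* channels go on the device list before registration */
		foo_chan.device = &foo_dma;
		list_add_tail(&foo_chan.device_node, &foo_dma.channels);

		return dma_async_device_register(&foo_dma);
	}

On teardown the driver calls dma_async_device_unregister(), which revokes
the channels from any client and blocks until all references are dropped.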

diff --git a/drivers/Kconfig b/drivers/Kconfig
index bddf431..ce7ffa7 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -70,4 +70,6 @@ source "drivers/sn/Kconfig"
 
 source "drivers/edac/Kconfig"
 
+source "drivers/dma/Kconfig"
+
 endmenu
diff --git a/drivers/Makefile b/drivers/Makefile
index 5c69b86..516ba5e 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -73,3 +73,4 @@ obj-$(CONFIG_SGI_SN)		+= sn/
 obj-y				+= firmware/
 obj-$(CONFIG_CRYPTO)		+= crypto/
 obj-$(CONFIG_SUPERH)		+= sh/
+obj-$(CONFIG_DMA_ENGINE)	+= dma/
diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
new file mode 100644
index 0000000..f9ac4bc
--- /dev/null
+++ b/drivers/dma/Kconfig
@@ -0,0 +1,13 @@
+#
+# DMA engine configuration
+#
+
+menu "DMA Engine support"
+
+config DMA_ENGINE
+	bool "Support for DMA engines"
+	---help---
+	  DMA engines offload copy operations from the CPU to dedicated
+	  hardware, allowing the copies to happen asynchronously.
+
+endmenu
diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile
new file mode 100644
index 0000000..10b7391
--- /dev/null
+++ b/drivers/dma/Makefile
@@ -0,0 +1 @@
+obj-y += dmaengine.o
diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c
new file mode 100644
index 0000000..77cfcb3
--- /dev/null
+++ b/drivers/dma/dmaengine.c
@@ -0,0 +1,361 @@
+/*****************************************************************************
+Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved.
+
+This program is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 2 of the License, or (at your option)
+any later version.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59
+Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+
+The full GNU General Public License is included in this distribution in the
+file called LICENSE.
+*****************************************************************************/
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/dmaengine.h>
+#include <linux/hardirq.h>
+#include <linux/spinlock.h>
+#include <linux/percpu.h>
+#include <linux/rcupdate.h>
+
+static spinlock_t dma_list_lock;
+static LIST_HEAD(dma_device_list);
+static LIST_HEAD(dma_client_list);
+
+/* --- sysfs implementation --- */
+
+static ssize_t show_memcpy_count(struct class_device *cd, char *buf)
+{
+	struct dma_chan *chan = container_of(cd, struct dma_chan, class_dev);
+	unsigned long count = 0;
+	int i;
+
+	for_each_cpu(i)
+		count += per_cpu_ptr(chan->local, i)->memcpy_count;
+
+	sprintf(buf, "%lu\n", count);
+	return strlen(buf) + 1;
+}
+
+static ssize_t show_bytes_transferred(struct class_device *cd, char *buf)
+{
+	struct dma_chan *chan = container_of(cd, struct dma_chan, class_dev);
+	unsigned long count = 0;
+	int i;
+
+	for_each_cpu(i)
+		count += per_cpu_ptr(chan->local, i)->bytes_transferred;
+
+	sprintf(buf, "%lu\n", count);
+	return strlen(buf) + 1;
+}
+
+static ssize_t show_in_use(struct class_device *cd, char *buf)
+{
+	struct dma_chan *chan = container_of(cd, struct dma_chan, class_dev);
+
+	sprintf(buf, "%d\n", (chan->client ? 1 : 0));
+	return strlen(buf) + 1;
+}
+
+static struct class_device_attribute dma_class_attrs[] = {
+	__ATTR(memcpy_count, S_IRUGO, show_memcpy_count, NULL),
+	__ATTR(bytes_transferred, S_IRUGO, show_bytes_transferred, NULL),
+	__ATTR(in_use, S_IRUGO, show_in_use, NULL),
+	__ATTR_NULL
+};
+
+static void dma_async_device_cleanup(struct kref *kref);
+
+static void dma_class_dev_release(struct class_device *cd)
+{
+	struct dma_chan *chan = container_of(cd, struct dma_chan, class_dev);
+	kref_put(&chan->device->refcount, dma_async_device_cleanup);
+}
+
+static struct class dma_devclass = {
+	.name            = "dma",
+	.class_dev_attrs = dma_class_attrs,
+	.release = dma_class_dev_release,
+};
+
+/* --- client and device registration --- */
+
+/**
+ * dma_client_chan_alloc - try to allocate a channel to a client
+ * @client: &dma_client
+ *
+ * Called with dma_list_lock held.
+ */
+static struct dma_chan * dma_client_chan_alloc(struct dma_client *client)
+{
+	struct dma_device *device;
+	struct dma_chan *chan;
+	unsigned long flags;
+
+	/* Find a channel, any DMA engine will do */
+	list_for_each_entry(device, &dma_device_list, global_node) {
+		list_for_each_entry(chan, &device->channels, device_node) {
+			if (chan->client)
+				continue;
+
+			if (chan->device->device_alloc_chan_resources(chan) >= 0) {
+				kref_get(&device->refcount);
+				kref_init(&chan->refcount);
+				chan->slow_ref = 0;
+				INIT_RCU_HEAD(&chan->rcu);
+				chan->client = client;
+				spin_lock_irqsave(&client->lock, flags);
+				list_add_tail_rcu(&chan->client_node, &client->channels);
+				spin_unlock_irqrestore(&client->lock, flags);
+				return chan;
+			}
+		}
+	}
+
+	return NULL;
+}
+
+/**
+ * dma_chan_cleanup - release a DMA channel's resources
+ * @kref: kernel reference structure that contains the DMA channel device
+ */
+void dma_chan_cleanup(struct kref *kref)
+{
+	struct dma_chan *chan = container_of(kref, struct dma_chan, refcount);
+	chan->device->device_free_chan_resources(chan);
+	chan->client = NULL;
+	kref_put(&chan->device->refcount, dma_async_device_cleanup);
+}
+
+static void dma_chan_free_rcu(struct rcu_head *rcu)
+{
+	struct dma_chan *chan = container_of(rcu, struct dma_chan, rcu);
+	int bias = 0x7FFFFFFF;
+	int i;
+	for_each_cpu(i)
+		bias -= local_read(&per_cpu_ptr(chan->local, i)->refcount);
+	atomic_sub(bias, &chan->refcount.refcount);
+	kref_put(&chan->refcount, dma_chan_cleanup);
+}
+
+static void dma_client_chan_free(struct dma_chan *chan)
+{
+	atomic_add(0x7FFFFFFF, &chan->refcount.refcount);
+	chan->slow_ref = 1;
+	call_rcu(&chan->rcu, dma_chan_free_rcu);
+}
+
+/**
+ * dma_chans_rebalance - reallocate channels to clients
+ *
+ * When the number of DMA channels in the system changes,
+ * channels need to be rebalanced among clients.
+ */
+static void dma_chans_rebalance(void)
+{
+	struct dma_client *client;
+	struct dma_chan *chan;
+	unsigned long flags;
+
+	spin_lock(&dma_list_lock);
+	list_for_each_entry(client, &dma_client_list, global_node) {
+
+		while (client->chans_desired > client->chan_count) {
+			chan = dma_client_chan_alloc(client);
+			if (!chan)
+				break;
+
+			client->chan_count++;
+			client->event_callback(client, chan, DMA_RESOURCE_ADDED);
+		}
+
+		while (client->chans_desired < client->chan_count) {
+			spin_lock_irqsave(&client->lock, flags);
+			chan = list_entry(client->channels.next, struct dma_chan, client_node);
+			list_del_rcu(&chan->client_node);
+			spin_unlock_irqrestore(&client->lock, flags);
+			client->chan_count--;
+			client->event_callback(client, chan, DMA_RESOURCE_REMOVED);
+			dma_client_chan_free(chan);
+		}
+	}
+	spin_unlock(&dma_list_lock);
+}
+
+/**
+ * dma_async_client_register - allocate and register a &dma_client
+ * @event_callback: callback for notification of channel addition/removal
+ */
+struct dma_client * dma_async_client_register(dma_event_callback event_callback)
+{
+	struct dma_client *client;
+
+	client = kzalloc(sizeof(*client), GFP_KERNEL);
+	if (!client)
+		return NULL;
+
+	INIT_LIST_HEAD(&client->channels);
+	spin_lock_init(&client->lock);
+
+	client->chans_desired = 0;
+	client->chan_count = 0;
+	client->event_callback = event_callback;
+
+	spin_lock(&dma_list_lock);
+	list_add_tail(&client->global_node, &dma_client_list);
+	spin_unlock(&dma_list_lock);
+
+	return client;
+}
+
+/**
+ * dma_async_client_unregister - unregister a client and free the &dma_client
+ * @client: &dma_client to unregister
+ *
+ * Forces the release of any allocated DMA channels and frees the &dma_client memory.
+ */
+void dma_async_client_unregister(struct dma_client *client)
+{
+	struct dma_chan *chan;
+
+	if (!client)
+		return;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(chan, &client->channels, client_node) {
+		dma_client_chan_free(chan);
+	}
+	rcu_read_unlock();
+
+	spin_lock(&dma_list_lock);
+	list_del(&client->global_node);
+	spin_unlock(&dma_list_lock);
+
+	kfree(client);
+	dma_chans_rebalance();
+}
+
+/**
+ * dma_async_client_chan_request - request DMA channels
+ * @client: &dma_client
+ * @number: count of DMA channels requested
+ *
+ * Clients call dma_async_client_chan_request() to specify how many
+ * DMA channels they need; a count of 0 releases all currently allocated.
+ * The resulting allocations/frees are indicated to the client via the
+ * event callback.
+ */
+void dma_async_client_chan_request(struct dma_client *client,
+			unsigned int number)
+{
+	client->chans_desired = number;
+	dma_chans_rebalance();
+}
+
+/**
+ * dma_async_device_register - register a DMA device and its channels
+ * @device: &dma_device
+ */
+int dma_async_device_register(struct dma_device *device)
+{
+	static int id;
+	int chancnt = 0;
+	struct dma_chan* chan;
+
+	if (!device)
+		return -ENODEV;
+
+	init_completion(&device->done);
+	kref_init(&device->refcount);
+	device->dev_id = id++;
+
+	/* represent channels in sysfs. Probably want devs too */
+	list_for_each_entry(chan, &device->channels, device_node) {
+		chan->local = alloc_percpu(typeof(*chan->local));
+		if (chan->local == NULL)
+			continue;
+
+		chan->chan_id = chancnt++;
+		chan->class_dev.class = &dma_devclass;
+		chan->class_dev.dev = NULL;
+		snprintf(chan->class_dev.class_id, BUS_ID_SIZE, "dma%dchan%d",
+		         device->dev_id, chan->chan_id);
+
+		kref_get(&device->refcount);
+		class_device_register(&chan->class_dev);
+	}
+
+	spin_lock(&dma_list_lock);
+	list_add_tail(&device->global_node, &dma_device_list);
+	spin_unlock(&dma_list_lock);
+
+	dma_chans_rebalance();
+
+	return 0;
+}
+
+static void dma_async_device_cleanup(struct kref *kref)
+{
+	struct dma_device *device = container_of(kref, struct dma_device, refcount);
+	complete(&device->done);
+}
+
+/**
+ * dma_async_device_unregister - unregister a DMA device
+ * @device: &dma_device
+ */
+void dma_async_device_unregister(struct dma_device* device)
+{
+	struct dma_chan *chan;
+	unsigned long flags;
+
+	spin_lock(&dma_list_lock);
+	list_del(&device->global_node);
+	spin_unlock(&dma_list_lock);
+
+	list_for_each_entry(chan, &device->channels, device_node) {
+		if (chan->client) {
+			spin_lock_irqsave(&chan->client->lock, flags);
+			list_del(&chan->client_node);
+			chan->client->chan_count--;
+			spin_unlock_irqrestore(&chan->client->lock, flags);
+			chan->client->event_callback(chan->client, chan, DMA_RESOURCE_REMOVED);
+			dma_client_chan_free(chan);
+		}
+		class_device_unregister(&chan->class_dev);
+	}
+
+	dma_chans_rebalance();
+
+	kref_put(&device->refcount, dma_async_device_cleanup);
+	wait_for_completion(&device->done);
+}
+
+static int __init dma_bus_init(void)
+{
+	spin_lock_init(&dma_list_lock);
+
+	return class_register(&dma_devclass);
+}
+
+subsys_initcall(dma_bus_init);
+
+EXPORT_SYMBOL(dma_async_client_register);
+EXPORT_SYMBOL(dma_async_client_unregister);
+EXPORT_SYMBOL(dma_async_client_chan_request);
+EXPORT_SYMBOL(dma_async_memcpy_buf_to_buf);
+EXPORT_SYMBOL(dma_async_memcpy_buf_to_pg);
+EXPORT_SYMBOL(dma_async_memcpy_pg_to_pg);
+EXPORT_SYMBOL(dma_async_memcpy_complete);
+EXPORT_SYMBOL(dma_async_memcpy_issue_pending);
+EXPORT_SYMBOL(dma_async_device_register);
+EXPORT_SYMBOL(dma_async_device_unregister);
diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
new file mode 100644
index 0000000..f8a77ab
--- /dev/null
+++ b/include/linux/dmaengine.h
@@ -0,0 +1,322 @@
+/*****************************************************************************
+Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved.
+
+This program is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 2 of the License, or (at your option)
+any later version.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59
+Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+
+The full GNU General Public License is included in this distribution in the
+file called LICENSE.
+*****************************************************************************/
+#ifndef DMAENGINE_H
+#define DMAENGINE_H
+
+#include <linux/device.h>
+#include <linux/uio.h>
+#include <linux/kref.h>
+#include <linux/completion.h>
+#include <linux/rcupdate.h>
+
+/**
+ * enum dma_event - resource PNP/power management events
+ * @DMA_RESOURCE_SUSPEND: DMA device going into low power state
+ * @DMA_RESOURCE_RESUME: DMA device returning to full power
+ * @DMA_RESOURCE_ADDED: DMA device added to the system
+ * @DMA_RESOURCE_REMOVED: DMA device removed from the system
+ */
+enum dma_event {
+	DMA_RESOURCE_SUSPEND,
+	DMA_RESOURCE_RESUME,
+	DMA_RESOURCE_ADDED,
+	DMA_RESOURCE_REMOVED,
+};
+
+/**
+ * typedef dma_cookie_t
+ *
+ * A dma_cookie_t value > 0 identifies a DMA request; a value < 0 is an error code
+ */
+typedef s32 dma_cookie_t;
+
+#define dma_submit_error(cookie) ((cookie) < 0 ? 1 : 0)
+
+/**
+ * enum dma_status - DMA transaction status
+ * @DMA_SUCCESS: transaction completed successfully
+ * @DMA_IN_PROGRESS: transaction not yet processed
+ * @DMA_ERROR: transaction failed
+ */
+enum dma_status {
+	DMA_SUCCESS,
+	DMA_IN_PROGRESS,
+	DMA_ERROR,
+};
+
+struct dma_chan_percpu
+{
+	local_t refcount;
+	/* stats */
+	unsigned long memcpy_count;
+	unsigned long bytes_transferred;
+};
+
+/**
+ * struct dma_chan - devices supply DMA channels, clients use them
+ * @client: ptr to the client user of this chan, will be NULL when unused
+ * @device: ptr to the dma device that supplies this channel, always !NULL
+ * @cookie: last cookie value returned to client
+ * @chan_id: channel ID for sysfs
+ * @class_dev: class device for sysfs
+ * @client_node: used to add this to the client chan list
+ * @device_node: used to add this to the device chan list
+ */
+struct dma_chan
+{
+	struct dma_client *client;
+	struct dma_device *device;
+	dma_cookie_t cookie;
+
+	/* sysfs */
+	int chan_id;
+	struct class_device class_dev;
+
+	struct kref refcount;
+	int slow_ref;
+	struct rcu_head rcu;
+
+	struct list_head client_node;
+	struct list_head device_node;
+	struct dma_chan_percpu *local;
+};
+
+void dma_chan_cleanup(struct kref *kref);
+
+static inline void dma_chan_get(struct dma_chan *chan)
+{
+	if (unlikely(chan->slow_ref))
+		kref_get(&chan->refcount);
+	else {
+		local_inc(&(per_cpu_ptr(chan->local, get_cpu())->refcount));
+		put_cpu();
+	}
+}
+
+static inline void dma_chan_put(struct dma_chan *chan)
+{
+	if (unlikely(chan->slow_ref))
+		kref_put(&chan->refcount, dma_chan_cleanup);
+	else {
+		local_dec(&(per_cpu_ptr(chan->local, get_cpu())->refcount));
+		put_cpu();
+	}
+}
+
+/*
+ * typedef dma_event_callback - function pointer to a DMA event callback
+ */
+typedef void (*dma_event_callback) (struct dma_client *client,
+		struct dma_chan *chan, enum dma_event event);
+
+/**
+ * struct dma_client - info on the entity making use of DMA services
+ * @event_callback: func ptr to call when something happens
+ * @chan_count: number of chans allocated
+ * @chans_desired: number of chans requested; may be more or less than chan_count
+ * @lock: protects access to the channels list
+ * @channels: the list of DMA channels allocated
+ * @global_node: list_head for global dma_client_list
+ */
+struct dma_client {
+	dma_event_callback	event_callback;
+	unsigned int		chan_count;
+	unsigned int		chans_desired;
+
+	spinlock_t		lock;
+	struct list_head	channels;
+	struct list_head	global_node;
+};
+
+/**
+ * struct dma_device - info on the entity supplying DMA services
+ * @chancnt: how many DMA channels are supported
+ * @channels: the list of struct dma_chan
+ * @global_node: list_head for global dma_device_list
+ * @dev_id: unique device ID
+ * Other func ptrs: used to make use of this device's capabilities
+ */
+struct dma_device {
+
+	unsigned int chancnt;
+	struct list_head channels;
+	struct list_head global_node;
+
+	struct kref refcount;
+	struct completion done;
+
+	int dev_id;
+
+	int (*device_alloc_chan_resources)(struct dma_chan *chan);
+	void (*device_free_chan_resources)(struct dma_chan *chan);
+	dma_cookie_t (*device_memcpy_buf_to_buf)(struct dma_chan *chan,
+			void *dest, void *src, size_t len);
+	dma_cookie_t (*device_memcpy_buf_to_pg)(struct dma_chan *chan,
+			struct page *page, unsigned int offset, void *kdata,
+			size_t len);
+	dma_cookie_t (*device_memcpy_pg_to_pg)(struct dma_chan *chan,
+			struct page *dest_pg, unsigned int dest_off,
+			struct page *src_pg, unsigned int src_off, size_t len);
+	enum dma_status (*device_memcpy_complete)(struct dma_chan *chan,
+			dma_cookie_t cookie, dma_cookie_t *last,
+			dma_cookie_t *used);
+	void (*device_memcpy_issue_pending)(struct dma_chan *chan);
+};
+
+/* --- public DMA engine API --- */
+
+struct dma_client *dma_async_client_register(dma_event_callback event_callback);
+void dma_async_client_unregister(struct dma_client *client);
+void dma_async_client_chan_request(struct dma_client *client,
+		unsigned int number);
+
+/**
+ * dma_async_memcpy_buf_to_buf - offloaded copy between virtual addresses
+ * @chan: DMA channel to offload copy to
+ * @dest: destination address (virtual)
+ * @src: source address (virtual)
+ * @len: length
+ *
+ * Both @dest and @src must be mappable to a bus address according to the
+ * DMA mapping API rules for streaming mappings.
+ * Both @dest and @src must stay memory resident (kernel memory or locked
+ * user space pages)
+ */
+static inline dma_cookie_t dma_async_memcpy_buf_to_buf(struct dma_chan *chan,
+	void *dest, void *src, size_t len)
+{
+	int cpu = get_cpu();
+	per_cpu_ptr(chan->local, cpu)->bytes_transferred += len;
+	per_cpu_ptr(chan->local, cpu)->memcpy_count++;
+	put_cpu();
+
+	return chan->device->device_memcpy_buf_to_buf(chan, dest, src, len);
+}
+
+/**
+ * dma_async_memcpy_buf_to_pg - offloaded copy from a buffer to a page
+ * @chan: DMA channel to offload copy to
+ * @page: destination page
+ * @offset: offset in page to copy to
+ * @kdata: source address (virtual)
+ * @len: length
+ *
+ * Both @page/@offset and @kdata must be mappable to a bus address according
+ * to the DMA mapping API rules for streaming mappings.
+ * Both @page/@offset and @kdata must stay memory resident (kernel memory or
+ * locked user space pages)
+ */
+static inline dma_cookie_t dma_async_memcpy_buf_to_pg(struct dma_chan *chan,
+	struct page *page, unsigned int offset, void *kdata, size_t len)
+{
+	int cpu = get_cpu();
+	per_cpu_ptr(chan->local, cpu)->bytes_transferred += len;
+	per_cpu_ptr(chan->local, cpu)->memcpy_count++;
+	put_cpu();
+
+	return chan->device->device_memcpy_buf_to_pg(chan, page, offset,
+	                                             kdata, len);
+}
+
+/**
+ * dma_async_memcpy_pg_to_pg - offloaded copy from a page to a page
+ * @chan: DMA channel to offload copy to
+ * @dest_page: destination page
+ * @dest_off: offset in page to copy to
+ * @src_page: source page
+ * @src_off: offset in page to copy from
+ * @len: length
+ *
+ * Both @dest_page/@dest_off and @src_page/@src_off must be mappable to a bus
+ * address according to the DMA mapping API rules for streaming mappings.
+ * Both @dest_page/@dest_off and @src_page/@src_off must stay memory resident
+ * (kernel memory or locked user space pages)
+ */
+static inline dma_cookie_t dma_async_memcpy_pg_to_pg(struct dma_chan *chan,
+	struct page *dest_pg, unsigned int dest_off, struct page *src_pg,
+	unsigned int src_off, size_t len)
+{
+	int cpu = get_cpu();
+	per_cpu_ptr(chan->local, cpu)->bytes_transferred += len;
+	per_cpu_ptr(chan->local, cpu)->memcpy_count++;
+	put_cpu();
+
+	return chan->device->device_memcpy_pg_to_pg(chan, dest_pg, dest_off,
+	                                            src_pg, src_off, len);
+}
+
+/**
+ * dma_async_memcpy_issue_pending - flush pending copies to HW
+ * @chan: target DMA channel
+ *
+ * This allows drivers to push copies to HW in batches,
+ * reducing MMIO writes where possible.
+ */
+static inline void dma_async_memcpy_issue_pending(struct dma_chan *chan)
+{
+	chan->device->device_memcpy_issue_pending(chan);
+}
+
+/**
+ * dma_async_memcpy_complete - poll for transaction completion
+ * @chan: DMA channel
+ * @cookie: transaction identifier to check status of
+ * @last: returns last completed cookie, can be NULL
+ * @used: returns last issued cookie, can be NULL
+ *
+ * If @last and @used are passed in, upon return they reflect the driver
+ * internal state and can be used with dma_async_is_complete() to check
+ * the status of multiple cookies without re-checking hardware state.
+ */
+static inline enum dma_status dma_async_memcpy_complete(struct dma_chan *chan,
+	dma_cookie_t cookie, dma_cookie_t *last, dma_cookie_t *used)
+{
+	return chan->device->device_memcpy_complete(chan, cookie, last, used);
+}
+
+/**
+ * dma_async_is_complete - test a cookie against chan state
+ * @cookie: transaction identifier to test status of
+ * @last_complete: last known completed transaction
+ * @last_used: last cookie value handed out
+ *
+ * dma_async_is_complete() is used inside dma_async_memcpy_complete();
+ * the test logic is separated out for lightweight testing of multiple cookies.
+ */
+static inline enum dma_status dma_async_is_complete(dma_cookie_t cookie,
+			dma_cookie_t last_complete, dma_cookie_t last_used)
+{
+	if (last_complete <= last_used) {
+		if ((cookie <= last_complete) || (cookie > last_used))
+			return DMA_SUCCESS;
+	} else {
+		if ((cookie <= last_complete) && (cookie > last_used))
+			return DMA_SUCCESS;
+	}
+	return DMA_IN_PROGRESS;
+}
+
+
+/* --- DMA device --- */
+
+int dma_async_device_register(struct dma_device *device);
+void dma_async_device_unregister(struct dma_device *device);
+
+#endif /* DMAENGINE_H */


* [PATCH 3/8] [I/OAT] Setup the networking subsystem as a DMA client
  2006-03-03 21:40 [PATCH 0/8] Intel I/O Acceleration Technology (I/OAT) Chris Leech
  2006-03-03 21:42 ` [PATCH 1/8] [I/OAT] DMA memcpy subsystem Chris Leech
@ 2006-03-03 21:42 ` Chris Leech
  2006-03-03 21:42 ` [PATCH 4/8] [I/OAT] Utility functions for offloading sk_buff to iovec copies Chris Leech
                   ` (7 subsequent siblings)
  9 siblings, 0 replies; 60+ messages in thread
From: Chris Leech @ 2006-03-03 21:42 UTC (permalink / raw)
  To: linux-kernel, netdev

Attempts to allocate per-CPU DMA channels

Signed-off-by: Chris Leech <christopher.leech@intel.com>
---

 drivers/dma/Kconfig       |   12 +++++
 include/linux/netdevice.h |    6 +++
 include/net/netdma.h      |   36 ++++++++++++++++
 net/core/dev.c            |  102 +++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 156 insertions(+), 0 deletions(-)
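
Once channels have been handed out and spread across CPUs by
net_dma_rebalance(), a consumer in the receive path is expected to pick up
its per-CPU channel via the get_softnet_dma() helper added here.  A minimal
sketch (the function name and the dst/src/len arguments are made up for
illustration):

	#include <linux/netdevice.h>
	#include <linux/string.h>
	#include <net/netdma.h>

	static void example_offload_copy(void *dst, void *src, size_t len)
	{
		struct dma_chan *chan = get_softnet_dma(); /* takes a reference */
		dma_cookie_t cookie;

		if (!chan) {
			memcpy(dst, src, len);	/* no channel: plain CPU copy */
			return;
		}

		cookie = dma_async_memcpy_buf_to_buf(chan, dst, src, len);
		dma_async_memcpy_issue_pending(chan);
		while (dma_async_memcpy_complete(chan, cookie, NULL, NULL)
				== DMA_IN_PROGRESS)
			cpu_relax();
		dma_chan_put(chan);	/* drop the get_softnet_dma() reference */
	}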

diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
index 0f15e76..30d021d 100644
--- a/drivers/dma/Kconfig
+++ b/drivers/dma/Kconfig
@@ -10,6 +10,18 @@ config DMA_ENGINE
 	  DMA engines offload copy operations from the CPU to dedicated
 	  hardware, allowing the copies to happen asynchronously.
 
+comment "DMA Clients"
+
+config NET_DMA
+	bool "Network: TCP receive copy offload"
+	depends on DMA_ENGINE && NET
+	default y
+	---help---
+	  This enables the use of DMA engines in the network stack to
+	  offload receive copy-to-user operations, freeing CPU cycles.
+	  Since this is the main user of the DMA engine, it should be enabled;
+	  say Y here.
+
 comment "DMA Devices"
 
 config INTEL_IOATDMA
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index b825be2..ecbde56 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -37,6 +37,9 @@
 #include <linux/config.h>
 #include <linux/device.h>
 #include <linux/percpu.h>
+#ifdef CONFIG_NET_DMA
+#include <linux/dmaengine.h>
+#endif
 
 struct divert_blk;
 struct vlan_group;
@@ -592,6 +595,9 @@ struct softnet_data
 	struct sk_buff		*completion_queue;
 
 	struct net_device	backlog_dev;	/* Sorry. 8) */
+#ifdef CONFIG_NET_DMA
+	struct dma_chan		*net_dma;
+#endif
 };
 
 DECLARE_PER_CPU(struct softnet_data,softnet_data);
diff --git a/include/net/netdma.h b/include/net/netdma.h
new file mode 100644
index 0000000..3cd9e6d
--- /dev/null
+++ b/include/net/netdma.h
@@ -0,0 +1,36 @@
+/*****************************************************************************
+Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved.
+
+This program is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 2 of the License, or (at your option)
+any later version.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59
+Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+
+The full GNU General Public License is included in this distribution in the
+file called LICENSE.
+*****************************************************************************/
+#ifndef NETDMA_H
+#define NETDMA_H
+#include <linux/dmaengine.h>
+
+static inline struct dma_chan *get_softnet_dma(void)
+{
+	struct dma_chan *chan;
+	rcu_read_lock();
+	chan = rcu_dereference(__get_cpu_var(softnet_data.net_dma));
+	if (chan)
+		dma_chan_get(chan);
+	rcu_read_unlock();
+	return chan;
+}
+
+#endif /* NETDMA_H */
diff --git a/net/core/dev.c b/net/core/dev.c
index 7ca47bf..b54e5f3 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -114,6 +114,7 @@
 #include <linux/wireless.h>		/* Note : will define WIRELESS_EXT */
 #include <net/iw_handler.h>
 #endif	/* CONFIG_NET_RADIO */
+#include <linux/dmaengine.h>
 #include <asm/current.h>
 
 /*
@@ -148,6 +149,11 @@ static DEFINE_SPINLOCK(ptype_lock);
 static struct list_head ptype_base[16];	/* 16 way hashed list */
 static struct list_head ptype_all;		/* Taps */
 
+#ifdef CONFIG_NET_DMA
+static struct dma_client *net_dma_client;
+static unsigned int net_dma_count;
+#endif
+
 /*
  * The @dev_base list is protected by @dev_base_lock and the rtln
  * semaphore.
@@ -1719,6 +1725,9 @@ static void net_rx_action(struct softirq
 	unsigned long start_time = jiffies;
 	int budget = netdev_budget;
 	void *have;
+#ifdef CONFIG_NET_DMA
+	struct dma_chan *chan;
+#endif
 
 	local_irq_disable();
 
@@ -1750,6 +1759,18 @@ static void net_rx_action(struct softirq
 		}
 	}
 out:
+#ifdef CONFIG_NET_DMA
+	/*
+	 * There may not be any more sk_buffs coming right now, so push
+	 * any pending DMA copies to hardware
+	 */
+	if (net_dma_client) {
+		rcu_read_lock();
+		list_for_each_entry_rcu(chan, &net_dma_client->channels, client_node)
+			dma_async_memcpy_issue_pending(chan);
+		rcu_read_unlock();
+	}
+#endif
 	local_irq_enable();
 	return;
 
@@ -3205,6 +3226,85 @@ static int dev_cpu_callback(struct notif
 }
 #endif /* CONFIG_HOTPLUG_CPU */
 
+#ifdef CONFIG_NET_DMA
+/**
+ * net_dma_rebalance - redistribute DMA channels among online CPUs
+ *
+ * This is called when the number of channels allocated to the net_dma_client
+ * changes.  The net_dma_client tries to have one DMA channel per CPU.
+ */
+static void net_dma_rebalance(void)
+{
+	unsigned int cpu, i, n;
+	struct dma_chan *chan;
+
+	lock_cpu_hotplug();
+
+	if (net_dma_count == 0) {
+		for_each_online_cpu(cpu)
+			rcu_assign_pointer(per_cpu(softnet_data.net_dma, cpu), NULL);
+		unlock_cpu_hotplug();
+		return;
+	}
+
+	i = 0;
+	cpu = first_cpu(cpu_online_map);
+
+	rcu_read_lock();
+	list_for_each_entry(chan, &net_dma_client->channels, client_node) {
+		n = ((num_online_cpus() / net_dma_count)
+		   + (i < (num_online_cpus() % net_dma_count) ? 1 : 0));
+
+		while(n) {
+			per_cpu(softnet_data.net_dma, cpu) = chan;
+			cpu = next_cpu(cpu, cpu_online_map);
+			n--;
+		}
+		i++;
+	}
+	rcu_read_unlock();
+
+	unlock_cpu_hotplug();
+}
+
+/**
+ * netdev_dma_event - event callback for the net_dma_client
+ * @client: should always be net_dma_client
+ * @chan: DMA channel that was added or removed
+ * @event: the type of resource event
+ */
+static void netdev_dma_event(struct dma_client *client, struct dma_chan *chan,
+	enum dma_event event)
+{
+	switch (event) {
+	case DMA_RESOURCE_ADDED:
+		net_dma_count++;
+		net_dma_rebalance();
+		break;
+	case DMA_RESOURCE_REMOVED:
+		net_dma_count--;
+		net_dma_rebalance();
+		break;
+	default:
+		break;
+	}
+}
+
+/**
+ * netdev_dma_register - register the networking subsystem as a DMA client
+ */
+static int __init netdev_dma_register(void)
+{
+	net_dma_client = dma_async_client_register(netdev_dma_event);
+	if (net_dma_client == NULL)
+		return -ENOMEM;
+
+	dma_async_client_chan_request(net_dma_client, num_online_cpus());
+	return 0;
+}
+
+#else
+static int __init netdev_dma_register(void) { return -ENODEV; }
+#endif /* CONFIG_NET_DMA */
 
 /*
  *	Initialize the DEV module. At boot time this walks the device list and
@@ -3258,6 +3358,8 @@ static int __init net_dev_init(void)
 		atomic_set(&queue->backlog_dev.refcnt, 1);
 	}
 
+	netdev_dma_register();
+
 	dev_boot_phase = 0;
 
 	open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL);


* [PATCH 4/8] [I/OAT] Utility functions for offloading sk_buff to iovec copies
  2006-03-03 21:40 [PATCH 0/8] Intel I/O Acceleration Technology (I/OAT) Chris Leech
  2006-03-03 21:42 ` [PATCH 1/8] [I/OAT] DMA memcpy subsystem Chris Leech
  2006-03-03 21:42 ` [PATCH 3/8] [I/OAT] Setup the networking subsystem as a DMA client Chris Leech
@ 2006-03-03 21:42 ` Chris Leech
  2006-03-05  7:15   ` Andrew Morton
  2006-03-03 21:42 ` [PATCH 5/8] [I/OAT] Structure changes for TCP recv offload to I/OAT Chris Leech
                   ` (6 subsequent siblings)
  9 siblings, 1 reply; 60+ messages in thread
From: Chris Leech @ 2006-03-03 21:42 UTC (permalink / raw)
  To: linux-kernel, netdev

Provides for pinning user space pages in memory, copying to iovecs,
and copying from sk_buffs including fragmented and chained sk_buffs.

Signed-off-by: Chris Leech <christopher.leech@intel.com>
---

 drivers/dma/Makefile      |    1 
 drivers/dma/iovlock.c     |  320 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/dmaengine.h |   24 +++
 include/net/netdma.h      |    5 +
 net/core/Makefile         |    3 
 net/core/user_dma.c       |  133 +++++++++++++++++++
 6 files changed, 485 insertions(+), 1 deletions(-)
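
Taken together, the helpers in this patch are meant to be used in a
sequence like the following (a hedged sketch; the example_ function name is
made up and error handling is abbreviated):

	#include <net/netdma.h>

	static int example_recv_copy(struct dma_chan *chan, struct sk_buff *skb,
				     struct msghdr *msg, size_t len)
	{
		struct dma_locked_list *locked_list;
		dma_cookie_t cookie;
		int err;

		/* pin the destination user pages up front */
		err = dma_lock_iovec_pages(msg->msg_iov, len, &locked_list);
		if (err)
			return err;

		/* queue up the skb -> iovec copies on the DMA channel */
		cookie = dma_skb_copy_datagram_iovec(chan, skb, 0, msg->msg_iov,
						     len, locked_list);
		dma_async_memcpy_issue_pending(chan);

		/* wait for the last copy to finish before unpinning */
		while (dma_async_memcpy_complete(chan, cookie, NULL, NULL)
				== DMA_IN_PROGRESS)
			cpu_relax();

		dma_unlock_iovec_pages(locked_list);
		return cookie < 0 ? -EFAULT : 0;
	}

In the TCP patches later in the series this sequence is split up: the pages
are pinned in tcp_recvmsg(), the copies are issued as packets arrive, and
completion is only waited for before returning to user space.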

diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile
index c8a5f56..ea2f110 100644
--- a/drivers/dma/Makefile
+++ b/drivers/dma/Makefile
@@ -1,2 +1,3 @@
 obj-y += dmaengine.o
+obj-$(CONFIG_NET_DMA) += iovlock.o
 obj-$(CONFIG_INTEL_IOATDMA) += ioatdma.o
diff --git a/drivers/dma/iovlock.c b/drivers/dma/iovlock.c
new file mode 100644
index 0000000..edbf581
--- /dev/null
+++ b/drivers/dma/iovlock.c
@@ -0,0 +1,320 @@
+/*****************************************************************************
+Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved.
+Portions based on net/core/datagram.c and copyrighted by their authors.
+
+This program is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 2 of the License, or (at your option)
+any later version.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59
+Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+
+The full GNU General Public License is included in this distribution in the
+file called LICENSE.
+*****************************************************************************/
+
+/*
+ * This code allows the net stack to make use of a DMA engine for
+ * skb to iovec copies.
+ */
+
+#include <linux/dmaengine.h>
+#include <linux/pagemap.h>
+#include <net/tcp.h> /* for memcpy_toiovec */
+#include <asm/io.h>
+#include <asm/uaccess.h>
+
+#ifdef CONFIG_DMA_ENGINE
+
+#define NUM_PAGES_SPANNED(start, length) \
+	((PAGE_ALIGN((unsigned long)start + length) - \
+	((unsigned long)start & PAGE_MASK)) >> PAGE_SHIFT)
+
+/*
+ * Lock down all the iovec pages needed for len bytes.
+ * Return a struct dma_locked_list to keep track of pages locked down.
+ *
+ * We are allocating a single chunk of memory, and then carving it up into
+ * 3 sections, the latter 2 sized according to the number of iovecs and the
+ * total number of pages, respectively.
+ */
+int dma_lock_iovec_pages(struct iovec *iov, size_t len, struct dma_locked_list
+	**locked_list)
+{
+	struct dma_locked_list *local_list;
+	struct page **pages;
+	int i;
+	int ret;
+
+	int nr_iovecs = 0;
+	int iovec_len_used = 0;
+	int iovec_pages_used = 0;
+
+	/* don't lock down non-user-based iovecs */
+	if (segment_eq(get_fs(), KERNEL_DS)) {
+		*locked_list = NULL;
+		return 0;
+	}
+
+	/* determine how many iovecs/pages there are, up front */
+	do {
+		iovec_len_used += iov[nr_iovecs].iov_len;
+		iovec_pages_used += NUM_PAGES_SPANNED(iov[nr_iovecs].iov_base,
+		                                      iov[nr_iovecs].iov_len);
+		nr_iovecs++;
+	} while (iovec_len_used < len);
+
+	/* single kmalloc for locked list, page_list[], and the page arrays */
+	local_list = kmalloc(sizeof(*local_list)
+		+ (nr_iovecs * sizeof (struct dma_page_list))
+		+ (iovec_pages_used * sizeof (struct page*)), GFP_KERNEL);
+	if (!local_list)
+		return -ENOMEM;
+
+	/* list of pages starts right after the page list array */
+	pages = (struct page **) &local_list->page_list[nr_iovecs];
+
+	/* it's a userspace pointer */
+	might_sleep();
+
+	for (i = 0; i < nr_iovecs; i++) {
+		struct dma_page_list *page_list = &local_list->page_list[i];
+
+		len -= iov[i].iov_len;
+
+		if (!access_ok(VERIFY_WRITE, iov[i].iov_base, iov[i].iov_len)) {
+			dma_unlock_iovec_pages(local_list);
+			return -EFAULT;
+		}
+
+		page_list->nr_pages = NUM_PAGES_SPANNED(iov[i].iov_base,
+		                                        iov[i].iov_len);
+		page_list->base_address = iov[i].iov_base;
+
+		page_list->pages = pages;
+		pages += page_list->nr_pages;
+
+		/* lock pages down */
+		down_read(&current->mm->mmap_sem);
+		ret = get_user_pages(
+			current,
+			current->mm,
+			(unsigned long) iov[i].iov_base,
+			page_list->nr_pages,
+			1,
+			0,
+			page_list->pages,
+			NULL);
+		up_read(&current->mm->mmap_sem);
+
+		if (ret != page_list->nr_pages) {
+			goto mem_error;
+		}
+
+		local_list->nr_iovecs = i + 1;
+	}
+
+	*locked_list = local_list;
+	return 0;
+
+mem_error:
+	dma_unlock_iovec_pages(local_list);
+	return -ENOMEM;
+}
+
+void dma_unlock_iovec_pages(struct dma_locked_list *locked_list)
+{
+	int i, j;
+
+	if (!locked_list)
+		return;
+
+	for (i = 0; i < locked_list->nr_iovecs; i++) {
+		struct dma_page_list *page_list = &locked_list->page_list[i];
+		for (j = 0; j < page_list->nr_pages; j++) {
+			SetPageDirty(page_list->pages[j]);
+			page_cache_release(page_list->pages[j]);
+		}
+	}
+
+	kfree(locked_list);
+}
+
+static dma_cookie_t dma_memcpy_tokerneliovec(struct dma_chan *chan, struct
+	iovec *iov, unsigned char *kdata, size_t len)
+{
+	dma_cookie_t dma_cookie = 0;
+
+	while (len > 0) {
+		if (iov->iov_len) {
+			int copy = min_t(unsigned int, iov->iov_len, len);
+			dma_cookie = dma_async_memcpy_buf_to_buf(
+					chan,
+					iov->iov_base,
+					kdata,
+					copy);
+			kdata += copy;
+			len -= copy;
+			iov->iov_len -= copy;
+			iov->iov_base += copy;
+		}
+		iov++;
+	}
+
+	return dma_cookie;
+}
+
+/*
+ * We have already locked down the pages we will be using in the iovecs.
+ * Each entry in the iov array has a corresponding entry in locked_list->page_list;
+ * array indexing keeps iov[] and page_list[] in sync.
+ * The initial iov entries will have iov_len == 0 if they were already filled
+ * by an earlier call.
+ * The remaining iov array length is guaranteed to be larger than len.
+ */
+dma_cookie_t dma_memcpy_toiovec(struct dma_chan *chan, struct iovec *iov,
+	struct dma_locked_list *locked_list, unsigned char *kdata, size_t len)
+{
+	int iov_byte_offset;
+	int copy;
+	dma_cookie_t dma_cookie = 0;
+	int iovec_idx;
+	int page_idx;
+
+	if (!chan)
+		return memcpy_toiovec(iov, kdata, len);
+
+	/* -> kernel copies (e.g. smbfs) */
+	if (!locked_list)
+		return dma_memcpy_tokerneliovec(chan, iov, kdata, len);
+
+	iovec_idx = 0;
+	while (iovec_idx < locked_list->nr_iovecs) {
+		struct dma_page_list *page_list;
+
+		/* skip already used-up iovecs */
+		while (!iov[iovec_idx].iov_len)
+			iovec_idx++;
+
+		page_list = &locked_list->page_list[iovec_idx];
+
+		iov_byte_offset = ((unsigned long)iov[iovec_idx].iov_base & ~PAGE_MASK);
+		page_idx = (((unsigned long)iov[iovec_idx].iov_base & PAGE_MASK)
+			 - ((unsigned long)page_list->base_address & PAGE_MASK)) >> PAGE_SHIFT;
+
+		/* break up copies to not cross page boundary */
+		while (iov[iovec_idx].iov_len) {
+			copy = min_t(int, PAGE_SIZE - iov_byte_offset, len);
+			copy = min_t(int, copy, iov[iovec_idx].iov_len);
+
+			dma_cookie = dma_async_memcpy_buf_to_pg(chan,
+					page_list->pages[page_idx],
+					iov_byte_offset,
+					kdata,
+					copy);
+
+			len -= copy;
+			iov[iovec_idx].iov_len -= copy;
+			iov[iovec_idx].iov_base += copy;
+
+			if (!len)
+				return dma_cookie;
+
+			kdata += copy;
+			iov_byte_offset = 0;
+			page_idx++;
+		}
+		iovec_idx++;
+	}
+
+	/* really bad if we ever run out of iovecs */
+	BUG();
+	return -EFAULT;
+}
+
+dma_cookie_t dma_memcpy_pg_toiovec(struct dma_chan *chan, struct iovec *iov,
+	struct dma_locked_list *locked_list, struct page *page,
+	unsigned int offset, size_t len)
+{
+	int iov_byte_offset;
+	int copy;
+	dma_cookie_t dma_cookie = 0;
+	int iovec_idx;
+	int page_idx;
+	int err;
+
+	/* this needs as-yet-unimplemented buf-to-buff, so punt. */
+	/* TODO: use dma for this */
+	if (!chan || !locked_list) {
+		u8 *vaddr = kmap(page);
+		err = memcpy_toiovec(iov, vaddr + offset, len);
+		kunmap(page);
+		return err;
+	}
+
+	iovec_idx = 0;
+	while (iovec_idx < locked_list->nr_iovecs) {
+		struct dma_page_list *page_list;
+
+		/* skip already used-up iovecs */
+		while (!iov[iovec_idx].iov_len)
+			iovec_idx++;
+
+		page_list = &locked_list->page_list[iovec_idx];
+
+		iov_byte_offset = ((unsigned long)iov[iovec_idx].iov_base & ~PAGE_MASK);
+		page_idx = (((unsigned long)iov[iovec_idx].iov_base & PAGE_MASK)
+			 - ((unsigned long)page_list->base_address & PAGE_MASK)) >> PAGE_SHIFT;
+
+		/* break up copies to not cross page boundary */
+		while (iov[iovec_idx].iov_len) {
+			copy = min_t(int, PAGE_SIZE - iov_byte_offset, len);
+			copy = min_t(int, copy, iov[iovec_idx].iov_len);
+
+			dma_cookie = dma_async_memcpy_pg_to_pg(chan,
+					page_list->pages[page_idx],
+					iov_byte_offset,
+					page,
+					offset,
+					copy);
+
+			len -= copy;
+			iov[iovec_idx].iov_len -= copy;
+			iov[iovec_idx].iov_base += copy;
+
+			if (!len)
+				return dma_cookie;
+
+			offset += copy;
+			iov_byte_offset = 0;
+			page_idx++;
+		}
+		iovec_idx++;
+	}
+
+	/* really bad if we ever run out of iovecs */
+	BUG();
+	return -EFAULT;
+}
+
+#else
+
+int dma_lock_iovec_pages(struct iovec *iov, size_t len, struct dma_locked_list
+	**locked_list)
+{
+	*locked_list = NULL;
+
+	return 0;
+}
+
+void dma_unlock_iovec_pages(struct dma_locked_list* locked_list)
+{ }
+
+#endif
diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
index f8a77ab..e198712 100644
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@ -319,4 +319,28 @@ static inline enum dma_status dma_async_
 int dma_async_device_register(struct dma_device *device);
 void dma_async_device_unregister(struct dma_device *device);
 
+/* --- Helper iov-locking functions --- */
+
+struct dma_page_list
+{
+	char *base_address;
+	int nr_pages;
+	struct page **pages;
+};
+
+struct dma_locked_list
+{
+	int nr_iovecs;
+	struct dma_page_list page_list[0];
+};
+
+int dma_lock_iovec_pages(struct iovec *iov, size_t len,
+	struct dma_locked_list	**locked_list);
+void dma_unlock_iovec_pages(struct dma_locked_list* locked_list);
+dma_cookie_t dma_memcpy_toiovec(struct dma_chan *chan, struct iovec *iov,
+	struct dma_locked_list *locked_list, unsigned char *kdata, size_t len);
+dma_cookie_t dma_memcpy_pg_toiovec(struct dma_chan *chan, struct iovec *iov,
+	struct dma_locked_list *locked_list, struct page *page,
+	unsigned int offset, size_t len);
+
 #endif /* DMAENGINE_H */
diff --git a/include/net/netdma.h b/include/net/netdma.h
index 3cd9e6d..415d74c 100644
--- a/include/net/netdma.h
+++ b/include/net/netdma.h
@@ -21,6 +21,7 @@ file called LICENSE.
 #ifndef NETDMA_H
 #define NETDMA_H
 #include <linux/dmaengine.h>
+#include <linux/skbuff.h>
 
 static inline struct dma_chan *get_softnet_dma(void)
 {
@@ -33,4 +34,8 @@ static inline struct dma_chan *get_softn
 	return chan;
 }
 
+int dma_skb_copy_datagram_iovec(struct dma_chan* chan,
+		struct sk_buff *skb, int offset, struct iovec *to,
+		size_t len, struct dma_locked_list *locked_list);
+
 #endif /* NETDMA_H */
diff --git a/net/core/Makefile b/net/core/Makefile
index 630da0f..d02132b 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -8,7 +8,8 @@ obj-y := sock.o request_sock.o skbuff.o 
 obj-$(CONFIG_SYSCTL) += sysctl_net_core.o
 
 obj-y		     += dev.o ethtool.o dev_mcast.o dst.o \
-			neighbour.o rtnetlink.o utils.o link_watch.o filter.o
+			neighbour.o rtnetlink.o utils.o link_watch.o filter.o \
+			user_dma.o
 
 obj-$(CONFIG_XFRM) += flow.o
 obj-$(CONFIG_SYSFS) += net-sysfs.o
diff --git a/net/core/user_dma.c b/net/core/user_dma.c
new file mode 100644
index 0000000..1e1aae5
--- /dev/null
+++ b/net/core/user_dma.c
@@ -0,0 +1,133 @@
+/*****************************************************************************
+Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved.
+Portions based on net/core/datagram.c and copyrighted by their authors.
+
+This program is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 2 of the License, or (at your option)
+any later version.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59
+Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+
+The full GNU General Public License is included in this distribution in the
+file called LICENSE.
+*****************************************************************************/
+
+/*
+ * This code allows the net stack to make use of a DMA engine for
+ * skb to iovec copies.
+ */
+
+#include <linux/dmaengine.h>
+#include <linux/socket.h>
+#include <linux/rtnetlink.h> /* for BUG_TRAP */
+#include <net/tcp.h>
+
+
+#ifdef CONFIG_NET_DMA
+
+/**
+ *	dma_skb_copy_datagram_iovec - Copy a datagram to an iovec.
+ *	@skb - buffer to copy
+ *	@offset - offset in the buffer to start copying from
+ *	@iovec - io vector to copy to
+ *	@len - amount of data to copy from buffer to iovec
+ *	@locked_list - locked iovec buffer data
+ *
+ *	Note: the iovec is modified during the copy.
+ */
+int dma_skb_copy_datagram_iovec(struct dma_chan *chan,
+			struct sk_buff *skb, int offset, struct iovec *to,
+			size_t len, struct dma_locked_list *locked_list)
+{
+	int start = skb_headlen(skb);
+	int i, copy = start - offset;
+	dma_cookie_t cookie = 0;
+
+	/* Copy header. */
+	if (copy > 0) {
+		if (copy > len)
+			copy = len;
+		if ((cookie = dma_memcpy_toiovec(chan, to, locked_list,
+		     skb->data + offset, copy)) < 0)
+			goto fault;
+		if ((len -= copy) == 0)
+			goto end;
+		offset += copy;
+	}
+
+	/* Copy paged appendix. Hmm... why does this look so complicated? */
+	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+		int end;
+
+		BUG_TRAP(start <= offset + len);
+
+		end = start + skb_shinfo(skb)->frags[i].size;
+		if ((copy = end - offset) > 0) {
+			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+			struct page *page = frag->page;
+
+			if (copy > len)
+				copy = len;
+
+			cookie = dma_memcpy_pg_toiovec(chan, to, locked_list, page,
+					frag->page_offset + offset - start, copy);
+			if (cookie < 0)
+				goto fault;
+			if (!(len -= copy))
+				goto end;
+			offset += copy;
+		}
+		start = end;
+	}
+
+	if (skb_shinfo(skb)->frag_list) {
+		struct sk_buff *list = skb_shinfo(skb)->frag_list;
+
+		for (; list; list = list->next) {
+			int end;
+
+			BUG_TRAP(start <= offset + len);
+
+			end = start + list->len;
+			if ((copy = end - offset) > 0) {
+				if (copy > len)
+					copy = len;
+				if ((cookie = dma_skb_copy_datagram_iovec(chan, list,
+					        offset - start, to, copy, locked_list)) < 0)
+					goto fault;
+				if ((len -= copy) == 0)
+					goto end;
+				offset += copy;
+			}
+			start = end;
+		}
+	}
+
+end:
+	if (!len) {
+		skb->dma_cookie = cookie;
+		return cookie;
+	}
+
+fault:
+ 	return -EFAULT;
+}
+
+#else
+
+int dma_skb_copy_datagram_iovec(struct dma_chan *chan,
+			struct sk_buff *skb, int offset, struct iovec *to,
+			size_t len, struct dma_locked_list *locked_list)
+{
+	return skb_copy_datagram_iovec(skb, offset, to, len);
+}
+
+#endif


* [PATCH 5/8] [I/OAT] Structure changes for TCP recv offload to I/OAT
  2006-03-03 21:40 [PATCH 0/8] Intel I/O Acceleration Technology (I/OAT) Chris Leech
                   ` (2 preceding siblings ...)
  2006-03-03 21:42 ` [PATCH 4/8] [I/OAT] Utility functions for offloading sk_buff to iovec copies Chris Leech
@ 2006-03-03 21:42 ` Chris Leech
  2006-03-05  7:19   ` Andrew Morton
  2006-03-03 21:42 ` [PATCH 6/8] [I/OAT] Rename cleanup_rbuf to tcp_cleanup_rbuf and make non-static Chris Leech
                   ` (5 subsequent siblings)
  9 siblings, 1 reply; 60+ messages in thread
From: Chris Leech @ 2006-03-03 21:42 UTC (permalink / raw)
  To: linux-kernel, netdev

Adds an async_wait_queue and some additional fields to tcp_sock, and a 
dma_cookie_t to sk_buff.

Signed-off-by: Chris Leech <christopher.leech@intel.com>
---

 include/linux/skbuff.h |    6 ++++++
 include/linux/tcp.h    |   10 ++++++++++
 include/net/sock.h     |    2 ++
 include/net/tcp.h      |    9 +++++++++
 net/core/sock.c        |    6 ++++++
 5 files changed, 33 insertions(+), 0 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 75c9631..572b7ae 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -29,6 +29,9 @@
 #include <linux/net.h>
 #include <linux/textsearch.h>
 #include <net/checksum.h>
+#ifdef CONFIG_NET_DMA
+#include <linux/dmaengine.h>
+#endif
 
 #define HAVE_ALLOC_SKB		/* For the drivers to know */
 #define HAVE_ALIGNABLE_SKB	/* Ditto 8)		   */
@@ -285,6 +288,9 @@ struct sk_buff {
 	__u16			tc_verd;	/* traffic control verdict */
 #endif
 #endif
+#ifdef CONFIG_NET_DMA
+	dma_cookie_t		dma_cookie;
+#endif
 
 
 	/* These elements must be at the end, see alloc_skb() for details.  */
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 542d395..6d7dc19 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -18,6 +18,9 @@
 #define _LINUX_TCP_H
 
 #include <linux/types.h>
+#ifdef CONFIG_NET_DMA
+#include <linux/dmaengine.h>
+#endif
 #include <asm/byteorder.h>
 
 struct tcphdr {
@@ -233,6 +236,13 @@ struct tcp_sock {
 		struct iovec		*iov;
 		int			memory;
 		int			len;
+#ifdef CONFIG_NET_DMA
+		/* members for async copy */
+		struct dma_chan		*dma_chan;
+		int			wakeup;
+		struct dma_locked_list	*locked_list;
+		dma_cookie_t		dma_cookie;
+#endif
 	} ucopy;
 
 	__u32	snd_wl1;	/* Sequence for window update		*/
diff --git a/include/net/sock.h b/include/net/sock.h
index 3075803..5d1b895 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -132,6 +132,7 @@ struct sock_common {
   *	@sk_receive_queue: incoming packets
   *	@sk_wmem_alloc: transmit queue bytes committed
   *	@sk_write_queue: Packet sending queue
+  *	@sk_async_wait_queue: DMA copied packets
   *	@sk_omem_alloc: "o" is "option" or "other"
   *	@sk_wmem_queued: persistent queue size
   *	@sk_forward_alloc: space allocated forward
@@ -205,6 +206,7 @@ struct sock {
 	atomic_t		sk_omem_alloc;
 	struct sk_buff_head	sk_receive_queue;
 	struct sk_buff_head	sk_write_queue;
+	struct sk_buff_head	sk_async_wait_queue;
 	int			sk_wmem_queued;
 	int			sk_forward_alloc;
 	gfp_t			sk_allocation;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 16879fa..fd7c3e4 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -28,6 +28,9 @@
 #include <linux/cache.h>
 #include <linux/percpu.h>
 #include <linux/skbuff.h>
+#ifdef CONFIG_NET_DMA
+#include <linux/dmaengine.h>
+#endif
 
 #include <net/inet_connection_sock.h>
 #include <net/inet_timewait_sock.h>
@@ -813,6 +816,12 @@ static inline void tcp_prequeue_init(str
 	tp->ucopy.len = 0;
 	tp->ucopy.memory = 0;
 	skb_queue_head_init(&tp->ucopy.prequeue);
+#ifdef CONFIG_NET_DMA
+	tp->ucopy.dma_chan = NULL;
+	tp->ucopy.wakeup = 0;
+	tp->ucopy.locked_list = NULL;
+	tp->ucopy.dma_cookie = 0;
+#endif
 }
 
 /* Packet is added to VJ-style prequeue for processing in process
diff --git a/net/core/sock.c b/net/core/sock.c
index 6e00811..90275ec 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -724,6 +724,9 @@ struct sock *sk_clone(const struct sock 
 		atomic_set(&newsk->sk_omem_alloc, 0);
 		skb_queue_head_init(&newsk->sk_receive_queue);
 		skb_queue_head_init(&newsk->sk_write_queue);
+#ifdef CONFIG_NET_DMA
+		skb_queue_head_init(&newsk->sk_async_wait_queue);
+#endif
 
 		rwlock_init(&newsk->sk_dst_lock);
 		rwlock_init(&newsk->sk_callback_lock);
@@ -1275,6 +1278,9 @@ void sock_init_data(struct socket *sock,
 	skb_queue_head_init(&sk->sk_receive_queue);
 	skb_queue_head_init(&sk->sk_write_queue);
 	skb_queue_head_init(&sk->sk_error_queue);
+#ifdef CONFIG_NET_DMA
+	skb_queue_head_init(&sk->sk_async_wait_queue);
+#endif
 
 	sk->sk_send_head	=	NULL;
 


* [PATCH 6/8] [I/OAT] Rename cleanup_rbuf to tcp_cleanup_rbuf and make non-static
  2006-03-03 21:40 [PATCH 0/8] Intel I/O Acceleration Technology (I/OAT) Chris Leech
                   ` (3 preceding siblings ...)
  2006-03-03 21:42 ` [PATCH 5/8] [I/OAT] Structure changes for TCP recv offload to I/OAT Chris Leech
@ 2006-03-03 21:42 ` Chris Leech
  2006-03-03 21:42 ` [PATCH 7/8] [I/OAT] Add a sysctl for tuning the I/OAT offloaded I/O threshold Chris Leech
                   ` (4 subsequent siblings)
  9 siblings, 0 replies; 60+ messages in thread
From: Chris Leech @ 2006-03-03 21:42 UTC (permalink / raw)
  To: linux-kernel, netdev

Needed to be able to call tcp_cleanup_rbuf in tcp_input.c for I/OAT

Signed-off-by: Chris Leech <christopher.leech@intel.com>
---

 include/net/tcp.h |    2 ++
 net/ipv4/tcp.c    |   10 +++++-----
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index fd7c3e4..2fc7f05 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -295,6 +295,8 @@ extern int			tcp_rcv_established(struct 
 
 extern void			tcp_rcv_space_adjust(struct sock *sk);
 
+extern void			tcp_cleanup_rbuf(struct sock *sk, int copied);
+
 extern int			tcp_twsk_unique(struct sock *sk,
 						struct sock *sktw, void *twp);
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 00aa80e..13abfa2 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -936,7 +936,7 @@ static int tcp_recv_urg(struct sock *sk,
  * calculation of whether or not we must ACK for the sake of
  * a window update.
  */
-static void cleanup_rbuf(struct sock *sk, int copied)
+void tcp_cleanup_rbuf(struct sock *sk, int copied)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	int time_to_ack = 0;
@@ -1085,7 +1085,7 @@ int tcp_read_sock(struct sock *sk, read_
 
 	/* Clean up data we have read: This will do ACK frames. */
 	if (copied)
-		cleanup_rbuf(sk, copied);
+		tcp_cleanup_rbuf(sk, copied);
 	return copied;
 }
 
@@ -1219,7 +1219,7 @@ int tcp_recvmsg(struct kiocb *iocb, stru
 			}
 		}
 
-		cleanup_rbuf(sk, copied);
+		tcp_cleanup_rbuf(sk, copied);
 
 		if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
 			/* Install new reader */
@@ -1390,7 +1390,7 @@ skip_copy:
 	 */
 
 	/* Clean up data we have read: This will do ACK frames. */
-	cleanup_rbuf(sk, copied);
+	tcp_cleanup_rbuf(sk, copied);
 
 	TCP_CHECK_TIMER(sk);
 	release_sock(sk);
@@ -1856,7 +1856,7 @@ int tcp_setsockopt(struct sock *sk, int 
 			    (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
 			    inet_csk_ack_scheduled(sk)) {
 				icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
-				cleanup_rbuf(sk, 1);
+				tcp_cleanup_rbuf(sk, 1);
 				if (!(val & 1))
 					icsk->icsk_ack.pingpong = 1;
 			}


* [PATCH 7/8] [I/OAT] Add a sysctl for tuning the I/OAT offloaded I/O threshold
  2006-03-03 21:40 [PATCH 0/8] Intel I/O Acceleration Technology (I/OAT) Chris Leech
                   ` (4 preceding siblings ...)
  2006-03-03 21:42 ` [PATCH 6/8] [I/OAT] Rename cleanup_rbuf to tcp_cleanup_rbuf and make non-static Chris Leech
@ 2006-03-03 21:42 ` Chris Leech
  2006-03-04 11:22   ` Alexey Dobriyan
  2006-03-05  7:21   ` Andrew Morton
  2006-03-03 21:42 ` [PATCH 8/8] [I/OAT] TCP recv offload to I/OAT Chris Leech
                   ` (3 subsequent siblings)
  9 siblings, 2 replies; 60+ messages in thread
From: Chris Leech @ 2006-03-03 21:42 UTC (permalink / raw)
  To: linux-kernel, netdev

Any socket recv of less than this amount will not be offloaded

Signed-off-by: Chris Leech <christopher.leech@intel.com>
---

 include/linux/sysctl.h     |    1 +
 include/net/tcp.h          |    1 +
 net/core/user_dma.c        |    4 ++++
 net/ipv4/sysctl_net_ipv4.c |   10 ++++++++++
 4 files changed, 16 insertions(+), 0 deletions(-)

diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index dfcf449..f532f1e 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -402,6 +402,7 @@ enum
 	NET_IPV4_IPFRAG_MAX_DIST=112,
  	NET_TCP_MTU_PROBING=113,
 	NET_TCP_BASE_MSS=114,
+	NET_TCP_DMA_COPYBREAK=115,
 };
 
 enum {
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 2fc7f05..0740f32 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -221,6 +221,7 @@ extern int sysctl_tcp_adv_win_scale;
 extern int sysctl_tcp_tw_reuse;
 extern int sysctl_tcp_frto;
 extern int sysctl_tcp_low_latency;
+extern int sysctl_tcp_dma_copybreak;
 extern int sysctl_tcp_nometrics_save;
 extern int sysctl_tcp_moderate_rcvbuf;
 extern int sysctl_tcp_tso_win_divisor;
diff --git a/net/core/user_dma.c b/net/core/user_dma.c
index 1e1aae5..dd259f0 100644
--- a/net/core/user_dma.c
+++ b/net/core/user_dma.c
@@ -33,6 +33,10 @@ file called LICENSE.
 
 #ifdef CONFIG_NET_DMA
 
+#define NET_DMA_DEFAULT_COPYBREAK 1024
+
+int sysctl_tcp_dma_copybreak = NET_DMA_DEFAULT_COPYBREAK;
+
 /**
  *	dma_skb_copy_datagram_iovec - Copy a datagram to an iovec.
  *	@skb - buffer to copy
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index ebf2e0b..f7bd9c2 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -680,6 +680,16 @@ ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
+#ifdef CONFIG_NET_DMA
+	{
+		.ctl_name	= NET_TCP_DMA_COPYBREAK,
+		.procname	= "tcp_dma_copybreak",
+		.data		= &sysctl_tcp_dma_copybreak,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
+#endif
 
 	{ .ctl_name = 0 }
 };


^ permalink raw reply related	[flat|nested] 60+ messages in thread

* [PATCH 8/8] [I/OAT] TCP recv offload to I/OAT
  2006-03-03 21:40 [PATCH 0/8] Intel I/O Acceleration Technology (I/OAT) Chris Leech
                   ` (5 preceding siblings ...)
  2006-03-03 21:42 ` [PATCH 7/8] [I/OAT] Add a sysctl for tuning the I/OAT offloaded I/O threshold Chris Leech
@ 2006-03-03 21:42 ` Chris Leech
  2006-03-04 16:39   ` Pavel Machek
                     ` (3 more replies)
  2006-03-03 22:27 ` [PATCH 0/8] Intel I/O Acceleration Technology (I/OAT) Jeff Garzik
                   ` (2 subsequent siblings)
  9 siblings, 4 replies; 60+ messages in thread
From: Chris Leech @ 2006-03-03 21:42 UTC (permalink / raw)
  To: linux-kernel, netdev

Locks down user pages and sets up for DMA in tcp_recvmsg, then calls
dma_async_try_early_copy in tcp_v4_do_rcv

Signed-off-by: Chris Leech <christopher.leech@intel.com>
---

 include/net/netdma.h |    1 
 net/ipv4/tcp.c       |  111 +++++++++++++++++++++++++++++++++++++++++++++-----
 net/ipv4/tcp_input.c |   78 ++++++++++++++++++++++++++++++++---
 net/ipv4/tcp_ipv4.c  |   20 +++++++++
 net/ipv6/tcp_ipv6.c  |   12 +++++
 5 files changed, 201 insertions(+), 21 deletions(-)

diff --git a/include/net/netdma.h b/include/net/netdma.h
index 415d74c..2d829e1 100644
--- a/include/net/netdma.h
+++ b/include/net/netdma.h
@@ -37,5 +37,6 @@ static inline struct dma_chan *get_softn
 int dma_skb_copy_datagram_iovec(struct dma_chan* chan,
 		const struct sk_buff *skb, int offset, struct iovec *to,
 		size_t len, struct dma_locked_list *locked_list);
+int dma_async_try_early_copy(struct sock *sk, struct sk_buff *skb, int hlen);
 
 #endif /* NETDMA_H */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 13abfa2..b792048 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -262,6 +262,9 @@
 #include <net/tcp.h>
 #include <net/xfrm.h>
 #include <net/ip.h>
+#ifdef CONFIG_NET_DMA
+#include <net/netdma.h>
+#endif
 
 
 #include <asm/uaccess.h>
@@ -1109,6 +1112,7 @@ int tcp_recvmsg(struct kiocb *iocb, stru
 	int target;		/* Read at least this many bytes */
 	long timeo;
 	struct task_struct *user_recv = NULL;
+	int copied_early = 0;
 
 	lock_sock(sk);
 
@@ -1132,6 +1136,12 @@ int tcp_recvmsg(struct kiocb *iocb, stru
 
 	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
 
+#ifdef CONFIG_NET_DMA
+	tp->ucopy.dma_chan = NULL;
+	if ((len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) && !sysctl_tcp_low_latency && __get_cpu_var(softnet_data.net_dma))
+		dma_lock_iovec_pages(msg->msg_iov, len, &tp->ucopy.locked_list);
+#endif
+
 	do {
 		struct sk_buff *skb;
 		u32 offset;
@@ -1273,6 +1283,10 @@ int tcp_recvmsg(struct kiocb *iocb, stru
 		} else
 			sk_wait_data(sk, &timeo);
 
+#ifdef CONFIG_NET_DMA
+		tp->ucopy.wakeup = 0;
+#endif
+
 		if (user_recv) {
 			int chunk;
 
@@ -1328,13 +1342,39 @@ do_prequeue:
 		}
 
 		if (!(flags & MSG_TRUNC)) {
-			err = skb_copy_datagram_iovec(skb, offset,
-						      msg->msg_iov, used);
-			if (err) {
-				/* Exception. Bailout! */
-				if (!copied)
-					copied = -EFAULT;
-				break;
+#ifdef CONFIG_NET_DMA
+			if (!tp->ucopy.dma_chan && tp->ucopy.locked_list)
+				tp->ucopy.dma_chan = get_softnet_dma();
+
+			if (tp->ucopy.dma_chan) {
+				tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec(
+					tp->ucopy.dma_chan, skb, offset,
+					msg->msg_iov, used,
+					tp->ucopy.locked_list);
+
+				if (tp->ucopy.dma_cookie < 0) {
+
+					printk(KERN_ALERT "dma_cookie < 0\n");
+
+					/* Exception. Bailout! */
+					if (!copied)
+						copied = -EFAULT;
+					break;
+				}
+				if ((offset + used) == skb->len)
+					copied_early = 1;
+
+			} else
+#endif
+			{
+				err = skb_copy_datagram_iovec(skb, offset,
+						msg->msg_iov, used);
+				if (err) {
+					/* Exception. Bailout! */
+					if (!copied)
+						copied = -EFAULT;
+					break;
+				}
 			}
 		}
 
@@ -1354,15 +1394,33 @@ skip_copy:
 
 		if (skb->h.th->fin)
 			goto found_fin_ok;
-		if (!(flags & MSG_PEEK))
-			sk_eat_skb(sk, skb);
+		if (!(flags & MSG_PEEK)) {
+			if (!copied_early)
+				sk_eat_skb(sk, skb);
+#ifdef CONFIG_NET_DMA
+			else {
+				__skb_unlink(skb, &sk->sk_receive_queue);
+				__skb_queue_tail(&sk->sk_async_wait_queue, skb);
+				copied_early = 0;
+			}
+#endif
+		}
 		continue;
 
 	found_fin_ok:
 		/* Process the FIN. */
 		++*seq;
-		if (!(flags & MSG_PEEK))
-			sk_eat_skb(sk, skb);
+		if (!(flags & MSG_PEEK)) {
+			if (!copied_early)
+				sk_eat_skb(sk, skb);
+#ifdef CONFIG_NET_DMA
+			else {
+				__skb_unlink(skb, &sk->sk_receive_queue);
+				__skb_queue_tail(&sk->sk_async_wait_queue, skb);
+				copied_early = 0;
+			}
+#endif
+		}
 		break;
 	} while (len > 0);
 
@@ -1385,6 +1443,34 @@ skip_copy:
 		tp->ucopy.len = 0;
 	}
 
+#ifdef CONFIG_NET_DMA
+	if (tp->ucopy.dma_chan) {
+		struct sk_buff *skb;
+		dma_cookie_t done, used;
+
+		dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
+
+		while (dma_async_memcpy_complete(tp->ucopy.dma_chan,
+		                                 tp->ucopy.dma_cookie, &done,
+		                                 &used) == DMA_IN_PROGRESS) {
+			/* do partial cleanup of sk_async_wait_queue */
+			while ((skb = skb_peek(&sk->sk_async_wait_queue)) &&
+			       (dma_async_is_complete(skb->dma_cookie, done,
+			                              used) == DMA_SUCCESS)) {
+				__skb_dequeue(&sk->sk_async_wait_queue);
+				kfree_skb(skb);
+			}
+		}
+
+		/* Safe to free early-copied skbs now */
+		__skb_queue_purge(&sk->sk_async_wait_queue);
+		dma_unlock_iovec_pages(tp->ucopy.locked_list);
+		dma_chan_put(tp->ucopy.dma_chan);
+		tp->ucopy.dma_chan = NULL;
+		tp->ucopy.locked_list = NULL;
+	}
+#endif
+
 	/* According to UNIX98, msg_name/msg_namelen are ignored
 	 * on connected socket. I was just happy when found this 8) --ANK
 	 */
@@ -1652,6 +1738,9 @@ int tcp_disconnect(struct sock *sk, int 
 	__skb_queue_purge(&sk->sk_receive_queue);
 	sk_stream_writequeue_purge(sk);
 	__skb_queue_purge(&tp->out_of_order_queue);
+#ifdef CONFIG_NET_DMA
+	__skb_queue_purge(&sk->sk_async_wait_queue);
+#endif
 
 	inet->dport = 0;
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 7625eaf..9b6290d 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -71,6 +71,9 @@
 #include <net/inet_common.h>
 #include <linux/ipsec.h>
 #include <asm/unaligned.h>
+#ifdef CONFIG_NET_DMA
+#include <net/netdma.h>
+#endif
 
 int sysctl_tcp_timestamps = 1;
 int sysctl_tcp_window_scaling = 1;
@@ -3901,14 +3904,23 @@ int tcp_rcv_established(struct sock *sk,
 			}
 		} else {
 			int eaten = 0;
+			int copied_early = 0;
 
-			if (tp->ucopy.task == current &&
-			    tp->copied_seq == tp->rcv_nxt &&
-			    len - tcp_header_len <= tp->ucopy.len &&
-			    sock_owned_by_user(sk)) {
-				__set_current_state(TASK_RUNNING);
+			if (tp->copied_seq == tp->rcv_nxt &&
+			    len - tcp_header_len <= tp->ucopy.len) {
+#ifdef CONFIG_NET_DMA
+				if (dma_async_try_early_copy(sk, skb, tcp_header_len)) {
+					copied_early = 1;
+					eaten = 1;
+				}
+#endif
+				if (tp->ucopy.task == current && sock_owned_by_user(sk) && !copied_early) {
+					__set_current_state(TASK_RUNNING);
 
-				if (!tcp_copy_to_iovec(sk, skb, tcp_header_len)) {
+					if (!tcp_copy_to_iovec(sk, skb, tcp_header_len))
+						eaten = 1;
+				}
+				if (eaten) {
 					/* Predicted packet is in window by definition.
 					 * seq == rcv_nxt and rcv_wup <= rcv_nxt.
 					 * Hence, check seq<=rcv_wup reduces to:
@@ -3924,8 +3936,9 @@ int tcp_rcv_established(struct sock *sk,
 					__skb_pull(skb, tcp_header_len);
 					tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
 					NET_INC_STATS_BH(LINUX_MIB_TCPHPHITSTOUSER);
-					eaten = 1;
 				}
+				if (copied_early)
+					tcp_cleanup_rbuf(sk, skb->len);
 			}
 			if (!eaten) {
 				if (tcp_checksum_complete_user(sk, skb))
@@ -3966,6 +3979,11 @@ int tcp_rcv_established(struct sock *sk,
 
 			__tcp_ack_snd_check(sk, 0);
 no_ack:
+#ifdef CONFIG_NET_DMA
+			if (copied_early)
+				__skb_queue_tail(&sk->sk_async_wait_queue, skb);
+			else
+#endif
 			if (eaten)
 				__kfree_skb(skb);
 			else
@@ -4049,6 +4067,52 @@ discard:
 	return 0;
 }
 
+#ifdef CONFIG_NET_DMA
+int dma_async_try_early_copy(struct sock *sk, struct sk_buff *skb, int hlen)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int chunk = skb->len - hlen;
+	int dma_cookie;
+	int copied_early = 0;
+
+	if (tp->ucopy.wakeup)
+          	goto out;
+
+	if (!tp->ucopy.dma_chan && tp->ucopy.locked_list)
+		tp->ucopy.dma_chan = get_softnet_dma();
+
+	if (tp->ucopy.dma_chan && skb->ip_summed == CHECKSUM_UNNECESSARY) {
+
+		dma_cookie = dma_skb_copy_datagram_iovec(tp->ucopy.dma_chan,
+			skb, hlen, tp->ucopy.iov, chunk, tp->ucopy.locked_list);
+
+		if (dma_cookie < 0)
+			goto out;
+
+		tp->ucopy.dma_cookie = dma_cookie;
+		copied_early = 1;
+
+		tp->ucopy.len -= chunk;
+		tp->copied_seq += chunk;
+		tcp_rcv_space_adjust(sk);
+
+		if ((tp->ucopy.len == 0) ||
+		    (tcp_flag_word(skb->h.th) & TCP_FLAG_PSH) ||
+		    (atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1))) {
+			tp->ucopy.wakeup = 1;
+			sk->sk_data_ready(sk, 0);
+		}
+	} else if (chunk > 0) {
+		tp->ucopy.wakeup = 1;
+		sk->sk_data_ready(sk, 0);
+	}
+out:
+	return copied_early;
+}
+
+EXPORT_SYMBOL(dma_async_try_early_copy);
+#endif /* CONFIG_NET_DMA */
+
 static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 					 struct tcphdr *th, unsigned len)
 {
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 4eb903d..fecc022 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -71,6 +71,9 @@
 #include <net/inet_common.h>
 #include <net/timewait_sock.h>
 #include <net/xfrm.h>
+#ifdef CONFIG_NET_DMA
+#include <net/netdma.h>
+#endif
 
 #include <linux/inet.h>
 #include <linux/ipv6.h>
@@ -1091,8 +1094,18 @@ process:
 	bh_lock_sock(sk);
 	ret = 0;
 	if (!sock_owned_by_user(sk)) {
-		if (!tcp_prequeue(sk, skb))
+#ifdef CONFIG_NET_DMA
+		struct tcp_sock *tp = tcp_sk(sk);
+		if (!tp->ucopy.dma_chan && tp->ucopy.locked_list)
+			tp->ucopy.dma_chan = get_softnet_dma();
+		if (tp->ucopy.dma_chan)
+			ret = tcp_v4_do_rcv(sk, skb);
+		else
+#endif
+		{
+			if (!tcp_prequeue(sk, skb))
 			ret = tcp_v4_do_rcv(sk, skb);
+		}
 	} else
 		sk_add_backlog(sk, skb);
 	bh_unlock_sock(sk);
@@ -1292,6 +1305,11 @@ int tcp_v4_destroy_sock(struct sock *sk)
 	/* Cleans up our, hopefully empty, out_of_order_queue. */
   	__skb_queue_purge(&tp->out_of_order_queue);
 
+#ifdef CONFIG_NET_DMA
+	/* Cleans up our sk_async_wait_queue */
+  	__skb_queue_purge(&sk->sk_async_wait_queue);
+#endif
+
 	/* Clean prequeue, it must be empty really */
 	__skb_queue_purge(&tp->ucopy.prequeue);
 
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index af6a0c6..acf798c 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1218,8 +1218,16 @@ process:
 	bh_lock_sock(sk);
 	ret = 0;
 	if (!sock_owned_by_user(sk)) {
-		if (!tcp_prequeue(sk, skb))
-			ret = tcp_v6_do_rcv(sk, skb);
+#ifdef CONFIG_NET_DMA
+                struct tcp_sock *tp = tcp_sk(sk);
+                if (tp->ucopy.dma_chan)
+                        ret = tcp_v6_do_rcv(sk, skb);
+                else
+#endif
+		{
+			if (!tcp_prequeue(sk, skb))
+				ret = tcp_v6_do_rcv(sk, skb);
+		}
 	} else
 		sk_add_backlog(sk, skb);
 	bh_unlock_sock(sk);


^ permalink raw reply related	[flat|nested] 60+ messages in thread

* Re: [PATCH 0/8] Intel I/O Acceleration Technology (I/OAT)
  2006-03-03 21:40 [PATCH 0/8] Intel I/O Acceleration Technology (I/OAT) Chris Leech
                   ` (6 preceding siblings ...)
  2006-03-03 21:42 ` [PATCH 8/8] [I/OAT] TCP recv offload to I/OAT Chris Leech
@ 2006-03-03 22:27 ` Jeff Garzik
  2006-03-03 22:39   ` Chris Leech
  2006-03-03 22:58 ` [PATCH 0/8] Intel I/O Acceleration Technology (I/OAT) Kumar Gala
  2006-03-04 18:46 ` Jan Engelhardt
  9 siblings, 1 reply; 60+ messages in thread
From: Jeff Garzik @ 2006-03-03 22:27 UTC (permalink / raw)
  To: Chris Leech; +Cc: linux-kernel, netdev

Chris Leech wrote:
> This patch series is the first full release of the Intel(R) I/O
> Acceleration Technology (I/OAT) for Linux.  It includes an in kernel API
> for offloading memory copies to hardware, a driver for the I/OAT DMA memcpy
> engine, and changes to the TCP stack to offload copies of received
> networking data to application space.
> 
> These changes apply to DaveM's net-2.6.17 tree as of commit
> 2bd84a93d8bb7192ad8c23ef41008502be1cb603 ([IRDA]: TOIM3232 dongle support)
> 
> They are available to pull from
> 	git://198.78.49.142/~cleech/linux-2.6 ioat-2.6.17
> 
> There are 8 patches in the series:
> 	1) The memcpy offload APIs and class code
> 	2) The Intel I/OAT DMA driver (ioatdma)

Patch #2 didn't make it.  Too big for the list?

	Jeff




^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 0/8] Intel I/O Acceleration Technology (I/OAT)
  2006-03-03 22:27 ` [PATCH 0/8] Intel I/O Acceleration Technology (I/OAT) Jeff Garzik
@ 2006-03-03 22:39   ` Chris Leech
  2006-03-03 22:45     ` Jeff Garzik
                       ` (2 more replies)
  0 siblings, 3 replies; 60+ messages in thread
From: Chris Leech @ 2006-03-03 22:39 UTC (permalink / raw)
  To: Jeff Garzik; +Cc: linux-kernel, netdev

[-- Attachment #1: Type: text/plain, Size: 186 bytes --]

> Patch #2 didn't make it.  Too big for the list?

Could be, it's the largest of the series.  I've attached the gzipped
patch.  I can try to split this up in the future.

- Chris

[-- Attachment #2: 02-ioatdma_driver.diff.gz --]
[-- Type: application/x-gzip, Size: 10272 bytes --]

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 0/8] Intel I/O Acceleration Technology (I/OAT)
  2006-03-03 22:39   ` Chris Leech
@ 2006-03-03 22:45     ` Jeff Garzik
  2006-03-04 11:35     ` Evgeniy Polyakov
  2006-03-05  8:09     ` Andrew Morton
  2 siblings, 0 replies; 60+ messages in thread
From: Jeff Garzik @ 2006-03-03 22:45 UTC (permalink / raw)
  To: chris.leech; +Cc: linux-kernel, netdev

Chris Leech wrote:
>>Patch #2 didn't make it.  Too big for the list?
> 
> 
> Could be, it's the largest of the series.  I've attached the gzipped
> patch.  I can try to split this up in the future.

Well, for huge hunks of new code, it sometimes gets silly to split it up.

Once it's not in a "reply to email" reviewable form, gzip or URL-to-patch 
work just fine.

	Jeff



^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 0/8] Intel I/O Acceleration Technology (I/OAT)
  2006-03-03 21:40 [PATCH 0/8] Intel I/O Acceleration Technology (I/OAT) Chris Leech
                   ` (7 preceding siblings ...)
  2006-03-03 22:27 ` [PATCH 0/8] Intel I/O Acceleration Technology (I/OAT) Jeff Garzik
@ 2006-03-03 22:58 ` Kumar Gala
  2006-03-03 23:32   ` Chris Leech
  2006-03-04 18:46 ` Jan Engelhardt
  9 siblings, 1 reply; 60+ messages in thread
From: Kumar Gala @ 2006-03-03 22:58 UTC (permalink / raw)
  To: Chris Leech; +Cc: linux-kernel, netdev


On Mar 3, 2006, at 3:40 PM, Chris Leech wrote:

> This patch series is the first full release of the Intel(R) I/O
> Acceleration Technology (I/OAT) for Linux.  It includes an in  
> kernel API
> for offloading memory copies to hardware, a driver for the I/OAT  
> DMA memcpy
> engine, and changes to the TCP stack to offload copies of received
> networking data to application space.
>
> These changes apply to DaveM's net-2.6.17 tree as of commit
> 2bd84a93d8bb7192ad8c23ef41008502be1cb603 ([IRDA]: TOIM3232 dongle  
> support)
>
> They are available to pull from
> 	git://198.78.49.142/~cleech/linux-2.6 ioat-2.6.17
>
> There are 8 patches in the series:
> 	1) The memcpy offload APIs and class code
> 	2) The Intel I/OAT DMA driver (ioatdma)
> 	3) Core networking code to setup networking as a DMA memcpy client
> 	4) Utility functions for sk_buff to iovec offloaded copy
> 	5) Structure changes needed for TCP receive offload
> 	6) Rename cleanup_rbuf to tcp_cleanup_rbuf
> 	7) Add a sysctl to tune the minimum offloaded I/O size for TCP
> 	8) The main TCP receive offload changes

How does this relate to Dan William's ADMA work?

http://marc.theaimsgroup.com/?t=113892936300001&r=1&w=2

- kumar

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 0/8] Intel I/O Acceleration Technology (I/OAT)
  2006-03-03 22:58 ` [PATCH 0/8] Intel I/O Acceleration Technology (I/OAT) Kumar Gala
@ 2006-03-03 23:32   ` Chris Leech
  0 siblings, 0 replies; 60+ messages in thread
From: Chris Leech @ 2006-03-03 23:32 UTC (permalink / raw)
  To: Kumar Gala; +Cc: linux-kernel, netdev

On 3/3/06, Kumar Gala <galak@kernel.crashing.org> wrote:
>
> How does this relate to Dan William's ADMA work?

I only became aware of Dan's ADMA work when he posted it last month,
and so far have not made any attempts to merge the I/OAT code with it.
 Moving forward, combining these interfaces certainly seems like the
right way to go.  I particularly like ADMA's handling of operations
other than just a copy (memset, compare, XOR, CRC).

Chris

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 1/8] [I/OAT] DMA memcpy subsystem
  2006-03-03 21:42 ` [PATCH 1/8] [I/OAT] DMA memcpy subsystem Chris Leech
@ 2006-03-04  1:40   ` David S. Miller
  2006-03-06 19:39     ` Chris Leech
  2006-03-04 19:20   ` Benjamin LaHaise
  1 sibling, 1 reply; 60+ messages in thread
From: David S. Miller @ 2006-03-04  1:40 UTC (permalink / raw)
  To: christopher.leech; +Cc: linux-kernel, netdev

From: Chris Leech <christopher.leech@intel.com>
Date: Fri, 03 Mar 2006 13:42:20 -0800

> +static spinlock_t dma_list_lock;

Please use DEFINE_SPINLOCK().
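
(For illustration only, a one-line sketch of the suggested change, not
taken from the posted patch:)

	-static spinlock_t dma_list_lock;
	+static DEFINE_SPINLOCK(dma_list_lock);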

> +static void dma_chan_free_rcu(struct rcu_head *rcu) {

Newline before the brace please.

> +static void dma_async_device_cleanup(struct kref *kref) {

Newline before the brace please.

> +struct dma_chan_percpu
> +{

Left brace on the same line as "struct dma_chan_percpu" please.

> +struct dma_chan
> +{

Similarly.

Otherwise this patch looks mostly ok.

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 7/8] [I/OAT] Add a sysctl for tuning the I/OAT offloaded I/O threshold
  2006-03-03 21:42 ` [PATCH 7/8] [I/OAT] Add a sysctl for tuning the I/OAT offloaded I/O threshold Chris Leech
@ 2006-03-04 11:22   ` Alexey Dobriyan
  2006-03-05  7:21   ` Andrew Morton
  1 sibling, 0 replies; 60+ messages in thread
From: Alexey Dobriyan @ 2006-03-04 11:22 UTC (permalink / raw)
  To: Chris Leech; +Cc: linux-kernel, netdev

On Fri, Mar 03, 2006 at 01:42:34PM -0800, Chris Leech wrote:
> Any socket recv of less than this amount will not be offloaded

There is no documentation update.
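
Something along these lines for Documentation/networking/ip-sysctl.txt,
perhaps (a sketch based only on the changelog and the 1024-byte default,
not on anything actually submitted):

	tcp_dma_copybreak - INTEGER
		Lower limit, in bytes, of the size of a socket read that
		will be offloaded to a DMA copy engine, if one is present
		in the system and CONFIG_NET_DMA is enabled.
		Default: 1024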


^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 0/8] Intel I/O Acceleration Technology (I/OAT)
  2006-03-03 22:39   ` Chris Leech
  2006-03-03 22:45     ` Jeff Garzik
@ 2006-03-04 11:35     ` Evgeniy Polyakov
  2006-03-05  8:09     ` Andrew Morton
  2 siblings, 0 replies; 60+ messages in thread
From: Evgeniy Polyakov @ 2006-03-04 11:35 UTC (permalink / raw)
  To: chris.leech; +Cc: Jeff Garzik, linux-kernel, netdev

On Fri, Mar 03, 2006 at 02:39:22PM -0800, Chris Leech (christopher.leech@intel.com) wrote:
> > Patch #2 didn't make it.  Too big for the list?
> 
> Could be, it's the largest of the series.  I've attached the gzipped
> patch.  I can try to split this up in the future.

How can the owner of cb_chan->common.device_node ever be removed?
It looks like channels are only allocated (without a proper error path)
and queued onto the device->common.channels list by
enumerate_dma_channels() in the PCI probe callback, with no removal at all and only lockless access.
The PCI remove callback only calls dma_async_device_unregister(), where only
the channel's clients are removed.

> - Chris



-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 8/8] [I/OAT] TCP recv offload to I/OAT
  2006-03-03 21:42 ` [PATCH 8/8] [I/OAT] TCP recv offload to I/OAT Chris Leech
@ 2006-03-04 16:39   ` Pavel Machek
  2006-03-04 23:18   ` Greg KH
                     ` (2 subsequent siblings)
  3 siblings, 0 replies; 60+ messages in thread
From: Pavel Machek @ 2006-03-04 16:39 UTC (permalink / raw)
  To: Chris Leech; +Cc: linux-kernel, netdev

Hi!

> --- a/net/ipv4/tcp.c
> +++ b/net/ipv4/tcp.c
> @@ -262,6 +262,9 @@
>  #include <net/tcp.h>
>  #include <net/xfrm.h>
>  #include <net/ip.h>
> +#ifdef CONFIG_NET_DMA
> +#include <net/netdma.h>
> +#endif
>  

Remove the ifdefs, move them inside .h if needed.

> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index 7625eaf..9b6290d 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -71,6 +71,9 @@
>  #include <net/inet_common.h>
>  #include <linux/ipsec.h>
>  #include <asm/unaligned.h>
> +#ifdef CONFIG_NET_DMA
> +#include <net/netdma.h>
> +#endif

Here, too.

> +#ifdef CONFIG_NET_DMA
> +			if (copied_early)
> +				__skb_queue_tail(&sk->sk_async_wait_queue, skb);
> +			else
> +#endif
>  			if (eaten)
>  				__kfree_skb(skb);
>  			else

Could you #define copied_early to 0 and avoid ifdefs?
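
A rough sketch of that trick (purely illustrative; it only works if the
symbols used in the DMA-only branches are still declared when
CONFIG_NET_DMA is off):

	#ifndef CONFIG_NET_DMA
	#define copied_early	0	/* branch becomes if (0) and is discarded */
	#endif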

> @@ -1091,8 +1094,18 @@ process:
>  	bh_lock_sock(sk);
>  	ret = 0;
>  	if (!sock_owned_by_user(sk)) {
> -		if (!tcp_prequeue(sk, skb))
> +#ifdef CONFIG_NET_DMA
> +		struct tcp_sock *tp = tcp_sk(sk);
> +		if (!tp->ucopy.dma_chan && tp->ucopy.locked_list)
> +			tp->ucopy.dma_chan = get_softnet_dma();
> +		if (tp->ucopy.dma_chan)
> +			ret = tcp_v4_do_rcv(sk, skb);
> +		else
> +#endif
> +		{
> +			if (!tcp_prequeue(sk, skb))
>  			ret = tcp_v4_do_rcv(sk, skb);
> +		}
>  	} else

Wrong indentation...
								Pavel
-- 
Thanks, Sharp!

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 0/8] Intel I/O Acceleration Technology (I/OAT)
  2006-03-03 21:40 [PATCH 0/8] Intel I/O Acceleration Technology (I/OAT) Chris Leech
                   ` (8 preceding siblings ...)
  2006-03-03 22:58 ` [PATCH 0/8] Intel I/O Acceleration Technology (I/OAT) Kumar Gala
@ 2006-03-04 18:46 ` Jan Engelhardt
  2006-03-04 21:41   ` David S. Miller
  9 siblings, 1 reply; 60+ messages in thread
From: Jan Engelhardt @ 2006-03-04 18:46 UTC (permalink / raw)
  To: Chris Leech; +Cc: linux-kernel, netdev


>This patch series is the first full release of the Intel(R) I/O
>Acceleration Technology (I/OAT) for Linux.  It includes an in kernel API
>for offloading memory copies to hardware, a driver for the I/OAT DMA memcpy
>engine, and changes to the TCP stack to offload copies of received
>networking data to application space.
>
Does this buy the normal standard desktop user anything?


Jan Engelhardt
-- 

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 1/8] [I/OAT] DMA memcpy subsystem
  2006-03-03 21:42 ` [PATCH 1/8] [I/OAT] DMA memcpy subsystem Chris Leech
  2006-03-04  1:40   ` David S. Miller
@ 2006-03-04 19:20   ` Benjamin LaHaise
  2006-03-06 19:48     ` Chris Leech
  1 sibling, 1 reply; 60+ messages in thread
From: Benjamin LaHaise @ 2006-03-04 19:20 UTC (permalink / raw)
  To: Chris Leech; +Cc: linux-kernel, netdev

On Fri, Mar 03, 2006 at 01:42:20PM -0800, Chris Leech wrote:
> +void dma_async_device_unregister(struct dma_device* device)
> +{
...
> +	kref_put(&device->refcount, dma_async_device_cleanup);
> +	wait_for_completion(&device->done);
> +}

This looks like a bug: device is dereferenced after it is potentially 
freed.

		-ben
-- 
"Time is of no importance, Mr. President, only life is important."
Don't Email: <dont@kvack.org>.

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 0/8] Intel I/O Acceleration Technology (I/OAT)
  2006-03-04 18:46 ` Jan Engelhardt
@ 2006-03-04 21:41   ` David S. Miller
  2006-03-04 22:05     ` Gene Heskett
  2006-03-05  1:43     ` Evgeniy Polyakov
  0 siblings, 2 replies; 60+ messages in thread
From: David S. Miller @ 2006-03-04 21:41 UTC (permalink / raw)
  To: jengelh; +Cc: christopher.leech, linux-kernel, netdev

From: Jan Engelhardt <jengelh@linux01.gwdg.de>
Date: Sat, 4 Mar 2006 19:46:22 +0100 (MET)

> Does this buy the normal standard desktop user anything?

Absolutely, it optimizes end-node performance.

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 0/8] Intel I/O Acceleration Technology (I/OAT)
  2006-03-04 21:41   ` David S. Miller
@ 2006-03-04 22:05     ` Gene Heskett
  2006-03-04 22:16       ` David S. Miller
  2006-03-06 19:15       ` Chris Leech
  2006-03-05  1:43     ` Evgeniy Polyakov
  1 sibling, 2 replies; 60+ messages in thread
From: Gene Heskett @ 2006-03-04 22:05 UTC (permalink / raw)
  To: linux-kernel

On Saturday 04 March 2006 16:41, David S. Miller wrote:
>From: Jan Engelhardt <jengelh@linux01.gwdg.de>
>Date: Sat, 4 Mar 2006 19:46:22 +0100 (MET)
>
>> Does this buy the normal standard desktop user anything?
>
>Absolutely, it optimizes end-node performance.

Is this quantifiable, and does it only apply to Intel?

-- 
Cheers, Gene
People having trouble with vz bouncing email to me should add the word
'online' between the 'verizon', and the dot which bypasses vz's
stupid bounce rules.  I do use spamassassin too. :-)
Yahoo.com and AOL/TW attorneys please note, additions to the above
message by Gene Heskett are:
Copyright 2006 by Maurice Eugene Heskett, all rights reserved.

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 0/8] Intel I/O Acceleration Technology (I/OAT)
  2006-03-04 22:05     ` Gene Heskett
@ 2006-03-04 22:16       ` David S. Miller
  2006-03-05 13:45         ` Jan Engelhardt
  2006-03-05 16:14         ` Matthieu CASTET
  2006-03-06 19:15       ` Chris Leech
  1 sibling, 2 replies; 60+ messages in thread
From: David S. Miller @ 2006-03-04 22:16 UTC (permalink / raw)
  To: gene.heskett, gene.heskett; +Cc: linux-kernel

From: Gene Heskett <gene.heskett@verizon.net>
Date: Sat, 04 Mar 2006 17:05:41 -0500

> On Saturday 04 March 2006 16:41, David S. Miller wrote:
> >From: Jan Engelhardt <jengelh@linux01.gwdg.de>
> >Date: Sat, 4 Mar 2006 19:46:22 +0100 (MET)
> >
> >> Does this buy the normal standard desktop user anything?
> >
> >Absolutely, it optimizes end-node performance.
> 
> Is this quantifiable, and does it only apply to Intel?

It applies to whoever has a DMA engine in their computer.

What people need to understand is that this is putting the
optimization in the right place, at the end nodes.  This is about as
old an internet architectural fundamental as you can get: keep the hard
work off the routers and intermediate nodes, and put it on the end
systems.

Intel has done performance numbers, and I'm sure they will post them
at the appropriate time.

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 8/8] [I/OAT] TCP recv offload to I/OAT
  2006-03-03 21:42 ` [PATCH 8/8] [I/OAT] TCP recv offload to I/OAT Chris Leech
  2006-03-04 16:39   ` Pavel Machek
@ 2006-03-04 23:18   ` Greg KH
  2006-03-06 19:28     ` Chris Leech
  2006-03-05  7:30   ` Andrew Morton
  2006-03-05  8:45   ` Andrew Morton
  3 siblings, 1 reply; 60+ messages in thread
From: Greg KH @ 2006-03-04 23:18 UTC (permalink / raw)
  To: Chris Leech; +Cc: linux-kernel, netdev

On Fri, Mar 03, 2006 at 01:42:36PM -0800, Chris Leech wrote:
> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
> index 13abfa2..b792048 100644
> --- a/net/ipv4/tcp.c
> +++ b/net/ipv4/tcp.c
> @@ -262,6 +262,9 @@
>  #include <net/tcp.h>
>  #include <net/xfrm.h>
>  #include <net/ip.h>
> +#ifdef CONFIG_NET_DMA
> +#include <net/netdma.h>
> +#endif

#ifdef is not needed here (try not to put #ifdef in .c files.)  I think
a few of your other usages of #ifdef in this file can also be removed
with judicious use of inline functions in a .h file.
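
For illustration, the usual pattern looks something like this (a sketch
with an assumed stub body, not taken from the patch set):

	/* in net/netdma.h */
	#ifdef CONFIG_NET_DMA
	/* ... the real declarations ... */
	#else
	static inline struct dma_chan *get_softnet_dma(void) { return NULL; }
	#endif

	/* net/ipv4/tcp.c can then include it unconditionally */
	#include <net/netdma.h>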

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 0/8] Intel I/O Acceleration Technology (I/OAT)
  2006-03-04 21:41   ` David S. Miller
  2006-03-04 22:05     ` Gene Heskett
@ 2006-03-05  1:43     ` Evgeniy Polyakov
  2006-03-05  2:08       ` David S. Miller
  2006-03-06 17:44       ` Ingo Oeser
  1 sibling, 2 replies; 60+ messages in thread
From: Evgeniy Polyakov @ 2006-03-05  1:43 UTC (permalink / raw)
  To: David S. Miller; +Cc: jengelh, christopher.leech, linux-kernel, netdev

On Sat, Mar 04, 2006 at 01:41:44PM -0800, David S. Miller (davem@davemloft.net) wrote:
> From: Jan Engelhardt <jengelh@linux01.gwdg.de>
> Date: Sat, 4 Mar 2006 19:46:22 +0100 (MET)
> 
> > Does this buy the normal standard desktop user anything?
> 
> Absolutely, it optimizes end-node performance.

It really depends on how it is used.
According to an investigation made for kevent-based FS AIO reading, the
get_user_pages() performance graph looks like a sqrt() function, with a
plateau starting at about 64-80 pages on a 2.4GHz Xeon with 1GB of RAM,
while memcpy() is linear.  So it can be noticeably slower than
copy_to_user() if get_user_pages() is used aggressively, and a userspace
application must reuse the same, already-grabbed buffer for maximum
performance.  But the Intel folks did not provide their usage case or any
benchmarks as far as I know.

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 0/8] Intel I/O Acceleration Technology (I/OAT)
  2006-03-05  1:43     ` Evgeniy Polyakov
@ 2006-03-05  2:08       ` David S. Miller
  2006-03-06 17:44       ` Ingo Oeser
  1 sibling, 0 replies; 60+ messages in thread
From: David S. Miller @ 2006-03-05  2:08 UTC (permalink / raw)
  To: johnpol; +Cc: jengelh, christopher.leech, linux-kernel, netdev

From: Evgeniy Polyakov <johnpol@2ka.mipt.ru>
Date: Sun, 5 Mar 2006 04:43:25 +0300

> According to an investigation made for kevent-based FS AIO reading, the
> get_user_pages() performance graph looks like a sqrt() function, with a
> plateau starting at about 64-80 pages on a 2.4GHz Xeon with 1GB of RAM,
> while memcpy() is linear.  So it can be noticeably slower than
> copy_to_user() if get_user_pages() is used aggressively, and a userspace
> application must reuse the same, already-grabbed buffer for maximum
> performance.  But the Intel folks did not provide their usage case or any
> benchmarks as far as I know.

Of course, and programming the DMA controller has overhead
as well.  This is why one would not use I/OAT with small
transfer sizes.

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 4/8] [I/OAT] Utility functions for offloading sk_buff to iovec copies
  2006-03-03 21:42 ` [PATCH 4/8] [I/OAT] Utility functions for offloading sk_buff to iovec copies Chris Leech
@ 2006-03-05  7:15   ` Andrew Morton
  0 siblings, 0 replies; 60+ messages in thread
From: Andrew Morton @ 2006-03-05  7:15 UTC (permalink / raw)
  To: Chris Leech; +Cc: linux-kernel, netdev

Chris Leech <christopher.leech@intel.com> wrote:
>
> +
> +#define NUM_PAGES_SPANNED(start, length) \
> +	((PAGE_ALIGN((unsigned long)start + length) - \
> +	((unsigned long)start & PAGE_MASK)) >> PAGE_SHIFT)

static inline all-lower-case functions are much nicer.

> +/*
> + * Lock down all the iovec pages needed for len bytes.
> + * Return a struct dma_locked_list to keep track of pages locked down.
> + *
> + * We are allocating a single chunk of memory, and then carving it up into
> + * 3 sections, the latter 2 whose size depends on the number of iovecs and the
> + * total number of pages, respectively.
> + */
> +int dma_lock_iovec_pages(struct iovec *iov, size_t len, struct dma_locked_list
> +	**locked_list)

Please rename this to dma_pin_iovec_pages().  Locking a page is a quite
different concept from pinning it, and this function doesn't lock any
pages.

> +{
> +	struct dma_locked_list *local_list;
> +	struct page **pages;
> +	int i;
> +	int ret;
> +
> +	int nr_iovecs = 0;
> +	int iovec_len_used = 0;
> +	int iovec_pages_used = 0;

Extraneous blank line there.

> +	/* don't lock down non-user-based iovecs */
> +	if (segment_eq(get_fs(), KERNEL_DS)) {
> +		*locked_list = NULL;
> +		return 0;
> +	}

hm, haven't seen that before.  Makes sense, I guess.

> +	/* determine how many iovecs/pages there are, up front */
> +	do {
> +		iovec_len_used += iov[nr_iovecs].iov_len;
> +		iovec_pages_used += NUM_PAGES_SPANNED(iov[nr_iovecs].iov_base,
> +		                                      iov[nr_iovecs].iov_len);
> +		nr_iovecs++;
> +	} while (iovec_len_used < len);
> +
> +	/* single kmalloc for locked list, page_list[], and the page arrays */
> +	local_list = kmalloc(sizeof(*local_list)
> +		+ (nr_iovecs * sizeof (struct dma_page_list))
> +		+ (iovec_pages_used * sizeof (struct page*)), GFP_KERNEL);

What is the upper bound on the size of this allocation?

> +	if (!local_list)
> +		return -ENOMEM;
> +
> +	/* list of pages starts right after the page list array */
> +	pages = (struct page **) &local_list->page_list[nr_iovecs];
> +
> +	/* it's a userspace pointer */
> +	might_sleep();

kmalloc(GFP_KERNEL) already did that.

> +	for (i = 0; i < nr_iovecs; i++) {
> +		struct dma_page_list *page_list = &local_list->page_list[i];
> +
> +		len -= iov[i].iov_len;
> +
> +		if (!access_ok(VERIFY_WRITE, iov[i].iov_base, iov[i].iov_len)) {
> +			dma_unlock_iovec_pages(local_list);
> +			return -EFAULT;
> +		}

A return statement buried down in the guts of a largeish function isn't
good from a code maintainability POV.

> +		page_list->nr_pages = NUM_PAGES_SPANNED(iov[i].iov_base,
> +		                                        iov[i].iov_len);
> +		page_list->base_address = iov[i].iov_base;
> +
> +		page_list->pages = pages;
> +		pages += page_list->nr_pages;
> +
> +		/* lock pages down */
> +		down_read(&current->mm->mmap_sem);
> +		ret = get_user_pages(
> +			current,
> +			current->mm,
> +			(unsigned long) iov[i].iov_base,
> +			page_list->nr_pages,
> +			1,
> +			0,
> +			page_list->pages,
> +			NULL);

Yes, it has a lot of args.  It's nice to add comments like this:

		ret = get_user_pages(
			current,
			current->mm,
			(unsigned long) iov[i].iov_base,
			page_list->nr_pages,
			1,			/* write */
			0,			/* force */
			page_list->pages,
			NULL);


> +		up_read(&current->mm->mmap_sem);
> +
> +		if (ret != page_list->nr_pages) {
> +			goto mem_error;
> +		}

Unneeded braces.

> +		local_list->nr_iovecs = i + 1;
> +	}
> +
> +	*locked_list = local_list;
> +	return 0;

Suggest you change this function to return locked_list, or an IS_ERR value
on error.
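
A sketch of that calling convention (the reshaped name and variable are
assumptions, not from the patch):

	struct dma_pinned_list *dma_pin_iovec_pages(struct iovec *iov, size_t len);

	pinned = dma_pin_iovec_pages(iov, len);
	if (IS_ERR(pinned))
		err = PTR_ERR(pinned);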

> +void dma_unlock_iovec_pages(struct dma_locked_list *locked_list)
> +{
> +	int i, j;
> +
> +	if (!locked_list)
> +		return;
> +
> +	for (i = 0; i < locked_list->nr_iovecs; i++) {
> +		struct dma_page_list *page_list = &locked_list->page_list[i];
> +		for (j = 0; j < page_list->nr_pages; j++) {
> +			SetPageDirty(page_list->pages[j]);
> +			page_cache_release(page_list->pages[j]);
> +		}
> +	}
> +
> +	kfree(locked_list);
> +}

SetPageDirty() is very wrong.  It fails to mark pagecache pages as dirty in
the radix tree so they won't get written back.

You'll need to use set_page_dirty_lock() here or, if you happen to have
protected the inode which backs this potential mmap (really the
address_space) from reclaim then set_page_dirty() will work.  Probably
it'll be set_page_dirty_lock().
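
A minimal sketch of the unpin loop using set_page_dirty_lock(), assuming
it runs in process context:

	for (j = 0; j < page_list->nr_pages; j++) {
		set_page_dirty_lock(page_list->pages[j]);
		page_cache_release(page_list->pages[j]);
	}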

If this is called from cant-sleep context then things get ugly.  If it's
called from interrupt context then moreso.  See fs/direct-io.c,
bio_set_pages_dirty(), bio_check_pages_dirty(), etc.


I don't see a check for "did we write to user pages" here.  Because we
don't need to dirty the pages if we were reading them (transmitting from
userspace).

But given that dma_lock_iovec_pages() is only set up for writing to
userspace, I guess this code is implicitly receive-only.  It's hard to tell
when the description is, like the code comments, so scant.

> +static dma_cookie_t dma_memcpy_tokerneliovec(struct dma_chan *chan, struct
> +	iovec *iov, unsigned char *kdata, size_t len)

You owe us two underscores ;)

> +/*
> + * We have already locked down the pages we will be using in the iovecs.

"pinned"

> + * Each entry in iov array has corresponding entry in locked_list->page_list.
> + * Using array indexing to keep iov[] and page_list[] in sync.
> + * Initial elements in iov array's iov->iov_len will be 0 if already copied into
> + *   by another call.
> + * iov array length remaining guaranteed to be bigger than len.
> + */
> +dma_cookie_t dma_memcpy_toiovec(struct dma_chan *chan, struct iovec *iov,
> +	struct dma_locked_list *locked_list, unsigned char *kdata, size_t len)
> +{
> +	int iov_byte_offset;
> +	int copy;
> +	dma_cookie_t dma_cookie = 0;
> +	int iovec_idx;
> +	int page_idx;
> +
> +	if (!chan)
> +		return memcpy_toiovec(iov, kdata, len);
> +
> +	/* -> kernel copies (e.g. smbfs) */
> +	if (!locked_list)
> +		return dma_memcpy_tokerneliovec(chan, iov, kdata, len);
> +
> +	iovec_idx = 0;
> +	while (iovec_idx < locked_list->nr_iovecs) {
> +		struct dma_page_list *page_list;
> +
> +		/* skip already used-up iovecs */
> +		while (!iov[iovec_idx].iov_len)
> +			iovec_idx++;

Is it assured that this array was zero-terminated?

> +
> +dma_cookie_t dma_memcpy_pg_toiovec(struct dma_chan *chan, struct iovec *iov,
> +	struct dma_locked_list *locked_list, struct page *page,
> +	unsigned int offset, size_t len)

pleeeeeze comment your code.

> +{
> +	int iov_byte_offset;
> +	int copy;
> +	dma_cookie_t dma_cookie = 0;
> +	int iovec_idx;
> +	int page_idx;
> +	int err;
> +
> +	/* this needs as-yet-unimplemented buf-to-buff, so punt. */
> +	/* TODO: use dma for this */
> +	if (!chan || !locked_list) {

Really you should rename locked_list to pinned_list throughout, and
dma_locked_list to dma_pinned_list.

> +	iovec_idx = 0;
> +	while (iovec_idx < locked_list->nr_iovecs) {
> +		struct dma_page_list *page_list;
> +
> +		/* skip already used-up iovecs */
> +		while (!iov[iovec_idx].iov_len)
> +			iovec_idx++;

Can this also run off the end?

> +int dma_lock_iovec_pages(struct iovec *iov, size_t len, struct dma_locked_list
> +	**locked_list)
> +{
> +	*locked_list = NULL;
> +
> +	return 0;
> +}
> +
> +void dma_unlock_iovec_pages(struct dma_locked_list* locked_list)
> +{ }

You might want to make these guys static inlines in a header and not
compile this file at all if !CONFIG_DMA_ENGINE.

> +struct dma_page_list
> +{

   struct dma_page_list {

> +struct dma_locked_list
> +{

   struct dma_pinned_list {

> +	int nr_iovecs;
> +	struct dma_page_list page_list[0];

We can use [] instead of [0] now that gcc-2.95.x has gone away.

> +int dma_lock_iovec_pages(struct iovec *iov, size_t len,
> +	struct dma_locked_list	**locked_list);
> +void dma_unlock_iovec_pages(struct dma_locked_list* locked_list);

"pin", "unpin".

> +#ifdef CONFIG_NET_DMA
> +
> +/**
> + *	dma_skb_copy_datagram_iovec - Copy a datagram to an iovec.
> + *	@skb - buffer to copy
> + *	@offset - offset in the buffer to start copying from
> + *	@iovec - io vector to copy to
> + *	@len - amount of data to copy from buffer to iovec
> + *	@locked_list - locked iovec buffer data
> + *
> + *	Note: the iovec is modified during the copy.

Modifying the caller's iovec is a bit rude.    Hard to avoid, I guess.

> + */
> +int dma_skb_copy_datagram_iovec(struct dma_chan *chan,
> +			struct sk_buff *skb, int offset, struct iovec *to,
> +			size_t len, struct dma_locked_list *locked_list)
> +{
> +	int start = skb_headlen(skb);
> +	int i, copy = start - offset;
> +	dma_cookie_t cookie = 0;
> +
> +	/* Copy header. */
> +	if (copy > 0) {
> +		if (copy > len)
> +			copy = len;
> +		if ((cookie = dma_memcpy_toiovec(chan, to, locked_list,
> +		     skb->data + offset, copy)) < 0)
> +			goto fault;
> +		if ((len -= copy) == 0)
> +			goto end;

Please avoid

	if ((lhs = rhs))

constructs.  Instead do

	lhs = rhs;
	if (lhs)

(entire patchset - there are quite a lot)
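
Applied to the first hunk quoted above, for instance:

	cookie = dma_memcpy_toiovec(chan, to, locked_list,
				    skb->data + offset, copy);
	if (cookie < 0)
		goto fault;
	len -= copy;
	if (len == 0)
		goto end;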

> +		offset += copy;
> +	}
> +
> +	/* Copy paged appendix. Hmm... why does this look so complicated? */
> +	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
> +		int end;
> +
> +		BUG_TRAP(start <= offset + len);

<wonders why BUG_TRAP still exists>

> +		if ((copy = end - offset) > 0) {
> ...
> +			if (!(len -= copy))
> ...
> +			if ((copy = end - offset) > 0) {
> ...
> +				if ((len -= copy) == 0)
>

See above.

> +#else
> +
> +int dma_skb_copy_datagram_iovec(struct dma_chan *chan,
> +			const struct sk_buff *skb, int offset, struct iovec *to,
> +			size_t len, struct dma_locked_list *locked_list)
> +{
> +	return skb_copy_datagram_iovec(skb, offset, to, len);
> +}
> +
> +#endif

Again, consider putting this in a header as an inline, avoid compiling this
file altogether.


^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 5/8] [I/OAT] Structure changes for TCP recv offload to I/OAT
  2006-03-03 21:42 ` [PATCH 5/8] [I/OAT] Structure changes for TCP recv offload to I/OAT Chris Leech
@ 2006-03-05  7:19   ` Andrew Morton
  0 siblings, 0 replies; 60+ messages in thread
From: Andrew Morton @ 2006-03-05  7:19 UTC (permalink / raw)
  To: Chris Leech; +Cc: linux-kernel, netdev

Chris Leech <christopher.leech@intel.com> wrote:
>
> +#ifdef CONFIG_NET_DMA
>  +#include <linux/dmaengine.h>
>  +#endif

Please move the ifdefs into the header and include it unconditionally
(entire patchset).


^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 7/8] [I/OAT] Add a sysctl for tuning the I/OAT offloaded I/O threshold
  2006-03-03 21:42 ` [PATCH 7/8] [I/OAT] Add a sysctl for tuning the I/OAT offloaded I/O threshold Chris Leech
  2006-03-04 11:22   ` Alexey Dobriyan
@ 2006-03-05  7:21   ` Andrew Morton
  1 sibling, 0 replies; 60+ messages in thread
From: Andrew Morton @ 2006-03-05  7:21 UTC (permalink / raw)
  To: Chris Leech; +Cc: linux-kernel, netdev

Chris Leech <christopher.leech@intel.com> wrote:
>
> Any socket recv of less than this amount will not be offloaded
> 
> ...
>
> +int sysctl_tcp_dma_copybreak = NET_DMA_DEFAULT_COPYBREAK;

Is it appropriate that this tunable be kernel-wide, rather than more
finely-grained?


^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 8/8] [I/OAT] TCP recv offload to I/OAT
  2006-03-03 21:42 ` [PATCH 8/8] [I/OAT] TCP recv offload to I/OAT Chris Leech
  2006-03-04 16:39   ` Pavel Machek
  2006-03-04 23:18   ` Greg KH
@ 2006-03-05  7:30   ` Andrew Morton
  2006-03-05  8:45   ` Andrew Morton
  3 siblings, 0 replies; 60+ messages in thread
From: Andrew Morton @ 2006-03-05  7:30 UTC (permalink / raw)
  To: Chris Leech; +Cc: linux-kernel, netdev

Chris Leech <christopher.leech@intel.com> wrote:
>
> Locks down user pages and sets up for DMA in tcp_recvmsg, then calls
> dma_async_try_early_copy in tcp_v4_do_rcv
> 

+#ifdef CONFIG_NET_DMA
+#ifdef CONFIG_NET_DMA
+#ifdef CONFIG_NET_DMA
+#ifdef CONFIG_NET_DMA
+#ifdef CONFIG_NET_DMA
+#ifdef CONFIG_NET_DMA
+#ifdef CONFIG_NET_DMA
+#ifdef CONFIG_NET_DMA
+#ifdef CONFIG_NET_DMA
+#ifdef CONFIG_NET_DMA
+#ifdef CONFIG_NET_DMA
+#ifdef CONFIG_NET_DMA
+#ifdef CONFIG_NET_DMA
+#ifdef CONFIG_NET_DMA
+#ifdef CONFIG_NET_DMA
+#ifdef CONFIG_NET_DMA

waaay too many ifdefs.   There are various tricks we use to minimise them.

> +#ifdef CONFIG_NET_DMA
> +	tp->ucopy.dma_chan = NULL;
> +	if ((len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) && !sysctl_tcp_low_latency && __get_cpu_var(softnet_data.net_dma))
> +		dma_lock_iovec_pages(msg->msg_iov, len, &tp->ucopy.locked_list);
> +#endif

Please try to fit code into 80 columns.

That's decimal 80 ;)
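
For example, the copybreak test quoted above could be wrapped as:

	if ((len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) &&
	    !sysctl_tcp_low_latency &&
	    __get_cpu_var(softnet_data.net_dma))
		dma_lock_iovec_pages(msg->msg_iov, len,
				     &tp->ucopy.locked_list);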

> @@ -1328,13 +1342,39 @@ do_prequeue:
>  		}
>  
>  		if (!(flags & MSG_TRUNC)) {
> -			err = skb_copy_datagram_iovec(skb, offset,
> -						      msg->msg_iov, used);
> -			if (err) {
> -				/* Exception. Bailout! */
> -				if (!copied)
> -					copied = -EFAULT;
> -				break;
> +#ifdef CONFIG_NET_DMA
> +			if (!tp->ucopy.dma_chan && tp->ucopy.locked_list)
> +				tp->ucopy.dma_chan = get_softnet_dma();
> +
> +			if (tp->ucopy.dma_chan) {
> +				tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec(
> +					tp->ucopy.dma_chan, skb, offset,
> +					msg->msg_iov, used,
> +					tp->ucopy.locked_list);
> +
> +				if (tp->ucopy.dma_cookie < 0) {
> +
> +					printk(KERN_ALERT "dma_cookie < 0\n");
> +
> +					/* Exception. Bailout! */
> +					if (!copied)
> +						copied = -EFAULT;
> +					break;
> +				}
> +				if ((offset + used) == skb->len)
> +					copied_early = 1;
> +

Consider trimming some of those blank lines.  I don't think they add any
value?

> +			} else
> +#endif
> +			{

These games with ifdefs and else statements aren't at all pleasant. 
Sometimes they're hard to avoid, but you'll probably find that some code
rearrangement (in a preceding patch) makes it easier.  Like, split this
function into several.

> @@ -1354,15 +1394,33 @@ skip_copy:
>  
>  		if (skb->h.th->fin)
>  			goto found_fin_ok;
> -		if (!(flags & MSG_PEEK))
> -			sk_eat_skb(sk, skb);
> +		if (!(flags & MSG_PEEK)) {
> +			if (!copied_early)
> +				sk_eat_skb(sk, skb);
> +#ifdef CONFIG_NET_DMA
> +			else {
> +				__skb_unlink(skb, &sk->sk_receive_queue);
> +				__skb_queue_tail(&sk->sk_async_wait_queue, skb);
> +				copied_early = 0;
> +			}
> +#endif
> ...
> -			sk_eat_skb(sk, skb);
> +		if (!(flags & MSG_PEEK)) {
> +			if (!copied_early)
> +				sk_eat_skb(sk, skb);
> +#ifdef CONFIG_NET_DMA
> +			else {
> +				__skb_unlink(skb, &sk->sk_receive_queue);
> +				__skb_queue_tail(&sk->sk_async_wait_queue, skb);
> +				copied_early = 0;
> +			}
> +#endif
> +		}

etc.

> +#ifdef CONFIG_NET_DMA
> +			if (copied_early)
> +				__skb_queue_tail(&sk->sk_async_wait_queue, skb);
> +			else
> +#endif
>  			if (eaten)
>  				__kfree_skb(skb);
>  			else

etc.

> @@ -4049,6 +4067,52 @@ discard:
>  	return 0;
>  }
>  
> +#ifdef CONFIG_NET_DMA
> +int dma_async_try_early_copy(struct sock *sk, struct sk_buff *skb, int hlen)
> +{
> +	struct tcp_sock *tp = tcp_sk(sk);
> +	int chunk = skb->len - hlen;
> +	int dma_cookie;
> +	int copied_early = 0;
> +
> +	if (tp->ucopy.wakeup)
> +          	goto out;

In this case a simple

		return 0;

would be fine.  We haven't done anything yet.

> +#ifdef CONFIG_NET_DMA
> +		struct tcp_sock *tp = tcp_sk(sk);
> +		if (!tp->ucopy.dma_chan && tp->ucopy.locked_list)
> +			tp->ucopy.dma_chan = get_softnet_dma();
> +		if (tp->ucopy.dma_chan)
> +			ret = tcp_v4_do_rcv(sk, skb);
> +		else
> +#endif
> +		{
> +			if (!tcp_prequeue(sk, skb))
>  			ret = tcp_v4_do_rcv(sk, skb);
> +		}
>  	} else

etc.

> +#ifdef CONFIG_NET_DMA
> +                struct tcp_sock *tp = tcp_sk(sk);
> +                if (tp->ucopy.dma_chan)
> +                        ret = tcp_v6_do_rcv(sk, skb);
> +                else
> +#endif
> +		{
> +			if (!tcp_prequeue(sk, skb))
> +				ret = tcp_v6_do_rcv(sk, skb);
> +		}
>  	} else

ow, my eyes!

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 0/8] Intel I/O Acceleration Technology (I/OAT)
  2006-03-03 22:39   ` Chris Leech
  2006-03-03 22:45     ` Jeff Garzik
  2006-03-04 11:35     ` Evgeniy Polyakov
@ 2006-03-05  8:09     ` Andrew Morton
  2006-03-05  9:02       ` Discourage duplicate symbols in the kernel? [Was: Intel I/O Acc...] Sam Ravnborg
  2 siblings, 1 reply; 60+ messages in thread
From: Andrew Morton @ 2006-03-05  8:09 UTC (permalink / raw)
  To: chris.leech; +Cc: christopher.leech, jeff, linux-kernel, netdev

"Chris Leech" <christopher.leech@intel.com> wrote:
>
> > Patch #2 didn't make it.  Too big for the list?
> 
>  Could be, it's the largest of the series.  I've attached the gzipped
>  patch.  I can try to split this up in the future.
>
> ..
>
> [I/OAT] Driver for the Intel(R) I/OAT DMA engine
> Adds a new ioatdma driver
> 
> ...
> +struct cb_pci_pmcap_register {
> +	uint32_t	capid:8;	/* RO: 01h */
> +	uint32_t	nxtcapptr:8;
> +	uint32_t	version:3;	/* RO: 010b */
> +	uint32_t	pmeclk:1;	/* RO: 0b */
> +	uint32_t	reserved:1;	/* RV: 0b */
> +	uint32_t	dsi:1;		/* RO: 0b */
> +	uint32_t	aux_current:3;	/* RO: 000b */
> +	uint32_t	d1_support:1;	/* RO: 0b */
> +	uint32_t	d2_support:1;	/* RO: 0b */
> +	uint32_t	pme_support:5;	/* RO: 11001b */
> +};

This maps onto hardware registers?  No big-endian plans in Intel's future? ;)

I have a vague feeling that gcc changed its layout of bitfields many years
ago.  I guess we're fairly safe against that.  Presumably gcc and icc use the
same layout?

Still.  It's a bit of a concern, but I guess we can worry about that if it
happens.

> +
> +static inline u8 read_reg8(struct cb_device *device, unsigned int offset)
> +{
> +	return readb(device->reg_base + offset);
> +}

These are fairly generic-sounding names.  In fact the as-yet-unmerged tiacx
wireless driver is already using these, private to
drivers/net/wireless/tiacx/pci.c.

> +static int enumerate_dma_channels(struct cb_device *device)
> +{
> +	u8 xfercap_scale;
> +	u32 xfercap;
> +	int i;
> +	struct cb_dma_chan *cb_chan;
> +
> +	device->common.chancnt = read_reg8(device, CB_CHANCNT_OFFSET);
> +	xfercap_scale = read_reg8(device, CB_XFERCAP_OFFSET);
> +	xfercap = (xfercap_scale == 0 ? ~0UL : (1 << xfercap_scale));

I recommend using just "-1" to represent the all-ones pattern.  It simply
works, in all situations.

Where you _did_ want the UL was after that "1".
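
i.e., something like:

	xfercap = (xfercap_scale == 0 ? -1 : (1UL << xfercap_scale));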

> +	for (i = 0; i < device->common.chancnt; i++) {
> +		cb_chan = kzalloc(sizeof(*cb_chan), GFP_KERNEL);
> +		if (!cb_chan)
> +			return -ENOMEM;

memory leak?

> +		cb_chan->device = device;
> +		cb_chan->reg_base = device->reg_base + (0x80 * (i + 1));
> +		cb_chan->xfercap = xfercap;
> +		spin_lock_init(&cb_chan->cleanup_lock);
> +		spin_lock_init(&cb_chan->desc_lock);
> +		INIT_LIST_HEAD(&cb_chan->free_desc);
> +		INIT_LIST_HEAD(&cb_chan->used_desc);
> +		/* This should be made common somewhere in dmaengine.c */
> +		cb_chan->common.device = &device->common;
> +		cb_chan->common.client = NULL;
> +		list_add_tail(&cb_chan->common.device_node, &device->common.channels);

No locking needed for that list?

> +static struct cb_desc_sw * cb_dma_alloc_descriptor(struct cb_dma_chan *cb_chan)

There's a mix of styles here.  I don't think the space after the asterisk does
anything useful, and it could be argued that it's incorrect (or misleading)
wrt C declaration semantics.

> +{
> +	struct cb_dma_descriptor *desc;

What do all these "cb"'s stand for, anyway?

> +	struct cb_desc_sw *desc_sw;
> +	struct cb_device *cb_device = to_cb_device(cb_chan->common.device);
> +	dma_addr_t phys;
> +
> +	desc = pci_pool_alloc(cb_device->dma_pool, GFP_ATOMIC, &phys);
> +	if (!desc)
> +		return NULL;
> +
> +	desc_sw = kzalloc(sizeof(*desc_sw), GFP_ATOMIC);

GFP_ATOMIC is to be avoided if at all possible.  It stresses the memory system
and can easily fail under load.

From my reading, two of the callers could trivially call this function outside
spin_lock_bh() and the third could perhaps do so with a little work.  You
could at least fix up two of those callers, and pass in the gfp_flags.


<wonders why the heck dma_pool_alloc() uses SLAB_ATOMIC when the caller's
passing in the gfp_flags>

> +/* returns the actual number of allocated descriptors */
> +static int cb_dma_alloc_chan_resources(struct dma_chan *chan)
> +{
> ...
> +	/* Allocate descriptors */
> +	spin_lock_bh(&cb_chan->desc_lock);
> +	for (i = 0; i < INITIAL_CB_DESC_COUNT; i++) {
> +		desc = cb_dma_alloc_descriptor(cb_chan);
> +		if (!desc) {
> +			printk(KERN_ERR "CB: Only %d initial descriptors\n", i);
> +			break;
> +		}
> +		list_add_tail(&desc->node, &cb_chan->free_desc);
> +	}
> +	spin_unlock_bh(&cb_chan->desc_lock);

Here's one such caller.

> +
> +static void cb_dma_free_chan_resources(struct dma_chan *chan)
> +{
> +	struct cb_dma_chan *cb_chan = to_cb_chan(chan);
> +	struct cb_device *cb_device = to_cb_device(chan->device);
> +	struct cb_desc_sw *desc, *_desc;
> +	u16 chanctrl;
> +	int in_use_descs = 0;
> +
> +	cb_dma_memcpy_cleanup(cb_chan);
> +
> +	chan_write_reg8(cb_chan, CB_CHANCMD_OFFSET, CB_CHANCMD_RESET);
> +
> +	spin_lock_bh(&cb_chan->desc_lock);
> +	list_for_each_entry_safe(desc, _desc, &cb_chan->used_desc, node) {
> +		in_use_descs++;
> +		list_del(&desc->node);
> +		pci_pool_free(cb_device->dma_pool, desc->hw, desc->phys);
> +		kfree(desc);
> +	}
> +	list_for_each_entry_safe(desc, _desc, &cb_chan->free_desc, node) {
> +		list_del(&desc->node);
> +		pci_pool_free(cb_device->dma_pool, desc->hw, desc->phys);
> +		kfree(desc);
> +	}
> +	spin_unlock_bh(&cb_chan->desc_lock);

Do we actually need the lock there?  If we're freeing everything which it
protects anyway?

> +
> +static void cb_dma_memcpy_cleanup(struct cb_dma_chan *chan)
> +{
> +	unsigned long phys_complete;
> +	struct cb_desc_sw *desc, *_desc;
> +	dma_cookie_t cookie = 0;
> +
> +	prefetch(chan->completion_virt);
> +
> +	if (!spin_trylock(&chan->cleanup_lock))
> +		return;

What's going on here?  Lock ranking problems?  spin_trylock() in
non-infrastructural code is a bit of a red flag.

Whatever the reason, it needs a comment in there please.  That comment should
also explain why simply bailing out is acceptable.
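
A hypothetical example of the sort of comment meant, assuming the cleanup
really is best-effort here:

	/* Best-effort cleanup: if another context already holds
	 * cleanup_lock it is doing this work for us, and anything we
	 * skip now will be reaped on the next completion pass.
	 */
	if (!spin_trylock(&chan->cleanup_lock))
		return;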

> +
> +static irqreturn_t cb_do_interrupt(int irq, void *data, struct pt_regs *regs)
> +{
> +	struct cb_device *instance = data;
> +	unsigned long attnstatus;
> +	u8 intrctrl;
> +
> +	intrctrl = read_reg8(instance, CB_INTRCTRL_OFFSET);
> +
> +	if (!(intrctrl & CB_INTRCTRL_MASTER_INT_EN)) {
> +		return IRQ_NONE;
> +	}

braces.

> +	attnstatus = (unsigned long) read_reg32(instance, CB_ATTNSTATUS_OFFSET);

Unneeded cast.

> +static void cb_start_null_desc(struct cb_dma_chan *cb_chan)
> +{
> +	struct cb_desc_sw *desc;
> +
> +	spin_lock_bh(&cb_chan->desc_lock);
> +
> +	if (!list_empty(&cb_chan->free_desc)) {
> +		desc = to_cb_desc(cb_chan->free_desc.next);
> +		list_del(&desc->node);
> +	} else {
> +		/* try to get another desc */
> +		desc = cb_dma_alloc_descriptor(cb_chan);
> +		/* will this ever happen? */
> +		BUG_ON(!desc);
> +	}
> +
> +	desc->hw->ctl = CB_DMA_DESCRIPTOR_NUL;
> +	desc->hw->next = 0;
> +
> +	list_add_tail(&desc->node, &cb_chan->used_desc);
> +
> +#if (BITS_PER_LONG == 64)
> +	chan_write_reg64(cb_chan, CB_CHAINADDR_OFFSET, desc->phys);
> +#else
> +	chan_write_reg32(cb_chan, CB_CHAINADDR_OFFSET_LOW, (u32) desc->phys);
> +	chan_write_reg32(cb_chan, CB_CHAINADDR_OFFSET_HIGH, 0);
> +#endif
> +	chan_write_reg8(cb_chan, CB_CHANCMD_OFFSET, CB_CHANCMD_START);
> +
> +	spin_unlock_bh(&cb_chan->desc_lock);
> +}

Can the chan_write*() calls be moved outside the locked region?

> +/*
> + * Perform a CB transaction to verify the HW works.
> + */

Damn, I wish I knew what CB meant.

> +#define CB_TEST_SIZE 2000
> +
> +static int cb_self_test(struct cb_device *device)
> +{
> +	int i;
> +	u8 *src;
> +	u8 *dest;
> +	struct dma_chan *dma_chan;
> +	dma_cookie_t cookie;
> +	int err = 0;
> +
> +	src = kzalloc(sizeof(u8) * CB_TEST_SIZE, SLAB_KERNEL);
> +	if (!src)
> +		return -ENOMEM;
> +	dest = kzalloc(sizeof(u8) * CB_TEST_SIZE, SLAB_KERNEL);
> +	if (!dest) {
> +		kfree(src);
> +		return -ENOMEM;
> +	}
> +
> +	/* Fill in src buffer */
> +	for (i = 0; i < CB_TEST_SIZE; i++)
> +		src[i] = (u8)i;

memset?

> +	/* Start copy, using first DMA channel */
> +	dma_chan = container_of(device->common.channels.next, struct dma_chan, device_node);
> +
> +	cb_dma_alloc_chan_resources(dma_chan);

cb_dma_alloc_chan_resources() can fail.

> +	cookie = cb_dma_memcpy_buf_to_buf(dma_chan, dest, src, CB_TEST_SIZE);
> +	cb_dma_memcpy_issue_pending(dma_chan);
> +
> +	udelay(1000);

msleep(1) would be preferred.

> +static int __devinit cb_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
> +{
> +	int err;
> +	unsigned long mmio_start, mmio_len;
> +	void *reg_base;
> +	struct cb_device *device;
> +
> +	err = pci_enable_device(pdev);
> +	if (err)
> +		goto err_enable_device;
> +
> +	err = pci_set_dma_mask(pdev, DMA_64BIT_MASK);
> +	if (err)
> +		err = pci_set_dma_mask(pdev, DMA_32BIT_MASK);
> +	if (err)
> +		goto err_set_dma_mask;
> +
> +	err = pci_request_regions(pdev, cb_pci_drv.name);
> +	if (err)
> +		goto err_request_regions;
> +
> +	mmio_start = pci_resource_start(pdev, 0);
> +	mmio_len = pci_resource_len(pdev, 0);
> +
> +	reg_base = ioremap(mmio_start, mmio_len);
> +	if (!reg_base) {
> +		err = -ENOMEM;
> +		goto err_ioremap;
> +	}
> +
> +	device = kzalloc(sizeof(*device), GFP_KERNEL);
> +	if (!device) {
> +		err = -ENOMEM;
> +		goto err_kzalloc;
> +	}
> +
> +	/* DMA coherent memory pool for DMA descriptor allocations */
> +	device->dma_pool = pci_pool_create("dma_desc_pool", pdev,
> +		sizeof(struct cb_dma_descriptor), 64, 0);
> +	if (!device->dma_pool) {
> +		err = -ENOMEM;
> +		goto err_dma_pool;
> +	}
> +
> +	device->completion_pool = pci_pool_create("completion_pool", pdev, sizeof(u64), SMP_CACHE_BYTES, SMP_CACHE_BYTES);
> +	if (!device->completion_pool) {
> +		err = -ENOMEM;
> +		goto err_completion_pool;
> +	}
> +
> +	device->pdev = pdev;
> +	pci_set_drvdata(pdev, device);
> +#ifdef CONFIG_PCI_MSI
> +	if (pci_enable_msi(pdev) == 0) {
> +		device->msi = 1;
> +	} else {
> +		device->msi = 0;
> +	}
> +#endif
> +	err = request_irq(pdev->irq, &cb_do_interrupt, SA_SHIRQ, "ioat",
> +		device);
> +	if (err)
> +		goto err_irq;
> +
> +	device->reg_base = reg_base;
> +
> +	write_reg8(device, CB_INTRCTRL_OFFSET, CB_INTRCTRL_MASTER_INT_EN);
> +	pci_set_master(pdev);
> +
> +	INIT_LIST_HEAD(&device->common.channels);
> +	enumerate_dma_channels(device);

enumerate_dma_channels() can fail.

> +	device->common.device_alloc_chan_resources = cb_dma_alloc_chan_resources;
> +	device->common.device_free_chan_resources = cb_dma_free_chan_resources;
> +	device->common.device_memcpy_buf_to_buf = cb_dma_memcpy_buf_to_buf;
> +	device->common.device_memcpy_buf_to_pg = cb_dma_memcpy_buf_to_pg;
> +	device->common.device_memcpy_pg_to_pg = cb_dma_memcpy_pg_to_pg;
> +	device->common.device_memcpy_complete = cb_dma_is_complete;
> +	device->common.device_memcpy_issue_pending = cb_dma_memcpy_issue_pending;
> +	printk(KERN_INFO "Intel(R) I/OAT DMA Engine found, %d channels\n",
> +		device->common.chancnt);
> +
> +	if ((err = cb_self_test(device)))
> +		goto err_self_test;
> +
> +	dma_async_device_register(&device->common);
> +
> +	return 0;
> +
> +err_self_test:
> +err_irq:
> +	pci_pool_destroy(device->completion_pool);
> +err_completion_pool:
> +	pci_pool_destroy(device->dma_pool);
> +err_dma_pool:
> +	kfree(device);
> +err_kzalloc:
> +	iounmap(reg_base);
> +err_ioremap:
> +	pci_release_regions(pdev);
> +err_request_regions:
> +err_set_dma_mask:

You might want a pci_disable_device() in here.

> +err_enable_device:
> +	return err;
> +}
> +
> +static void __devexit cb_remove(struct pci_dev *pdev)
> +{
> +	struct cb_device *device;
> +
> +	device = pci_get_drvdata(pdev);
> +	dma_async_device_unregister(&device->common);

pci_disable_device()?

> +	free_irq(device->pdev->irq, device);
> +#ifdef CONFIG_PCI_MSI
> +	if (device->msi)
> +		pci_disable_msi(device->pdev);
> +#endif
> +	pci_pool_destroy(device->dma_pool);
> +	pci_pool_destroy(device->completion_pool);
> +	iounmap(device->reg_base);
> +	pci_release_regions(pdev);
> +	kfree(device);
> +}
> +
> +/* MODULE API */
> +MODULE_VERSION("1.0");
> +MODULE_LICENSE("GPL");
> +MODULE_AUTHOR("Intel Corporation");
> +
> +static int __init cb_init_module(void)
> +{
> +	/* it's currently unsafe to unload this module */
> +	/* if forced, worst case is that rmmod hangs */

How come?

> +	if (THIS_MODULE != NULL)
> +		THIS_MODULE->unsafe = 1;
> +
> +	return pci_module_init(&cb_pci_drv);
> +}
> +



> +#define CB_LOW_COMPLETION_MASK		0xffffffc0
> +
> +extern struct list_head dma_device_list;
> +extern struct list_head dma_client_list;

It's strange to see extern decls for lists, but no decl for their lock.  A
comment might help.

> +struct cb_dma_chan {
> +
> +	void *reg_base;
> +
> +	dma_cookie_t completed_cookie;
> +	unsigned long last_completion;
> +
> +	u32 xfercap;	/* XFERCAP register value expanded out */
> +
> +	spinlock_t cleanup_lock;
> +	spinlock_t desc_lock;
> +	struct list_head free_desc;
> +	struct list_head used_desc;
> +
> +	int pending;
> +
> +	struct cb_device *device;
> +	struct dma_chan common;
> +
> +	dma_addr_t completion_addr;
> +	union {
> +		u64 full; /* HW completion writeback */
> +		struct {
> +			u32 low;
> +			u32 high;
> +		};
> +	} *completion_virt;
> +};

Again, is it safe to assume that these parts will never be present in
big-endian machines?



^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 8/8] [I/OAT] TCP recv offload to I/OAT
  2006-03-03 21:42 ` [PATCH 8/8] [I/OAT] TCP recv offload to I/OAT Chris Leech
                     ` (2 preceding siblings ...)
  2006-03-05  7:30   ` Andrew Morton
@ 2006-03-05  8:45   ` Andrew Morton
  2006-03-05 10:27     ` David S. Miller
  2006-03-06 19:36     ` Chris Leech
  3 siblings, 2 replies; 60+ messages in thread
From: Andrew Morton @ 2006-03-05  8:45 UTC (permalink / raw)
  To: Chris Leech; +Cc: linux-kernel, netdev

Chris Leech <christopher.leech@intel.com> wrote:
>
> +#ifdef CONFIG_NET_DMA
>  +	tp->ucopy.dma_chan = NULL;
>  +	if ((len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) && !sysctl_tcp_low_latency && __get_cpu_var(softnet_data.net_dma))
>  +		dma_lock_iovec_pages(msg->msg_iov, len, &tp->ucopy.locked_list);
>  +#endif

The __get_cpu_var() here will run smp_processor_id() from preemptible
context.  You'll get a big warning if the correct debug options are set.

The reason for this is that preemption could cause this code to hop between
CPUs.

Please always test code with all debug options enabled and with full kernel
preemption.
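
For comparison, the preempt-safe form of that check looks roughly like this
(a sketch only; softnet_data.net_dma comes from the quoted hunk, everything
else is illustrative):

	struct dma_chan *chan;

	chan = get_cpu_var(softnet_data).net_dma;	/* disables preemption */
	/* ... decide whether any offload hardware is present ... */
	put_cpu_var(softnet_data);			/* re-enables preemption */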


^ permalink raw reply	[flat|nested] 60+ messages in thread

* Discourage duplicate symbols in the kernel? [Was: Intel I/O Acc...]
  2006-03-05  8:09     ` Andrew Morton
@ 2006-03-05  9:02       ` Sam Ravnborg
  2006-03-05  9:18         ` Andrew Morton
  0 siblings, 1 reply; 60+ messages in thread
From: Sam Ravnborg @ 2006-03-05  9:02 UTC (permalink / raw)
  To: Andrew Morton; +Cc: chris.leech, christopher.leech, jeff, linux-kernel, netdev

On Sun, Mar 05, 2006 at 12:09:33AM -0800, Andrew Morton wrote:
> > +
> > +static inline u8 read_reg8(struct cb_device *device, unsigned int offset)
> > +{
> > +	return readb(device->reg_base + offset);
> > +}
> 
> These are fairly generic-sounding names.  In fact the as-yet-unmerged tiacx
> wireless driver is already using these, privately to
> drivers/net/wireless/tiacx/pci.c.

Do we in general discourage duplicate symbols even if they are static?

[ppc64, allmodconfig]

$> nm vmlinux | fgrep ' t ' | awk '{print $3}' | sort | uniq -dc
      2 .add_bridge
      2 .base_probe
      2 .c_next
      2 .c_start
      2 .c_stop
      3 .cpu_callback
      2 .default_open
      2 .default_read_file
      2 .default_write_file
      2 .dev_ifsioc
      2 .do_open
      4 .dst_output
      2 .dump_seek
      2 .dump_write
      2 .elf_core_dump
      2 .elf_map
      2 .exact_lock
      2 .exact_match
      2 .exit_elf_binfmt
      2 .fill_note
      2 .fill_prstatus
      2 .fillonedir
      2 .fini
      2 .fixup_one_level_bus_range
      5 .init
      8 .init_once
      3 .iommu_bus_setup_null
      3 .iommu_dev_setup_null
      2 .klist_devices_get
      2 .klist_devices_put
      2 .load_elf_binary
      2 .load_elf_interp
      2 .load_elf_library
      3 .m_next
      3 .m_start
      3 .m_stop
      2 .maydump
      3 .modalias_show
      2 .next_device
      3 .notesize
      2 .padzero
      2 .raw_ioctl
      2 .s_next
      2 .s_show
      2 .s_start
      2 .s_stop
      2 .seq_next
      2 .seq_show
      2 .seq_start
      2 .seq_stop
      2 .set_brk
      2 .setkey
      2 .state_show
      2 .state_store
      2 .store_uevent
      2 .u3_ht_cfg_access
      2 .u3_ht_read_config
      2 .u3_ht_write_config
      2 .writenote
      3 __initcall_init
      2 __setup_netdev_boot_setup
      2 __setup_str_netdev_boot_setup

If I did a make allyesconfig the result looks much more scary.

	Sam

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: Discourage duplicate symbols in the kernel? [Was: Intel I/O Acc...]
  2006-03-05  9:02       ` Discourage duplicate symbols in the kernel? [Was: Intel I/O Acc...] Sam Ravnborg
@ 2006-03-05  9:18         ` Andrew Morton
  2006-03-06 19:56           ` Chris Leech
  0 siblings, 1 reply; 60+ messages in thread
From: Andrew Morton @ 2006-03-05  9:18 UTC (permalink / raw)
  To: Sam Ravnborg; +Cc: chris.leech, christopher.leech, jeff, linux-kernel, netdev

Sam Ravnborg <sam@ravnborg.org> wrote:
>
> On Sun, Mar 05, 2006 at 12:09:33AM -0800, Andrew Morton wrote:
>  > > +
>  > > +static inline u8 read_reg8(struct cb_device *device, unsigned int offset)
>  > > +{
>  > > +	return readb(device->reg_base + offset);
>  > > +}
>  > 
>  > These are fairly generic-sounding names.  In fact the as-yet-unmerged tiacx
>  > wireless driver is already using these, privately to
>  > drivers/net/wireless/tiacx/pci.c.
> 
>  Do we in general discourage duplicate symbols even if they are static?

Well, it's a bit irritating that it confuses ctags.  But in this case, one
set is in a header file, so the risk of collisions is much increased.


^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 8/8] [I/OAT] TCP recv offload to I/OAT
  2006-03-05  8:45   ` Andrew Morton
@ 2006-03-05 10:27     ` David S. Miller
  2006-03-06 19:36     ` Chris Leech
  1 sibling, 0 replies; 60+ messages in thread
From: David S. Miller @ 2006-03-05 10:27 UTC (permalink / raw)
  To: akpm; +Cc: christopher.leech, linux-kernel, netdev

From: Andrew Morton <akpm@osdl.org>
Date: Sun, 5 Mar 2006 00:45:34 -0800

> The __get_cpu_var() here will run smp_processor_id() from preemptible
> context.  You'll get a big warning if the correct debug options are set.
> 
> The reason for this is that preemption could cause this code to hop between
> CPUs.
> 
> Please always test code with all debug options enabled and with full kernel
> preemption.

To be fair that warning doesn't trigger on some platforms, such as
sparc64 where the __get_cpu_var() implementation simply takes the
value from a fixed cpu register and doesn't do the debugging check.

Sparc64 should add the check when debugging options are enabled, for
sure, but the point is that it may not entirely be the tester's fault.
:-)

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 0/8] Intel I/O Acceleration Technology (I/OAT)
  2006-03-04 22:16       ` David S. Miller
@ 2006-03-05 13:45         ` Jan Engelhardt
  2006-03-05 13:55           ` Arjan van de Ven
  2006-03-05 16:14         ` Matthieu CASTET
  1 sibling, 1 reply; 60+ messages in thread
From: Jan Engelhardt @ 2006-03-05 13:45 UTC (permalink / raw)
  To: David S. Miller; +Cc: gene.heskett, gene.heskett, linux-kernel

>> >> Does this buy the normal standard desktop user anything?
>> >Absolutely, it optimizes end-node performance.
>> Is this quantifiable?, and does it only apply to Intel?
>It applies to whoever has a DMA engine in their computer.
>
How do I find out?


Jan Engelhardt
-- 

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 0/8] Intel I/O Acceleration Technology (I/OAT)
  2006-03-05 13:45         ` Jan Engelhardt
@ 2006-03-05 13:55           ` Arjan van de Ven
  0 siblings, 0 replies; 60+ messages in thread
From: Arjan van de Ven @ 2006-03-05 13:55 UTC (permalink / raw)
  To: Jan Engelhardt; +Cc: David S. Miller, gene.heskett, gene.heskett, linux-kernel

On Sun, 2006-03-05 at 14:45 +0100, Jan Engelhardt wrote:
> >> >> Does this buy the normal standard desktop user anything?
> >> >Absolutely, it optimizes end-node performance.
> >> Is this quantifiable?, and does it only apply to Intel?
> >It applies to whoever has a DMA engine in their computer.
> >
> How do I find out?


if you have an off-the-shelf standard PC, today you don't have one of
those.




^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 0/8] Intel I/O Acceleration Technology (I/OAT)
  2006-03-04 22:16       ` David S. Miller
  2006-03-05 13:45         ` Jan Engelhardt
@ 2006-03-05 16:14         ` Matthieu CASTET
  2006-03-05 16:30           ` Jeff Garzik
  2006-03-06 19:24           ` Chris Leech
  1 sibling, 2 replies; 60+ messages in thread
From: Matthieu CASTET @ 2006-03-05 16:14 UTC (permalink / raw)
  To: linux-kernel

Hi,
Le Sat, 04 Mar 2006 14:16:43 -0800, David S. Miller a écrit :

> From: Gene Heskett <gene.heskett@verizon.net>
> Date: Sat, 04 Mar 2006 17:05:41 -0500
> 
>> On Saturday 04 March 2006 16:41, David S. Miller wrote:
>> >From: Jan Engelhardt <jengelh@linux01.gwdg.de>
>> >Date: Sat, 4 Mar 2006 19:46:22 +0100 (MET)
>> >
>> >> Does this buy the normal standard desktop user anything?
>> >
>> >Absolutely, it optimizes end-node performance.
>> 
>> Is this quantifiable?, and does it only apply to Intel?
> 
> It applies to whoever has a DMA engine in their computer.
> 
But do we need a special driver?
The IOAT driver from Intel seems to expect a PCI device (0x8086 0x1a38),
and common x86 computers have their DMA in the LPC/ISA bridge.


Matthieu


^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 0/8] Intel I/O Acceleration Technology (I/OAT)
  2006-03-05 16:14         ` Matthieu CASTET
@ 2006-03-05 16:30           ` Jeff Garzik
  2006-03-06 19:24           ` Chris Leech
  1 sibling, 0 replies; 60+ messages in thread
From: Jeff Garzik @ 2006-03-05 16:30 UTC (permalink / raw)
  To: Matthieu CASTET; +Cc: linux-kernel

Matthieu CASTET wrote:
> But we need a special driver ?
> The IOAT driver from intel seems to expect a pci device (0x8086 0x1a38)
> and the common x86 computer have their dma in lpc/isa bridge.

The common x86 computer does not have -asynchronous- DMA.

	Jeff



^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 0/8] Intel I/O Acceleration Technology (I/OAT)
  2006-03-05  1:43     ` Evgeniy Polyakov
  2006-03-05  2:08       ` David S. Miller
@ 2006-03-06 17:44       ` Ingo Oeser
  2006-03-07  7:44         ` Evgeniy Polyakov
  1 sibling, 1 reply; 60+ messages in thread
From: Ingo Oeser @ 2006-03-06 17:44 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: David S. Miller, jengelh, christopher.leech, linux-kernel, netdev

Evgeniy Polyakov wrote:
> On Sat, Mar 04, 2006 at 01:41:44PM -0800, David S. Miller (davem@davemloft.net) wrote:
> > From: Jan Engelhardt <jengelh@linux01.gwdg.de>
> > Date: Sat, 4 Mar 2006 19:46:22 +0100 (MET)
> > 
> > > Does this buy the normal standard desktop user anything?
> > 
> > Absolutely, it optimizes end-node performance.
> 
> It really depends on how it is used.
> According to investigation made for kevent based FS AIO reading,
> get_user_pages() performance graph looks like a sqrt() function

Hmm, so I should resurrect my user page table walker abstraction?

There I would hand each page to a "recording" function, which
can drop the page from the collection or coalesce it in the collector
if your scatter gather implementation allows it.

Regards

Ingo Oeser

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 0/8] Intel I/O Acceleration Technology (I/OAT)
  2006-03-04 22:05     ` Gene Heskett
  2006-03-04 22:16       ` David S. Miller
@ 2006-03-06 19:15       ` Chris Leech
  1 sibling, 0 replies; 60+ messages in thread
From: Chris Leech @ 2006-03-06 19:15 UTC (permalink / raw)
  To: gene.heskett; +Cc: linux-kernel

> >> Does this buy the normal standard desktop user anything?
> >
> >Absolutely, it optimizes end-node performance.
>
> Is this quantifiable?, and does it only apply to Intel?

What we've been developing for is a device integrated into Intel's
Enterprise South Bridge 2 (ESB2), so it's a feature of Intel server
platforms.  But the networking changes are written so that you could
drop in a driver if similar functionality existed on other
architectures.

I'll look into what performance data I can share, I have to ask the
marketing folks.

- Chris

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 0/8] Intel I/O Acceleration Technology (I/OAT)
  2006-03-05 16:14         ` Matthieu CASTET
  2006-03-05 16:30           ` Jeff Garzik
@ 2006-03-06 19:24           ` Chris Leech
  1 sibling, 0 replies; 60+ messages in thread
From: Chris Leech @ 2006-03-06 19:24 UTC (permalink / raw)
  To: Matthieu CASTET; +Cc: linux-kernel

> But do we need a special driver?
> The IOAT driver from Intel seems to expect a PCI device (0x8086 0x1a38),
> and common x86 computers have their DMA in the LPC/ISA bridge.

It's really about bringing the concept of a generic DMA engine up to
date with modern system design in order to make it useful for I/O
offload.  This is a new descriptor-programmed memory copy engine that
shows up as a PCI Express device integrated into the MCH.

- Chris

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 8/8] [I/OAT] TCP recv offload to I/OAT
  2006-03-04 23:18   ` Greg KH
@ 2006-03-06 19:28     ` Chris Leech
  0 siblings, 0 replies; 60+ messages in thread
From: Chris Leech @ 2006-03-06 19:28 UTC (permalink / raw)
  To: Greg KH; +Cc: linux-kernel, netdev

> #ifdef is not needed here (try not to put #ifdef in .c files.)  I think
> a few of your other usages of #ifdef in this file can also be removed
> with judicious use of inline functions in a .h file.

ACK on all the ifdef comments.  I may have gone a little ifdef crazy
making sure I could get to a zero impact state with these patches
applied but CONFIG_NET_DMA turned off.  I'll get these cleaned up.

- Chris

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 8/8] [I/OAT] TCP recv offload to I/OAT
  2006-03-05  8:45   ` Andrew Morton
  2006-03-05 10:27     ` David S. Miller
@ 2006-03-06 19:36     ` Chris Leech
  1 sibling, 0 replies; 60+ messages in thread
From: Chris Leech @ 2006-03-06 19:36 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel, netdev

On 3/5/06, Andrew Morton <akpm@osdl.org> wrote:
> Chris Leech <christopher.leech@intel.com> wrote:
> >
> > +#ifdef CONFIG_NET_DMA
> >  +    tp->ucopy.dma_chan = NULL;
> >  +    if ((len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) && !sysctl_tcp_low_latency && __get_cpu_var(softnet_data.net_dma))
> >  +            dma_lock_iovec_pages(msg->msg_iov, len, &tp->ucopy.locked_list);
> >  +#endif
>
> The __get_cpu_var() here will run smp_processor_id() from preemptible
> context.  You'll get a big warning if the correct debug options are set.
>
> The reason for this is that preemption could cause this code to hop between
> CPUs.

I've been playing with different models of where to select which DMA
channel to use in order to reduce cache thrash and lock contention in
the driver.  It's not a clean per-cpu issue because per I/O there are
potentially operations happening in both the process syscall and the
netrx softirq context.

Right now the code delays selection of a DMA channel until the first
offload copy is ready to go, so the __get_cpu_var() you point out is
just checking to see if any hardware exists for I/OAT at this point
before doing the page pinning.  Before anything is done with the
channel the per-cpu pointer is re-read safely with preemption disabled
and a reference count is incremented.
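
Roughly, when the channel is actually taken it looks like this (a sketch;
dma_chan_get() here stands in for whatever reference-count helper ends up
being used, it is not necessarily the patch's name):

	/* later, when the first offload copy is ready to go */
	preempt_disable();
	tp->ucopy.dma_chan = __get_cpu_var(softnet_data.net_dma);
	if (tp->ucopy.dma_chan)
		dma_chan_get(tp->ucopy.dma_chan);	/* hold a ref while in use */
	preempt_enable();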

 - Chris

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 1/8] [I/OAT] DMA memcpy subsystem
  2006-03-04  1:40   ` David S. Miller
@ 2006-03-06 19:39     ` Chris Leech
  0 siblings, 0 replies; 60+ messages in thread
From: Chris Leech @ 2006-03-06 19:39 UTC (permalink / raw)
  To: David S. Miller; +Cc: linux-kernel, netdev

On 3/3/06, David S. Miller <davem@davemloft.net> wrote:
> > +static spinlock_t dma_list_lock;
>
> Please use DEFINE_SPINLOCK().
>
> > +static void dma_chan_free_rcu(struct rcu_head *rcu) {
>
> Newline before the brace please.
>
> > +static void dma_async_device_cleanup(struct kref *kref) {
>
> Newline before the brace please.
>
> > +struct dma_chan_percpu
> > +{
>
> Left brace on the same line as "struct dma_chan_percpu" please.
>
> > +struct dma_chan
> > +{
>
> Similarly.
>
> Otherwise this patch looks mostly ok.

Thanks Dave,

I'll apply these and other feedback and get updated patches generated.

- Chris

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 1/8] [I/OAT] DMA memcpy subsystem
  2006-03-04 19:20   ` Benjamin LaHaise
@ 2006-03-06 19:48     ` Chris Leech
  0 siblings, 0 replies; 60+ messages in thread
From: Chris Leech @ 2006-03-06 19:48 UTC (permalink / raw)
  To: Benjamin LaHaise; +Cc: linux-kernel, netdev

On 3/4/06, Benjamin LaHaise <bcrl@kvack.org> wrote:
> On Fri, Mar 03, 2006 at 01:42:20PM -0800, Chris Leech wrote:
> > +void dma_async_device_unregister(struct dma_device* device)
> > +{
> ...
> > +     kref_put(&device->refcount, dma_async_device_cleanup);
> > +     wait_for_completion(&device->done);
> > +}
>
> This looks like a bug: device is dereferenced after it is potentially
> freed.

Actually, this is where the code is waiting to make sure it's safe to
free device.  The release function for the kref completes
device->done.  Each of the device's channels holds a reference to the
device.  When a device is unregistered its channels are removed from
the clients, which hold a reference for each outstanding transaction.
When all the outstanding transactions complete, the channel's kref goes
to 0, and the reference to the device is dropped.  When the device
kref goes to 0 the completion is set, and it's then safe to free the
memory for the device and channel structures.

I have a writeup of the locking and reference counting that I'll
finish and add in as a big comment to the code.
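
The shape of it is roughly this (a sketch of the ordering described above,
not the exact code from the patch):

static void dma_async_device_cleanup(struct kref *kref)
{
	struct dma_device *device =
		container_of(kref, struct dma_device, refcount);

	/* last reference is gone, wake up the unregister path */
	complete(&device->done);
}

void dma_async_device_unregister(struct dma_device *device)
{
	/* ... unhook channels from clients, drop the per-channel refs ... */
	kref_put(&device->refcount, dma_async_device_cleanup);
	wait_for_completion(&device->done);
	/* nothing else can touch device now; it is safe to free */
}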

-Chris

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: Discourage duplicate symbols in the kernel? [Was: Intel I/O Acc...]
  2006-03-05  9:18         ` Andrew Morton
@ 2006-03-06 19:56           ` Chris Leech
  0 siblings, 0 replies; 60+ messages in thread
From: Chris Leech @ 2006-03-06 19:56 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Sam Ravnborg, jeff, linux-kernel, netdev

On 3/5/06, Andrew Morton <akpm@osdl.org> wrote:
> Sam Ravnborg <sam@ravnborg.org> wrote:
> >
> > On Sun, Mar 05, 2006 at 12:09:33AM -0800, Andrew Morton wrote:
> >  > > +
> >  > > +static inline u8 read_reg8(struct cb_device *device, unsigned int offset)
> >  > > +{
> >  > > +        return readb(device->reg_base + offset);
> >  > > +}
> >  >
> >  > These are fairly generic-sounding names.  In fact the as-yet-unmerged tiacx
> >  > wireless driver is already using these, privately to
> >  > drivers/net/wireless/tiacx/pci.c.
> >
> >  Do we in general discourage duplicate symbols even if they are static?
>
> Well, it's a bit irritating that it confuses ctags.  But in this case, one
> set is in a header file so the risk of collisions is much-increased.

They're in a header file that's specific to a single driver, so I
don't see where a conflict would occur.  But I didn't think about
ctags, and these can easily be prefixed so I'll go ahead and change
them.

- Chris

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 0/8] Intel I/O Acceleration Technology (I/OAT)
  2006-03-06 17:44       ` Ingo Oeser
@ 2006-03-07  7:44         ` Evgeniy Polyakov
  2006-03-07  9:43           ` Ingo Oeser
  0 siblings, 1 reply; 60+ messages in thread
From: Evgeniy Polyakov @ 2006-03-07  7:44 UTC (permalink / raw)
  To: Ingo Oeser
  Cc: David S. Miller, jengelh, christopher.leech, linux-kernel, netdev

On Mon, Mar 06, 2006 at 06:44:07PM +0100, Ingo Oeser (netdev@axxeo.de) wrote:
> Evgeniy Polyakov wrote:
> > On Sat, Mar 04, 2006 at 01:41:44PM -0800, David S. Miller (davem@davemloft.net) wrote:
> > > From: Jan Engelhardt <jengelh@linux01.gwdg.de>
> > > Date: Sat, 4 Mar 2006 19:46:22 +0100 (MET)
> > > 
> > > > Does this buy the normal standard desktop user anything?
> > > 
> > > Absolutely, it optimizes end-node performance.
> > 
> > It really depends on how it is used.
> > According to investigation made for kevent based FS AIO reading,
> > get_user_pages() performance graph looks like a sqrt() function
> 
> Hmm, so I should resurrect my user page table walker abstraction?
> 
> There I would hand each page to a "recording" function, which
> can drop the page from the collection or coalesce it in the collector
> if your scatter gather implementation allows it.

It depends on where performance growth is stopped.
From the first glance it does not look like find_extend_vma(),
probably follow_page() fault and thus __handle_mm_fault().
I can not say actually, but if it is true and performance growth is
stopped due to increased number of faults and it's processing, 
your approach will hit this problem too, doesn't it?

> Regards
> 
> Ingo Oeser

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 0/8] Intel I/O Acceleration Technology (I/OAT)
  2006-03-07  7:44         ` Evgeniy Polyakov
@ 2006-03-07  9:43           ` Ingo Oeser
  2006-03-07 10:16             ` Evgeniy Polyakov
  0 siblings, 1 reply; 60+ messages in thread
From: Ingo Oeser @ 2006-03-07  9:43 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: David S. Miller, jengelh, christopher.leech, linux-kernel, netdev

Evgeniy Polyakov wrote:
> On Mon, Mar 06, 2006 at 06:44:07PM +0100, Ingo Oeser (netdev@axxeo.de) wrote:
> > Hmm, so I should resurrect my user page table walker abstraction?
> > 
> > There I would hand each page to a "recording" function, which
> > can drop the page from the collection or coalesce it in the collector
> > if your scatter gather implementation allows it.
> 
> It depends on where performance growth is stopped.
> From the first glance it does not look like find_extend_vma(),
> probably follow_page() fault and thus __handle_mm_fault().
> I can not say actually, but if it is true and performance growth is
> stopped due to increased number of faults and it's processing, 
> your approach will hit this problem too, doesn't it?

My approach reduced the number of loops performed and the amount
of memory needed at the expense of doing more work in the main
loop of get_user_pages. 

This was mitigated for the common case of getting just one page by 
providing a get_one_user_page() function.

The whole problem, why we need such multiple loops is that we have
no common container object for "IO vector + additional data".

So we always do a loop working over the vector returned by 
get_user_pages() all the time. The bigger that vector, 
the bigger the impact.

Maybe sth. as simple as providing get_user_pages() with some offset_of 
and container_of hackery will work these days without the disadvantages 
my old get_user_pages() work had.

The idea is, that you'll provide a vector (like arguments to calloc) and two 
offsets: One for the page to store within the offset and one for the vma 
to store.

If the offset has a special value (e.g MAX_LONG) you don't store there at all.

But if the performance problem really is get_user_pages() itself 
(and not its callers), then my approach won't help at all.


Regards

Ingo Oeser

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 0/8] Intel I/O Acceleration Technology (I/OAT)
  2006-03-07  9:43           ` Ingo Oeser
@ 2006-03-07 10:16             ` Evgeniy Polyakov
  0 siblings, 0 replies; 60+ messages in thread
From: Evgeniy Polyakov @ 2006-03-07 10:16 UTC (permalink / raw)
  To: Ingo Oeser
  Cc: David S. Miller, jengelh, christopher.leech, linux-kernel, netdev

[-- Attachment #1: Type: text/plain, Size: 2527 bytes --]

On Tue, Mar 07, 2006 at 10:43:59AM +0100, Ingo Oeser (netdev@axxeo.de) wrote:
> Evgeniy Polyakov wrote:
> > On Mon, Mar 06, 2006 at 06:44:07PM +0100, Ingo Oeser (netdev@axxeo.de) wrote:
> > > Hmm, so I should resurrect my user page table walker abstraction?
> > > 
> > > There I would hand each page to a "recording" function, which
> > > can drop the page from the collection or coalesce it in the collector
> > > if your scatter gather implementation allows it.
> > 
> > It depends on where performance growth is stopped.
> > From the first glance it does not look like find_extend_vma(),
> > probably follow_page() fault and thus __handle_mm_fault().
> > I can not say actually, but if it is true and performance growth is
> > stopped due to increased number of faults and it's processing, 
> > your approach will hit this problem too, doesn't it?
> 
> My approach reduced the number of loops performed and the amount
> of memory needed at the expense of doing more work in the main
> loop of get_user_pages. 
> 
> This was mitigated for the common case of getting just one page by 
> providing a get_one_user_page() function.
> 
> The whole problem, why we need such multiple loops is that we have
> no common container object for "IO vector + additional data".
> 
> So we always do a loop working over the vector returned by 
> get_user_pages() all the time. The bigger that vector, 
> the bigger the impact.
> 
> Maybe sth. as simple as providing get_user_pages() with some offset_of 
> and container_of hackery will work these days without the disadvantages 
> my old get_user_pages() work had.
> 
> The idea is, that you'll provide a vector (like arguments to calloc) and two 
> offsets: One for the page to store within the offset and one for the vma 
> to store.
> 
> If the offset has a special value (e.g MAX_LONG) you don't store there at all.

You still need to find the VMA in one loop, and run through its (mm_struct's)
pages in a second loop.

> But if the performance problem really is get_user_pages() itself 
> (and not its callers), then my approach won't help at all.

It looks so.
My test pseudocode is as follows:
fget_light();
igrab();
kzalloc(number_of_pages * sizeof(void *));
get_user_pages(number_of_pages);
... undo ...

I've attached two graphs of performance with and without
get_user_pages(), it is get_user_pages.png and kmalloc.png.

The vertical axis is the number of Mbytes per second pushed through the above
code; the horizontal axis is the number of pages in each run.
 
> Regards
> 
> Ingo Oeser

-- 
	Evgeniy Polyakov

[-- Attachment #2: get_user_pages.png --]
[-- Type: image/png, Size: 5498 bytes --]

[-- Attachment #3: kmalloc.png --]
[-- Type: image/png, Size: 5816 bytes --]

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 1/8] [I/OAT] DMA memcpy subsystem
  2006-03-30  8:01               ` Kumar Gala
@ 2006-03-30 18:27                 ` Andrew Grover
  0 siblings, 0 replies; 60+ messages in thread
From: Andrew Grover @ 2006-03-30 18:27 UTC (permalink / raw)
  To: Kumar Gala; +Cc: Chris Leech, linux kernel mailing list, netdev

On 3/30/06, Kumar Gala <galak@kernel.crashing.org> wrote:
> I was under the impression that the DMA engine would provide a "sync"
> CPU-based memcpy (PIO) if a real HW channel wasn't available.  If this is
> left to the client, that's fine.  So how does the client know he
> should use normal memcpy()?

It has to keep track of what DMA channel to use, which it gets when
the channel ADDED callback happens. So it's basically

if (some_client_struct->dma_chan)
    dma_memcpy()
else
    memcpy()

The async memcpy has the added requirement that at some point the
client must verify the copies have been completed, so doing async
memcopies does require more work on the client's part.
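
Spelled out a little more, the client-side pattern is roughly this (a sketch
built on the patch 1 API; the cookie bookkeeping and the client->dma_chan
field are illustrative):

	dma_cookie_t cookie;

	if (client->dma_chan) {
		cookie = dma_async_memcpy_buf_to_buf(client->dma_chan,
						     dest, src, len);
		dma_async_memcpy_issue_pending(client->dma_chan);
		/* ... later, before the destination is consumed ... */
		while (dma_async_memcpy_complete(client->dma_chan, cookie,
						 NULL, NULL) == DMA_IN_PROGRESS)
			cpu_relax();
	} else {
		memcpy(dest, src, len);
	}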

> Sounds good for a start.  Have you given any thoughts on handling
> priorities between clients?
>
> I need to take a look at the latest patches. How would you guys like
> modifications?

Haven't given any thought to priorities yet -- we've been focusing on
getting the 1 client case to perform well. :)

Chris posted a link to this: git://198.78.49.142/~cleech/linux-2.6
branch ioat-2.6.17

So you can post patches against that, or the patches posted here apply
against davem's git tree.

Regards -- Andy

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 1/8] [I/OAT] DMA memcpy subsystem
  2006-03-29 23:05             ` Andrew Grover
@ 2006-03-30  8:01               ` Kumar Gala
  2006-03-30 18:27                 ` Andrew Grover
  0 siblings, 1 reply; 60+ messages in thread
From: Kumar Gala @ 2006-03-30  8:01 UTC (permalink / raw)
  To: Andrew Grover; +Cc: Chris Leech, linux kernel mailing list, netdev


On Mar 29, 2006, at 5:05 PM, Andrew Grover wrote:

> On 3/28/06, Kumar Gala <galak@kernel.crashing.org> wrote:
>> Do you only get callback when a channel is available?
>
> Yes
>
>> How do you
>> decide to provide PIO to the client?
>
> The client is responsible for using any channels it gets, or falling
> back to memcpy() if it doesn't get any. (I don't understand how PIO
> comes into the picture..?)

I was under the impression that the DMA engine would provide a "sync"
CPU-based memcpy (PIO) if a real HW channel wasn't available.  If this is
left to the client, that's fine.  So how does the client know he
should use normal memcpy()?

>> A client should only request multiple channel to handle multiple
>> concurrent operations.
>
> Correct, if there aren't any CPU concurrency issues then 1 channel
> will use the device's full bandwidth (unless some other client has
> acquired the other channels and is using them, of course.)
>
>>> This gets around the problem of DMA clients registering (and  
>>> therefore
>>> not getting) channels simply because they init before the DMA device
>>> is discovered.
>>
>> What do you expect to happen in a system in which the channels are
>> over subscribed?
>>
>> Do you expect the DMA device driver to handle scheduling of channels
>> between multiple clients?
>
> It does the simplest thing that could possibly work right now:
> channels are allocated first come first serve. When there is a need,
> it should be straightforward to allow multiple clients to share DMA
> channels.

Sounds good for a start.  Have you given any thoughts on handling  
priorities between clients?

I need to take a look at the latest patches. How would you guys like  
modifications?

- k

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 1/8] [I/OAT] DMA memcpy subsystem
  2006-03-28 23:03           ` Kumar Gala
@ 2006-03-29 23:05             ` Andrew Grover
  2006-03-30  8:01               ` Kumar Gala
  0 siblings, 1 reply; 60+ messages in thread
From: Andrew Grover @ 2006-03-29 23:05 UTC (permalink / raw)
  To: Kumar Gala; +Cc: Chris Leech, linux kernel mailing list, netdev

On 3/28/06, Kumar Gala <galak@kernel.crashing.org> wrote:
> Do you only get callback when a channel is available?

Yes

> How do you
> decide to provide PIO to the client?

The client is responsible for using any channels it gets, or falling
back to memcpy() if it doesn't get any. (I don't understand how PIO
comes into the picture..?)

> A client should only request multiple channel to handle multiple
> concurrent operations.

Correct, if there aren't any CPU concurrency issues then 1 channel
will use the device's full bandwidth (unless some other client has
acquired the other channels and is using them, of course.)

> > This gets around the problem of DMA clients registering (and therefore
> > not getting) channels simply because they init before the DMA device
> > is discovered.
>
> What do you expect to happen in a system in which the channels are
> over subscribed?
>
> Do you expect the DMA device driver to handle scheduling of channels
> between multiple clients?

It does the simplest thing that could possibly work right now:
channels are allocated first come first serve. When there is a need,
it should be straightforward to allow multiple clients to share DMA
channels.

Regards -- Andy

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 1/8] [I/OAT] DMA memcpy subsystem
  2006-03-28 22:01         ` Andrew Grover
@ 2006-03-28 23:03           ` Kumar Gala
  2006-03-29 23:05             ` Andrew Grover
  0 siblings, 1 reply; 60+ messages in thread
From: Kumar Gala @ 2006-03-28 23:03 UTC (permalink / raw)
  To: Andrew Grover; +Cc: Chris Leech, linux kernel mailing list, netdev


On Mar 28, 2006, at 4:01 PM, Andrew Grover wrote:

> On 3/28/06, Kumar Gala <galak@kernel.crashing.org> wrote:
>
>>>> Also, what do you think about adding an operation type (MEMCPY,  
>>>> XOR,
>>>> CRYPTO_AES, etc).  We can then validate if the operation type
>>>> expected is supported by the devices that exist.
>>>
>>> No objections, but this speculative support doesn't need to be in  
>>> our
>>> initial patchset.
>>
>> I don't consider it speculative.  The patch is for a generic DMA
>> engine interface.  That interface should encompass all users.  I have
>> a security/crypto DMA engine that I'd like to front with the generic
>> DMA interface today.  Also, I believe there is another Intel group
>> with an XOR engine that had a similar concept called ADMA posted a
>> while ago.
>
> Please submit patches then. We will be doing another rev of the I/OAT
> patch very soon, which you will be able to patch against. Or, once the
> patch gets in mainline then we can enhance it. Code in the Linux
> kernel is never "done", and the burden of implementing additional
> functionality falls on those who want it.

I completely understand that.  However, I think putting something  
into mainline that only works or solves the particular problem you  
have is a bad idea.  I'll provide patches for the changes I'd like to  
see.  However, I figured a little discussion on the subject before I
went off and spent time on it was worthwhile.

>> Can you explain what the semantics are.
>>
>> It's been a little while since I posted so my thoughts on the subject
>> are going to take a little while to come back to me :)
>
> Yeah. Basically you register as a DMA client, and say how many DMA
> channels you want. Our net_dma patch for example uses multiple
> channels to help lock contention. Then when channels are available
> (i.e. a DMA device added or another client gives them up) then you get
> a callback. If the channel goes away (i.e. DMA device is removed
> (theoretically possible but practically never happens) or *you* are
> going away and change your request to 0 channels) then you get a
> remove callback.

Do you only get callback when a channel is available?  How do you  
decide to provide PIO to the client?

A client should only request multiple channel to handle multiple  
concurrent operations.

> This gets around the problem of DMA clients registering (and therefore
> not getting) channels simply because they init before the DMA device
> is discovered.

What do you expect to happen in a system in which the channels are  
over subscribed?

Do you expect the DMA device driver to handle scheduling of channels  
between multiple clients?

- kumar


^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 1/8] [I/OAT] DMA memcpy subsystem
  2006-03-28 18:58       ` Kumar Gala
@ 2006-03-28 22:01         ` Andrew Grover
  2006-03-28 23:03           ` Kumar Gala
  0 siblings, 1 reply; 60+ messages in thread
From: Andrew Grover @ 2006-03-28 22:01 UTC (permalink / raw)
  To: Kumar Gala; +Cc: Chris Leech, linux kernel mailing list, netdev

On 3/28/06, Kumar Gala <galak@kernel.crashing.org> wrote:

> >> Also, what do you think about adding an operation type (MEMCPY, XOR,
> >> CRYPTO_AES, etc).  We can then validate if the operation type
> >> expected is supported by the devices that exist.
> >
> > No objections, but this speculative support doesn't need to be in our
> > initial patchset.
>
> I don't consider it speculative.  The patch is for a generic DMA
> engine interface.  That interface should encompass all users.  I have
> a security/crypto DMA engine that I'd like to front with the generic
> DMA interface today.  Also, I believe there is another Intel group
> with an XOR engine that had a similar concept called ADMA posted a
> while ago.

Please submit patches then. We will be doing another rev of the I/OAT
patch very soon, which you will be able to patch against. Or, once the
patch gets in mainline then we can enhance it. Code in the Linux
kernel is never "done", and the burden of implementing additional
functionality falls on those who want it.

> Can you explain what the semantics are.
>
> It's been a little while since I posted so my thoughts on the subject
> are going to take a little while to come back to me :)

Yeah. Basically you register as a DMA client, and say how many DMA
channels you want. Our net_dma patch for example uses multiple
channels to help lock contention. Then when channels are available
(i.e. a DMA device added or another client gives them up) then you get
a callback. If the channel goes away (i.e. DMA device is removed
(theoretically possible but practically never happens) or *you* are
going away and change your request to 0 channels) then you get a
remove callback.

This gets around the problem of DMA clients registering (and therefore
not getting) channels simply because they init before the DMA device
is discovered.

Regards -- Andy

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 1/8] [I/OAT] DMA memcpy subsystem
  2006-03-28 18:44     ` Andrew Grover
@ 2006-03-28 18:58       ` Kumar Gala
  2006-03-28 22:01         ` Andrew Grover
  0 siblings, 1 reply; 60+ messages in thread
From: Kumar Gala @ 2006-03-28 18:58 UTC (permalink / raw)
  To: Andrew Grover; +Cc: Chris Leech, linux kernel mailing list, netdev


On Mar 28, 2006, at 12:44 PM, Andrew Grover wrote:

> On 3/16/06, Kumar Gala <galak@kernel.crashing.org> wrote:
>> It would seem that when a client registers (or shortly thereafter
>> when they call dma_async_client_chan_request()) they would expect to
>> get the number of channels they need by some given time period.
>>
>> For example, let's say a client registers but no DMA device exists.
>> They will never get called to be aware of this condition.
>>
>> I would think most clients would either spin until they have all the
>> channels they need or fall back to a non-async mechanism.
>
> Clients *are* expected to fall back to non-async if they are not given
> channels. The reason it was implemented with callbacks for
> added/removed was that the client may be initializing before the
> channels are enumerated. For example, the net subsystem will ask for
> channels and not get them for a while, until the ioatdma PCI device is
> found and its driver loads. In this scenario, we'd like the net
> subsystem to be given these channels, instead of them going unused.

Fair, I need to think on that a little more.

>> Also, what do you think about adding an operation type (MEMCPY, XOR,
>> CRYPTO_AES, etc).  We can then validate if the operation type
>> expected is supported by the devices that exist.
>
> No objections, but this speculative support doesn't need to be in our
> initial patchset.

I don't consider it speculative.  The patch is for a generic DMA  
engine interface.  That interface should encompass all users.  I have  
a security/crypto DMA engine that I'd like to front with the generic  
DMA interface today.  Also, I believe there is another Intel group  
with an XOR engine that had a similar concept called ADMA posted a  
while ago.

http://marc.theaimsgroup.com/?t=112603120100004&r=1&w=2

>> Shouldn't we also have a dma_async_client_chan_free()?
>
> Well we could just define it to be chan_request(0) but it doesn't seem
> to be needed. Also, the allocation mechanism we have for channels is
> different from alloc/free's semantics, so it may be best to not muddy
> the water in this area.

Can you explain what the semantics are.

It's been a little while since I posted so my thoughts on the subject  
are going to take a little while to come back to me :)

- kumar

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 1/8] [I/OAT] DMA memcpy subsystem
  2006-03-17  7:30   ` Kumar Gala
@ 2006-03-28 18:44     ` Andrew Grover
  2006-03-28 18:58       ` Kumar Gala
  0 siblings, 1 reply; 60+ messages in thread
From: Andrew Grover @ 2006-03-28 18:44 UTC (permalink / raw)
  To: Kumar Gala; +Cc: Chris Leech, linux kernel mailing list, netdev

On 3/16/06, Kumar Gala <galak@kernel.crashing.org> wrote:
> It would seem that when a client registers (or shortly thereafter
> when they call dma_async_client_chan_request()) they would expect to
> get the number of channels they need by some given time period.
>
> For example, let's say a client registers but no DMA device exists.
> They will never get called to be aware of this condition.
>
> I would think most clients would either spin until they have all the
> channels they need or fall back to a non-async mechanism.

Clients *are* expected to fall back to non-async if they are not given
channels. The reason it was implemented with callbacks for
added/removed was that the client may be initializing before the
channels are enumerated. For example, the net subsystem will ask for
channels and not get them for a while, until the ioatdma PCI device is
found and its driver loads. In this scenario, we'd like the net
subsystem to be given these channels, instead of them going unused.
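
Concretely, a client does roughly this (a sketch; the callback signature and
the DMA_RESOURCE_ADDED/DMA_RESOURCE_REMOVED event names are my shorthand for
what the posted API provides, not verbatim from the patch):

static struct dma_chan *my_chan;	/* channel handed to us, if any */

static void my_dma_event(struct dma_client *client, struct dma_chan *chan,
			 enum dma_event event)
{
	/* event names assumed, see note above */
	switch (event) {
	case DMA_RESOURCE_ADDED:
		my_chan = chan;		/* start offloading copies */
		break;
	case DMA_RESOURCE_REMOVED:
		my_chan = NULL;		/* fall back to plain memcpy() */
		break;
	default:
		break;
	}
}

static int my_client_init(void)
{
	struct dma_client *client;

	client = dma_async_client_register(my_dma_event);
	if (!client)
		return -ENOMEM;

	/* ask for one channel; the callback fires whenever one shows up */
	dma_async_client_chan_request(client, 1);
	return 0;
}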

> Also, what do you think about adding an operation type (MEMCPY, XOR,
> CRYPTO_AES, etc).  We can then validate if the operation type
> expected is supported by the devices that exist.

No objections, but this speculative support doesn't need to be in our
initial patchset.

> Shouldn't we also have a dma_async_client_chan_free()?

Well we could just define it to be chan_request(0) but it doesn't seem
to be needed. Also, the allocation mechanism we have for channels is
different from alloc/free's semantics, so it may be best to not muddy
the water in this area.

Regards -- Andy

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 1/8] [I/OAT] DMA memcpy subsystem
  2006-03-11  2:29 ` [PATCH 1/8] [I/OAT] DMA memcpy subsystem Chris Leech
  2006-03-11  8:53   ` Andrew Morton
  2006-03-14 22:13   ` Pavel Machek
@ 2006-03-17  7:30   ` Kumar Gala
  2006-03-28 18:44     ` Andrew Grover
  2 siblings, 1 reply; 60+ messages in thread
From: Kumar Gala @ 2006-03-17  7:30 UTC (permalink / raw)
  To: Chris Leech; +Cc: linux kernel mailing list, netdev

[snip]

> +/**
> + * dma_async_client_register - allocate and register a &dma_client
> + * @event_callback: callback for notification of channel addition/ 
> removal
> + */
> +struct dma_client *dma_async_client_register(dma_event_callback  
> event_callback)
> +{
> +	struct dma_client *client;
> +
> +	client = kzalloc(sizeof(*client), GFP_KERNEL);
> +	if (!client)
> +		return NULL;
> +
> +	INIT_LIST_HEAD(&client->channels);
> +	spin_lock_init(&client->lock);
> +
> +	client->chans_desired = 0;
> +	client->chan_count = 0;
> +	client->event_callback = event_callback;
> +
> +	spin_lock(&dma_list_lock);
> +	list_add_tail(&client->global_node, &dma_client_list);
> +	spin_unlock(&dma_list_lock);
> +
> +	return client;
> +}

It would seem that when a client registers (or shortly thereafter
when they call dma_async_client_chan_request()) they would expect to  
get the number of channels they need by some given time period.

For example, let's say a client registers but no DMA device exists.
They will never get called to be aware of this condition.

I would think most clients would either spin until they have all the  
channels they need or fall back to a non-async mechanism.

Also, what do you think about adding an operation type (MEMCPY, XOR,  
CRYPTO_AES, etc).  We can then validate if the operation type
expected is supported by the devices that exist.

> +
> +/**
> + * dma_async_client_unregister - unregister a client and free the  
> &dma_client
> + * @client:
> + *
> + * Force frees any allocated DMA channels, frees the &dma_client  
> memory
> + */
> +void dma_async_client_unregister(struct dma_client *client)
> +{
> +	struct dma_chan *chan;
> +
> +	if (!client)
> +		return;
> +
> +	rcu_read_lock();
> +	list_for_each_entry_rcu(chan, &client->channels, client_node) {
> +		dma_client_chan_free(chan);
> +	}
> +	rcu_read_unlock();
> +
> +	spin_lock(&dma_list_lock);
> +	list_del(&client->global_node);
> +	spin_unlock(&dma_list_lock);
> +
> +	kfree(client);
> +	dma_chans_rebalance();
> +}
> +
> +/**
> + * dma_async_client_chan_request - request DMA channels
> + * @client: &dma_client
> + * @number: count of DMA channels requested
> + *
> + * Clients call dma_async_client_chan_request() to specify how many
> + * DMA channels they need, 0 to free all currently allocated.
> + * The resulting allocations/frees are indicated to the client via  
> the
> + * event callback.
> + */
> +void dma_async_client_chan_request(struct dma_client *client,
> +			unsigned int number)
> +{
> +	client->chans_desired = number;
> +	dma_chans_rebalance();
> +}
> +

Shouldn't we also have a dma_async_client_chan_free()?

[snip]

> +/* --- public DMA engine API --- */
> +
> +struct dma_client *dma_async_client_register(dma_event_callback  
> event_callback);
> +void dma_async_client_unregister(struct dma_client *client);
> +void dma_async_client_chan_request(struct dma_client *client,
> +		unsigned int number);
> +
> +/**
> + * dma_async_memcpy_buf_to_buf - offloaded copy between virtual  
> addresses
> + * @chan: DMA channel to offload copy to
> + * @dest: destination address (virtual)
> + * @src: source address (virtual)
> + * @len: length
> + *
> + * Both @dest and @src must be mappable to a bus address according  
> to the
> + * DMA mapping API rules for streaming mappings.
> + * Both @dest and @src must stay memory resident (kernel memory or  
> locked
> + * user space pages)
> + */
> +static inline dma_cookie_t dma_async_memcpy_buf_to_buf(struct  
> dma_chan *chan,
> +	void *dest, void *src, size_t len)
> +{
> +	int cpu = get_cpu();
> +	per_cpu_ptr(chan->local, cpu)->bytes_transferred += len;
> +	per_cpu_ptr(chan->local, cpu)->memcpy_count++;
> +	put_cpu();
> +
> +	return chan->device->device_memcpy_buf_to_buf(chan, dest, src, len);
> +}

What about renaming the dma_async_memcpy_* functions to something like
dma_async_op_* and having them take an additional operation argument?
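
For instance (purely a sketch of the suggestion, not code from the posted
patch; the names here are made up):

enum dma_op_type {
	DMA_OP_MEMCPY,
	DMA_OP_XOR,
	DMA_OP_CRYPTO_AES,
};

static inline dma_cookie_t dma_async_op_buf_to_buf(struct dma_chan *chan,
	enum dma_op_type op, void *dest, void *src, size_t len);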

> +
> +/**
> + * dma_async_memcpy_buf_to_pg - offloaded copy
> + * @chan: DMA channel to offload copy to
> + * @page: destination page
> + * @offset: offset in page to copy to
> + * @kdata: source address (virtual)
> + * @len: length
> + *
> + * Both @page/@offset and @kdata must be mappable to a bus address  
> according
> + * to the DMA mapping API rules for streaming mappings.
> + * Both @page/@offset and @kdata must stay memory resident (kernel  
> memory or
> + * locked user space pages)
> + */
> +static inline dma_cookie_t dma_async_memcpy_buf_to_pg(struct  
> dma_chan *chan,
> +	struct page *page, unsigned int offset, void *kdata, size_t len)
> +{
> +	int cpu = get_cpu();
> +	per_cpu_ptr(chan->local, cpu)->bytes_transferred += len;
> +	per_cpu_ptr(chan->local, cpu)->memcpy_count++;
> +	put_cpu();
> +
> +	return chan->device->device_memcpy_buf_to_pg(chan, page, offset,
> +	                                             kdata, len);
> +}
> +
> +/**
> + * dma_async_memcpy_buf_to_pg - offloaded copy
> + * @chan: DMA channel to offload copy to
> + * @dest_page: destination page
> + * @dest_off: offset in page to copy to
> + * @src_page: source page
> + * @src_off: offset in page to copy from
> + * @len: length
> + *
> + * Both @dest_page/@dest_off and @src_page/@src_off must be  
> mappable to a bus
> + * address according to the DMA mapping API rules for streaming  
> mappings.
> + * Both @dest_page/@dest_off and @src_page/@src_off must stay  
> memory resident
> + * (kernel memory or locked user space pages)
> + */
> +static inline dma_cookie_t dma_async_memcpy_pg_to_pg(struct  
> dma_chan *chan,
> +	struct page *dest_pg, unsigned int dest_off, struct page *src_pg,
> +	unsigned int src_off, size_t len)
> +{
> +	int cpu = get_cpu();
> +	per_cpu_ptr(chan->local, cpu)->bytes_transferred += len;
> +	per_cpu_ptr(chan->local, cpu)->memcpy_count++;
> +	put_cpu();
> +
> +	return chan->device->device_memcpy_pg_to_pg(chan, dest_pg, dest_off,
> +	                                            src_pg, src_off, len);
> +}
> +
> +/**
> + * dma_async_memcpy_issue_pending - flush pending copies to HW
> + * @chan:
> + *
> + * This allows drivers to push copies to HW in batches,
> + * reducing MMIO writes where possible.
> + */
> +static inline void dma_async_memcpy_issue_pending(struct dma_chan  
> *chan)
> +{
> +	return chan->device->device_memcpy_issue_pending(chan);
> +}
> +
> +/**
> + * dma_async_memcpy_complete - poll for transaction completion
> + * @chan: DMA channel
> + * @cookie: transaction identifier to check status of
> + * @last: returns last completed cookie, can be NULL
> + * @used: returns last issued cookie, can be NULL
> + *
> + * If @last and @used are passed in, upon return they reflect the  
> driver
> + * internal state and can be used with dma_async_is_complete() to  
> check
> + * the status of multiple cookies without re-checking hardware state.
> + */
> +static inline enum dma_status dma_async_memcpy_complete(struct  
> dma_chan *chan,
> +	dma_cookie_t cookie, dma_cookie_t *last, dma_cookie_t *used)
> +{
> +	return chan->device->device_memcpy_complete(chan, cookie, last,  
> used);
> +}
> +
> +/**
> + * dma_async_is_complete - test a cookie against chan state
> + * @cookie: transaction identifier to test status of
> + * @last_complete: last know completed transaction
> + * @last_used: last cookie value handed out
> + *
> + * dma_async_is_complete() is used in dma_async_memcpy_complete()
> + * the test logic is seperated for lightweight testing of multiple  
> cookies
> + */
> +static inline enum dma_status dma_async_is_complete(dma_cookie_t  
> cookie,
> +			dma_cookie_t last_complete, dma_cookie_t last_used)
> +{
> +	if (last_complete <= last_used) {
> +		if ((cookie <= last_complete) || (cookie > last_used))
> +			return DMA_SUCCESS;
> +	} else {
> +		if ((cookie <= last_complete) && (cookie > last_used))
> +			return DMA_SUCCESS;
> +	}
> +	return DMA_IN_PROGRESS;
> +}
> +
> +
> +/* --- DMA device --- */
> +
> +int dma_async_device_register(struct dma_device *device);
> +void dma_async_device_unregister(struct dma_device *device);
> +
> +#endif /* CONFIG_DMA_ENGINE */
> +#endif /* DMAENGINE_H */
>
> -
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 1/8] [I/OAT] DMA memcpy subsystem
  2006-03-11  2:29 ` [PATCH 1/8] [I/OAT] DMA memcpy subsystem Chris Leech
  2006-03-11  8:53   ` Andrew Morton
@ 2006-03-14 22:13   ` Pavel Machek
  2006-03-17  7:30   ` Kumar Gala
  2 siblings, 0 replies; 60+ messages in thread
From: Pavel Machek @ 2006-03-14 22:13 UTC (permalink / raw)
  To: Chris Leech; +Cc: linux-kernel, netdev

Hi!

> --- /dev/null
> +++ b/drivers/dma/dmaengine.c
> @@ -0,0 +1,360 @@
> +/*****************************************************************************
> +Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved.
> +
> +This program is free software; you can redistribute it and/or modify it
> +under the terms of the GNU General Public License as published by the Free
> +Software Foundation; either version 2 of the License, or (at your option)
> +any later version.
> +
> +This program is distributed in the hope that it will be useful, but WITHOUT
> +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> +FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
> +more details.
> +
> +You should have received a copy of the GNU General Public License along with
> +this program; if not, write to the Free Software Foundation, Inc., 59
> +Temple Place - Suite 330, Boston, MA  02111-1307, USA.
> +
> +The full GNU General Public License is included in this distribution in the
> +file called LICENSE.
> +*****************************************************************************/


Could you use 
/*
 *
 */

comment style, and describe in one or two lines what the source does
in the header?

								Pavel
-- 
209:using System.IO;

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 1/8] [I/OAT] DMA memcpy subsystem
  2006-03-11  2:29 ` [PATCH 1/8] [I/OAT] DMA memcpy subsystem Chris Leech
@ 2006-03-11  8:53   ` Andrew Morton
  2006-03-14 22:13   ` Pavel Machek
  2006-03-17  7:30   ` Kumar Gala
  2 siblings, 0 replies; 60+ messages in thread
From: Andrew Morton @ 2006-03-11  8:53 UTC (permalink / raw)
  To: Chris Leech; +Cc: linux-kernel, netdev

Chris Leech <christopher.leech@intel.com> wrote:
>
> +void dma_async_device_cleanup(struct kref *kref);
>

Declarations go in header files, please.  Or give it static scope.
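
For example (just sketching the two options): either move the prototype into
include/linux/dmaengine.h, or rely on the static forward declaration that is
already near the top of dmaengine.c and drop the duplicate:

	/* drivers/dma/dmaengine.c, near the top */
	static void dma_async_device_cleanup(struct kref *kref);

	/* ... and no second declaration above dma_chan_cleanup() */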


^ permalink raw reply	[flat|nested] 60+ messages in thread

* [PATCH 1/8] [I/OAT] DMA memcpy subsystem
  2006-03-11  2:27 Chris Leech
@ 2006-03-11  2:29 ` Chris Leech
  2006-03-11  8:53   ` Andrew Morton
                     ` (2 more replies)
  0 siblings, 3 replies; 60+ messages in thread
From: Chris Leech @ 2006-03-11  2:29 UTC (permalink / raw)
  To: linux-kernel, netdev

Provides an API for offloading memory copies to DMA devices

Signed-off-by: Chris Leech <christopher.leech@intel.com>
---

 drivers/Kconfig           |    2 
 drivers/Makefile          |    1 
 drivers/dma/Kconfig       |   13 ++
 drivers/dma/Makefile      |    1 
 drivers/dma/dmaengine.c   |  360 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/dmaengine.h |  323 ++++++++++++++++++++++++++++++++++++++++
 6 files changed, 700 insertions(+), 0 deletions(-)

diff --git a/drivers/Kconfig b/drivers/Kconfig
index bddf431..ce7ffa7 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -70,4 +70,6 @@ source "drivers/sn/Kconfig"
 
 source "drivers/edac/Kconfig"
 
+source "drivers/dma/Kconfig"
+
 endmenu
diff --git a/drivers/Makefile b/drivers/Makefile
index 5c69b86..516ba5e 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -73,3 +73,4 @@ obj-$(CONFIG_SGI_SN)		+= sn/
 obj-y				+= firmware/
 obj-$(CONFIG_CRYPTO)		+= crypto/
 obj-$(CONFIG_SUPERH)		+= sh/
+obj-$(CONFIG_DMA_ENGINE)	+= dma/
diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
new file mode 100644
index 0000000..f9ac4bc
--- /dev/null
+++ b/drivers/dma/Kconfig
@@ -0,0 +1,13 @@
+#
+# DMA engine configuration
+#
+
+menu "DMA Engine support"
+
+config DMA_ENGINE
+	bool "Support for DMA engines"
+	---help---
+	  DMA engines offload copy operations from the CPU to dedicated
+	  hardware, allowing the copies to happen asynchronously.
+
+endmenu
diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile
new file mode 100644
index 0000000..10b7391
--- /dev/null
+++ b/drivers/dma/Makefile
@@ -0,0 +1 @@
+obj-y += dmaengine.o
diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c
new file mode 100644
index 0000000..35a63d8
--- /dev/null
+++ b/drivers/dma/dmaengine.c
@@ -0,0 +1,360 @@
+/*****************************************************************************
+Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved.
+
+This program is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 2 of the License, or (at your option)
+any later version.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59
+Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+
+The full GNU General Public License is included in this distribution in the
+file called LICENSE.
+*****************************************************************************/
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/dmaengine.h>
+#include <linux/hardirq.h>
+#include <linux/spinlock.h>
+#include <linux/percpu.h>
+#include <linux/rcupdate.h>
+
+static DEFINE_SPINLOCK(dma_list_lock);
+static LIST_HEAD(dma_device_list);
+static LIST_HEAD(dma_client_list);
+
+/* --- sysfs implementation --- */
+
+static ssize_t show_memcpy_count(struct class_device *cd, char *buf)
+{
+	struct dma_chan *chan = container_of(cd, struct dma_chan, class_dev);
+	unsigned long count = 0;
+	int i;
+
+	for_each_cpu(i)
+		count += per_cpu_ptr(chan->local, i)->memcpy_count;
+
+	return sprintf(buf, "%lu\n", count);
+}
+
+static ssize_t show_bytes_transferred(struct class_device *cd, char *buf)
+{
+	struct dma_chan *chan = container_of(cd, struct dma_chan, class_dev);
+	unsigned long count = 0;
+	int i;
+
+	for_each_cpu(i)
+		count += per_cpu_ptr(chan->local, i)->bytes_transferred;
+
+	return sprintf(buf, "%lu\n", count);
+}
+
+static ssize_t show_in_use(struct class_device *cd, char *buf)
+{
+	struct dma_chan *chan = container_of(cd, struct dma_chan, class_dev);
+
+	return sprintf(buf, "%d\n", (chan->client ? 1 : 0));
+}
+
+static struct class_device_attribute dma_class_attrs[] = {
+	__ATTR(memcpy_count, S_IRUGO, show_memcpy_count, NULL),
+	__ATTR(bytes_transferred, S_IRUGO, show_bytes_transferred, NULL),
+	__ATTR(in_use, S_IRUGO, show_in_use, NULL),
+	__ATTR_NULL
+};
+
+static void dma_async_device_cleanup(struct kref *kref);
+
+static void dma_class_dev_release(struct class_device *cd)
+{
+	struct dma_chan *chan = container_of(cd, struct dma_chan, class_dev);
+	kref_put(&chan->device->refcount, dma_async_device_cleanup);
+}
+
+static struct class dma_devclass = {
+	.name            = "dma",
+	.class_dev_attrs = dma_class_attrs,
+	.release = dma_class_dev_release,
+};
+
+/* --- client and device registration --- */
+
+/**
+ * dma_client_chan_alloc - try to allocate a channel to a client
+ * @client: &dma_client
+ *
+ * Called with dma_list_lock held.
+ */
+static struct dma_chan *dma_client_chan_alloc(struct dma_client *client)
+{
+	struct dma_device *device;
+	struct dma_chan *chan;
+	unsigned long flags;
+
+	/* Find a channel, any DMA engine will do */
+	list_for_each_entry(device, &dma_device_list, global_node) {
+		list_for_each_entry(chan, &device->channels, device_node) {
+			if (chan->client)
+				continue;
+
+			if (chan->device->device_alloc_chan_resources(chan) >= 0) {
+				kref_get(&device->refcount);
+				kref_init(&chan->refcount);
+				chan->slow_ref = 0;
+				INIT_RCU_HEAD(&chan->rcu);
+				chan->client = client;
+				spin_lock_irqsave(&client->lock, flags);
+				list_add_tail_rcu(&chan->client_node, &client->channels);
+				spin_unlock_irqrestore(&client->lock, flags);
+				return chan;
+			}
+		}
+	}
+
+	return NULL;
+}
+
+/**
+ * dma_chan_cleanup - release a DMA channel's resources
+ * @kref: kernel reference structure that contains the DMA channel device
+ */
+void dma_async_device_cleanup(struct kref *kref);
+void dma_chan_cleanup(struct kref *kref)
+{
+	struct dma_chan *chan = container_of(kref, struct dma_chan, refcount);
+	chan->device->device_free_chan_resources(chan);
+	chan->client = NULL;
+	kref_put(&chan->device->refcount, dma_async_device_cleanup);
+}
+
+static void dma_chan_free_rcu(struct rcu_head *rcu)
+{
+	struct dma_chan *chan = container_of(rcu, struct dma_chan, rcu);
+	int bias = 0x7FFFFFFF;
+	int i;
+	for_each_cpu(i)
+		bias -= local_read(&per_cpu_ptr(chan->local, i)->refcount);
+	atomic_sub(bias, &chan->refcount.refcount);
+	kref_put(&chan->refcount, dma_chan_cleanup);
+}
+
+static void dma_client_chan_free(struct dma_chan *chan)
+{
+	atomic_add(0x7FFFFFFF, &chan->refcount.refcount);
+	chan->slow_ref = 1;
+	call_rcu(&chan->rcu, dma_chan_free_rcu);
+}
+
+/**
+ * dma_chans_rebalance - reallocate channels to clients
+ *
+ * When the number of DMA channels in the system changes,
+ * channels need to be rebalanced among clients.
+ */
+static void dma_chans_rebalance(void)
+{
+	struct dma_client *client;
+	struct dma_chan *chan;
+	unsigned long flags;
+
+	spin_lock(&dma_list_lock);
+	list_for_each_entry(client, &dma_client_list, global_node) {
+
+		while (client->chans_desired > client->chan_count) {
+			chan = dma_client_chan_alloc(client);
+			if (!chan)
+				break;
+
+			client->chan_count++;
+			client->event_callback(client, chan, DMA_RESOURCE_ADDED);
+		}
+
+		while (client->chans_desired < client->chan_count) {
+			spin_lock_irqsave(&client->lock, flags);
+			chan = list_entry(client->channels.next, struct dma_chan, client_node);
+			list_del_rcu(&chan->client_node);
+			spin_unlock_irqrestore(&client->lock, flags);
+			client->chan_count--;
+			client->event_callback(client, chan, DMA_RESOURCE_REMOVED);
+			dma_client_chan_free(chan);
+		}
+	}
+	spin_unlock(&dma_list_lock);
+}
+
+/**
+ * dma_async_client_register - allocate and register a &dma_client
+ * @event_callback: callback for notification of channel addition/removal
+ */
+struct dma_client *dma_async_client_register(dma_event_callback event_callback)
+{
+	struct dma_client *client;
+
+	client = kzalloc(sizeof(*client), GFP_KERNEL);
+	if (!client)
+		return NULL;
+
+	INIT_LIST_HEAD(&client->channels);
+	spin_lock_init(&client->lock);
+
+	client->chans_desired = 0;
+	client->chan_count = 0;
+	client->event_callback = event_callback;
+
+	spin_lock(&dma_list_lock);
+	list_add_tail(&client->global_node, &dma_client_list);
+	spin_unlock(&dma_list_lock);
+
+	return client;
+}
+
+/**
+ * dma_async_client_unregister - unregister a client and free the &dma_client
+ * @client: &dma_client to unregister and free
+ *
+ * Force frees any allocated DMA channels, frees the &dma_client memory
+ */
+void dma_async_client_unregister(struct dma_client *client)
+{
+	struct dma_chan *chan;
+
+	if (!client)
+		return;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(chan, &client->channels, client_node) {
+		dma_client_chan_free(chan);
+	}
+	rcu_read_unlock();
+
+	spin_lock(&dma_list_lock);
+	list_del(&client->global_node);
+	spin_unlock(&dma_list_lock);
+
+	kfree(client);
+	dma_chans_rebalance();
+}
+
+/**
+ * dma_async_client_chan_request - request DMA channels
+ * @client: &dma_client
+ * @number: count of DMA channels requested
+ *
+ * Clients call dma_async_client_chan_request() to specify how many
+ * DMA channels they need; a count of 0 frees all currently allocated.
+ * The resulting allocations/frees are indicated to the client via the
+ * event callback.
+ */
+void dma_async_client_chan_request(struct dma_client *client,
+			unsigned int number)
+{
+	client->chans_desired = number;
+	dma_chans_rebalance();
+}
+
+/**
+ * dma_async_device_register - register a DMA device and its channels
+ * @device: &dma_device to register
+ */
+int dma_async_device_register(struct dma_device *device)
+{
+	static int id;
+	int chancnt = 0;
+	struct dma_chan* chan;
+
+	if (!device)
+		return -ENODEV;
+
+	init_completion(&device->done);
+	kref_init(&device->refcount);
+	device->dev_id = id++;
+
+	/* represent channels in sysfs. Probably want devs too */
+	list_for_each_entry(chan, &device->channels, device_node) {
+		chan->local = alloc_percpu(typeof(*chan->local));
+		if (chan->local == NULL)
+			continue;
+
+		chan->chan_id = chancnt++;
+		chan->class_dev.class = &dma_devclass;
+		chan->class_dev.dev = NULL;
+		snprintf(chan->class_dev.class_id, BUS_ID_SIZE, "dma%dchan%d",
+		         device->dev_id, chan->chan_id);
+
+		kref_get(&device->refcount);
+		class_device_register(&chan->class_dev);
+	}
+
+	spin_lock(&dma_list_lock);
+	list_add_tail(&device->global_node, &dma_device_list);
+	spin_unlock(&dma_list_lock);
+
+	dma_chans_rebalance();
+
+	return 0;
+}
+
+/**
+ * dma_async_device_cleanup - kref release callback, allows unregister to complete
+ * @kref: embedded kref of the &dma_device being torn down
+ */
+static void dma_async_device_cleanup(struct kref *kref)
+{
+	struct dma_device *device = container_of(kref, struct dma_device, refcount);
+	complete(&device->done);
+}
+
+void dma_async_device_unregister(struct dma_device* device)
+{
+	struct dma_chan *chan;
+	unsigned long flags;
+
+	spin_lock(&dma_list_lock);
+	list_del(&device->global_node);
+	spin_unlock(&dma_list_lock);
+
+	list_for_each_entry(chan, &device->channels, device_node) {
+		if (chan->client) {
+			spin_lock_irqsave(&chan->client->lock, flags);
+			list_del(&chan->client_node);
+			chan->client->chan_count--;
+			spin_unlock_irqrestore(&chan->client->lock, flags);
+			chan->client->event_callback(chan->client, chan, DMA_RESOURCE_REMOVED);
+			dma_client_chan_free(chan);
+		}
+		class_device_unregister(&chan->class_dev);
+	}
+
+	dma_chans_rebalance();
+
+	kref_put(&device->refcount, dma_async_device_cleanup);
+	wait_for_completion(&device->done);
+}
+
+static int __init dma_bus_init(void)
+{
+	spin_lock_init(&dma_list_lock);
+
+	return class_register(&dma_devclass);
+}
+
+subsys_initcall(dma_bus_init);
+
+EXPORT_SYMBOL(dma_async_client_register);
+EXPORT_SYMBOL(dma_async_client_unregister);
+EXPORT_SYMBOL(dma_async_client_chan_request);
+EXPORT_SYMBOL(dma_async_memcpy_buf_to_buf);
+EXPORT_SYMBOL(dma_async_memcpy_buf_to_pg);
+EXPORT_SYMBOL(dma_async_memcpy_pg_to_pg);
+EXPORT_SYMBOL(dma_async_memcpy_complete);
+EXPORT_SYMBOL(dma_async_memcpy_issue_pending);
+EXPORT_SYMBOL(dma_async_device_register);
+EXPORT_SYMBOL(dma_async_device_unregister);
diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
new file mode 100644
index 0000000..ac3bff9
--- /dev/null
+++ b/include/linux/dmaengine.h
@@ -0,0 +1,323 @@
+/*****************************************************************************
+Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved.
+
+This program is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 2 of the License, or (at your option)
+any later version.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59
+Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+
+The full GNU General Public License is included in this distribution in the
+file called LICENSE.
+*****************************************************************************/
+#ifndef DMAENGINE_H
+#define DMAENGINE_H
+#ifdef CONFIG_DMA_ENGINE
+
+#include <linux/device.h>
+#include <linux/uio.h>
+#include <linux/kref.h>
+#include <linux/completion.h>
+#include <linux/rcupdate.h>
+
+/**
+ * enum dma_event - resource PNP/power management events
+ * @DMA_RESOURCE_SUSPEND: DMA device going into low power state
+ * @DMA_RESOURCE_RESUME: DMA device returning to full power
+ * @DMA_RESOURCE_ADDED: DMA device added to the system
+ * @DMA_RESOURCE_REMOVED: DMA device removed from the system
+ */
+enum dma_event {
+	DMA_RESOURCE_SUSPEND,
+	DMA_RESOURCE_RESUME,
+	DMA_RESOURCE_ADDED,
+	DMA_RESOURCE_REMOVED,
+};
+
+/**
+ * typedef dma_cookie_t - an opaque DMA transaction ID
+ *
+ * if dma_cookie_t is > 0 it's a DMA request cookie, < 0 it's an error code
+ */
+typedef s32 dma_cookie_t;
+
+#define dma_submit_error(cookie) ((cookie) < 0 ? 1 : 0)
+
+/**
+ * enum dma_status - DMA transaction status
+ * @DMA_SUCCESS: transaction completed successfully
+ * @DMA_IN_PROGRESS: transaction not yet processed
+ * @DMA_ERROR: transaction failed
+ */
+enum dma_status {
+	DMA_SUCCESS,
+	DMA_IN_PROGRESS,
+	DMA_ERROR,
+};
+
+struct dma_chan_percpu {
+	local_t refcount;
+	/* stats */
+	unsigned long memcpy_count;
+	unsigned long bytes_transferred;
+};
+
+/**
+ * struct dma_chan - devices supply DMA channels, clients use them
+ * @client: ptr to the client user of this chan, will be NULL when unused
+ * @device: ptr to the dma device who supplies this channel, always !NULL
+ * @cookie: last cookie value returned to client
+ * @chan_id: channel ID for sysfs
+ * @class_dev: class device for sysfs
+ * @client_node: used to add this to the client chan list
+ * @device_node: used to add this to the device chan list
+ */
+struct dma_chan {
+	struct dma_client *client;
+	struct dma_device *device;
+	dma_cookie_t cookie;
+
+	/* sysfs */
+	int chan_id;
+	struct class_device class_dev;
+
+	struct kref refcount;
+	int slow_ref;
+	struct rcu_head rcu;
+
+	struct list_head client_node;
+	struct list_head device_node;
+	struct dma_chan_percpu *local;
+};
+
+void dma_chan_cleanup(struct kref *kref);
+
+static inline void dma_chan_get(struct dma_chan *chan)
+{
+	if (unlikely(chan->slow_ref))
+		kref_get(&chan->refcount);
+	else {
+		local_inc(&(per_cpu_ptr(chan->local, get_cpu())->refcount));
+		put_cpu();
+	}
+}
+
+static inline void dma_chan_put(struct dma_chan *chan)
+{
+	if (unlikely(chan->slow_ref))
+		kref_put(&chan->refcount, dma_chan_cleanup);
+	else {
+		local_dec(&(per_cpu_ptr(chan->local, get_cpu())->refcount));
+		put_cpu();
+	}
+}
+
+/*
+ * typedef dma_event_callback - function pointer to a DMA event callback
+ */
+typedef void (*dma_event_callback) (struct dma_client *client,
+		struct dma_chan *chan, enum dma_event event);
+
+/**
+ * struct dma_client - info on the entity making use of DMA services
+ * @event_callback: func ptr to call when something happens
+ * @chan_count: number of chans allocated
+ * @chans_desired: number of chans requested. Can be +/- chan_count
+ * @lock: protects access to the channels list
+ * @channels: the list of DMA channels allocated
+ * @global_node: list_head for global dma_client_list
+ */
+struct dma_client {
+	dma_event_callback	event_callback;
+	unsigned int		chan_count;
+	unsigned int		chans_desired;
+
+	spinlock_t		lock;
+	struct list_head	channels;
+	struct list_head	global_node;
+};
+
+/**
+ * struct dma_device - info on the entity supplying DMA services
+ * @chancnt: how many DMA channels are supported
+ * @channels: the list of struct dma_chan
+ * @global_node: list_head for global dma_device_list
+ * @dev_id: unique device ID
+ * Other func ptrs: used to make use of this device's capabilities
+ */
+struct dma_device {
+
+	unsigned int chancnt;
+	struct list_head channels;
+	struct list_head global_node;
+
+	struct kref refcount;
+	struct completion done;
+
+	int dev_id;
+
+	int (*device_alloc_chan_resources)(struct dma_chan *chan);
+	void (*device_free_chan_resources)(struct dma_chan *chan);
+	dma_cookie_t (*device_memcpy_buf_to_buf)(struct dma_chan *chan,
+			void *dest, void *src, size_t len);
+	dma_cookie_t (*device_memcpy_buf_to_pg)(struct dma_chan *chan,
+			struct page *page, unsigned int offset, void *kdata,
+			size_t len);
+	dma_cookie_t (*device_memcpy_pg_to_pg)(struct dma_chan *chan,
+			struct page *dest_pg, unsigned int dest_off,
+			struct page *src_pg, unsigned int src_off, size_t len);
+	enum dma_status (*device_memcpy_complete)(struct dma_chan *chan,
+			dma_cookie_t cookie, dma_cookie_t *last,
+			dma_cookie_t *used);
+	void (*device_memcpy_issue_pending)(struct dma_chan *chan);
+};
+
+/* --- public DMA engine API --- */
+
+struct dma_client *dma_async_client_register(dma_event_callback event_callback);
+void dma_async_client_unregister(struct dma_client *client);
+void dma_async_client_chan_request(struct dma_client *client,
+		unsigned int number);
+
+/**
+ * dma_async_memcpy_buf_to_buf - offloaded copy between virtual addresses
+ * @chan: DMA channel to offload copy to
+ * @dest: destination address (virtual)
+ * @src: source address (virtual)
+ * @len: length
+ *
+ * Both @dest and @src must be mappable to a bus address according to the
+ * DMA mapping API rules for streaming mappings.
+ * Both @dest and @src must stay memory resident (kernel memory or locked
+ * user space pages)
+ */
+static inline dma_cookie_t dma_async_memcpy_buf_to_buf(struct dma_chan *chan,
+	void *dest, void *src, size_t len)
+{
+	int cpu = get_cpu();
+	per_cpu_ptr(chan->local, cpu)->bytes_transferred += len;
+	per_cpu_ptr(chan->local, cpu)->memcpy_count++;
+	put_cpu();
+
+	return chan->device->device_memcpy_buf_to_buf(chan, dest, src, len);
+}
+
+/**
+ * dma_async_memcpy_buf_to_pg - offloaded copy
+ * @chan: DMA channel to offload copy to
+ * @page: destination page
+ * @offset: offset in page to copy to
+ * @kdata: source address (virtual)
+ * @len: length
+ *
+ * Both @page/@offset and @kdata must be mappable to a bus address according
+ * to the DMA mapping API rules for streaming mappings.
+ * Both @page/@offset and @kdata must stay memory resident (kernel memory or
+ * locked user space pages)
+ */
+static inline dma_cookie_t dma_async_memcpy_buf_to_pg(struct dma_chan *chan,
+	struct page *page, unsigned int offset, void *kdata, size_t len)
+{
+	int cpu = get_cpu();
+	per_cpu_ptr(chan->local, cpu)->bytes_transferred += len;
+	per_cpu_ptr(chan->local, cpu)->memcpy_count++;
+	put_cpu();
+
+	return chan->device->device_memcpy_buf_to_pg(chan, page, offset,
+	                                             kdata, len);
+}
+
+/**
+ * dma_async_memcpy_pg_to_pg - offloaded copy from page to page
+ * @chan: DMA channel to offload copy to
+ * @dest_pg: destination page
+ * @dest_off: offset in destination page to copy to
+ * @src_pg: source page
+ * @src_off: offset in source page to copy from
+ * @len: length
+ *
+ * Both @dest_pg/@dest_off and @src_pg/@src_off must be mappable to a bus
+ * address according to the DMA mapping API rules for streaming mappings.
+ * Both @dest_pg/@dest_off and @src_pg/@src_off must stay memory resident
+ * (kernel memory or locked user space pages)
+ */
+static inline dma_cookie_t dma_async_memcpy_pg_to_pg(struct dma_chan *chan,
+	struct page *dest_pg, unsigned int dest_off, struct page *src_pg,
+	unsigned int src_off, size_t len)
+{
+	int cpu = get_cpu();
+	per_cpu_ptr(chan->local, cpu)->bytes_transferred += len;
+	per_cpu_ptr(chan->local, cpu)->memcpy_count++;
+	put_cpu();
+
+	return chan->device->device_memcpy_pg_to_pg(chan, dest_pg, dest_off,
+	                                            src_pg, src_off, len);
+}
+
+/**
+ * dma_async_memcpy_issue_pending - flush pending copies to HW
+ * @chan: target DMA channel
+ *
+ * This allows drivers to push copies to HW in batches,
+ * reducing MMIO writes where possible.
+ */
+static inline void dma_async_memcpy_issue_pending(struct dma_chan *chan)
+{
+	return chan->device->device_memcpy_issue_pending(chan);
+}
+
+/**
+ * dma_async_memcpy_complete - poll for transaction completion
+ * @chan: DMA channel
+ * @cookie: transaction identifier to check status of
+ * @last: returns last completed cookie, can be NULL
+ * @used: returns last issued cookie, can be NULL
+ *
+ * If @last and @used are passed in, upon return they reflect the driver
+ * internal state and can be used with dma_async_is_complete() to check
+ * the status of multiple cookies without re-checking hardware state.
+ */
+static inline enum dma_status dma_async_memcpy_complete(struct dma_chan *chan,
+	dma_cookie_t cookie, dma_cookie_t *last, dma_cookie_t *used)
+{
+	return chan->device->device_memcpy_complete(chan, cookie, last, used);
+}
+
+/**
+ * dma_async_is_complete - test a cookie against chan state
+ * @cookie: transaction identifier to test status of
+ * @last_complete: last known completed transaction
+ * @last_used: last cookie value handed out
+ *
+ * dma_async_is_complete() is used inside dma_async_memcpy_complete();
+ * the test logic is separated out for lightweight testing of multiple cookies.
+ */
+static inline enum dma_status dma_async_is_complete(dma_cookie_t cookie,
+			dma_cookie_t last_complete, dma_cookie_t last_used)
+{
+	if (last_complete <= last_used) {
+		if ((cookie <= last_complete) || (cookie > last_used))
+			return DMA_SUCCESS;
+	} else {
+		if ((cookie <= last_complete) && (cookie > last_used))
+			return DMA_SUCCESS;
+	}
+	return DMA_IN_PROGRESS;
+}
+
+
+/* --- DMA device --- */
+
+int dma_async_device_register(struct dma_device *device);
+void dma_async_device_unregister(struct dma_device *device);
+
+#endif /* CONFIG_DMA_ENGINE */
+#endif /* DMAENGINE_H */
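
For reference, here is a rough sketch of how a client of this API is wired
up -- the module name, the callback policy and the busy-wait completion loop
are all made up for illustration; only the dma_async_*()/dma_chan_*() calls
and types come from the patch above:

#include <linux/module.h>
#include <linux/dmaengine.h>
#include <linux/sched.h>	/* cpu_relax() */

static struct dma_client *example_client;
static struct dma_chan *example_chan;	/* most recent channel handed to us */

/* The core invokes this callback as channels are added to / removed from us */
static void example_event(struct dma_client *client, struct dma_chan *chan,
			  enum dma_event event)
{
	switch (event) {
	case DMA_RESOURCE_ADDED:
		example_chan = chan;
		break;
	case DMA_RESOURCE_REMOVED:
		if (example_chan == chan)
			example_chan = NULL;
		break;
	default:
		break;
	}
}

/* Offload one copy and spin until the engine reports it done; this would be
 * called from wherever the driver's data path is. */
static int example_copy(void *dst, void *src, size_t len)
{
	struct dma_chan *chan = example_chan;
	dma_cookie_t cookie, last, used;

	if (!chan)
		return -ENODEV;		/* no channel yet: caller falls back to memcpy() */

	dma_chan_get(chan);		/* per-use channel reference */
	cookie = dma_async_memcpy_buf_to_buf(chan, dst, src, len);
	if (dma_submit_error(cookie)) {
		dma_chan_put(chan);
		return cookie;		/* negative cookie is an error code */
	}

	dma_async_memcpy_issue_pending(chan);	/* kick the hardware */
	while (dma_async_memcpy_complete(chan, cookie, &last, &used)
			== DMA_IN_PROGRESS)
		cpu_relax();

	dma_chan_put(chan);
	return 0;
}

static int __init example_init(void)
{
	example_client = dma_async_client_register(example_event);
	if (!example_client)
		return -ENOMEM;

	/* ask for one channel; example_event() reports what we actually get */
	dma_async_client_chan_request(example_client, 1);
	return 0;
}

static void __exit example_exit(void)
{
	/* frees any channels still allocated to us and the client itself */
	dma_async_client_unregister(example_client);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");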


^ permalink raw reply related	[flat|nested] 60+ messages in thread

end of thread, other threads:[~2006-03-30 18:27 UTC | newest]

Thread overview: 60+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2006-03-03 21:40 [PATCH 0/8] Intel I/O Acceleration Technology (I/OAT) Chris Leech
2006-03-03 21:42 ` [PATCH 1/8] [I/OAT] DMA memcpy subsystem Chris Leech
2006-03-04  1:40   ` David S. Miller
2006-03-06 19:39     ` Chris Leech
2006-03-04 19:20   ` Benjamin LaHaise
2006-03-06 19:48     ` Chris Leech
2006-03-03 21:42 ` [PATCH 3/8] [I/OAT] Setup the networking subsystem as a DMA client Chris Leech
2006-03-03 21:42 ` [PATCH 4/8] [I/OAT] Utility functions for offloading sk_buff to iovec copies Chris Leech
2006-03-05  7:15   ` Andrew Morton
2006-03-03 21:42 ` [PATCH 5/8] [I/OAT] Structure changes for TCP recv offload to I/OAT Chris Leech
2006-03-05  7:19   ` Andrew Morton
2006-03-03 21:42 ` [PATCH 6/8] [I/OAT] Rename cleanup_rbuf to tcp_cleanup_rbuf and make non-static Chris Leech
2006-03-03 21:42 ` [PATCH 7/8] [I/OAT] Add a sysctl for tuning the I/OAT offloaded I/O threshold Chris Leech
2006-03-04 11:22   ` Alexey Dobriyan
2006-03-05  7:21   ` Andrew Morton
2006-03-03 21:42 ` [PATCH 8/8] [I/OAT] TCP recv offload to I/OAT Chris Leech
2006-03-04 16:39   ` Pavel Machek
2006-03-04 23:18   ` Greg KH
2006-03-06 19:28     ` Chris Leech
2006-03-05  7:30   ` Andrew Morton
2006-03-05  8:45   ` Andrew Morton
2006-03-05 10:27     ` David S. Miller
2006-03-06 19:36     ` Chris Leech
2006-03-03 22:27 ` [PATCH 0/8] Intel I/O Acceleration Technology (I/OAT) Jeff Garzik
2006-03-03 22:39   ` Chris Leech
2006-03-03 22:45     ` Jeff Garzik
2006-03-04 11:35     ` Evgeniy Polyakov
2006-03-05  8:09     ` Andrew Morton
2006-03-05  9:02       ` Discourage duplicate symbols in the kernel? [Was: Intel I/O Acc...] Sam Ravnborg
2006-03-05  9:18         ` Andrew Morton
2006-03-06 19:56           ` Chris Leech
2006-03-03 22:58 ` [PATCH 0/8] Intel I/O Acceleration Technology (I/OAT) Kumar Gala
2006-03-03 23:32   ` Chris Leech
2006-03-04 18:46 ` Jan Engelhardt
2006-03-04 21:41   ` David S. Miller
2006-03-04 22:05     ` Gene Heskett
2006-03-04 22:16       ` David S. Miller
2006-03-05 13:45         ` Jan Engelhardt
2006-03-05 13:55           ` Arjan van de Ven
2006-03-05 16:14         ` Matthieu CASTET
2006-03-05 16:30           ` Jeff Garzik
2006-03-06 19:24           ` Chris Leech
2006-03-06 19:15       ` Chris Leech
2006-03-05  1:43     ` Evgeniy Polyakov
2006-03-05  2:08       ` David S. Miller
2006-03-06 17:44       ` Ingo Oeser
2006-03-07  7:44         ` Evgeniy Polyakov
2006-03-07  9:43           ` Ingo Oeser
2006-03-07 10:16             ` Evgeniy Polyakov
2006-03-11  2:27 Chris Leech
2006-03-11  2:29 ` [PATCH 1/8] [I/OAT] DMA memcpy subsystem Chris Leech
2006-03-11  8:53   ` Andrew Morton
2006-03-14 22:13   ` Pavel Machek
2006-03-17  7:30   ` Kumar Gala
2006-03-28 18:44     ` Andrew Grover
2006-03-28 18:58       ` Kumar Gala
2006-03-28 22:01         ` Andrew Grover
2006-03-28 23:03           ` Kumar Gala
2006-03-29 23:05             ` Andrew Grover
2006-03-30  8:01               ` Kumar Gala
2006-03-30 18:27                 ` Andrew Grover

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).