All of lore.kernel.org
 help / color / mirror / Atom feed
From: Jens Axboe <jens.axboe@oracle.com>
To: linux-kernel@vger.kernel.org, linux-fsdevel@vger.kernel.org
Cc: chris.mason@oracle.com, david@fromorbit.com, hch@infradead.org,
	akpm@linux-foundation.org, jack@suse.cz,
	yanmin_zhang@linux.intel.com, Jens Axboe <jens.axboe@oracle.com>
Subject: [PATCH 12/13] block: first cut at implementing a NAPI approach for block devices
Date: Mon, 25 May 2009 09:31:06 +0200	[thread overview]
Message-ID: <1243236668-3398-24-git-send-email-jens.axboe@oracle.com> (raw)
In-Reply-To: <1243236668-3398-1-git-send-email-jens.axboe@oracle.com>

Adds support for AHCI only, along with the generic code.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/Makefile            |    2 +-
 block/blk-ipoll.c         |  160 +++++++++++++++++++++++++++++++++++++++++++++
 drivers/ata/ahci.c        |   53 ++++++++++++++-
 include/linux/blk-ipoll.h |   38 +++++++++++
 include/linux/interrupt.h |    1 +
 include/linux/libata.h    |    2 +
 6 files changed, 252 insertions(+), 4 deletions(-)
 create mode 100644 block/blk-ipoll.c
 create mode 100644 include/linux/blk-ipoll.h

diff --git a/block/Makefile b/block/Makefile
index e9fa4dd..537e88a 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -5,7 +5,7 @@
 obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \
 			blk-barrier.o blk-settings.o blk-ioc.o blk-map.o \
 			blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
-			ioctl.o genhd.o scsi_ioctl.o cmd-filter.o
+			blk-ipoll.o ioctl.o genhd.o scsi_ioctl.o cmd-filter.o
 
 obj-$(CONFIG_BLK_DEV_BSG)	+= bsg.o
 obj-$(CONFIG_IOSCHED_NOOP)	+= noop-iosched.o
diff --git a/block/blk-ipoll.c b/block/blk-ipoll.c
new file mode 100644
index 0000000..700b74d
--- /dev/null
+++ b/block/blk-ipoll.c
@@ -0,0 +1,222 @@
+/*
+ * Functions related to interrupt-poll handling in the block layer. This
+ * is similar to NAPI for network devices.
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/interrupt.h>
+#include <linux/cpu.h>
+#include <linux/delay.h>
+#include <linux/blk-ipoll.h>
+
+#include "blk.h"
+
+/* Per-cpu list of scheduled ipoll instances, run by blk_ipoll_softirq(). */
+static DEFINE_PER_CPU(struct list_head, blk_cpu_ipoll);
+
+/**
+ * blk_ipoll_sched - queue an ipoll instance for polling on this CPU
+ * @ipoll: the instance to queue
+ *
+ * Adds @ipoll to the tail of the local CPU's list and raises
+ * BLOCK_IPOLL_SOFTIRQ, with local interrupts disabled throughout. The
+ * only caller in this patch, ahci_interrupt(), gets here after
+ * blk_ipoll_sched_prep() has set IPOLL_F_SCHED.
+ */
+void blk_ipoll_sched(struct blk_ipoll *ipoll)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	list_add_tail(&ipoll->list, &__get_cpu_var(blk_cpu_ipoll));
+	__raise_softirq_irqoff(BLOCK_IPOLL_SOFTIRQ);
+	local_irq_restore(flags);
+}
+EXPORT_SYMBOL(blk_ipoll_sched);
+
+/**
+ * __blk_ipoll_complete - take an ipoll instance off its list
+ * @ipoll: the instance to complete
+ *
+ * Unlinks @ipoll, then clears IPOLL_F_SCHED so that it can be scheduled
+ * again. No interrupt masking is done here: blk_ipoll_complete() and
+ * blk_ipoll_softirq() both call this with local interrupts disabled.
+ */
+void __blk_ipoll_complete(struct blk_ipoll *ipoll)
+{
+	list_del(&ipoll->list);
+	smp_mb__before_clear_bit();
+	clear_bit(IPOLL_F_SCHED, &ipoll->state);
+}
+EXPORT_SYMBOL(__blk_ipoll_complete);
+
+/**
+ * blk_ipoll_complete - interrupt-safe version of __blk_ipoll_complete()
+ * @ipoll: the instance to complete
+ *
+ * Called by a driver's poll callback (ahci_ipoll() in this patch) once
+ * it decides to stop polling.
+ */
+void blk_ipoll_complete(struct blk_ipoll *ipoll)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	__blk_ipoll_complete(ipoll);
+	local_irq_restore(flags);
+}
+EXPORT_SYMBOL(blk_ipoll_complete);
+
+/*
+ * BLOCK_IPOLL_SOFTIRQ handler. Calls ->ipoll() on the instances queued on
+ * this CPU, one at a time from the head of the list, until the list is
+ * empty, the budget is used up, or jiffies has advanced. In the latter
+ * two cases the softirq is raised again before returning.
+ */
+static void blk_ipoll_softirq(struct softirq_action *h)
+{
+	struct list_head *list = &__get_cpu_var(blk_cpu_ipoll);
+	unsigned long start_time = jiffies;
+	int rearm = 0, budget = 64;
+
+	local_irq_disable();
+
+	while (!list_empty(list)) {
+		struct blk_ipoll *ipoll;
+		int work, weight;
+
+		/*
+		 * If softirq window is exhausted then punt.
+		 */
+		if (budget <= 0 || jiffies != start_time) {
+			rearm = 1;
+			break;
+		}
+
+		local_irq_enable();
+
+		/* Even though interrupts have been re-enabled, this
+		 * access is safe because interrupts can only add new
+		 * entries to the tail of this list, and only ->ipoll()
+		 * calls can remove this head entry from the list.
+		 */
+		ipoll = list_entry(list->next, struct blk_ipoll, list);
+
+		weight = ipoll->weight;
+		work = ipoll->ipoll(ipoll, weight);
+		budget -= work;
+
+		local_irq_disable();
+
+		/* Drivers must not modify the ipoll state if they
+		 * consume the entire weight.  In such cases this code
+		 * still "owns" the ipoll instance and therefore can
+		 * move the instance around on the list at-will.
+		 */
+		if (work >= weight) {
+			if (blk_ipoll_disable_pending(ipoll))
+				__blk_ipoll_complete(ipoll);
+			else
+				list_move_tail(&ipoll->list, list);
+		}
+	}
+
+	if (rearm)
+		__raise_softirq_irqoff(BLOCK_IPOLL_SOFTIRQ);
+
+	local_irq_enable();
+}
+
+/**
+ * blk_ipoll_disable - stop an ipoll instance from being scheduled
+ * @ipoll: the instance to disable
+ *
+ * Sets IPOLL_F_DISABLE, then sleeps in 1ms steps until it manages to take
+ * IPOLL_F_SCHED itself. IPOLL_F_SCHED is left set on return, which keeps
+ * blk_ipoll_sched_prep() failing until blk_ipoll_enable() clears it.
+ * Sleeps, so must not be called from atomic context.
+ */
+void blk_ipoll_disable(struct blk_ipoll *ipoll)
+{
+	set_bit(IPOLL_F_DISABLE, &ipoll->state);
+	while (test_and_set_bit(IPOLL_F_SCHED, &ipoll->state))
+		msleep(1);
+	clear_bit(IPOLL_F_DISABLE, &ipoll->state);
+}
+EXPORT_SYMBOL(blk_ipoll_disable);
+
+/**
+ * blk_ipoll_enable - allow an ipoll instance to be scheduled again
+ * @ipoll: the instance to enable
+ *
+ * Clears IPOLL_F_SCHED. BUGs if the bit is not set, i.e. this is meant to
+ * undo a previous blk_ipoll_disable().
+ */
+void blk_ipoll_enable(struct blk_ipoll *ipoll)
+{
+	BUG_ON(!test_bit(IPOLL_F_SCHED, &ipoll->state));
+	smp_mb__before_clear_bit();
+	clear_bit(IPOLL_F_SCHED, &ipoll->state);
+}
+EXPORT_SYMBOL(blk_ipoll_enable);
+
+/**
+ * blk_ipoll_init - initialize an ipoll instance
+ * @ipoll: the instance to initialize
+ * @weight: value handed to @poll_fn as its budget on each call
+ * @poll_fn: the driver's poll callback
+ *
+ * Zeroes @ipoll, so the instance starts with no state bits set.
+ */
+void blk_ipoll_init(struct blk_ipoll *ipoll, int weight, blk_ipoll_fn *poll_fn)
+{
+	memset(ipoll, 0, sizeof(*ipoll));
+	INIT_LIST_HEAD(&ipoll->list);
+	ipoll->weight = weight;
+	ipoll->ipoll = poll_fn;
+}
+EXPORT_SYMBOL(blk_ipoll_init);
+
+static int __cpuinit blk_ipoll_cpu_notify(struct notifier_block *self,
+					  unsigned long action, void *hcpu)
+{
+	/*
+	 * If a CPU goes away, splice its entries to the current CPU
+	 * and trigger a run of the softirq
+	 */
+	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
+		int cpu = (unsigned long) hcpu;
+
+		local_irq_disable();
+		list_splice_init(&per_cpu(blk_cpu_ipoll, cpu),
+				 &__get_cpu_var(blk_cpu_ipoll));
+		raise_softirq_irqoff(BLOCK_IPOLL_SOFTIRQ);
+		local_irq_enable();
+	}
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block __cpuinitdata blk_ipoll_cpu_notifier = {
+	.notifier_call	= blk_ipoll_cpu_notify,
+};
+
+/*
+ * Set up the per-cpu lists, register the softirq handler and hook into
+ * CPU hotplug notification.
+ */
+static __init int blk_ipoll_setup(void)
+{
+	int i;
+
+	for_each_possible_cpu(i)
+		INIT_LIST_HEAD(&per_cpu(blk_cpu_ipoll, i));
+
+	open_softirq(BLOCK_IPOLL_SOFTIRQ, blk_ipoll_softirq);
+	register_hotcpu_notifier(&blk_ipoll_cpu_notifier);
+	return 0;
+}
+subsys_initcall(blk_ipoll_setup);
diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c
index 08186ec..9701f93 100644
--- a/drivers/ata/ahci.c
+++ b/drivers/ata/ahci.c
@@ -45,6 +45,7 @@
 #include <scsi/scsi_host.h>
 #include <scsi/scsi_cmnd.h>
 #include <linux/libata.h>
+#include <linux/blk-ipoll.h>
 
 #define DRV_NAME	"ahci"
 #define DRV_VERSION	"3.0"
@@ -2047,7 +2048,7 @@ static void ahci_error_intr(struct ata_port *ap, u32 irq_stat)
 		ata_port_abort(ap);
 }
 
-static void ahci_port_intr(struct ata_port *ap)
+static int ahci_port_intr(struct ata_port *ap)
 {
 	void __iomem *port_mmio = ahci_port_base(ap);
 	struct ata_eh_info *ehi = &ap->link.eh_info;
@@ -2077,7 +2078,7 @@ static void ahci_port_intr(struct ata_port *ap)
 
 	if (unlikely(status & PORT_IRQ_ERROR)) {
 		ahci_error_intr(ap, status);
-		return;
+		return 0;
 	}
 
 	if (status & PORT_IRQ_SDB_FIS) {
@@ -2118,7 +2119,48 @@ static void ahci_port_intr(struct ata_port *ap)
 		ehi->err_mask |= AC_ERR_HSM;
 		ehi->action |= ATA_EH_RESET;
 		ata_port_freeze(ap);
+		rc = 0;
+	}
+
+	return rc;
+}
+
+static void ap_irq_disable(struct ata_port *ap)
+{
+	void __iomem *port_mmio = ahci_port_base(ap);
+
+	writel(0, port_mmio + PORT_IRQ_MASK);	/* zero this port's irq mask */
+}
+
+static void ap_irq_enable(struct ata_port *ap)
+{
+	void __iomem *port_mmio = ahci_port_base(ap);
+	struct ahci_port_priv *pp = ap->private_data;
+
+	writel(pp->intr_mask, port_mmio + PORT_IRQ_MASK);	/* restore mask */
+}
+
+static int ahci_ipoll(struct blk_ipoll *ipoll, int budget)
+{
+	struct ata_port *ap = container_of(ipoll, struct ata_port, ipoll);
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&ap->host->lock, flags);
+	ret = ahci_port_intr(ap);	/* return value is used as work done */
+	spin_unlock_irqrestore(&ap->host->lock, flags);
+
+	if (ret > ipoll->max) {		/* log each new high-water mark */
+		printk("new ipoll max of %d\n", ret);
+		ipoll->max = ret;
+	}
+
+	if (ret < budget) {		/* under budget: stop polling, irqs back on */
+		blk_ipoll_complete(ipoll);
+		ap_irq_enable(ap);
+	}
+
+	return ret;
+}
 
 static irqreturn_t ahci_interrupt(int irq, void *dev_instance)
@@ -2151,7 +2193,10 @@ static irqreturn_t ahci_interrupt(int irq, void *dev_instance)
 
 		ap = host->ports[i];
 		if (ap) {
-			ahci_port_intr(ap);
+			if (blk_ipoll_sched_prep(&ap->ipoll)) {
+				ap_irq_disable(ap);
+				blk_ipoll_sched(&ap->ipoll);
+			}
 			VPRINTK("port %u\n", i);
 		} else {
 			VPRINTK("port %u (no irq)\n", i);
@@ -2407,6 +2452,8 @@ static int ahci_port_start(struct ata_port *ap)
 
 	ap->private_data = pp;
 
+	blk_ipoll_init(&ap->ipoll, 32, ahci_ipoll);
+
 	/* engage engines, captain */
 	return ahci_port_resume(ap);
 }
diff --git a/include/linux/blk-ipoll.h b/include/linux/blk-ipoll.h
new file mode 100644
index 0000000..dcc638f
--- /dev/null
+++ b/include/linux/blk-ipoll.h
@@ -0,0 +1,38 @@
+#ifndef BLK_IPOLL_H
+#define BLK_IPOLL_H
+
+struct blk_ipoll;
+typedef int (blk_ipoll_fn)(struct blk_ipoll *, int);
+
+struct blk_ipoll {
+	struct list_head list;	/* links into a per-cpu blk_cpu_ipoll list */
+	unsigned long state;	/* bit flags, numbered by IPOLL_F_* */
+	int weight;		/* handed to ->ipoll() as its budget */
+	int max;		/* highest ->ipoll() return, set by ahci_ipoll() */
+	blk_ipoll_fn *ipoll;	/* driver poll callback, returns work done */
+};
+
+enum {
+	IPOLL_F_SCHED		= 0,	/* scheduled or disabled */
+	IPOLL_F_DISABLE		= 1,	/* disable in progress */
+};
+
+static inline int blk_ipoll_sched_prep(struct blk_ipoll *ipoll)
+{
+	return !test_bit(IPOLL_F_DISABLE, &ipoll->state) &&
+		!test_and_set_bit(IPOLL_F_SCHED, &ipoll->state);
+}
+
+static inline int blk_ipoll_disable_pending(struct blk_ipoll *ipoll)
+{
+	return test_bit(IPOLL_F_DISABLE, &ipoll->state);
+}
+
+extern void blk_ipoll_sched(struct blk_ipoll *);
+extern void blk_ipoll_init(struct blk_ipoll *, int, blk_ipoll_fn *);
+extern void blk_ipoll_complete(struct blk_ipoll *);
+extern void __blk_ipoll_complete(struct blk_ipoll *);
+extern void blk_ipoll_enable(struct blk_ipoll *);
+extern void blk_ipoll_disable(struct blk_ipoll *);
+
+#endif
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 91bb76f..514cd75 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -335,6 +335,7 @@ enum
 	NET_TX_SOFTIRQ,
 	NET_RX_SOFTIRQ,
 	BLOCK_SOFTIRQ,
+	BLOCK_IPOLL_SOFTIRQ,
 	TASKLET_SOFTIRQ,
 	SCHED_SOFTIRQ,
 	HRTIMER_SOFTIRQ,
diff --git a/include/linux/libata.h b/include/linux/libata.h
index cf1e54e..9f9df5e 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -37,6 +37,7 @@
 #include <scsi/scsi_host.h>
 #include <linux/acpi.h>
 #include <linux/cdrom.h>
+#include <linux/blk-ipoll.h>
 
 /*
  * Define if arch has non-standard setup.  This is a _PCI_ standard
@@ -759,6 +760,7 @@ struct ata_port {
 #endif
 	/* owned by EH */
 	u8			sector_buf[ATA_SECT_SIZE] ____cacheline_aligned;
+	struct blk_ipoll	ipoll;
 };
 
 /* The following initializer overrides a method to NULL whether one of
-- 
1.6.3.rc0.1.gf800


  parent reply	other threads:[~2009-05-25  7:33 UTC|newest]

Thread overview: 61+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2009-05-25  7:30 [PATCH 0/12] Per-bdi writeback flusher threads #5 Jens Axboe
2009-05-25  7:30 ` [PATCH 01/13] libata: get rid of ATA_MAX_QUEUE loop in ata_qc_complete_multiple() Jens Axboe
2009-05-25  7:30 ` [PATCH 01/12] ntfs: remove old debug check for dirty data in ntfs_put_super() Jens Axboe
2009-05-25  7:30 ` [PATCH 02/13] block: add static rq allocation cache Jens Axboe
2009-05-25  7:30 ` [PATCH 02/12] btrfs: properly register fs backing device Jens Axboe
2009-05-25  7:30 ` [PATCH 03/13] scsi: unify allocation of scsi command and sense buffer Jens Axboe
2009-05-25  7:41   ` Christoph Hellwig
2009-05-25  7:46     ` Jens Axboe
2009-05-25  7:50       ` Christoph Hellwig
2009-05-25  7:54         ` Jens Axboe
2009-05-25 10:33         ` Boaz Harrosh
2009-05-25 10:42           ` Christoph Hellwig
2009-05-25 10:49             ` Jens Axboe
2009-05-26  4:36         ` FUJITA Tomonori
2009-05-26  5:08           ` FUJITA Tomonori
2009-05-25  8:15   ` Pekka Enberg
2009-05-25  8:15     ` Pekka Enberg
2009-05-25 11:32     ` Nick Piggin
2009-05-25  9:28   ` Boaz Harrosh
2009-05-26  1:45     ` Roland Dreier
2009-05-26  4:36       ` FUJITA Tomonori
2009-05-26  6:29         ` Jens Axboe
2009-05-26  7:25           ` FUJITA Tomonori
2009-05-26  7:32             ` Jens Axboe
2009-05-26  7:38               ` FUJITA Tomonori
2009-05-26 14:47                 ` James Bottomley
2009-05-26 15:13                   ` Matthew Wilcox
2009-05-26 15:31                   ` FUJITA Tomonori
2009-05-26 16:05                     ` Boaz Harrosh
2009-05-27  1:36                       ` FUJITA Tomonori
2009-05-27  7:54                         ` Boaz Harrosh
2009-05-27  8:26                           ` FUJITA Tomonori
2009-05-27  9:11                             ` Boaz Harrosh
2009-05-26 16:12                   ` Boaz Harrosh
2009-05-26 16:28                     ` Boaz Harrosh
2009-05-26  7:56               ` FUJITA Tomonori
2009-05-26  5:23     ` FUJITA Tomonori
2009-05-25  7:30 ` [PATCH 03/12] writeback: move dirty inodes from super_block to backing_dev_info Jens Axboe
2009-05-25  7:30 ` [PATCH 04/13] scsi: get rid of lock in __scsi_put_command() Jens Axboe
2009-05-25  7:30 ` [PATCH 04/12] writeback: switch to per-bdi threads for flushing data Jens Axboe
2009-05-25  7:30 ` [PATCH 05/13] aio: mostly crap Jens Axboe
2009-05-25  9:09   ` Jan Kara
2009-05-25  7:30 ` [PATCH 05/12] writeback: get rid of pdflush completely Jens Axboe
2009-05-25  7:30 ` [PATCH 06/13] block: move elevator ops into the queue Jens Axboe
2009-05-25  7:30 ` [PATCH 06/12] writeback: separate the flushing state/task from the bdi Jens Axboe
2009-05-25  7:30 ` [PATCH 07/13] block: avoid indirect calls to enter cfq io scheduler Jens Axboe
2009-05-26  9:02   ` Nikanth K
2009-05-26  9:02     ` Nikanth K
2009-05-25  7:30 ` [PATCH 07/12] writeback: support > 1 flusher thread per bdi Jens Axboe
2009-05-25  7:30 ` [PATCH 08/13] block: change the tag sync vs async restriction logic Jens Axboe
2009-05-25  7:30 ` [PATCH 08/12] writeback: include default_backing_dev_info in writeback Jens Axboe
2009-05-25  7:31 ` [PATCH 09/13] libata: switch to using block layer tagging support Jens Axboe
2009-05-25  7:31 ` [PATCH 09/12] writeback: allow sleepy exit of default writeback task Jens Axboe
2009-05-25  7:31 ` [PATCH 10/13] block: add function for waiting for a specific free tag Jens Axboe
2009-05-25  7:31 ` [PATCH 10/12] writeback: add some debug inode list counters to bdi stats Jens Axboe
2009-05-25  7:31 ` [PATCH 11/13] block: disallow merging of read-ahead bits into normal request Jens Axboe
2009-05-25  7:31 ` [PATCH 11/12] writeback: add name to backing_dev_info Jens Axboe
2009-05-25  7:31 ` Jens Axboe [this message]
2009-05-25  7:31 ` [PATCH 12/12] writeback: check for registered bdi in flusher add and inode dirty Jens Axboe
2009-05-25  7:31 ` [PATCH 13/13] block: unlocked completion test patch Jens Axboe
2009-05-25  7:33 ` [PATCH 0/12] Per-bdi writeback flusher threads #5 Jens Axboe

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1243236668-3398-24-git-send-email-jens.axboe@oracle.com \
    --to=jens.axboe@oracle.com \
    --cc=akpm@linux-foundation.org \
    --cc=chris.mason@oracle.com \
    --cc=david@fromorbit.com \
    --cc=hch@infradead.org \
    --cc=jack@suse.cz \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=yanmin_zhang@linux.intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.