All of lore.kernel.org
 help / color / mirror / Atom feed
From: Lukas Wunner <lukas-JFq808J9C/izQB+pC5nmwQ@public.gmane.org>
To: Mark Brown <broonie-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
Cc: linux-rpi-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r@public.gmane.org,
	linux-spi-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
Subject: [PATCH v2 10/10] spi: bcm2835: Speed up RX-only DMA transfers by zero-filling TX FIFO
Date: Wed, 11 Sep 2019 12:15:30 +0200	[thread overview]
Message-ID: <f45920af18dbf06e34129bbc406f53dc9c5d1075.1568187525.git.lukas@wunner.de> (raw)
In-Reply-To: <cover.1568187525.git.lukas-JFq808J9C/izQB+pC5nmwQ@public.gmane.org>

The BCM2835 SPI driver currently sets the SPI_CONTROLLER_MUST_TX flag.
When performing an RX-only transfer, this flag causes the SPI core to
allocate and DMA-map a dummy buffer which is copied to the TX FIFO.
The dummy buffer is necessary because the chip is not capable of
automatically clocking out null bytes.

Avoid the overhead induced by the dummy buffer by preallocating a
reusable DMA transaction which fills the TX FIFO by cyclically copying
from the zero page.  The transaction requires very little CPU time to
submit and generates no interrupts while running.  Specifics are
provided in kerneldoc comments.

[Nathan Chancellor contributed a DMA mapping fixup for an early version
of this commit, hence his Signed-off-by.]

Tested-by: Nuno Sá <nuno.sa@analog.com>
Tested-by: Noralf Trønnes <noralf@tronnes.org>
Signed-off-by: Nathan Chancellor <natechancellor@gmail.com>
Signed-off-by: Lukas Wunner <lukas@wunner.de>
Acked-by: Stefan Wahren <wahrenst@gmx.net>
Acked-by: Martin Sperl <kernel@martin.sperl.org>
Cc: Robert Jarzmik <robert.jarzmik@free.fr>
---
 drivers/spi/spi-bcm2835.c | 93 ++++++++++++++++++++++++++++++++++-----
 1 file changed, 82 insertions(+), 11 deletions(-)

diff --git a/drivers/spi/spi-bcm2835.c b/drivers/spi/spi-bcm2835.c
index b7cf0b6a5d65..b4070c0de3df 100644
--- a/drivers/spi/spi-bcm2835.c
+++ b/drivers/spi/spi-bcm2835.c
@@ -111,6 +111,9 @@ MODULE_PARM_DESC(polling_limit_us,
  * @tx_dma_active: whether a TX DMA descriptor is in progress
  * @rx_dma_active: whether a RX DMA descriptor is in progress
  *	(used by bcm2835_spi_dma_tx_done() to handle a race)
+ * @fill_tx_desc: preallocated TX DMA descriptor used for RX-only transfers
+ *	(cyclically copies from zero page to TX FIFO)
+ * @fill_tx_addr: bus address of zero page
  * @clear_rx_desc: preallocated RX DMA descriptor used for TX-only transfers
  *	(cyclically clears RX FIFO by writing @clear_rx_cs to CS register)
  * @clear_rx_addr: bus address of @clear_rx_cs
@@ -140,6 +143,8 @@ struct bcm2835_spi {
 	u8 chip_select;
 	unsigned int tx_dma_active;
 	unsigned int rx_dma_active;
+	struct dma_async_tx_descriptor *fill_tx_desc;
+	dma_addr_t fill_tx_addr;
 	struct dma_async_tx_descriptor *clear_rx_desc[BCM2835_SPI_NUM_CS];
 	dma_addr_t clear_rx_addr;
 	u32 clear_rx_cs[BCM2835_SPI_NUM_CS] ____cacheline_aligned;
@@ -476,14 +481,14 @@ static void bcm2835_spi_transfer_prologue(struct spi_controller *ctlr,
 	bs->rx_prologue  = 0;
 	bs->tx_spillover = false;
 
-	if (!sg_is_last(&tfr->tx_sg.sgl[0]))
+	if (bs->tx_buf && !sg_is_last(&tfr->tx_sg.sgl[0]))
 		bs->tx_prologue = sg_dma_len(&tfr->tx_sg.sgl[0]) & 3;
 
 	if (bs->rx_buf && !sg_is_last(&tfr->rx_sg.sgl[0])) {
 		bs->rx_prologue = sg_dma_len(&tfr->rx_sg.sgl[0]) & 3;
 
 		if (bs->rx_prologue > bs->tx_prologue) {
-			if (sg_is_last(&tfr->tx_sg.sgl[0])) {
+			if (!bs->tx_buf || sg_is_last(&tfr->tx_sg.sgl[0])) {
 				bs->tx_prologue  = bs->rx_prologue;
 			} else {
 				bs->tx_prologue += 4;
@@ -517,6 +522,9 @@ static void bcm2835_spi_transfer_prologue(struct spi_controller *ctlr,
 		sg_dma_len(&tfr->rx_sg.sgl[0])     -= bs->rx_prologue;
 	}
 
+	if (!bs->tx_buf)
+		return;
+
 	/*
 	 * Write remaining TX prologue.  Adjust first entry in TX sglist.
 	 * Also adjust second entry if prologue spills over to it.
@@ -562,6 +570,9 @@ static void bcm2835_spi_undo_prologue(struct bcm2835_spi *bs)
 		sg_dma_len(&tfr->rx_sg.sgl[0])     += bs->rx_prologue;
 	}
 
+	if (!bs->tx_buf)
+		goto out;
+
 	if (likely(!bs->tx_spillover)) {
 		sg_dma_address(&tfr->tx_sg.sgl[0]) -= bs->tx_prologue;
 		sg_dma_len(&tfr->tx_sg.sgl[0])     += bs->tx_prologue;
@@ -570,7 +581,7 @@ static void bcm2835_spi_undo_prologue(struct bcm2835_spi *bs)
 		sg_dma_address(&tfr->tx_sg.sgl[1]) -= 4;
 		sg_dma_len(&tfr->tx_sg.sgl[1])     += 4;
 	}
-
+out:
 	bs->tx_prologue = 0;
 }
 
@@ -585,10 +596,7 @@ static void bcm2835_spi_dma_rx_done(void *data)
 	struct spi_controller *ctlr = data;
 	struct bcm2835_spi *bs = spi_controller_get_devdata(ctlr);
 
-	/* reset fifo and HW */
-	bcm2835_spi_reset_hw(ctlr);
-
-	/* and terminate tx-dma as we do not have an irq for it
+	/* terminate tx-dma as we do not have an irq for it
 	 * because when the rx dma will terminate and this callback
 	 * is called the tx-dma must have finished - can't get to this
 	 * situation otherwise...
@@ -598,6 +606,9 @@ static void bcm2835_spi_dma_rx_done(void *data)
 	bs->rx_dma_active = false;
 	bcm2835_spi_undo_prologue(bs);
 
+	/* reset fifo and HW */
+	bcm2835_spi_reset_hw(ctlr);
+
 	/* and mark as completed */;
 	complete(&ctlr->xfer_completion);
 }
@@ -725,6 +736,24 @@ static int bcm2835_spi_prepare_sg(struct spi_controller *ctlr,
  * register.)  Reading 32 bytes from the RX FIFO would normally require 8 bus
  * accesses, whereas clearing it requires only 1 bus access.  So an 8-fold
  * reduction in bus traffic and thus energy consumption is achieved.
+ *
+ * For *RX-only* transfers (tx_buf is %NULL), fill the TX FIFO by cyclically
+ * copying from the zero page.  The DMA descriptor to do this is preallocated
+ * in bcm2835_dma_init().  It must be terminated once the RX DMA channel is
+ * done and can then be reused.
+ *
+ * The BCM2835 DMA driver autodetects when a transaction copies from the zero
+ * page and utilizes the DMA controller's ability to synthesize zeroes instead
+ * of copying them from memory.  This reduces traffic on the memory bus.  The
+ * feature is not available on so-called "lite" channels, but normally TX DMA
+ * is backed by a full-featured channel.
+ *
+ * Zero-filling the TX FIFO is paced by the DREQ signal.  Unfortunately the
+ * BCM2835 SPI controller continues to assert DREQ even after the DLEN register
+ * has been counted down to zero (hardware erratum).  Thus, when the transfer
+ * has finished, the DMA engine zero-fills the TX FIFO until it is half full.
+ * (Tuneable with the DC register.)  So up to 9 gratuitous bus accesses are
+ * performed at the end of an RX-only transfer.
  */
 static int bcm2835_spi_transfer_one_dma(struct spi_controller *ctlr,
 					struct spi_device *spi,
@@ -745,7 +774,12 @@ static int bcm2835_spi_transfer_one_dma(struct spi_controller *ctlr,
 	bcm2835_spi_transfer_prologue(ctlr, tfr, bs, cs);
 
 	/* setup tx-DMA */
-	ret = bcm2835_spi_prepare_sg(ctlr, spi, tfr, bs, true);
+	if (bs->tx_buf) {
+		ret = bcm2835_spi_prepare_sg(ctlr, spi, tfr, bs, true);
+	} else {
+		cookie = dmaengine_submit(bs->fill_tx_desc);
+		ret = dma_submit_error(cookie);
+	}
 	if (ret)
 		goto err_reset_hw;
 
@@ -822,6 +856,16 @@ static void bcm2835_dma_release(struct spi_controller *ctlr,
 
 	if (ctlr->dma_tx) {
 		dmaengine_terminate_sync(ctlr->dma_tx);
+
+		if (bs->fill_tx_desc)
+			dmaengine_desc_free(bs->fill_tx_desc);
+
+		if (bs->fill_tx_addr)
+			dma_unmap_page_attrs(ctlr->dma_tx->device->dev,
+					     bs->fill_tx_addr, sizeof(u32),
+					     DMA_TO_DEVICE,
+					     DMA_ATTR_SKIP_CPU_SYNC);
+
 		dma_release_channel(ctlr->dma_tx);
 		ctlr->dma_tx = NULL;
 	}
@@ -872,7 +916,11 @@ static void bcm2835_dma_init(struct spi_controller *ctlr, struct device *dev,
 		goto err_release;
 	}
 
-	/* configure DMAs */
+	/*
+	 * The TX DMA channel either copies a transfer's TX buffer to the FIFO
+	 * or, in case of an RX-only transfer, cyclically copies from the zero
+	 * page to the FIFO using a preallocated, reusable descriptor.
+	 */
 	slave_config.dst_addr = (u32)(dma_reg_base + BCM2835_SPI_FIFO);
 	slave_config.dst_addr_width = DMA_SLAVE_BUSWIDTH_4_BYTES;
 
@@ -880,6 +928,31 @@ static void bcm2835_dma_init(struct spi_controller *ctlr, struct device *dev,
 	if (ret)
 		goto err_config;
 
+	bs->fill_tx_addr = dma_map_page_attrs(ctlr->dma_tx->device->dev,
+					      ZERO_PAGE(0), 0, sizeof(u32),
+					      DMA_TO_DEVICE,
+					      DMA_ATTR_SKIP_CPU_SYNC);
+	if (dma_mapping_error(ctlr->dma_tx->device->dev, bs->fill_tx_addr)) {
+		dev_err(dev, "cannot map zero page - not using DMA mode\n");
+		bs->fill_tx_addr = 0;
+		goto err_release;
+	}
+
+	bs->fill_tx_desc = dmaengine_prep_dma_cyclic(ctlr->dma_tx,
+						     bs->fill_tx_addr,
+						     sizeof(u32), 0,
+						     DMA_MEM_TO_DEV, 0);
+	if (!bs->fill_tx_desc) {
+		dev_err(dev, "cannot prepare fill_tx_desc - not using DMA mode\n");
+		goto err_release;
+	}
+
+	ret = dmaengine_desc_set_reuse(bs->fill_tx_desc);
+	if (ret) {
+		dev_err(dev, "cannot reuse fill_tx_desc - not using DMA mode\n");
+		goto err_release;
+	}
+
 	/*
 	 * The RX DMA channel is used bidirectionally:  It either reads the
 	 * RX FIFO or, in case of a TX-only transfer, cyclically writes a
@@ -923,8 +996,6 @@ static void bcm2835_dma_init(struct spi_controller *ctlr, struct device *dev,
 
 	/* all went well, so set can_dma */
 	ctlr->can_dma = bcm2835_spi_can_dma;
-	/* need to do TX DMA, so we need a dummy buffer */
-	ctlr->flags = SPI_CONTROLLER_MUST_TX;
 
 	return;
 
-- 
2.23.0


_______________________________________________
linux-rpi-kernel mailing list
linux-rpi-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-rpi-kernel

  parent reply	other threads:[~2019-09-11 10:15 UTC|newest]

Thread overview: 18+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-09-11 10:15 [PATCH v2 00/10] Speed up SPI simplex transfers on Raspberry Pi Lukas Wunner
     [not found] ` <cover.1568187525.git.lukas-JFq808J9C/izQB+pC5nmwQ@public.gmane.org>
2019-09-11 10:15   ` [PATCH v2 04/10] spi: bcm2835: Work around DONE bit erratum Lukas Wunner
     [not found]     ` <7ceb98f154cdcf72c577615fa312df41adea5f47.1568187525.git.lukas-JFq808J9C/izQB+pC5nmwQ@public.gmane.org>
2019-09-11 11:25       ` Mark Brown
2019-09-11 10:15   ` [PATCH v2 07/10] spi: bcm2835: Speed up TX-only DMA transfers by clearing RX FIFO Lukas Wunner
2019-09-11 10:15   ` [PATCH v2 06/10] spi: bcm2835: Cache CS register value for ->prepare_message() Lukas Wunner
2019-09-11 10:15   ` [PATCH v2 01/10] dmaengine: bcm2835: Allow reusable descriptors Lukas Wunner
2019-09-11 10:15   ` [PATCH v2 09/10] dmaengine: bcm2835: Avoid accessing memory when copying zeroes Lukas Wunner
2019-09-11 10:15   ` Lukas Wunner [this message]
2019-09-11 10:15   ` [PATCH v2 05/10] spi: bcm2835: Drop dma_pending flag Lukas Wunner
2022-07-19  6:52     ` Marc Kleine-Budde
2022-07-19  7:34       ` Stefan Wahren
2022-07-19  7:45         ` Marc Kleine-Budde
2019-09-11 10:15   ` [PATCH v2 08/10] dmaengine: bcm2835: Document struct bcm2835_dmadev Lukas Wunner
2019-09-11 10:15   ` [PATCH v2 02/10] dmaengine: bcm2835: Allow cyclic transactions without interrupt Lukas Wunner
2019-09-11 10:15   ` [PATCH v2 03/10] spi: Guarantee cacheline alignment of driver-private data Lukas Wunner
     [not found]     ` <01625b9b26b93417fb09d2c15ad02dfe9cdbbbe5.1568187525.git.lukas-JFq808J9C/izQB+pC5nmwQ@public.gmane.org>
2019-09-11 14:59       ` Applied "spi: Guarantee cacheline alignment of driver-private data" to the spi tree Mark Brown
2019-09-11 10:47   ` [PATCH v2 00/10] Speed up SPI simplex transfers on Raspberry Pi Mark Brown
     [not found]     ` <20190911114352.w2htkzfi5v6zl7nq@wunner.de>
     [not found]       ` <20190911114352.w2htkzfi5v6zl7nq-JFq808J9C/izQB+pC5nmwQ@public.gmane.org>
2019-09-11 11:59         ` Mark Brown

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=f45920af18dbf06e34129bbc406f53dc9c5d1075.1568187525.git.lukas@wunner.de \
    --to=lukas-jfq808j9c/izqb+pc5nmwq@public.gmane.org \
    --cc=broonie-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org \
    --cc=linux-rpi-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r@public.gmane.org \
    --cc=linux-spi-u79uwXL29TY76Z2rM5mHXA@public.gmane.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.